/*
 * Copyright 2021 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "util/bitset.h"
#include "util/macros.h"
#include "util/u_dynarray.h"
#include "util/u_qsort.h"
#include "agx_builder.h"
#include "agx_compile.h"
#include "agx_compiler.h"
#include "agx_debug.h"
#include "agx_opcodes.h"
#include "shader_enums.h"

/* SSA-based register allocator */

enum ra_class {
   /* General purpose register */
   RA_GPR,

   /* Memory, used to assign stack slots */
   RA_MEM,

   /* Keep last */
   RA_CLASSES,
};

static inline enum ra_class
ra_class_for_index(agx_index idx)
{
   return idx.memory ? RA_MEM : RA_GPR;
}

struct ra_ctx {
   agx_context *shader;
   agx_block *block;
   agx_instr *instr;
   uint16_t *ssa_to_reg;
   uint8_t *ncomps;
   enum agx_size *sizes;
   enum ra_class *classes;
   BITSET_WORD *visited;
   BITSET_WORD *used_regs[RA_CLASSES];

   /* Maintained while assigning registers */
   unsigned *max_reg[RA_CLASSES];

   /* For affinities */
   agx_instr **src_to_collect_phi;

   /* If bit i of used_regs is set, and register i is the first consecutive
    * register holding an SSA value, then reg_to_ssa[i] is the SSA index of the
    * value currently in register i.
    *
    * Only for GPRs. We can add reg classes later if we have a use case.
    */
   uint32_t reg_to_ssa[AGX_NUM_REGS];

   /* Maximum number of registers that RA is allowed to use */
   unsigned bound[RA_CLASSES];
};

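/* Return the element size shared by all non-null destinations of a split,
 * asserting that they agree.
 */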
enum agx_size
agx_split_width(const agx_instr *I)
{
   enum agx_size width = ~0;

   agx_foreach_dest(I, d) {
      if (I->dest[d].type == AGX_INDEX_NULL)
         continue;
      else if (width != ~0)
         assert(width == I->dest[d].size);
      else
         width = I->dest[d].size;
   }

   assert(width != ~0 && "should have been DCE'd");
   return width;
}

/*
 * Calculate register demand in 16-bit registers, while gathering widths and
 * classes. Because we allocate in SSA form, this calculation is exact and runs
 * in linear time. Depends on liveness information.
 */
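/* Demand is measured in 16-bit register units: a 32-bit scalar destination
 * contributes 2, a 16-bit scalar contributes 1, and killed sources are
 * subtracted the first time they are seen. The result is the rolling maximum
 * over all block boundaries and instructions.
 */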
static unsigned
agx_calc_register_demand(agx_context *ctx)
{
   uint8_t *widths = calloc(ctx->alloc, sizeof(uint8_t));
   enum ra_class *classes = calloc(ctx->alloc, sizeof(enum ra_class));

   agx_foreach_instr_global(ctx, I) {
      agx_foreach_ssa_dest(I, d) {
         unsigned v = I->dest[d].value;
         assert(widths[v] == 0 && "broken SSA");
         /* Round up vectors for easier live range splitting */
         widths[v] = util_next_power_of_two(agx_index_size_16(I->dest[d]));
         classes[v] = ra_class_for_index(I->dest[d]);
      }
   }

   /* Calculate demand at the start of each block based on live-in, then update
    * for each instruction processed. Calculate rolling maximum.
    */
   unsigned max_demand = 0;

   agx_foreach_block(ctx, block) {
      unsigned demand = 0;

      /* RA treats the nesting counter as alive throughout if control flow is
       * used anywhere. This could be optimized.
       */
      if (ctx->any_cf)
         demand++;

      /* Everything live-in */
      {
         int i;
         BITSET_FOREACH_SET(i, block->live_in, ctx->alloc) {
            if (classes[i] == RA_GPR)
               demand += widths[i];
         }
      }

      max_demand = MAX2(demand, max_demand);

      /* To handle non-power-of-two vectors, sometimes live range splitting
       * needs extra registers for 1 instruction. This counter tracks the number
       * of registers to be freed after 1 extra instruction.
       */
      unsigned late_kill_count = 0;

      agx_foreach_instr_in_block(block, I) {
         /* Phis happen in parallel and are already accounted for in the live-in
          * set, just skip them so we don't double count.
          */
         if (I->op == AGX_OPCODE_PHI)
            continue;

         /* Handle late-kill registers from last instruction */
         demand -= late_kill_count;
         late_kill_count = 0;

         /* Kill sources the first time we see them */
         agx_foreach_src(I, s) {
            if (!I->src[s].kill)
               continue;
            assert(I->src[s].type == AGX_INDEX_NORMAL);
            if (ra_class_for_index(I->src[s]) != RA_GPR)
               continue;

            bool skip = false;

            for (unsigned backwards = 0; backwards < s; ++backwards) {
               if (agx_is_equiv(I->src[backwards], I->src[s])) {
                  skip = true;
                  break;
               }
            }

            if (!skip)
               demand -= widths[I->src[s].value];
         }

         /* Make destinations live */
         agx_foreach_ssa_dest(I, d) {
            if (ra_class_for_index(I->dest[d]) != RA_GPR)
               continue;

            /* Live range splits allocate at power-of-two granularity. Round up
             * destination sizes (temporarily) to powers-of-two.
             */
            unsigned real_width = widths[I->dest[d].value];
            unsigned pot_width = util_next_power_of_two(real_width);

            demand += pot_width;
            late_kill_count += (pot_width - real_width);
         }

         max_demand = MAX2(demand, max_demand);
      }

      demand -= late_kill_count;
   }

   free(widths);
   free(classes);
   return max_demand;
}

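/* Linear scan for the lowest aligned run of "count" free registers in the
 * given class, below the class bound. Returns false if no such run exists.
 */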
static bool
find_regs_simple(struct ra_ctx *rctx, enum ra_class cls, unsigned count,
                 unsigned align, unsigned *out)
{
   for (unsigned reg = 0; reg + count <= rctx->bound[cls]; reg += align) {
      if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + count - 1)) {
         *out = reg;
         return true;
      }
   }

   return false;
}

/*
 * Search the register file for the best contiguous aligned region of the given
 * size to evict when shuffling registers. The region must not contain any
 * register marked in the passed bitset.
 *
 * As a hint, this also takes in the set of registers from killed sources passed
 * to this instruction. These should be deprioritized, since they are more
 * expensive to use (extra moves to shuffle the contents away).
 *
 * Precondition: such a region exists.
 *
 * Postcondition: at least one register in the returned region is already free.
 */
static unsigned
find_best_region_to_evict(struct ra_ctx *rctx, enum ra_class cls, unsigned size,
                          BITSET_WORD *already_evicted, BITSET_WORD *killed)
{
   assert(util_is_power_of_two_or_zero(size) && "precondition");
   assert((rctx->bound[cls] % size) == 0 &&
          "register file size must be aligned to the maximum vector size");
   assert(cls == RA_GPR);

   unsigned best_base = ~0;
   unsigned best_moves = ~0;

   for (unsigned base = 0; base + size <= rctx->bound[cls]; base += size) {
      /* r0l is unevictable, skip it. By itself, this does not pose a problem.
       * We are allocating n registers, but the region containing r0l has at
       * most n-1 free. Since there are at least n free registers total, there
       * is at least 1 free register outside this region. Thus the region
       * containing that free register contains at most n-1 occupied registers.
       * In the worst case, those n-1 occupied registers are moved to the region
       * with r0l and then the n free registers are used for the destination.
       * Thus, we do not need extra registers to handle "single point"
       * unevictability.
       */
      if (base == 0 && rctx->shader->any_cf)
         continue;

      /* Do not evict the same register multiple times. It's not necessary since
       * we're just shuffling, there are enough free registers elsewhere.
       */
      if (BITSET_TEST_RANGE(already_evicted, base, base + size - 1))
         continue;

      /* Estimate the number of moves required if we pick this region */
      unsigned moves = 0;
      bool any_free = false;

      for (unsigned reg = base; reg < base + size; ++reg) {
         /* We need a move for each blocked register (TODO: we only need a
          * single move for 32-bit pairs, could optimize to use that instead.)
          */
         if (BITSET_TEST(rctx->used_regs[cls], reg))
            moves++;
         else
            any_free = true;

         /* Each clobbered killed register requires a move or a swap. Since
          * swaps require more instructions, assign a higher cost here. In
          * practice, 3 is too high but 2 is slightly better than 1.
          */
         if (BITSET_TEST(killed, reg))
            moves += 2;
      }

      /* Pick the region requiring fewest moves as a heuristic. Regions with no
       * free registers are skipped even if the heuristic estimates a lower cost
       * (due to killed sources), since the recursive splitting algorithm
       * requires at least one free register.
       */
      if (any_free && moves < best_moves) {
         best_moves = moves;
         best_base = base;
      }
   }

   assert(best_base < rctx->bound[cls] &&
          "not enough registers (should have spilled already)");
   return best_base;
}

static void
set_ssa_to_reg(struct ra_ctx *rctx, unsigned ssa, unsigned reg)
{
   enum ra_class cls = rctx->classes[ssa];

   *(rctx->max_reg[cls]) =
      MAX2(*(rctx->max_reg[cls]), reg + rctx->ncomps[ssa] - 1);

   rctx->ssa_to_reg[ssa] = reg;
}

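/*
 * Live range split: find a (power-of-two sized) aligned region for a new
 * value, evicting whatever currently lives there. Evicted variables are
 * reassigned by recursing, and every shuffle is recorded in "copies" so it can
 * be emitted as a single parallel copy before the instruction. Returns the
 * base register of the region.
 */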
static unsigned
assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
                       const agx_instr *I, struct util_dynarray *copies,
                       BITSET_WORD *clobbered, BITSET_WORD *killed,
                       enum ra_class cls)
{
   assert(cls == RA_GPR);

   /* XXX: This needs some special handling but so far it has been prohibitively
    * difficult to hit the case
    */
   if (I->op == AGX_OPCODE_PHI)
      unreachable("TODO");

   /* Expand the destination to the next power-of-two size. This simplifies
    * splitting and is accounted for by the demand calculation, so is legal.
    */
   unsigned count = util_next_power_of_two(npot_count);
   assert(align <= count && "still aligned");
   align = count;

   /* There's not enough contiguous room in the register file. We need to
    * shuffle some variables around. Look for a range of the register file
    * that is partially blocked.
    */
   unsigned base =
      find_best_region_to_evict(rctx, cls, count, clobbered, killed);

   assert(count <= 16 && "max allocation size (conservative)");
   BITSET_DECLARE(evict_set, 16) = {0};

   /* Store the set of blocking registers that need to be evicted */
   for (unsigned i = 0; i < count; ++i) {
      if (BITSET_TEST(rctx->used_regs[cls], base + i)) {
         BITSET_SET(evict_set, i);
      }
   }

   /* We are going to allocate the destination to this range, so it is now fully
    * used. Mark it as such so we don't reassign here later.
    */
   BITSET_SET_RANGE(rctx->used_regs[cls], base, base + count - 1);

   /* Before overwriting the range, we need to evict blocked variables */
   for (unsigned i = 0; i < 16; ++i) {
      /* Look for subranges that need eviction */
      if (!BITSET_TEST(evict_set, i))
         continue;

      unsigned reg = base + i;
      uint32_t ssa = rctx->reg_to_ssa[reg];
      uint32_t nr = rctx->ncomps[ssa];
      unsigned align = agx_size_align_16(rctx->sizes[ssa]);

      assert(nr >= 1 && "must be assigned");
      assert(rctx->ssa_to_reg[ssa] == reg &&
             "variable must start within the range, since vectors are limited");

      for (unsigned j = 0; j < nr; ++j) {
         assert(BITSET_TEST(evict_set, i + j) &&
                "variable is allocated contiguously and vectors are limited, "
                "so evicted in full");
      }

      /* Assign a new location for the variable. The recursion terminates
       * because nr is strictly smaller than count, thanks to the gap.
       */
      assert(nr < count && "fully contained in range that's not full");
      unsigned new_reg = assign_regs_by_copying(rctx, nr, align, I, copies,
                                                clobbered, killed, cls);

      /* Copy the variable over, register by register */
      for (unsigned i = 0; i < nr; i += align) {
         assert(cls == RA_GPR);

         struct agx_copy copy = {
            .dest = new_reg + i,
            .src = agx_register(reg + i, rctx->sizes[ssa]),
         };

         assert((copy.dest % agx_size_align_16(rctx->sizes[ssa])) == 0 &&
                "new dest must be aligned");
         assert((copy.src.value % agx_size_align_16(rctx->sizes[ssa])) == 0 &&
                "src must be aligned");
         util_dynarray_append(copies, struct agx_copy, copy);
      }

      /* Mark down the set of clobbered registers, so that killed sources may be
       * handled correctly later.
       */
      BITSET_SET_RANGE(clobbered, new_reg, new_reg + nr - 1);

      /* Update bookkeeping for this variable */
      assert(cls == rctx->classes[ssa]);
      set_ssa_to_reg(rctx, ssa, new_reg);
      rctx->reg_to_ssa[new_reg] = ssa;

      /* Skip to the next variable */
      i += nr - 1;
   }

   /* We overallocated for non-power-of-two vectors. Free up the excess now.
    * This is modelled as late kill in demand calculation.
    */
   if (npot_count != count) {
      BITSET_CLEAR_RANGE(rctx->used_regs[cls], base + npot_count,
                         base + count - 1);
   }

   return base;
}

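/* qsort_r comparator: order variables by descending element size, so that
 * larger (more strictly aligned) variables are packed first.
 */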
static int
sort_by_size(const void *a_, const void *b_, void *sizes_)
{
   const enum agx_size *sizes = sizes_;
   const unsigned *a = a_, *b = b_;

   return sizes[*b] - sizes[*a];
}

/*
 * Allocating a destination of n consecutive registers may require moving those
 * registers' contents to the locations of killed sources. For the instruction
 * to read the correct values, the killed sources themselves need to be moved to
 * the space where the destination will go.
 *
 * This is legal because there is no interference between the killed source and
 * the destination. This is always possible because, after this insertion, the
 * destination needs to contain the killed sources already overlapping with the
 * destination (size k) plus the killed sources clobbered to make room for
 * livethrough sources overlapping with the destination (at most size |dest|-k),
 * so the total size is at most k + |dest| - k = |dest| and so fits in the dest.
 * Sorting by alignment may be necessary.
 */
static void
insert_copies_for_clobbered_killed(struct ra_ctx *rctx, unsigned reg,
                                   unsigned count, const agx_instr *I,
                                   struct util_dynarray *copies,
                                   BITSET_WORD *clobbered)
{
   unsigned vars[16] = {0};
   unsigned nr_vars = 0;

   /* Precondition: the nesting counter is not overwritten. Therefore we do not
    * have to move it. find_best_region_to_evict knows better than to try.
    */
   assert(!(reg == 0 && rctx->shader->any_cf) && "r0l is never moved");

   /* Consider the destination clobbered for the purpose of source collection.
    * This way, killed sources already in the destination will be preserved
    * (though possibly compacted).
    */
   BITSET_SET_RANGE(clobbered, reg, reg + count - 1);

   /* Collect killed clobbered sources, if any */
   agx_foreach_ssa_src(I, s) {
      unsigned reg = rctx->ssa_to_reg[I->src[s].value];

      if (I->src[s].kill && ra_class_for_index(I->src[s]) == RA_GPR &&
          BITSET_TEST(clobbered, reg)) {

         assert(nr_vars < ARRAY_SIZE(vars) &&
                "cannot clobber more than max variable size");

         vars[nr_vars++] = I->src[s].value;
      }
   }

   if (nr_vars == 0)
      return;

   /* Sort by descending alignment so they are packed with natural alignment */
   util_qsort_r(vars, nr_vars, sizeof(vars[0]), sort_by_size, rctx->sizes);

   /* Reassign in the destination region */
   unsigned base = reg;

   /* We align vectors to their sizes, so this assertion holds as long as no
    * instruction has a source whose scalar size is greater than the entire size
    * of the vector destination. Yet the killed source must fit within this
    * destination, so the destination must be bigger and therefore have bigger
    * alignment.
    */
   assert((base % agx_size_align_16(rctx->sizes[vars[0]])) == 0 &&
          "destination alignment >= largest killed source alignment");

   for (unsigned i = 0; i < nr_vars; ++i) {
      unsigned var = vars[i];
      unsigned var_base = rctx->ssa_to_reg[var];
      unsigned var_count = rctx->ncomps[var];
      unsigned var_align = agx_size_align_16(rctx->sizes[var]);

      assert(rctx->classes[var] == RA_GPR && "construction");
      assert((base % var_align) == 0 && "induction");
      assert((var_count % var_align) == 0 && "no partial variables");

      for (unsigned j = 0; j < var_count; j += var_align) {
         struct agx_copy copy = {
            .dest = base + j,
            .src = agx_register(var_base + j, rctx->sizes[var]),
         };

         util_dynarray_append(copies, struct agx_copy, copy);
      }

      set_ssa_to_reg(rctx, var, base);
      rctx->reg_to_ssa[base] = var;

      base += var_count;
   }

   assert(base <= reg + count && "no overflow");
}

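/*
 * Find registers for destination dest_idx of instruction I. First try a simple
 * scan of the register file; if that fails, fall back to a live range split:
 * evict and shuffle variables via parallel copies emitted before I.
 */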
static unsigned
find_regs(struct ra_ctx *rctx, agx_instr *I, unsigned dest_idx, unsigned count,
          unsigned align)
{
   unsigned reg;
   assert(count == align);

   enum ra_class cls = ra_class_for_index(I->dest[dest_idx]);

   if (find_regs_simple(rctx, cls, count, align, &reg)) {
      return reg;
   } else {
      assert(cls == RA_GPR && "no memory live range splits");

      BITSET_DECLARE(clobbered, AGX_NUM_REGS) = {0};
      BITSET_DECLARE(killed, AGX_NUM_REGS) = {0};
      struct util_dynarray copies = {0};
      util_dynarray_init(&copies, NULL);

      /* Initialize the set of registers killed by this instruction's sources */
      agx_foreach_ssa_src(I, s) {
         unsigned v = I->src[s].value;

         if (BITSET_TEST(rctx->visited, v)) {
            unsigned base = rctx->ssa_to_reg[v];
            unsigned nr = rctx->ncomps[v];
            BITSET_SET_RANGE(killed, base, base + nr - 1);
         }
      }

      reg = assign_regs_by_copying(rctx, count, align, I, &copies, clobbered,
                                   killed, cls);
      insert_copies_for_clobbered_killed(rctx, reg, count, I, &copies,
                                         clobbered);

      /* Insert the necessary copies */
      agx_builder b = agx_init_builder(rctx->shader, agx_before_instr(I));
      agx_emit_parallel_copies(
         &b, copies.data, util_dynarray_num_elements(&copies, struct agx_copy));

      /* assign_regs asserts this is cleared, so clear to be reassigned */
      BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
      return reg;
   }
}

/*
 * Loop over live-in values at the start of the block and mark their registers
 * as in-use. We process blocks in dominance order, so this handles everything
 * but loop headers.
 *
 * For loop headers, this handles the forward edges but not the back edge.
 * However, that's okay: we don't want to reserve the registers that are
 * defined within the loop, because then we'd get a contradiction. Instead we
 * leave them available and then they become fixed points of a sort.
 */
static void
reserve_live_in(struct ra_ctx *rctx)
{
   /* If there are no predecessors, there is nothing live-in */
   unsigned nr_preds = agx_num_predecessors(rctx->block);
   if (nr_preds == 0)
      return;

   agx_builder b =
      agx_init_builder(rctx->shader, agx_before_block(rctx->block));

   int i;
   BITSET_FOREACH_SET(i, rctx->block->live_in, rctx->shader->alloc) {
      /* Skip values defined in loops when processing the loop header */
      if (!BITSET_TEST(rctx->visited, i))
         continue;

      unsigned base;

      /* If we split live ranges, the variable might be defined differently at
       * the end of each predecessor. Join them together with a phi inserted at
       * the start of the block.
       */
      if (nr_preds > 1) {
         /* We'll fill in the destination after, to coalesce one of the moves */
         agx_instr *phi = agx_phi_to(&b, agx_null(), nr_preds);
         enum agx_size size = rctx->sizes[i];

         agx_foreach_predecessor(rctx->block, pred) {
            unsigned pred_idx = agx_predecessor_index(rctx->block, *pred);

            if ((*pred)->ssa_to_reg_out == NULL) {
               /* If this is a loop header, we don't know where the register
                * will end up. So, we create a phi conservatively but don't fill
                * it in until the end of the loop. Stash in the information
                * we'll need to fill in the real register later.
                */
               assert(rctx->block->loop_header);
               phi->src[pred_idx] = agx_get_index(i, size);
               phi->src[pred_idx].memory = rctx->classes[i] == RA_MEM;
            } else {
               /* Otherwise, we can build the phi now */
               unsigned reg = (*pred)->ssa_to_reg_out[i];
               phi->src[pred_idx] = rctx->classes[i] == RA_MEM
                                       ? agx_memory_register(reg, size)
                                       : agx_register(reg, size);
            }
         }

         /* Pick the phi destination to coalesce a move. Predecessor ordering is
          * stable, so this means all live-in values get their registers from a
          * particular predecessor. That means that such a register allocation
          * is valid here, because it was valid in the predecessor.
          */
         phi->dest[0] = phi->src[0];
         base = phi->dest[0].value;
      } else {
         /* If we don't emit a phi, there is already a unique register */
         assert(nr_preds == 1);

         agx_block **pred = util_dynarray_begin(&rctx->block->predecessors);
         base = (*pred)->ssa_to_reg_out[i];
      }

      enum ra_class cls = rctx->classes[i];
      set_ssa_to_reg(rctx, i, base);

      for (unsigned j = 0; j < rctx->ncomps[i]; ++j) {
         BITSET_SET(rctx->used_regs[cls], base + j);

         if (cls == RA_GPR)
            rctx->reg_to_ssa[base + j] = i;
      }
   }
}

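/* Record that SSA value v now lives at reg, marking the whole range as used
 * and updating the reverse map for GPRs.
 */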
static void
assign_regs(struct ra_ctx *rctx, agx_index v, unsigned reg)
{
   enum ra_class cls = ra_class_for_index(v);
   assert(reg < rctx->bound[cls] && "must not overflow register file");
   assert(v.type == AGX_INDEX_NORMAL && "only SSA gets registers allocated");
   set_ssa_to_reg(rctx, v.value, reg);

   assert(!BITSET_TEST(rctx->visited, v.value) && "SSA violated");
   BITSET_SET(rctx->visited, v.value);

   assert(rctx->ncomps[v.value] >= 1);
   unsigned end = reg + rctx->ncomps[v.value] - 1;

   assert(!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, end) &&
          "no interference");
   BITSET_SET_RANGE(rctx->used_regs[cls], reg, end);

   if (cls == RA_GPR)
      rctx->reg_to_ssa[reg] = v.value;
}

static void
agx_set_sources(struct ra_ctx *rctx, agx_instr *I)
{
   assert(I->op != AGX_OPCODE_PHI);

   agx_foreach_ssa_src(I, s) {
      assert(BITSET_TEST(rctx->visited, I->src[s].value) && "no phis");

      unsigned v = rctx->ssa_to_reg[I->src[s].value];
      agx_replace_src(I, s, agx_register_like(v, I->src[s]));
   }
}

static void
agx_set_dests(struct ra_ctx *rctx, agx_instr *I)
{
   agx_foreach_ssa_dest(I, s) {
      unsigned v = rctx->ssa_to_reg[I->dest[s].value];
      I->dest[s] =
         agx_replace_index(I->dest[s], agx_register_like(v, I->dest[s]));
   }
}

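/*
 * Given that source "src" of a collect already has a register, compute where
 * the collect's destination would need to start for all sources to line up
 * contiguously. For example, if source 2 of a 32-bit collect sits at
 * half-register 24, the base is 24 - (2 * 2) = 20. Returns ~0 on underflow.
 */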
static unsigned
affinity_base_of_collect(struct ra_ctx *rctx, agx_instr *collect, unsigned src)
{
   unsigned src_reg = rctx->ssa_to_reg[collect->src[src].value];
   unsigned src_offset = src * agx_size_align_16(collect->src[src].size);

   if (src_reg >= src_offset)
      return src_reg - src_offset;
   else
      return ~0;
}

static bool
try_coalesce_with(struct ra_ctx *rctx, agx_index ssa, unsigned count,
                  bool may_be_unvisited, unsigned *out)
{
   assert(ssa.type == AGX_INDEX_NORMAL);
   if (!BITSET_TEST(rctx->visited, ssa.value)) {
      assert(may_be_unvisited);
      return false;
   }

   unsigned base = rctx->ssa_to_reg[ssa.value];
   enum ra_class cls = ra_class_for_index(ssa);

   if (BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
      return false;

   assert(base + count <= rctx->bound[cls] && "invariant");
   *out = base;
   return true;
}

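/*
 * Choose a register for destination d of instruction I. Affinity heuristics
 * are tried in order (coalesce phis with their sources, collects with their
 * sources, and sources with the collects/phis that consume them) before
 * falling back to find_regs for any free contiguous range.
 */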
static unsigned
pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
{
   agx_index idx = I->dest[d];
   enum ra_class cls = ra_class_for_index(idx);
   assert(idx.type == AGX_INDEX_NORMAL);

   unsigned count = rctx->ncomps[idx.value];
   assert(count >= 1);

   unsigned align = count;

   /* Try to allocate phis compatibly with their sources */
   if (I->op == AGX_OPCODE_PHI) {
      agx_foreach_ssa_src(I, s) {
         /* Loop headers have phis with a source preceding the definition */
         bool may_be_unvisited = rctx->block->loop_header;

         unsigned out;
         if (try_coalesce_with(rctx, I->src[s], count, may_be_unvisited, &out))
            return out;
      }
   }

   /* Try to allocate collects compatibly with their sources */
   if (I->op == AGX_OPCODE_COLLECT) {
      agx_foreach_ssa_src(I, s) {
         assert(BITSET_TEST(rctx->visited, I->src[s].value) &&
                "registers assigned in an order compatible with dominance "
                "and this is not a phi node, so we have assigned a register");

         unsigned base = affinity_base_of_collect(rctx, I, s);
         if (base >= rctx->bound[cls] || (base + count) > rctx->bound[cls])
            continue;

         /* Unaligned destinations can happen when dest size > src size */
         if (base % align)
            continue;

         if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
            return base;
      }
   }

   /* Try to allocate sources of collects contiguously */
   agx_instr *collect_phi = rctx->src_to_collect_phi[idx.value];
   if (collect_phi && collect_phi->op == AGX_OPCODE_COLLECT) {
      agx_instr *collect = collect_phi;

      assert(count == align && "collect sources are scalar");

      /* Find our offset in the collect. If our source is repeated in the
       * collect, this may not be unique. We arbitrarily choose the first.
       */
      unsigned our_source = ~0;
      agx_foreach_ssa_src(collect, s) {
         if (agx_is_equiv(collect->src[s], idx)) {
            our_source = s;
            break;
         }
      }

      assert(our_source < collect->nr_srcs && "source must be in the collect");

      /* See if we can allocate compatibly with any source of the collect */
      agx_foreach_ssa_src(collect, s) {
         if (!BITSET_TEST(rctx->visited, collect->src[s].value))
            continue;

         /* Determine where the collect should start relative to the source */
         unsigned base = affinity_base_of_collect(rctx, collect, s);
         if (base >= rctx->bound[cls])
            continue;

         unsigned our_reg = base + (our_source * align);

         /* Don't allocate past the end of the register file */
         if ((our_reg + align) > rctx->bound[cls])
            continue;

         /* If those registers are free, then choose them */
         if (!BITSET_TEST_RANGE(rctx->used_regs[cls], our_reg,
                                our_reg + align - 1))
            return our_reg;
      }

      unsigned collect_align = rctx->ncomps[collect->dest[0].value];
      unsigned offset = our_source * align;

      /* Prefer ranges of the register file that leave room for all sources of
       * the collect contiguously.
       */
      for (unsigned base = 0;
           base + (collect->nr_srcs * align) <= rctx->bound[cls];
           base += collect_align) {
         if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base,
                                base + (collect->nr_srcs * align) - 1))
            return base + offset;
      }

      /* Try to respect the alignment requirement of the collect destination,
       * which may be greater than the sources (e.g. pack_64_2x32_split). Look
       * for a register for the source such that the collect base is aligned.
       */
      if (collect_align > align) {
         for (unsigned reg = offset; reg + collect_align <= rctx->bound[cls];
              reg += collect_align) {
            if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + count - 1))
               return reg;
         }
      }
   }

   /* Try to allocate phi sources compatibly with their phis */
   if (collect_phi && collect_phi->op == AGX_OPCODE_PHI) {
      agx_instr *phi = collect_phi;
      unsigned out;

      agx_foreach_ssa_src(phi, s) {
         if (try_coalesce_with(rctx, phi->src[s], count, true, &out))
            return out;
      }

      /* If we're in a loop, we may have already allocated the phi. Try that. */
      if (phi->dest[0].type == AGX_INDEX_REGISTER) {
         unsigned base = phi->dest[0].value;

         if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
            return base;
      }
   }

   /* Default to any contiguous sequence of registers */
   return find_regs(rctx, I, d, count, align);
}

/** Assign registers to SSA values in a block. */

static void
agx_ra_assign_local(struct ra_ctx *rctx)
{
   BITSET_DECLARE(used_regs_gpr, AGX_NUM_REGS) = {0};
   BITSET_DECLARE(used_regs_mem, AGX_NUM_MODELED_REGS) = {0};
   uint16_t *ssa_to_reg = calloc(rctx->shader->alloc, sizeof(uint16_t));

   agx_block *block = rctx->block;
   uint8_t *ncomps = rctx->ncomps;
   rctx->used_regs[RA_GPR] = used_regs_gpr;
   rctx->used_regs[RA_MEM] = used_regs_mem;
   rctx->ssa_to_reg = ssa_to_reg;

   reserve_live_in(rctx);

   /* Force the nesting counter r0l live throughout shaders using control flow.
    * This could be optimized (sync with agx_calc_register_demand).
    */
   if (rctx->shader->any_cf)
      BITSET_SET(used_regs_gpr, 0);

   agx_foreach_instr_in_block(block, I) {
      rctx->instr = I;

      /* Optimization: if a split contains the last use of a vector, the split
       * can be removed by assigning the destinations overlapping the source.
       */
      if (I->op == AGX_OPCODE_SPLIT && I->src[0].kill) {
         assert(ra_class_for_index(I->src[0]) == RA_GPR);
         unsigned reg = ssa_to_reg[I->src[0].value];
         unsigned width = agx_size_align_16(agx_split_width(I));

         agx_foreach_dest(I, d) {
            assert(ra_class_for_index(I->dest[0]) == RA_GPR);

            /* Free up the source */
            unsigned offset_reg = reg + (d * width);
            BITSET_CLEAR_RANGE(used_regs_gpr, offset_reg,
                               offset_reg + width - 1);

            /* Assign the destination where the source was */
            if (!agx_is_null(I->dest[d]))
               assign_regs(rctx, I->dest[d], offset_reg);
         }

         unsigned excess =
            rctx->ncomps[I->src[0].value] - (I->nr_dests * width);
         if (excess) {
            BITSET_CLEAR_RANGE(used_regs_gpr, reg + (I->nr_dests * width),
                               reg + rctx->ncomps[I->src[0].value] - 1);
         }

         agx_set_sources(rctx, I);
         agx_set_dests(rctx, I);
         continue;
      } else if (I->op == AGX_OPCODE_PRELOAD) {
         /* We must coalesce all preload moves */
         assert(I->dest[0].size == I->src[0].size);
         assert(I->src[0].type == AGX_INDEX_REGISTER);

         assign_regs(rctx, I->dest[0], I->src[0].value);
         agx_set_dests(rctx, I);
         continue;
      }

      /* First, free killed sources */
      agx_foreach_ssa_src(I, s) {
         if (I->src[s].kill) {
            enum ra_class cls = ra_class_for_index(I->src[s]);
            unsigned reg = ssa_to_reg[I->src[s].value];
            unsigned count = ncomps[I->src[s].value];

            assert(count >= 1);
            BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
         }
      }

      /* Next, assign destinations one at a time. This is always legal
       * because of the SSA form.
       */
      agx_foreach_ssa_dest(I, d) {
         assign_regs(rctx, I->dest[d], pick_regs(rctx, I, d));
      }

      /* Phi sources are special. Set in the corresponding predecessors */
      if (I->op != AGX_OPCODE_PHI)
         agx_set_sources(rctx, I);

      agx_set_dests(rctx, I);
   }

   block->ssa_to_reg_out = rctx->ssa_to_reg;

   /* Also set the sources for the phis in our successors, since that logically
    * happens now (given the possibility of live range splits, etc)
    */
   agx_foreach_successor(block, succ) {
      unsigned pred_idx = agx_predecessor_index(succ, block);

      agx_foreach_phi_in_block(succ, phi) {
         if (phi->src[pred_idx].type == AGX_INDEX_NORMAL) {
            /* This source needs a fixup */
            unsigned value = phi->src[pred_idx].value;

            agx_replace_src(
               phi, pred_idx,
               agx_register_like(rctx->ssa_to_reg[value], phi->src[pred_idx]));
         }
      }
   }
}

/*
 * Lower phis to parallel copies at the logical end of a given block. If a block
 * needs parallel copies inserted, a successor of the block has a phi node. To
 * have a (nontrivial) phi node, a block must have multiple predecessors. So the
 * edge from the block to the successor (with phi) is not the only edge entering
 * the successor. Because the control flow graph has no critical edges, this
 * edge must therefore be the only edge leaving the block, so the block must
 * have only a single successor.
 */
static void
agx_insert_parallel_copies(agx_context *ctx, agx_block *block)
{
   bool any_succ = false;
   unsigned nr_phi = 0;

   /* Phi nodes logically happen on the control flow edge, so parallel copies
    * are added at the end of the predecessor */
   agx_builder b = agx_init_builder(ctx, agx_after_block_logical(block));

   agx_foreach_successor(block, succ) {
      assert(nr_phi == 0 && "control flow graph has a critical edge");

      agx_foreach_phi_in_block(succ, phi) {
         assert(!any_succ && "control flow graph has a critical edge");
         nr_phi++;
      }

      any_succ = true;

      /* Nothing to do if there are no phi nodes */
      if (nr_phi == 0)
         continue;

      unsigned pred_index = agx_predecessor_index(succ, block);

      /* Create a parallel copy lowering all the phi nodes */
      struct agx_copy *copies = calloc(sizeof(*copies), nr_phi);

      unsigned i = 0;

      agx_foreach_phi_in_block(succ, phi) {
         agx_index dest = phi->dest[0];
         agx_index src = phi->src[pred_index];

         if (src.type == AGX_INDEX_IMMEDIATE)
            src.size = dest.size;

         assert(dest.type == AGX_INDEX_REGISTER);
         assert(dest.size == src.size);

         copies[i++] = (struct agx_copy){
            .dest = dest.value,
            .dest_mem = dest.memory,
            .src = src,
         };
      }

      agx_emit_parallel_copies(&b, copies, nr_phi);

      free(copies);
   }
}

static inline agx_index
agx_index_as_mem(agx_index idx, unsigned mem_base)
{
   assert(idx.type == AGX_INDEX_NORMAL);
   assert(!idx.memory);
   idx.memory = true;
   idx.value = mem_base + idx.value;
   return idx;
}

/*
 * Spill everything to the stack, trivially. For debugging spilling.
 *
 * Only phis and stack moves can access memory variables.
 */
static void
agx_spill_everything(agx_context *ctx)
{
   /* Immediates and uniforms are not allowed to be spilled, so they cannot
    * appear in phi webs. Lower them first.
    */
   agx_foreach_block(ctx, block) {
      agx_block **preds = util_dynarray_begin(&block->predecessors);

      agx_foreach_phi_in_block(block, phi) {
         agx_foreach_src(phi, s) {
            if (phi->src[s].type == AGX_INDEX_IMMEDIATE ||
                phi->src[s].type == AGX_INDEX_UNIFORM) {

               agx_builder b =
                  agx_init_builder(ctx, agx_after_block_logical(preds[s]));

               agx_index temp = agx_temp(ctx, phi->dest[0].size);

               if (phi->src[s].type == AGX_INDEX_IMMEDIATE)
                  agx_mov_imm_to(&b, temp, phi->src[s].value);
               else
                  agx_mov_to(&b, temp, phi->src[s]);

               agx_replace_src(phi, s, temp);
            }
         }
      }
   }

   /* Now we can spill everything */
   unsigned mem_base = ctx->alloc;
   ctx->alloc = mem_base + ctx->alloc;

   agx_foreach_instr_global_safe(ctx, I) {
      if (I->op == AGX_OPCODE_PHI) {
         agx_foreach_ssa_dest(I, d) {
            I->dest[d] = agx_replace_index(
               I->dest[d], agx_index_as_mem(I->dest[d], mem_base));
         }

         agx_foreach_ssa_src(I, s) {
            agx_replace_src(I, s, agx_index_as_mem(I->src[s], mem_base));
         }
      } else {
         agx_builder b = agx_init_builder(ctx, agx_before_instr(I));
         agx_foreach_ssa_src(I, s) {
            agx_index fill =
               agx_vec_temp(ctx, I->src[s].size, agx_channels(I->src[s]));

            agx_mov_to(&b, fill, agx_index_as_mem(I->src[s], mem_base));
            agx_replace_src(I, s, fill);
         }

         agx_foreach_ssa_dest(I, d) {
            agx_builder b = agx_init_builder(ctx, agx_after_instr(I));
            agx_mov_to(&b, agx_index_as_mem(I->dest[d], mem_base), I->dest[d]);
         }
      }
   }

   agx_validate(ctx, "Trivial spill");
}

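/*
 * Top-level register allocation: compute liveness and register demand, spill
 * everything to memory if demand exceeds the hardware bound, assign registers
 * block-by-block in dominance order, then lower collects, splits, and phis to
 * parallel copies and remove the remaining RA pseudo-instructions.
 */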
void
agx_ra(agx_context *ctx)
{
   /* Determine maximum possible registers. We won't exceed this! */
   unsigned max_possible_regs = AGX_NUM_REGS;

   /* Compute shaders need to have their entire workgroup together, so our
    * register usage is bounded by the workgroup size.
    */
   if (gl_shader_stage_is_compute(ctx->stage)) {
      unsigned threads_per_workgroup;

      /* If we don't know the workgroup size, worst case it. TODO: Optimize
       * this, since it'll decimate opencl perf.
       */
      if (ctx->nir->info.workgroup_size_variable) {
         threads_per_workgroup = 1024;
      } else {
         threads_per_workgroup = ctx->nir->info.workgroup_size[0] *
                                 ctx->nir->info.workgroup_size[1] *
                                 ctx->nir->info.workgroup_size[2];
      }

      max_possible_regs =
         agx_max_registers_for_occupancy(threads_per_workgroup);
   }

   /* The helper program is unspillable and has a limited register file */
   if (ctx->key->is_helper) {
      max_possible_regs = 32;
   }

   /* Calculate the demand. We'll use it to determine if we need to spill and to
    * bound register assignment.
    */
   agx_compute_liveness(ctx);
   unsigned effective_demand = agx_calc_register_demand(ctx);
   bool spilling = (effective_demand > max_possible_regs);
   spilling |= ((agx_compiler_debug & AGX_DBG_SPILL) && ctx->key->has_scratch);

   if (spilling) {
      assert(ctx->key->has_scratch && "internal shaders are unspillable");
      agx_spill_everything(ctx);

      /* After spilling, recalculate liveness and demand */
      agx_compute_liveness(ctx);
      effective_demand = agx_calc_register_demand(ctx);

      /* The resulting program can now be assigned registers */
      assert(effective_demand <= max_possible_regs && "spiller post-condition");
   }

   uint8_t *ncomps = calloc(ctx->alloc, sizeof(uint8_t));
   enum ra_class *classes = calloc(ctx->alloc, sizeof(enum ra_class));
   agx_instr **src_to_collect_phi = calloc(ctx->alloc, sizeof(agx_instr *));
   enum agx_size *sizes = calloc(ctx->alloc, sizeof(enum agx_size));
   BITSET_WORD *visited = calloc(BITSET_WORDS(ctx->alloc), sizeof(BITSET_WORD));
   unsigned max_ncomps = 1;

   agx_foreach_instr_global(ctx, I) {
      /* Record collects/phis so we can coalesce when assigning */
      if (I->op == AGX_OPCODE_COLLECT || I->op == AGX_OPCODE_PHI) {
         agx_foreach_ssa_src(I, s) {
            src_to_collect_phi[I->src[s].value] = I;
         }
      }

      agx_foreach_ssa_dest(I, d) {
         unsigned v = I->dest[d].value;
         assert(ncomps[v] == 0 && "broken SSA");
         /* Round up vectors for easier live range splitting */
         ncomps[v] = util_next_power_of_two(agx_index_size_16(I->dest[d]));
         sizes[v] = I->dest[d].size;
         classes[v] = ra_class_for_index(I->dest[d]);

         max_ncomps = MAX2(max_ncomps, ncomps[v]);
      }
   }

   /* For live range splitting to work properly, ensure the register file is
    * aligned to the larger vector size. Most of the time, this is a no-op since
    * the largest vector size is usually 128-bit and the register file is
    * naturally 128-bit aligned. However, this is required for correctness with
    * 3D textureGrad, which can have a source vector of length 6x32-bit,
    * rounding up to 256-bit and requiring special accounting here.
    */
   unsigned reg_file_alignment = MAX2(max_ncomps, 8);
   assert(util_is_power_of_two_nonzero(reg_file_alignment));

   if (spilling) {
      /* We need to allocate scratch registers for lowering spilling later */
      effective_demand = MAX2(effective_demand, 6 * 2 /* preloading */);
      effective_demand += reg_file_alignment;
   }

   unsigned demand = ALIGN_POT(effective_demand, reg_file_alignment);
   assert(demand <= max_possible_regs && "Invariant");

   /* Round up the demand to the maximum number of registers we can use without
    * affecting occupancy. This reduces live range splitting.
    */
   unsigned max_regs = agx_occupancy_for_register_count(demand).max_registers;
   if (ctx->key->is_helper)
      max_regs = 32;

   max_regs = ROUND_DOWN_TO(max_regs, reg_file_alignment);

   /* Or, we can bound tightly for debugging */
   if (agx_compiler_debug & AGX_DBG_DEMAND)
      max_regs = ALIGN_POT(MAX2(demand, 12), reg_file_alignment);

   /* ...but not too tightly */
   assert((max_regs % reg_file_alignment) == 0 && "occupancy limits aligned");
   assert(max_regs >= (6 * 2) && "space for vertex shader preloading");
   assert(max_regs <= max_possible_regs);

   unsigned max_mem_slot = 0;

   /* Assign registers in dominance-order. This coincides with source-order due
    * to a NIR invariant, so we do not need special handling for this.
    */
   agx_foreach_block(ctx, block) {
      agx_ra_assign_local(&(struct ra_ctx){
         .shader = ctx,
         .block = block,
         .src_to_collect_phi = src_to_collect_phi,
         .ncomps = ncomps,
         .sizes = sizes,
         .classes = classes,
         .visited = visited,
         .bound[RA_GPR] = max_regs,
         .bound[RA_MEM] = AGX_NUM_MODELED_REGS,
         .max_reg[RA_GPR] = &ctx->max_reg,
         .max_reg[RA_MEM] = &max_mem_slot,
      });
   }

   if (spilling) {
      ctx->spill_base = ctx->scratch_size;
      ctx->scratch_size += (max_mem_slot + 1) * 2;
   }

   /* Vertex shaders preload the vertex/instance IDs (r5, r6) even if the
    * shader doesn't use them. Account for that so the preload doesn't clobber
    * GPRs.
    */
   if (ctx->nir->info.stage == MESA_SHADER_VERTEX)
      ctx->max_reg = MAX2(ctx->max_reg, 6 * 2);

   assert(ctx->max_reg <= max_regs);

   agx_foreach_instr_global_safe(ctx, ins) {
      /* Lower away RA pseudo-instructions */
      agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));

      if (ins->op == AGX_OPCODE_COLLECT) {
         assert(ins->dest[0].type == AGX_INDEX_REGISTER);
         assert(!ins->dest[0].memory);

         unsigned base = ins->dest[0].value;
         unsigned width = agx_size_align_16(ins->src[0].size);

         struct agx_copy *copies = alloca(sizeof(copies[0]) * ins->nr_srcs);
         unsigned n = 0;

         /* Move the sources */
         agx_foreach_src(ins, i) {
            if (agx_is_null(ins->src[i]) || ins->src[i].type == AGX_INDEX_UNDEF)
               continue;
            assert(ins->src[i].size == ins->src[0].size);

            copies[n++] = (struct agx_copy){
               .dest = base + (i * width),
               .src = ins->src[i],
            };
         }

         agx_emit_parallel_copies(&b, copies, n);
         agx_remove_instruction(ins);
         continue;
      } else if (ins->op == AGX_OPCODE_SPLIT) {
         assert(ins->src[0].type == AGX_INDEX_REGISTER ||
                ins->src[0].type == AGX_INDEX_UNIFORM);

         struct agx_copy copies[4];
         assert(ins->nr_dests <= ARRAY_SIZE(copies));

         unsigned n = 0;
         unsigned width = agx_size_align_16(agx_split_width(ins));

         /* Move the sources */
         agx_foreach_dest(ins, i) {
            if (ins->dest[i].type != AGX_INDEX_REGISTER)
               continue;

            assert(!ins->dest[i].memory);

            agx_index src = ins->src[0];
            src.size = ins->dest[i].size;
            src.channels_m1 = 0;
            src.value += (i * width);

            copies[n++] = (struct agx_copy){
               .dest = ins->dest[i].value,
               .src = src,
            };
         }

         /* Lower away */
         agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));
         agx_emit_parallel_copies(&b, copies, n);
         agx_remove_instruction(ins);
         continue;
      }
   }

   /* Insert parallel copies lowering phi nodes */
   agx_foreach_block(ctx, block) {
      agx_insert_parallel_copies(ctx, block);
   }

   agx_foreach_instr_global_safe(ctx, I) {
      switch (I->op) {
      /* Pseudoinstructions for RA must be removed now */
      case AGX_OPCODE_PHI:
      case AGX_OPCODE_PRELOAD:
         agx_remove_instruction(I);
         break;

      /* Coalesced moves can be removed */
      case AGX_OPCODE_MOV:
         if (I->src[0].type == AGX_INDEX_REGISTER &&
             I->dest[0].size == I->src[0].size &&
             I->src[0].value == I->dest[0].value &&
             I->src[0].memory == I->dest[0].memory) {

            assert(I->dest[0].type == AGX_INDEX_REGISTER);
            agx_remove_instruction(I);
         }
         break;

      default:
         break;
      }
   }

   if (spilling)
      agx_lower_spill(ctx);

   agx_foreach_block(ctx, block) {
      free(block->ssa_to_reg_out);
      block->ssa_to_reg_out = NULL;
   }

   free(src_to_collect_phi);
   free(ncomps);
   free(sizes);
   free(classes);
   free(visited);
}