1 /*
2  * Copyright 2021 Alyssa Rosenzweig
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "util/bitset.h"
7 #include "util/macros.h"
8 #include "util/u_dynarray.h"
9 #include "util/u_qsort.h"
10 #include "agx_builder.h"
11 #include "agx_compile.h"
12 #include "agx_compiler.h"
13 #include "agx_debug.h"
14 #include "agx_opcodes.h"
15 #include "shader_enums.h"
16 
17 /* SSA-based register allocator */
18 
19 enum ra_class {
20    /* General purpose register */
21    RA_GPR,
22 
23    /* Memory, used to assign stack slots */
24    RA_MEM,
25 
26    /* Keep last */
27    RA_CLASSES,
28 };
29 
30 static inline enum ra_class
31 ra_class_for_index(agx_index idx)
32 {
33    return idx.memory ? RA_MEM : RA_GPR;
34 }
35 
36 struct ra_ctx {
37    agx_context *shader;
38    agx_block *block;
39    agx_instr *instr;
40    uint16_t *ssa_to_reg;
41    uint8_t *ncomps;
42    enum agx_size *sizes;
43    enum ra_class *classes;
44    BITSET_WORD *visited;
45    BITSET_WORD *used_regs[RA_CLASSES];
46 
47    /* Maintained while assigning registers */
48    unsigned *max_reg[RA_CLASSES];
49 
50    /* For affinities */
51    agx_instr **src_to_collect_phi;
52 
53    /* If bit i of used_regs is set, and register i is the first consecutive
54     * register holding an SSA value, then reg_to_ssa[i] is the SSA index of the
55     * value currently in register i.
56     *
57     * Only for GPRs. We can add reg classes later if we have a use case.
58     */
59    uint32_t reg_to_ssa[AGX_NUM_REGS];
60 
61    /* Maximum number of registers that RA is allowed to use */
62    unsigned bound[RA_CLASSES];
63 };
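/* Annotation (not from the upstream source): registers are tracked in 16-bit
 * units. For example, if SSA value 5 is a 32-bit scalar placed at 16-bit unit
 * 4, then ssa_to_reg[5] == 4, ncomps[5] == 2 and reg_to_ssa[4] == 5.
 */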
64 
65 enum agx_size
66 agx_split_width(const agx_instr *I)
67 {
68    enum agx_size width = ~0;
69 
70    agx_foreach_dest(I, d) {
71       if (I->dest[d].type == AGX_INDEX_NULL)
72          continue;
73       else if (width != ~0)
74          assert(width == I->dest[d].size);
75       else
76          width = I->dest[d].size;
77    }
78 
79    assert(width != ~0 && "should have been DCE'd");
80    return width;
81 }
82 
83 /*
84  * Calculate register demand in 16-bit registers, while gathering widths and
85  * classes. Because we allocate in SSA, this calculation is exact and runs
86  * in linear time. It depends on liveness information.
87  */
88 static unsigned
89 agx_calc_register_demand(agx_context *ctx)
90 {
91    uint8_t *widths = calloc(ctx->alloc, sizeof(uint8_t));
92    enum ra_class *classes = calloc(ctx->alloc, sizeof(enum ra_class));
93 
94    agx_foreach_instr_global(ctx, I) {
95       agx_foreach_ssa_dest(I, d) {
96          unsigned v = I->dest[d].value;
97          assert(widths[v] == 0 && "broken SSA");
98          /* Round up vectors for easier live range splitting */
99          widths[v] = util_next_power_of_two(agx_index_size_16(I->dest[d]));
100          classes[v] = ra_class_for_index(I->dest[d]);
101       }
102    }
103 
104    /* Calculate demand at the start of each block based on live-in, then update
105     * for each instruction processed. Calculate rolling maximum.
106     */
107    unsigned max_demand = 0;
108 
109    agx_foreach_block(ctx, block) {
110       unsigned demand = 0;
111 
112       /* RA treats the nesting counter as alive throughout if control flow is
113        * used anywhere. This could be optimized.
114        */
115       if (ctx->any_cf)
116          demand++;
117 
118       /* Everything live-in */
119       {
120          int i;
121          BITSET_FOREACH_SET(i, block->live_in, ctx->alloc) {
122             if (classes[i] == RA_GPR)
123                demand += widths[i];
124          }
125       }
126 
127       max_demand = MAX2(demand, max_demand);
128 
129       /* To handle non-power-of-two vectors, sometimes live range splitting
130        * needs extra registers for 1 instruction. This counter tracks the number
131        * of registers to be freed after 1 extra instruction.
132        */
133       unsigned late_kill_count = 0;
134 
135       agx_foreach_instr_in_block(block, I) {
136          /* Phis happen in parallel and are already accounted for in the live-in
137           * set, just skip them so we don't double count.
138           */
139          if (I->op == AGX_OPCODE_PHI)
140             continue;
141 
142          /* Handle late-kill registers from last instruction */
143          demand -= late_kill_count;
144          late_kill_count = 0;
145 
146          /* Kill sources the first time we see them */
147          agx_foreach_src(I, s) {
148             if (!I->src[s].kill)
149                continue;
150             assert(I->src[s].type == AGX_INDEX_NORMAL);
151             if (ra_class_for_index(I->src[s]) != RA_GPR)
152                continue;
153 
154             bool skip = false;
155 
156             for (unsigned backwards = 0; backwards < s; ++backwards) {
157                if (agx_is_equiv(I->src[backwards], I->src[s])) {
158                   skip = true;
159                   break;
160                }
161             }
162 
163             if (!skip)
164                demand -= widths[I->src[s].value];
165          }
166 
167          /* Make destinations live */
168          agx_foreach_ssa_dest(I, d) {
169             if (ra_class_for_index(I->dest[d]) != RA_GPR)
170                continue;
171 
172             /* Live range splits allocate at power-of-two granularity. Round up
173              * destination sizes (temporarily) to powers-of-two.
174              */
175             unsigned real_width = widths[I->dest[d].value];
176             unsigned pot_width = util_next_power_of_two(real_width);
177 
178             demand += pot_width;
179             late_kill_count += (pot_width - real_width);
180          }
181 
182          max_demand = MAX2(demand, max_demand);
183       }
184 
185       demand -= late_kill_count;
186    }
187 
188    free(widths);
189    free(classes);
190    return max_demand;
191 }
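/* Worked example (annotation, not upstream): assuming no control flow, a block
 * whose only live-in is a 32-bit scalar a (width 2) containing "b = fadd a, a"
 * with a killed and b a 32-bit scalar starts at demand 2, drops to 0 when the
 * first killed occurrence of a is processed (the duplicate source is skipped),
 * and returns to 2 when b becomes live, so the block's maximum demand is 2.
 */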
192 
193 static bool
194 find_regs_simple(struct ra_ctx *rctx, enum ra_class cls, unsigned count,
195                  unsigned align, unsigned *out)
196 {
197    for (unsigned reg = 0; reg + count <= rctx->bound[cls]; reg += align) {
198       if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + count - 1)) {
199          *out = reg;
200          return true;
201       }
202    }
203 
204    return false;
205 }
206 
207 /*
208  * Search the register file for the best contiguous aligned region of the given
209  * size to evict when shuffling registers. The region must not contain any
210  * register marked in the passed bitset.
211  *
212  * As a hint, this also takes in the set of registers from killed sources passed
213  * to this instruction. These should be deprioritized, since they are more
214  * expensive to use (extra moves to shuffle the contents away).
215  *
216  * Precondition: such a region exists.
217  *
218  * Postcondition: at least one register in the returned region is already free.
219  */
220 static unsigned
221 find_best_region_to_evict(struct ra_ctx *rctx, enum ra_class cls, unsigned size,
222                           BITSET_WORD *already_evicted, BITSET_WORD *killed)
223 {
224    assert(util_is_power_of_two_or_zero(size) && "precondition");
225    assert((rctx->bound[cls] % size) == 0 &&
226           "register file size must be aligned to the maximum vector size");
227    assert(cls == RA_GPR);
228 
229    unsigned best_base = ~0;
230    unsigned best_moves = ~0;
231 
232    for (unsigned base = 0; base + size <= rctx->bound[cls]; base += size) {
233       /* r0l is unevictable, skip it. By itself, this does not pose a problem.
234        * We are allocating n registers, but the region containing r0l has at
235        * most n-1 free. Since there are at least n free registers total, there
236        * is at least 1 free register outside this region. Thus the region
237        * containing that free register contains at most n-1 occupied registers.
238        * In the worst case, those n-1 occupied registers are moved to the region
239        * with r0l and then the n free registers are used for the destination.
240        * Thus, we do not need extra registers to handle "single point"
241        * unevictability.
242        */
243       if (base == 0 && rctx->shader->any_cf)
244          continue;
245 
246       /* Do not evict the same register multiple times. It's not necessary since
247        * we're just shuffling; there are enough free registers elsewhere.
248        */
249       if (BITSET_TEST_RANGE(already_evicted, base, base + size - 1))
250          continue;
251 
252       /* Estimate the number of moves required if we pick this region */
253       unsigned moves = 0;
254       bool any_free = false;
255 
256       for (unsigned reg = base; reg < base + size; ++reg) {
257          /* We need a move for each blocked register (TODO: we only need a
258           * single move for 32-bit pairs, could optimize to use that instead.)
259           */
260          if (BITSET_TEST(rctx->used_regs[cls], reg))
261             moves++;
262          else
263             any_free = true;
264 
265          /* Each clobbered killed register requires a move or a swap. Since
266           * swaps require more instructions, assign a higher cost here. In
267           * practice, 3 is too high but 2 is slightly better than 1.
268           */
269          if (BITSET_TEST(killed, reg))
270             moves += 2;
271       }
272 
273       /* Pick the region requiring fewest moves as a heuristic. Regions with no
274        * free registers are skipped even if the heuristic estimates a lower cost
275        * (due to killed sources), since the recursive splitting algorithm
276        * requires at least one free register.
277        */
278       if (any_free && moves < best_moves) {
279          best_moves = moves;
280          best_base = base;
281       }
282    }
283 
284    assert(best_base < rctx->bound[cls] &&
285           "not enough registers (should have spilled already)");
286    return best_base;
287 }
288 
289 static void
290 set_ssa_to_reg(struct ra_ctx *rctx, unsigned ssa, unsigned reg)
291 {
292    enum ra_class cls = rctx->classes[ssa];
293 
294    *(rctx->max_reg[cls]) =
295       MAX2(*(rctx->max_reg[cls]), reg + rctx->ncomps[ssa] - 1);
296 
297    rctx->ssa_to_reg[ssa] = reg;
298 }
299 
300 static unsigned
301 assign_regs_by_copying(struct ra_ctx *rctx, unsigned npot_count, unsigned align,
302                        const agx_instr *I, struct util_dynarray *copies,
303                        BITSET_WORD *clobbered, BITSET_WORD *killed,
304                        enum ra_class cls)
305 {
306    assert(cls == RA_GPR);
307 
308    /* XXX: This needs some special handling but so far it has been prohibitively
309     * difficult to hit the case
310     */
311    if (I->op == AGX_OPCODE_PHI)
312       unreachable("TODO");
313 
314    /* Expand the destination to the next power-of-two size. This simplifies
315     * splitting and is accounted for by the demand calculation, so is legal.
316     */
317    unsigned count = util_next_power_of_two(npot_count);
318    assert(align <= count && "still aligned");
319    align = count;
320 
321    /* There's not enough contiguous room in the register file. We need to
322     * shuffle some variables around. Look for a range of the register file
323     * that is partially blocked.
324     */
325    unsigned base =
326       find_best_region_to_evict(rctx, cls, count, clobbered, killed);
327 
328    assert(count <= 16 && "max allocation size (conservative)");
329    BITSET_DECLARE(evict_set, 16) = {0};
330 
331    /* Store the set of blocking registers that need to be evicted */
332    for (unsigned i = 0; i < count; ++i) {
333       if (BITSET_TEST(rctx->used_regs[cls], base + i)) {
334          BITSET_SET(evict_set, i);
335       }
336    }
337 
338    /* We are going to allocate the destination to this range, so it is now fully
339     * used. Mark it as such so we don't reassign here later.
340     */
341    BITSET_SET_RANGE(rctx->used_regs[cls], base, base + count - 1);
342 
343    /* Before overwriting the range, we need to evict blocked variables */
344    for (unsigned i = 0; i < 16; ++i) {
345       /* Look for subranges that need eviction */
346       if (!BITSET_TEST(evict_set, i))
347          continue;
348 
349       unsigned reg = base + i;
350       uint32_t ssa = rctx->reg_to_ssa[reg];
351       uint32_t nr = rctx->ncomps[ssa];
352       unsigned align = agx_size_align_16(rctx->sizes[ssa]);
353 
354       assert(nr >= 1 && "must be assigned");
355       assert(rctx->ssa_to_reg[ssa] == reg &&
356              "variable must start within the range, since vectors are limited");
357 
358       for (unsigned j = 0; j < nr; ++j) {
359          assert(BITSET_TEST(evict_set, i + j) &&
360                 "variable is allocated contiguous and vectors are limited, "
361                 "so evicted in full");
362       }
363 
364       /* Assign a new location for the variable. The recursion is finite
365        * because nr decreases thanks to the gap.
366        */
367       assert(nr < count && "fully contained in range that's not full");
368       unsigned new_reg = assign_regs_by_copying(rctx, nr, align, I, copies,
369                                                 clobbered, killed, cls);
370 
371       /* Copy the variable over, register by register */
372       for (unsigned i = 0; i < nr; i += align) {
373          assert(cls == RA_GPR);
374 
375          struct agx_copy copy = {
376             .dest = new_reg + i,
377             .src = agx_register(reg + i, rctx->sizes[ssa]),
378          };
379 
380          assert((copy.dest % agx_size_align_16(rctx->sizes[ssa])) == 0 &&
381                 "new dest must be aligned");
382          assert((copy.src.value % agx_size_align_16(rctx->sizes[ssa])) == 0 &&
383                 "src must be aligned");
384          util_dynarray_append(copies, struct agx_copy, copy);
385       }
386 
387       /* Mark down the set of clobbered registers, so that killed sources may be
388        * handled correctly later.
389        */
390       BITSET_SET_RANGE(clobbered, new_reg, new_reg + nr - 1);
391 
392       /* Update bookkeeping for this variable */
393       assert(cls == rctx->classes[ssa]);
394       set_ssa_to_reg(rctx, ssa, new_reg);
395       rctx->reg_to_ssa[new_reg] = ssa;
396 
397       /* Skip to the next variable */
398       i += nr - 1;
399    }
400 
401    /* We overallocated for non-power-of-two vectors. Free up the excess now.
402     * This is modelled as late kill in demand calculation.
403     */
404    if (npot_count != count) {
405       BITSET_CLEAR_RANGE(rctx->used_regs[cls], base + npot_count,
406                          base + count - 1);
407    }
408 
409    return base;
410 }
411 
412 static int
413 sort_by_size(const void *a_, const void *b_, void *sizes_)
414 {
415    const enum agx_size *sizes = sizes_;
416    const unsigned *a = a_, *b = b_;
417 
418    return sizes[*b] - sizes[*a];
419 }
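/* Annotation: sorting by descending agx_size is equivalent to sorting by
 * descending 16-bit alignment (larger sizes need larger alignment), which is
 * what insert_copies_for_clobbered_killed relies on when packing below.
 */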
420 
421 /*
422  * Allocating a destination of n consecutive registers may require moving those
423  * registers' contents to the locations of killed sources. For the instruction
424  * to read the correct values, the killed sources themselves need to be moved to
425  * the space where the destination will go.
426  *
427  * This is legal because there is no interference between the killed source and
428  * the destination. This is always possible because, after this insertion, the
429  * destination needs to contain the killed sources already overlapping with the
430  * destination (size k) plus the killed sources clobbered to make room for
431  * livethrough sources overlapping with the destination (at most size |dest|-k),
432  * so the total size is at most k + |dest| - k = |dest| and so fits in the dest.
433  * Sorting by alignment may be necessary.
434  */
435 static void
436 insert_copies_for_clobbered_killed(struct ra_ctx *rctx, unsigned reg,
437                                    unsigned count, const agx_instr *I,
438                                    struct util_dynarray *copies,
439                                    BITSET_WORD *clobbered)
440 {
441    unsigned vars[16] = {0};
442    unsigned nr_vars = 0;
443 
444    /* Precondition: the nesting counter is not overwritten. Therefore we do not
445     * have to move it.  find_best_region_to_evict knows better than to try.
446     */
447    assert(!(reg == 0 && rctx->shader->any_cf) && "r0l is never moved");
448 
449    /* Consider the destination clobbered for the purpose of source collection.
450     * This way, killed sources already in the destination will be preserved
451     * (though possibly compacted).
452     */
453    BITSET_SET_RANGE(clobbered, reg, reg + count - 1);
454 
455    /* Collect killed clobbered sources, if any */
456    agx_foreach_ssa_src(I, s) {
457       unsigned reg = rctx->ssa_to_reg[I->src[s].value];
458 
459       if (I->src[s].kill && ra_class_for_index(I->src[s]) == RA_GPR &&
460           BITSET_TEST(clobbered, reg)) {
461 
462          assert(nr_vars < ARRAY_SIZE(vars) &&
463                 "cannot clobber more than max variable size");
464 
465          vars[nr_vars++] = I->src[s].value;
466       }
467    }
468 
469    if (nr_vars == 0)
470       return;
471 
472    /* Sort by descending alignment so they are packed with natural alignment */
473    util_qsort_r(vars, nr_vars, sizeof(vars[0]), sort_by_size, rctx->sizes);
474 
475    /* Reassign in the destination region */
476    unsigned base = reg;
477 
478    /* We align vectors to their sizes, so this assertion holds as long as no
479     * instruction has a source whose scalar size is greater than the entire size
480     * of the vector destination. Yet the killed source must fit within this
481     * destination, so the destination must be bigger and therefore have bigger
482     * alignment.
483     */
484    assert((base % agx_size_align_16(rctx->sizes[vars[0]])) == 0 &&
485           "destination alignment >= largest killed source alignment");
486 
487    for (unsigned i = 0; i < nr_vars; ++i) {
488       unsigned var = vars[i];
489       unsigned var_base = rctx->ssa_to_reg[var];
490       unsigned var_count = rctx->ncomps[var];
491       unsigned var_align = agx_size_align_16(rctx->sizes[var]);
492 
493       assert(rctx->classes[var] == RA_GPR && "construction");
494       assert((base % var_align) == 0 && "induction");
495       assert((var_count % var_align) == 0 && "no partial variables");
496 
497       for (unsigned j = 0; j < var_count; j += var_align) {
498          struct agx_copy copy = {
499             .dest = base + j,
500             .src = agx_register(var_base + j, rctx->sizes[var]),
501          };
502 
503          util_dynarray_append(copies, struct agx_copy, copy);
504       }
505 
506       set_ssa_to_reg(rctx, var, base);
507       rctx->reg_to_ssa[base] = var;
508 
509       base += var_count;
510    }
511 
512    assert(base <= reg + count && "no overflow");
513 }
514 
515 static unsigned
516 find_regs(struct ra_ctx *rctx, agx_instr *I, unsigned dest_idx, unsigned count,
517           unsigned align)
518 {
519    unsigned reg;
520    assert(count == align);
521 
522    enum ra_class cls = ra_class_for_index(I->dest[dest_idx]);
523 
524    if (find_regs_simple(rctx, cls, count, align, &reg)) {
525       return reg;
526    } else {
527       assert(cls == RA_GPR && "no memory live range splits");
528 
529       BITSET_DECLARE(clobbered, AGX_NUM_REGS) = {0};
530       BITSET_DECLARE(killed, AGX_NUM_REGS) = {0};
531       struct util_dynarray copies = {0};
532       util_dynarray_init(&copies, NULL);
533 
534       /* Initialize the set of registers killed by this instruction's sources */
535       agx_foreach_ssa_src(I, s) {
536          unsigned v = I->src[s].value;
537 
538          if (BITSET_TEST(rctx->visited, v)) {
539             unsigned base = rctx->ssa_to_reg[v];
540             unsigned nr = rctx->ncomps[v];
541             BITSET_SET_RANGE(killed, base, base + nr - 1);
542          }
543       }
544 
545       reg = assign_regs_by_copying(rctx, count, align, I, &copies, clobbered,
546                                    killed, cls);
547       insert_copies_for_clobbered_killed(rctx, reg, count, I, &copies,
548                                          clobbered);
549 
550       /* Insert the necessary copies */
551       agx_builder b = agx_init_builder(rctx->shader, agx_before_instr(I));
552       agx_emit_parallel_copies(
553          &b, copies.data, util_dynarray_num_elements(&copies, struct agx_copy));
554 
555       /* assign_regs asserts this is cleared, so clear to be reassigned */
556       BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
557       return reg;
558    }
559 }
560 
561 /*
562  * Loop over live-in values at the start of the block and mark their registers
563  * as in-use. We process blocks in dominance order, so this handles everything
564  * but loop headers.
565  *
566  * For loop headers, this handles the forward edges but not the back edge.
567  * However, that's okay: we don't want to reserve the registers that are
568  * defined within the loop, because then we'd get a contradiction. Instead we
569  * leave them available and then they become fixed points of a sort.
570  */
571 static void
572 reserve_live_in(struct ra_ctx *rctx)
573 {
574    /* If there are no predecessors, there is nothing live-in */
575    unsigned nr_preds = agx_num_predecessors(rctx->block);
576    if (nr_preds == 0)
577       return;
578 
579    agx_builder b =
580       agx_init_builder(rctx->shader, agx_before_block(rctx->block));
581 
582    int i;
583    BITSET_FOREACH_SET(i, rctx->block->live_in, rctx->shader->alloc) {
584       /* Skip values defined in loops when processing the loop header */
585       if (!BITSET_TEST(rctx->visited, i))
586          continue;
587 
588       unsigned base;
589 
590       /* If we split live ranges, the variable might be defined differently at
591        * the end of each predecessor. Join them together with a phi inserted at
592        * the start of the block.
593        */
594       if (nr_preds > 1) {
595          /* We'll fill in the destination afterward, to coalesce one of the moves */
596          agx_instr *phi = agx_phi_to(&b, agx_null(), nr_preds);
597          enum agx_size size = rctx->sizes[i];
598 
599          agx_foreach_predecessor(rctx->block, pred) {
600             unsigned pred_idx = agx_predecessor_index(rctx->block, *pred);
601 
602             if ((*pred)->ssa_to_reg_out == NULL) {
603                /* If this is a loop header, we don't know where the register
604                 * will end up. So, we create a phi conservatively but don't fill
605                 * it in until the end of the loop. Stash in the information
606                 * we'll need to fill in the real register later.
607                 */
608                assert(rctx->block->loop_header);
609                phi->src[pred_idx] = agx_get_index(i, size);
610                phi->src[pred_idx].memory = rctx->classes[i] == RA_MEM;
611             } else {
612                /* Otherwise, we can build the phi now */
613                unsigned reg = (*pred)->ssa_to_reg_out[i];
614                phi->src[pred_idx] = rctx->classes[i] == RA_MEM
615                                        ? agx_memory_register(reg, size)
616                                        : agx_register(reg, size);
617             }
618          }
619 
620          /* Pick the phi destination to coalesce a move. Predecessor ordering is
621           * stable, so this means all live-in values get their registers from a
622           * particular predecessor. That means that such a register allocation
623           * is valid here, because it was valid in the predecessor.
624           */
625          phi->dest[0] = phi->src[0];
626          base = phi->dest[0].value;
627       } else {
628          /* If we don't emit a phi, there is already a unique register */
629          assert(nr_preds == 1);
630 
631          agx_block **pred = util_dynarray_begin(&rctx->block->predecessors);
632          base = (*pred)->ssa_to_reg_out[i];
633       }
634 
635       enum ra_class cls = rctx->classes[i];
636       set_ssa_to_reg(rctx, i, base);
637 
638       for (unsigned j = 0; j < rctx->ncomps[i]; ++j) {
639          BITSET_SET(rctx->used_regs[cls], base + j);
640 
641          if (cls == RA_GPR)
642             rctx->reg_to_ssa[base + j] = i;
643       }
644    }
645 }
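/* Annotation: for loop headers, the phi created above may keep a placeholder
 * SSA source for the back edge. It is replaced with the real register by the
 * successor fixup at the end of agx_ra_assign_local, once the back-edge
 * predecessor has an ssa_to_reg_out map.
 */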
646 
647 static void
648 assign_regs(struct ra_ctx *rctx, agx_index v, unsigned reg)
649 {
650    enum ra_class cls = ra_class_for_index(v);
651    assert(reg < rctx->bound[cls] && "must not overflow register file");
652    assert(v.type == AGX_INDEX_NORMAL && "only SSA gets registers allocated");
653    set_ssa_to_reg(rctx, v.value, reg);
654 
655    assert(!BITSET_TEST(rctx->visited, v.value) && "SSA violated");
656    BITSET_SET(rctx->visited, v.value);
657 
658    assert(rctx->ncomps[v.value] >= 1);
659    unsigned end = reg + rctx->ncomps[v.value] - 1;
660 
661    assert(!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, end) &&
662           "no interference");
663    BITSET_SET_RANGE(rctx->used_regs[cls], reg, end);
664 
665    if (cls == RA_GPR)
666       rctx->reg_to_ssa[reg] = v.value;
667 }
668 
669 static void
670 agx_set_sources(struct ra_ctx *rctx, agx_instr *I)
671 {
672    assert(I->op != AGX_OPCODE_PHI);
673 
674    agx_foreach_ssa_src(I, s) {
675       assert(BITSET_TEST(rctx->visited, I->src[s].value) && "no phis");
676 
677       unsigned v = rctx->ssa_to_reg[I->src[s].value];
678       agx_replace_src(I, s, agx_register_like(v, I->src[s]));
679    }
680 }
681 
682 static void
683 agx_set_dests(struct ra_ctx *rctx, agx_instr *I)
684 {
685    agx_foreach_ssa_dest(I, s) {
686       unsigned v = rctx->ssa_to_reg[I->dest[s].value];
687       I->dest[s] =
688          agx_replace_index(I->dest[s], agx_register_like(v, I->dest[s]));
689    }
690 }
691 
692 static unsigned
693 affinity_base_of_collect(struct ra_ctx *rctx, agx_instr *collect, unsigned src)
694 {
695    unsigned src_reg = rctx->ssa_to_reg[collect->src[src].value];
696    unsigned src_offset = src * agx_size_align_16(collect->src[src].size);
697 
698    if (src_reg >= src_offset)
699       return src_reg - src_offset;
700    else
701       return ~0;
702 }
703 
704 static bool
705 try_coalesce_with(struct ra_ctx *rctx, agx_index ssa, unsigned count,
706                   bool may_be_unvisited, unsigned *out)
707 {
708    assert(ssa.type == AGX_INDEX_NORMAL);
709    if (!BITSET_TEST(rctx->visited, ssa.value)) {
710       assert(may_be_unvisited);
711       return false;
712    }
713 
714    unsigned base = rctx->ssa_to_reg[ssa.value];
715    enum ra_class cls = ra_class_for_index(ssa);
716 
717    if (BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
718       return false;
719 
720    assert(base + count <= rctx->bound[cls] && "invariant");
721    *out = base;
722    return true;
723 }
724 
725 static unsigned
726 pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
727 {
728    agx_index idx = I->dest[d];
729    enum ra_class cls = ra_class_for_index(idx);
730    assert(idx.type == AGX_INDEX_NORMAL);
731 
732    unsigned count = rctx->ncomps[idx.value];
733    assert(count >= 1);
734 
735    unsigned align = count;
736 
737    /* Try to allocate phis compatibly with their sources */
738    if (I->op == AGX_OPCODE_PHI) {
739       agx_foreach_ssa_src(I, s) {
740          /* Loop headers have phis with a source preceding the definition */
741          bool may_be_unvisited = rctx->block->loop_header;
742 
743          unsigned out;
744          if (try_coalesce_with(rctx, I->src[s], count, may_be_unvisited, &out))
745             return out;
746       }
747    }
748 
749    /* Try to allocate collects compatibly with their sources */
750    if (I->op == AGX_OPCODE_COLLECT) {
751       agx_foreach_ssa_src(I, s) {
752          assert(BITSET_TEST(rctx->visited, I->src[s].value) &&
753                 "registers assigned in an order compatible with dominance "
754                 "and this is not a phi node, so we have assigned a register");
755 
756          unsigned base = affinity_base_of_collect(rctx, I, s);
757          if (base >= rctx->bound[cls] || (base + count) > rctx->bound[cls])
758             continue;
759 
760          /* Unaligned destinations can happen when dest size > src size */
761          if (base % align)
762             continue;
763 
764          if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
765             return base;
766       }
767    }
768 
769    /* Try to allocate sources of collects contiguously */
770    agx_instr *collect_phi = rctx->src_to_collect_phi[idx.value];
771    if (collect_phi && collect_phi->op == AGX_OPCODE_COLLECT) {
772       agx_instr *collect = collect_phi;
773 
774       assert(count == align && "collect sources are scalar");
775 
776       /* Find our offset in the collect. If our source is repeated in the
777        * collect, this may not be unique. We arbitrarily choose the first.
778        */
779       unsigned our_source = ~0;
780       agx_foreach_ssa_src(collect, s) {
781          if (agx_is_equiv(collect->src[s], idx)) {
782             our_source = s;
783             break;
784          }
785       }
786 
787       assert(our_source < collect->nr_srcs && "source must be in the collect");
788 
789       /* See if we can allocate compatibly with any source of the collect */
790       agx_foreach_ssa_src(collect, s) {
791          if (!BITSET_TEST(rctx->visited, collect->src[s].value))
792             continue;
793 
794          /* Determine where the collect should start relative to the source */
795          unsigned base = affinity_base_of_collect(rctx, collect, s);
796          if (base >= rctx->bound[cls])
797             continue;
798 
799          unsigned our_reg = base + (our_source * align);
800 
801          /* Don't allocate past the end of the register file */
802          if ((our_reg + align) > rctx->bound[cls])
803             continue;
804 
805          /* If those registers are free, then choose them */
806          if (!BITSET_TEST_RANGE(rctx->used_regs[cls], our_reg,
807                                 our_reg + align - 1))
808             return our_reg;
809       }
810 
811       unsigned collect_align = rctx->ncomps[collect->dest[0].value];
812       unsigned offset = our_source * align;
813 
814       /* Prefer ranges of the register file that leave room for all sources of
815        * the collect contiguously.
816        */
817       for (unsigned base = 0;
818            base + (collect->nr_srcs * align) <= rctx->bound[cls];
819            base += collect_align) {
820          if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base,
821                                 base + (collect->nr_srcs * align) - 1))
822             return base + offset;
823       }
824 
825       /* Try to respect the alignment requirement of the collect destination,
826        * which may be greater than the sources (e.g. pack_64_2x32_split). Look
827        * for a register for the source such that the collect base is aligned.
828        */
829       if (collect_align > align) {
830          for (unsigned reg = offset; reg + collect_align <= rctx->bound[cls];
831               reg += collect_align) {
832             if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + count - 1))
833                return reg;
834          }
835       }
836    }
837 
838    /* Try to allocate phi sources compatibly with their phis */
839    if (collect_phi && collect_phi->op == AGX_OPCODE_PHI) {
840       agx_instr *phi = collect_phi;
841       unsigned out;
842 
843       agx_foreach_ssa_src(phi, s) {
844          if (try_coalesce_with(rctx, phi->src[s], count, true, &out))
845             return out;
846       }
847 
848       /* If we're in a loop, we may have already allocated the phi. Try that. */
849       if (phi->dest[0].type == AGX_INDEX_REGISTER) {
850          unsigned base = phi->dest[0].value;
851 
852          if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
853             return base;
854       }
855    }
856 
857    /* Default to any contiguous sequence of registers */
858    return find_regs(rctx, I, d, count, align);
859 }
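/* Annotation: the heuristics above are tried in order: coalesce a phi
 * destination with one of its sources; coalesce a collect destination with one
 * of its sources; if this value feeds a collect, place it at its offset within
 * a free run that can hold the whole collect; if it feeds a phi, coalesce with
 * the phi's other sources or its already-fixed destination; otherwise fall
 * back to any free aligned range via find_regs.
 */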
860 
861 /** Assign registers to SSA values in a block. */
862 
863 static void
864 agx_ra_assign_local(struct ra_ctx *rctx)
865 {
866    BITSET_DECLARE(used_regs_gpr, AGX_NUM_REGS) = {0};
867    BITSET_DECLARE(used_regs_mem, AGX_NUM_MODELED_REGS) = {0};
868    uint16_t *ssa_to_reg = calloc(rctx->shader->alloc, sizeof(uint16_t));
869 
870    agx_block *block = rctx->block;
871    uint8_t *ncomps = rctx->ncomps;
872    rctx->used_regs[RA_GPR] = used_regs_gpr;
873    rctx->used_regs[RA_MEM] = used_regs_mem;
874    rctx->ssa_to_reg = ssa_to_reg;
875 
876    reserve_live_in(rctx);
877 
878    /* Force the nesting counter r0l live throughout shaders using control flow.
879     * This could be optimized (sync with agx_calc_register_demand).
880     */
881    if (rctx->shader->any_cf)
882       BITSET_SET(used_regs_gpr, 0);
883 
884    agx_foreach_instr_in_block(block, I) {
885       rctx->instr = I;
886 
887       /* Optimization: if a split contains the last use of a vector, the split
888        * can be removed by assigning the destinations overlapping the source.
889        */
890       if (I->op == AGX_OPCODE_SPLIT && I->src[0].kill) {
891          assert(ra_class_for_index(I->src[0]) == RA_GPR);
892          unsigned reg = ssa_to_reg[I->src[0].value];
893          unsigned width = agx_size_align_16(agx_split_width(I));
894 
895          agx_foreach_dest(I, d) {
896             assert(ra_class_for_index(I->dest[d]) == RA_GPR);
897 
898             /* Free up the source */
899             unsigned offset_reg = reg + (d * width);
900             BITSET_CLEAR_RANGE(used_regs_gpr, offset_reg,
901                                offset_reg + width - 1);
902 
903             /* Assign the destination where the source was */
904             if (!agx_is_null(I->dest[d]))
905                assign_regs(rctx, I->dest[d], offset_reg);
906          }
907 
908          unsigned excess =
909             rctx->ncomps[I->src[0].value] - (I->nr_dests * width);
910          if (excess) {
911             BITSET_CLEAR_RANGE(used_regs_gpr, reg + (I->nr_dests * width),
912                                reg + rctx->ncomps[I->src[0].value] - 1);
913          }
914 
915          agx_set_sources(rctx, I);
916          agx_set_dests(rctx, I);
917          continue;
918       } else if (I->op == AGX_OPCODE_PRELOAD) {
919          /* We must coalesce all preload moves */
920          assert(I->dest[0].size == I->src[0].size);
921          assert(I->src[0].type == AGX_INDEX_REGISTER);
922 
923          assign_regs(rctx, I->dest[0], I->src[0].value);
924          agx_set_dests(rctx, I);
925          continue;
926       }
927 
928       /* First, free killed sources */
929       agx_foreach_ssa_src(I, s) {
930          if (I->src[s].kill) {
931             enum ra_class cls = ra_class_for_index(I->src[s]);
932             unsigned reg = ssa_to_reg[I->src[s].value];
933             unsigned count = ncomps[I->src[s].value];
934 
935             assert(count >= 1);
936             BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
937          }
938       }
939 
940       /* Next, assign destinations one at a time. This is always legal
941        * because of the SSA form.
942        */
943       agx_foreach_ssa_dest(I, d) {
944          assign_regs(rctx, I->dest[d], pick_regs(rctx, I, d));
945       }
946 
947       /* Phi sources are special. Set in the corresponding predecessors */
948       if (I->op != AGX_OPCODE_PHI)
949          agx_set_sources(rctx, I);
950 
951       agx_set_dests(rctx, I);
952    }
953 
954    block->ssa_to_reg_out = rctx->ssa_to_reg;
955 
956    /* Also set the sources for the phis in our successors, since that logically
957     * happens now (given the possibility of live range splits, etc)
958     */
959    agx_foreach_successor(block, succ) {
960       unsigned pred_idx = agx_predecessor_index(succ, block);
961 
962       agx_foreach_phi_in_block(succ, phi) {
963          if (phi->src[pred_idx].type == AGX_INDEX_NORMAL) {
964             /* This source needs a fixup */
965             unsigned value = phi->src[pred_idx].value;
966 
967             agx_replace_src(
968                phi, pred_idx,
969                agx_register_like(rctx->ssa_to_reg[value], phi->src[pred_idx]));
970          }
971       }
972    }
973 }
974 
975 /*
976  * Lower phis to parallel copies at the logical end of a given block. If a block
977  * needs parallel copies inserted, a successor of the block has a phi node. To
978  * have a (nontrivial) phi node, a block must have multiple predecessors. So the
979  * edge from the block to the successor (with phi) is not the only edge entering
980  * the successor. Because the control flow graph has no critical edges, this
981  * edge must therefore be the only edge leaving the block, so the block must
982  * have only a single successor.
983  */
984 static void
985 agx_insert_parallel_copies(agx_context *ctx, agx_block *block)
986 {
987    bool any_succ = false;
988    unsigned nr_phi = 0;
989 
990    /* Phi nodes logically happen on the control flow edge, so parallel copies
991     * are added at the end of the predecessor */
992    agx_builder b = agx_init_builder(ctx, agx_after_block_logical(block));
993 
994    agx_foreach_successor(block, succ) {
995       assert(nr_phi == 0 && "control flow graph has a critical edge");
996 
997       agx_foreach_phi_in_block(succ, phi) {
998          assert(!any_succ && "control flow graph has a critical edge");
999          nr_phi++;
1000       }
1001 
1002       any_succ = true;
1003 
1004       /* Nothing to do if there are no phi nodes */
1005       if (nr_phi == 0)
1006          continue;
1007 
1008       unsigned pred_index = agx_predecessor_index(succ, block);
1009 
1010       /* Create a parallel copy lowering all the phi nodes */
1011       struct agx_copy *copies = calloc(sizeof(*copies), nr_phi);
1012 
1013       unsigned i = 0;
1014 
1015       agx_foreach_phi_in_block(succ, phi) {
1016          agx_index dest = phi->dest[0];
1017          agx_index src = phi->src[pred_index];
1018 
1019          if (src.type == AGX_INDEX_IMMEDIATE)
1020             src.size = dest.size;
1021 
1022          assert(dest.type == AGX_INDEX_REGISTER);
1023          assert(dest.size == src.size);
1024 
1025          copies[i++] = (struct agx_copy){
1026             .dest = dest.value,
1027             .dest_mem = dest.memory,
1028             .src = src,
1029          };
1030       }
1031 
1032       agx_emit_parallel_copies(&b, copies, nr_phi);
1033 
1034       free(copies);
1035    }
1036 }
1037 
1038 static inline agx_index
1039 agx_index_as_mem(agx_index idx, unsigned mem_base)
1040 {
1041    assert(idx.type == AGX_INDEX_NORMAL);
1042    assert(!idx.memory);
1043    idx.memory = true;
1044    idx.value = mem_base + idx.value;
1045    return idx;
1046 }
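/* Annotation: memory (stack) variables get a disjoint SSA namespace. With
 * mem_base saved as the old ctx->alloc before agx_spill_everything doubles the
 * allocation, GPR value v maps to memory value mem_base + v.
 */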
1047 
1048 /*
1049  * Spill everything to the stack, trivially. For debugging spilling.
1050  *
1051  * Only phis and stack moves can access memory variables.
1052  */
1053 static void
1054 agx_spill_everything(agx_context *ctx)
1055 {
1056    /* Immediates and uniforms are not allowed to be spilled, so they cannot
1057     * appear in phi webs. Lower them first.
1058     */
1059    agx_foreach_block(ctx, block) {
1060       agx_block **preds = util_dynarray_begin(&block->predecessors);
1061 
1062       agx_foreach_phi_in_block(block, phi) {
1063          agx_foreach_src(phi, s) {
1064             if (phi->src[s].type == AGX_INDEX_IMMEDIATE ||
1065                 phi->src[s].type == AGX_INDEX_UNIFORM) {
1066 
1067                agx_builder b =
1068                   agx_init_builder(ctx, agx_after_block_logical(preds[s]));
1069 
1070                agx_index temp = agx_temp(ctx, phi->dest[0].size);
1071 
1072                if (phi->src[s].type == AGX_INDEX_IMMEDIATE)
1073                   agx_mov_imm_to(&b, temp, phi->src[s].value);
1074                else
1075                   agx_mov_to(&b, temp, phi->src[s]);
1076 
1077                agx_replace_src(phi, s, temp);
1078             }
1079          }
1080       }
1081    }
1082 
1083    /* Now we can spill everything */
1084    unsigned mem_base = ctx->alloc;
1085    ctx->alloc = mem_base + ctx->alloc;
1086 
1087    agx_foreach_instr_global_safe(ctx, I) {
1088       if (I->op == AGX_OPCODE_PHI) {
1089          agx_foreach_ssa_dest(I, d) {
1090             I->dest[d] = agx_replace_index(
1091                I->dest[d], agx_index_as_mem(I->dest[d], mem_base));
1092          }
1093 
1094          agx_foreach_ssa_src(I, s) {
1095             agx_replace_src(I, s, agx_index_as_mem(I->src[s], mem_base));
1096          }
1097       } else {
1098          agx_builder b = agx_init_builder(ctx, agx_before_instr(I));
1099          agx_foreach_ssa_src(I, s) {
1100             agx_index fill =
1101                agx_vec_temp(ctx, I->src[s].size, agx_channels(I->src[s]));
1102 
1103             agx_mov_to(&b, fill, agx_index_as_mem(I->src[s], mem_base));
1104             agx_replace_src(I, s, fill);
1105          }
1106 
1107          agx_foreach_ssa_dest(I, d) {
1108             agx_builder b = agx_init_builder(ctx, agx_after_instr(I));
1109             agx_mov_to(&b, agx_index_as_mem(I->dest[d], mem_base), I->dest[d]);
1110          }
1111       }
1112    }
1113 
1114    agx_validate(ctx, "Trivial spill");
1115 }
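/* Annotation: the trivial spiller above keeps every computation in registers
 * but inserts a fill (memory to register move) before each use and a store
 * (register to memory move) after each definition, while phis are rewritten to
 * operate entirely on the memory copies.
 */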
1116 
1117 void
1118 agx_ra(agx_context *ctx)
1119 {
1120    /* Determine maximum possible registers. We won't exceed this! */
1121    unsigned max_possible_regs = AGX_NUM_REGS;
1122 
1123    /* Compute shaders need to have their entire workgroup together, so our
1124     * register usage is bounded by the workgroup size.
1125     */
1126    if (gl_shader_stage_is_compute(ctx->stage)) {
1127       unsigned threads_per_workgroup;
1128 
1129       /* If we don't know the workgroup size, assume the worst case. TODO:
1130        * Optimize this, since it'll decimate OpenCL performance.
1131        */
1132       if (ctx->nir->info.workgroup_size_variable) {
1133          threads_per_workgroup = 1024;
1134       } else {
1135          threads_per_workgroup = ctx->nir->info.workgroup_size[0] *
1136                                  ctx->nir->info.workgroup_size[1] *
1137                                  ctx->nir->info.workgroup_size[2];
1138       }
1139 
1140       max_possible_regs =
1141          agx_max_registers_for_occupancy(threads_per_workgroup);
1142    }
1143 
1144    /* The helper program is unspillable and has a limited register file */
1145    if (ctx->key->is_helper) {
1146       max_possible_regs = 32;
1147    }
1148 
1149    /* Calculate the demand. We'll use it to determine if we need to spill and to
1150     * bound register assignment.
1151     */
1152    agx_compute_liveness(ctx);
1153    unsigned effective_demand = agx_calc_register_demand(ctx);
1154    bool spilling = (effective_demand > max_possible_regs);
1155    spilling |= ((agx_compiler_debug & AGX_DBG_SPILL) && ctx->key->has_scratch);
1156 
1157    if (spilling) {
1158       assert(ctx->key->has_scratch && "internal shaders are unspillable");
1159       agx_spill_everything(ctx);
1160 
1161       /* After spilling, recalculate liveness and demand */
1162       agx_compute_liveness(ctx);
1163       effective_demand = agx_calc_register_demand(ctx);
1164 
1165       /* The resulting program can now be assigned registers */
1166       assert(effective_demand <= max_possible_regs && "spiller post-condition");
1167    }
1168 
1169    uint8_t *ncomps = calloc(ctx->alloc, sizeof(uint8_t));
1170    enum ra_class *classes = calloc(ctx->alloc, sizeof(enum ra_class));
1171    agx_instr **src_to_collect_phi = calloc(ctx->alloc, sizeof(agx_instr *));
1172    enum agx_size *sizes = calloc(ctx->alloc, sizeof(enum agx_size));
1173    BITSET_WORD *visited = calloc(BITSET_WORDS(ctx->alloc), sizeof(BITSET_WORD));
1174    unsigned max_ncomps = 1;
1175 
1176    agx_foreach_instr_global(ctx, I) {
1177       /* Record collects/phis so we can coalesce when assigning */
1178       if (I->op == AGX_OPCODE_COLLECT || I->op == AGX_OPCODE_PHI) {
1179          agx_foreach_ssa_src(I, s) {
1180             src_to_collect_phi[I->src[s].value] = I;
1181          }
1182       }
1183 
1184       agx_foreach_ssa_dest(I, d) {
1185          unsigned v = I->dest[d].value;
1186          assert(ncomps[v] == 0 && "broken SSA");
1187          /* Round up vectors for easier live range splitting */
1188          ncomps[v] = util_next_power_of_two(agx_index_size_16(I->dest[d]));
1189          sizes[v] = I->dest[d].size;
1190          classes[v] = ra_class_for_index(I->dest[d]);
1191 
1192          max_ncomps = MAX2(max_ncomps, ncomps[v]);
1193       }
1194    }
1195 
1196    /* For live range splitting to work properly, ensure the register file is
1197     * aligned to the larger vector size. Most of the time, this is a no-op since
1198     * the largest vector size is usually 128-bit and the register file is
1199     * naturally 128-bit aligned. However, this is required for correctness with
1200     * 3D textureGrad, which can have a source vector of length 6x32-bit,
1201     * rounding up to 256-bit and requiring special accounting here.
1202     */
1203    unsigned reg_file_alignment = MAX2(max_ncomps, 8);
1204    assert(util_is_power_of_two_nonzero(reg_file_alignment));
1205 
1206    if (spilling) {
1207       /* We need to allocate scratch registers for lowering spilling later */
1208       effective_demand = MAX2(effective_demand, 6 * 2 /* preloading */);
1209       effective_demand += reg_file_alignment;
1210    }
1211 
1212    unsigned demand = ALIGN_POT(effective_demand, reg_file_alignment);
1213    assert(demand <= max_possible_regs && "Invariant");
1214 
1215    /* Round up the demand to the maximum number of registers we can use without
1216     * affecting occupancy. This reduces live range splitting.
1217     */
1218    unsigned max_regs = agx_occupancy_for_register_count(demand).max_registers;
1219    if (ctx->key->is_helper)
1220       max_regs = 32;
1221 
1222    max_regs = ROUND_DOWN_TO(max_regs, reg_file_alignment);
1223 
1224    /* Or, we can bound tightly for debugging */
1225    if (agx_compiler_debug & AGX_DBG_DEMAND)
1226       max_regs = ALIGN_POT(MAX2(demand, 12), reg_file_alignment);
1227 
1228    /* ...but not too tightly */
1229    assert((max_regs % reg_file_alignment) == 0 && "occupancy limits aligned");
1230    assert(max_regs >= (6 * 2) && "space for vertex shader preloading");
1231    assert(max_regs <= max_possible_regs);
1232 
1233    unsigned max_mem_slot = 0;
1234 
1235    /* Assign registers in dominance-order. This coincides with source-order due
1236     * to a NIR invariant, so we do not need special handling for this.
1237     */
1238    agx_foreach_block(ctx, block) {
1239       agx_ra_assign_local(&(struct ra_ctx){
1240          .shader = ctx,
1241          .block = block,
1242          .src_to_collect_phi = src_to_collect_phi,
1243          .ncomps = ncomps,
1244          .sizes = sizes,
1245          .classes = classes,
1246          .visited = visited,
1247          .bound[RA_GPR] = max_regs,
1248          .bound[RA_MEM] = AGX_NUM_MODELED_REGS,
1249          .max_reg[RA_GPR] = &ctx->max_reg,
1250          .max_reg[RA_MEM] = &max_mem_slot,
1251       });
1252    }
1253 
1254    if (spilling) {
1255       ctx->spill_base = ctx->scratch_size;
1256       ctx->scratch_size += (max_mem_slot + 1) * 2;
1257    }
1258 
1259    /* Vertex shaders preload the vertex/instance IDs (r5, r6) even if the shader
1260     * doesn't use them. Account for that so the preload doesn't clobber GPRs.
1261     */
1262    if (ctx->nir->info.stage == MESA_SHADER_VERTEX)
1263       ctx->max_reg = MAX2(ctx->max_reg, 6 * 2);
1264 
1265    assert(ctx->max_reg <= max_regs);
1266 
1267    agx_foreach_instr_global_safe(ctx, ins) {
1268       /* Lower away RA pseudo-instructions */
1269       agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));
1270 
1271       if (ins->op == AGX_OPCODE_COLLECT) {
1272          assert(ins->dest[0].type == AGX_INDEX_REGISTER);
1273          assert(!ins->dest[0].memory);
1274 
1275          unsigned base = ins->dest[0].value;
1276          unsigned width = agx_size_align_16(ins->src[0].size);
1277 
1278          struct agx_copy *copies = alloca(sizeof(copies[0]) * ins->nr_srcs);
1279          unsigned n = 0;
1280 
1281          /* Move the sources */
1282          agx_foreach_src(ins, i) {
1283             if (agx_is_null(ins->src[i]) || ins->src[i].type == AGX_INDEX_UNDEF)
1284                continue;
1285             assert(ins->src[i].size == ins->src[0].size);
1286 
1287             copies[n++] = (struct agx_copy){
1288                .dest = base + (i * width),
1289                .src = ins->src[i],
1290             };
1291          }
1292 
1293          agx_emit_parallel_copies(&b, copies, n);
1294          agx_remove_instruction(ins);
1295          continue;
1296       } else if (ins->op == AGX_OPCODE_SPLIT) {
1297          assert(ins->src[0].type == AGX_INDEX_REGISTER ||
1298                 ins->src[0].type == AGX_INDEX_UNIFORM);
1299 
1300          struct agx_copy copies[4];
1301          assert(ins->nr_dests <= ARRAY_SIZE(copies));
1302 
1303          unsigned n = 0;
1304          unsigned width = agx_size_align_16(agx_split_width(ins));
1305 
1306          /* Move the sources */
1307          agx_foreach_dest(ins, i) {
1308             if (ins->dest[i].type != AGX_INDEX_REGISTER)
1309                continue;
1310 
1311             assert(!ins->dest[i].memory);
1312 
1313             agx_index src = ins->src[0];
1314             src.size = ins->dest[i].size;
1315             src.channels_m1 = 0;
1316             src.value += (i * width);
1317 
1318             copies[n++] = (struct agx_copy){
1319                .dest = ins->dest[i].value,
1320                .src = src,
1321             };
1322          }
1323 
1324          /* Lower away */
1325          agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));
1326          agx_emit_parallel_copies(&b, copies, n);
1327          agx_remove_instruction(ins);
1328          continue;
1329       }
1330    }
1331 
1332    /* Insert parallel copies lowering phi nodes */
1333    agx_foreach_block(ctx, block) {
1334       agx_insert_parallel_copies(ctx, block);
1335    }
1336 
1337    agx_foreach_instr_global_safe(ctx, I) {
1338       switch (I->op) {
1339       /* Pseudoinstructions for RA must be removed now */
1340       case AGX_OPCODE_PHI:
1341       case AGX_OPCODE_PRELOAD:
1342          agx_remove_instruction(I);
1343          break;
1344 
1345       /* Coalesced moves can be removed */
1346       case AGX_OPCODE_MOV:
1347          if (I->src[0].type == AGX_INDEX_REGISTER &&
1348              I->dest[0].size == I->src[0].size &&
1349              I->src[0].value == I->dest[0].value &&
1350              I->src[0].memory == I->dest[0].memory) {
1351 
1352             assert(I->dest[0].type == AGX_INDEX_REGISTER);
1353             agx_remove_instruction(I);
1354          }
1355          break;
1356 
1357       default:
1358          break;
1359       }
1360    }
1361 
1362    if (spilling)
1363       agx_lower_spill(ctx);
1364 
1365    agx_foreach_block(ctx, block) {
1366       free(block->ssa_to_reg_out);
1367       block->ssa_to_reg_out = NULL;
1368    }
1369 
1370    free(src_to_collect_phi);
1371    free(ncomps);
1372    free(sizes);
1373    free(classes);
1374    free(visited);
1375 }
1376