1 /*
2  * Copyright 2021 Alyssa Rosenzweig
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "util/bitset.h"
7 #include "util/macros.h"
8 #include "util/u_dynarray.h"
9 #include "util/u_math.h"
10 #include "util/u_memory.h"
11 #include "util/u_qsort.h"
12 #include "agx_builder.h"
13 #include "agx_compile.h"
14 #include "agx_compiler.h"
15 #include "agx_debug.h"
16 #include "agx_opcodes.h"
17 #include "shader_enums.h"
18 
19 /* SSA-based register allocator */
20 struct phi_web_node {
21    /* Parent index, or circular for root */
22    uint32_t parent;
23 
24    /* If root, assigned register, or ~0 if no register assigned. */
25    uint16_t reg;
26    bool assigned;
27 
28    /* Rank, at most log2(n), so ~5 bits suffice */
29    uint8_t rank;
30 };
31 static_assert(sizeof(struct phi_web_node) == 8, "packed");
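/*
 * Illustrative sketch (hypothetical phis): agx_ra() below seeds every SSA def
 * as its own singleton and then unions each phi destination with each of its
 * sources. For a pair of phis `c = phi(a, b)` and `e = phi(c, d)`, those
 * unions put {a, b, c, d, e} into one web, so pick_regs() can try to hand the
 * whole web a single register and avoid shuffle copies on the edges.
 */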
32 
33 static unsigned
34 phi_web_find(struct phi_web_node *web, unsigned x)
35 {
36    if (web[x].parent == x) {
37       /* Root */
38       return x;
39    } else {
40       /* Search up the tree */
41       unsigned root = x;
42       while (web[root].parent != root)
43          root = web[root].parent;
44 
45       /* Compress path. Second pass ensures O(1) memory usage. */
46       while (web[x].parent != x) {
47          unsigned temp = web[x].parent;
48          web[x].parent = root;
49          x = temp;
50       }
51 
52       return root;
53    }
54 }
55 
56 static void
57 phi_web_union(struct phi_web_node *web, unsigned x, unsigned y)
58 {
59    x = phi_web_find(web, x);
60    y = phi_web_find(web, y);
61 
62    if (x == y)
63       return;
64 
65    /* Union-by-rank: ensure x.rank >= y.rank */
66    if (web[x].rank < web[y].rank) {
67       unsigned temp = x;
68       x = y;
69       y = temp;
70    }
71 
72    web[y].parent = x;
73 
74    /* Increment rank if necessary */
75    if (web[x].rank == web[y].rank) {
76       web[x].rank++;
77    }
78 }
79 
80 struct ra_ctx {
81    agx_context *shader;
82    agx_block *block;
83    agx_instr *instr;
84    uint16_t *ssa_to_reg;
85    uint8_t *ncomps;
86    uint8_t *ncomps_unrounded;
87    enum agx_size *sizes;
88    enum ra_class *classes;
89    BITSET_WORD *visited;
90    BITSET_WORD *used_regs[RA_CLASSES];
91 
92    /* Maintained while assigning registers. Count of registers required, i.e.
93     * the maximum register assigned + 1.
94     */
95    unsigned *count[RA_CLASSES];
96 
97    /* For affinities */
98    agx_instr **src_to_collect_phi;
99    struct phi_web_node *phi_web;
100 
101    /* If bit i of used_regs is set, and register i is the first consecutive
102     * register holding an SSA value, then reg_to_ssa[i] is the SSA index of the
103     * value currently in register i.
104     *
105     * Only for GPRs. We can add reg classes later if we have a use case.
106     */
107    uint32_t reg_to_ssa[AGX_NUM_REGS];
108 
109    /* Maximum number of registers that RA is allowed to use */
110    unsigned bound[RA_CLASSES];
111 };
112 
113 /*
114  * RA treats the nesting counter, the divergent shuffle temporary, and the
115  * spiller temporaries as alive throughout if used anywhere. This could be
116  * optimized. Using a single power-of-two reserved region at the start ensures
117  * these registers are never shuffled.
118  */
119 static unsigned
120 reserved_size(agx_context *ctx)
121 {
122    if (ctx->has_spill_pcopy_reserved)
123       return 8;
124    else if (ctx->any_quad_divergent_shuffle)
125       return 2;
126    else if (ctx->any_cf)
127       return 1;
128    else
129       return 0;
130 }
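/*
 * For orientation, how these sizes correspond to the reservations made in
 * agx_ra_assign_local() below: control flow alone reserves h0 (the nesting
 * counter r0l), divergent quad shuffles additionally reserve h1 (the zero
 * r0h), and shaders with spilling reserve h0..h7 as parallel copy temporaries.
 */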
131 
132 UNUSED static void
133 print_reg_file(struct ra_ctx *rctx, FILE *fp)
134 {
135    unsigned reserved = reserved_size(rctx->shader);
136 
137    /* Dump the contents */
138    for (unsigned i = reserved; i < rctx->bound[RA_GPR]; ++i) {
139       if (BITSET_TEST(rctx->used_regs[RA_GPR], i)) {
140          uint32_t ssa = rctx->reg_to_ssa[i];
141          unsigned n = rctx->ncomps[ssa];
142          fprintf(fp, "h%u...%u: %u\n", i, i + n - 1, ssa);
143          i += (n - 1);
144       }
145    }
146    fprintf(fp, "\n");
147 
148    /* Dump a visualization of the sizes to understand what live range
149     * splitting is up against.
150     */
151    for (unsigned i = 0; i < rctx->bound[RA_GPR]; ++i) {
152       /* Space out 16-bit vec4s */
153       if (i && (i % 4) == 0) {
154          fprintf(fp, " ");
155       }
156 
157       if (i < reserved) {
158          fprintf(fp, "-");
159       } else if (BITSET_TEST(rctx->used_regs[RA_GPR], i)) {
160          uint32_t ssa = rctx->reg_to_ssa[i];
161          unsigned n = rctx->ncomps[ssa];
162          for (unsigned j = 0; j < n; ++j) {
163             assert(n < 10);
164             fprintf(fp, "%u", n);
165          }
166 
167          i += (n - 1);
168       } else {
169          fprintf(fp, ".");
170       }
171    }
172    fprintf(fp, "\n\n");
173 }
174 
175 enum agx_size
176 agx_split_width(const agx_instr *I)
177 {
178    enum agx_size width = ~0;
179 
180    agx_foreach_dest(I, d) {
181       if (I->dest[d].type == AGX_INDEX_NULL)
182          continue;
183       else if (width != ~0)
184          assert(width == I->dest[d].size);
185       else
186          width = I->dest[d].size;
187    }
188 
189    assert(width != ~0 && "should have been DCE'd");
190    return width;
191 }
192 
193 /*
194  * Calculate register demand in 16-bit registers, while gathering widths and
195  * classes. Because we allocate in SSA, this calculation is exact and runs in
196  * linear time. Depends on liveness information.
197  */
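/*
 * Worked example with hypothetical sizes: a live-in set holding a 2-half
 * value `a` and an 8-half value `b` gives a block-start demand of 10
 * (ignoring the reserved region). An instruction `c = op a, b` with a 6-half
 * destination (rounded up to 8) that kills both sources leaves demand at
 * 10 - 2 - 8 + 8 = 8, so the rolling maximum for the block stays at 10.
 */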
198 static unsigned
199 agx_calc_register_demand(agx_context *ctx)
200 {
201    /* Print detailed demand calculation, helpful to debug spilling */
202    bool debug = false;
203 
204    if (debug) {
205       agx_print_shader(ctx, stdout);
206    }
207 
208    uint8_t *widths = calloc(ctx->alloc, sizeof(uint8_t));
209    enum ra_class *classes = calloc(ctx->alloc, sizeof(enum ra_class));
210 
211    agx_foreach_instr_global(ctx, I) {
212       agx_foreach_ssa_dest(I, d) {
213          unsigned v = I->dest[d].value;
214          assert(widths[v] == 0 && "broken SSA");
215          /* Round up vectors for easier live range splitting */
216          widths[v] = util_next_power_of_two(agx_index_size_16(I->dest[d]));
217          classes[v] = ra_class_for_index(I->dest[d]);
218       }
219    }
220 
221    /* Calculate demand at the start of each block based on live-in, then update
222     * for each instruction processed. Calculate rolling maximum.
223     */
224    unsigned max_demand = 0;
225 
226    agx_foreach_block(ctx, block) {
227       unsigned demand = reserved_size(ctx);
228 
229       /* Everything live-in */
230       {
231          int i;
232          BITSET_FOREACH_SET(i, block->live_in, ctx->alloc) {
233             if (classes[i] == RA_GPR)
234                demand += widths[i];
235          }
236       }
237 
238       max_demand = MAX2(demand, max_demand);
239 
240       /* To handle non-power-of-two vectors, sometimes live range splitting
241        * needs extra registers for 1 instruction. This counter tracks the number
242        * of registers to be freed after 1 extra instruction.
243        */
244       unsigned late_kill_count = 0;
245 
246       if (debug) {
247          printf("\n");
248       }
249 
250       agx_foreach_instr_in_block(block, I) {
251          /* Phis happen in parallel and are already accounted for in the live-in
252           * set, just skip them so we don't double count.
253           */
254          if (I->op == AGX_OPCODE_PHI)
255             continue;
256 
257          if (debug) {
258             printf("%u: ", demand);
259             agx_print_instr(I, stdout);
260          }
261 
262          if (I->op == AGX_OPCODE_PRELOAD) {
263             unsigned size = agx_size_align_16(I->src[0].size);
264             max_demand = MAX2(max_demand, I->src[0].value + size);
265          } else if (I->op == AGX_OPCODE_EXPORT) {
266             unsigned size = agx_size_align_16(I->src[0].size);
267             max_demand = MAX2(max_demand, I->imm + size);
268          }
269 
270          /* Handle late-kill registers from last instruction */
271          demand -= late_kill_count;
272          late_kill_count = 0;
273 
274          /* Kill sources the first time we see them */
275          agx_foreach_src(I, s) {
276             if (!I->src[s].kill)
277                continue;
278             assert(I->src[s].type == AGX_INDEX_NORMAL);
279             if (ra_class_for_index(I->src[s]) != RA_GPR)
280                continue;
281 
282             bool skip = false;
283 
284             for (unsigned backwards = 0; backwards < s; ++backwards) {
285                if (agx_is_equiv(I->src[backwards], I->src[s])) {
286                   skip = true;
287                   break;
288                }
289             }
290 
291             if (!skip)
292                demand -= widths[I->src[s].value];
293          }
294 
295          /* Make destinations live */
296          agx_foreach_ssa_dest(I, d) {
297             if (ra_class_for_index(I->dest[d]) != RA_GPR)
298                continue;
299 
300             /* Live range splits allocate at power-of-two granularity. Round up
301              * destination sizes (temporarily) to powers-of-two.
302              */
303             unsigned real_width = widths[I->dest[d].value];
304             unsigned pot_width = util_next_power_of_two(real_width);
305 
306             demand += pot_width;
307             late_kill_count += (pot_width - real_width);
308          }
309 
310          max_demand = MAX2(demand, max_demand);
311       }
312 
313       demand -= late_kill_count;
314    }
315 
316    free(widths);
317    free(classes);
318    return max_demand;
319 }
320 
321 static bool
322 find_regs_simple(struct ra_ctx *rctx, enum ra_class cls, unsigned count,
323                  unsigned align, unsigned *out)
324 {
325    for (unsigned reg = 0; reg + count <= rctx->bound[cls]; reg += align) {
326       if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + count - 1)) {
327          *out = reg;
328          return true;
329       }
330    }
331 
332    return false;
333 }
334 
335 /*
336  * Search the register file for the best contiguous aligned region of the given
337  * size to evict when shuffling registers. The region must not contain any
338  * register marked in the passed bitset.
339  *
340  * As a hint, this also takes in the set of registers from killed sources passed
341  * to this instruction. These should be deprioritized, since they are more
342  * expensive to use (extra moves to shuffle the contents away).
343  *
344  * Precondition: such a region exists.
345  *
346  * Postcondition: at least one register in the returned region is already free.
347  */
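/*
 * Cost sketch for the heuristic below, with hypothetical occupancy: a size-4
 * region containing two occupied registers, one of which also holds a killed
 * source, scores 2 + 2 = 4. A size-4 region with a single occupied register
 * and no killed sources scores 1 and wins. A fully occupied region is skipped
 * regardless of score, since the recursive split needs a free register inside
 * the region it evicts.
 */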
348 static unsigned
349 find_best_region_to_evict(struct ra_ctx *rctx, enum ra_class cls, unsigned size,
350                           BITSET_WORD *already_evicted, BITSET_WORD *killed)
351 {
352    assert(util_is_power_of_two_or_zero(size) && "precondition");
353    assert((rctx->bound[cls] % size) == 0 &&
354           "register file size must be aligned to the maximum vector size");
355    assert(cls == RA_GPR);
356 
357    /* Useful for testing RA */
358    bool invert = false;
359 
360    unsigned best_base = ~0;
361    unsigned best_moves = invert ? 0 : ~0;
362 
363    for (unsigned base = 0; base + size <= rctx->bound[cls]; base += size) {
364       /* The first k registers are preallocated and unevictable, so must be
365        * skipped. By itself, this does not pose a problem. We are allocating n
366        * registers, but this region has at most n-k free.  Since there are at
367        * least n free registers total, there is at least k free registers
368        * outside this region. Choose any such free register. The region
369        * containing it has at most n-1 occupied registers. In the worst case,
370        * n-k of those registers are moved to the beginning region and the
371        * remaining (n-1)-(n-k) = k-1 registers are moved to the k-1 free
372        * registers in other regions, given there are k free registers total.
373        * These recursive shuffles work out because everything is power-of-two
374        * sized and naturally aligned, so the sizes shuffled are strictly
375        * descending. So, we do not need extra registers to handle "single
376        * region" unevictability.
377        */
378       if (base < reserved_size(rctx->shader))
379          continue;
380 
381       /* Do not evict the same register multiple times. It's not necessary since
382        * we're just shuffling, there are enough free registers elsewhere.
383        */
384       if (BITSET_TEST_RANGE(already_evicted, base, base + size - 1))
385          continue;
386 
387       /* Estimate the number of moves required if we pick this region */
388       unsigned moves = 0;
389       bool any_free = false;
390 
391       for (unsigned reg = base; reg < base + size; ++reg) {
392          /* We need a move for each blocked register (TODO: we only need a
393           * single move for 32-bit pairs, could optimize to use that instead.)
394           */
395          if (BITSET_TEST(rctx->used_regs[cls], reg))
396             moves++;
397          else
398             any_free = true;
399 
400          /* Each clobbered killed register requires a move or a swap. Since
401           * swaps require more instructions, assign a higher cost here. In
402           * practice, 3 is too high but 2 is slightly better than 1.
403           */
404          if (BITSET_TEST(killed, reg))
405             moves += 2;
406       }
407 
408       /* Pick the region requiring fewest moves as a heuristic. Regions with no
409        * free registers are skipped even if the heuristic estimates a lower cost
410        * (due to killed sources), since the recursive splitting algorithm
411        * requires at least one free register.
412        */
413       if (any_free && ((moves < best_moves) ^ invert)) {
414          best_moves = moves;
415          best_base = base;
416       }
417    }
418 
419    assert(best_base < rctx->bound[cls] &&
420           "not enough registers (should have spilled already)");
421    return best_base;
422 }
423 
424 static void
425 set_ssa_to_reg(struct ra_ctx *rctx, unsigned ssa, unsigned reg)
426 {
427    enum ra_class cls = rctx->classes[ssa];
428    *(rctx->count[cls]) = MAX2(*(rctx->count[cls]), reg + rctx->ncomps[ssa]);
429 
430    rctx->ssa_to_reg[ssa] = reg;
431 
432    if (cls == RA_GPR) {
433       rctx->reg_to_ssa[reg] = ssa;
434    }
435 }
436 
437 /*
438  * Insert parallel copies to move an SSA variable `var` to a new register
439  * `new_reg`. This may require scalarizing.
440  */
441 static void
442 insert_copy(struct ra_ctx *rctx, struct util_dynarray *copies, unsigned new_reg,
443             unsigned var)
444 {
445    enum agx_size size = rctx->sizes[var];
446    unsigned align = agx_size_align_16(size);
447 
448    for (unsigned i = 0; i < rctx->ncomps[var]; i += align) {
449       struct agx_copy copy = {
450          .dest = new_reg + i,
451          .src = agx_register(rctx->ssa_to_reg[var] + i, size),
452       };
453 
454       assert((copy.dest % align) == 0 && "new dest must be aligned");
455       assert((copy.src.value % align) == 0 && "src must be aligned");
456       util_dynarray_append(copies, struct agx_copy, copy);
457    }
458 }
459 
460 static unsigned
461 assign_regs_by_copying(struct ra_ctx *rctx, agx_index dest, const agx_instr *I,
462                        struct util_dynarray *copies, BITSET_WORD *clobbered,
463                        BITSET_WORD *killed)
464 {
465    assert(dest.type == AGX_INDEX_NORMAL);
466 
467    /* Initialize the worklist with the variable we're assigning */
468    unsigned blocked_vars[16] = {dest.value};
469    size_t nr_blocked = 1;
470 
471    while (nr_blocked > 0) {
472       /* Grab the largest var. TODO: Consider not writing O(N^2) code. */
473       uint32_t ssa = ~0, nr = 0, chosen_idx = ~0;
474       for (unsigned i = 0; i < nr_blocked; ++i) {
475          uint32_t this_ssa = blocked_vars[i];
476          uint32_t this_nr = rctx->ncomps[this_ssa];
477 
478          if (this_nr > nr) {
479             nr = this_nr;
480             ssa = this_ssa;
481             chosen_idx = i;
482          }
483       }
484 
485       assert(ssa != ~0 && nr > 0 && "must have found something");
486       assert(chosen_idx < nr_blocked && "must have found something");
487 
488       /* Pop it from the work list by swapping in the last element */
489       blocked_vars[chosen_idx] = blocked_vars[--nr_blocked];
490 
491       /* We need to shuffle some variables to make room. Look for a range of
492        * the register file that is partially blocked.
493        */
494       unsigned new_reg =
495          find_best_region_to_evict(rctx, RA_GPR, nr, clobbered, killed);
496 
497       /* Blocked registers need to get reassigned. Add them to the worklist. */
498       for (unsigned i = 0; i < nr; ++i) {
499          if (BITSET_TEST(rctx->used_regs[RA_GPR], new_reg + i)) {
500             unsigned blocked_reg = new_reg + i;
501             uint32_t blocked_ssa = rctx->reg_to_ssa[blocked_reg];
502             uint32_t blocked_nr = rctx->ncomps[blocked_ssa];
503 
504             assert(blocked_nr >= 1 && "must be assigned");
505 
506             blocked_vars[nr_blocked++] = blocked_ssa;
507             assert(
508                rctx->ssa_to_reg[blocked_ssa] == blocked_reg &&
509                "variable must start within the range, since vectors are limited");
510 
511             for (unsigned j = 0; j < blocked_nr; ++j) {
512                assert(
513                   BITSET_TEST(rctx->used_regs[RA_GPR], new_reg + i + j) &&
514                   "variable is allocated contiguous and vectors are limited, "
515                   "so evicted in full");
516             }
517 
518             /* Skip to the next variable */
519             i += blocked_nr - 1;
520          }
521       }
522 
523       /* We are going to allocate to this range, so it is now fully used. Mark
524        * it as such so we don't reassign here later.
525        */
526       BITSET_SET_RANGE(rctx->used_regs[RA_GPR], new_reg, new_reg + nr - 1);
527 
528       /* The first iteration is special: it is the original allocation of a
529        * variable. All subsequent iterations pick a new register for a blocked
530        * variable. For those, copy the blocked variable to its new register.
531        */
532       if (ssa != dest.value) {
533          insert_copy(rctx, copies, new_reg, ssa);
534       }
535 
536       /* Mark down the set of clobbered registers, so that killed sources may be
537        * handled correctly later.
538        */
539       BITSET_SET_RANGE(clobbered, new_reg, new_reg + nr - 1);
540 
541       /* Update bookkeeping for this variable */
542       set_ssa_to_reg(rctx, ssa, new_reg);
543    }
544 
545    return rctx->ssa_to_reg[dest.value];
546 }
547 
548 static int
549 sort_by_size(const void *a_, const void *b_, void *sizes_)
550 {
551    const enum agx_size *sizes = sizes_;
552    const unsigned *a = a_, *b = b_;
553 
554    return sizes[*b] - sizes[*a];
555 }
556 
557 /*
558  * Allocating a destination of n consecutive registers may require moving those
559  * registers' contents to the locations of killed sources. For the instruction
560  * to read the correct values, the killed sources themselves need to be moved to
561  * the space where the destination will go.
562  *
563  * This is legal because there is no interference between the killed source and
564  * the destination. This is always possible because, after this insertion, the
565  * destination needs to contain the killed sources already overlapping with the
566  * destination (size k) plus the killed sources clobbered to make room for
567  * livethrough sources overlapping with the destination (at most size |dest|-k),
568  * so the total size is at most k + |dest| - k = |dest| and so fits in the dest.
569  * Sorting by alignment may be necessary.
570  */
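/*
 * Concrete instance of the size argument, with hypothetical sizes: for a
 * 4-register destination that already overlaps k = 2 registers of killed
 * sources, at most 4 - 2 = 2 further registers of killed sources get
 * clobbered while evicting livethrough values, so every killed source that
 * must move fits inside the 4-register destination.
 */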
571 static void
572 insert_copies_for_clobbered_killed(struct ra_ctx *rctx, unsigned reg,
573                                    unsigned count, const agx_instr *I,
574                                    struct util_dynarray *copies,
575                                    BITSET_WORD *clobbered)
576 {
577    unsigned vars[16] = {0};
578    unsigned nr_vars = 0;
579 
580    /* Precondition: the reserved region is not shuffled. */
581    assert(reg >= reserved_size(rctx->shader) && "reserved is never moved");
582 
583    /* Consider the destination clobbered for the purpose of source collection.
584     * This way, killed sources already in the destination will be preserved
585     * (though possibly compacted).
586     */
587    BITSET_SET_RANGE(clobbered, reg, reg + count - 1);
588 
589    /* Collect killed clobbered sources, if any */
590    agx_foreach_ssa_src(I, s) {
591       unsigned reg = rctx->ssa_to_reg[I->src[s].value];
592       unsigned nr = rctx->ncomps[I->src[s].value];
593 
594       if (I->src[s].kill && ra_class_for_index(I->src[s]) == RA_GPR &&
595           BITSET_TEST_RANGE(clobbered, reg, reg + nr - 1)) {
596 
597          assert(nr_vars < ARRAY_SIZE(vars) &&
598                 "cannot clobber more than max variable size");
599 
600          vars[nr_vars++] = I->src[s].value;
601       }
602    }
603 
604    if (nr_vars == 0)
605       return;
606 
607    assert(I->op != AGX_OPCODE_PHI && "kill bit not set for phis");
608 
609    /* Sort by descending alignment so they are packed with natural alignment */
610    util_qsort_r(vars, nr_vars, sizeof(vars[0]), sort_by_size, rctx->sizes);
611 
612    /* Reassign in the destination region */
613    unsigned base = reg;
614 
615    /* We align vectors to their sizes, so this assertion holds as long as no
616     * instruction has a source whose scalar size is greater than the entire size
617     * of the vector destination. Yet the killed source must fit within this
618     * destination, so the destination must be bigger and therefore have bigger
619     * alignment.
620     */
621    assert((base % agx_size_align_16(rctx->sizes[vars[0]])) == 0 &&
622           "destination alignment >= largest killed source alignment");
623 
624    for (unsigned i = 0; i < nr_vars; ++i) {
625       unsigned var = vars[i];
626       unsigned var_count = rctx->ncomps[var];
627       unsigned var_align = agx_size_align_16(rctx->sizes[var]);
628 
629       assert(rctx->classes[var] == RA_GPR && "construction");
630       assert((base % var_align) == 0 && "induction");
631       assert((var_count % var_align) == 0 && "no partial variables");
632 
633       insert_copy(rctx, copies, base, var);
634       set_ssa_to_reg(rctx, var, base);
635       base += var_count;
636    }
637 
638    assert(base <= reg + count && "no overflow");
639 }
640 
641 /*
642  * When shuffling registers to assign a phi destination, we can't simply insert
643  * the required moves before the phi, since phis happen in parallel along the
644  * edge. Instead, there are two cases:
645  *
646  * 1. The source of the copy is the destination of a phi. Since we are
647  *    emitting shuffle code, there will be no more reads of that destination
648  *    with the old register. Since the phis all happen in parallel and writes
649  *    precede reads, there was no previous read of that destination either. So
650  *    the old destination is dead. Just replace the phi's destination with the
651  *    move's destination instead.
652  *
653  * 2. Otherwise, the source of the copy is a live-in value, since it's
654  *    live when assigning phis at the start of a block but it is not a phi.
655  *    If we move in parallel with the phi, the phi will still read the correct
656  *    old register regardless and the destinations can't alias. So, insert a phi
657  *    to do the copy in parallel along the incoming edges.
658  */
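/*
 * Sketch of the two cases with hypothetical registers: suppose shuffling asks
 * for a copy from register A to register B. If A is written by a phi in this
 * block, case 1 applies and that phi is simply retargeted to write B. If A
 * instead holds a live-in value, case 2 applies and a new phi writing B is
 * inserted whose sources all read A, so the copy executes in parallel with
 * the existing phis along each incoming edge.
 */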
659 static void
660 agx_emit_move_before_phi(agx_context *ctx, agx_block *block,
661                          struct agx_copy *copy)
662 {
663    assert(!copy->dest_mem && !copy->src.memory && "no memory shuffles");
664 
665    /* Look for the phi writing the destination */
666    agx_foreach_phi_in_block(block, phi) {
667       if (agx_is_equiv(agx_as_register(phi->dest[0]), copy->src) &&
668           !phi->dest[0].memory) {
669 
670          phi->dest[0].reg = copy->dest;
671          return;
672       }
673    }
674 
675    /* There wasn't such a phi, so it's live-in. Insert a phi instead. */
676    agx_builder b = agx_init_builder(ctx, agx_before_block(block));
677 
678    agx_instr *phi = agx_phi_to(&b, agx_register_like(copy->dest, copy->src),
679                                agx_num_predecessors(block));
680    assert(!copy->src.kill);
681 
682    agx_foreach_src(phi, s) {
683       phi->src[s] = copy->src;
684    }
685 }
686 
687 static unsigned
688 find_regs(struct ra_ctx *rctx, agx_instr *I, unsigned dest_idx, unsigned count,
689           unsigned align)
690 {
691    unsigned reg;
692    assert(count == align);
693 
694    enum ra_class cls = ra_class_for_index(I->dest[dest_idx]);
695 
696    if (find_regs_simple(rctx, cls, count, align, &reg)) {
697       return reg;
698    } else {
699       assert(cls == RA_GPR && "no memory live range splits");
700 
701       BITSET_DECLARE(clobbered, AGX_NUM_REGS) = {0};
702       BITSET_DECLARE(killed, AGX_NUM_REGS) = {0};
703       struct util_dynarray copies = {0};
704       util_dynarray_init(&copies, NULL);
705 
706       /* Initialize the set of registers killed by this instruction's sources */
707       agx_foreach_ssa_src(I, s) {
708          unsigned v = I->src[s].value;
709 
710          if (BITSET_TEST(rctx->visited, v) && !I->src[s].memory) {
711             unsigned base = rctx->ssa_to_reg[v];
712             unsigned nr = rctx->ncomps[v];
713 
714             assert(base + nr <= AGX_NUM_REGS);
715             BITSET_SET_RANGE(killed, base, base + nr - 1);
716          }
717       }
718 
719       reg = assign_regs_by_copying(rctx, I->dest[dest_idx], I, &copies,
720                                    clobbered, killed);
721       insert_copies_for_clobbered_killed(rctx, reg, count, I, &copies,
722                                          clobbered);
723 
724       /* Insert the necessary copies. Phis need special handling since we can't
725        * insert instructions before the phi.
726        */
727       if (I->op == AGX_OPCODE_PHI) {
728          util_dynarray_foreach(&copies, struct agx_copy, copy) {
729             agx_emit_move_before_phi(rctx->shader, rctx->block, copy);
730          }
731       } else {
732          agx_builder b = agx_init_builder(rctx->shader, agx_before_instr(I));
733          agx_emit_parallel_copies(
734             &b, copies.data,
735             util_dynarray_num_elements(&copies, struct agx_copy));
736       }
737 
738       util_dynarray_fini(&copies);
739 
740       /* assign_regs asserts this is cleared, so clear to be reassigned */
741       BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
742       return reg;
743    }
744 }
745 
746 static uint32_t
747 search_ssa_to_reg_out(struct ra_ctx *ctx, struct agx_block *blk,
748                       enum ra_class cls, unsigned ssa)
749 {
750    for (unsigned reg = 0; reg < ctx->bound[cls]; ++reg) {
751       if (blk->reg_to_ssa_out[cls][reg] == ssa)
752          return reg;
753    }
754 
755    unreachable("variable not defined in block");
756 }
757 
758 /*
759  * Loop over live-in values at the start of the block and mark their registers
760  * as in-use. We process blocks in dominance order, so this handles everything
761  * but loop headers.
762  *
763  * For loop headers, this handles the forward edges but not the back edge.
764  * However, that's okay: we don't want to reserve the registers that are
765  * defined within the loop, because then we'd get a contradiction. Instead we
766  * leave them available and then they become fixed points of a sort.
767  */
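/*
 * Sketch for the loop header case (hypothetical blocks P and L): a header
 * reached by a forward edge from P and a back edge from latch L sees two kinds
 * of live-in values. Values defined before the loop are already visited, so
 * their registers get reserved here based on P. Values defined inside the loop
 * and live across the back edge are not yet visited and are skipped; whatever
 * register they receive later inside the loop becomes the fixed point
 * mentioned above.
 */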
768 static void
769 reserve_live_in(struct ra_ctx *rctx)
770 {
771    /* If there are no predecessors, there is nothing live-in */
772    unsigned nr_preds = agx_num_predecessors(rctx->block);
773    if (nr_preds == 0)
774       return;
775 
776    agx_builder b =
777       agx_init_builder(rctx->shader, agx_before_block(rctx->block));
778 
779    int i;
780    BITSET_FOREACH_SET(i, rctx->block->live_in, rctx->shader->alloc) {
781       /* Skip values defined in loops when processing the loop header */
782       if (!BITSET_TEST(rctx->visited, i))
783          continue;
784 
785       unsigned base;
786       enum ra_class cls = rctx->classes[i];
787       enum agx_size size = rctx->sizes[i];
788 
789       /* We need to use the unrounded channel count, since the extra padding
790        * will be uninitialized and would fail RA validation.
791        */
792       unsigned channels = rctx->ncomps_unrounded[i] / agx_size_align_16(size);
793 
794       /* If we split live ranges, the variable might be defined differently at
795        * the end of each predecessor. Join them together with a phi inserted at
796        * the start of the block.
797        */
798       if (nr_preds > 1) {
799          /* We'll fill in the destination after, to coalesce one of the moves */
800          agx_instr *phi = agx_phi_to(&b, agx_null(), nr_preds);
801 
802          agx_foreach_predecessor(rctx->block, pred) {
803             unsigned pred_idx = agx_predecessor_index(rctx->block, *pred);
804 
805             phi->src[pred_idx] = agx_get_vec_index(i, size, channels);
806             phi->src[pred_idx].memory = cls == RA_MEM;
807 
808             if ((*pred)->reg_to_ssa_out[cls] == NULL) {
809                /* If this is a loop header, we don't know where the register
810                 * will end up. So, we create a phi conservatively but don't fill
811                 * it in until the end of the loop. Stash in the information
812                 * we'll need to fill in the real register later.
813                 */
814                assert(rctx->block->loop_header);
815             } else {
816                /* Otherwise, we can build the phi now */
817                phi->src[pred_idx].reg =
818                   search_ssa_to_reg_out(rctx, *pred, cls, i);
819                phi->src[pred_idx].has_reg = true;
820             }
821          }
822 
823          /* Pick the phi destination to coalesce a move. Predecessor ordering is
824           * stable, so this means all live-in values get their registers from a
825           * particular predecessor. That means that such a register allocation
826           * is valid here, because it was valid in the predecessor.
827           */
828          assert(phi->src[0].has_reg && "not loop source");
829          phi->dest[0] = phi->src[0];
830          base = phi->dest[0].reg;
831       } else {
832          /* If we don't emit a phi, there is already a unique register */
833          assert(nr_preds == 1);
834 
835          agx_block **pred = util_dynarray_begin(&rctx->block->predecessors);
836          /* TODO: Flip logic to eliminate the search */
837          base = search_ssa_to_reg_out(rctx, *pred, cls, i);
838       }
839 
840       set_ssa_to_reg(rctx, i, base);
841 
842       for (unsigned j = 0; j < rctx->ncomps[i]; ++j) {
843          BITSET_SET(rctx->used_regs[cls], base + j);
844       }
845    }
846 }
847 
848 static void
849 assign_regs(struct ra_ctx *rctx, agx_index v, unsigned reg)
850 {
851    enum ra_class cls = ra_class_for_index(v);
852    assert(reg < rctx->bound[cls] && "must not overflow register file");
853    assert(v.type == AGX_INDEX_NORMAL && "only SSA gets registers allocated");
854    set_ssa_to_reg(rctx, v.value, reg);
855 
856    assert(!BITSET_TEST(rctx->visited, v.value) && "SSA violated");
857    BITSET_SET(rctx->visited, v.value);
858 
859    assert(rctx->ncomps[v.value] >= 1);
860    unsigned end = reg + rctx->ncomps[v.value] - 1;
861 
862    assert(!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, end) &&
863           "no interference");
864    BITSET_SET_RANGE(rctx->used_regs[cls], reg, end);
865 
866    /* Phi webs need to remember which register they're assigned to */
867    struct phi_web_node *node =
868       &rctx->phi_web[phi_web_find(rctx->phi_web, v.value)];
869 
870    if (!node->assigned) {
871       node->reg = reg;
872       node->assigned = true;
873    }
874 }
875 
876 static void
877 agx_set_sources(struct ra_ctx *rctx, agx_instr *I)
878 {
879    assert(I->op != AGX_OPCODE_PHI);
880 
881    agx_foreach_ssa_src(I, s) {
882       assert(BITSET_TEST(rctx->visited, I->src[s].value) && "no phis");
883 
884       I->src[s].reg = rctx->ssa_to_reg[I->src[s].value];
885       I->src[s].has_reg = true;
886    }
887 }
888 
889 static void
890 agx_set_dests(struct ra_ctx *rctx, agx_instr *I)
891 {
892    agx_foreach_ssa_dest(I, s) {
893       I->dest[s].reg = rctx->ssa_to_reg[I->dest[s].value];
894       I->dest[s].has_reg = true;
895    }
896 }
897 
898 static unsigned
899 affinity_base_of_collect(struct ra_ctx *rctx, agx_instr *collect, unsigned src)
900 {
901    unsigned src_reg = rctx->ssa_to_reg[collect->src[src].value];
902    unsigned src_offset = src * agx_size_align_16(collect->src[src].size);
903 
904    if (src_reg >= src_offset)
905       return src_reg - src_offset;
906    else
907       return ~0;
908 }
909 
910 static bool
911 try_coalesce_with(struct ra_ctx *rctx, agx_index ssa, unsigned count,
912                   bool may_be_unvisited, unsigned *out)
913 {
914    assert(ssa.type == AGX_INDEX_NORMAL);
915    if (!BITSET_TEST(rctx->visited, ssa.value)) {
916       assert(may_be_unvisited);
917       return false;
918    }
919 
920    unsigned base = rctx->ssa_to_reg[ssa.value];
921    enum ra_class cls = ra_class_for_index(ssa);
922 
923    if (BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
924       return false;
925 
926    assert(base + count <= rctx->bound[cls] && "invariant");
927    *out = base;
928    return true;
929 }
930 
931 static unsigned
932 pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
933 {
934    agx_index idx = I->dest[d];
935    enum ra_class cls = ra_class_for_index(idx);
936    assert(idx.type == AGX_INDEX_NORMAL);
937 
938    unsigned count = rctx->ncomps[idx.value];
939    assert(count >= 1);
940 
941    unsigned align = count;
942 
943    /* Try to allocate entire phi webs compatibly */
944    unsigned phi_idx = phi_web_find(rctx->phi_web, idx.value);
945    if (rctx->phi_web[phi_idx].assigned) {
946       unsigned reg = rctx->phi_web[phi_idx].reg;
947       if ((reg % align) == 0 && reg + align < rctx->bound[cls] &&
948           !BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + align - 1))
949          return reg;
950    }
951 
952    /* Try to allocate moves compatibly with their sources */
953    if (I->op == AGX_OPCODE_MOV && I->src[0].type == AGX_INDEX_NORMAL &&
954        I->src[0].memory == I->dest[0].memory &&
955        I->src[0].size == I->dest[0].size) {
956 
957       unsigned out;
958       if (try_coalesce_with(rctx, I->src[0], count, false, &out))
959          return out;
960    }
961 
962    /* Try to allocate phis compatibly with their sources */
963    if (I->op == AGX_OPCODE_PHI) {
964       agx_foreach_ssa_src(I, s) {
965          /* Loop headers have phis with a source preceding the definition */
966          bool may_be_unvisited = rctx->block->loop_header;
967 
968          unsigned out;
969          if (try_coalesce_with(rctx, I->src[s], count, may_be_unvisited, &out))
970             return out;
971       }
972    }
973 
974    /* Try to allocate collects compatibly with their sources */
975    if (I->op == AGX_OPCODE_COLLECT) {
976       agx_foreach_ssa_src(I, s) {
977          assert(BITSET_TEST(rctx->visited, I->src[s].value) &&
978                 "registers assigned in an order compatible with dominance "
979                 "and this is not a phi node, so we have assigned a register");
980 
981          unsigned base = affinity_base_of_collect(rctx, I, s);
982          if (base >= rctx->bound[cls] || (base + count) > rctx->bound[cls])
983             continue;
984 
985          /* Unaligned destinations can happen when dest size > src size */
986          if (base % align)
987             continue;
988 
989          if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
990             return base;
991       }
992    }
993 
994    /* Try to coalesce scalar exports */
995    agx_instr *collect_phi = rctx->src_to_collect_phi[idx.value];
996    if (collect_phi && collect_phi->op == AGX_OPCODE_EXPORT) {
997       unsigned reg = collect_phi->imm;
998 
999       if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + align - 1) &&
1000           (reg % align) == 0)
1001          return reg;
1002    }
1003 
1004    /* Try to coalesce vector exports */
1005    if (collect_phi && collect_phi->op == AGX_OPCODE_SPLIT) {
1006       if (collect_phi->dest[0].type == AGX_INDEX_NORMAL) {
1007          agx_instr *exp = rctx->src_to_collect_phi[collect_phi->dest[0].value];
1008          if (exp && exp->op == AGX_OPCODE_EXPORT) {
1009             unsigned reg = exp->imm;
1010 
1011             if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg,
1012                                    reg + align - 1) &&
1013                 (reg % align) == 0)
1014                return reg;
1015          }
1016       }
1017    }
1018 
1019    /* Try to allocate sources of collects contiguously */
1020    if (collect_phi && collect_phi->op == AGX_OPCODE_COLLECT) {
1021       agx_instr *collect = collect_phi;
1022 
1023       assert(count == align && "collect sources are scalar");
1024 
1025       /* Find our offset in the collect. If our source is repeated in the
1026        * collect, this may not be unique. We arbitrarily choose the first.
1027        */
1028       unsigned our_source = ~0;
1029       agx_foreach_ssa_src(collect, s) {
1030          if (agx_is_equiv(collect->src[s], idx)) {
1031             our_source = s;
1032             break;
1033          }
1034       }
1035 
1036       assert(our_source < collect->nr_srcs && "source must be in the collect");
1037 
1038       /* See if we can allocate compatibly with any source of the collect */
1039       agx_foreach_ssa_src(collect, s) {
1040          if (!BITSET_TEST(rctx->visited, collect->src[s].value))
1041             continue;
1042 
1043          /* Determine where the collect should start relative to the source */
1044          unsigned base = affinity_base_of_collect(rctx, collect, s);
1045          if (base >= rctx->bound[cls])
1046             continue;
1047 
1048          unsigned our_reg = base + (our_source * align);
1049 
1050          /* Don't allocate past the end of the register file */
1051          if ((our_reg + align) > rctx->bound[cls])
1052             continue;
1053 
1054          /* If those registers are free, then choose them */
1055          if (!BITSET_TEST_RANGE(rctx->used_regs[cls], our_reg,
1056                                 our_reg + align - 1))
1057             return our_reg;
1058       }
1059 
1060       unsigned collect_align = rctx->ncomps[collect->dest[0].value];
1061       unsigned offset = our_source * align;
1062 
1063       /* Prefer ranges of the register file that leave room for all sources of
1064        * the collect contiguously.
1065        */
1066       for (unsigned base = 0;
1067            base + (collect->nr_srcs * align) <= rctx->bound[cls];
1068            base += collect_align) {
1069          if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base,
1070                                 base + (collect->nr_srcs * align) - 1))
1071             return base + offset;
1072       }
1073 
1074       /* Try to respect the alignment requirement of the collect destination,
1075        * which may be greater than the sources (e.g. pack_64_2x32_split). Look
1076        * for a register for the source such that the collect base is aligned.
1077        */
1078       if (collect_align > align) {
1079          for (unsigned reg = offset; reg + collect_align <= rctx->bound[cls];
1080               reg += collect_align) {
1081             if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + count - 1))
1082                return reg;
1083          }
1084       }
1085    }
1086 
1087    /* Try to allocate phi sources compatibly with their phis */
1088    if (collect_phi && collect_phi->op == AGX_OPCODE_PHI) {
1089       agx_instr *phi = collect_phi;
1090       unsigned out;
1091 
1092       agx_foreach_ssa_src(phi, s) {
1093          if (try_coalesce_with(rctx, phi->src[s], count, true, &out))
1094             return out;
1095       }
1096 
1097       /* If we're in a loop, we may have already allocated the phi. Try that. */
1098       if (phi->dest[0].has_reg) {
1099          unsigned base = phi->dest[0].reg;
1100 
1101          if (base + count <= rctx->bound[cls] &&
1102              !BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
1103             return base;
1104       }
1105    }
1106 
1107    /* Default to any contiguous sequence of registers */
1108    return find_regs(rctx, I, d, count, align);
1109 }
1110 
1111 /** Assign registers to SSA values in a block. */
1112 
1113 static void
1114 agx_ra_assign_local(struct ra_ctx *rctx)
1115 {
1116    BITSET_DECLARE(used_regs_gpr, AGX_NUM_REGS) = {0};
1117    BITSET_DECLARE(used_regs_mem, AGX_NUM_MODELED_REGS) = {0};
1118    uint16_t *ssa_to_reg = calloc(rctx->shader->alloc, sizeof(uint16_t));
1119 
1120    agx_block *block = rctx->block;
1121    uint8_t *ncomps = rctx->ncomps;
1122    rctx->used_regs[RA_GPR] = used_regs_gpr;
1123    rctx->used_regs[RA_MEM] = used_regs_mem;
1124    rctx->ssa_to_reg = ssa_to_reg;
1125 
1126    reserve_live_in(rctx);
1127 
1128    /* Force the nesting counter r0l live throughout shaders using control flow.
1129     * This could be optimized (sync with agx_calc_register_demand).
1130     */
1131    if (rctx->shader->any_cf)
1132       BITSET_SET(used_regs_gpr, 0);
1133 
1134    /* Force the zero r0h live throughout shaders using divergent shuffles. */
1135    if (rctx->shader->any_quad_divergent_shuffle) {
1136       assert(rctx->shader->any_cf);
1137       BITSET_SET(used_regs_gpr, 1);
1138    }
1139 
1140    /* Reserve bottom registers as temporaries for parallel copy lowering */
1141    if (rctx->shader->has_spill_pcopy_reserved) {
1142       BITSET_SET_RANGE(used_regs_gpr, 0, 7);
1143    }
1144 
1145    agx_foreach_instr_in_block(block, I) {
1146       rctx->instr = I;
1147 
1148       /* Optimization: if a split contains the last use of a vector, the split
1149        * can be removed by assigning the destinations overlapping the source.
1150        */
1151       if (I->op == AGX_OPCODE_SPLIT && I->src[0].kill) {
1152          assert(ra_class_for_index(I->src[0]) == RA_GPR);
1153          unsigned reg = ssa_to_reg[I->src[0].value];
1154          unsigned width = agx_size_align_16(agx_split_width(I));
1155 
1156          agx_foreach_dest(I, d) {
1157             assert(ra_class_for_index(I->dest[0]) == RA_GPR);
1158 
1159             /* Free up the source */
1160             unsigned offset_reg = reg + (d * width);
1161             BITSET_CLEAR_RANGE(used_regs_gpr, offset_reg,
1162                                offset_reg + width - 1);
1163 
1164             /* Assign the destination where the source was */
1165             if (!agx_is_null(I->dest[d]))
1166                assign_regs(rctx, I->dest[d], offset_reg);
1167          }
1168 
1169          unsigned excess =
1170             rctx->ncomps[I->src[0].value] - (I->nr_dests * width);
1171          if (excess) {
1172             BITSET_CLEAR_RANGE(used_regs_gpr, reg + (I->nr_dests * width),
1173                                reg + rctx->ncomps[I->src[0].value] - 1);
1174          }
1175 
1176          agx_set_sources(rctx, I);
1177          agx_set_dests(rctx, I);
1178          continue;
1179       } else if (I->op == AGX_OPCODE_PRELOAD) {
1180          /* We must coalesce all preload moves */
1181          assert(I->dest[0].size == I->src[0].size);
1182          assert(I->src[0].type == AGX_INDEX_REGISTER);
1183 
1184          /* r1l specifically is a preloaded register. It is reserved during
1185           * demand calculations to ensure we don't need live range shuffling of
1186           * spilling temporaries. But we can still preload to it. So if it's
1187           * reserved, just free it. It'll be fine.
1188           */
1189          if (I->src[0].value == 2) {
1190             BITSET_CLEAR(rctx->used_regs[RA_GPR], 2);
1191          }
1192 
1193          assign_regs(rctx, I->dest[0], I->src[0].value);
1194          agx_set_dests(rctx, I);
1195          continue;
1196       }
1197 
1198       /* First, free killed sources */
1199       agx_foreach_ssa_src(I, s) {
1200          if (I->src[s].kill) {
1201             assert(I->op != AGX_OPCODE_PHI && "phis don't use .kill");
1202 
1203             enum ra_class cls = ra_class_for_index(I->src[s]);
1204             unsigned reg = ssa_to_reg[I->src[s].value];
1205             unsigned count = ncomps[I->src[s].value];
1206 
1207             assert(count >= 1);
1208             BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
1209          }
1210       }
1211 
1212       /* Next, assign destinations one at a time. This is always legal
1213        * because of the SSA form.
1214        */
1215       agx_foreach_ssa_dest(I, d) {
1216          if (I->op == AGX_OPCODE_PHI && I->dest[d].has_reg)
1217             continue;
1218 
1219          assign_regs(rctx, I->dest[d], pick_regs(rctx, I, d));
1220       }
1221 
1222       /* Phi sources are special. Set in the corresponding predecessors */
1223       if (I->op != AGX_OPCODE_PHI)
1224          agx_set_sources(rctx, I);
1225 
1226       agx_set_dests(rctx, I);
1227    }
1228 
1229    for (unsigned i = 0; i < RA_CLASSES; ++i) {
1230       block->reg_to_ssa_out[i] =
1231          malloc(rctx->bound[i] * sizeof(*block->reg_to_ssa_out[i]));
1232 
1233       /* Initialize with sentinel so we don't have unused regs mapping to r0 */
1234       memset(block->reg_to_ssa_out[i], 0xFF,
1235              rctx->bound[i] * sizeof(*block->reg_to_ssa_out[i]));
1236    }
1237 
1238    int i;
1239    BITSET_FOREACH_SET(i, block->live_out, rctx->shader->alloc) {
1240       block->reg_to_ssa_out[rctx->classes[i]][rctx->ssa_to_reg[i]] = i;
1241    }
1242 
1243    /* Also set the sources for the phis in our successors, since that logically
1244     * happens now (given the possibility of live range splits, etc)
1245     */
1246    agx_foreach_successor(block, succ) {
1247       unsigned pred_idx = agx_predecessor_index(succ, block);
1248 
1249       agx_foreach_phi_in_block(succ, phi) {
1250          if (phi->src[pred_idx].type == AGX_INDEX_NORMAL &&
1251              !phi->src[pred_idx].has_reg) {
1252             /* This source needs a fixup */
1253             unsigned value = phi->src[pred_idx].value;
1254             phi->src[pred_idx].reg = rctx->ssa_to_reg[value];
1255             phi->src[pred_idx].has_reg = true;
1256          }
1257       }
1258    }
1259 
1260    free(rctx->ssa_to_reg);
1261 }
1262 
1263 /*
1264  * Lower phis to parallel copies at the logical end of a given block. If a block
1265  * needs parallel copies inserted, a successor of the block has a phi node. To
1266  * have a (nontrivial) phi node, a block must have multiple predecessors. So the
1267  * edge from the block to the successor (with phi) is not the only edge entering
1268  * the successor. Because the control flow graph has no critical edges, this
1269  * edge must therefore be the only edge leaving the block, so the block must
1270  * have only a single successor.
1271  */
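/*
 * Example of the argument with a hypothetical CFG: if block A branched
 * conditionally to B and C and B began with a phi, the edge A -> B would be
 * critical, since A has two successors and B, having a phi, has at least two
 * predecessors. Because the CFG here has no critical edges, a block whose
 * successor carries phis must have that successor as its only successor,
 * which the asserts below double-check.
 */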
1272 static void
1273 agx_insert_parallel_copies(agx_context *ctx, agx_block *block)
1274 {
1275    bool any_succ = false;
1276    unsigned nr_phi = 0;
1277 
1278    /* Phi nodes logically happen on the control flow edge, so parallel copies
1279     * are added at the end of the predecessor */
1280    agx_builder b = agx_init_builder(ctx, agx_after_block_logical(block));
1281 
1282    agx_foreach_successor(block, succ) {
1283       assert(nr_phi == 0 && "control flow graph has a critical edge");
1284 
1285       agx_foreach_phi_in_block(succ, phi) {
1286          assert(!any_succ && "control flow graph has a critical edge");
1287          nr_phi += agx_channels(phi->dest[0]);
1288       }
1289 
1290       any_succ = true;
1291 
1292       /* Nothing to do if there are no phi nodes */
1293       if (nr_phi == 0)
1294          continue;
1295 
1296       unsigned pred_index = agx_predecessor_index(succ, block);
1297 
1298       /* Create a parallel copy lowering all the phi nodes */
1299       struct agx_copy *copies = calloc(sizeof(*copies), nr_phi);
1300 
1301       unsigned i = 0;
1302 
1303       agx_foreach_phi_in_block(succ, phi) {
1304          agx_index dest = phi->dest[0];
1305          agx_index src = phi->src[pred_index];
1306 
1307          if (src.type == AGX_INDEX_IMMEDIATE)
1308             src.size = dest.size;
1309 
1310          assert(dest.type == AGX_INDEX_REGISTER);
1311          assert(dest.size == src.size);
1312 
1313          /* Scalarize the phi, since the parallel copy lowering doesn't handle
1314           * vector phis. While we scalarize phis in NIR, we can generate vector
1315           * phis from spilling so must take care.
1316           */
1317          for (unsigned c = 0; c < agx_channels(phi->dest[0]); ++c) {
1318             agx_index src_ = src;
1319             unsigned offs = c * agx_size_align_16(src.size);
1320 
1321             if (src.type != AGX_INDEX_IMMEDIATE) {
1322                assert(src.type == AGX_INDEX_UNIFORM ||
1323                       src.type == AGX_INDEX_REGISTER);
1324                src_.value += offs;
1325                src_.channels_m1 = 1 - 1;
1326             }
1327 
1328             assert(i < nr_phi);
1329             copies[i++] = (struct agx_copy){
1330                .dest = dest.value + offs,
1331                .dest_mem = dest.memory,
1332                .src = src_,
1333             };
1334          }
1335       }
1336 
1337       agx_emit_parallel_copies(&b, copies, nr_phi);
1338 
1339       free(copies);
1340    }
1341 }
1342 
1343 static void
1344 lower_exports(agx_context *ctx)
1345 {
1346    struct agx_copy copies[AGX_NUM_REGS];
1347    unsigned nr = 0;
1348    agx_block *block = agx_exit_block(ctx);
1349 
1350    agx_foreach_instr_in_block_safe(block, I) {
1351       if (I->op != AGX_OPCODE_EXPORT)
1352          continue;
1353 
1354       assert(agx_channels(I->src[0]) == 1 && "scalarized in frontend");
1355       assert(nr < ARRAY_SIZE(copies));
1356 
1357       copies[nr++] = (struct agx_copy){
1358          .dest = I->imm,
1359          .src = I->src[0],
1360       };
1361 
1362       /* We cannot use fewer registers than we export */
1363       ctx->max_reg =
1364          MAX2(ctx->max_reg, I->imm + agx_size_align_16(I->src[0].size));
1365    }
1366 
1367    agx_builder b = agx_init_builder(ctx, agx_after_block_logical(block));
1368    agx_emit_parallel_copies(&b, copies, nr);
1369 }
1370 
1371 void
1372 agx_ra(agx_context *ctx)
1373 {
1374    bool force_spilling =
1375       (agx_compiler_debug & AGX_DBG_SPILL) && ctx->key->has_scratch;
1376 
1377    /* Determine maximum possible registers. We won't exceed this! */
1378    unsigned max_possible_regs = AGX_NUM_REGS;
1379 
1380    /* Compute shaders need to have their entire workgroup together, so our
1381     * register usage is bounded by the workgroup size.
1382     */
1383    if (gl_shader_stage_is_compute(ctx->stage)) {
1384       unsigned threads_per_workgroup;
1385 
1386       /* If we don't know the workgroup size, worst case it. TODO: Optimize
1387        * this, since it'll decimate OpenCL perf.
1388        */
1389       if (ctx->nir->info.workgroup_size_variable) {
1390          threads_per_workgroup = 1024;
1391       } else {
1392          threads_per_workgroup = ctx->nir->info.workgroup_size[0] *
1393                                  ctx->nir->info.workgroup_size[1] *
1394                                  ctx->nir->info.workgroup_size[2];
1395       }
1396 
1397       max_possible_regs =
1398          agx_max_registers_for_occupancy(threads_per_workgroup);
1399    }
1400 
1401    if (force_spilling) {
1402       /* Even when testing spilling, we need enough room for preloaded/exported
1403        * regs.
1404        */
1405       unsigned d = 24;
1406       unsigned max_ncomps = 8;
1407 
1408       agx_foreach_instr_global(ctx, I) {
1409          if (I->op == AGX_OPCODE_PRELOAD) {
1410             unsigned size = agx_size_align_16(I->src[0].size);
1411             d = MAX2(d, I->src[0].value + size);
1412          } else if (I->op == AGX_OPCODE_EXPORT) {
1413             unsigned size = agx_size_align_16(I->src[0].size);
1414             d = MAX2(d, I->imm + size);
1415          } else if (I->op == AGX_OPCODE_IMAGE_WRITE) {
1416             /* vec4 source + vec4 coordinates + bindless handle + reserved */
1417             d = MAX2(d, 26);
1418          } else if (I->op == AGX_OPCODE_TEXTURE_SAMPLE &&
1419                     (I->lod_mode == AGX_LOD_MODE_LOD_GRAD ||
1420                      I->lod_mode == AGX_LOD_MODE_LOD_GRAD_MIN)) {
1421             /* as above but with big gradient */
1422             d = MAX2(d, 36);
1423          }
1424 
1425          agx_foreach_ssa_dest(I, v) {
1426             max_ncomps = MAX2(max_ncomps, agx_index_size_16(I->dest[v]));
1427          }
1428       }
1429 
1430       max_possible_regs = ALIGN_POT(d, util_next_power_of_two(max_ncomps));
1431    } else if (ctx->key->is_helper) {
1432       /* The helper program is unspillable and has a limited register file */
1433       max_possible_regs = 32;
1434    }
1435 
1436    /* Calculate the demand. We'll use it to determine if we need to spill and to
1437     * bound register assignment.
1438     */
1439    agx_compute_liveness(ctx);
1440    unsigned effective_demand = agx_calc_register_demand(ctx);
1441    bool spilling = (effective_demand > max_possible_regs);
1442 
1443    if (spilling) {
1444       assert(ctx->key->has_scratch && "internal shaders are unspillable");
1445       agx_spill(ctx, max_possible_regs);
1446 
1447       /* After spilling, recalculate liveness and demand */
1448       agx_compute_liveness(ctx);
1449       effective_demand = agx_calc_register_demand(ctx);
1450 
1451       /* The resulting program can now be assigned registers */
1452       assert(effective_demand <= max_possible_regs && "spiller post-condition");
1453    }
1454 
1455    /* Record all phi webs. First initialize the union-find data structure with
1456     * all SSA defs in their own singletons, then union together anything related
1457     * by a phi. The resulting union-find structure will be the webs.
1458     */
1459    struct phi_web_node *phi_web = calloc(ctx->alloc, sizeof(*phi_web));
1460    for (unsigned i = 0; i < ctx->alloc; ++i) {
1461       phi_web[i].parent = i;
1462    }
1463 
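        /* For instance, given x = phi(a, b) and y = phi(x, c), the unions below
         * place {x, a, b, y, c} in a single set, so the whole web can later be
         * steered toward a single register.
         */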
1464    agx_foreach_block(ctx, block) {
1465       agx_foreach_phi_in_block(block, phi) {
1466          agx_foreach_ssa_src(phi, s) {
1467             phi_web_union(phi_web, phi->dest[0].value, phi->src[s].value);
1468          }
1469       }
1470    }
1471 
1472    uint8_t *ncomps = calloc(ctx->alloc, sizeof(uint8_t));
1473    uint8_t *ncomps_unrounded = calloc(ctx->alloc, sizeof(uint8_t));
1474    enum ra_class *classes = calloc(ctx->alloc, sizeof(enum ra_class));
1475    agx_instr **src_to_collect_phi = calloc(ctx->alloc, sizeof(agx_instr *));
1476    enum agx_size *sizes = calloc(ctx->alloc, sizeof(enum agx_size));
1477    BITSET_WORD *visited = calloc(BITSET_WORDS(ctx->alloc), sizeof(BITSET_WORD));
1478    unsigned max_ncomps = 1;
1479 
1480    agx_foreach_instr_global(ctx, I) {
1481       /* Record collects/phis so we can coalesce when assigning */
1482       if (I->op == AGX_OPCODE_COLLECT || I->op == AGX_OPCODE_PHI ||
1483           I->op == AGX_OPCODE_EXPORT || I->op == AGX_OPCODE_SPLIT) {
1484          agx_foreach_ssa_src(I, s) {
1485             src_to_collect_phi[I->src[s].value] = I;
1486          }
1487       }
1488 
1489       agx_foreach_ssa_dest(I, d) {
1490          unsigned v = I->dest[d].value;
1491          assert(ncomps[v] == 0 && "broken SSA");
1492          /* Round up vectors for easier live range splitting */
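              /* (e.g. a 3-component 32-bit vector is 6 halves, rounded to 8) */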
1493          ncomps_unrounded[v] = agx_index_size_16(I->dest[d]);
1494          ncomps[v] = util_next_power_of_two(ncomps_unrounded[v]);
1495          sizes[v] = I->dest[d].size;
1496          classes[v] = ra_class_for_index(I->dest[d]);
1497 
1498          max_ncomps = MAX2(max_ncomps, ncomps[v]);
1499       }
1500    }
1501 
1502    /* For live range splitting to work properly, ensure the register file is
1503     * aligned to the larger vector size. Most of the time, this is a no-op since
1504     * the largest vector size is usually 128-bit and the register file is
1505     * naturally 128-bit aligned. However, this is required for correctness with
1506     * 3D textureGrad, which can have a source vector of length 6x32-bit,
1507     * rounding up to 256-bit and requiring special accounting here.
1508     */
1509    unsigned reg_file_alignment = MAX2(max_ncomps, 8);
1510    assert(util_is_power_of_two_nonzero(reg_file_alignment));
1511 
1512    unsigned demand = ALIGN_POT(effective_demand, reg_file_alignment);
1513    assert(demand <= max_possible_regs && "Invariant");
1514 
1515    /* Round up the demand to the maximum number of registers we can use without
1516     * affecting occupancy. This reduces live range splitting.
1517     */
1518    unsigned max_regs = agx_occupancy_for_register_count(demand).max_registers;
1519    if (ctx->key->is_helper || force_spilling)
1520       max_regs = max_possible_regs;
1521 
1522    max_regs = ROUND_DOWN_TO(max_regs, reg_file_alignment);
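        /* For normal shaders, max_regs is now the top of the occupancy tier
         * that the demand falls into, rounded down to the vector alignment;
         * the exact boundaries are whatever agx_occupancy_for_register_count()
         * reports.
         */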
1523 
1524    /* Or, we can bound tightly for debugging */
1525    if (agx_compiler_debug & AGX_DBG_DEMAND)
1526       max_regs = ALIGN_POT(MAX2(demand, 12), reg_file_alignment);
1527 
1528    /* ...but not too tightly */
1529    assert((max_regs % reg_file_alignment) == 0 && "occupancy limits aligned");
1530    assert(max_regs >= (6 * 2) && "space for vertex shader preloading");
1531    assert(max_regs <= max_possible_regs);
1532 
1533    unsigned reg_count = 0, mem_slot_count = 0;
1534 
1535    /* Assign registers in dominance-order. This coincides with source-order due
1536     * to a NIR invariant, so we do not need special handling for this.
1537     */
1538    agx_foreach_block(ctx, block) {
1539       agx_ra_assign_local(&(struct ra_ctx){
1540          .shader = ctx,
1541          .block = block,
1542          .src_to_collect_phi = src_to_collect_phi,
1543          .phi_web = phi_web,
1544          .ncomps = ncomps,
1545          .ncomps_unrounded = ncomps_unrounded,
1546          .sizes = sizes,
1547          .classes = classes,
1548          .visited = visited,
1549          .bound[RA_GPR] = max_regs,
1550          .bound[RA_MEM] = AGX_NUM_MODELED_REGS,
1551          .count[RA_GPR] = &reg_count,
1552          .count[RA_MEM] = &mem_slot_count,
1553       });
1554    }
1555 
1556    ctx->max_reg = reg_count ? (reg_count - 1) : 0;
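
        /* Spill slots in the RA_MEM class are counted in 16-bit units, hence
         * the factor of 2 when converting the slot count to scratch bytes.
         */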
1557    ctx->spill_base_B = ctx->scratch_size_B;
1558    ctx->scratch_size_B += mem_slot_count * 2;
1559 
1560    /* Vertex shaders preload the vertex/instance IDs (r5, r6) even if the shader
1561     * doesn't use them. Account for that so the preload doesn't clobber GPRs.
1562     * Hardware tessellation eval shaders preload patch/instance IDs there.
1563     */
1564    if (ctx->nir->info.stage == MESA_SHADER_VERTEX ||
1565        ctx->nir->info.stage == MESA_SHADER_TESS_EVAL)
1566       ctx->max_reg = MAX2(ctx->max_reg, 6 * 2);
1567 
1568    assert(ctx->max_reg <= max_regs);
1569 
1570    /* Validate RA after assigning registers, just before lowering out of SSA */
1571    agx_validate_ra(ctx);
1572 
1573    agx_foreach_instr_global_safe(ctx, ins) {
1574       /* Lower away SSA */
1575       agx_foreach_ssa_dest(ins, d) {
1576          ins->dest[d] =
1577             agx_replace_index(ins->dest[d], agx_as_register(ins->dest[d]));
1578       }
1579 
1580       agx_foreach_ssa_src(ins, s) {
1581          agx_replace_src(ins, s, agx_as_register(ins->src[s]));
1582       }
1583 
1584       /* Lower away RA pseudo-instructions */
1585       agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));
1586 
1587       if (ins->op == AGX_OPCODE_COLLECT) {
1588          assert(ins->dest[0].type == AGX_INDEX_REGISTER);
1589          assert(!ins->dest[0].memory);
1590 
1591          unsigned base = ins->dest[0].value;
1592          unsigned width = agx_size_align_16(ins->src[0].size);
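              /* For example, a collect of two 32-bit sources whose destination
               * starts at half-register N lowers to copies N <- src[0] and
               * N+2 <- src[1], since width is 2 halves per 32-bit source.
               */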
1593 
1594          struct agx_copy *copies = alloca(sizeof(copies[0]) * ins->nr_srcs);
1595          unsigned n = 0;
1596 
1597          /* Move the sources */
1598          agx_foreach_src(ins, i) {
1599             if (agx_is_null(ins->src[i]) || ins->src[i].type == AGX_INDEX_UNDEF)
1600                continue;
1601             assert(ins->src[i].size == ins->src[0].size);
1602 
1603             assert(n < ins->nr_srcs);
1604             copies[n++] = (struct agx_copy){
1605                .dest = base + (i * width),
1606                .src = ins->src[i],
1607             };
1608          }
1609 
1610          agx_emit_parallel_copies(&b, copies, n);
1611          agx_remove_instruction(ins);
1612          continue;
1613       } else if (ins->op == AGX_OPCODE_SPLIT) {
1614          assert(ins->src[0].type == AGX_INDEX_REGISTER ||
1615                 ins->src[0].type == AGX_INDEX_UNIFORM);
1616 
1617          struct agx_copy copies[4];
1618          assert(ins->nr_dests <= ARRAY_SIZE(copies));
1619 
1620          unsigned n = 0;
1621          unsigned width = agx_size_align_16(agx_split_width(ins));
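              /* Mirror of collect: component i is read from the source base
               * plus i * width and copied to dest[i]; destinations without a
               * register (unused components) are skipped.
               */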
1622 
1623          /* Copy out the components */
1624          agx_foreach_dest(ins, i) {
1625             if (ins->dest[i].type != AGX_INDEX_REGISTER)
1626                continue;
1627 
1628             assert(!ins->dest[i].memory);
1629 
1630             agx_index src = ins->src[0];
1631             src.size = ins->dest[i].size;
1632             src.channels_m1 = 0;
1633             src.value += (i * width);
1634 
1635             assert(n < ARRAY_SIZE(copies));
1636             copies[n++] = (struct agx_copy){
1637                .dest = ins->dest[i].value,
1638                .src = src,
1639             };
1640          }
1641 
1642          /* Lower away */
1644          agx_emit_parallel_copies(&b, copies, n);
1645          agx_remove_instruction(ins);
1646          continue;
1647       }
1648    }
1649 
1650    /* Insert parallel copies lowering phi nodes and exports */
1651    agx_foreach_block(ctx, block) {
1652       agx_insert_parallel_copies(ctx, block);
1653    }
1654 
1655    lower_exports(ctx);
1656 
1657    agx_foreach_instr_global_safe(ctx, I) {
1658       switch (I->op) {
1659       /* Pseudoinstructions for RA must be removed now */
1660       case AGX_OPCODE_PHI:
1661       case AGX_OPCODE_PRELOAD:
1662          agx_remove_instruction(I);
1663          break;
1664 
1665       /* Coalesced moves can be removed */
1666       case AGX_OPCODE_MOV:
1667          if (I->src[0].type == AGX_INDEX_REGISTER &&
1668              I->dest[0].size == I->src[0].size &&
1669              I->src[0].value == I->dest[0].value &&
1670              I->src[0].memory == I->dest[0].memory) {
1671 
1672             assert(I->dest[0].type == AGX_INDEX_REGISTER);
1673             agx_remove_instruction(I);
1674          }
1675          break;
1676 
1677       default:
1678          break;
1679       }
1680    }
1681 
1682    if (spilling)
1683       agx_lower_spill(ctx);
1684 
1685    agx_foreach_block(ctx, block) {
1686       for (unsigned i = 0; i < ARRAY_SIZE(block->reg_to_ssa_out); ++i) {
1687          free(block->reg_to_ssa_out[i]);
1688          block->reg_to_ssa_out[i] = NULL;
1689       }
1690    }
1691 
1692    free(phi_web);
1693    free(src_to_collect_phi);
1694    free(ncomps);
1695    free(ncomps_unrounded);
1696    free(sizes);
1697    free(classes);
1698    free(visited);
1699 }
1700