asahi/compiler/agx_register_allocate.c

/*
 * Copyright 2021 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "util/bitset.h"
#include "util/macros.h"
#include "util/u_dynarray.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/u_qsort.h"
#include "agx_builder.h"
#include "agx_compile.h"
#include "agx_compiler.h"
#include "agx_debug.h"
#include "agx_opcodes.h"
#include "shader_enums.h"

/* SSA-based register allocator */
struct phi_web_node {
   /* Parent index, or circular for root */
   uint32_t parent;

   /* If root, assigned register, or ~0 if no register assigned. */
   uint16_t reg;
   bool assigned;

   /* Rank, at most log2(n) so need ~5-bits */
   uint8_t rank;
};
static_assert(sizeof(struct phi_web_node) == 8, "packed");

static unsigned
phi_web_find(struct phi_web_node *web, unsigned x)
{
   if (web[x].parent == x) {
      /* Root */
      return x;
   } else {
      /* Search up the tree */
      unsigned root = x;
      while (web[root].parent != root)
         root = web[root].parent;

      /* Compress path. Second pass ensures O(1) memory usage. */
      while (web[x].parent != x) {
         unsigned temp = web[x].parent;
         web[x].parent = root;
         x = temp;
      }

      return root;
   }
}

static void
phi_web_union(struct phi_web_node *web, unsigned x, unsigned y)
{
   x = phi_web_find(web, x);
   y = phi_web_find(web, y);

   if (x == y)
      return;

   /* Union-by-rank: ensure x.rank >= y.rank */
   if (web[x].rank < web[y].rank) {
      unsigned temp = x;
      x = y;
      y = temp;
   }

   web[y].parent = x;

   /* Increment rank if necessary */
   if (web[x].rank == web[y].rank) {
      web[x].rank++;
   }
}

struct ra_ctx {
   agx_context *shader;
   agx_block *block;
   agx_instr *instr;
   uint16_t *ssa_to_reg;
   uint8_t *ncomps;
   uint8_t *ncomps_unrounded;
   enum agx_size *sizes;
   enum ra_class *classes;
   BITSET_WORD *visited;
   BITSET_WORD *used_regs[RA_CLASSES];

   /* Maintained while assigning registers. Count of registers required, i.e.
    * the maximum register assigned + 1.
    */
   unsigned *count[RA_CLASSES];

   /* For affinities */
   agx_instr **src_to_collect_phi;
   struct phi_web_node *phi_web;

   /* If bit i of used_regs is set, and register i is the first consecutive
    * register holding an SSA value, then reg_to_ssa[i] is the SSA index of the
    * value currently in register  i.
    *
    * Only for GPRs. We can add reg classes later if we have a use case.
    */
   uint32_t reg_to_ssa[AGX_NUM_REGS];

   /* Maximum number of registers that RA is allowed to use */
   unsigned bound[RA_CLASSES];
};

/*
 * RA treats the nesting counter, the divergent shuffle temporary, and the
 * spiller temporaries as alive throughout if used anywhere. This could be
 * optimized. Using a single power-of-two reserved region at the start ensures
 * these registers are never shuffled.
 */
static unsigned
reserved_size(agx_context *ctx)
{
   if (ctx->has_spill_pcopy_reserved)
      return 8;
   else if (ctx->any_quad_divergent_shuffle)
      return 2;
   else if (ctx->any_cf)
      return 1;
   else
      return 0;
}

UNUSED static void
print_reg_file(struct ra_ctx *rctx, FILE *fp)
{
   unsigned reserved = reserved_size(rctx->shader);

   /* Dump the contents */
   for (unsigned i = reserved; i < rctx->bound[RA_GPR]; ++i) {
      if (BITSET_TEST(rctx->used_regs[RA_GPR], i)) {
         uint32_t ssa = rctx->reg_to_ssa[i];
         unsigned n = rctx->ncomps[ssa];
         fprintf(fp, "h%u...%u: %u\n", i, i + n - 1, ssa);
         i += (n - 1);
      }
   }
   fprintf(fp, "\n");

   /* Dump a visualization of the sizes to understand what live range
    * splitting is up against.
    */
   for (unsigned i = 0; i < rctx->bound[RA_GPR]; ++i) {
      /* Space out 16-bit vec4s */
      if (i && (i % 4) == 0) {
         fprintf(fp, " ");
      }

      if (i < reserved) {
         fprintf(fp, "-");
      } else if (BITSET_TEST(rctx->used_regs[RA_GPR], i)) {
         uint32_t ssa = rctx->reg_to_ssa[i];
         unsigned n = rctx->ncomps[ssa];
         for (unsigned j = 0; j < n; ++j) {
            assert(n < 10);
            fprintf(fp, "%u", n);
         }

         i += (n - 1);
      } else {
         fprintf(fp, ".");
      }
   }
   fprintf(fp, "\n\n");
}

enum agx_size
agx_split_width(const agx_instr *I)
{
   enum agx_size width = ~0;

   agx_foreach_dest(I, d) {
      if (I->dest[d].type == AGX_INDEX_NULL)
         continue;
      else if (width != ~0)
         assert(width == I->dest[d].size);
      else
         width = I->dest[d].size;
   }

   assert(width != ~0 && "should have been DCE'd");
   return width;
}

/*
 * Calculate register demand in 16-bit registers, while gathering widths and
 * classes. Becuase we allocate in SSA, this calculation is exact in
 * linear-time. Depends on liveness information.
 */
static unsigned
agx_calc_register_demand(agx_context *ctx)
{
   /* Print detailed demand calculation, helpful to debug spilling */
   bool debug = false;

   if (debug) {
      agx_print_shader(ctx, stdout);
   }

   uint8_t *widths = calloc(ctx->alloc, sizeof(uint8_t));
   enum ra_class *classes = calloc(ctx->alloc, sizeof(enum ra_class));

   agx_foreach_instr_global(ctx, I) {
      agx_foreach_ssa_dest(I, d) {
         unsigned v = I->dest[d].value;
         assert(widths[v] == 0 && "broken SSA");
         /* Round up vectors for easier live range splitting */
         widths[v] = util_next_power_of_two(agx_index_size_16(I->dest[d]));
         classes[v] = ra_class_for_index(I->dest[d]);
      }
   }

   /* Calculate demand at the start of each block based on live-in, then update
    * for each instruction processed. Calculate rolling maximum.
    */
   unsigned max_demand = 0;

   agx_foreach_block(ctx, block) {
      unsigned demand = reserved_size(ctx);

      /* Everything live-in */
      {
         int i;
         BITSET_FOREACH_SET(i, block->live_in, ctx->alloc) {
            if (classes[i] == RA_GPR)
               demand += widths[i];
         }
      }

      max_demand = MAX2(demand, max_demand);

      /* To handle non-power-of-two vectors, sometimes live range splitting
       * needs extra registers for 1 instruction. This counter tracks the number
       * of registers to be freed after 1 extra instruction.
       */
      unsigned late_kill_count = 0;

      if (debug) {
         printf("\n");
      }

      agx_foreach_instr_in_block(block, I) {
         /* Phis happen in parallel and are already accounted for in the live-in
          * set, just skip them so we don't double count.
          */
         if (I->op == AGX_OPCODE_PHI)
            continue;

         if (debug) {
            printf("%u: ", demand);
            agx_print_instr(I, stdout);
         }

         if (I->op == AGX_OPCODE_PRELOAD) {
            unsigned size = agx_size_align_16(I->src[0].size);
            max_demand = MAX2(max_demand, I->src[0].value + size);
         } else if (I->op == AGX_OPCODE_EXPORT) {
            unsigned size = agx_size_align_16(I->src[0].size);
            max_demand = MAX2(max_demand, I->imm + size);
         }

         /* Handle late-kill registers from last instruction */
         demand -= late_kill_count;
         late_kill_count = 0;

         /* Kill sources the first time we see them */
         agx_foreach_src(I, s) {
            if (!I->src[s].kill)
               continue;
            assert(I->src[s].type == AGX_INDEX_NORMAL);
            if (ra_class_for_index(I->src[s]) != RA_GPR)
               continue;

            bool skip = false;

            for (unsigned backwards = 0; backwards < s; ++backwards) {
               if (agx_is_equiv(I->src[backwards], I->src[s])) {
                  skip = true;
                  break;
               }
            }

            if (!skip)
               demand -= widths[I->src[s].value];
         }

         /* Make destinations live */
         agx_foreach_ssa_dest(I, d) {
            if (ra_class_for_index(I->dest[d]) != RA_GPR)
               continue;

            /* Live range splits allocate at power-of-two granularity. Round up
             * destination sizes (temporarily) to powers-of-two.
             */
            unsigned real_width = widths[I->dest[d].value];
            unsigned pot_width = util_next_power_of_two(real_width);

            demand += pot_width;
            late_kill_count += (pot_width - real_width);
         }

         max_demand = MAX2(demand, max_demand);
      }

      demand -= late_kill_count;
   }

   free(widths);
   free(classes);
   return max_demand;
}

static bool
find_regs_simple(struct ra_ctx *rctx, enum ra_class cls, unsigned count,
                 unsigned align, unsigned *out)
{
   for (unsigned reg = 0; reg + count <= rctx->bound[cls]; reg += align) {
      if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + count - 1)) {
         *out = reg;
         return true;
      }
   }

   return false;
}

/*
 * Search the register file for the best contiguous aligned region of the given
 * size to evict when shuffling registers. The region must not contain any
 * register marked in the passed bitset.
 *
 * As a hint, this also takes in the set of registers from killed sources passed
 * to this instruction. These should be deprioritized, since they are more
 * expensive to use (extra moves to shuffle the contents away).
 *
 * Precondition: such a region exists.
 *
 * Postcondition: at least one register in the returned region is already free.
 */
static unsigned
find_best_region_to_evict(struct ra_ctx *rctx, enum ra_class cls, unsigned size,
                          BITSET_WORD *already_evicted, BITSET_WORD *killed)
{
   assert(util_is_power_of_two_or_zero(size) && "precondition");
   assert((rctx->bound[cls] % size) == 0 &&
          "register file size must be aligned to the maximum vector size");
   assert(cls == RA_GPR);

   /* Useful for testing RA */
   bool invert = false;

   unsigned best_base = ~0;
   unsigned best_moves = invert ? 0 : ~0;

   for (unsigned base = 0; base + size <= rctx->bound[cls]; base += size) {
      /* The first k registers are preallocated and unevictable, so must be
       * skipped. By itself, this does not pose a problem. We are allocating n
       * registers, but this region has at most n-k free.  Since there are at
       * least n free registers total, there is at least k free registers
       * outside this region. Choose any such free register. The region
       * containing it has at most n-1 occupied registers. In the worst case,
       * n-k of those registers are are moved to the beginning region and the
       * remaining (n-1)-(n-k) = k-1 registers are moved to the k-1 free
       * registers in other regions, given there are k free registers total.
       * These recursive shuffles work out because everything is power-of-two
       * sized and naturally aligned, so the sizes shuffled are strictly
       * descending. So, we do not need extra registers to handle "single
       * region" unevictability.
       */
      if (base < reserved_size(rctx->shader))
         continue;

      /* Do not evict the same register multiple times. It's not necessary since
       * we're just shuffling, there are enough free registers elsewhere.
       */
      if (BITSET_TEST_RANGE(already_evicted, base, base + size - 1))
         continue;

      /* Estimate the number of moves required if we pick this region */
      unsigned moves = 0;
      bool any_free = false;

      for (unsigned reg = base; reg < base + size; ++reg) {
         /* We need a move for each blocked register (TODO: we only need a
          * single move for 32-bit pairs, could optimize to use that instead.)
          */
         if (BITSET_TEST(rctx->used_regs[cls], reg))
            moves++;
         else
            any_free = true;

         /* Each clobbered killed register requires a move or a swap. Since
          * swaps require more instructions, assign a higher cost here. In
          * practice, 3 is too high but 2 is slightly better than 1.
          */
         if (BITSET_TEST(killed, reg))
            moves += 2;
      }

      /* Pick the region requiring fewest moves as a heuristic. Regions with no
       * free registers are skipped even if the heuristic estimates a lower cost
       * (due to killed sources), since the recursive splitting algorithm
       * requires at least one free register.
       */
      if (any_free && ((moves < best_moves) ^ invert)) {
         best_moves = moves;
         best_base = base;
      }
   }

   assert(best_base < rctx->bound[cls] &&
          "not enough registers (should have spilled already)");
   return best_base;
}

static void
set_ssa_to_reg(struct ra_ctx *rctx, unsigned ssa, unsigned reg)
{
   enum ra_class cls = rctx->classes[ssa];
   *(rctx->count[cls]) = MAX2(*(rctx->count[cls]), reg + rctx->ncomps[ssa]);

   rctx->ssa_to_reg[ssa] = reg;

   if (cls == RA_GPR) {
      rctx->reg_to_ssa[reg] = ssa;
   }
}

/*
 * Insert parallel copies to move an SSA variable `var` to a new register
 * `new_reg`. This may require scalarizing.
 */
static void
insert_copy(struct ra_ctx *rctx, struct util_dynarray *copies, unsigned new_reg,
            unsigned var)
{
   enum agx_size size = rctx->sizes[var];
   unsigned align = agx_size_align_16(size);

   for (unsigned i = 0; i < rctx->ncomps[var]; i += align) {
      struct agx_copy copy = {
         .dest = new_reg + i,
         .src = agx_register(rctx->ssa_to_reg[var] + i, size),
      };

      assert((copy.dest % align) == 0 && "new dest must be aligned");
      assert((copy.src.value % align) == 0 && "src must be aligned");
      util_dynarray_append(copies, struct agx_copy, copy);
   }
}

static unsigned
assign_regs_by_copying(struct ra_ctx *rctx, agx_index dest, const agx_instr *I,
                       struct util_dynarray *copies, BITSET_WORD *clobbered,
                       BITSET_WORD *killed)
{
   assert(dest.type == AGX_INDEX_NORMAL);

   /* Initialize the worklist with the variable we're assigning */
   unsigned blocked_vars[16] = {dest.value};
   size_t nr_blocked = 1;

   while (nr_blocked > 0) {
      /* Grab the largest var. TODO: Consider not writing O(N^2) code. */
      uint32_t ssa = ~0, nr = 0, chosen_idx = ~0;
      for (unsigned i = 0; i < nr_blocked; ++i) {
         uint32_t this_ssa = blocked_vars[i];
         uint32_t this_nr = rctx->ncomps[this_ssa];

         if (this_nr > nr) {
            nr = this_nr;
            ssa = this_ssa;
            chosen_idx = i;
         }
      }

      assert(ssa != ~0 && nr > 0 && "must have found something");
      assert(chosen_idx < nr_blocked && "must have found something");

      /* Pop it from the work list by swapping in the last element */
      blocked_vars[chosen_idx] = blocked_vars[--nr_blocked];

      /* We need to shuffle some variables to make room. Look for a range of
       * the register file that is partially blocked.
       */
      unsigned new_reg =
         find_best_region_to_evict(rctx, RA_GPR, nr, clobbered, killed);

      /* Blocked registers need to get reassigned. Add them to the worklist. */
      for (unsigned i = 0; i < nr; ++i) {
         if (BITSET_TEST(rctx->used_regs[RA_GPR], new_reg + i)) {
            unsigned blocked_reg = new_reg + i;
            uint32_t blocked_ssa = rctx->reg_to_ssa[blocked_reg];
            uint32_t blocked_nr = rctx->ncomps[blocked_ssa];

            assert(blocked_nr >= 1 && "must be assigned");

            blocked_vars[nr_blocked++] = blocked_ssa;
            assert(
               rctx->ssa_to_reg[blocked_ssa] == blocked_reg &&
               "variable must start within the range, since vectors are limited");

            for (unsigned j = 0; j < blocked_nr; ++j) {
               assert(
                  BITSET_TEST(rctx->used_regs[RA_GPR], new_reg + i + j) &&
                  "variable is allocated contiguous and vectors are limited, "
                  "so evicted in full");
            }

            /* Skip to the next variable */
            i += blocked_nr - 1;
         }
      }

      /* We are going to allocate to this range, so it is now fully used. Mark
       * it as such so we don't reassign here later.
       */
      BITSET_SET_RANGE(rctx->used_regs[RA_GPR], new_reg, new_reg + nr - 1);

      /* The first iteration is special: it is the original allocation of a
       * variable. All subsequent iterations pick a new register for a blocked
       * variable. For those, copy the blocked variable to its new register.
       */
      if (ssa != dest.value) {
         insert_copy(rctx, copies, new_reg, ssa);
      }

      /* Mark down the set of clobbered registers, so that killed sources may be
       * handled correctly later.
       */
      BITSET_SET_RANGE(clobbered, new_reg, new_reg + nr - 1);

      /* Update bookkeeping for this variable */
      set_ssa_to_reg(rctx, ssa, new_reg);
   }

   return rctx->ssa_to_reg[dest.value];
}

static int
sort_by_size(const void *a_, const void *b_, void *sizes_)
{
   const enum agx_size *sizes = sizes_;
   const unsigned *a = a_, *b = b_;

   return sizes[*b] - sizes[*a];
}

/*
 * Allocating a destination of n consecutive registers may require moving those
 * registers' contents to the locations of killed sources. For the instruction
 * to read the correct values, the killed sources themselves need to be moved to
 * the space where the destination will go.
 *
 * This is legal because there is no interference between the killed source and
 * the destination. This is always possible because, after this insertion, the
 * destination needs to contain the killed sources already overlapping with the
 * destination (size k) plus the killed sources clobbered to make room for
 * livethrough sources overlapping with the destination (at most size |dest|-k),
 * so the total size is at most k + |dest| - k = |dest| and so fits in the dest.
 * Sorting by alignment may be necessary.
 */
static void
insert_copies_for_clobbered_killed(struct ra_ctx *rctx, unsigned reg,
                                   unsigned count, const agx_instr *I,
                                   struct util_dynarray *copies,
                                   BITSET_WORD *clobbered)
{
   unsigned vars[16] = {0};
   unsigned nr_vars = 0;

   /* Precondition: the reserved region is not shuffled. */
   assert(reg >= reserved_size(rctx->shader) && "reserved is never moved");

   /* Consider the destination clobbered for the purpose of source collection.
    * This way, killed sources already in the destination will be preserved
    * (though possibly compacted).
    */
   BITSET_SET_RANGE(clobbered, reg, reg + count - 1);

   /* Collect killed clobbered sources, if any */
   agx_foreach_ssa_src(I, s) {
      unsigned reg = rctx->ssa_to_reg[I->src[s].value];
      unsigned nr = rctx->ncomps[I->src[s].value];

      if (I->src[s].kill && ra_class_for_index(I->src[s]) == RA_GPR &&
          BITSET_TEST_RANGE(clobbered, reg, reg + nr - 1)) {

         assert(nr_vars < ARRAY_SIZE(vars) &&
                "cannot clobber more than max variable size");

         vars[nr_vars++] = I->src[s].value;
      }
   }

   if (nr_vars == 0)
      return;

   assert(I->op != AGX_OPCODE_PHI && "kill bit not set for phis");

   /* Sort by descending alignment so they are packed with natural alignment */
   util_qsort_r(vars, nr_vars, sizeof(vars[0]), sort_by_size, rctx->sizes);

   /* Reassign in the destination region */
   unsigned base = reg;

   /* We align vectors to their sizes, so this assertion holds as long as no
    * instruction has a source whose scalar size is greater than the entire size
    * of the vector destination. Yet the killed source must fit within this
    * destination, so the destination must be bigger and therefore have bigger
    * alignment.
    */
   assert((base % agx_size_align_16(rctx->sizes[vars[0]])) == 0 &&
          "destination alignment >= largest killed source alignment");

   for (unsigned i = 0; i < nr_vars; ++i) {
      unsigned var = vars[i];
      unsigned var_count = rctx->ncomps[var];
      unsigned var_align = agx_size_align_16(rctx->sizes[var]);

      assert(rctx->classes[var] == RA_GPR && "construction");
      assert((base % var_align) == 0 && "induction");
      assert((var_count % var_align) == 0 && "no partial variables");

      insert_copy(rctx, copies, base, var);
      set_ssa_to_reg(rctx, var, base);
      base += var_count;
   }

   assert(base <= reg + count && "no overflow");
}

/*
 * When shuffling registers to assign a phi destination, we can't simply insert
 * the required moves before the phi, since phis happen in parallel along the
 * edge. Instead, there are two cases:
 *
 * 1. The source of the copy is the destination of a phi. Since we are
 *    emitting shuffle code, there will be no more reads of that destination
 *    with the old register. Since the phis all happen in parallel and writes
 *    precede reads, there was no previous read of that destination either. So
 *    the old destination is dead. Just replace the phi's destination with the
 *    moves's destination instead.
 *
 * 2. Otherwise, the source of the copy is a live-in value, since it's
 *    live when assigning phis at the start of a block but it is not a phi.
 *    If we move in parallel with the phi, the phi will still read the correct
 *    old register regardless and the destinations can't alias. So, insert a phi
 *    to do the copy in parallel along the incoming edges.
 */
static void
agx_emit_move_before_phi(agx_context *ctx, agx_block *block,
                         struct agx_copy *copy)
{
   assert(!copy->dest_mem && !copy->src.memory && "no memory shuffles");

   /* Look for the phi writing the destination */
   agx_foreach_phi_in_block(block, phi) {
      if (agx_is_equiv(agx_as_register(phi->dest[0]), copy->src) &&
          !phi->dest[0].memory) {

         phi->dest[0].reg = copy->dest;
         return;
      }
   }

   /* There wasn't such a phi, so it's live-in. Insert a phi instead. */
   agx_builder b = agx_init_builder(ctx, agx_before_block(block));

   agx_instr *phi = agx_phi_to(&b, agx_register_like(copy->dest, copy->src),
                               agx_num_predecessors(block));
   assert(!copy->src.kill);

   agx_foreach_src(phi, s) {
      phi->src[s] = copy->src;
   }
}

static unsigned
find_regs(struct ra_ctx *rctx, agx_instr *I, unsigned dest_idx, unsigned count,
          unsigned align)
{
   unsigned reg;
   assert(count == align);

   enum ra_class cls = ra_class_for_index(I->dest[dest_idx]);

   if (find_regs_simple(rctx, cls, count, align, &reg)) {
      return reg;
   } else {
      assert(cls == RA_GPR && "no memory live range splits");

      BITSET_DECLARE(clobbered, AGX_NUM_REGS) = {0};
      BITSET_DECLARE(killed, AGX_NUM_REGS) = {0};
      struct util_dynarray copies = {0};
      util_dynarray_init(&copies, NULL);

      /* Initialize the set of registers killed by this instructions' sources */
      agx_foreach_ssa_src(I, s) {
         unsigned v = I->src[s].value;

         if (BITSET_TEST(rctx->visited, v) && !I->src[s].memory) {
            unsigned base = rctx->ssa_to_reg[v];
            unsigned nr = rctx->ncomps[v];

            assert(base + nr <= AGX_NUM_REGS);
            BITSET_SET_RANGE(killed, base, base + nr - 1);
         }
      }

      reg = assign_regs_by_copying(rctx, I->dest[dest_idx], I, &copies,
                                   clobbered, killed);
      insert_copies_for_clobbered_killed(rctx, reg, count, I, &copies,
                                         clobbered);

      /* Insert the necessary copies. Phis need special handling since we can't
       * insert instructions before the phi.
       */
      if (I->op == AGX_OPCODE_PHI) {
         util_dynarray_foreach(&copies, struct agx_copy, copy) {
            agx_emit_move_before_phi(rctx->shader, rctx->block, copy);
         }
      } else {
         agx_builder b = agx_init_builder(rctx->shader, agx_before_instr(I));
         agx_emit_parallel_copies(
            &b, copies.data,
            util_dynarray_num_elements(&copies, struct agx_copy));
      }

      util_dynarray_fini(&copies);

      /* assign_regs asserts this is cleared, so clear to be reassigned */
      BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
      return reg;
   }
}

static uint32_t
search_ssa_to_reg_out(struct ra_ctx *ctx, struct agx_block *blk,
                      enum ra_class cls, unsigned ssa)
{
   for (unsigned reg = 0; reg < ctx->bound[cls]; ++reg) {
      if (blk->reg_to_ssa_out[cls][reg] == ssa)
         return reg;
   }

   unreachable("variable not defined in block");
}

/*
 * Loop over live-in values at the start of the block and mark their registers
 * as in-use. We process blocks in dominance order, so this handles everything
 * but loop headers.
 *
 * For loop headers, this handles the forward edges but not the back edge.
 * However, that's okay: we don't want to reserve the registers that are
 * defined within the loop, because then we'd get a contradiction. Instead we
 * leave them available and then they become fixed points of a sort.
 */
static void
reserve_live_in(struct ra_ctx *rctx)
{
   /* If there are no predecessors, there is nothing live-in */
   unsigned nr_preds = agx_num_predecessors(rctx->block);
   if (nr_preds == 0)
      return;

   agx_builder b =
      agx_init_builder(rctx->shader, agx_before_block(rctx->block));

   int i;
   BITSET_FOREACH_SET(i, rctx->block->live_in, rctx->shader->alloc) {
      /* Skip values defined in loops when processing the loop header */
      if (!BITSET_TEST(rctx->visited, i))
         continue;

      unsigned base;
      enum ra_class cls = rctx->classes[i];
      enum agx_size size = rctx->sizes[i];

      /* We need to use the unrounded channel count, since the extra padding
       * will be uninitialized and would fail RA validation.
       */
      unsigned channels = rctx->ncomps_unrounded[i] / agx_size_align_16(size);

      /* If we split live ranges, the variable might be defined differently at
       * the end of each predecessor. Join them together with a phi inserted at
       * the start of the block.
       */
      if (nr_preds > 1) {
         /* We'll fill in the destination after, to coalesce one of the moves */
         agx_instr *phi = agx_phi_to(&b, agx_null(), nr_preds);

         agx_foreach_predecessor(rctx->block, pred) {
            unsigned pred_idx = agx_predecessor_index(rctx->block, *pred);

            phi->src[pred_idx] = agx_get_vec_index(i, size, channels);
            phi->src[pred_idx].memory = cls == RA_MEM;

            if ((*pred)->reg_to_ssa_out[cls] == NULL) {
               /* If this is a loop header, we don't know where the register
                * will end up. So, we create a phi conservatively but don't fill
                * it in until the end of the loop. Stash in the information
                * we'll need to fill in the real register later.
                */
               assert(rctx->block->loop_header);
            } else {
               /* Otherwise, we can build the phi now */
               phi->src[pred_idx].reg =
                  search_ssa_to_reg_out(rctx, *pred, cls, i);
               phi->src[pred_idx].has_reg = true;
            }
         }

         /* Pick the phi destination to coalesce a move. Predecessor ordering is
          * stable, so this means all live-in values get their registers from a
          * particular predecessor. That means that such a register allocation
          * is valid here, because it was valid in the predecessor.
          */
         assert(phi->src[0].has_reg && "not loop source");
         phi->dest[0] = phi->src[0];
         base = phi->dest[0].reg;
      } else {
         /* If we don't emit a phi, there is already a unique register */
         assert(nr_preds == 1);

         agx_block **pred = util_dynarray_begin(&rctx->block->predecessors);
         /* TODO: Flip logic to eliminate the search */
         base = search_ssa_to_reg_out(rctx, *pred, cls, i);
      }

      set_ssa_to_reg(rctx, i, base);

      for (unsigned j = 0; j < rctx->ncomps[i]; ++j) {
         BITSET_SET(rctx->used_regs[cls], base + j);
      }
   }
}

static void
assign_regs(struct ra_ctx *rctx, agx_index v, unsigned reg)
{
   enum ra_class cls = ra_class_for_index(v);
   assert(reg < rctx->bound[cls] && "must not overflow register file");
   assert(v.type == AGX_INDEX_NORMAL && "only SSA gets registers allocated");
   set_ssa_to_reg(rctx, v.value, reg);

   assert(!BITSET_TEST(rctx->visited, v.value) && "SSA violated");
   BITSET_SET(rctx->visited, v.value);

   assert(rctx->ncomps[v.value] >= 1);
   unsigned end = reg + rctx->ncomps[v.value] - 1;

   assert(!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, end) &&
          "no interference");
   BITSET_SET_RANGE(rctx->used_regs[cls], reg, end);

   /* Phi webs need to remember which register they're assigned to */
   struct phi_web_node *node =
      &rctx->phi_web[phi_web_find(rctx->phi_web, v.value)];

   if (!node->assigned) {
      node->reg = reg;
      node->assigned = true;
   }
}

static void
agx_set_sources(struct ra_ctx *rctx, agx_instr *I)
{
   assert(I->op != AGX_OPCODE_PHI);

   agx_foreach_ssa_src(I, s) {
      assert(BITSET_TEST(rctx->visited, I->src[s].value) && "no phis");

      I->src[s].reg = rctx->ssa_to_reg[I->src[s].value];
      I->src[s].has_reg = true;
   }
}

static void
agx_set_dests(struct ra_ctx *rctx, agx_instr *I)
{
   agx_foreach_ssa_dest(I, s) {
      I->dest[s].reg = rctx->ssa_to_reg[I->dest[s].value];
      I->dest[s].has_reg = true;
   }
}

static unsigned
affinity_base_of_collect(struct ra_ctx *rctx, agx_instr *collect, unsigned src)
{
   unsigned src_reg = rctx->ssa_to_reg[collect->src[src].value];
   unsigned src_offset = src * agx_size_align_16(collect->src[src].size);

   if (src_reg >= src_offset)
      return src_reg - src_offset;
   else
      return ~0;
}

static bool
try_coalesce_with(struct ra_ctx *rctx, agx_index ssa, unsigned count,
                  bool may_be_unvisited, unsigned *out)
{
   assert(ssa.type == AGX_INDEX_NORMAL);
   if (!BITSET_TEST(rctx->visited, ssa.value)) {
      assert(may_be_unvisited);
      return false;
   }

   unsigned base = rctx->ssa_to_reg[ssa.value];
   enum ra_class cls = ra_class_for_index(ssa);

   if (BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
      return false;

   assert(base + count <= rctx->bound[cls] && "invariant");
   *out = base;
   return true;
}

static unsigned
pick_regs(struct ra_ctx *rctx, agx_instr *I, unsigned d)
{
   agx_index idx = I->dest[d];
   enum ra_class cls = ra_class_for_index(idx);
   assert(idx.type == AGX_INDEX_NORMAL);

   unsigned count = rctx->ncomps[idx.value];
   assert(count >= 1);

   unsigned align = count;

   /* Try to allocate entire phi webs compatibly */
   unsigned phi_idx = phi_web_find(rctx->phi_web, idx.value);
   if (rctx->phi_web[phi_idx].assigned) {
      unsigned reg = rctx->phi_web[phi_idx].reg;
      if ((reg % align) == 0 && reg + align < rctx->bound[cls] &&
          !BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + align - 1))
         return reg;
   }

   /* Try to allocate moves compatibly with their sources */
   if (I->op == AGX_OPCODE_MOV && I->src[0].type == AGX_INDEX_NORMAL &&
       I->src[0].memory == I->dest[0].memory &&
       I->src[0].size == I->dest[0].size) {

      unsigned out;
      if (try_coalesce_with(rctx, I->src[0], count, false, &out))
         return out;
   }

   /* Try to allocate phis compatibly with their sources */
   if (I->op == AGX_OPCODE_PHI) {
      agx_foreach_ssa_src(I, s) {
         /* Loop headers have phis with a source preceding the definition */
         bool may_be_unvisited = rctx->block->loop_header;

         unsigned out;
         if (try_coalesce_with(rctx, I->src[s], count, may_be_unvisited, &out))
            return out;
      }
   }

   /* Try to allocate collects compatibly with their sources */
   if (I->op == AGX_OPCODE_COLLECT) {
      agx_foreach_ssa_src(I, s) {
         assert(BITSET_TEST(rctx->visited, I->src[s].value) &&
                "registers assigned in an order compatible with dominance "
                "and this is not a phi node, so we have assigned a register");

         unsigned base = affinity_base_of_collect(rctx, I, s);
         if (base >= rctx->bound[cls] || (base + count) > rctx->bound[cls])
            continue;

         /* Unaligned destinations can happen when dest size > src size */
         if (base % align)
            continue;

         if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
            return base;
      }
   }

   /* Try to coalesce scalar exports */
   agx_instr *collect_phi = rctx->src_to_collect_phi[idx.value];
   if (collect_phi && collect_phi->op == AGX_OPCODE_EXPORT) {
      unsigned reg = collect_phi->imm;

      if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + align - 1) &&
          (reg % align) == 0)
         return reg;
   }

   /* Try to coalesce vector exports */
   if (collect_phi && collect_phi->op == AGX_OPCODE_SPLIT) {
      if (collect_phi->dest[0].type == AGX_INDEX_NORMAL) {
         agx_instr *exp = rctx->src_to_collect_phi[collect_phi->dest[0].value];
         if (exp && exp->op == AGX_OPCODE_EXPORT) {
            unsigned reg = exp->imm;

            if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg,
                                   reg + align - 1) &&
                (reg % align) == 0)
               return reg;
         }
      }
   }

   /* Try to allocate sources of collects contiguously */
   if (collect_phi && collect_phi->op == AGX_OPCODE_COLLECT) {
      agx_instr *collect = collect_phi;

      assert(count == align && "collect sources are scalar");

      /* Find our offset in the collect. If our source is repeated in the
       * collect, this may not be unique. We arbitrarily choose the first.
       */
      unsigned our_source = ~0;
      agx_foreach_ssa_src(collect, s) {
         if (agx_is_equiv(collect->src[s], idx)) {
            our_source = s;
            break;
         }
      }

      assert(our_source < collect->nr_srcs && "source must be in the collect");

      /* See if we can allocate compatibly with any source of the collect */
      agx_foreach_ssa_src(collect, s) {
         if (!BITSET_TEST(rctx->visited, collect->src[s].value))
            continue;

         /* Determine where the collect should start relative to the source */
         unsigned base = affinity_base_of_collect(rctx, collect, s);
         if (base >= rctx->bound[cls])
            continue;

         unsigned our_reg = base + (our_source * align);

         /* Don't allocate past the end of the register file */
         if ((our_reg + align) > rctx->bound[cls])
            continue;

         /* If those registers are free, then choose them */
         if (!BITSET_TEST_RANGE(rctx->used_regs[cls], our_reg,
                                our_reg + align - 1))
            return our_reg;
      }

      unsigned collect_align = rctx->ncomps[collect->dest[0].value];
      unsigned offset = our_source * align;

      /* Prefer ranges of the register file that leave room for all sources of
       * the collect contiguously.
       */
      for (unsigned base = 0;
           base + (collect->nr_srcs * align) <= rctx->bound[cls];
           base += collect_align) {
         if (!BITSET_TEST_RANGE(rctx->used_regs[cls], base,
                                base + (collect->nr_srcs * align) - 1))
            return base + offset;
      }

      /* Try to respect the alignment requirement of the collect destination,
       * which may be greater than the sources (e.g. pack_64_2x32_split). Look
       * for a register for the source such that the collect base is aligned.
       */
      if (collect_align > align) {
         for (unsigned reg = offset; reg + collect_align <= rctx->bound[cls];
              reg += collect_align) {
            if (!BITSET_TEST_RANGE(rctx->used_regs[cls], reg, reg + count - 1))
               return reg;
         }
      }
   }

   /* Try to allocate phi sources compatibly with their phis */
   if (collect_phi && collect_phi->op == AGX_OPCODE_PHI) {
      agx_instr *phi = collect_phi;
      unsigned out;

      agx_foreach_ssa_src(phi, s) {
         if (try_coalesce_with(rctx, phi->src[s], count, true, &out))
            return out;
      }

      /* If we're in a loop, we may have already allocated the phi. Try that. */
      if (phi->dest[0].has_reg) {
         unsigned base = phi->dest[0].reg;

         if (base + count <= rctx->bound[cls] &&
             !BITSET_TEST_RANGE(rctx->used_regs[cls], base, base + count - 1))
            return base;
      }
   }

   /* Default to any contiguous sequence of registers */
   return find_regs(rctx, I, d, count, align);
}

/** Assign registers to SSA values in a block. */

static void
agx_ra_assign_local(struct ra_ctx *rctx)
{
   BITSET_DECLARE(used_regs_gpr, AGX_NUM_REGS) = {0};
   BITSET_DECLARE(used_regs_mem, AGX_NUM_MODELED_REGS) = {0};
   uint16_t *ssa_to_reg = calloc(rctx->shader->alloc, sizeof(uint16_t));

   agx_block *block = rctx->block;
   uint8_t *ncomps = rctx->ncomps;
   rctx->used_regs[RA_GPR] = used_regs_gpr;
   rctx->used_regs[RA_MEM] = used_regs_mem;
   rctx->ssa_to_reg = ssa_to_reg;

   reserve_live_in(rctx);

   /* Force the nesting counter r0l live throughout shaders using control flow.
    * This could be optimized (sync with agx_calc_register_demand).
    */
   if (rctx->shader->any_cf)
      BITSET_SET(used_regs_gpr, 0);

   /* Force the zero r0h live throughout shaders using divergent shuffles. */
   if (rctx->shader->any_quad_divergent_shuffle) {
      assert(rctx->shader->any_cf);
      BITSET_SET(used_regs_gpr, 1);
   }

   /* Reserve bottom registers as temporaries for parallel copy lowering */
   if (rctx->shader->has_spill_pcopy_reserved) {
      BITSET_SET_RANGE(used_regs_gpr, 0, 7);
   }

   agx_foreach_instr_in_block(block, I) {
      rctx->instr = I;

      /* Optimization: if a split contains the last use of a vector, the split
       * can be removed by assigning the destinations overlapping the source.
       */
      if (I->op == AGX_OPCODE_SPLIT && I->src[0].kill) {
         assert(ra_class_for_index(I->src[0]) == RA_GPR);
         unsigned reg = ssa_to_reg[I->src[0].value];
         unsigned width = agx_size_align_16(agx_split_width(I));

         agx_foreach_dest(I, d) {
            assert(ra_class_for_index(I->dest[0]) == RA_GPR);

            /* Free up the source */
            unsigned offset_reg = reg + (d * width);
            BITSET_CLEAR_RANGE(used_regs_gpr, offset_reg,
                               offset_reg + width - 1);

            /* Assign the destination where the source was */
            if (!agx_is_null(I->dest[d]))
               assign_regs(rctx, I->dest[d], offset_reg);
         }

         unsigned excess =
            rctx->ncomps[I->src[0].value] - (I->nr_dests * width);
         if (excess) {
            BITSET_CLEAR_RANGE(used_regs_gpr, reg + (I->nr_dests * width),
                               reg + rctx->ncomps[I->src[0].value] - 1);
         }

         agx_set_sources(rctx, I);
         agx_set_dests(rctx, I);
         continue;
      } else if (I->op == AGX_OPCODE_PRELOAD) {
         /* We must coalesce all preload moves */
         assert(I->dest[0].size == I->src[0].size);
         assert(I->src[0].type == AGX_INDEX_REGISTER);

         /* r1l specifically is a preloaded register. It is reserved during
          * demand calculations to ensure we don't need live range shuffling of
          * spilling temporaries. But we can still preload to it. So if it's
          * reserved, just free it. It'll be fine.
          */
         if (I->src[0].value == 2) {
            BITSET_CLEAR(rctx->used_regs[RA_GPR], 2);
         }

         assign_regs(rctx, I->dest[0], I->src[0].value);
         agx_set_dests(rctx, I);
         continue;
      }

      /* First, free killed sources */
      agx_foreach_ssa_src(I, s) {
         if (I->src[s].kill) {
            assert(I->op != AGX_OPCODE_PHI && "phis don't use .kill");

            enum ra_class cls = ra_class_for_index(I->src[s]);
            unsigned reg = ssa_to_reg[I->src[s].value];
            unsigned count = ncomps[I->src[s].value];

            assert(count >= 1);
            BITSET_CLEAR_RANGE(rctx->used_regs[cls], reg, reg + count - 1);
         }
      }

      /* Next, assign destinations one at a time. This is always legal
       * because of the SSA form.
       */
      agx_foreach_ssa_dest(I, d) {
         if (I->op == AGX_OPCODE_PHI && I->dest[d].has_reg)
            continue;

         assign_regs(rctx, I->dest[d], pick_regs(rctx, I, d));
      }

      /* Phi sources are special. Set in the corresponding predecessors */
      if (I->op != AGX_OPCODE_PHI)
         agx_set_sources(rctx, I);

      agx_set_dests(rctx, I);
   }

   for (unsigned i = 0; i < RA_CLASSES; ++i) {
      block->reg_to_ssa_out[i] =
         malloc(rctx->bound[i] * sizeof(*block->reg_to_ssa_out[i]));

      /* Initialize with sentinel so we don't have unused regs mapping to r0 */
      memset(block->reg_to_ssa_out[i], 0xFF,
             rctx->bound[i] * sizeof(*block->reg_to_ssa_out[i]));
   }

   int i;
   BITSET_FOREACH_SET(i, block->live_out, rctx->shader->alloc) {
      block->reg_to_ssa_out[rctx->classes[i]][rctx->ssa_to_reg[i]] = i;
   }

   /* Also set the sources for the phis in our successors, since that logically
    * happens now (given the possibility of live range splits, etc)
    */
   agx_foreach_successor(block, succ) {
      unsigned pred_idx = agx_predecessor_index(succ, block);

      agx_foreach_phi_in_block(succ, phi) {
         if (phi->src[pred_idx].type == AGX_INDEX_NORMAL &&
             !phi->src[pred_idx].has_reg) {
            /* This source needs a fixup */
            unsigned value = phi->src[pred_idx].value;
            phi->src[pred_idx].reg = rctx->ssa_to_reg[value];
            phi->src[pred_idx].has_reg = true;
         }
      }
   }

   free(rctx->ssa_to_reg);
}

/*
 * Lower phis to parallel copies at the logical end of a given block. If a block
 * needs parallel copies inserted, a successor of the block has a phi node. To
 * have a (nontrivial) phi node, a block must have multiple predecessors. So the
 * edge from the block to the successor (with phi) is not the only edge entering
 * the successor. Because the control flow graph has no critical edges, this
 * edge must therefore be the only edge leaving the block, so the block must
 * have only a single successor.
 */
static void
agx_insert_parallel_copies(agx_context *ctx, agx_block *block)
{
   bool any_succ = false;
   unsigned nr_phi = 0;

   /* Phi nodes logically happen on the control flow edge, so parallel copies
    * are added at the end of the predecessor */
   agx_builder b = agx_init_builder(ctx, agx_after_block_logical(block));

   agx_foreach_successor(block, succ) {
      assert(nr_phi == 0 && "control flow graph has a critical edge");

      agx_foreach_phi_in_block(succ, phi) {
         assert(!any_succ && "control flow graph has a critical edge");
         nr_phi += agx_channels(phi->dest[0]);
      }

      any_succ = true;

      /* Nothing to do if there are no phi nodes */
      if (nr_phi == 0)
         continue;

      unsigned pred_index = agx_predecessor_index(succ, block);

      /* Create a parallel copy lowering all the phi nodes */
      struct agx_copy *copies = calloc(sizeof(*copies), nr_phi);

      unsigned i = 0;

      agx_foreach_phi_in_block(succ, phi) {
         agx_index dest = phi->dest[0];
         agx_index src = phi->src[pred_index];

         if (src.type == AGX_INDEX_IMMEDIATE)
            src.size = dest.size;

         assert(dest.type == AGX_INDEX_REGISTER);
         assert(dest.size == src.size);

         /* Scalarize the phi, since the parallel copy lowering doesn't handle
          * vector phis. While we scalarize phis in NIR, we can generate vector
          * phis from spilling so must take care.
          */
         for (unsigned c = 0; c < agx_channels(phi->dest[0]); ++c) {
            agx_index src_ = src;
            unsigned offs = c * agx_size_align_16(src.size);

            if (src.type != AGX_INDEX_IMMEDIATE) {
               assert(src.type == AGX_INDEX_UNIFORM ||
                      src.type == AGX_INDEX_REGISTER);
               src_.value += offs;
               src_.channels_m1 = 1 - 1;
            }

            assert(i < nr_phi);
            copies[i++] = (struct agx_copy){
               .dest = dest.value + offs,
               .dest_mem = dest.memory,
               .src = src_,
            };
         }
      }

      agx_emit_parallel_copies(&b, copies, nr_phi);

      free(copies);
   }
}

static void
lower_exports(agx_context *ctx)
{
   struct agx_copy copies[AGX_NUM_REGS];
   unsigned nr = 0;
   agx_block *block = agx_exit_block(ctx);

   agx_foreach_instr_in_block_safe(block, I) {
      if (I->op != AGX_OPCODE_EXPORT)
         continue;

      assert(agx_channels(I->src[0]) == 1 && "scalarized in frontend");
      assert(nr < ARRAY_SIZE(copies));

      copies[nr++] = (struct agx_copy){
         .dest = I->imm,
         .src = I->src[0],
      };

      /* We cannot use fewer registers than we export */
      ctx->max_reg =
         MAX2(ctx->max_reg, I->imm + agx_size_align_16(I->src[0].size));
   }

   agx_builder b = agx_init_builder(ctx, agx_after_block_logical(block));
   agx_emit_parallel_copies(&b, copies, nr);
}

void
agx_ra(agx_context *ctx)
{
   bool force_spilling =
      (agx_compiler_debug & AGX_DBG_SPILL) && ctx->key->has_scratch;

   /* Determine maximum possible registers. We won't exceed this! */
   unsigned max_possible_regs = AGX_NUM_REGS;

   /* Compute shaders need to have their entire workgroup together, so our
    * register usage is bounded by the workgroup size.
    */
   if (gl_shader_stage_is_compute(ctx->stage)) {
      unsigned threads_per_workgroup;

      /* If we don't know the workgroup size, worst case it. TODO: Optimize
       * this, since it'll decimate opencl perf.
       */
      if (ctx->nir->info.workgroup_size_variable) {
         threads_per_workgroup = 1024;
      } else {
         threads_per_workgroup = ctx->nir->info.workgroup_size[0] *
                                 ctx->nir->info.workgroup_size[1] *
                                 ctx->nir->info.workgroup_size[2];
      }

      max_possible_regs =
         agx_max_registers_for_occupancy(threads_per_workgroup);
   }

   if (force_spilling) {
      /* Even when testing spilling, we need enough room for preloaded/exported
       * regs.
       */
      unsigned d = 24;
      unsigned max_ncomps = 8;

      agx_foreach_instr_global(ctx, I) {
         if (I->op == AGX_OPCODE_PRELOAD) {
            unsigned size = agx_size_align_16(I->src[0].size);
            d = MAX2(d, I->src[0].value + size);
         } else if (I->op == AGX_OPCODE_EXPORT) {
            unsigned size = agx_size_align_16(I->src[0].size);
            d = MAX2(d, I->imm + size);
         } else if (I->op == AGX_OPCODE_IMAGE_WRITE) {
            /* vec4 source + vec4 coordinates + bindless handle + reserved */
            d = MAX2(d, 26);
         } else if (I->op == AGX_OPCODE_TEXTURE_SAMPLE &&
                    (I->lod_mode == AGX_LOD_MODE_LOD_GRAD ||
                     I->lod_mode == AGX_LOD_MODE_LOD_GRAD_MIN)) {
            /* as above but with big gradient */
            d = MAX2(d, 36);
         }

         agx_foreach_ssa_dest(I, v) {
            max_ncomps = MAX2(max_ncomps, agx_index_size_16(I->dest[v]));
         }
      }

      max_possible_regs = ALIGN_POT(d, util_next_power_of_two(max_ncomps));
   } else if (ctx->key->is_helper) {
      /* The helper program is unspillable and has a limited register file */
      max_possible_regs = 32;
   }

   /* Calculate the demand. We'll use it to determine if we need to spill and to
    * bound register assignment.
    */
   agx_compute_liveness(ctx);
   unsigned effective_demand = agx_calc_register_demand(ctx);
   bool spilling = (effective_demand > max_possible_regs);

   if (spilling) {
      assert(ctx->key->has_scratch && "internal shaders are unspillable");
      agx_spill(ctx, max_possible_regs);

      /* After spilling, recalculate liveness and demand */
      agx_compute_liveness(ctx);
      effective_demand = agx_calc_register_demand(ctx);

      /* The resulting program can now be assigned registers */
      assert(effective_demand <= max_possible_regs && "spiller post-condition");
   }

   /* Record all phi webs. First initialize the union-find data structure with
    * all SSA defs in their own singletons, then union together anything related
    * by a phi. The resulting union-find structure will be the webs.
    */
   struct phi_web_node *phi_web = calloc(ctx->alloc, sizeof(*phi_web));
   for (unsigned i = 0; i < ctx->alloc; ++i) {
      phi_web[i].parent = i;
   }

   agx_foreach_block(ctx, block) {
      agx_foreach_phi_in_block(block, phi) {
         agx_foreach_ssa_src(phi, s) {
            phi_web_union(phi_web, phi->dest[0].value, phi->src[s].value);
         }
      }
   }

   uint8_t *ncomps = calloc(ctx->alloc, sizeof(uint8_t));
   uint8_t *ncomps_unrounded = calloc(ctx->alloc, sizeof(uint8_t));
   enum ra_class *classes = calloc(ctx->alloc, sizeof(enum ra_class));
   agx_instr **src_to_collect_phi = calloc(ctx->alloc, sizeof(agx_instr *));
   enum agx_size *sizes = calloc(ctx->alloc, sizeof(enum agx_size));
   BITSET_WORD *visited = calloc(BITSET_WORDS(ctx->alloc), sizeof(BITSET_WORD));
   unsigned max_ncomps = 1;

   agx_foreach_instr_global(ctx, I) {
      /* Record collects/phis so we can coalesce when assigning */
      if (I->op == AGX_OPCODE_COLLECT || I->op == AGX_OPCODE_PHI ||
          I->op == AGX_OPCODE_EXPORT || I->op == AGX_OPCODE_SPLIT) {
         agx_foreach_ssa_src(I, s) {
            src_to_collect_phi[I->src[s].value] = I;
         }
      }

      agx_foreach_ssa_dest(I, d) {
         unsigned v = I->dest[d].value;
         assert(ncomps[v] == 0 && "broken SSA");
         /* Round up vectors for easier live range splitting */
         ncomps_unrounded[v] = agx_index_size_16(I->dest[d]);
         ncomps[v] = util_next_power_of_two(ncomps_unrounded[v]);
         sizes[v] = I->dest[d].size;
         classes[v] = ra_class_for_index(I->dest[d]);

         max_ncomps = MAX2(max_ncomps, ncomps[v]);
      }
   }

   /* For live range splitting to work properly, ensure the register file is
    * aligned to the larger vector size. Most of the time, this is a no-op since
    * the largest vector size is usually 128-bit and the register file is
    * naturally 128-bit aligned. However, this is required for correctness with
    * 3D textureGrad, which can have a source vector of length 6x32-bit,
    * rounding up to 256-bit and requiring special accounting here.
    */
   unsigned reg_file_alignment = MAX2(max_ncomps, 8);
   assert(util_is_power_of_two_nonzero(reg_file_alignment));

   unsigned demand = ALIGN_POT(effective_demand, reg_file_alignment);
   assert(demand <= max_possible_regs && "Invariant");

   /* Round up the demand to the maximum number of registers we can use without
    * affecting occupancy. This reduces live range splitting.
    */
   unsigned max_regs = agx_occupancy_for_register_count(demand).max_registers;
   if (ctx->key->is_helper || force_spilling)
      max_regs = max_possible_regs;

   max_regs = ROUND_DOWN_TO(max_regs, reg_file_alignment);

   /* Or, we can bound tightly for debugging */
   if (agx_compiler_debug & AGX_DBG_DEMAND)
      max_regs = ALIGN_POT(MAX2(demand, 12), reg_file_alignment);

   /* ...but not too tightly */
   assert((max_regs % reg_file_alignment) == 0 && "occupancy limits aligned");
   assert(max_regs >= (6 * 2) && "space for vertex shader preloading");
   assert(max_regs <= max_possible_regs);

   unsigned reg_count = 0, mem_slot_count = 0;

   /* Assign registers in dominance-order. This coincides with source-order due
    * to a NIR invariant, so we do not need special handling for this.
    */
   agx_foreach_block(ctx, block) {
      agx_ra_assign_local(&(struct ra_ctx){
         .shader = ctx,
         .block = block,
         .src_to_collect_phi = src_to_collect_phi,
         .phi_web = phi_web,
         .ncomps = ncomps,
         .ncomps_unrounded = ncomps_unrounded,
         .sizes = sizes,
         .classes = classes,
         .visited = visited,
         .bound[RA_GPR] = max_regs,
         .bound[RA_MEM] = AGX_NUM_MODELED_REGS,
         .count[RA_GPR] = &reg_count,
         .count[RA_MEM] = &mem_slot_count,
      });
   }

   ctx->max_reg = reg_count ? (reg_count - 1) : 0;
   ctx->spill_base_B = ctx->scratch_size_B;
   ctx->scratch_size_B += mem_slot_count * 2;

   /* Vertex shaders preload the vertex/instance IDs (r5, r6) even if the shader
    * don't use them. Account for that so the preload doesn't clobber GPRs.
    * Hardware tessellation eval shaders preload patch/instance IDs there.
    */
   if (ctx->nir->info.stage == MESA_SHADER_VERTEX ||
       ctx->nir->info.stage == MESA_SHADER_TESS_EVAL)
      ctx->max_reg = MAX2(ctx->max_reg, 6 * 2);

   assert(ctx->max_reg <= max_regs);

   /* Validate RA after assigning registers just before lowering SSA */
   agx_validate_ra(ctx);

   agx_foreach_instr_global_safe(ctx, ins) {
      /* Lower away SSA */
      agx_foreach_ssa_dest(ins, d) {
         ins->dest[d] =
            agx_replace_index(ins->dest[d], agx_as_register(ins->dest[d]));
      }

      agx_foreach_ssa_src(ins, s) {
         agx_replace_src(ins, s, agx_as_register(ins->src[s]));
      }

      /* Lower away RA pseudo-instructions */
      agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));

      if (ins->op == AGX_OPCODE_COLLECT) {
         assert(ins->dest[0].type == AGX_INDEX_REGISTER);
         assert(!ins->dest[0].memory);

         unsigned base = ins->dest[0].value;
         unsigned width = agx_size_align_16(ins->src[0].size);

         struct agx_copy *copies = alloca(sizeof(copies[0]) * ins->nr_srcs);
         unsigned n = 0;

         /* Move the sources */
         agx_foreach_src(ins, i) {
            if (agx_is_null(ins->src[i]) || ins->src[i].type == AGX_INDEX_UNDEF)
               continue;
            assert(ins->src[i].size == ins->src[0].size);

            assert(n < ins->nr_srcs);
            copies[n++] = (struct agx_copy){
               .dest = base + (i * width),
               .src = ins->src[i],
            };
         }

         agx_emit_parallel_copies(&b, copies, n);
         agx_remove_instruction(ins);
         continue;
      } else if (ins->op == AGX_OPCODE_SPLIT) {
         assert(ins->src[0].type == AGX_INDEX_REGISTER ||
                ins->src[0].type == AGX_INDEX_UNIFORM);

         struct agx_copy copies[4];
         assert(ins->nr_dests <= ARRAY_SIZE(copies));

         unsigned n = 0;
         unsigned width = agx_size_align_16(agx_split_width(ins));

         /* Move the sources */
         agx_foreach_dest(ins, i) {
            if (ins->dest[i].type != AGX_INDEX_REGISTER)
               continue;

            assert(!ins->dest[i].memory);

            agx_index src = ins->src[0];
            src.size = ins->dest[i].size;
            src.channels_m1 = 0;
            src.value += (i * width);

            assert(n < ARRAY_SIZE(copies));
            copies[n++] = (struct agx_copy){
               .dest = ins->dest[i].value,
               .src = src,
            };
         }

         /* Lower away */
         agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));
         agx_emit_parallel_copies(&b, copies, n);
         agx_remove_instruction(ins);
         continue;
      }
   }

   /* Insert parallel copies lowering phi nodes and exports */
   agx_foreach_block(ctx, block) {
      agx_insert_parallel_copies(ctx, block);
   }

   lower_exports(ctx);

   agx_foreach_instr_global_safe(ctx, I) {
      switch (I->op) {
      /* Pseudoinstructions for RA must be removed now */
      case AGX_OPCODE_PHI:
      case AGX_OPCODE_PRELOAD:
         agx_remove_instruction(I);
         break;

      /* Coalesced moves can be removed */
      case AGX_OPCODE_MOV:
         if (I->src[0].type == AGX_INDEX_REGISTER &&
             I->dest[0].size == I->src[0].size &&
             I->src[0].value == I->dest[0].value &&
             I->src[0].memory == I->dest[0].memory) {

            assert(I->dest[0].type == AGX_INDEX_REGISTER);
            agx_remove_instruction(I);
         }
         break;

      default:
         break;
      }
   }

   if (spilling)
      agx_lower_spill(ctx);

   agx_foreach_block(ctx, block) {
      for (unsigned i = 0; i < ARRAY_SIZE(block->reg_to_ssa_out); ++i) {
         free(block->reg_to_ssa_out[i]);
         block->reg_to_ssa_out[i] = NULL;
      }
   }

   free(phi_web);
   free(src_to_collect_phi);
   free(ncomps);
   free(ncomps_unrounded);
   free(sizes);
   free(classes);
   free(visited);
}