1 /*
2  * Copyright 2023-2024 Alyssa Rosenzweig
3  * Copyright 2023-2024 Valve Corporation
4  * Copyright 2022 Collabora Ltd.
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #include "util/bitset.h"
9 #include "util/hash_table.h"
10 #include "util/ralloc.h"
11 #include "util/u_dynarray.h"
12 #include "util/u_qsort.h"
13 #include "agx_builder.h"
14 #include "agx_compiler.h"
15 #include "agx_opcodes.h"
16 
17 /*
18  * An implementation of "Register Spilling and Live-Range Splitting for SSA-Form
19  * Programs" by Braun and Hack.
20  */
21 
22 /*
23  * Next-use distances are logically in ℤ ∪ {∞}. Modeled as saturating uint32 and
24  * referred to as dist_t.
25  *
26  * next_uses represents a next-use map. This is a sparse data structure mapping
27  * variable names to next-use dist_t's. Variables with no later use (infinite
28  * next-use distance) are not stored explicitly, making the time/space
29  * requirements O(live variables). This is important for performance and memory
30  * usage on big shaders with many blocks.
31  *
32  * For now, next_uses is backed by a Mesa hash table, but it could be optimized
33  * to something more specialized in the future.
34  */
35 #define DIST_INFINITY (UINT32_MAX)
36 typedef uint32_t dist_t;
37 
38 static dist_t
39 dist_sum(dist_t A, dist_t B)
40 {
41    return (A + B < A) ? DIST_INFINITY : (A + B);
42 }
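
/*
 * For example, dist_sum(3, 5) == 8 as usual, while dist_sum(DIST_INFINITY, 1)
 * wraps the uint32 addition, trips the (A + B < A) overflow check and returns
 * DIST_INFINITY, matching ∞ + 1 = ∞ in ℤ ∪ {∞}.
 */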
43 
44 struct next_uses {
45    struct hash_table_u64 *ht;
46 };
47 
48 static void
49 init_next_uses(struct next_uses *nu, void *memctx)
50 {
51    nu->ht = _mesa_hash_table_u64_create(memctx);
52 }
53 
54 static void
55 destroy_next_uses(struct next_uses *nu)
56 {
57    _mesa_hash_table_u64_destroy(nu->ht);
58 }
59 
60 static void
61 clear_next_uses(struct next_uses *nu)
62 {
63    _mesa_hash_table_u64_clear(nu->ht);
64 }
65 
66 static void
67 copy_next_uses(struct next_uses *nu, const struct next_uses *from)
68 {
69    clear_next_uses(nu);
70 
71    hash_table_u64_foreach(from->ht, use) {
72       _mesa_hash_table_u64_insert(nu->ht, use.key, use.data);
73    }
74 }
75 
76 static void
77 set_next_use(struct next_uses *nu, unsigned node, dist_t dist)
78 {
79    if (dist == DIST_INFINITY) {
80       _mesa_hash_table_u64_remove(nu->ht, node);
81    } else {
82       uintptr_t as_ptr = (uintptr_t)(dist + 1);
83       assert(as_ptr != 0 && "non-NULL");
84 
85       _mesa_hash_table_u64_insert(nu->ht, node, (void *)as_ptr);
86    }
87 }
88 
89 static dist_t
90 search_next_uses(const struct next_uses *nu, unsigned node)
91 {
92    void *ent = _mesa_hash_table_u64_search(nu->ht, node);
93    if (!ent)
94       return DIST_INFINITY;
95 
96    uintptr_t raw = (uintptr_t)ent;
97    return raw - 1;
98 }
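
/*
 * Distances are stored biased by +1 so that a legitimate distance of 0 is
 * distinguishable from the NULL returned for absent keys: set_next_use(nu, v, 0)
 * stores 1 and search_next_uses() recovers 0, while a missing entry reads back
 * as DIST_INFINITY. DIST_INFINITY itself is never stored; setting it removes
 * the key instead.
 */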
99 
100 #define foreach_next_use(nu, node, dist)                                       \
101    hash_table_u64_foreach((nu)->ht, use_)                                      \
102       for (uint32_t _terminator = 1, node = use_.key,                          \
103                     UNUSED dist = ((uintptr_t)use_.data) - 1;                  \
104            _terminator != 0; _terminator = 0)
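
/*
 * The single-iteration inner for loop in foreach_next_use exists only to bind
 * node and dist as per-iteration locals on top of Mesa's
 * hash_table_u64_foreach (undoing the +1 bias for dist); the _terminator flag
 * makes it run exactly once per entry.
 */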
105 
106 /*
107  * Calculate the minimum of two next-use sets. Values absent from one of the
108  * underlying sets are treated as infinity, so they never win the minimum;
109  * the set of keys therefore behaves like a set union.
110  */
111 static bool
112 minimum_next_uses(struct next_uses *nu, const struct next_uses *from)
113 {
114    bool progress = false;
115 
116    foreach_next_use(from, node, from_dist) {
117       dist_t nu_dist = search_next_uses(nu, node);
118 
119       if (from_dist < nu_dist) {
120          set_next_use(nu, node, from_dist);
121          progress = true;
122       }
123    }
124 
125    return progress;
126 }
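
/*
 * For example, taking the minimum of {a: 2, b: 5} into a map currently holding
 * {b: 3, c: 7} leaves {a: 2, b: 3, c: 7}: a is copied over (its counterpart is
 * implicitly ∞), b keeps the smaller distance, c is untouched, and progress is
 * reported because a changed.
 */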
127 
128 static uint32_t
129 instr_cycles(const agx_instr *I)
130 {
131    return 1;
132 }
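
/*
 * With this unit cost model, next-use "distances" are effectively instruction
 * counts; a finer-grained model could weight expensive instructions more
 * heavily without changing anything else in this file.
 */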
133 
134 struct spill_block {
135    /* Set of values available in the register file at the end */
136    unsigned W_exit[AGX_NUM_REGS];
137    unsigned nW_exit;
138 
139    unsigned W_entry[AGX_NUM_REGS];
140    unsigned nW_entry;
141 
142    /* Set of live-out spilled values at the end of the block */
143    unsigned *S_exit;
144    unsigned nS_exit;
145 
146    unsigned *S_entry;
147    unsigned nS_entry;
148 
149    /* Estimated cycle count of the block */
150    uint32_t cycles;
151 
152    /* Next-use maps at the start/end of the block */
153    struct next_uses next_use_in;
154    struct next_uses next_use_out;
155 };
156 
157 struct spill_ctx {
158    void *memctx;
159    agx_context *shader;
160    agx_block *block;
161 
162    /* Set of values currently available in the register file */
163    BITSET_WORD *W;
164 
165    /* |W| = Current register pressure */
166    unsigned nW;
167 
168    /* Local IPs of next-use */
169    dist_t *next_uses;
170 
171    /* Current local IP relative to the start of the block */
172    uint32_t ip;
173 
174    /* Set of live values that have been spilled. Contrary to the paper, this
175     * is not a subset of W: the definition in the paper is bogus.
176     */
177    BITSET_WORD *S;
178 
179    /* Widths of vectors */
180    uint8_t *channels;
181    enum agx_size *size;
182 
183    /* Mapping of rematerializable values to their definitions, or NULL for nodes
184     * that are not rematerializable.
185     */
186    agx_instr **remat;
187 
188    /* Maximum register pressure allowed */
189    unsigned k;
190 
191    /* Number of variables */
192    unsigned n;
193 
194    /* Information on blocks indexed in source order */
195    struct spill_block *blocks;
196 
197    /* Base memory index reserved for spilled indices */
198    unsigned spill_base;
199 };
200 
201 static inline struct spill_block *
202 spill_block(struct spill_ctx *ctx, agx_block *block)
203 {
204    return &ctx->blocks[block->index];
205 }
206 
207 /* Calculate the register demand of a node. This is rounded up to a power-of-two
208  * to match the equivalent calculations in RA.
209  */
210 static unsigned
211 node_size(struct spill_ctx *ctx, unsigned node)
212 {
213    return util_next_power_of_two(ctx->channels[node]) *
214           agx_size_align_16(ctx->size[node]);
215 }
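
/*
 * For example, assuming agx_size_align_16() returns the per-channel size in
 * 16-bit register units (1 for 16-bit, 2 for 32-bit), a 3-channel 32-bit
 * vector counts as util_next_power_of_two(3) * 2 = 8 half-registers, the same
 * rounding RA applies.
 */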
216 
217 /*
218  * Map a control flow edge to a block. Assumes no critical edges.
219  */
220 static agx_block *
221 agx_edge_to_block(agx_block *pred, agx_block *succ)
222 {
223    /* End of predecessor is unique if there's a single successor */
224    if (agx_num_successors(pred) == 1)
225       return pred;
226 
227    /* The predecessor has multiple successors, meaning this is not the only
228     * edge leaving the predecessor. Therefore, it is the only edge entering
229     * the successor (otherwise the edge would be critical), so the start of
230     * the successor is unique.
231     */
232    assert(agx_num_predecessors(succ) == 1 && "critical edge detected");
233    return succ;
234 }
235 
236 /*
237  * Get a cursor to insert along a control flow edge: either at the start of the
238  * successor or the end of the predecessor. This relies on the control flow
239  * graph having no critical edges.
240  */
241 static agx_cursor
242 agx_along_edge(agx_block *pred, agx_block *succ)
243 {
244    agx_block *to = agx_edge_to_block(pred, succ);
245 
246    if (to == pred)
247       return agx_after_block_logical(pred);
248    else
249       return agx_before_block(succ);
250 }
251 
252 static inline agx_index
253 agx_index_as_mem(agx_index idx, unsigned mem_base)
254 {
255    assert(idx.type == AGX_INDEX_NORMAL);
256    assert(!idx.memory);
257    idx.memory = true;
258    idx.value = mem_base + idx.value;
259    return idx;
260 }
261 
262 static unsigned
263 chase_mem_index(agx_index ref, unsigned mem_base)
264 {
265    assert(ref.type == AGX_INDEX_NORMAL);
266    return ref.memory ? (ref.value - mem_base) : ref.value;
267 }
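
/*
 * Spilled values live in a parallel namespace of memory variables: GPR
 * variable v corresponds to memory variable mem_base + v (with mem_base equal
 * to spill_base, the original ctx->alloc), and chase_mem_index inverts that
 * mapping so either form names the same node.
 */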
268 
269 static agx_index
270 reconstruct_index(struct spill_ctx *ctx, unsigned node)
271 {
272    return agx_get_vec_index(node, ctx->size[node], ctx->channels[node]);
273 }
274 
275 static bool
276 can_remat(agx_instr *I)
277 {
278    switch (I->op) {
279    case AGX_OPCODE_MOV_IMM:
280    case AGX_OPCODE_GET_SR:
281       return true;
282    default:
283       return false;
284    }
285 }
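
/*
 * Only constant moves and system-register reads are treated as
 * rematerializable: they have no SSA sources, so re-emitting them at a use
 * site never requires keeping anything else live.
 */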
286 
287 static agx_instr *
288 remat_to(agx_builder *b, agx_index dst, struct spill_ctx *ctx, unsigned node)
289 {
290    agx_instr *I = ctx->remat[node];
291    assert(can_remat(I));
292 
293    switch (I->op) {
294    case AGX_OPCODE_MOV_IMM:
295       return agx_mov_imm_to(b, dst, I->imm);
296    case AGX_OPCODE_GET_SR:
297       return agx_get_sr_to(b, dst, I->sr);
298    default:
299       unreachable("invalid remat");
300    }
301 }
302 
303 static void
304 insert_spill(agx_builder *b, struct spill_ctx *ctx, unsigned node)
305 {
306    if (!ctx->remat[node]) {
307       agx_index idx = reconstruct_index(ctx, node);
308       agx_mov_to(b, agx_index_as_mem(idx, ctx->spill_base), idx);
309 
310       /* We only need the extra registers reserved if we actually spilled
311        * instead of just rematerializing.
312        */
313       b->shader->has_spill_pcopy_reserved = true;
314    }
315 }
316 
317 static void
318 insert_reload(struct spill_ctx *ctx, agx_block *block, agx_cursor cursor,
319               unsigned node)
320 {
321    agx_builder b = agx_init_builder(ctx->shader, cursor);
322    agx_index idx = reconstruct_index(ctx, node);
323 
324    /* Reloading breaks SSA, but agx_repair_ssa will repair */
325    if (ctx->remat[node]) {
326       remat_to(&b, idx, ctx, node);
327    } else {
328       agx_mov_to(&b, idx, agx_index_as_mem(idx, ctx->spill_base));
329    }
330 }
331 
332 /* Insert into the register file */
333 static void
334 insert_W(struct spill_ctx *ctx, unsigned v)
335 {
336    assert(v < ctx->n);
337    assert(!BITSET_TEST(ctx->W, v));
338 
339    BITSET_SET(ctx->W, v);
340    ctx->nW += node_size(ctx, v);
341 }
342 
343 /* Remove from the register file */
344 static void
345 remove_W(struct spill_ctx *ctx, unsigned v)
346 {
347    assert(v < ctx->n);
348    assert(BITSET_TEST(ctx->W, v));
349 
350    BITSET_CLEAR(ctx->W, v);
351    ctx->nW -= node_size(ctx, v);
352 }
353 
354 static void
355 remove_W_if_present(struct spill_ctx *ctx, unsigned v)
356 {
357    assert(v < ctx->n);
358 
359    if (BITSET_TEST(ctx->W, v))
360       remove_W(ctx, v);
361 }
362 
363 struct candidate {
364    unsigned node;
365    dist_t dist;
366 };
367 
368 static int
369 cmp_dist(const void *left_, const void *right_, void *ctx_)
370 {
371    struct spill_ctx *ctx = ctx_;
372    const struct candidate *left = left_;
373    const struct candidate *right = right_;
374 
375    /* We assume that rematerializing - even before every instruction - is
376     * cheaper than spilling. As long as one of the nodes is rematerializable
377     * (with distance > 0), we choose it over spilling. Within a class of nodes
378     * (rematerializable or not), compare by next-use-distance.
379     */
380    bool remat_left = ctx->remat[left->node] != NULL && left->dist > 0;
381    bool remat_right = ctx->remat[right->node] != NULL && right->dist > 0;
382 
383    if (remat_left != remat_right)
384       return remat_left ? 1 : -1;
385    else
386       return (left->dist > right->dist) - (left->dist < right->dist);
387 }
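
/*
 * For example, with candidates x (dist 2), y (dist 100) and z (rematerializable,
 * dist 1), the sorted order is x, y, z: rematerializable values sort last, so
 * limit() below prefers to evict them, and among the rest the furthest next
 * uses are the ones dropped, per Belady.
 */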
388 
389 /*
390  * Limit the register file W to maximum size m by evicting registers.
391  */
392 static ATTRIBUTE_NOINLINE void
393 limit(struct spill_ctx *ctx, agx_instr *I, unsigned m)
394 {
395    /* Nothing to do if we're already below the limit */
396    if (ctx->nW <= m)
397       return;
398 
399    /* Gather candidates for eviction. Note that next_uses gives IPs whereas
400     * cmp_dist expects relative distances. This requires us to subtract ctx->ip
401     * to ensure that cmp_dist works properly. Even though logically it shouldn't
402     * affect the sorted order, practically this matters for correctness with
403     * rematerialization. See the dist=0 test in cmp_dist.
404     */
405    struct candidate *candidates = alloca(ctx->nW * sizeof(struct candidate));
406    unsigned j = 0;
407 
408    int i;
409    BITSET_FOREACH_SET(i, ctx->W, ctx->n) {
410       assert(j < ctx->nW);
411 
412       candidates[j++] = (struct candidate){
413          .node = i,
414          .dist = ctx->next_uses[i] - ctx->ip,
415       };
416    }
417 
418    /* Sort by next-use distance */
419    util_qsort_r(candidates, j, sizeof(struct candidate), cmp_dist, ctx);
420 
421    /* Evict what doesn't fit */
422    unsigned new_weight = 0;
423 
424    for (i = 0; i < j; ++i) {
425       unsigned v = candidates[i].node;
426       unsigned comps = node_size(ctx, v);
427 
428       if ((new_weight + comps) <= m) {
429          new_weight += comps;
430       } else {
431          /* Insert a spill if we haven't spilled before and there is
432           * another use
433           */
434          if (!BITSET_TEST(ctx->S, v) && candidates[i].dist < DIST_INFINITY) {
435             agx_builder b = agx_init_builder(ctx->shader, agx_before_instr(I));
436             insert_spill(&b, ctx, v);
437             BITSET_SET(ctx->S, v);
438          }
439 
440          remove_W(ctx, v);
441 
442          /* We keep going in case we can pack in a scalar */
443       }
444    }
445 }
446 
447 /*
448  * Insert coupling code on block boundaries. This must ensure:
449  *
450  *    - anything live-in we expect to have spilled is spilled
451  *    - anything live-in we expect to have filled is filled
452  *    - phi sources are spilled if the destination is spilled
453  *    - phi sources are filled if the destination is not spilled
454  *
455  * The latter two requirements ensure correct pressure calculations for phis.
456  */
457 static ATTRIBUTE_NOINLINE void
458 insert_coupling_code(struct spill_ctx *ctx, agx_block *pred, agx_block *succ)
459 {
460    struct spill_block *sp = spill_block(ctx, pred);
461    struct spill_block *ss = spill_block(ctx, succ);
462 
463    agx_foreach_phi_in_block(succ, I) {
464       if (!I->dest[0].memory)
465          continue;
466 
467       agx_builder b =
468          agx_init_builder(ctx->shader, agx_before_function(ctx->shader));
469 
470       unsigned s = agx_predecessor_index(succ, pred);
471 
472       /* Copy immediate/uniform phi sources to memory variables at the start of
473        * the program, where pressure is zero and hence the copy is legal.
474        */
475       if (I->src[s].type != AGX_INDEX_NORMAL) {
476          assert(I->src[s].type == AGX_INDEX_IMMEDIATE ||
477                 I->src[s].type == AGX_INDEX_UNIFORM);
478 
479          agx_index mem = agx_temp_like(ctx->shader, I->dest[0]);
480          assert(mem.memory);
481 
482          agx_index gpr = agx_temp_like(ctx->shader, I->dest[0]);
483          gpr.memory = false;
484 
485          if (I->src[s].type == AGX_INDEX_IMMEDIATE)
486             agx_mov_imm_to(&b, gpr, I->src[s].value);
487          else
488             agx_mov_to(&b, gpr, I->src[s]);
489 
490          agx_mov_to(&b, mem, gpr);
491          I->src[s] = mem;
492          continue;
493       }
494 
495       bool spilled = false;
496       for (unsigned i = 0; i < sp->nS_exit; ++i) {
497          if (sp->S_exit[i] == I->src[s].value) {
498             spilled = true;
499             break;
500          }
501       }
502 
503       if (!spilled) {
504          /* Spill the phi source. TODO: avoid redundant spills here */
505          agx_builder b =
506             agx_init_builder(ctx->shader, agx_after_block_logical(pred));
507 
508          insert_spill(&b, ctx, I->src[s].value);
509       }
510 
511       if (ctx->remat[I->src[s].value]) {
512          unsigned node = I->src[s].value;
513          agx_index idx = reconstruct_index(ctx, node);
514          agx_index tmp = agx_temp_like(ctx->shader, idx);
515 
516          remat_to(&b, tmp, ctx, node);
517          agx_mov_to(&b, agx_index_as_mem(idx, ctx->spill_base), tmp);
518       }
519 
520       /* Use the spilled version */
521       I->src[s] = agx_index_as_mem(I->src[s], ctx->spill_base);
522    }
523 
524    /* Anything assumed to be spilled at the start of succ must be spilled along
525     * all edges.
526     */
527    for (unsigned i = 0; i < ss->nS_entry; ++i) {
528       unsigned v = ss->S_entry[i];
529 
530       bool spilled = false;
531       for (unsigned j = 0; j < sp->nS_exit; ++j) {
532          if (sp->S_exit[j] == v) {
533             spilled = true;
534             break;
535          }
536       }
537 
538       /* We handle spilling phi destinations separately */
539       agx_foreach_phi_in_block(succ, phi) {
540          if (chase_mem_index(phi->dest[0], ctx->spill_base) == v) {
541             spilled = true;
542             break;
543          }
544       }
545 
546       if (spilled)
547          continue;
548 
549       agx_builder b = agx_init_builder(ctx->shader, agx_along_edge(pred, succ));
550       insert_spill(&b, ctx, v);
551    }
552 
553    /* Variables in W at the start of succ must be defined along the edge. */
554    for (unsigned i = 0; i < ss->nW_entry; ++i) {
555       unsigned node = ss->W_entry[i];
556       bool defined = false;
557 
558       /* Variables live at the end of the predecessor are live along the edge */
559       for (unsigned j = 0; j < sp->nW_exit; ++j) {
560          if (sp->W_exit[j] == node) {
561             defined = true;
562             break;
563          }
564       }
565 
566       /* Phis are defined along the edge */
567       agx_foreach_phi_in_block(succ, phi) {
568          if (phi->dest[0].value == node) {
569             defined = true;
570             break;
571          }
572       }
573 
574       if (defined)
575          continue;
576 
577       /* Otherwise, inserting a reload defines the variable along the edge */
578       agx_block *reload_block = agx_edge_to_block(pred, succ);
579       insert_reload(ctx, reload_block, agx_along_edge(pred, succ), node);
580    }
581 
582    agx_foreach_phi_in_block(succ, I) {
583       if (I->dest[0].memory)
584          continue;
585 
586       unsigned s = agx_predecessor_index(succ, pred);
587 
588       /* Treat immediate/uniform phi sources as registers for pressure
589        * accounting and phi lowering purposes. Parallel copy lowering can handle
590        * a copy from an immediate/uniform to a register, but not from an
591        * immediate/uniform directly to memory.
592        */
593       if (I->src[s].type != AGX_INDEX_NORMAL) {
594          assert(I->src[s].type == AGX_INDEX_IMMEDIATE ||
595                 I->src[s].type == AGX_INDEX_UNIFORM);
596 
597          continue;
598       }
599 
600       bool live = false;
601       for (unsigned i = 0; i < sp->nW_exit; ++i) {
602          if (sp->W_exit[i] == I->src[s].value) {
603             live = true;
604             break;
605          }
606       }
607 
608       /* Fill the phi source in the predecessor */
609       if (!live) {
610          agx_block *reload_block = agx_edge_to_block(pred, succ);
611          insert_reload(ctx, reload_block, agx_along_edge(pred, succ),
612                        I->src[s].value);
613       }
614 
615       /* Leave as-is for the GPR version */
616       assert(!I->src[s].memory);
617    }
618 }
619 
620 /*
621  * Produce an array of next-use IPs relative to the start of the block. This is
622  * an array of dist_t scalars, representing the next-use IP of each SSA dest
623  * (right-to-left) and SSA source (left-to-right) of each non-phi instruction
624  * in the block (bottom-to-top). Its size equals the # of those dests and sources.
625  */
626 static ATTRIBUTE_NOINLINE void
627 calculate_local_next_use(struct spill_ctx *ctx, struct util_dynarray *out)
628 {
629    struct spill_block *sb = spill_block(ctx, ctx->block);
630    unsigned ip = sb->cycles;
631 
632    util_dynarray_init(out, NULL);
633 
634    struct next_uses nu;
635    init_next_uses(&nu, NULL);
636 
637    foreach_next_use(&sb->next_use_out, i, dist) {
638       set_next_use(&nu, i, dist_sum(ip, dist));
639    }
640 
641    agx_foreach_instr_in_block_rev(ctx->block, I) {
642       ip -= instr_cycles(I);
643 
644       if (I->op != AGX_OPCODE_PHI) {
645          agx_foreach_ssa_dest_rev(I, d) {
646             unsigned v = I->dest[d].value;
647 
648             util_dynarray_append(out, dist_t, search_next_uses(&nu, v));
649          }
650 
651          agx_foreach_ssa_src(I, s) {
652             unsigned v = I->src[s].value;
653 
654             util_dynarray_append(out, dist_t, search_next_uses(&nu, v));
655             set_next_use(&nu, v, ip);
656          }
657       }
658    }
659 
660    assert(ip == 0 && "cycle counting is consistent");
661    destroy_next_uses(&nu);
662 }
663 
664 static agx_cursor
665 agx_before_instr_logical(agx_block *block, agx_instr *I)
666 {
667    if (I->op == AGX_OPCODE_EXPORT) {
668       agx_instr *first = agx_first_instr(block);
669 
670       while (I != first && I->op == AGX_OPCODE_EXPORT) {
671          I = agx_prev_op(I);
672       }
673 
674       if (I == first && first->op == AGX_OPCODE_EXPORT) {
675          return agx_before_block(block);
676       } else {
677          return agx_after_instr(I);
678       }
679    } else {
680       return agx_before_instr(I);
681    }
682 }
683 
684 /*
685  * Insert spills/fills for a single basic block, following Belady's algorithm.
686  * Corresponds to minAlgorithm from the paper.
687  */
688 static ATTRIBUTE_NOINLINE void
689 min_algorithm(struct spill_ctx *ctx)
690 {
691    struct spill_block *sblock = spill_block(ctx, ctx->block);
692    struct util_dynarray local_next_ip;
693    calculate_local_next_use(ctx, &local_next_ip);
694 
695    /* next_uses gives the distance from the start of the block, so prepopulate
696     * with next_use_in.
697     */
698    foreach_next_use(&sblock->next_use_in, key, dist) {
699       assert(key < ctx->n);
700       ctx->next_uses[key] = dist;
701    }
702 
703    dist_t *next_ips = util_dynarray_element(&local_next_ip, dist_t, 0);
704    unsigned next_use_cursor =
705       util_dynarray_num_elements(&local_next_ip, dist_t);
706 
707    /* Iterate each instruction in forward order */
708    agx_foreach_instr_in_block(ctx->block, I) {
709       assert(ctx->nW <= ctx->k && "invariant");
710 
711       /* Debug to check against our RA demand calculations */
712       if (0) {
713          printf("%u: ", ctx->nW);
714          agx_print_instr(I, stdout);
715       }
716 
717       /* Phis are special since they happen along the edge. When we initialized
718        * W and S, we implicitly chose which phis are spilled. So, here we just
719        * need to rewrite the phis to write into memory.
720        *
721        * Phi sources are handled later.
722        */
723       if (I->op == AGX_OPCODE_PHI) {
724          if (!BITSET_TEST(ctx->W, I->dest[0].value)) {
725             I->dest[0] = agx_index_as_mem(I->dest[0], ctx->spill_base);
726          }
727 
728          ctx->ip += instr_cycles(I);
729          continue;
730       }
731 
732       /* Any source that is not in W needs to be reloaded. Gather the set R of
733        * such values.
734        */
735       unsigned R[AGX_MAX_NORMAL_SOURCES];
736       unsigned nR = 0;
737 
738       agx_foreach_ssa_src(I, s) {
739          unsigned node = I->src[s].value;
740          if (BITSET_TEST(ctx->W, node))
741             continue;
742 
743          /* Mark this variable as needing a reload. */
744          assert(node < ctx->n);
745          assert(BITSET_TEST(ctx->S, node) && "must have been spilled");
746          assert(nR < ARRAY_SIZE(R) && "maximum source count");
747          R[nR++] = node;
748 
749          /* The inserted reload will add the value to the register file. */
750          insert_W(ctx, node);
751       }
752 
753       /* Limit W to make space for the sources we just added */
754       limit(ctx, I, ctx->k);
755 
756       /* Update next-use distances for this instruction. Unlike the paper, we
757        * prune dead values from W as we go. This doesn't affect correctness, but
758        * it speeds up limit() on average.
759        */
760       agx_foreach_ssa_src_rev(I, s) {
761          assert(next_use_cursor >= 1);
762 
763          unsigned next_ip = next_ips[--next_use_cursor];
764          assert((next_ip == DIST_INFINITY) == I->src[s].kill);
765 
766          if (next_ip == DIST_INFINITY)
767             remove_W_if_present(ctx, I->src[s].value);
768          else
769             ctx->next_uses[I->src[s].value] = next_ip;
770       }
771 
772       agx_foreach_ssa_dest(I, d) {
773          assert(next_use_cursor >= 1);
774          unsigned next_ip = next_ips[--next_use_cursor];
775 
776          if (next_ip == DIST_INFINITY)
777             remove_W_if_present(ctx, I->dest[d].value);
778          else
779             ctx->next_uses[I->dest[d].value] = next_ip;
780       }
781 
782       /* Count how many registers we need for destinations. Because of
783        * SSA form, destinations are unique.
784        */
785       unsigned dest_size = 0;
786       agx_foreach_ssa_dest(I, d) {
787          dest_size += node_size(ctx, I->dest[d].value);
788       }
789 
790       /* Limit W to make space for the destinations. */
791       limit(ctx, I, ctx->k - dest_size);
792 
793       /* Destinations are now in the register file */
794       agx_foreach_ssa_dest(I, d) {
795          insert_W(ctx, I->dest[d].value);
796       }
797 
798       /* Add reloads for the sources in front of the instruction. We need to be
799        * careful around exports, hoisting the reloads to before all exports.
800        *
801        * This is legal since all exports happen in parallel and all registers
802        * are dead after the exports. The register file
803        * must be big enough for everything exported, so it must be big enough
804        * for all the reloaded values right before the parallel exports.
805        */
806       for (unsigned i = 0; i < nR; ++i) {
807          insert_reload(ctx, ctx->block, agx_before_instr_logical(ctx->block, I),
808                        R[i]);
809       }
810 
811       ctx->ip += instr_cycles(I);
812    }
813 
814    assert(next_use_cursor == 0 && "exactly sized");
815 
816    int i;
817    BITSET_FOREACH_SET(i, ctx->W, ctx->n)
818       sblock->W_exit[sblock->nW_exit++] = i;
819 
820    unsigned nS = __bitset_count(ctx->S, BITSET_WORDS(ctx->n));
821    sblock->S_exit = ralloc_array(ctx->memctx, unsigned, nS);
822 
823    BITSET_FOREACH_SET(i, ctx->S, ctx->n)
824       sblock->S_exit[sblock->nS_exit++] = i;
825 
826    assert(nS == sblock->nS_exit);
827    util_dynarray_fini(&local_next_ip);
828 }
829 
830 /*
831  * TODO: Implement section 4.2 of the paper.
832  *
833  * For now, we implement the simpler heuristic in Hack's thesis: sort
834  * the live-in set (+ destinations of phis) by next-use distance.
835  */
836 static ATTRIBUTE_NOINLINE void
837 compute_w_entry_loop_header(struct spill_ctx *ctx)
838 {
839    agx_block *block = ctx->block;
840    struct spill_block *sb = spill_block(ctx, block);
841 
842    unsigned nP = __bitset_count(block->live_in, BITSET_WORDS(ctx->n));
843    struct candidate *candidates = calloc(nP, sizeof(struct candidate));
844    unsigned j = 0;
845 
846    foreach_next_use(&sb->next_use_in, i, dist) {
847       assert(j < nP);
848       candidates[j++] = (struct candidate){.node = i, .dist = dist};
849    }
850 
851    assert(j == nP);
852 
853    /* Sort by next-use distance */
854    util_qsort_r(candidates, j, sizeof(struct candidate), cmp_dist, ctx);
855 
856    /* Take as much as we can */
857    for (unsigned i = 0; i < j; ++i) {
858       unsigned node = candidates[i].node;
859       unsigned comps = node_size(ctx, node);
860 
861       if ((ctx->nW + comps) <= ctx->k) {
862          insert_W(ctx, node);
863          sb->W_entry[sb->nW_entry++] = node;
864       }
865    }
866 
867    assert(ctx->nW <= ctx->k);
868    free(candidates);
869 }
870 
871 /*
872  * Compute W_entry for a block. Section 4.2 in the paper.
873  */
874 static ATTRIBUTE_NOINLINE void
875 compute_w_entry(struct spill_ctx *ctx)
876 {
877    agx_block *block = ctx->block;
878    struct spill_block *sb = spill_block(ctx, block);
879 
880    /* Nothing to do for start blocks */
881    if (agx_num_predecessors(block) == 0)
882       return;
883 
884    /* Loop headers have a different heuristic */
885    if (block->loop_header) {
886       compute_w_entry_loop_header(ctx);
887       return;
888    }
889 
890    /* Usual blocks follow */
891    unsigned *freq = calloc(ctx->n, sizeof(unsigned));
892 
893    /* Record what's written at the end of each predecessor */
894    agx_foreach_predecessor(ctx->block, P) {
895       struct spill_block *sp = spill_block(ctx, *P);
896 
897       for (unsigned i = 0; i < sp->nW_exit; ++i) {
898          unsigned v = sp->W_exit[i];
899          freq[v]++;
900       }
901    }
902 
903    struct candidate *candidates = calloc(ctx->n, sizeof(struct candidate));
904    unsigned j = 0;
905 
906    /* Variables that are in all predecessors are assumed in W_entry. Phis and
907     * variables in some predecessors are scored by next-use.
908     */
909    foreach_next_use(&sb->next_use_in, i, dist) {
910       if (freq[i] == agx_num_predecessors(ctx->block)) {
911          insert_W(ctx, i);
912       } else if (freq[i]) {
913          candidates[j++] = (struct candidate){.node = i, .dist = dist};
914       }
915    }
916 
917    agx_foreach_phi_in_block(ctx->block, I) {
918       bool all_found = true;
919 
920       agx_foreach_predecessor(ctx->block, pred) {
921          struct spill_block *sp = spill_block(ctx, *pred);
922          bool found = false;
923 
924          agx_index src = I->src[agx_predecessor_index(ctx->block, *pred)];
925          if (src.type != AGX_INDEX_NORMAL)
926             continue;
927 
928          unsigned v = src.value;
929          for (unsigned i = 0; i < sp->nW_exit; ++i) {
930             if (sp->W_exit[i] == v) {
931                found = true;
932                break;
933             }
934          }
935 
936          all_found &= found;
937       }
938 
939       /* Heuristic: if any phi source is spilled, spill the whole phi. This is
940        * suboptimal, but it massively reduces pointless fill/spill chains with
941        * massive phi webs.
942        */
943       if (!all_found)
944          continue;
945 
946       candidates[j++] = (struct candidate){
947          .node = I->dest[0].value,
948          .dist = search_next_uses(&sb->next_use_in, I->dest[0].value),
949       };
950    }
951 
952    /* Sort by next-use distance */
953    util_qsort_r(candidates, j, sizeof(struct candidate), cmp_dist, ctx);
954 
955    /* Take as much as we can */
956    for (unsigned i = 0; i < j; ++i) {
957       unsigned node = candidates[i].node;
958       unsigned comps = node_size(ctx, node);
959 
960       if ((ctx->nW + comps) <= ctx->k) {
961          insert_W(ctx, node);
962          sb->W_entry[sb->nW_entry++] = node;
963       }
964    }
965 
966    assert(ctx->nW <= ctx->k && "invariant");
967 
968    free(freq);
969    free(candidates);
970 }
971 
972 /*
973  * We initialize S with the union of S at the exit of (forward edge)
974  * predecessors and the complement of W, intersected with the live-in set. The
975  * former propagates S forward. The latter ensures we spill along the edge when
976  * a live value is not selected for the entry W.
977  */
978 static ATTRIBUTE_NOINLINE void
979 compute_s_entry(struct spill_ctx *ctx)
980 {
981    unsigned v;
982 
983    agx_foreach_predecessor(ctx->block, pred) {
984       struct spill_block *sp = spill_block(ctx, *pred);
985 
986       for (unsigned i = 0; i < sp->nS_exit; ++i) {
987          v = sp->S_exit[i];
988 
989          if (BITSET_TEST(ctx->block->live_in, v))
990             BITSET_SET(ctx->S, v);
991       }
992    }
993 
994    BITSET_FOREACH_SET(v, ctx->block->live_in, ctx->n) {
995       if (!BITSET_TEST(ctx->W, v))
996          BITSET_SET(ctx->S, v);
997    }
998 
999    /* Copy ctx->S to S_entry for later look-ups with coupling code */
1000    struct spill_block *sb = spill_block(ctx, ctx->block);
1001    unsigned nS = __bitset_count(ctx->S, BITSET_WORDS(ctx->n));
1002    sb->S_entry = ralloc_array(ctx->memctx, unsigned, nS);
1003 
1004    int i;
1005    BITSET_FOREACH_SET(i, ctx->S, ctx->n)
1006       sb->S_entry[sb->nS_entry++] = i;
1007 
1008    assert(sb->nS_entry == nS);
1009 }
1010 
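/*
 * Global next-use distances are computed by a backwards dataflow analysis: a
 * block's entry distances are its exit distances plus the block's cycle count,
 * overridden by local first uses and by ∞ for values the block defines; the
 * join over successors is a pointwise minimum, iterated to a fixed point with
 * a worklist. Phis are modelled on the incoming edges, so a phi kills its
 * destination and makes the per-predecessor source live at distance 0.
 */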
1011 static ATTRIBUTE_NOINLINE void
1012 global_next_use_distances(agx_context *ctx, void *memctx,
1013                           struct spill_block *blocks)
1014 {
1015    u_worklist worklist;
1016    u_worklist_init(&worklist, ctx->num_blocks, NULL);
1017 
1018    agx_foreach_block(ctx, block) {
1019       struct spill_block *sb = &blocks[block->index];
1020 
1021       init_next_uses(&sb->next_use_in, memctx);
1022       init_next_uses(&sb->next_use_out, memctx);
1023 
1024       agx_foreach_instr_in_block(block, I) {
1025          sb->cycles += instr_cycles(I);
1026       }
1027 
1028       agx_worklist_push_head(&worklist, block);
1029    }
1030 
1031    /* Definitions that have been seen */
1032    BITSET_WORD *defined =
1033       malloc(BITSET_WORDS(ctx->alloc) * sizeof(BITSET_WORD));
1034 
1035    struct next_uses dists;
1036    init_next_uses(&dists, NULL);
1037 
1038    /* Iterate the work list in reverse order since liveness is backwards */
1039    while (!u_worklist_is_empty(&worklist)) {
1040       agx_block *blk = agx_worklist_pop_head(&worklist);
1041       struct spill_block *sb = &blocks[blk->index];
1042 
1043       /* Reset the set of definitions seen for this block */
1044       memset(defined, 0, BITSET_WORDS(ctx->alloc) * sizeof(BITSET_WORD));
1045 
1046       /* Initialize all distances to infinity */
1047       clear_next_uses(&dists);
1048 
1049       uint32_t cycle = 0;
1050 
1051       /* Calculate dists. Phis are handled separately. */
1052       agx_foreach_instr_in_block(blk, I) {
1053          if (I->op == AGX_OPCODE_PHI) {
1054             cycle++;
1055             continue;
1056          }
1057 
1058          /* Record first use before def. Phi sources are handled
1059           * below, when propagating to the predecessors, because they
1060           * logically happen in the predecessor.
1061           */
1062          agx_foreach_ssa_src(I, s) {
1063             if (BITSET_TEST(defined, I->src[s].value))
1064                continue;
1065             if (search_next_uses(&dists, I->src[s].value) < DIST_INFINITY)
1066                continue;
1067 
1068             assert(I->src[s].value < ctx->alloc);
1069             set_next_use(&dists, I->src[s].value, cycle);
1070          }
1071 
1072          /* Record defs */
1073          agx_foreach_ssa_dest(I, d) {
1074             assert(I->dest[d].value < ctx->alloc);
1075             BITSET_SET(defined, I->dest[d].value);
1076          }
1077 
1078          cycle += instr_cycles(I);
1079       }
1080 
1081       /* Apply transfer function to get our entry state. */
1082       foreach_next_use(&sb->next_use_out, node, dist) {
1083          set_next_use(&sb->next_use_in, node, dist_sum(dist, sb->cycles));
1084       }
1085 
1086       foreach_next_use(&dists, node, dist) {
1087          set_next_use(&sb->next_use_in, node, dist);
1088       }
1089 
1090       int i;
1091       BITSET_FOREACH_SET(i, defined, ctx->alloc) {
1092          set_next_use(&sb->next_use_in, i, DIST_INFINITY);
1093       }
1094 
1095       /* Propagate the live in of the successor (blk) to the live out of
1096        * predecessors.
1097        *
1098        * Phi nodes are logically on the control flow edge and act in parallel.
1099        * To handle this when propagating, we kill writes from phis and make
1100        * live the corresponding sources.
1101        */
1102       agx_foreach_predecessor(blk, pred) {
1103          struct spill_block *sp = &blocks[(*pred)->index];
1104          copy_next_uses(&dists, &sb->next_use_in);
1105 
1106          /* Kill write */
1107          agx_foreach_phi_in_block(blk, I) {
1108             assert(I->dest[0].type == AGX_INDEX_NORMAL);
1109             set_next_use(&dists, I->dest[0].value, DIST_INFINITY);
1110          }
1111 
1112          /* Make live the corresponding source */
1113          agx_foreach_phi_in_block(blk, I) {
1114             agx_index operand = I->src[agx_predecessor_index(blk, *pred)];
1115             if (operand.type == AGX_INDEX_NORMAL)
1116                set_next_use(&dists, operand.value, 0);
1117          }
1118 
1119          /* Join by taking minimum */
1120          if (minimum_next_uses(&sp->next_use_out, &dists))
1121             agx_worklist_push_tail(&worklist, *pred);
1122       }
1123    }
1124 
1125    free(defined);
1126    u_worklist_fini(&worklist);
1127    destroy_next_uses(&dists);
1128 }
1129 
1130 static ATTRIBUTE_NOINLINE void
1131 validate_next_use_info(UNUSED agx_context *ctx,
1132                        UNUSED struct spill_block *blocks)
1133 {
1134 #ifndef NDEBUG
1135    int i;
1136 
1137    agx_foreach_block(ctx, blk) {
1138       struct spill_block *sb = &blocks[blk->index];
1139 
1140       /* Invariant: next-use distance is finite iff the node is live */
1141       BITSET_FOREACH_SET(i, blk->live_in, ctx->alloc)
1142          assert(search_next_uses(&sb->next_use_in, i) < DIST_INFINITY);
1143 
1144       BITSET_FOREACH_SET(i, blk->live_out, ctx->alloc)
1145          assert(search_next_uses(&sb->next_use_out, i) < DIST_INFINITY);
1146 
1147       foreach_next_use(&sb->next_use_in, i, _)
1148          assert(BITSET_TEST(blk->live_in, i));
1149 
1150       foreach_next_use(&sb->next_use_out, i, _)
1151          assert(BITSET_TEST(blk->live_out, i));
1152    }
1153 #endif
1154 }
1155 
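/*
 * Entry point. The overall pipeline: compute global next-use distances, run
 * the per-block min algorithm to insert spills and fills, stitch blocks
 * together with coupling code along the edges, then repair SSA and run DCE to
 * clean up dead code left behind by rematerialization.
 */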
1156 void
1157 agx_spill(agx_context *ctx, unsigned k)
1158 {
1159    void *memctx = ralloc_context(NULL);
1160 
1161    /* We need extra registers for memory-memory swaps */
1162    k -= 8;
1163 
1164    uint8_t *channels = rzalloc_array(memctx, uint8_t, ctx->alloc);
1165    dist_t *next_uses = rzalloc_array(memctx, dist_t, ctx->alloc);
1166    enum agx_size *sizes = rzalloc_array(memctx, enum agx_size, ctx->alloc);
1167    agx_instr **remat = rzalloc_array(memctx, agx_instr *, ctx->alloc);
1168 
1169    agx_foreach_instr_global(ctx, I) {
1170       if (can_remat(I))
1171          remat[I->dest[0].value] = I;
1172 
1173       /* Measure vectors */
1174       agx_foreach_ssa_dest(I, d) {
1175          assert(sizes[I->dest[d].value] == 0 && "broken SSA");
1176          assert(channels[I->dest[d].value] == 0 && "broken SSA");
1177 
1178          sizes[I->dest[d].value] = I->dest[d].size;
1179          channels[I->dest[d].value] = agx_channels(I->dest[d]);
1180       }
1181    }
1182 
1183    struct spill_block *blocks =
1184       rzalloc_array(memctx, struct spill_block, ctx->num_blocks);
1185 
1186    /* Step 1. Compute global next-use distances */
1187    global_next_use_distances(ctx, memctx, blocks);
1188    validate_next_use_info(ctx, blocks);
1189 
1190    /* Reserve a memory variable for every regular variable */
1191    unsigned n = ctx->alloc;
1192    ctx->alloc *= 2;
1193 
1194    BITSET_WORD *W = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(n));
1195    BITSET_WORD *S = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(n));
1196 
1197    agx_foreach_block(ctx, block) {
1198       memset(W, 0, BITSET_WORDS(n) * sizeof(BITSET_WORD));
1199       memset(S, 0, BITSET_WORDS(n) * sizeof(BITSET_WORD));
1200 
1201       struct spill_ctx sctx = {
1202          .memctx = memctx,
1203          .shader = ctx,
1204          .n = n,
1205          .channels = channels,
1206          .size = sizes,
1207          .remat = remat,
1208          .next_uses = next_uses,
1209          .block = block,
1210          .blocks = blocks,
1211          .k = k,
1212          .W = W,
1213          .S = S,
1214          .spill_base = n,
1215       };
1216 
1217       compute_w_entry(&sctx);
1218       compute_s_entry(&sctx);
1219       min_algorithm(&sctx);
1220    }
1221 
1222    /* Now that all blocks are processed separately, stitch it together */
1223    agx_foreach_block(ctx, block) {
1224       struct spill_ctx sctx = {
1225          .memctx = memctx,
1226          .shader = ctx,
1227          .n = n,
1228          .channels = channels,
1229          .size = sizes,
1230          .remat = remat,
1231          .block = block,
1232          .blocks = blocks,
1233          .k = k,
1234          .W = W,
1235          .S = S,
1236          .spill_base = n,
1237       };
1238 
1239       agx_foreach_predecessor(block, pred) {
1240          /* After spilling phi sources, insert coupling code */
1241          insert_coupling_code(&sctx, *pred, block);
1242       }
1243    }
1244 
1245    ralloc_free(memctx);
1246 
1247    /* Spilling breaks SSA, so we need to repair before validating */
1248    agx_repair_ssa(ctx);
1249    agx_validate(ctx, "Spilling");
1250 
1251    /* Remat can introduce dead code */
1252    agx_dce(ctx, false);
1253 }
1254