1 /*
2 * Copyright 2023-2024 Alyssa Rosenzweig
3 * Copyright 2023-2024 Valve Corporation
4 * Copyright 2022 Collabora Ltd.
5 * SPDX-License-Identifier: MIT
6 */
7
8 #include "util/bitset.h"
9 #include "util/hash_table.h"
10 #include "util/ralloc.h"
11 #include "util/u_dynarray.h"
12 #include "util/u_qsort.h"
13 #include "agx_builder.h"
14 #include "agx_compiler.h"
15 #include "agx_opcodes.h"
16
17 /*
18 * An implementation of "Register Spilling and Live-Range Splitting for SSA-Form
19 * Programs" by Braun and Hack.
20 */
21
22 /*
23 * Next-use distances are logically in ℤ ∪ {∞}. Modeled as saturating uint32 and
24 * referred to as dist_t.
25 *
26 * next_uses represents a next-use map. This is a sparse data structure mapping
27 * variable names to next-use dist_t's. Variables with no later use (infinite
28 * next-use distance) are not stored explicitly, making the time/space
29 * requirements O(live variables). This is important for performance and memory
30 * usage on big shaders with many blocks.
31 *
32 * For now, next_uses is backed by a Mesa hash table, but it could be optimized
33 * to something more specialized in the future.
34 */
35 #define DIST_INFINITY (UINT32_MAX)
36 typedef uint32_t dist_t;
37
38 static dist_t
39 dist_sum(dist_t A, dist_t B)
40 {
41 return (A + B < A) ? DIST_INFINITY : (A + B);
42 }
43
44 struct next_uses {
45 struct hash_table_u64 *ht;
46 };
47
48 static void
49 init_next_uses(struct next_uses *nu, void *memctx)
50 {
51 nu->ht = _mesa_hash_table_u64_create(memctx);
52 }
53
54 static void
55 destroy_next_uses(struct next_uses *nu)
56 {
57 _mesa_hash_table_u64_destroy(nu->ht);
58 }
59
60 static void
61 clear_next_uses(struct next_uses *nu)
62 {
63 _mesa_hash_table_u64_clear(nu->ht);
64 }
65
66 static void
67 copy_next_uses(struct next_uses *nu, const struct next_uses *from)
68 {
69 clear_next_uses(nu);
70
71 hash_table_u64_foreach(from->ht, use) {
72 _mesa_hash_table_u64_insert(nu->ht, use.key, use.data);
73 }
74 }
75
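/*
 * Distances stored in the backing hash table are biased by +1 so that a
 * present entry is never NULL; an absent entry therefore encodes an infinite
 * next-use distance.
 */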
76 static void
77 set_next_use(struct next_uses *nu, unsigned node, dist_t dist)
78 {
79 if (dist == DIST_INFINITY) {
80 _mesa_hash_table_u64_remove(nu->ht, node);
81 } else {
82 uintptr_t as_ptr = (uintptr_t)(dist + 1);
83 assert(as_ptr != 0 && "non-NULL");
84
85 _mesa_hash_table_u64_insert(nu->ht, node, (void *)as_ptr);
86 }
87 }
88
89 static dist_t
90 search_next_uses(const struct next_uses *nu, unsigned node)
91 {
92 void *ent = _mesa_hash_table_u64_search(nu->ht, node);
93 if (!ent)
94 return DIST_INFINITY;
95
96 uintptr_t raw = (uintptr_t)ent;
97 return raw - 1;
98 }
99
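/*
 * Iterate the present (finite) entries of a next-use map, un-biasing the
 * stored distance. The single-iteration inner for-loop exists only to declare
 * node/dist as loop-local variables.
 */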
100 #define foreach_next_use(nu, node, dist) \
101 hash_table_u64_foreach((nu)->ht, use_) \
102 for (uint32_t _terminator = 1, node = use_.key, \
103 UNUSED dist = ((uintptr_t)use_.data) - 1; \
104 _terminator != 0; _terminator = 0)
105
106 /*
107 * Calculate the per-node minimum of two next-use sets. Values absent from one
108 * of the underlying sets are treated as infinite, so they never win the
109 * minimum; on the key sets this acts like a set union.
110 */
111 static bool
112 minimum_next_uses(struct next_uses *nu, const struct next_uses *from)
113 {
114 bool progress = false;
115
116 foreach_next_use(from, node, from_dist) {
117 dist_t nu_dist = search_next_uses(nu, node);
118
119 if (from_dist < nu_dist) {
120 set_next_use(nu, node, from_dist);
121 progress = true;
122 }
123 }
124
125 return progress;
126 }
127
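/*
 * Cost model used for next-use distances. For now, every instruction is
 * treated as a single cycle; refining this would only change which values the
 * heuristics prefer to evict, not correctness.
 */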
128 static uint32_t
129 instr_cycles(const agx_instr *I)
130 {
131 return 1;
132 }
133
134 struct spill_block {
135 /* Set of values available in the register file at the end */
136 unsigned W_exit[AGX_NUM_REGS];
137 unsigned nW_exit;
138
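   /* Set of values available in the register file at the start */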
139 unsigned W_entry[AGX_NUM_REGS];
140 unsigned nW_entry;
141
142 /* Set of live-out spilled values at the end of the block */
143 unsigned *S_exit;
144 unsigned nS_exit;
145
146 unsigned *S_entry;
147 unsigned nS_entry;
148
149 /* Estimate */
150 uint32_t cycles;
151
152 /* Next-use maps at the start/end of the block */
153 struct next_uses next_use_in;
154 struct next_uses next_use_out;
155 };
156
157 struct spill_ctx {
158 void *memctx;
159 agx_context *shader;
160 agx_block *block;
161
162 /* Set of values currently available in the register file */
163 BITSET_WORD *W;
164
165 /* |W| = Current register pressure */
166 unsigned nW;
167
168 /* Local IPs of next-use */
169 dist_t *next_uses;
170
171 /* Current local IP relative to the start of the block */
172 uint32_t ip;
173
174 /* Set of live values that have been spilled. Contrary to the paper, this
175 * is not a subset of W: the definition in the paper is bogus.
176 */
177 BITSET_WORD *S;
178
179 /* Widths of vectors */
180 uint8_t *channels;
181 enum agx_size *size;
182
183 /* Mapping of rematerializable values to their definitions, or NULL for nodes
184 * that are not rematerializable.
185 */
186 agx_instr **remat;
187
188 /* Maximum register pressure allowed */
189 unsigned k;
190
191 /* Number of variables */
192 unsigned n;
193
194 /* Information on blocks indexed in source order */
195 struct spill_block *blocks;
196
197 /* Base memory index reserved for spilled indices */
198 unsigned spill_base;
199 };
200
201 static inline struct spill_block *
202 spill_block(struct spill_ctx *ctx, agx_block *block)
203 {
204 return &ctx->blocks[block->index];
205 }
206
207 /* Calculate the register demand of a node. This is rounded up to a power-of-two
208 * to match the equivalent calculations in RA.
209 */
210 static unsigned
211 node_size(struct spill_ctx *ctx, unsigned node)
212 {
213 return util_next_power_of_two(ctx->channels[node]) *
214 agx_size_align_16(ctx->size[node]);
215 }
216
217 /*
218 * Map a control flow edge to a block. Assumes no critical edges.
219 */
220 static agx_block *
221 agx_edge_to_block(agx_block *pred, agx_block *succ)
222 {
223 /* End of predecessor is unique if there's a single successor */
224 if (agx_num_successors(pred) == 1)
225 return pred;
226
227 /* The predecessor has multiple successors, meaning this is not the only
228 * edge leaving the predecessor. Therefore, it is the only edge entering
229 * the successor (otherwise the edge would be critical), so the start of
230 * the successor is unique.
231 */
232 assert(agx_num_predecessors(succ) == 1 && "critical edge detected");
233 return succ;
234 }
235
236 /*
237 * Get a cursor to insert along a control flow edge: either at the start of the
238 * successor or the end of the predecessor. This relies on the control flow
239 * graph having no critical edges.
240 */
241 static agx_cursor
242 agx_along_edge(agx_block *pred, agx_block *succ)
243 {
244 agx_block *to = agx_edge_to_block(pred, succ);
245
246 if (to == pred)
247 return agx_after_block_logical(pred);
248 else
249 return agx_before_block(succ);
250 }
251
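/*
 * Each GPR variable v is shadowed by a memory variable at index mem_base + v,
 * where mem_base is ctx->spill_base (the original variable count). Spills and
 * reloads are just moves between a variable and its memory shadow.
 */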
252 static inline agx_index
253 agx_index_as_mem(agx_index idx, unsigned mem_base)
254 {
255 assert(idx.type == AGX_INDEX_NORMAL);
256 assert(!idx.memory);
257 idx.memory = true;
258 idx.value = mem_base + idx.value;
259 return idx;
260 }
261
262 static unsigned
263 chase_mem_index(agx_index ref, unsigned mem_base)
264 {
265 assert(ref.type == AGX_INDEX_NORMAL);
266 return ref.memory ? (ref.value - mem_base) : ref.value;
267 }
268
269 static agx_index
270 reconstruct_index(struct spill_ctx *ctx, unsigned node)
271 {
272 return agx_get_vec_index(node, ctx->size[node], ctx->channels[node]);
273 }
274
275 static bool
276 can_remat(agx_instr *I)
277 {
278 switch (I->op) {
279 case AGX_OPCODE_MOV_IMM:
280 case AGX_OPCODE_GET_SR:
281 return true;
282 default:
283 return false;
284 }
285 }
286
287 static agx_instr *
288 remat_to(agx_builder *b, agx_index dst, struct spill_ctx *ctx, unsigned node)
289 {
290 agx_instr *I = ctx->remat[node];
291 assert(can_remat(I));
292
293 switch (I->op) {
294 case AGX_OPCODE_MOV_IMM:
295 return agx_mov_imm_to(b, dst, I->imm);
296 case AGX_OPCODE_GET_SR:
297 return agx_get_sr_to(b, dst, I->sr);
298 default:
299 unreachable("invalid remat");
300 }
301 }
302
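/*
 * Store a value to its memory shadow. Rematerializable values are never
 * stored; they are recreated from their defining instruction at their reload
 * points instead.
 */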
303 static void
304 insert_spill(agx_builder *b, struct spill_ctx *ctx, unsigned node)
305 {
306 if (!ctx->remat[node]) {
307 agx_index idx = reconstruct_index(ctx, node);
308 agx_mov_to(b, agx_index_as_mem(idx, ctx->spill_base), idx);
309
310 /* We only need the extra registers reserved if we actually spilled
311 * instead of just rematerializing.
312 */
313 b->shader->has_spill_pcopy_reserved = true;
314 }
315 }
316
317 static void
318 insert_reload(struct spill_ctx *ctx, agx_block *block, agx_cursor cursor,
319 unsigned node)
320 {
321 agx_builder b = agx_init_builder(ctx->shader, cursor);
322 agx_index idx = reconstruct_index(ctx, node);
323
324 /* Reloading breaks SSA, but agx_repair_ssa will repair */
325 if (ctx->remat[node]) {
326 remat_to(&b, idx, ctx, node);
327 } else {
328 agx_mov_to(&b, idx, agx_index_as_mem(idx, ctx->spill_base));
329 }
330 }
331
332 /* Insert into the register file */
333 static void
334 insert_W(struct spill_ctx *ctx, unsigned v)
335 {
336 assert(v < ctx->n);
337 assert(!BITSET_TEST(ctx->W, v));
338
339 BITSET_SET(ctx->W, v);
340 ctx->nW += node_size(ctx, v);
341 }
342
343 /* Remove from the register file */
344 static void
345 remove_W(struct spill_ctx *ctx, unsigned v)
346 {
347 assert(v < ctx->n);
348 assert(BITSET_TEST(ctx->W, v));
349
350 BITSET_CLEAR(ctx->W, v);
351 ctx->nW -= node_size(ctx, v);
352 }
353
354 static void
355 remove_W_if_present(struct spill_ctx *ctx, unsigned v)
356 {
357 assert(v < ctx->n);
358
359 if (BITSET_TEST(ctx->W, v))
360 remove_W(ctx, v);
361 }
362
363 struct candidate {
364 unsigned node;
365 dist_t dist;
366 };
367
368 static int
369 cmp_dist(const void *left_, const void *right_, void *ctx_)
370 {
371 struct spill_ctx *ctx = ctx_;
372 const struct candidate *left = left_;
373 const struct candidate *right = right_;
374
375 /* We assume that rematerializing - even before every instruction - is
376 * cheaper than spilling. As long as one of the nodes is rematerializable
377 * (with distance > 0), we choose it over spilling. Within a class of nodes
378 * (rematerializable or not), compare by next-use-distance.
379 */
380 bool remat_left = ctx->remat[left->node] != NULL && left->dist > 0;
381 bool remat_right = ctx->remat[right->node] != NULL && right->dist > 0;
382
383 if (remat_left != remat_right)
384 return remat_left ? 1 : -1;
385 else
386 return (left->dist > right->dist) - (left->dist < right->dist);
387 }
388
389 /*
390 * Limit the register file W to maximum size m by evicting registers.
391 */
392 static ATTRIBUTE_NOINLINE void
393 limit(struct spill_ctx *ctx, agx_instr *I, unsigned m)
394 {
395 /* Nothing to do if we're already below the limit */
396 if (ctx->nW <= m)
397 return;
398
399 /* Gather candidates for eviction. Note that next_uses gives IPs whereas
400 * cmp_dist expects relative distances. This requires us to subtract ctx->ip
401 * to ensure that cmp_dist works properly. Even though logically it shouldn't
402 * affect the sorted order, practically this matters for correctness with
403 * rematerialization. See the dist=0 test in cmp_dist.
404 */
405 struct candidate *candidates = alloca(ctx->nW * sizeof(struct candidate));
406 unsigned j = 0;
407
408 int i;
409 BITSET_FOREACH_SET(i, ctx->W, ctx->n) {
410 assert(j < ctx->nW);
411
412 candidates[j++] = (struct candidate){
413 .node = i,
414 .dist = ctx->next_uses[i] - ctx->ip,
415 };
416 }
417
418 /* Sort by next-use distance */
419 util_qsort_r(candidates, j, sizeof(struct candidate), cmp_dist, ctx);
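
/* After sorting, the values we most want to keep (not rematerializable,
 * nearest next use) come first; rematerializable values and the furthest
 * next uses end up at the tail, which is what gets evicted below.
 */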
420
421 /* Evict what doesn't fit */
422 unsigned new_weight = 0;
423
424 for (i = 0; i < j; ++i) {
425 unsigned v = candidates[i].node;
426 unsigned comps = node_size(ctx, v);
427
428 if ((new_weight + comps) <= m) {
429 new_weight += comps;
430 } else {
431 /* Insert a spill if we haven't spilled before and there is
432 * another use
433 */
434 if (!BITSET_TEST(ctx->S, v) && candidates[i].dist < DIST_INFINITY) {
435 agx_builder b = agx_init_builder(ctx->shader, agx_before_instr(I));
436 insert_spill(&b, ctx, v);
437 BITSET_SET(ctx->S, v);
438 }
439
440 remove_W(ctx, v);
441
442 /* We keep going in case we can pack in a scalar */
443 }
444 }
445 }
446
447 /*
448 * Insert coupling code on block boundaries. This must ensure:
449 *
450 * - anything live-in we expect to have spilled is spilled
451 * - anything live-in we expect to have filled is filled
452 * - phi sources are spilled if the destination is spilled
453 * - phi sources are filled if the destination is not spilled
454 *
455 * The latter two requirements ensure correct pressure calculations for phis.
456 */
457 static ATTRIBUTE_NOINLINE void
458 insert_coupling_code(struct spill_ctx *ctx, agx_block *pred, agx_block *succ)
459 {
460 struct spill_block *sp = spill_block(ctx, pred);
461 struct spill_block *ss = spill_block(ctx, succ);
462
463 agx_foreach_phi_in_block(succ, I) {
464 if (!I->dest[0].memory)
465 continue;
466
467 agx_builder b =
468 agx_init_builder(ctx->shader, agx_before_function(ctx->shader));
469
470 unsigned s = agx_predecessor_index(succ, pred);
471
472 /* Copy immediate/uniform phi sources to memory variables at the start of
473 * the program, where pressure is zero and hence the copy is legal.
474 */
475 if (I->src[s].type != AGX_INDEX_NORMAL) {
476 assert(I->src[s].type == AGX_INDEX_IMMEDIATE ||
477 I->src[s].type == AGX_INDEX_UNIFORM);
478
479 agx_index mem = agx_temp_like(ctx->shader, I->dest[0]);
480 assert(mem.memory);
481
482 agx_index gpr = agx_temp_like(ctx->shader, I->dest[0]);
483 gpr.memory = false;
484
485 if (I->src[s].type == AGX_INDEX_IMMEDIATE)
486 agx_mov_imm_to(&b, gpr, I->src[s].value);
487 else
488 agx_mov_to(&b, gpr, I->src[s]);
489
490 agx_mov_to(&b, mem, gpr);
491 I->src[s] = mem;
492 continue;
493 }
494
495 bool spilled = false;
496 for (unsigned i = 0; i < sp->nS_exit; ++i) {
497 if (sp->S_exit[i] == I->src[s].value) {
498 spilled = true;
499 break;
500 }
501 }
502
503 if (!spilled) {
504 /* Spill the phi source. TODO: avoid redundant spills here */
505 agx_builder b =
506 agx_init_builder(ctx->shader, agx_after_block_logical(pred));
507
508 insert_spill(&b, ctx, I->src[s].value);
509 }
510
511 if (ctx->remat[I->src[s].value]) {
512 unsigned node = I->src[s].value;
513 agx_index idx = reconstruct_index(ctx, node);
514 agx_index tmp = agx_temp_like(ctx->shader, idx);
515
516 remat_to(&b, tmp, ctx, node);
517 agx_mov_to(&b, agx_index_as_mem(idx, ctx->spill_base), tmp);
518 }
519
520 /* Use the spilled version */
521 I->src[s] = agx_index_as_mem(I->src[s], ctx->spill_base);
522 }
523
524 /* Anything assumed to be spilled at the start of succ must be spilled along
525 * all edges.
526 */
527 for (unsigned i = 0; i < ss->nS_entry; ++i) {
528 unsigned v = ss->S_entry[i];
529
530 bool spilled = false;
531 for (unsigned j = 0; j < sp->nS_exit; ++j) {
532 if (sp->S_exit[j] == v) {
533 spilled = true;
534 break;
535 }
536 }
537
538 /* We handle spilling phi destinations separately */
539 agx_foreach_phi_in_block(succ, phi) {
540 if (chase_mem_index(phi->dest[0], ctx->spill_base) == v) {
541 spilled = true;
542 break;
543 }
544 }
545
546 if (spilled)
547 continue;
548
549 agx_builder b = agx_init_builder(ctx->shader, agx_along_edge(pred, succ));
550 insert_spill(&b, ctx, v);
551 }
552
553 /* Variables in W at the start of succ must be defined along the edge. */
554 for (unsigned i = 0; i < ss->nW_entry; ++i) {
555 unsigned node = ss->W_entry[i];
556 bool defined = false;
557
558 /* Variables live at the end of the predecessor are live along the edge */
559 for (unsigned j = 0; j < sp->nW_exit; ++j) {
560 if (sp->W_exit[j] == node) {
561 defined = true;
562 break;
563 }
564 }
565
566 /* Phis are defined along the edge */
567 agx_foreach_phi_in_block(succ, phi) {
568 if (phi->dest[0].value == node) {
569 defined = true;
570 break;
571 }
572 }
573
574 if (defined)
575 continue;
576
577 /* Otherwise, inserting a reload defines the variable along the edge */
578 agx_block *reload_block = agx_edge_to_block(pred, succ);
579 insert_reload(ctx, reload_block, agx_along_edge(pred, succ), node);
580 }
581
582 agx_foreach_phi_in_block(succ, I) {
583 if (I->dest[0].memory)
584 continue;
585
586 unsigned s = agx_predecessor_index(succ, pred);
587
588 /* Treat immediate/uniform phi sources as registers for pressure
589 * accounting and phi lowering purposes. Parallel copy lowering can handle
590 * a copy from an immediate/uniform to a register, but not from an
591 * immediate/uniform directly to memory.
592 */
593 if (I->src[s].type != AGX_INDEX_NORMAL) {
594 assert(I->src[s].type == AGX_INDEX_IMMEDIATE ||
595 I->src[s].type == AGX_INDEX_UNIFORM);
596
597 continue;
598 }
599
600 bool live = false;
601 for (unsigned i = 0; i < sp->nW_exit; ++i) {
602 if (sp->W_exit[i] == I->src[s].value) {
603 live = true;
604 break;
605 }
606 }
607
608 /* Fill the phi source in the predecessor */
609 if (!live) {
610 agx_block *reload_block = agx_edge_to_block(pred, succ);
611 insert_reload(ctx, reload_block, agx_along_edge(pred, succ),
612 I->src[s].value);
613 }
614
615 /* Leave as-is for the GPR version */
616 assert(!I->src[s].memory);
617 }
618 }
619
620 /*
621 * Produce an array of next-use IPs relative to the start of the block. This is
622 * an array of dist_t scalars, representing the next-use IP of each SSA dest
623 * (right-to-left) and SSA source (left-to-right) of each non-phi instruction
624 * in the block (bottom-to-top), so its size is the # of such dests + sources.
625 */
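/*
 * The array is filled bottom-to-top here and consumed from the end by
 * min_algorithm(), which walks the block top-to-bottom.
 */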
626 static ATTRIBUTE_NOINLINE void
627 calculate_local_next_use(struct spill_ctx *ctx, struct util_dynarray *out)
628 {
629 struct spill_block *sb = spill_block(ctx, ctx->block);
630 unsigned ip = sb->cycles;
631
632 util_dynarray_init(out, NULL);
633
634 struct next_uses nu;
635 init_next_uses(&nu, NULL);
636
637 foreach_next_use(&sb->next_use_out, i, dist) {
638 set_next_use(&nu, i, dist_sum(ip, dist));
639 }
640
641 agx_foreach_instr_in_block_rev(ctx->block, I) {
642 ip -= instr_cycles(I);
643
644 if (I->op != AGX_OPCODE_PHI) {
645 agx_foreach_ssa_dest_rev(I, d) {
646 unsigned v = I->dest[d].value;
647
648 util_dynarray_append(out, dist_t, search_next_uses(&nu, v));
649 }
650
651 agx_foreach_ssa_src(I, s) {
652 unsigned v = I->src[s].value;
653
654 util_dynarray_append(out, dist_t, search_next_uses(&nu, v));
655 set_next_use(&nu, v, ip);
656 }
657 }
658 }
659
660 assert(ip == 0 && "cycle counting is consistent");
661 destroy_next_uses(&nu);
662 }
663
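/*
 * Return a cursor logically before I. Exports at the end of a block execute in
 * parallel, so if I is an export, hoist the cursor above the contiguous run of
 * exports that I belongs to.
 */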
664 static agx_cursor
665 agx_before_instr_logical(agx_block *block, agx_instr *I)
666 {
667 if (I->op == AGX_OPCODE_EXPORT) {
668 agx_instr *first = agx_first_instr(block);
669
670 while (I != first && I->op == AGX_OPCODE_EXPORT) {
671 I = agx_prev_op(I);
672 }
673
674 if (I == first && first->op == AGX_OPCODE_EXPORT) {
675 return agx_before_block(block);
676 } else {
677 return agx_after_instr(I);
678 }
679 } else {
680 return agx_before_instr(I);
681 }
682 }
683
684 /*
685 * Insert spills/fills for a single basic block, following Belady's algorithm.
686 * Corresponds to minAlgorithm from the paper.
687 */
688 static ATTRIBUTE_NOINLINE void
689 min_algorithm(struct spill_ctx *ctx)
690 {
691 struct spill_block *sblock = spill_block(ctx, ctx->block);
692 struct util_dynarray local_next_ip;
693 calculate_local_next_use(ctx, &local_next_ip);
694
695 /* next_uses gives the distance from the start of the block, so prepopulate
696 * with next_use_in.
697 */
698 foreach_next_use(&sblock->next_use_in, key, dist) {
699 assert(key < ctx->n);
700 ctx->next_uses[key] = dist;
701 }
702
703 dist_t *next_ips = util_dynarray_element(&local_next_ip, dist_t, 0);
704 unsigned next_use_cursor =
705 util_dynarray_num_elements(&local_next_ip, dist_t);
706
707 /* Iterate each instruction in forward order */
708 agx_foreach_instr_in_block(ctx->block, I) {
709 assert(ctx->nW <= ctx->k && "invariant");
710
711 /* Debug to check against our RA demand calculations */
712 if (0) {
713 printf("%u: ", ctx->nW);
714 agx_print_instr(I, stdout);
715 }
716
717 /* Phis are special since they happen along the edge. When we initialized
718 * W and S, we implicitly chose which phis are spilled. So, here we just
719 * need to rewrite the phis to write into memory.
720 *
721 * Phi sources are handled later.
722 */
723 if (I->op == AGX_OPCODE_PHI) {
724 if (!BITSET_TEST(ctx->W, I->dest[0].value)) {
725 I->dest[0] = agx_index_as_mem(I->dest[0], ctx->spill_base);
726 }
727
728 ctx->ip += instr_cycles(I);
729 continue;
730 }
731
732 /* Any source that is not in W needs to be reloaded. Gather the set R of
733 * such values.
734 */
735 unsigned R[AGX_MAX_NORMAL_SOURCES];
736 unsigned nR = 0;
737
738 agx_foreach_ssa_src(I, s) {
739 unsigned node = I->src[s].value;
740 if (BITSET_TEST(ctx->W, node))
741 continue;
742
743 /* Mark this variable as needing a reload. */
744 assert(node < ctx->n);
745 assert(BITSET_TEST(ctx->S, node) && "must have been spilled");
746 assert(nR < ARRAY_SIZE(R) && "maximum source count");
747 R[nR++] = node;
748
749 /* The inserted reload will add the value to the register file. */
750 insert_W(ctx, node);
751 }
752
753 /* Limit W to make space for the sources we just added */
754 limit(ctx, I, ctx->k);
755
756 /* Update next-use distances for this instruction. Unlike the paper, we
757 * prune dead values from W as we go. This doesn't affect correctness, but
758 * it speeds up limit() on average.
759 */
760 agx_foreach_ssa_src_rev(I, s) {
761 assert(next_use_cursor >= 1);
762
763 unsigned next_ip = next_ips[--next_use_cursor];
764 assert((next_ip == DIST_INFINITY) == I->src[s].kill);
765
766 if (next_ip == DIST_INFINITY)
767 remove_W_if_present(ctx, I->src[s].value);
768 else
769 ctx->next_uses[I->src[s].value] = next_ip;
770 }
771
772 agx_foreach_ssa_dest(I, d) {
773 assert(next_use_cursor >= 1);
774 unsigned next_ip = next_ips[--next_use_cursor];
775
776 if (next_ip == DIST_INFINITY)
777 remove_W_if_present(ctx, I->dest[d].value);
778 else
779 ctx->next_uses[I->dest[d].value] = next_ip;
780 }
781
782 /* Count how many registers we need for destinations. Because of
783 * SSA form, destinations are unique.
784 */
785 unsigned dest_size = 0;
786 agx_foreach_ssa_dest(I, d) {
787 dest_size += node_size(ctx, I->dest[d].value);
788 }
789
790 /* Limit W to make space for the destinations. */
791 limit(ctx, I, ctx->k - dest_size);
792
793 /* Destinations are now in the register file */
794 agx_foreach_ssa_dest(I, d) {
795 insert_W(ctx, I->dest[d].value);
796 }
797
798 /* Add reloads for the sources in front of the instruction. We need to be
799 * careful around exports, hoisting the reloads to before all exports.
800 *
801 * This is legal since all exports happen in parallel and all registers
802 * are dead after the exports. The register file
803 * must be big enough for everything exported, so it must be big enough
804 * for all the reloaded values right before the parallel exports.
805 */
806 for (unsigned i = 0; i < nR; ++i) {
807 insert_reload(ctx, ctx->block, agx_before_instr_logical(ctx->block, I),
808 R[i]);
809 }
810
811 ctx->ip += instr_cycles(I);
812 }
813
814 assert(next_use_cursor == 0 && "exactly sized");
815
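   /* Record the register file and spill set at the end of the block, for use
    * by successors' entry calculations and by the coupling code.
    */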
816 int i;
817 BITSET_FOREACH_SET(i, ctx->W, ctx->n)
818 sblock->W_exit[sblock->nW_exit++] = i;
819
820 unsigned nS = __bitset_count(ctx->S, BITSET_WORDS(ctx->n));
821 sblock->S_exit = ralloc_array(ctx->memctx, unsigned, nS);
822
823 BITSET_FOREACH_SET(i, ctx->S, ctx->n)
824 sblock->S_exit[sblock->nS_exit++] = i;
825
826 assert(nS == sblock->nS_exit);
827 util_dynarray_fini(&local_next_ip);
828 }
829
830 /*
831 * TODO: Implement section 4.2 of the paper.
832 *
833 * For now, we implement the simpler heuristic in Hack's thesis: sort
834 * the live-in set (+ destinations of phis) by next-use distance.
835 */
836 static ATTRIBUTE_NOINLINE void
837 compute_w_entry_loop_header(struct spill_ctx *ctx)
838 {
839 agx_block *block = ctx->block;
840 struct spill_block *sb = spill_block(ctx, block);
841
842 unsigned nP = __bitset_count(block->live_in, BITSET_WORDS(ctx->n));
843 struct candidate *candidates = calloc(nP, sizeof(struct candidate));
844 unsigned j = 0;
845
846 foreach_next_use(&sb->next_use_in, i, dist) {
847 assert(j < nP);
848 candidates[j++] = (struct candidate){.node = i, .dist = dist};
849 }
850
851 assert(j == nP);
852
853 /* Sort by next-use distance */
854 util_qsort_r(candidates, j, sizeof(struct candidate), cmp_dist, ctx);
855
856 /* Take as much as we can */
857 for (unsigned i = 0; i < j; ++i) {
858 unsigned node = candidates[i].node;
859 unsigned comps = node_size(ctx, node);
860
861 if ((ctx->nW + comps) <= ctx->k) {
862 insert_W(ctx, node);
863 sb->W_entry[sb->nW_entry++] = node;
864 }
865 }
866
867 assert(ctx->nW <= ctx->k);
868 free(candidates);
869 }
870
871 /*
872 * Compute W_entry for a block. Section 4.2 in the paper.
873 */
874 static ATTRIBUTE_NOINLINE void
875 compute_w_entry(struct spill_ctx *ctx)
876 {
877 agx_block *block = ctx->block;
878 struct spill_block *sb = spill_block(ctx, block);
879
880 /* Nothing to do for start blocks */
881 if (agx_num_predecessors(block) == 0)
882 return;
883
884 /* Loop headers have a different heuristic */
885 if (block->loop_header) {
886 compute_w_entry_loop_header(ctx);
887 return;
888 }
889
890 /* Usual blocks follow */
891 unsigned *freq = calloc(ctx->n, sizeof(unsigned));
892
893 /* Record what's written at the end of each predecessor */
894 agx_foreach_predecessor(ctx->block, P) {
895 struct spill_block *sp = spill_block(ctx, *P);
896
897 for (unsigned i = 0; i < sp->nW_exit; ++i) {
898 unsigned v = sp->W_exit[i];
899 freq[v]++;
900 }
901 }
902
903 struct candidate *candidates = calloc(ctx->n, sizeof(struct candidate));
904 unsigned j = 0;
905
906 /* Variables in W_exit of every predecessor are assumed to be in W_entry.
907 * Phis and variables in only some predecessors are scored by next-use.
908 */
909 foreach_next_use(&sb->next_use_in, i, dist) {
910 if (freq[i] == agx_num_predecessors(ctx->block)) {
911 insert_W(ctx, i);
912 } else if (freq[i]) {
913 candidates[j++] = (struct candidate){.node = i, .dist = dist};
914 }
915 }
916
917 agx_foreach_phi_in_block(ctx->block, I) {
918 bool all_found = true;
919
920 agx_foreach_predecessor(ctx->block, pred) {
921 struct spill_block *sp = spill_block(ctx, *pred);
922 bool found = false;
923
924 agx_index src = I->src[agx_predecessor_index(ctx->block, *pred)];
925 if (src.type != AGX_INDEX_NORMAL)
926 continue;
927
928 unsigned v = src.value;
929 for (unsigned i = 0; i < sp->nW_exit; ++i) {
930 if (sp->W_exit[i] == v) {
931 found = true;
932 break;
933 }
934 }
935
936 all_found &= found;
937 }
938
939 /* Heuristic: if any phi source is spilled, spill the whole phi. This is
940 * suboptimal, but it massively reduces pointless fill/spill chains with
941 * large phi webs.
942 */
943 if (!all_found)
944 continue;
945
946 candidates[j++] = (struct candidate){
947 .node = I->dest[0].value,
948 .dist = search_next_uses(&sb->next_use_in, I->dest[0].value),
949 };
950 }
951
952 /* Sort by next-use distance */
953 util_qsort_r(candidates, j, sizeof(struct candidate), cmp_dist, ctx);
954
955 /* Take as much as we can */
956 for (unsigned i = 0; i < j; ++i) {
957 unsigned node = candidates[i].node;
958 unsigned comps = node_size(ctx, node);
959
960 if ((ctx->nW + comps) <= ctx->k) {
961 insert_W(ctx, node);
962 sb->W_entry[sb->nW_entry++] = node;
963 }
964 }
965
966 assert(ctx->nW <= ctx->k && "invariant");
967
968 free(freq);
969 free(candidates);
970 }
971
972 /*
973 * We initialize S with the union of S at the exit of (forward edge)
974 * predecessors and the complement of W, intersected with the live-in set. The
975 * former propagates S forward. The latter ensures we spill along the edge when
976 * a live value is not selected for the entry W.
977 */
978 static ATTRIBUTE_NOINLINE void
979 compute_s_entry(struct spill_ctx *ctx)
980 {
981 unsigned v;
982
983 agx_foreach_predecessor(ctx->block, pred) {
984 struct spill_block *sp = spill_block(ctx, *pred);
985
986 for (unsigned i = 0; i < sp->nS_exit; ++i) {
987 v = sp->S_exit[i];
988
989 if (BITSET_TEST(ctx->block->live_in, v))
990 BITSET_SET(ctx->S, v);
991 }
992 }
993
994 BITSET_FOREACH_SET(v, ctx->block->live_in, ctx->n) {
995 if (!BITSET_TEST(ctx->W, v))
996 BITSET_SET(ctx->S, v);
997 }
998
999 /* Copy ctx->S to S_entry for later look-ups with coupling code */
1000 struct spill_block *sb = spill_block(ctx, ctx->block);
1001 unsigned nS = __bitset_count(ctx->S, BITSET_WORDS(ctx->n));
1002 sb->S_entry = ralloc_array(ctx->memctx, unsigned, nS);
1003
1004 int i;
1005 BITSET_FOREACH_SET(i, ctx->S, ctx->n)
1006 sb->S_entry[sb->nS_entry++] = i;
1007
1008 assert(sb->nS_entry == nS);
1009 }
1010
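/*
 * Compute global next-use distances with a backwards data-flow analysis: each
 * block's next_use_in comes from applying the local transfer function to its
 * next_use_out, and a predecessor's next_use_out is the per-node minimum over
 * its successors (with phi sources treated as used on the edge). Iterate with
 * a worklist until fixed point.
 */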
1011 static ATTRIBUTE_NOINLINE void
1012 global_next_use_distances(agx_context *ctx, void *memctx,
1013 struct spill_block *blocks)
1014 {
1015 u_worklist worklist;
1016 u_worklist_init(&worklist, ctx->num_blocks, NULL);
1017
1018 agx_foreach_block(ctx, block) {
1019 struct spill_block *sb = &blocks[block->index];
1020
1021 init_next_uses(&sb->next_use_in, memctx);
1022 init_next_uses(&sb->next_use_out, memctx);
1023
1024 agx_foreach_instr_in_block(block, I) {
1025 sb->cycles += instr_cycles(I);
1026 }
1027
1028 agx_worklist_push_head(&worklist, block);
1029 }
1030
1031 /* Definitions that have been seen */
1032 BITSET_WORD *defined =
1033 malloc(BITSET_WORDS(ctx->alloc) * sizeof(BITSET_WORD));
1034
1035 struct next_uses dists;
1036 init_next_uses(&dists, NULL);
1037
1038 /* Iterate the work list in reverse order since liveness is backwards */
1039 while (!u_worklist_is_empty(&worklist)) {
1040 agx_block *blk = agx_worklist_pop_head(&worklist);
1041 struct spill_block *sb = &blocks[blk->index];
1042
1043 /* Definitions that have been seen */
1044 memset(defined, 0, BITSET_WORDS(ctx->alloc) * sizeof(BITSET_WORD));
1045
1046 /* Initialize all distances to infinity */
1047 clear_next_uses(&dists);
1048
1049 uint32_t cycle = 0;
1050
1051 /* Calculate dists. Phis are handled separately. */
1052 agx_foreach_instr_in_block(blk, I) {
1053 if (I->op == AGX_OPCODE_PHI) {
1054 cycle++;
1055 continue;
1056 }
1057
1058 /* Record first use before def. Phi sources are handled
1059 * separately, because they logically happen in the
1060 * predecessor.
1061 */
1062 agx_foreach_ssa_src(I, s) {
1063 if (BITSET_TEST(defined, I->src[s].value))
1064 continue;
1065 if (search_next_uses(&dists, I->src[s].value) < DIST_INFINITY)
1066 continue;
1067
1068 assert(I->src[s].value < ctx->alloc);
1069 set_next_use(&dists, I->src[s].value, cycle);
1070 }
1071
1072 /* Record defs */
1073 agx_foreach_ssa_dest(I, d) {
1074 assert(I->dest[d].value < ctx->alloc);
1075 BITSET_SET(defined, I->dest[d].value);
1076 }
1077
1078 cycle += instr_cycles(I);
1079 }
1080
1081 /* Apply transfer function to get our entry state. */
1082 foreach_next_use(&sb->next_use_out, node, dist) {
1083 set_next_use(&sb->next_use_in, node, dist_sum(dist, sb->cycles));
1084 }
1085
1086 foreach_next_use(&dists, node, dist) {
1087 set_next_use(&sb->next_use_in, node, dist);
1088 }
1089
1090 int i;
1091 BITSET_FOREACH_SET(i, defined, ctx->alloc) {
1092 set_next_use(&sb->next_use_in, i, DIST_INFINITY);
1093 }
1094
1095 /* Propagate the live in of the successor (blk) to the live out of
1096 * predecessors.
1097 *
1098 * Phi nodes are logically on the control flow edge and act in parallel.
1099 * To handle when propagating, we kill writes from phis and make live the
1100 * corresponding sources.
1101 */
1102 agx_foreach_predecessor(blk, pred) {
1103 struct spill_block *sp = &blocks[(*pred)->index];
1104 copy_next_uses(&dists, &sb->next_use_in);
1105
1106 /* Kill write */
1107 agx_foreach_phi_in_block(blk, I) {
1108 assert(I->dest[0].type == AGX_INDEX_NORMAL);
1109 set_next_use(&dists, I->dest[0].value, DIST_INFINITY);
1110 }
1111
1112 /* Make live the corresponding source */
1113 agx_foreach_phi_in_block(blk, I) {
1114 agx_index operand = I->src[agx_predecessor_index(blk, *pred)];
1115 if (operand.type == AGX_INDEX_NORMAL)
1116 set_next_use(&dists, operand.value, 0);
1117 }
1118
1119 /* Join by taking minimum */
1120 if (minimum_next_uses(&sp->next_use_out, &dists))
1121 agx_worklist_push_tail(&worklist, *pred);
1122 }
1123 }
1124
1125 free(defined);
1126 u_worklist_fini(&worklist);
1127 destroy_next_uses(&dists);
1128 }
1129
1130 static ATTRIBUTE_NOINLINE void
1131 validate_next_use_info(UNUSED agx_context *ctx,
1132 UNUSED struct spill_block *blocks)
1133 {
1134 #ifndef NDEBUG
1135 int i;
1136
1137 agx_foreach_block(ctx, blk) {
1138 struct spill_block *sb = &blocks[blk->index];
1139
1140 /* Invariant: next-use distance is finite iff the node is live */
1141 BITSET_FOREACH_SET(i, blk->live_in, ctx->alloc)
1142 assert(search_next_uses(&sb->next_use_in, i) < DIST_INFINITY);
1143
1144 BITSET_FOREACH_SET(i, blk->live_out, ctx->alloc)
1145 assert(search_next_uses(&sb->next_use_out, i) < DIST_INFINITY);
1146
1147 foreach_next_use(&sb->next_use_in, i, _)
1148 assert(BITSET_TEST(blk->live_in, i));
1149
1150 foreach_next_use(&sb->next_use_out, i, _)
1151 assert(BITSET_TEST(blk->live_out, i));
1152 }
1153 #endif
1154 }
1155
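/*
 * Top-level entry point. Reduces register demand below the given limit by
 * spilling values to memory variables: per-block Belady-style spilling
 * followed by coupling code on block boundaries, then SSA repair and DCE.
 */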
1156 void
1157 agx_spill(agx_context *ctx, unsigned k)
1158 {
1159 void *memctx = ralloc_context(NULL);
1160
1161 /* We need extra registers for memory-memory swaps */
1162 k -= 8;
1163
1164 uint8_t *channels = rzalloc_array(memctx, uint8_t, ctx->alloc);
1165 dist_t *next_uses = rzalloc_array(memctx, dist_t, ctx->alloc);
1166 enum agx_size *sizes = rzalloc_array(memctx, enum agx_size, ctx->alloc);
1167 agx_instr **remat = rzalloc_array(memctx, agx_instr *, ctx->alloc);
1168
1169 agx_foreach_instr_global(ctx, I) {
1170 if (can_remat(I))
1171 remat[I->dest[0].value] = I;
1172
1173 /* Measure vectors */
1174 agx_foreach_ssa_dest(I, d) {
1175 assert(sizes[I->dest[d].value] == 0 && "broken SSA");
1176 assert(channels[I->dest[d].value] == 0 && "broken SSA");
1177
1178 sizes[I->dest[d].value] = I->dest[d].size;
1179 channels[I->dest[d].value] = agx_channels(I->dest[d]);
1180 }
1181 }
1182
1183 struct spill_block *blocks =
1184 rzalloc_array(memctx, struct spill_block, ctx->num_blocks);
1185
1186 /* Step 1. Compute global next-use distances */
1187 global_next_use_distances(ctx, memctx, blocks);
1188 validate_next_use_info(ctx, blocks);
1189
1190 /* Reserve a memory variable for every regular variable */
1191 unsigned n = ctx->alloc;
1192 ctx->alloc *= 2;
1193
1194 BITSET_WORD *W = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(n));
1195 BITSET_WORD *S = ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(n));
1196
1197 agx_foreach_block(ctx, block) {
1198 memset(W, 0, BITSET_WORDS(n) * sizeof(BITSET_WORD));
1199 memset(S, 0, BITSET_WORDS(n) * sizeof(BITSET_WORD));
1200
1201 struct spill_ctx sctx = {
1202 .memctx = memctx,
1203 .shader = ctx,
1204 .n = n,
1205 .channels = channels,
1206 .size = sizes,
1207 .remat = remat,
1208 .next_uses = next_uses,
1209 .block = block,
1210 .blocks = blocks,
1211 .k = k,
1212 .W = W,
1213 .S = S,
1214 .spill_base = n,
1215 };
1216
1217 compute_w_entry(&sctx);
1218 compute_s_entry(&sctx);
1219 min_algorithm(&sctx);
1220 }
1221
1222 /* Now that all blocks are processed separately, stitch it together */
1223 agx_foreach_block(ctx, block) {
1224 struct spill_ctx sctx = {
1225 .memctx = memctx,
1226 .shader = ctx,
1227 .n = n,
1228 .channels = channels,
1229 .size = sizes,
1230 .remat = remat,
1231 .block = block,
1232 .blocks = blocks,
1233 .k = k,
1234 .W = W,
1235 .S = S,
1236 .spill_base = n,
1237 };
1238
1239 agx_foreach_predecessor(block, pred) {
1240 /* After spilling phi sources, insert coupling code */
1241 insert_coupling_code(&sctx, *pred, block);
1242 }
1243 }
1244
1245 ralloc_free(memctx);
1246
1247 /* Spilling breaks SSA, so we need to repair before validating */
1248 agx_repair_ssa(ctx);
1249 agx_validate(ctx, "Spilling");
1250
1251 /* Remat can introduce dead code */
1252 agx_dce(ctx, false);
1253 }
1254