/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *    Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

#include "compiler.h"
#include "nodearray.h"
#include "bi_builder.h"
#include "util/u_memory.h"

struct lcra_state {
   unsigned node_count;
   uint64_t *affinity;

   /* Linear constraints imposed. For each node there is a 'nodearray'
    * structure, which changes between a sparse and a dense array
    * depending on the number of elements.
    *
    * Each element is itself a bit field denoting whether a constraint on
    * the (c_j - c_i) bias is present or not, including negative biases.
    *
    * We support up to 8 components, so the bias is in the range [-7, 7],
    * encoded by a 16-bit field.
    */
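   /* Concretely (as used by lcra_add_node_interference and
    * lcra_test_linear below): bit (d + 7) of linear[i][j] is set when
    * placing the nodes with solutions satisfying c_i - c_j = d would make
    * their components overlap, i.e. that difference is forbidden.
    */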
   nodearray *linear;

   /* Before solving, forced registers; after solving, solutions. */
   unsigned *solutions;

   /** Node which caused register allocation to fail */
   unsigned spill_node;
};

/* This module is an implementation of "Linearly Constrained
 * Register Allocation". The paper is available in PDF form
 * (https://people.collabora.com/~alyssa/LCRA.pdf) as well as Markdown+LaTeX
 * (https://gitlab.freedesktop.org/alyssa/lcra/blob/master/LCRA.md)
 */

static struct lcra_state *
lcra_alloc_equations(unsigned node_count)
{
   struct lcra_state *l = calloc(1, sizeof(*l));

   l->node_count = node_count;

   l->linear = calloc(sizeof(l->linear[0]), node_count);
   l->solutions = calloc(sizeof(l->solutions[0]), node_count);
   l->affinity = calloc(sizeof(l->affinity[0]), node_count);

   memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count);

   return l;
}

static void
lcra_free(struct lcra_state *l)
{
   for (unsigned i = 0; i < l->node_count; ++i)
      nodearray_reset(&l->linear[i]);

   free(l->linear);
   free(l->affinity);
   free(l->solutions);
   free(l);
}

static void
lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i,
                           unsigned j, unsigned cmask_j)
{
   if (i == j)
      return;

   nodearray_value constraint_fw = 0;
   nodearray_value constraint_bw = 0;

   /* The constraint bits are reversed from lcra.c so that register
    * allocation can be done in parallel for every possible solution,
    * with lower-order bits representing smaller registers. */

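   /* For example, if cmask_i = 0x1 (component 0) and cmask_j = 0x3
    * (components 0 and 1), the loop below detects overlap for
    * c_i - c_j in {0, 1} (equivalently c_j - c_i in {-1, 0}) and sets the
    * corresponding bits in the backward and forward constraints.
    */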
   for (unsigned D = 0; D < 8; ++D) {
      if (cmask_i & (cmask_j << D)) {
         constraint_fw |= (1 << (7 + D));
         constraint_bw |= (1 << (7 - D));
      }

      if (cmask_i & (cmask_j >> D)) {
         constraint_bw |= (1 << (7 + D));
         constraint_fw |= (1 << (7 - D));
      }
   }

   /* Use dense arrays after adding 256 elements */
   nodearray_orr(&l->linear[j], i, constraint_fw, 256, l->node_count);
   nodearray_orr(&l->linear[i], j, constraint_bw, 256, l->node_count);
}

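/* Check whether the tentative solution for node i is compatible with the
 * linear constraints against every node that already has a solution or a
 * forced register; unsolved nodes (solutions[j] == ~0) are skipped.
 */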
static bool
lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i)
{
   signed constant = solutions[i];

   if (nodearray_is_sparse(&l->linear[i])) {
      nodearray_sparse_foreach(&l->linear[i], elem) {
         unsigned j = nodearray_sparse_key(elem);
         nodearray_value constraint = nodearray_sparse_value(elem);

         if (solutions[j] == ~0) continue;

         signed lhs = constant - solutions[j];

         if (lhs < -7 || lhs > 7)
            continue;

         if (constraint & (1 << (lhs + 7)))
            return false;
      }

      return true;
   }

   nodearray_value *row = l->linear[i].dense;

   for (unsigned j = 0; j < l->node_count; ++j) {
      if (solutions[j] == ~0) continue;

      signed lhs = constant - solutions[j];

      if (lhs < -7 || lhs > 7)
         continue;

      if (row[j] & (1 << (lhs + 7)))
         return false;
   }

   return true;
}

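/* Greedy solver: for each unsolved node, try the registers in its affinity
 * mask from lowest to highest and keep the first one that satisfies the
 * linear constraints. If none works, remember the failing node so a spill
 * candidate can be chosen.
 */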
static bool
lcra_solve(struct lcra_state *l)
{
   for (unsigned step = 0; step < l->node_count; ++step) {
      if (l->solutions[step] != ~0) continue;
      if (l->affinity[step] == 0) continue;

      bool succ = false;

      u_foreach_bit64(r, l->affinity[step]) {
         l->solutions[step] = r;

         if (lcra_test_linear(l, l->solutions, step)) {
            succ = true;
            break;
         }
      }

      /* Out of registers - prepare to spill */
      if (!succ) {
         l->spill_node = step;
         return false;
      }
   }

   return true;
}

/* Register spilling is implemented with a cost-benefit system. Costs are set
 * by the no-spill markings below; benefits are calculated from the
 * constraints. */

static unsigned
lcra_count_constraints(struct lcra_state *l, unsigned i)
{
   unsigned count = 0;
   nodearray *constraints = &l->linear[i];

   if (nodearray_is_sparse(constraints)) {
      nodearray_sparse_foreach(constraints, elem)
         count += util_bitcount(nodearray_sparse_value(elem));
   } else {
      nodearray_dense_foreach_64(constraints, elem)
         count += util_bitcount64(*elem);
   }

   return count;
}

/* Construct an affinity mask such that a vector with `count` elements does
 * not intersect any of the registers in the bitset `clobber`. In other words,
 * the allocated base register a needs to satisfy a + i != b for each
 * i < count and each b in the clobber set B. Equivalently a != b - i, so we
 * need a \notin { b - i : i < count } for every b \in B, i.e.
 * a \notin union_{b \in B} { b - i : i < count }, where that union is the
 * desired clobber set. That may be written equivalently as the union over
 * i < count of (B - i), where subtraction is defined elementwise and
 * corresponds to a shift of the entire bitset.
 *
 * EVEN_BITS_MASK is an affinity mask for aligned register pairs. Interpreted
 * as a bit set, it is { x : 0 <= x < 64 if x is even }
 */
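/* For example, with count = 2 and clobber = { r4 }, the clobbered set below
 * becomes { r3, r4 }: a two-register vector based at r3 would also occupy r4,
 * so r3 must be excluded from the affinity as well.
 */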

#define EVEN_BITS_MASK (0x5555555555555555ull)

static uint64_t
bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
{
   uint64_t clobbered = 0;

   for (unsigned i = 0; i < count; ++i)
      clobbered |= (clobber >> i);

   /* Don't allocate past the end of the register file */
   if (count > 1) {
      unsigned excess = count - 1;
      uint64_t mask = BITFIELD_MASK(excess);
      clobbered |= mask << (64 - excess);

      if (split_file)
         clobbered |= mask << (16 - excess);
   }

   /* Don't allocate the middle if we split out the middle */
   if (split_file)
      clobbered |= BITFIELD64_MASK(32) << 16;

   /* We can use a register iff it's not clobbered */
   return ~clobbered;
}


static void
bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live,
                     uint64_t preload_live, unsigned node_count, bool is_blend,
                     bool split_file, bool aligned_sr)
{
   bi_foreach_instr_in_block_rev(block, ins) {
      /* Mark all registers live after the instruction as
       * interfering with the destination */

      bi_foreach_dest(ins, d) {
         unsigned node = bi_get_node(ins->dest[d]);

         if (node >= node_count)
            continue;

         /* Don't allocate to anything that's read later as a
          * preloaded register. The affinity is the intersection
          * of affinity masks for each write. Since writes have
          * offsets, but the affinity is for the whole node, we
          * need to offset the affinity opposite the write
          * offset, so we shift right. */
         unsigned count = bi_count_write_registers(ins, d);
         unsigned offset = ins->dest[d].offset;
         uint64_t affinity =
            bi_make_affinity(preload_live, count, split_file) >> offset;
         /* Valhall needs >= 64-bit staging writes to be pair-aligned */
         if (aligned_sr && (count >= 2 || offset))
            affinity &= EVEN_BITS_MASK;

         l->affinity[node] &= affinity;

         for (unsigned i = 0; i < node_count; ++i) {
            uint8_t r = live[i];

            /* Nodes only interfere if they occupy
             * /different values/ at the same time
             * (Boissinot). In particular, sources of
             * moves do not interfere with their
             * destinations. This enables a limited form of
             * coalescing.
             */
            if (ins->op == BI_OPCODE_MOV_I32 &&
                i == bi_get_node(ins->src[0])) {
               r &= ~BITFIELD_BIT(ins->src[0].offset);
            }

            if (r) {
               lcra_add_node_interference(l, node, bi_writemask(ins, d), i, r);
            }
         }

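         /* Distinct destinations of the same instruction are written at
          * the same time, so they must also be assigned non-overlapping
          * registers; record that interference explicitly.
          */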
         unsigned node_first = bi_get_node(ins->dest[0]);
         if (d == 1 && node_first < node_count) {
            lcra_add_node_interference(l, node, bi_writemask(ins, 1),
                                       node_first, bi_writemask(ins, 0));
         }
      }

      /* Valhall needs >= 64-bit reads to be pair-aligned */
      if (aligned_sr) {
         bi_foreach_src(ins, s) {
            if (bi_count_read_registers(ins, s) >= 2) {
               unsigned node = bi_get_node(ins->src[s]);

               if (node < node_count)
                  l->affinity[node] &= EVEN_BITS_MASK;
            }
         }
      }

      if (!is_blend && ins->op == BI_OPCODE_BLEND) {
         /* Blend shaders might clobber r0-r15, r48. */
         uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48);

         for (unsigned i = 0; i < node_count; ++i) {
            if (live[i])
               l->affinity[i] &= ~clobber;
         }
      }

      /* Update live_in */
      preload_live = bi_postra_liveness_ins(preload_live, ins);
      bi_liveness_ins_update(live, ins, node_count);
   }

   block->reg_live_in = preload_live;
}

static void
bi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs)
{
   unsigned node_count = bi_max_temp(ctx);

   bi_compute_liveness(ctx);
   bi_postra_liveness(ctx);

   bi_foreach_block_rev(ctx, blk) {
      uint8_t *live = mem_dup(blk->live_out, node_count);

      bi_mark_interference(blk, l, live, blk->reg_live_out,
                           node_count, ctx->inputs->is_blend, !full_regs,
                           ctx->arch >= 9);

      free(live);
   }
}

static struct lcra_state *
bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
{
   unsigned node_count = bi_max_temp(ctx);
   struct lcra_state *l = lcra_alloc_equations(node_count);

   /* Blend shaders are restricted to R0-R15. Other shaders at full
    * occupancy can also access R48-R63. At half occupancy they can access
    * the whole file. */

   uint64_t default_affinity =
      ctx->inputs->is_blend ? BITFIELD64_MASK(16) :
      full_regs             ? BITFIELD64_MASK(64) :
                              (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));

   /* To test spilling, mimic a small register file */
   if ((bifrost_debug & BIFROST_DBG_SPILL) && !ctx->inputs->is_blend)
      default_affinity &= BITFIELD64_MASK(48) << 8;

   bi_foreach_instr_global(ctx, ins) {
      bi_foreach_dest(ins, d) {
         unsigned dest = bi_get_node(ins->dest[d]);

         if (dest < node_count)
            l->affinity[dest] = default_affinity;
      }

      /* Blend shaders expect the src colour to be in r0-r3 */
      if (ins->op == BI_OPCODE_BLEND && !ctx->inputs->is_blend) {
         unsigned node = bi_get_node(ins->src[0]);
         assert(node < node_count);
         l->solutions[node] = 0;

         /* Dual source blend input in r4-r7 */
         node = bi_get_node(ins->src[4]);
         if (node < node_count)
            l->solutions[node] = 4;

         /* Writes to R48 */
         node = bi_get_node(ins->dest[0]);
         if (!bi_is_null(ins->dest[0])) {
            assert(node < node_count);
            l->solutions[node] = 48;
         }
      }

      /* Coverage mask writes stay in R60 */
      if ((ins->op == BI_OPCODE_ATEST || ins->op == BI_OPCODE_ZS_EMIT) &&
          !bi_is_null(ins->dest[0])) {
         unsigned node = bi_get_node(ins->dest[0]);
         assert(node < node_count);
         l->solutions[node] = 60;
      }

      /* Experimentally, it seems coverage mask inputs to ATEST must
       * be in R60. Otherwise coverage mask writes do not work with
       * early-ZS with pixel-frequency-shading (this combination of
       * settings is legal if depth/stencil writes are disabled).
       */
      if (ins->op == BI_OPCODE_ATEST) {
         unsigned node = bi_get_node(ins->src[0]);
         assert(node < node_count);
         l->solutions[node] = 60;
      }
   }

   bi_compute_interference(ctx, l, full_regs);

   /* Coalesce register moves if we're allowed. We need to be careful due
    * to the restricted affinity induced by the blend shader ABI.
    */
   bi_foreach_instr_global(ctx, I) {
      if (I->op != BI_OPCODE_MOV_I32) continue;
      if (I->src[0].type != BI_INDEX_REGISTER) continue;

      unsigned reg = I->src[0].value;
      unsigned node = bi_get_node(I->dest[0]);
      assert(node < node_count);

      if (l->solutions[node] != ~0) continue;

      uint64_t affinity = l->affinity[node];

      if (ctx->inputs->is_blend) {
         /* We're allowed to coalesce the moves to these */
         affinity |= BITFIELD64_BIT(48);
         affinity |= BITFIELD64_BIT(60);
      }

      /* Try to coalesce */
      if (affinity & BITFIELD64_BIT(reg)) {
         l->solutions[node] = reg;

         if (!lcra_test_linear(l, l->solutions, node))
            l->solutions[node] = ~0;
      }
   }

   *success = lcra_solve(l);

   return l;
}

static bi_index
bi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index)
{
   /* Offsets can only be applied when we register allocated an index, or
    * alternatively for FAU's encoding */

   ASSERTED bool is_offset = (index.offset > 0) &&
                             (index.type != BI_INDEX_FAU);
   unsigned node_count = bi_max_temp(ctx);

   /* Did we run RA for this index at all */
   if (bi_get_node(index) >= node_count) {
      assert(!is_offset);
      return index;
   }

   /* LCRA didn't bother solving this index (how lazy!) */
   signed solution = l->solutions[bi_get_node(index)];
   if (solution < 0) {
      assert(!is_offset);
      return index;
   }

   /* todo: do we want to compose with the subword swizzle? */
   bi_index new_index = bi_register(solution + index.offset);
   new_index.swizzle = index.swizzle;
   new_index.abs = index.abs;
   new_index.neg = index.neg;
   return new_index;
}

/* Dual texture instructions write to two sets of staging registers, modeled as
 * two destinations in the IR. The first set is communicated with the usual
 * staging register mechanism. The second set is encoded in the texture
 * operation descriptor. This is quite unusual, and requires the following late
 * fixup.
 */
static void
bi_fixup_dual_tex_register(bi_instr *I)
{
   assert(I->dest[1].type == BI_INDEX_REGISTER);
   assert(I->src[3].type == BI_INDEX_CONSTANT);

   struct bifrost_dual_texture_operation desc = {
      .secondary_register = I->dest[1].value
   };

   I->src[3].value |= bi_dual_tex_as_u32(desc);
}

static void
bi_install_registers(bi_context *ctx, struct lcra_state *l)
{
   bi_foreach_instr_global(ctx, ins) {
      bi_foreach_dest(ins, d)
         ins->dest[d] = bi_reg_from_index(ctx, l, ins->dest[d]);

      bi_foreach_src(ins, s)
         ins->src[s] = bi_reg_from_index(ctx, l, ins->src[s]);

      if (ins->op == BI_OPCODE_TEXC && !bi_is_null(ins->dest[1]))
         bi_fixup_dual_tex_register(ins);
   }
}

static void
bi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new)
{
   bi_foreach_src(ins, i) {
      if (bi_is_equiv(ins->src[i], old)) {
         ins->src[i].type = new.type;
         ins->src[i].reg = new.reg;
         ins->src[i].value = new.value;
      }
   }
}

/* If register allocation fails, find the best spill node */

static signed
bi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
{
   /* Pick a node satisfying bi_spill_register's preconditions */
   BITSET_WORD *no_spill =
      calloc(sizeof(BITSET_WORD), BITSET_WORDS(l->node_count));

   bi_foreach_instr_global(ctx, ins) {
      bi_foreach_dest(ins, d) {
         unsigned node = bi_get_node(ins->dest[d]);

         if (node >= l->node_count)
            continue;

         /* Don't allow spilling coverage mask writes because the
          * register preload logic assumes it will stay in R60.
          * This could be optimized.
          */
         if (ins->no_spill ||
             ins->op == BI_OPCODE_ATEST ||
             ins->op == BI_OPCODE_ZS_EMIT ||
             (ins->op == BI_OPCODE_MOV_I32 &&
              ins->src[0].type == BI_INDEX_REGISTER &&
              ins->src[0].value == 60)) {
            BITSET_SET(no_spill, node);
         }
      }
   }

   unsigned best_benefit = 0;
   signed best_node = -1;

   if (nodearray_is_sparse(&l->linear[l->spill_node])) {
      nodearray_sparse_foreach(&l->linear[l->spill_node], elem) {
         unsigned i = nodearray_sparse_key(elem);
         unsigned constraint = nodearray_sparse_value(elem);

         /* Only spill nodes that interfere with the node failing
          * register allocation. It's pointless to spill anything else */
         if (!constraint) continue;

         if (BITSET_TEST(no_spill, i)) continue;

         unsigned benefit = lcra_count_constraints(l, i);

         if (benefit > best_benefit) {
            best_benefit = benefit;
            best_node = i;
         }
      }
   } else {
      nodearray_value *row = l->linear[l->spill_node].dense;

      for (unsigned i = 0; i < l->node_count; ++i) {
         /* Only spill nodes that interfere with the node failing
          * register allocation. It's pointless to spill anything else */
         if (!row[i]) continue;

         if (BITSET_TEST(no_spill, i)) continue;

         unsigned benefit = lcra_count_constraints(l, i);

         if (benefit > best_benefit) {
            best_benefit = benefit;
            best_node = i;
         }
      }
   }

   free(no_spill);
   return best_node;
}

static unsigned
bi_count_read_index(bi_instr *I, bi_index index)
{
   unsigned max = 0;

   bi_foreach_src(I, s) {
      if (bi_is_equiv(I->src[s], index)) {
         unsigned count = bi_count_read_registers(I, s);
         max = MAX2(max, count + I->src[s].offset);
      }
   }

   return max;
}

/*
 * Wrappers to emit loads/stores to thread-local storage in an appropriate way
 * for the target, so the spill/fill code becomes architecture-independent.
 */

static bi_index
bi_tls_ptr(bool hi)
{
   return bi_fau(BIR_FAU_TLS_PTR, hi);
}

static bi_instr *
bi_load_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset)
{
   if (b->shader->arch >= 9) {
      return bi_load_to(b, bits, src, bi_tls_ptr(false),
                        bi_tls_ptr(true), BI_SEG_TL, offset);
   } else {
      return bi_load_to(b, bits, src, bi_imm_u32(offset), bi_zero(),
                        BI_SEG_TL, 0);
   }
}

static void
bi_store_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset)
{
   if (b->shader->arch >= 9) {
      bi_store(b, bits, src, bi_tls_ptr(false), bi_tls_ptr(true), BI_SEG_TL,
               offset);
   } else {
      bi_store(b, bits, src, bi_imm_u32(offset), bi_zero(), BI_SEG_TL, 0);
   }
}

/* Once we've chosen a spill node, spill it and return the number of bytes
 * spilled */

static unsigned
bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)
{
   bi_builder b = { .shader = ctx };
   unsigned channels = 0;

   /* Spill after every store, fill before every load */
   bi_foreach_instr_global_safe(ctx, I) {
      bi_foreach_dest(I, d) {
         if (!bi_is_equiv(I->dest[d], index)) continue;

         unsigned extra = I->dest[d].offset;
         bi_index tmp = bi_temp(ctx);

         I->dest[d] = bi_replace_index(I->dest[d], tmp);
         I->no_spill = true;

         unsigned count = bi_count_write_registers(I, d);
         unsigned bits = count * 32;

         b.cursor = bi_after_instr(I);
         bi_store_tl(&b, bits, tmp, offset + 4 * extra);

         ctx->spills++;
         channels = MAX2(channels, extra + count);
      }

      if (bi_has_arg(I, index)) {
         b.cursor = bi_before_instr(I);
         bi_index tmp = bi_temp(ctx);

         unsigned bits = bi_count_read_index(I, index) * 32;
         bi_rewrite_index_src_single(I, index, tmp);

         bi_instr *ld = bi_load_tl(&b, bits, tmp, offset);
         ld->no_spill = true;
         ctx->fills++;
      }
   }

   return (channels * 4);
}

/*
 * For transition, lower collects and splits before RA, rather than after RA.
 * LCRA knows how to deal with offsets (broken SSA), but not how to coalesce
 * these vector moves.
 */
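/* For example, SPLIT.i32 becomes one MOV per destination reading successive
 * offsets of the source, and COLLECT.i32 becomes one MOV per source writing
 * successive offsets of the destination; RA later folds those offsets into
 * the assigned register numbers.
 */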
static void
bi_lower_vector(bi_context *ctx)
{
   bi_index *remap = calloc(ctx->ssa_alloc, sizeof(bi_index));

   bi_foreach_instr_global_safe(ctx, I) {
      bi_builder b = bi_init_builder(ctx, bi_after_instr(I));

      if (I->op == BI_OPCODE_SPLIT_I32) {
         bi_index src = I->src[0];
         assert(src.offset == 0);

         for (unsigned i = 0; i < I->nr_dests; ++i) {
            if (bi_is_null(I->dest[i]))
               continue;

            src.offset = i;
            bi_mov_i32_to(&b, I->dest[i], src);

            if (bi_is_ssa(I->dest[i]))
               remap[I->dest[i].value] = src;
         }

         bi_remove_instruction(I);
      } else if (I->op == BI_OPCODE_COLLECT_I32) {
         bi_index dest = I->dest[0];
         assert(dest.offset == 0);
         assert((bi_is_ssa(dest) || I->nr_srcs == 1) &&
                "nir_lower_phis_to_scalar");

         for (unsigned i = 0; i < I->nr_srcs; ++i) {
            if (bi_is_null(I->src[i]))
               continue;

            dest.offset = i;
            bi_mov_i32_to(&b, dest, I->src[i]);
         }

         bi_remove_instruction(I);
      }
   }

   bi_foreach_instr_global(ctx, I) {
      bi_foreach_src(I, s) {
         if (bi_is_ssa(I->src[s]) && !bi_is_null(remap[I->src[s].value]))
            I->src[s] = bi_replace_index(I->src[s], remap[I->src[s].value]);
      }
   }

   free(remap);

   /* After generating a pile of moves, clean up */
   bi_opt_dead_code_eliminate(ctx);
}

/*
 * Check if the instruction requires a "tied" operand. Such instructions MUST
 * allocate their source and destination to the same register. This is a
 * constraint on RA, and may require extra moves.
 *
 * In particular, this is the case for Bifrost instructions that both read and
 * write with the staging register mechanism.
 */
static bool
bi_is_tied(const bi_instr *I)
{
   if (bi_is_null(I->src[0]))
      return false;

   return (I->op == BI_OPCODE_TEXC ||
           I->op == BI_OPCODE_ATOM_RETURN_I32 ||
           I->op == BI_OPCODE_AXCHG_I32 ||
           I->op == BI_OPCODE_ACMPXCHG_I32);
}

/*
 * For transition, coalesce tied operands together, as LCRA knows how to handle
 * non-SSA operands but doesn't know about tied operands.
 *
 * This breaks the SSA form of the program, but that doesn't matter for LCRA.
 */
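/* For example, a tied TEXC whose staging source is x and staging destination
 * is y gets per-component moves y[i] = x[i] emitted before it, after which
 * the instruction reads y instead of x, so source and destination share a
 * node and therefore a register.
 */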
static void
bi_coalesce_tied(bi_context *ctx)
{
   bi_foreach_instr_global(ctx, I) {
      if (!bi_is_tied(I)) continue;

      bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
      unsigned n = bi_count_read_registers(I, 0);

      for (unsigned i = 0; i < n; ++i) {
         bi_index dst = I->dest[0], src = I->src[0];

         assert(dst.offset == 0 && src.offset == 0);
         dst.offset = src.offset = i;

         bi_mov_i32_to(&b, dst, src);
      }

      I->src[0] = bi_replace_index(I->src[0], I->dest[0]);
   }
}

static unsigned
find_or_allocate_temp(unsigned *map, unsigned value, unsigned *alloc)
{
   if (!map[value])
      map[value] = ++(*alloc);

   assert(map[value]);
   return map[value] - 1;
}

/* Reassigns numbering to get rid of gaps in the indices and to prioritize
 * smaller register classes */

static void
squeeze_index(bi_context *ctx)
{
   unsigned *map = rzalloc_array(ctx, unsigned, ctx->ssa_alloc);
   ctx->ssa_alloc = 0;

   bi_foreach_instr_global(ctx, I) {
      bi_foreach_dest(I, d) {
         if (I->dest[d].type == BI_INDEX_NORMAL)
            I->dest[d].value =
               find_or_allocate_temp(map, I->dest[d].value, &ctx->ssa_alloc);
      }

      bi_foreach_src(I, s) {
         if (I->src[s].type == BI_INDEX_NORMAL)
            I->src[s].value =
               find_or_allocate_temp(map, I->src[s].value, &ctx->ssa_alloc);
      }
   }

   ralloc_free(map);
}

void
bi_register_allocate(bi_context *ctx)
{
   struct lcra_state *l = NULL;
   bool success = false;

   unsigned iter_count = 1000; /* max iterations */

   /* Number of bytes of memory we've spilled into */
   unsigned spill_count = ctx->info.tls_size;

   if (ctx->arch >= 9)
      va_lower_split_64bit(ctx);

   bi_lower_vector(ctx);

   /* Lower tied operands. SSA is broken from here on. */
   bi_coalesce_tied(ctx);
   squeeze_index(ctx);

   /* Try with reduced register pressure to improve thread count */
   if (ctx->arch >= 7) {
      l = bi_allocate_registers(ctx, &success, false);

      if (success) {
         ctx->info.work_reg_count = 32;
      } else {
         lcra_free(l);
         l = NULL;
      }
   }

   /* Otherwise, use the full register file and spill until we succeed */
   while (!success && ((iter_count--) > 0)) {
      l = bi_allocate_registers(ctx, &success, true);

      if (success) {
         ctx->info.work_reg_count = 64;
      } else {
         signed spill_node = bi_choose_spill_node(ctx, l);
         lcra_free(l);
         l = NULL;

         if (spill_node == -1)
            unreachable("Failed to choose spill node\n");

         if (ctx->inputs->is_blend)
            unreachable("Blend shaders may not spill");

         /* By default, we use packed TLS addressing on Valhall.
          * We cannot cross 16 byte boundaries with packed TLS
          * addressing. Align to ensure this doesn't happen. This
          * could be optimized a bit.
          */
         if (ctx->arch >= 9)
            spill_count = ALIGN_POT(spill_count, 16);

         spill_count += bi_spill_register(
            ctx, bi_node_to_index(spill_node, bi_max_temp(ctx)), spill_count);

         /* In case the spill affected an instruction with tied
          * operands, we need to fix up.
          */
         bi_coalesce_tied(ctx);
      }
   }

   assert(success);
   assert(l != NULL);

   ctx->info.tls_size = spill_count;
   bi_install_registers(ctx, l);

   lcra_free(l);
}