1 /*
2  * Copyright (C) 2020 Collabora Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors (Collabora):
24  *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
25  */
26 
27 #include "util/u_memory.h"
28 #include "bi_builder.h"
29 #include "compiler.h"
30 #include "nodearray.h"
31 
32 struct lcra_state {
33    unsigned node_count;
34    uint64_t *affinity;
35 
36    /* Linear constraints imposed. For each node there is a
37     * 'nodearray' structure, which changes between a sparse and dense
38     * array depending on the number of elements.
39     *
40     * Each element is itself a bit field denoting whether (c_j - c_i) bias
41     * is present or not, including negative biases.
42     *
43     * We support up to 8 components, so the bias is in the range
44     * [-7, 7], encoded by a 16-bit field.
45     */
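   /* Concretely, a set bit at position (d + 7) in node i's row entry for
    * node j forbids any assignment with solutions[i] - solutions[j] == d,
    * for d in [-7, 7]; see lcra_test_linear(). */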
46    nodearray *linear;
47 
48    /* Before solving, forced registers; after solving, solutions. */
49    unsigned *solutions;
50 
51    /** Node which caused register allocation to fail */
52    unsigned spill_node;
53 };
54 
55 /* This module is an implementation of "Linearly Constrained
56  * Register Allocation". The paper is available in PDF form
57  * (https://people.collabora.com/~alyssa/LCRA.pdf) as well as Markdown+LaTeX
58  * (https://gitlab.freedesktop.org/alyssa/lcra/blob/master/LCRA.md)
59  */
60 
61 static struct lcra_state *
62 lcra_alloc_equations(unsigned node_count)
63 {
64    struct lcra_state *l = calloc(1, sizeof(*l));
65 
66    l->node_count = node_count;
67 
68    l->linear = calloc(sizeof(l->linear[0]), node_count);
69    l->solutions = calloc(sizeof(l->solutions[0]), node_count);
70    l->affinity = calloc(sizeof(l->affinity[0]), node_count);
71 
72    memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count);
73 
74    return l;
75 }
76 
77 static void
78 lcra_free(struct lcra_state *l)
79 {
80    for (unsigned i = 0; i < l->node_count; ++i)
81       nodearray_reset(&l->linear[i]);
82 
83    free(l->linear);
84    free(l->affinity);
85    free(l->solutions);
86    free(l);
87 }
88 
89 static void
90 lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i,
91                            unsigned j, unsigned cmask_j)
92 {
93    if (i == j)
94       return;
95 
96    nodearray_value constraint_fw = 0;
97    nodearray_value constraint_bw = 0;
98 
99    /* The constraint bits are reversed from lcra.c so that register
100     * allocation can be done in parallel for every possible solution,
101     * with lower-order bits representing smaller registers. */
102 
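   /* For example, with cmask_i = 0x3 (a register pair) and cmask_j = 0x1 (a
    * single register), the masks overlap for D = 0 and D = 1, so the loop
    * below forbids c_i == c_j and c_i == c_j - 1, exactly the placements at
    * which the two nodes would share a physical register. */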
103    for (unsigned D = 0; D < 8; ++D) {
104       if (cmask_i & (cmask_j << D)) {
105          constraint_fw |= (1 << (7 + D));
106          constraint_bw |= (1 << (7 - D));
107       }
108 
109       if (cmask_i & (cmask_j >> D)) {
110          constraint_bw |= (1 << (7 + D));
111          constraint_fw |= (1 << (7 - D));
112       }
113    }
114 
115    /* Use dense arrays after adding 256 elements */
116    nodearray_orr(&l->linear[j], i, constraint_fw, 256, l->node_count);
117    nodearray_orr(&l->linear[i], j, constraint_bw, 256, l->node_count);
118 }
119 
120 static bool
121 lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i)
122 {
123    signed constant = solutions[i];
124 
125    if (nodearray_is_sparse(&l->linear[i])) {
126       nodearray_sparse_foreach(&l->linear[i], elem) {
127          unsigned j = nodearray_sparse_key(elem);
128          nodearray_value constraint = nodearray_sparse_value(elem);
129 
130          if (solutions[j] == ~0)
131             continue;
132 
133          signed lhs = constant - solutions[j];
134 
135          if (lhs < -7 || lhs > 7)
136             continue;
137 
138          if (constraint & (1 << (lhs + 7)))
139             return false;
140       }
141 
142       return true;
143    }
144 
145    nodearray_value *row = l->linear[i].dense;
146 
147    for (unsigned j = 0; j < l->node_count; ++j) {
148       if (solutions[j] == ~0)
149          continue;
150 
151       signed lhs = constant - solutions[j];
152 
153       if (lhs < -7 || lhs > 7)
154          continue;
155 
156       if (row[j] & (1 << (lhs + 7)))
157          return false;
158    }
159 
160    return true;
161 }
162 
163 static bool
164 lcra_solve(struct lcra_state *l)
165 {
166    for (unsigned step = 0; step < l->node_count; ++step) {
167       if (l->solutions[step] != ~0)
168          continue;
169       if (l->affinity[step] == 0)
170          continue;
171 
172       bool succ = false;
173 
174       u_foreach_bit64(r, l->affinity[step]) {
175          l->solutions[step] = r;
176 
177          if (lcra_test_linear(l, l->solutions, step)) {
178             succ = true;
179             break;
180          }
181       }
182 
183       /* Out of registers - prepare to spill */
184       if (!succ) {
185          l->spill_node = step;
186          return false;
187       }
188    }
189 
190    return true;
191 }
192 
193 /* Register spilling is implemented with a cost-benefit system. Costs are set
194  * by the user. Benefits are calculated from the constraints. */
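/* Here the benefit of spilling node i is the number of constraint bits in its
 * row, as counted by lcra_count_constraints() below; bi_choose_spill_node()
 * picks the interfering node with the largest such count. */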
195 
196 static unsigned
197 lcra_count_constraints(struct lcra_state *l, unsigned i)
198 {
199    unsigned count = 0;
200    nodearray *constraints = &l->linear[i];
201 
202    if (nodearray_is_sparse(constraints)) {
203       nodearray_sparse_foreach(constraints, elem)
204          count += util_bitcount(nodearray_sparse_value(elem));
205    } else {
206       nodearray_dense_foreach_64(constraints, elem)
207          count += util_bitcount64(*elem);
208    }
209 
210    return count;
211 }
212 
213 /* Liveness analysis is a backwards-may dataflow analysis pass. Within a block,
214  * we compute live_in from live_out. The intrablock pass is linear-time. It
215  * returns whether progress was made. */
216 
217 static void
218 bi_liveness_ins_update_ra(uint8_t *live, bi_instr *ins)
219 {
220    /* live_in[s] = GEN[s] + (live_out[s] - KILL[s]) */
221 
222    bi_foreach_dest(ins, d) {
223       live[ins->dest[d].value] &= ~bi_writemask(ins, d);
224    }
225 
226    bi_foreach_ssa_src(ins, src) {
227       unsigned count = bi_count_read_registers(ins, src);
228       unsigned rmask = BITFIELD_MASK(count);
229 
230       live[ins->src[src].value] |= (rmask << ins->src[src].offset);
231    }
232 }
233 
234 static bool
235 liveness_block_update(bi_block *blk, unsigned temp_count)
236 {
237    bool progress = false;
238 
239    /* live_out[s] = sum { p in succ[s] } ( live_in[p] ) */
240    bi_foreach_successor(blk, succ) {
241       for (unsigned i = 0; i < temp_count; ++i)
242          blk->live_out[i] |= succ->live_in[i];
243    }
244 
245    uint8_t *live = ralloc_array(blk, uint8_t, temp_count);
246    memcpy(live, blk->live_out, temp_count);
247 
248    bi_foreach_instr_in_block_rev(blk, ins)
249       bi_liveness_ins_update_ra(live, ins);
250 
251    /* To figure out progress, diff live_in */
252 
253    for (unsigned i = 0; (i < temp_count) && !progress; ++i)
254       progress |= (blk->live_in[i] != live[i]);
255 
256    ralloc_free(blk->live_in);
257    blk->live_in = live;
258 
259    return progress;
260 }
261 
262 /* Globally, liveness analysis uses a fixed-point algorithm based on a
263  * worklist. We initialize a work list with the exit block. We iterate the work
264  * list to compute live_in from live_out for each block on the work list,
265  * adding the predecessors of the block to the work list if we made progress.
266  */
267 
268 static void
269 bi_compute_liveness_ra(bi_context *ctx)
270 {
271    u_worklist worklist;
272    bi_worklist_init(ctx, &worklist);
273 
274    bi_foreach_block(ctx, block) {
275       if (block->live_in)
276          ralloc_free(block->live_in);
277 
278       if (block->live_out)
279          ralloc_free(block->live_out);
280 
281       block->live_in = rzalloc_array(block, uint8_t, ctx->ssa_alloc);
282       block->live_out = rzalloc_array(block, uint8_t, ctx->ssa_alloc);
283 
284       bi_worklist_push_tail(&worklist, block);
285    }
286 
287    while (!u_worklist_is_empty(&worklist)) {
288       /* Pop off in reverse order since liveness is backwards */
289       bi_block *blk = bi_worklist_pop_tail(&worklist);
290 
291       /* Update liveness information. If we made progress, we need to
292        * reprocess the predecessors
293        */
294       if (liveness_block_update(blk, ctx->ssa_alloc)) {
295          bi_foreach_predecessor(blk, pred)
296             bi_worklist_push_head(&worklist, *pred);
297       }
298    }
299 
300    u_worklist_fini(&worklist);
301 }
302 
303 /* Construct an affinity mask such that the vector with `count` elements does
304  * not intersect any of the registers in the bitset `clobber`. In other words,
305  * the allocated base register a needs to satisfy a + i != b for each
306  * i < count and each b in `clobber`. Equivalently a != b - i, so a must
307  * avoid { b - i : i < count }. For the entire clobber set B, the set to
308  * avoid is the union over b in B of { b - i : i < count }, written
309  * equivalently as the union over i < count of (B - i), where subtraction
310  * is defined elementwise and corresponds to a shift of the entire bitset.
311  *
312  * EVEN_BITS_MASK is an affinity mask for aligned register pairs. Interpreted
313  * as a bit set, it is { x : 0 <= x < 64 if x is even }
314  */
315 
316 #define EVEN_BITS_MASK (0x5555555555555555ull)
317 
318 static uint64_t
319 bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
320 {
321    uint64_t clobbered = 0;
322 
323    for (unsigned i = 0; i < count; ++i)
324       clobbered |= (clobber >> i);
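   /* For example, clobber = {r4} with count = 2 yields clobbered = {r3, r4},
    * since a pair based at r3 or r4 would overlap r4. */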
325 
326    /* Don't allocate past the end of the register file */
327    if (count > 1) {
328       unsigned excess = count - 1;
329       uint64_t mask = BITFIELD_MASK(excess);
330       clobbered |= mask << (64 - excess);
331 
332       if (split_file)
333          clobbered |= mask << (16 - excess);
334    }
335 
336    /* Don't allocate the middle if we split out the middle */
337    if (split_file)
338       clobbered |= BITFIELD64_MASK(32) << 16;
339 
340    /* We can use a register iff it's not clobbered */
341    return ~clobbered;
342 }
343 
344 static void
345 bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live,
346                      uint64_t preload_live, unsigned node_count, bool is_blend,
347                      bool split_file, bool aligned_sr)
348 {
349    bi_foreach_instr_in_block_rev(block, ins) {
350       /* Mark all registers live after the instruction as
351        * interfering with the destination */
352 
353       bi_foreach_dest(ins, d) {
354          unsigned node = ins->dest[d].value;
355 
356          /* Don't allocate to anything that's read later as a
357           * preloaded register. The affinity is the intersection
358           * of affinity masks for each write. Since writes have
359           * offsets, but the affinity is for the whole node, we
360           * need to offset the affinity opposite the write
361           * offset, so we shift right. */
362          unsigned count = bi_count_write_registers(ins, d);
363          unsigned offset = ins->dest[d].offset;
364          uint64_t affinity =
365             bi_make_affinity(preload_live, count, split_file) >> offset;
366          /* Valhall needs >= 64-bit staging writes to be pair-aligned */
367          if (aligned_sr && (count >= 2 || offset))
368             affinity &= EVEN_BITS_MASK;
369 
370          l->affinity[node] &= affinity;
371 
372          for (unsigned i = 0; i < node_count; ++i) {
373             uint8_t r = live[i];
374 
375             /* Nodes only interfere if they occupy
376              * /different values/ at the same time
377              * (Boissinot). In particular, sources of
378              * moves do not interfere with their
379              * destinations. This enables a limited form of
380              * coalescing.
381              */
382             if (ins->op == BI_OPCODE_MOV_I32 && bi_is_ssa(ins->src[0]) &&
383                 i == ins->src[0].value) {
384 
385                r &= ~BITFIELD_BIT(ins->src[0].offset);
386             }
387 
388             if (r) {
389                lcra_add_node_interference(l, node, bi_writemask(ins, d), i, r);
390             }
391          }
392 
393          unsigned node_first = ins->dest[0].value;
394          if (d == 1) {
395             lcra_add_node_interference(l, node, bi_writemask(ins, 1),
396                                        node_first, bi_writemask(ins, 0));
397          }
398       }
399 
400       /* Valhall needs >= 64-bit reads to be pair-aligned */
401       if (aligned_sr) {
402          bi_foreach_ssa_src(ins, s) {
403             if (bi_count_read_registers(ins, s) >= 2)
404                l->affinity[ins->src[s].value] &= EVEN_BITS_MASK;
405          }
406       }
407 
408       if (!is_blend && ins->op == BI_OPCODE_BLEND) {
409          /* Blend shaders might clobber r0-r15, r48. */
410          uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48);
411 
412          for (unsigned i = 0; i < node_count; ++i) {
413             if (live[i])
414                l->affinity[i] &= ~clobber;
415          }
416       }
417 
418       /* Update live_in */
419       preload_live = bi_postra_liveness_ins(preload_live, ins);
420       bi_liveness_ins_update_ra(live, ins);
421    }
422 
423    block->reg_live_in = preload_live;
424 }
425 
426 static void
427 bi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs)
428 {
429    bi_compute_liveness_ra(ctx);
430    bi_postra_liveness(ctx);
431 
432    bi_foreach_block_rev(ctx, blk) {
433       uint8_t *live = mem_dup(blk->live_out, ctx->ssa_alloc);
434 
435       bi_mark_interference(blk, l, live, blk->reg_live_out, ctx->ssa_alloc,
436                            ctx->inputs->is_blend, !full_regs, ctx->arch >= 9);
437 
438       free(live);
439    }
440 }
441 
442 static struct lcra_state *
443 bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
444 {
445    struct lcra_state *l = lcra_alloc_equations(ctx->ssa_alloc);
446 
447    /* Blend shaders are restricted to R0-R15. Other shaders at full
448     * occupancy also can access R48-R63. At half occupancy they can access
449     * the whole file. */
450 
451    uint64_t default_affinity =
452       ctx->inputs->is_blend ? BITFIELD64_MASK(16)
453       : full_regs           ? BITFIELD64_MASK(64)
454                   : (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));
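   /* i.e. r0-r15, the whole file r0-r63, or r0-r15 plus r48-r63, respectively */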
455 
456    /* To test spilling, mimic a small register file */
457    if (bifrost_debug & BIFROST_DBG_SPILL && !ctx->inputs->is_blend)
458       default_affinity &= BITFIELD64_MASK(48) << 8;
459 
460    bi_foreach_instr_global(ctx, ins) {
461       bi_foreach_dest(ins, d)
462          l->affinity[ins->dest[d].value] = default_affinity;
463 
464       /* Blend shaders expect the src colour to be in r0-r3 */
465       if (ins->op == BI_OPCODE_BLEND && !ctx->inputs->is_blend) {
466          assert(bi_is_ssa(ins->src[0]));
467          l->solutions[ins->src[0].value] = 0;
468 
469          /* Dual source blend input in r4-r7 */
470          if (bi_is_ssa(ins->src[4]))
471             l->solutions[ins->src[4].value] = 4;
472 
473          /* Writes to R48 */
474          if (!bi_is_null(ins->dest[0]))
475             l->solutions[ins->dest[0].value] = 48;
476       }
477 
478       /* Coverage mask writes stay in R60 */
479       if ((ins->op == BI_OPCODE_ATEST || ins->op == BI_OPCODE_ZS_EMIT) &&
480           !bi_is_null(ins->dest[0])) {
481          l->solutions[ins->dest[0].value] = 60;
482       }
483 
484       /* Experimentally, it seems coverage masks inputs to ATEST must
485        * be in R60. Otherwise coverage mask writes do not work with
486        * early-ZS with pixel-frequency-shading (this combination of
487        * settings is legal if depth/stencil writes are disabled).
488        * Allowing a FAU index also seems to work on Valhall, at least.
489        */
490       if (ins->op == BI_OPCODE_ATEST) {
491          assert(bi_is_ssa(ins->src[0]) || ins->src[0].type == BI_INDEX_FAU);
492          if (bi_is_ssa(ins->src[0]))
493             l->solutions[ins->src[0].value] = 60;
494       }
495    }
496 
497    bi_compute_interference(ctx, l, full_regs);
498 
499    /* Coalesce register moves if we're allowed. We need to be careful due
500     * to the restricted affinity induced by the blend shader ABI.
501     */
502    bi_foreach_instr_global(ctx, I) {
503       if (I->op != BI_OPCODE_MOV_I32)
504          continue;
505       if (I->src[0].type != BI_INDEX_REGISTER)
506          continue;
507 
508       unsigned reg = I->src[0].value;
509       unsigned node = I->dest[0].value;
510 
511       if (l->solutions[node] != ~0)
512          continue;
513 
514       uint64_t affinity = l->affinity[node];
515 
516       if (ctx->inputs->is_blend) {
517          /* We're allowed to coalesce the moves to these */
518          affinity |= BITFIELD64_BIT(48);
519          affinity |= BITFIELD64_BIT(60);
520       }
521 
522       /* Try to coalesce */
523       if (affinity & BITFIELD64_BIT(reg)) {
524          l->solutions[node] = reg;
525 
526          if (!lcra_test_linear(l, l->solutions, node))
527             l->solutions[node] = ~0;
528       }
529    }
530 
531    *success = lcra_solve(l);
532 
533    return l;
534 }
535 
536 static bi_index
537 bi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index)
538 {
539    /* Offsets can only be applied when we register allocated an index, or
540     * alternatively for FAU's encoding */
541 
542    ASSERTED bool is_offset = (index.offset > 0) && (index.type != BI_INDEX_FAU);
543 
544    /* Did we run RA for this index at all */
545    if (!bi_is_ssa(index)) {
546       assert(!is_offset);
547       return index;
548    }
549 
550    /* LCRA didn't bother solving this index (how lazy!) */
551    signed solution = l->solutions[index.value];
552    if (solution < 0) {
553       assert(!is_offset);
554       return index;
555    }
556 
557    /* todo: do we want to compose with the subword swizzle? */
558    bi_index new_index = bi_register(solution + index.offset);
559    new_index.swizzle = index.swizzle;
560    new_index.abs = index.abs;
561    new_index.neg = index.neg;
562    return new_index;
563 }
564 
565 /* Dual texture instructions write to two sets of staging registers, modeled as
566  * two destinations in the IR. The first set is communicated with the usual
567  * staging register mechanism. The second set is encoded in the texture
568  * operation descriptor. This is quite unusual, and requires the following late
569  * fixup.
570  */
571 static void
572 bi_fixup_dual_tex_register(bi_instr *I)
573 {
574    assert(I->dest[1].type == BI_INDEX_REGISTER);
575    assert(I->src[3].type == BI_INDEX_CONSTANT);
576 
577    struct bifrost_dual_texture_operation desc = {
578       .secondary_register = I->dest[1].value,
579    };
580 
581    I->src[3].value |= bi_dual_tex_as_u32(desc);
582 }
583 
584 static void
585 bi_install_registers(bi_context *ctx, struct lcra_state *l)
586 {
587    bi_foreach_instr_global(ctx, ins) {
588       bi_foreach_dest(ins, d)
589          ins->dest[d] = bi_reg_from_index(ctx, l, ins->dest[d]);
590 
591       bi_foreach_src(ins, s)
592          ins->src[s] = bi_reg_from_index(ctx, l, ins->src[s]);
593 
594       if (ins->op == BI_OPCODE_TEXC_DUAL)
595          bi_fixup_dual_tex_register(ins);
596    }
597 }
598 
599 static void
600 bi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new)
601 {
602    bi_foreach_src(ins, i) {
603       if (bi_is_equiv(ins->src[i], old)) {
604          ins->src[i].type = new.type;
605          ins->src[i].value = new.value;
606       }
607    }
608 }
609 
610 /* If register allocation fails, find the best spill node */
611 
612 static signed
613 bi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
614 {
615    /* Pick a node satisfying bi_spill_register's preconditions */
616    BITSET_WORD *no_spill =
617       calloc(sizeof(BITSET_WORD), BITSET_WORDS(l->node_count));
618 
619    bi_foreach_instr_global(ctx, ins) {
620       bi_foreach_dest(ins, d) {
621          /* Don't allow spilling coverage mask writes because the
622           * register preload logic assumes it will stay in R60.
623           * This could be optimized.
624           */
625          if (ins->no_spill || ins->op == BI_OPCODE_ATEST ||
626              ins->op == BI_OPCODE_ZS_EMIT ||
627              (ins->op == BI_OPCODE_MOV_I32 &&
628               ins->src[0].type == BI_INDEX_REGISTER &&
629               ins->src[0].value == 60)) {
630             BITSET_SET(no_spill, ins->dest[d].value);
631          }
632       }
633    }
634 
635    unsigned best_benefit = 0;
636    signed best_node = -1;
637 
638    if (nodearray_is_sparse(&l->linear[l->spill_node])) {
639       nodearray_sparse_foreach(&l->linear[l->spill_node], elem) {
640          unsigned i = nodearray_sparse_key(elem);
641          unsigned constraint = nodearray_sparse_value(elem);
642 
643          /* Only spill nodes that interfere with the node failing
644           * register allocation. It's pointless to spill anything else */
645          if (!constraint)
646             continue;
647 
648          if (BITSET_TEST(no_spill, i))
649             continue;
650 
651          unsigned benefit = lcra_count_constraints(l, i);
652 
653          if (benefit > best_benefit) {
654             best_benefit = benefit;
655             best_node = i;
656          }
657       }
658    } else {
659       nodearray_value *row = l->linear[l->spill_node].dense;
660 
661       for (unsigned i = 0; i < l->node_count; ++i) {
662          /* Only spill nodes that interfere with the node failing
663           * register allocation. It's pointless to spill anything else */
664          if (!row[i])
665             continue;
666 
667          if (BITSET_TEST(no_spill, i))
668             continue;
669 
670          unsigned benefit = lcra_count_constraints(l, i);
671 
672          if (benefit > best_benefit) {
673             best_benefit = benefit;
674             best_node = i;
675          }
676       }
677    }
678 
679    free(no_spill);
680    return best_node;
681 }
682 
683 static unsigned
684 bi_count_read_index(bi_instr *I, bi_index index)
685 {
686    unsigned max = 0;
687 
688    bi_foreach_src(I, s) {
689       if (bi_is_equiv(I->src[s], index)) {
690          unsigned count = bi_count_read_registers(I, s);
691          max = MAX2(max, count + I->src[s].offset);
692       }
693    }
694 
695    return max;
696 }
697 
698 /*
699  * Wrappers to emit loads/stores to thread-local storage in an appropriate way
700  * for the target, so the spill/fill code becomes architecture-independent.
701  */
702 
703 static bi_index
704 bi_tls_ptr(bool hi)
705 {
706    return bi_fau(BIR_FAU_TLS_PTR, hi);
707 }
708 
709 static bi_instr *
710 bi_load_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset)
711 {
712    if (b->shader->arch >= 9) {
713       return bi_load_to(b, bits, src, bi_tls_ptr(false), bi_tls_ptr(true),
714                         BI_SEG_TL, offset);
715    } else {
716       return bi_load_to(b, bits, src, bi_imm_u32(offset), bi_zero(), BI_SEG_TL,
717                         0);
718    }
719 }
720 
721 static void
722 bi_store_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset)
723 {
724    if (b->shader->arch >= 9) {
725       bi_store(b, bits, src, bi_tls_ptr(false), bi_tls_ptr(true), BI_SEG_TL,
726                offset);
727    } else {
728       bi_store(b, bits, src, bi_imm_u32(offset), bi_zero(), BI_SEG_TL, 0);
729    }
730 }
731 
732 /* Once we've chosen a spill node, spill it and return the number of bytes spilled */
733 
734 static unsigned
735 bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)
736 {
737    bi_builder b = {.shader = ctx};
738    unsigned channels = 0;
739 
740    /* Spill after every store, fill before every load */
741    bi_foreach_instr_global_safe(ctx, I) {
742       bi_foreach_dest(I, d) {
743          if (!bi_is_equiv(I->dest[d], index))
744             continue;
745 
746          unsigned extra = I->dest[d].offset;
747          bi_index tmp = bi_temp(ctx);
748 
749          I->dest[d] = bi_replace_index(I->dest[d], tmp);
750          I->no_spill = true;
751 
752          unsigned count = bi_count_write_registers(I, d);
753          unsigned bits = count * 32;
754 
755          b.cursor = bi_after_instr(I);
756          bi_store_tl(&b, bits, tmp, offset + 4 * extra);
757 
758          ctx->spills++;
759          channels = MAX2(channels, extra + count);
760       }
761 
762       if (bi_has_arg(I, index)) {
763          b.cursor = bi_before_instr(I);
764          bi_index tmp = bi_temp(ctx);
765 
766          unsigned bits = bi_count_read_index(I, index) * 32;
767          bi_rewrite_index_src_single(I, index, tmp);
768 
769          bi_instr *ld = bi_load_tl(&b, bits, tmp, offset);
770          ld->no_spill = true;
771          ctx->fills++;
772       }
773    }
774 
775    return (channels * 4);
776 }
777 
778 /*
779  * For transition, lower collects and splits before RA, rather than after RA.
780  * LCRA knows how to deal with offsets (broken SSA), but not how to coalesce
781  * these vector moves.
782  */
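/* For example, "split x, y = v" becomes "mov x = v[0]; mov y = v[1]", and
 * later SSA reads of x and y are remapped to read v's components directly so
 * that the copies can usually be eliminated by the cleanup pass below. */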
783 static void
784 bi_lower_vector(bi_context *ctx, unsigned first_reg)
785 {
786    bi_index *remap = calloc(ctx->ssa_alloc, sizeof(bi_index));
787 
788    bi_foreach_instr_global_safe(ctx, I) {
789       bi_builder b = bi_init_builder(ctx, bi_after_instr(I));
790 
791       if (I->op == BI_OPCODE_SPLIT_I32) {
792          bi_index src = I->src[0];
793          assert(src.offset == 0);
794 
795          bi_foreach_dest(I, i) {
796             src.offset = i;
797             bi_mov_i32_to(&b, I->dest[i], src);
798 
799             if (I->dest[i].value < first_reg)
800                remap[I->dest[i].value] = src;
801          }
802 
803          bi_remove_instruction(I);
804       } else if (I->op == BI_OPCODE_COLLECT_I32) {
805          bi_index dest = I->dest[0];
806          assert(dest.offset == 0);
807          assert(((dest.value < first_reg) || I->nr_srcs == 1) &&
808                 "nir_lower_phis_to_scalar");
809 
810          bi_foreach_src(I, i) {
811             if (bi_is_null(I->src[i]))
812                continue;
813 
814             dest.offset = i;
815             bi_mov_i32_to(&b, dest, I->src[i]);
816          }
817 
818          bi_remove_instruction(I);
819       }
820    }
821 
822    bi_foreach_instr_global(ctx, I) {
823       bi_foreach_ssa_src(I, s) {
824          if (I->src[s].value < first_reg && !bi_is_null(remap[I->src[s].value]))
825             bi_replace_src(I, s, remap[I->src[s].value]);
826       }
827    }
828 
829    free(remap);
830 
831    /* After generating a pile of moves, clean up */
832    bi_compute_liveness_ra(ctx);
833 
834    bi_foreach_block_rev(ctx, block) {
835       uint8_t *live = rzalloc_array(block, uint8_t, ctx->ssa_alloc);
836 
837       bi_foreach_successor(block, succ) {
838          for (unsigned i = 0; i < ctx->ssa_alloc; ++i)
839             live[i] |= succ->live_in[i];
840       }
841 
842       bi_foreach_instr_in_block_safe_rev(block, ins) {
843          bool all_null = true;
844 
845          bi_foreach_dest(ins, d) {
846             if (live[ins->dest[d].value] & bi_writemask(ins, d))
847                all_null = false;
848          }
849 
850          if (all_null && !bi_side_effects(ins))
851             bi_remove_instruction(ins);
852          else
853             bi_liveness_ins_update_ra(live, ins);
854       }
855 
856       ralloc_free(block->live_in);
857       block->live_in = live;
858    }
859 }
860 
861 /*
862  * Check if the instruction requires a "tied" operand. Such instructions MUST
863  * allocate their source and destination to the same register. This is a
864  * constraint on RA, and may require extra moves.
865  *
866  * In particular, this is the case for Bifrost instructions that both read and
867  * write with the staging register mechanism.
868  */
869 static bool
870 bi_is_tied(const bi_instr *I)
871 {
872    return (I->op == BI_OPCODE_TEXC || I->op == BI_OPCODE_TEXC_DUAL ||
873            I->op == BI_OPCODE_ATOM_RETURN_I32 || I->op == BI_OPCODE_AXCHG_I32 ||
874            I->op == BI_OPCODE_ACMPXCHG_I32) &&
875           !bi_is_null(I->src[0]);
876 }
877 
878 /*
879  * For transition, coalesce tied operands together, as LCRA knows how to handle
880  * non-SSA operands but doesn't know about tied operands.
881  *
882  * This breaks the SSA form of the program, but that doesn't matter for LCRA.
883  */
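/* Concretely, for a tied instruction "dst = op src0, ...", we emit
 * "mov dst[i] = src0[i]" for each staging word before the instruction and then
 * rewrite src0 to read dst, so LCRA sees a single node for both operands. */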
884 static void
885 bi_coalesce_tied(bi_context *ctx)
886 {
887    bi_foreach_instr_global(ctx, I) {
888       if (!bi_is_tied(I))
889          continue;
890 
891       bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
892       unsigned n = bi_count_read_registers(I, 0);
893 
894       for (unsigned i = 0; i < n; ++i) {
895          bi_index dst = I->dest[0], src = I->src[0];
896 
897          assert(dst.offset == 0 && src.offset == 0);
898          dst.offset = src.offset = i;
899 
900          bi_mov_i32_to(&b, dst, src);
901       }
902 
903       bi_replace_src(I, 0, I->dest[0]);
904    }
905 }
906 
907 static unsigned
908 find_or_allocate_temp(unsigned *map, unsigned value, unsigned *alloc)
909 {
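   /* map[] stores index + 1 so that zero means "not yet assigned" */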
910    if (!map[value])
911       map[value] = ++(*alloc);
912 
913    assert(map[value]);
914    return map[value] - 1;
915 }
916 
917 /* Reassigns numbering to get rid of gaps in the indices and to prioritize
918  * smaller register classes */
919 
920 static void
921 squeeze_index(bi_context *ctx)
922 {
923    unsigned *map = rzalloc_array(ctx, unsigned, ctx->ssa_alloc);
924    ctx->ssa_alloc = 0;
925 
926    bi_foreach_instr_global(ctx, I) {
927       bi_foreach_dest(I, d)
928          I->dest[d].value =
929             find_or_allocate_temp(map, I->dest[d].value, &ctx->ssa_alloc);
930 
931       bi_foreach_ssa_src(I, s)
932          I->src[s].value =
933             find_or_allocate_temp(map, I->src[s].value, &ctx->ssa_alloc);
934    }
935 
936    ralloc_free(map);
937 }
938 
939 /*
940  * Brainless out-of-SSA pass. The eventual goal is to go out-of-SSA after RA and
941  * coalesce implicitly with biased colouring in a tree scan allocator. For now,
942  * this should be good enough for LCRA.
943  */
944 static unsigned
945 bi_out_of_ssa(bi_context *ctx)
946 {
947    bi_index zero = bi_fau(BIR_FAU_IMMEDIATE | 0, false);
948    unsigned first_reg = ctx->ssa_alloc;
949 
950    /* Trivially lower phis */
951    bi_foreach_block(ctx, block) {
952       bi_foreach_instr_in_block_safe(block, I) {
953          if (I->op != BI_OPCODE_PHI)
954             break;
955 
956          /* Assign a register for the phi */
957          bi_index reg = bi_temp(ctx);
958          assert(reg.value >= first_reg);
959 
960          /* Lower to a move in each predecessor. The destinations
961           * cannot interfere so these can be sequentialized
962           * in arbitrary order.
963           */
964          bi_foreach_predecessor(block, pred) {
965             bi_builder b = bi_init_builder(ctx, bi_after_block_logical(*pred));
966             unsigned i = bi_predecessor_index(block, *pred);
967 
968             assert(!I->src[i].abs);
969             assert(!I->src[i].neg);
970             assert(I->src[i].swizzle == BI_SWIZZLE_H01);
971 
972             /* MOV of immediate needs lowering on Valhall */
973             if (ctx->arch >= 9 && I->src[i].type == BI_INDEX_CONSTANT)
974                bi_iadd_imm_i32_to(&b, reg, zero, I->src[i].value);
975             else
976                bi_mov_i32_to(&b, reg, I->src[i]);
977          }
978 
979          /* Replace the phi with a move */
980          bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
981          bi_mov_i32_to(&b, I->dest[0], reg);
982          bi_remove_instruction(I);
983 
984          /* Propagate that move within the block. The destination
985           * is SSA and the source is not written in this block,
986           * so this is legal. The move itself will be DCE'd if
987           * possible in the next pass.
988           */
989          bi_foreach_instr_in_block_rev(block, prop) {
990             if (prop->op == BI_OPCODE_PHI)
991                break;
992 
993             bi_foreach_src(prop, s) {
994                if (bi_is_equiv(prop->src[s], I->dest[0])) {
995                   bi_replace_src(prop, s, reg);
996                }
997             }
998          }
999       }
1000    }
1001 
1002    /* Try to locally propagate the moves we created. We need to be extra
1003     * careful because we're not in SSA at this point, as such this
1004     * algorithm is quadratic. This will go away when we go out of SSA after
1005     * RA.
1006     */
1007    BITSET_WORD *used =
1008       calloc(sizeof(BITSET_WORD), BITSET_WORDS(ctx->ssa_alloc));
1009    BITSET_WORD *multiple_uses =
1010       calloc(sizeof(BITSET_WORD), BITSET_WORDS(ctx->ssa_alloc));
1011 
1012    bi_foreach_instr_global(ctx, I) {
1013       bi_foreach_ssa_src(I, s) {
1014          if (BITSET_TEST(used, I->src[s].value))
1015             BITSET_SET(multiple_uses, I->src[s].value);
1016          else
1017             BITSET_SET(used, I->src[s].value);
1018       }
1019    }
1020 
1021    bi_foreach_block(ctx, block) {
1022       bi_foreach_instr_in_block_safe_rev(block, mov) {
1023          /* Match "reg = ssa" */
1024          if (mov->op != BI_OPCODE_MOV_I32)
1025             continue;
1026          if (mov->dest[0].type != BI_INDEX_NORMAL)
1027             continue;
1028          if (mov->dest[0].value < first_reg)
1029             continue;
1030          if (!bi_is_ssa(mov->src[0]))
1031             continue;
1032          if (mov->src[0].value >= first_reg)
1033             continue;
1034          if (BITSET_TEST(multiple_uses, mov->src[0].value))
1035             continue;
1036 
1037          bool found = false;
1038 
1039          /* Look locally for the write of the SSA */
1040          bi_foreach_instr_in_block_rev(block, I) {
1041             bool bail = false;
1042 
1043             bi_foreach_src(I, s) {
1044                /* Bail: write-after-read */
1045                if (bi_is_equiv(I->src[s], mov->dest[0]))
1046                   bail = true;
1047             }
1048 
1049             if (bail)
1050                break;
1051 
1052             bi_foreach_dest(I, d) {
1053                /* Bail: write-after-write */
1054                if (bi_is_equiv(I->dest[d], mov->dest[0]))
1055                   break;
1056 
1057                if (!bi_is_equiv(I->dest[d], mov->src[0]))
1058                   continue;
1059 
1060                /* We found it, replace */
1061                I->dest[d] = bi_replace_index(I->dest[d], mov->dest[0]);
1062                found = true;
1063                break;
1064             }
1065 
1066             if (found)
1067                break;
1068          }
1069 
1070          if (found)
1071             bi_remove_instruction(mov);
1072       }
1073    }
1074 
1075    free(used);
1076    free(multiple_uses);
1077    return first_reg;
1078 }
1079 
1080 void
1081 bi_register_allocate(bi_context *ctx)
1082 {
1083    struct lcra_state *l = NULL;
1084    bool success = false;
1085 
1086    unsigned iter_count = 2000; /* max iterations */
1087 
1088    /* Number of bytes of memory we've spilled into */
1089    unsigned spill_count = ctx->info.tls_size;
1090 
1091    if (ctx->arch >= 9)
1092       va_lower_split_64bit(ctx);
1093 
1094    /* Lower tied operands. SSA is broken from here on. */
1095    unsigned first_reg = bi_out_of_ssa(ctx);
1096    bi_lower_vector(ctx, first_reg);
1097    bi_coalesce_tied(ctx);
1098    squeeze_index(ctx);
1099 
1100    /* Try with reduced register pressure to improve thread count */
1101    if (ctx->arch >= 7) {
1102       l = bi_allocate_registers(ctx, &success, false);
1103 
1104       if (success) {
1105          ctx->info.work_reg_count = 32;
1106       } else {
1107          lcra_free(l);
1108          l = NULL;
1109       }
1110    }
1111 
1112    /* Otherwise, use the full register file and spill until we succeed */
1113    while (!success && ((iter_count--) > 0)) {
1114       l = bi_allocate_registers(ctx, &success, true);
1115 
1116       if (success) {
1117          ctx->info.work_reg_count = 64;
1118       } else {
1119          signed spill_node = bi_choose_spill_node(ctx, l);
1120          lcra_free(l);
1121          l = NULL;
1122 
1123          if (spill_node == -1)
1124             unreachable("Failed to choose spill node\n");
1125 
1126          if (ctx->inputs->is_blend)
1127             unreachable("Blend shaders may not spill");
1128 
1129          /* By default, we use packed TLS addressing on Valhall.
1130           * We cannot cross 16 byte boundaries with packed TLS
1131           * addressing. Align to ensure this doesn't happen. This
1132           * could be optimized a bit.
1133           */
1134          if (ctx->arch >= 9)
1135             spill_count = ALIGN_POT(spill_count, 16);
1136 
1137          spill_count +=
1138             bi_spill_register(ctx, bi_get_index(spill_node), spill_count);
1139 
1140          /* In case the spill affected an instruction with tied
1141           * operands, we need to fix up.
1142           */
1143          bi_coalesce_tied(ctx);
1144       }
1145    }
1146 
1147    assert(success);
1148    assert(l != NULL);
1149 
1150    ctx->info.tls_size = spill_count;
1151    bi_install_registers(ctx, l);
1152 
1153    lcra_free(l);
1154 }
1155