/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

#include "compiler.h"
#include "nodearray.h"
#include "bi_builder.h"
#include "util/u_memory.h"

struct lcra_state {
        unsigned node_count;
        uint64_t *affinity;

        /* Linear constraints imposed. For each node there is a
         * 'nodearray' structure, which changes between a sparse and dense
         * array depending on the number of elements.
         *
         * Each element is itself a bit field denoting whether (c_j - c_i) bias
         * is present or not, including negative biases.
         *
         * We support up to 8 components, so the bias is in the range
         * [-7, 7], encoded in a 16-bit field.
         */
        nodearray *linear;

        /* Before solving, forced registers; after solving, solutions. */
        unsigned *solutions;

        /** Node which caused register allocation to fail */
        unsigned spill_node;
};

/* This module is an implementation of "Linearly Constrained
 * Register Allocation". The paper is available in PDF form
 * (https://people.collabora.com/~alyssa/LCRA.pdf) as well as Markdown+LaTeX
 * (https://gitlab.freedesktop.org/alyssa/lcra/blob/master/LCRA.md)
 */
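
/* As a concrete reading of the encoding above: linear[i][j] has bit (b + 7)
 * set iff the bias b = c_i - c_j is forbidden, for b in [-7, 7], so bits
 * 0..14 of each 16-bit element are used. For example, forbidding
 * c_i - c_j = 2 sets bit 9 in linear[i][j] and, symmetrically, bit 5
 * (= -2 + 7) in linear[j][i].
 */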

static struct lcra_state *
lcra_alloc_equations(unsigned node_count)
{
        struct lcra_state *l = calloc(1, sizeof(*l));

        l->node_count = node_count;

        l->linear = calloc(sizeof(l->linear[0]), node_count);
        l->solutions = calloc(sizeof(l->solutions[0]), node_count);
        l->affinity = calloc(sizeof(l->affinity[0]), node_count);

        memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count);

        return l;
}

static void
lcra_free(struct lcra_state *l)
{
        for (unsigned i = 0; i < l->node_count; ++i)
                nodearray_reset(&l->linear[i]);

        free(l->linear);
        free(l->affinity);
        free(l->solutions);
        free(l);
}

static void
lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, unsigned j, unsigned cmask_j)
{
        if (i == j)
                return;

        nodearray_value constraint_fw = 0;
        nodearray_value constraint_bw = 0;

        /* The constraint bits are reversed from lcra.c so that register
         * allocation can be done in parallel for every possible solution,
         * with lower-order bits representing smaller registers. */

        for (unsigned D = 0; D < 8; ++D) {
                if (cmask_i & (cmask_j << D)) {
                        constraint_fw |= (1 << (7 + D));
                        constraint_bw |= (1 << (7 - D));
                }

                if (cmask_i & (cmask_j >> D)) {
                        constraint_bw |= (1 << (7 + D));
                        constraint_fw |= (1 << (7 - D));
                }
        }

        /* Use dense arrays after adding 256 elements */
        nodearray_orr(&l->linear[j], i, constraint_fw, 256, l->node_count);
        nodearray_orr(&l->linear[i], j, constraint_bw, 256, l->node_count);
}
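
/* For example, if node i writes component 0 (cmask_i = 0x1) while node j is
 * live in components 0 and 1 (cmask_j = 0x3), the loop above sets bits 7 and
 * 8 in linear[i][j]: the nodes' registers overlap exactly when c_i - c_j is
 * 0 or +1, i.e. when node i lands on either of node j's two registers.
 */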

static bool
lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i)
{
        signed constant = solutions[i];

        if (nodearray_is_sparse(&l->linear[i])) {
                nodearray_sparse_foreach(&l->linear[i], elem) {
                        unsigned j = nodearray_sparse_key(elem);
                        nodearray_value constraint = nodearray_sparse_value(elem);

                        if (solutions[j] == ~0) continue;

                        signed lhs = constant - solutions[j];

                        if (lhs < -7 || lhs > 7)
                                continue;

                        if (constraint & (1 << (lhs + 7)))
                                return false;
                }

                return true;
        }

        nodearray_value *row = l->linear[i].dense;

        for (unsigned j = 0; j < l->node_count; ++j) {
                if (solutions[j] == ~0) continue;

                signed lhs = constant - solutions[j];

                if (lhs < -7 || lhs > 7)
                        continue;

                if (row[j] & (1 << (lhs + 7)))
                        return false;
        }

        return true;
}
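
/* E.g. with solutions[i] = 4 and solutions[j] = 2, lhs = 2, so the candidate
 * is rejected iff bit (2 + 7) = 9 is set in linear[i][j]. Biases outside
 * [-7, 7] cannot be encoded and are trivially satisfied.
 */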

static bool
lcra_solve(struct lcra_state *l)
{
        for (unsigned step = 0; step < l->node_count; ++step) {
                if (l->solutions[step] != ~0) continue;
                if (l->affinity[step] == 0) continue;

                bool succ = false;

                u_foreach_bit64(r, l->affinity[step]) {
                        l->solutions[step] = r;

                        if (lcra_test_linear(l, l->solutions, step)) {
                                succ = true;
                                break;
                        }
                }

                /* Out of registers - prepare to spill */
                if (!succ) {
                        l->spill_node = step;
                        return false;
                }
        }

        return true;
}

/* Register spilling is implemented with a cost-benefit system. Costs are set
 * by the user. Benefits are calculated from the constraints. */

static unsigned
lcra_count_constraints(struct lcra_state *l, unsigned i)
{
        unsigned count = 0;
        nodearray *constraints = &l->linear[i];

        if (nodearray_is_sparse(constraints)) {
                nodearray_sparse_foreach(constraints, elem)
                        count += util_bitcount(nodearray_sparse_value(elem));
        } else {
                nodearray_dense_foreach_64(constraints, elem)
                        count += util_bitcount64(*elem);
        }

        return count;
}
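
/* The benefit is simply the number of constraint bits in the node's row. A
 * node constrained against two neighbours at biases {0, +1} and {0} counts 3,
 * so spilling it relaxes more of the failing system than a node counting 1.
 */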

/* Construct an affinity mask such that a vector with `count` elements does
 * not intersect any of the registers in the bitset `clobber`. In other words,
 * an allocated register a needs to satisfy a + i != b for each i < count and
 * each b in the clobber set. Equivalently that's a != b - i, so we need
 * a \notin { b - i : i < count }. For the entire clobber set B, we need
 * a \notin union over b in B of { b - i : i < count }, which is the desired
 * clobber set. That may be written equivalently as the union over i < count
 * of (B - i), where subtraction is defined elementwise and corresponds to a
 * shift of the entire bitset.
 *
 * EVEN_BITS_MASK is an affinity mask for aligned register pairs. Interpreted
 * as a bit set, it is { x : 0 <= x < 64 and x is even }
 */

#define EVEN_BITS_MASK (0x5555555555555555ull)

static uint64_t
bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
{
        uint64_t clobbered = 0;

        for (unsigned i = 0; i < count; ++i)
                clobbered |= (clobber >> i);

        /* Don't allocate past the end of the register file */
        if (count > 1) {
                unsigned excess = count - 1;
                uint64_t mask = BITFIELD_MASK(excess);
                clobbered |= mask << (64 - excess);

                if (split_file)
                        clobbered |= mask << (16 - excess);
        }

        /* Don't allocate the middle if we split out the middle */
        if (split_file)
                clobbered |= BITFIELD64_MASK(32) << 16;

        /* We can use a register iff it's not clobbered */
        return ~clobbered;
}
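
/* For instance, bi_make_affinity(BITFIELD64_BIT(4), 2, false) removes r3 and
 * r4 from the affinity (a 2-register vector based at either would overlap
 * r4), along with r63 (a vector based there would run off the end of the
 * file), and allows everything else.
 */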

static void
bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live, uint64_t preload_live, unsigned node_count, bool is_blend, bool split_file, bool aligned_sr)
{
        bi_foreach_instr_in_block_rev(block, ins) {
                /* Mark all registers live after the instruction as
                 * interfering with the destination */

                bi_foreach_dest(ins, d) {
                        unsigned node = bi_get_node(ins->dest[d]);

                        if (node >= node_count)
                                continue;

                        /* Don't allocate to anything that's read later as a
                         * preloaded register. The affinity is the intersection
                         * of affinity masks for each write. Since writes have
                         * offsets, but the affinity is for the whole node, we
                         * need to offset the affinity opposite the write
                         * offset, so we shift right. */
                        unsigned count = bi_count_write_registers(ins, d);
                        unsigned offset = ins->dest[d].offset;
                        uint64_t affinity = bi_make_affinity(preload_live, count, split_file) >> offset;
                        /* Valhall needs >= 64-bit staging writes to be pair-aligned */
                        if (aligned_sr && (count >= 2 || offset))
                                affinity &= EVEN_BITS_MASK;

                        l->affinity[node] &= affinity;

                        for (unsigned i = 0; i < node_count; ++i) {
                                uint8_t r = live[i];

                                /* Nodes only interfere if they occupy
                                 * /different values/ at the same time
                                 * (Boissinot). In particular, sources of
                                 * moves do not interfere with their
                                 * destinations. This enables a limited form of
                                 * coalescing.
                                 */
                                if (ins->op == BI_OPCODE_MOV_I32 &&
                                    i == bi_get_node(ins->src[0])) {

                                        r &= ~BITFIELD_BIT(ins->src[0].offset);
                                }

                                if (r) {
                                        lcra_add_node_interference(l, node,
                                                        bi_writemask(ins, d), i, r);
                                }
                        }

                        unsigned node_first = bi_get_node(ins->dest[0]);
                        if (d == 1 && node_first < node_count) {
                                lcra_add_node_interference(l, node, bi_writemask(ins, 1),
                                                           node_first, bi_writemask(ins, 0));
                        }
                }

                /* Valhall needs >= 64-bit reads to be pair-aligned */
                if (aligned_sr) {
                        bi_foreach_src(ins, s) {
                                if (bi_count_read_registers(ins, s) >= 2) {
                                        unsigned node = bi_get_node(ins->src[s]);

                                        if (node < node_count)
                                                l->affinity[node] &= EVEN_BITS_MASK;
                                }
                        }
                }

                if (!is_blend && ins->op == BI_OPCODE_BLEND) {
                        /* Blend shaders might clobber r0-r15, r48. */
                        uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48);

                        for (unsigned i = 0; i < node_count; ++i) {
                                if (live[i])
                                        l->affinity[i] &= ~clobber;
                        }
                }

                /* Update live_in */
                preload_live = bi_postra_liveness_ins(preload_live, ins);
                bi_liveness_ins_update(live, ins, node_count);
        }

        block->reg_live_in = preload_live;
}

static void
bi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs)
{
        unsigned node_count = bi_max_temp(ctx);

        bi_compute_liveness(ctx);
        bi_postra_liveness(ctx);

        bi_foreach_block_rev(ctx, blk) {
                uint8_t *live = mem_dup(blk->live_out, node_count);

                bi_mark_interference(blk, l, live, blk->reg_live_out,
                                node_count, ctx->inputs->is_blend, !full_regs,
                                ctx->arch >= 9);

                free(live);
        }
}

static struct lcra_state *
bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
{
        unsigned node_count = bi_max_temp(ctx);
        struct lcra_state *l = lcra_alloc_equations(node_count);

        /* Blend shaders are restricted to R0-R15. Other shaders at full
         * occupancy can also access R48-R63. At half occupancy they can access
         * the whole file. */

        uint64_t default_affinity =
                ctx->inputs->is_blend ? BITFIELD64_MASK(16) :
                full_regs ? BITFIELD64_MASK(64) :
                (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));
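        /* Concretely: 0xffff for blend shaders, ~0ull with the full file, and
         * 0xffff00000000ffff (r0-r15 plus r48-r63) otherwise. */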

        /* To test spilling, mimic a small register file */
        if (bifrost_debug & BIFROST_DBG_SPILL && !ctx->inputs->is_blend)
                default_affinity &= BITFIELD64_MASK(48) << 8;

        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_dest(ins, d) {
                        unsigned dest = bi_get_node(ins->dest[d]);

                        if (dest < node_count)
                                l->affinity[dest] = default_affinity;
                }

                /* Blend shaders expect the src colour to be in r0-r3 */
                if (ins->op == BI_OPCODE_BLEND &&
                    !ctx->inputs->is_blend) {
                        unsigned node = bi_get_node(ins->src[0]);
                        assert(node < node_count);
                        l->solutions[node] = 0;

                        /* Dual source blend input in r4-r7 */
                        node = bi_get_node(ins->src[4]);
                        if (node < node_count)
                                l->solutions[node] = 4;

                        /* Writes to R48 */
                        node = bi_get_node(ins->dest[0]);
                        if (!bi_is_null(ins->dest[0])) {
                                assert(node < node_count);
                                l->solutions[node] = 48;
                        }
                }

                /* Coverage mask writes stay in R60 */
                if ((ins->op == BI_OPCODE_ATEST ||
                     ins->op == BI_OPCODE_ZS_EMIT) &&
                    !bi_is_null(ins->dest[0])) {
                        unsigned node = bi_get_node(ins->dest[0]);
                        assert(node < node_count);
                        l->solutions[node] = 60;
                }

                /* Experimentally, it seems coverage mask inputs to ATEST must
                 * be in R60. Otherwise coverage mask writes do not work with
                 * early-ZS with pixel-frequency-shading (this combination of
                 * settings is legal if depth/stencil writes are disabled).
                 */
                if (ins->op == BI_OPCODE_ATEST) {
                        unsigned node = bi_get_node(ins->src[0]);
                        assert(node < node_count);
                        l->solutions[node] = 60;
                }
        }

        bi_compute_interference(ctx, l, full_regs);

        /* Coalesce register moves if we're allowed. We need to be careful due
         * to the restricted affinity induced by the blend shader ABI.
         */
        bi_foreach_instr_global(ctx, I) {
                if (I->op != BI_OPCODE_MOV_I32) continue;
                if (I->src[0].type != BI_INDEX_REGISTER) continue;

                unsigned reg = I->src[0].value;
                unsigned node = bi_get_node(I->dest[0]);
                assert(node < node_count);

                if (l->solutions[node] != ~0) continue;

                uint64_t affinity = l->affinity[node];

                if (ctx->inputs->is_blend) {
                        /* We're allowed to coalesce the moves to these */
                        affinity |= BITFIELD64_BIT(48);
                        affinity |= BITFIELD64_BIT(60);
                }

                /* Try to coalesce */
                if (affinity & BITFIELD64_BIT(reg)) {
                        l->solutions[node] = reg;

                        if (!lcra_test_linear(l, l->solutions, node))
                                l->solutions[node] = ~0;
                }
        }

        *success = lcra_solve(l);

        return l;
}

static bi_index
bi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index)
{
        /* Offsets can only be applied when we register allocated an index, or
         * alternatively for FAU's encoding */

        ASSERTED bool is_offset = (index.offset > 0) &&
                (index.type != BI_INDEX_FAU);
        unsigned node_count = bi_max_temp(ctx);

        /* Did we run RA for this index at all */
        if (bi_get_node(index) >= node_count) {
                assert(!is_offset);
                return index;
        }

        /* LCRA didn't bother solving this index (how lazy!) */
        signed solution = l->solutions[bi_get_node(index)];
        if (solution < 0) {
                assert(!is_offset);
                return index;
        }

        /* todo: do we want to compose with the subword swizzle? */
        bi_index new_index = bi_register(solution + index.offset);
        new_index.swizzle = index.swizzle;
        new_index.abs = index.abs;
        new_index.neg = index.neg;
        return new_index;
}
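
/* So a node solved to register 4 and referenced with offset 2 becomes r6,
 * carrying over the swizzle and the abs/neg modifiers unchanged.
 */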

/* Dual texture instructions write to two sets of staging registers, modeled as
 * two destinations in the IR. The first set is communicated with the usual
 * staging register mechanism. The second set is encoded in the texture
 * operation descriptor. This is quite unusual, and requires the following late
 * fixup.
 */
static void
bi_fixup_dual_tex_register(bi_instr *I)
{
        assert(I->dest[1].type == BI_INDEX_REGISTER);
        assert(I->src[3].type == BI_INDEX_CONSTANT);

        struct bifrost_dual_texture_operation desc = {
                .secondary_register = I->dest[1].value
        };

        I->src[3].value |= bi_dual_tex_as_u32(desc);
}

static void
bi_install_registers(bi_context *ctx, struct lcra_state *l)
{
        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_dest(ins, d)
                        ins->dest[d] = bi_reg_from_index(ctx, l, ins->dest[d]);

                bi_foreach_src(ins, s)
                        ins->src[s] = bi_reg_from_index(ctx, l, ins->src[s]);

                if (ins->op == BI_OPCODE_TEXC && !bi_is_null(ins->dest[1]))
                        bi_fixup_dual_tex_register(ins);
        }
}

static void
bi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new)
{
        bi_foreach_src(ins, i) {
                if (bi_is_equiv(ins->src[i], old)) {
                        ins->src[i].type = new.type;
                        ins->src[i].reg = new.reg;
                        ins->src[i].value = new.value;
                }
        }
}

/* If register allocation fails, find the best spill node */

static signed
bi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
{
        /* Pick a node satisfying bi_spill_register's preconditions */
        BITSET_WORD *no_spill = calloc(sizeof(BITSET_WORD), BITSET_WORDS(l->node_count));

        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_dest(ins, d) {
                        unsigned node = bi_get_node(ins->dest[d]);

                        if (node >= l->node_count)
                                continue;

                        /* Don't allow spilling coverage mask writes because the
                         * register preload logic assumes it will stay in R60.
                         * This could be optimized.
                         */
                        if (ins->no_spill ||
                            ins->op == BI_OPCODE_ATEST ||
                            ins->op == BI_OPCODE_ZS_EMIT ||
                            (ins->op == BI_OPCODE_MOV_I32 &&
                             ins->src[0].type == BI_INDEX_REGISTER &&
                             ins->src[0].value == 60)) {
                                BITSET_SET(no_spill, node);
                        }
                }
        }

        unsigned best_benefit = 0;
        signed best_node = -1;

        if (nodearray_is_sparse(&l->linear[l->spill_node])) {
                nodearray_sparse_foreach(&l->linear[l->spill_node], elem) {
                        unsigned i = nodearray_sparse_key(elem);
                        unsigned constraint = nodearray_sparse_value(elem);

                        /* Only spill nodes that interfere with the node failing
                         * register allocation. It's pointless to spill anything else */
                        if (!constraint) continue;

                        if (BITSET_TEST(no_spill, i)) continue;

                        unsigned benefit = lcra_count_constraints(l, i);

                        if (benefit > best_benefit) {
                                best_benefit = benefit;
                                best_node = i;
                        }
                }
        } else {
                nodearray_value *row = l->linear[l->spill_node].dense;

                for (unsigned i = 0; i < l->node_count; ++i) {
                        /* Only spill nodes that interfere with the node failing
                         * register allocation. It's pointless to spill anything else */
                        if (!row[i]) continue;

                        if (BITSET_TEST(no_spill, i)) continue;

                        unsigned benefit = lcra_count_constraints(l, i);

                        if (benefit > best_benefit) {
                                best_benefit = benefit;
                                best_node = i;
                        }
                }
        }

        free(no_spill);
        return best_node;
}

static unsigned
bi_count_read_index(bi_instr *I, bi_index index)
{
        unsigned max = 0;

        bi_foreach_src(I, s) {
                if (bi_is_equiv(I->src[s], index)) {
                        unsigned count = bi_count_read_registers(I, s);
                        max = MAX2(max, count + I->src[s].offset);
                }
        }

        return max;
}

/*
 * Wrappers to emit loads/stores to thread-local storage in an appropriate way
 * for the target, so the spill/fill code becomes architecture-independent.
 */

static bi_index
bi_tls_ptr(bool hi)
{
        return bi_fau(BIR_FAU_TLS_PTR, hi);
}

static bi_instr *
bi_load_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset)
{
        if (b->shader->arch >= 9) {
                return bi_load_to(b, bits, src, bi_tls_ptr(false),
                                  bi_tls_ptr(true), BI_SEG_TL, offset);
        } else {
                return bi_load_to(b, bits, src, bi_imm_u32(offset), bi_zero(),
                                  BI_SEG_TL, 0);
        }
}

static void
bi_store_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset)
{
        if (b->shader->arch >= 9) {
                bi_store(b, bits, src, bi_tls_ptr(false), bi_tls_ptr(true), BI_SEG_TL, offset);
        } else {
                bi_store(b, bits, src, bi_imm_u32(offset), bi_zero(), BI_SEG_TL, 0);
        }
}

/* Once we've chosen a spill node, spill it and return the number of bytes spilled */

static unsigned
bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)
{
        bi_builder b = { .shader = ctx };
        unsigned channels = 0;

        /* Spill after every store, fill before every load */
        bi_foreach_instr_global_safe(ctx, I) {
                bi_foreach_dest(I, d) {
                        if (!bi_is_equiv(I->dest[d], index)) continue;

                        unsigned extra = I->dest[d].offset;
                        bi_index tmp = bi_temp(ctx);

                        I->dest[d] = bi_replace_index(I->dest[d], tmp);
                        I->no_spill = true;

                        unsigned count = bi_count_write_registers(I, d);
                        unsigned bits = count * 32;

                        b.cursor = bi_after_instr(I);
                        bi_store_tl(&b, bits, tmp, offset + 4 * extra);

                        ctx->spills++;
                        channels = MAX2(channels, extra + count);
                }

                if (bi_has_arg(I, index)) {
                        b.cursor = bi_before_instr(I);
                        bi_index tmp = bi_temp(ctx);

                        unsigned bits = bi_count_read_index(I, index) * 32;
                        bi_rewrite_index_src_single(I, index, tmp);

                        bi_instr *ld = bi_load_tl(&b, bits, tmp, offset);
                        ld->no_spill = true;
                        ctx->fills++;
                }
        }

        return (channels * 4);
}
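
/* The return value reserves one 32-bit TLS slot per spilled channel: a node
 * whose widest write covers 2 channels reserves 8 bytes starting at `offset`.
 */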

/*
 * For transition, lower collects and splits before RA, rather than after RA.
 * LCRA knows how to deal with offsets (broken SSA), but not how to coalesce
 * these vector moves.
 */
static void
bi_lower_vector(bi_context *ctx)
{
        bi_index *remap = calloc(ctx->ssa_alloc, sizeof(bi_index));

        bi_foreach_instr_global_safe(ctx, I) {
                bi_builder b = bi_init_builder(ctx, bi_after_instr(I));

                if (I->op == BI_OPCODE_SPLIT_I32) {
                        bi_index src = I->src[0];
                        assert(src.offset == 0);

                        for (unsigned i = 0; i < I->nr_dests; ++i) {
                                if (bi_is_null(I->dest[i]))
                                        continue;

                                src.offset = i;
                                bi_mov_i32_to(&b, I->dest[i], src);

                                if (bi_is_ssa(I->dest[i]))
                                        remap[I->dest[i].value] = src;
                        }

                        bi_remove_instruction(I);
                } else if (I->op == BI_OPCODE_COLLECT_I32) {
                        bi_index dest = I->dest[0];
                        assert(dest.offset == 0);
                        assert((bi_is_ssa(dest) || I->nr_srcs == 1) && "nir_lower_phis_to_scalar");

                        for (unsigned i = 0; i < I->nr_srcs; ++i) {
                                if (bi_is_null(I->src[i]))
                                        continue;

                                dest.offset = i;
                                bi_mov_i32_to(&b, dest, I->src[i]);
                        }

                        bi_remove_instruction(I);
                }
        }

        bi_foreach_instr_global(ctx, I) {
                bi_foreach_src(I, s) {
                        if (bi_is_ssa(I->src[s]) && !bi_is_null(remap[I->src[s].value]))
                                I->src[s] = bi_replace_index(I->src[s], remap[I->src[s].value]);
                }
        }

        free(remap);

        /* After generating a pile of moves, clean up */
        bi_opt_dead_code_eliminate(ctx);
}
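
/* As an illustration, a 2-channel COLLECT_I32 into a value x becomes two
 * moves into x at offsets 0 and 1, while a SPLIT_I32 of x becomes moves from
 * those offsets; the remap table then forwards later readers of the split
 * destinations straight to the offset sources.
 */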

/*
 * Check if the instruction requires a "tied" operand. Such instructions MUST
 * allocate their source and destination to the same register. This is a
 * constraint on RA, and may require extra moves.
 *
 * In particular, this is the case for Bifrost instructions that both read and
 * write with the staging register mechanism.
 */
static bool
bi_is_tied(const bi_instr *I)
{
        if (bi_is_null(I->src[0]))
                return false;

        return (I->op == BI_OPCODE_TEXC ||
                I->op == BI_OPCODE_ATOM_RETURN_I32 ||
                I->op == BI_OPCODE_AXCHG_I32 ||
                I->op == BI_OPCODE_ACMPXCHG_I32);
}

/*
 * For transition, coalesce tied operands together, as LCRA knows how to handle
 * non-SSA operands but doesn't know about tied operands.
 *
 * This breaks the SSA form of the program, but that doesn't matter for LCRA.
 */
static void
bi_coalesce_tied(bi_context *ctx)
{
        bi_foreach_instr_global(ctx, I) {
                if (!bi_is_tied(I)) continue;

                bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
                unsigned n = bi_count_read_registers(I, 0);

                for (unsigned i = 0; i < n; ++i) {
                        bi_index dst = I->dest[0], src = I->src[0];

                        assert(dst.offset == 0 && src.offset == 0);
                        dst.offset = src.offset = i;

                        bi_mov_i32_to(&b, dst, src);
                }

                I->src[0] = bi_replace_index(I->src[0], I->dest[0]);
        }
}
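
/* E.g. for a TEXC reading n = 4 staging registers, four moves copying
 * component i of src[0] into component i of dest[0] are emitted before the
 * instruction, and src[0] is then rewritten to dest[0], forcing RA to assign
 * both operands the same registers.
 */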

static unsigned
find_or_allocate_temp(unsigned *map, unsigned value, unsigned *alloc)
{
        if (!map[value])
                map[value] = ++(*alloc);

        assert(map[value]);
        return map[value] - 1;
}

/* Reassigns numbering to get rid of gaps in the indices and to prioritize
 * smaller register classes */

static void
squeeze_index(bi_context *ctx)
{
        unsigned *map = rzalloc_array(ctx, unsigned, ctx->ssa_alloc);
        ctx->ssa_alloc = 0;

        bi_foreach_instr_global(ctx, I) {
                bi_foreach_dest(I, d) {
                        if (I->dest[d].type == BI_INDEX_NORMAL)
                                I->dest[d].value = find_or_allocate_temp(map, I->dest[d].value, &ctx->ssa_alloc);
                }

                bi_foreach_src(I, s) {
                        if (I->src[s].type == BI_INDEX_NORMAL)
                                I->src[s].value = find_or_allocate_temp(map, I->src[s].value, &ctx->ssa_alloc);
                }
        }

        ralloc_free(map);
}
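
/* For example, indices first seen in the order 3, 7, 7, 10 are renumbered to
 * 0, 1, 1, 2, so ssa_alloc shrinks to the number of indices actually used.
 */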

void
bi_register_allocate(bi_context *ctx)
{
        struct lcra_state *l = NULL;
        bool success = false;

        unsigned iter_count = 1000; /* max iterations */

        /* Number of bytes of memory we've spilled into */
        unsigned spill_count = ctx->info.tls_size;

        if (ctx->arch >= 9)
                va_lower_split_64bit(ctx);

        bi_lower_vector(ctx);

        /* Lower tied operands. SSA is broken from here on. */
        bi_coalesce_tied(ctx);
        squeeze_index(ctx);

        /* Try with reduced register pressure to improve thread count */
        if (ctx->arch >= 7) {
                l = bi_allocate_registers(ctx, &success, false);

                if (success) {
                        ctx->info.work_reg_count = 32;
                } else {
                        lcra_free(l);
                        l = NULL;
                }
        }

        /* Otherwise, use the full register file and spill until we succeed */
        while (!success && ((iter_count--) > 0)) {
                l = bi_allocate_registers(ctx, &success, true);

                if (success) {
                        ctx->info.work_reg_count = 64;
                } else {
                        signed spill_node = bi_choose_spill_node(ctx, l);
                        lcra_free(l);
                        l = NULL;

                        if (spill_node == -1)
                                unreachable("Failed to choose spill node\n");

                        if (ctx->inputs->is_blend)
                                unreachable("Blend shaders may not spill");

                        /* By default, we use packed TLS addressing on Valhall.
                         * We cannot cross 16 byte boundaries with packed TLS
                         * addressing. Align to ensure this doesn't happen. This
                         * could be optimized a bit.
                         */
                        if (ctx->arch >= 9)
                                spill_count = ALIGN_POT(spill_count, 16);

                        spill_count += bi_spill_register(ctx,
                                        bi_node_to_index(spill_node, bi_max_temp(ctx)),
                                        spill_count);

                        /* In case the spill affected an instruction with tied
                         * operands, we need to fix up.
                         */
                        bi_coalesce_tied(ctx);
                }
        }

        assert(success);
        assert(l != NULL);

        ctx->info.tls_size = spill_count;
        bi_install_registers(ctx, l);

        lcra_free(l);
}