/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

#include "compiler.h"
#include "bi_builder.h"
#include "util/u_memory.h"

struct lcra_state {
        unsigned node_count;
        uint64_t *affinity;

        /* Linear constraints imposed. Nested array sized upfront, organized as
         * linear[node_left][node_right]; that is, the element for nodes i and
         * j is indexed as linear[i * node_count + j].
         *
         * Each element is itself a bit field denoting whether a constraint on
         * a given (c_j - c_i) bias is present or not, including negative
         * biases.
         *
         * Note that for Bifrost there are 4 components, so the bias is in the
         * range [-3, 3] and is encoded in an 8-bit field. */
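        /* For example (illustration), if nodes i and j must not be placed
         * with bias (c_j - c_i) = 2, bit (2 + 3) = 5 of
         * linear[i * node_count + j] is set. */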

        uint8_t *linear;

        /* Before solving, forced registers; after solving, solutions. */
        unsigned *solutions;

        /** Node which caused register allocation to fail */
        unsigned spill_node;
};

/* This module is an implementation of "Linearly Constrained
 * Register Allocation". The paper is available in PDF form
 * (https://people.collabora.com/~alyssa/LCRA.pdf) as well as Markdown+LaTeX
 * (https://gitlab.freedesktop.org/alyssa/lcra/blob/master/LCRA.md)
 */

static struct lcra_state *
lcra_alloc_equations(unsigned node_count)
{
        struct lcra_state *l = calloc(1, sizeof(*l));

        l->node_count = node_count;

        l->linear = calloc(sizeof(l->linear[0]), node_count * node_count);
        l->solutions = calloc(sizeof(l->solutions[0]), node_count);
        l->affinity = calloc(sizeof(l->affinity[0]), node_count);

        memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count);

        return l;
}

static void
lcra_free(struct lcra_state *l)
{
        free(l->linear);
        free(l->affinity);
        free(l->solutions);
        free(l);
}

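/* Add an interference constraint between nodes i and j given the component
 * masks each touches. For each offset D, if placing the nodes at a signed
 * bias (c_j - c_i) = D would make their components overlap, that bias bit is
 * forbidden in linear[i][j] and the mirrored bias in linear[j][i]. */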
static void
lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i,
                unsigned j, unsigned cmask_j)
{
        if (i == j)
                return;

        uint8_t constraint_fw = 0;
        uint8_t constraint_bw = 0;

        for (unsigned D = 0; D < 4; ++D) {
                if (cmask_i & (cmask_j << D)) {
                        constraint_bw |= (1 << (3 + D));
                        constraint_fw |= (1 << (3 - D));
                }

                if (cmask_i & (cmask_j >> D)) {
                        constraint_fw |= (1 << (3 + D));
                        constraint_bw |= (1 << (3 - D));
                }
        }

        l->linear[j * l->node_count + i] |= constraint_fw;
        l->linear[i * l->node_count + j] |= constraint_bw;
}

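/* Check whether the candidate solution for node i is consistent with every
 * node that already has a solution: no forbidden (c_j - c_i) bias in the
 * range [-3, 3] may occur. */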
static bool
lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i)
{
        uint8_t *row = &l->linear[i * l->node_count];
        signed constant = solutions[i];

        for (unsigned j = 0; j < l->node_count; ++j) {
                if (solutions[j] == ~0) continue;

                signed lhs = solutions[j] - constant;

                if (lhs < -3 || lhs > 3)
                        continue;

                if (row[j] & (1 << (lhs + 3)))
                        return false;
        }

        return true;
}

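/* Greedy solver: for each unsolved node with a non-empty affinity, try the
 * allowed registers in ascending order and keep the first that satisfies the
 * linear constraints. If none fits, record the offending node so the caller
 * can spill and retry. */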
static bool
lcra_solve(struct lcra_state *l)
{
        for (unsigned step = 0; step < l->node_count; ++step) {
                if (l->solutions[step] != ~0) continue;
                if (l->affinity[step] == 0) continue;

                bool succ = false;

                u_foreach_bit64(r, l->affinity[step]) {
                        l->solutions[step] = r;

                        if (lcra_test_linear(l, l->solutions, step)) {
                                succ = true;
                                break;
                        }
                }

                /* Out of registers - prepare to spill */
                if (!succ) {
                        l->spill_node = step;
                        return false;
                }
        }

        return true;
}

/* Register spilling is implemented with a cost-benefit system. Costs are set
 * by the user. Benefits are calculated from the constraints. */

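/* The benefit of spilling a node is approximated by the number of constraint
 * bits in its row: the more constrained a node is, the more the rest of the
 * allocation is relieved by spilling it. */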
static unsigned
lcra_count_constraints(struct lcra_state *l, unsigned i)
{
        unsigned count = 0;
        uint8_t *constraints = &l->linear[i * l->node_count];

        for (unsigned j = 0; j < l->node_count; ++j)
                count += util_bitcount(constraints[j]);

        return count;
}

/* Construct an affinity mask such that a vector with `count` elements does
 * not intersect any of the registers in the bitset `clobber`. In other words,
 * the allocated base register a needs to satisfy a + i != b for each
 * i < count and each clobbered register b. Equivalently a != b - i, so we
 * need a \notin { b - i : i < count }. For the entire clobber set B, we need
 * a \notin union over b in B of { b - i : i < count }. That union may be
 * written equivalently as the union over i < count of (B - i), where
 * subtraction is defined elementwise and corresponds to a shift of the
 * entire bitset.
 *
 * EVEN_BITS_MASK is an affinity mask for aligned register pairs. Interpreted
 * as a bit set, it is { x : 0 <= x < 64 and x even }.
 */
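/* For example (illustration), with count = 2 and clobber = { r4 }, the union
 * of (B - i) over i < 2 is { r3, r4 }: a pair based at r3 or r4 would overlap
 * r4, so both are removed from the affinity. */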

#define EVEN_BITS_MASK (0x5555555555555555ull)

static uint64_t
bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
{
        uint64_t clobbered = 0;

        for (unsigned i = 0; i < count; ++i)
                clobbered |= (clobber >> i);

        /* Don't allocate past the end of the register file */
        if (count > 1) {
                unsigned excess = count - 1;
                uint64_t mask = BITFIELD_MASK(excess);
                clobbered |= mask << (64 - excess);

                if (split_file)
                        clobbered |= mask << (16 - excess);
        }

        /* Don't allocate the middle if we split out the middle */
        if (split_file)
                clobbered |= BITFIELD64_MASK(32) << 16;

        /* We can use a register iff it's not clobbered */
        return ~clobbered;
}

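/* Walk a block in reverse. `live` tracks the component liveness of each node
 * after the current instruction and `preload_live` tracks registers read
 * later as preloaded registers. Each destination interferes with every node
 * still live, and its affinity is restricted so it cannot overlap a later
 * preloaded read. */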
static void
bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live,
                uint64_t preload_live, unsigned node_count, bool is_blend,
                bool split_file, bool aligned_sr)
{
        bi_foreach_instr_in_block_rev(block, ins) {
                /* Mark all registers live after the instruction as
                 * interfering with the destination */

                bi_foreach_dest(ins, d) {
                        unsigned node = bi_get_node(ins->dest[d]);

                        if (node >= node_count)
                                continue;

                        /* Don't allocate to anything that's read later as a
                         * preloaded register. The affinity is the intersection
                         * of affinity masks for each write. Since writes have
                         * offsets, but the affinity is for the whole node, we
                         * need to offset the affinity opposite the write
                         * offset, so we shift right. */
                        unsigned count = bi_count_write_registers(ins, d);
                        unsigned offset = ins->dest[d].offset;
                        uint64_t affinity = bi_make_affinity(preload_live, count, split_file);

                        /* Valhall needs >= 64-bit staging writes to be pair-aligned */
                        if (aligned_sr && count >= 2)
                                affinity &= EVEN_BITS_MASK;

                        l->affinity[node] &= (affinity >> offset);

                        for (unsigned i = 0; i < node_count; ++i) {
                                if (live[i]) {
                                        lcra_add_node_interference(l, node,
                                                        bi_writemask(ins, d), i, live[i]);
                                }
                        }
                }

                /* Valhall needs >= 64-bit staging reads to be pair-aligned */
                if (aligned_sr && bi_count_read_registers(ins, 0) >= 2) {
                        unsigned node = bi_get_node(ins->src[0]);

                        if (node < node_count)
                                l->affinity[node] &= EVEN_BITS_MASK;
                }

                if (!is_blend && ins->op == BI_OPCODE_BLEND) {
                        /* Blend shaders might clobber r0-r15, r48. */
                        uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48);

                        for (unsigned i = 0; i < node_count; ++i) {
                                if (live[i])
                                        l->affinity[i] &= ~clobber;
                        }
                }

                /* Update live_in */
                preload_live = bi_postra_liveness_ins(preload_live, ins);
                bi_liveness_ins_update(live, ins, node_count);
        }

        block->reg_live_in = preload_live;
}

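/* Compute SSA liveness and post-RA (preloaded register) liveness, then walk
 * each block backwards from its live-out sets, accumulating interference and
 * affinity constraints. */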
static void
bi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs)
{
        unsigned node_count = bi_max_temp(ctx);

        bi_compute_liveness(ctx);
        bi_postra_liveness(ctx);

        bi_foreach_block_rev(ctx, blk) {
                uint8_t *live = mem_dup(blk->live_out, node_count);

                bi_mark_interference(blk, l, live, blk->reg_live_out,
                                node_count, ctx->inputs->is_blend, !full_regs,
                                ctx->arch >= 9);

                free(live);
        }
}

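/* Seed every written node with the default affinity for this shader, pin the
 * colour source of BLEND instructions to r0, build the interference graph,
 * and run the solver. */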
static struct lcra_state *
bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
{
        unsigned node_count = bi_max_temp(ctx);
        struct lcra_state *l = lcra_alloc_equations(node_count);

        /* Blend shaders are restricted to R0-R15. Other shaders at full
         * occupancy also can access R48-R63. At half occupancy they can access
         * the whole file. */

        uint64_t default_affinity =
                ctx->inputs->is_blend ? BITFIELD64_MASK(16) :
                full_regs ? BITFIELD64_MASK(64) :
                (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));

        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_dest(ins, d) {
                        unsigned dest = bi_get_node(ins->dest[d]);

                        /* Blend shaders expect the src colour to be in r0-r3 */
                        if (ins->op == BI_OPCODE_BLEND &&
                            !ctx->inputs->is_blend) {
                                unsigned node = bi_get_node(ins->src[0]);
                                assert(node < node_count);
                                l->solutions[node] = 0;
                        }

                        if (dest < node_count)
                                l->affinity[dest] = default_affinity;
                }
        }

        bi_compute_interference(ctx, l, full_regs);

        *success = lcra_solve(l);

        return l;
}

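/* Rewrite an index in terms of its allocated register, applying the register
 * offset and preserving the swizzle and modifiers. Indices that were not
 * register allocated (or not solved) pass through unchanged. */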
static bi_index
bi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index)
{
        /* Offsets can only be applied when we register allocated an index, or
         * alternatively for FAU's encoding */

        ASSERTED bool is_offset = (index.offset > 0) &&
                (index.type != BI_INDEX_FAU);
        unsigned node_count = bi_max_temp(ctx);

        /* Did we run RA for this index at all */
        if (bi_get_node(index) >= node_count) {
                assert(!is_offset);
                return index;
        }

        /* LCRA didn't bother solving this index (how lazy!) */
        signed solution = l->solutions[bi_get_node(index)];
        if (solution < 0) {
                assert(!is_offset);
                return index;
        }

        /* todo: do we want to compose with the subword swizzle? */
        bi_index new_index = bi_register(solution + index.offset);
        new_index.swizzle = index.swizzle;
        new_index.abs = index.abs;
        new_index.neg = index.neg;
        return new_index;
}

static void
bi_install_registers(bi_context *ctx, struct lcra_state *l)
{
        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_dest(ins, d)
                        ins->dest[d] = bi_reg_from_index(ctx, l, ins->dest[d]);

                bi_foreach_src(ins, s)
                        ins->src[s] = bi_reg_from_index(ctx, l, ins->src[s]);
        }
}

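/* Point every source equivalent to `old` at `new`, leaving each source's
 * swizzle and offset untouched. */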
static void
bi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new)
{
        bi_foreach_src(ins, i) {
                if (bi_is_equiv(ins->src[i], old)) {
                        ins->src[i].type = new.type;
                        ins->src[i].reg = new.reg;
                        ins->src[i].value = new.value;
                }
        }
}

/* If register allocation fails, find the best spill node */

static signed
bi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
{
        /* Pick a node satisfying bi_spill_register's preconditions */
        BITSET_WORD *no_spill = calloc(sizeof(BITSET_WORD), BITSET_WORDS(l->node_count));

        bi_foreach_instr_global(ctx, ins) {
                bi_foreach_dest(ins, d) {
                        unsigned node = bi_get_node(ins->dest[d]);

                        if (node < l->node_count && ins->no_spill)
                                BITSET_SET(no_spill, node);
                }
        }

        unsigned best_benefit = 0;
        signed best_node = -1;

        for (unsigned i = 0; i < l->node_count; ++i) {
                if (BITSET_TEST(no_spill, i)) continue;

                /* Only spill nodes that interfere with the node failing
                 * register allocation. It's pointless to spill anything else */
                if (!l->linear[(l->spill_node * l->node_count) + i]) continue;

                unsigned benefit = lcra_count_constraints(l, i);

                if (benefit > best_benefit) {
                        best_benefit = benefit;
                        best_node = i;
                }
        }

        free(no_spill);
        return best_node;
}

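/* Return how many register slots of `index` instruction I reads, i.e. the
 * maximum of (registers read + source offset) over all matching sources. */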
static unsigned
bi_count_read_index(bi_instr *I, bi_index index)
{
        unsigned max = 0;

        bi_foreach_src(I, s) {
                if (bi_is_equiv(I->src[s], index)) {
                        unsigned count = bi_count_read_registers(I, s);
                        max = MAX2(max, count + I->src[s].offset);
                }
        }

        return max;
}

/* Once we've chosen a spill node, spill it and return the number of bytes
 * spilled */

static unsigned
bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)
{
        bi_builder b = { .shader = ctx };
        unsigned channels = 0;

        /* Spill after every store, fill before every load */
        bi_foreach_instr_global_safe(ctx, I) {
                bi_foreach_dest(I, d) {
                        if (!bi_is_equiv(I->dest[d], index)) continue;

                        unsigned extra = I->dest[d].offset;
                        bi_index tmp = bi_temp(ctx);

                        I->dest[d] = bi_replace_index(I->dest[d], tmp);
                        I->no_spill = true;

                        unsigned count = bi_count_write_registers(I, d);
                        unsigned bits = count * 32;

                        b.cursor = bi_after_instr(I);
                        bi_index loc = bi_imm_u32(offset + 4 * extra);
                        bi_store(&b, bits, tmp, loc, bi_zero(), BI_SEG_TL);

                        ctx->spills++;
                        channels = MAX2(channels, extra + count);
                }

                if (bi_has_arg(I, index)) {
                        b.cursor = bi_before_instr(I);
                        bi_index tmp = bi_temp(ctx);

                        unsigned bits = bi_count_read_index(I, index) * 32;
                        bi_rewrite_index_src_single(I, index, tmp);

                        bi_instr *ld = bi_load_to(&b, bits, tmp,
                                        bi_imm_u32(offset), bi_zero(), BI_SEG_TL);
                        ld->no_spill = true;
                        ctx->fills++;
                }
        }

        return (channels * 4);
}

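/* Top-level register allocation. On v7 we first try with a reduced register
 * file to allow a higher thread count; otherwise (or if that fails) we
 * allocate with the full file, spilling the chosen node and retrying until
 * the solver succeeds. */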
void
bi_register_allocate(bi_context *ctx)
{
        struct lcra_state *l = NULL;
        bool success = false;

        unsigned iter_count = 1000; /* max iterations */

        /* Number of bytes of memory we've spilled into */
        unsigned spill_count = ctx->info->tls_size;

        /* Try with reduced register pressure to improve thread count on v7 */
        if (ctx->arch == 7) {
                bi_invalidate_liveness(ctx);
                l = bi_allocate_registers(ctx, &success, false);

                if (success) {
                        ctx->info->work_reg_count = 32;
                } else {
                        lcra_free(l);
                        l = NULL;
                }
        }

        /* Otherwise, use the full register file and spill until we succeed */
        while (!success && ((iter_count--) > 0)) {
                bi_invalidate_liveness(ctx);
                l = bi_allocate_registers(ctx, &success, true);

                if (success) {
                        ctx->info->work_reg_count = 64;
                } else {
                        signed spill_node = bi_choose_spill_node(ctx, l);
                        lcra_free(l);
                        l = NULL;

                        if (spill_node == -1)
                                unreachable("Failed to choose spill node\n");

                        spill_count += bi_spill_register(ctx,
                                        bi_node_to_index(spill_node, bi_max_temp(ctx)),
                                        spill_count);
                }
        }

        assert(success);
        assert(l != NULL);

        ctx->info->tls_size = spill_count;
        bi_install_registers(ctx, l);

        lcra_free(l);
}