/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *    Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

#include "compiler.h"
#include "bi_builder.h"
#include "util/u_memory.h"

struct lcra_state {
   unsigned node_count;
   uint64_t *affinity;

   /* Linear constraints imposed. Nested array sized upfront, organized as
    * linear[node_left][node_right], i.e. indexed as
    * linear[node_left * node_count + node_right].
    *
    * Each element is itself a bit field denoting whether a given (c_j - c_i)
    * bias is present or not, including negative biases.
    *
    * Note that for Bifrost there are 4 components, so the bias is in the
    * range [-3, 3] and is encoded in an 8-bit field. */
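   /* Concretely, bit (b + 3) of linear[i][j] being set means the assignment
    * c_j - c_i = b is forbidden: bit 0 is a bias of -3, bit 3 a bias of 0
    * (direct overlap), bit 6 a bias of +3, and bit 7 is unused.
    * lcra_test_linear below checks exactly this. */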

   uint8_t *linear;

   /* Before solving, forced registers; after solving, solutions. */
   unsigned *solutions;

   /** Node which caused register allocation to fail */
   unsigned spill_node;
};

/* This module is an implementation of "Linearly Constrained
 * Register Allocation". The paper is available in PDF form
 * (https://people.collabora.com/~alyssa/LCRA.pdf) as well as Markdown+LaTeX
 * (https://gitlab.freedesktop.org/alyssa/lcra/blob/master/LCRA.md)
 */
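
/* In outline, the solver below walks the nodes in order and, for each node,
 * tries the registers permitted by its affinity mask, keeping the first
 * candidate that satisfies the linear constraints against the nodes that
 * already have solutions (lcra_test_linear). If no candidate works,
 * allocation fails and the offending node is recorded so a spill candidate
 * can be chosen. */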

static struct lcra_state *
lcra_alloc_equations(unsigned node_count)
{
   struct lcra_state *l = calloc(1, sizeof(*l));

   l->node_count = node_count;

   l->linear = calloc(sizeof(l->linear[0]), node_count * node_count);
   l->solutions = calloc(sizeof(l->solutions[0]), node_count);
   l->affinity = calloc(sizeof(l->affinity[0]), node_count);

   memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count);

   return l;
}

static void
lcra_free(struct lcra_state *l)
{
   free(l->linear);
   free(l->affinity);
   free(l->solutions);
   free(l);
}

static void
lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i,
                           unsigned j, unsigned cmask_j)
{
   if (i == j)
      return;

   uint8_t constraint_fw = 0;
   uint8_t constraint_bw = 0;

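   /* For each component offset D, a hit on cmask_i & (cmask_j << D) means
    * the two nodes would overlap if c_j - c_i == D, so that bias is
    * forbidden in i's row (bit 3 + D) and the mirrored bias in j's row
    * (bit 3 - D). The second test handles c_j - c_i == -D symmetrically. */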
   for (unsigned D = 0; D < 4; ++D) {
      if (cmask_i & (cmask_j << D)) {
         constraint_bw |= (1 << (3 + D));
         constraint_fw |= (1 << (3 - D));
      }

      if (cmask_i & (cmask_j >> D)) {
         constraint_fw |= (1 << (3 + D));
         constraint_bw |= (1 << (3 - D));
      }
   }

   l->linear[j * l->node_count + i] |= constraint_fw;
   l->linear[i * l->node_count + j] |= constraint_bw;
}

static bool
lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i)
{
   uint8_t *row = &l->linear[i * l->node_count];
   signed constant = solutions[i];

   for (unsigned j = 0; j < l->node_count; ++j) {
      if (solutions[j] == ~0) continue;

      signed lhs = solutions[j] - constant;

      if (lhs < -3 || lhs > 3)
         continue;

      if (row[j] & (1 << (lhs + 3)))
         return false;
   }

   return true;
}

static bool
lcra_solve(struct lcra_state *l)
{
   for (unsigned step = 0; step < l->node_count; ++step) {
      if (l->solutions[step] != ~0) continue;
      if (l->affinity[step] == 0) continue;

      bool succ = false;

      u_foreach_bit64(r, l->affinity[step]) {
         l->solutions[step] = r;

         if (lcra_test_linear(l, l->solutions, step)) {
            succ = true;
            break;
         }
      }

      /* Out of registers - prepare to spill */
      if (!succ) {
         l->spill_node = step;
         return false;
      }
   }

   return true;
}

/* Register spilling is implemented with a cost-benefit system. Costs are set
 * by the user. Benefits are calculated from the constraints. */
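
/* Here the benefit of spilling a node is simply the number of constraint
 * bits in its row of the linear matrix, counted below: the more constrained
 * a node is, the more spilling it is expected to help. */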

static unsigned
lcra_count_constraints(struct lcra_state *l, unsigned i)
{
   unsigned count = 0;
   uint8_t *constraints = &l->linear[i * l->node_count];

   for (unsigned j = 0; j < l->node_count; ++j)
      count += util_bitcount(constraints[j]);

   return count;
}

/* Construct an affinity mask such that a vector with `count` elements does
 * not intersect any of the registers in the bitset `clobber`. In other words,
 * the allocated base register a needs to satisfy a + i != b for each
 * i < count and each b in `clobber`. Equivalently that's a != b - i, so we
 * need a \ne { b - i : i < count }. For the entire clobber set B, we need
 * a \ne union over b \in B of { b - i : i < count }, and that union is the
 * desired clobber set. It may be written equivalently as the union over
 * i < count of (B - i), where subtraction is defined elementwise and
 * corresponds to a shift of the entire bitset.
 *
 * EVEN_BITS_MASK is an affinity mask for aligned register pairs. Interpreted
 * as a bit set, it is { x : 0 <= x < 64 and x is even }
 */

#define EVEN_BITS_MASK (0x5555555555555555ull)
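
/* As a small worked example: for count = 2 and clobber = { r4 }, the loop
 * below ORs in (clobber >> 0) and (clobber >> 1), marking r3 and r4 as
 * clobbered, since a base register of r3 or r4 would make the pair overlap
 * r4. The remaining masking keeps a vector from running off the end of the
 * 64-register file (or off the end of r0-r15 when the file is split). */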

static uint64_t
bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
{
   uint64_t clobbered = 0;

   for (unsigned i = 0; i < count; ++i)
      clobbered |= (clobber >> i);

   /* Don't allocate past the end of the register file */
   if (count > 1) {
      unsigned excess = count - 1;
      uint64_t mask = BITFIELD_MASK(excess);
      clobbered |= mask << (64 - excess);

      if (split_file)
         clobbered |= mask << (16 - excess);
   }

   /* Don't allocate the middle if we split out the middle */
   if (split_file)
      clobbered |= BITFIELD64_MASK(32) << 16;

   /* We can use a register iff it's not clobbered */
   return ~clobbered;
}

static void
bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live,
                     uint64_t preload_live, unsigned node_count, bool is_blend,
                     bool split_file, bool aligned_sr)
{
   bi_foreach_instr_in_block_rev(block, ins) {
      /* Mark all registers live after the instruction as
       * interfering with the destination */

      bi_foreach_dest(ins, d) {
         unsigned node = bi_get_node(ins->dest[d]);

         if (node >= node_count)
            continue;

         /* Don't allocate to anything that's read later as a
          * preloaded register. The affinity is the intersection
          * of affinity masks for each write. Since writes have
          * offsets, but the affinity is for the whole node, we
          * need to offset the affinity opposite the write
          * offset, so we shift right. */
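         /* E.g. a write at offset 1 that must avoid r5 means the node's base
          * register must avoid r4, which is what the shift achieves. */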
         unsigned count = bi_count_write_registers(ins, d);
         unsigned offset = ins->dest[d].offset;
         uint64_t affinity = bi_make_affinity(preload_live, count, split_file);

         /* Valhall needs >= 64-bit staging writes to be pair-aligned */
         if (aligned_sr && count >= 2)
            affinity &= EVEN_BITS_MASK;

         l->affinity[node] &= (affinity >> offset);

         for (unsigned i = 0; i < node_count; ++i) {
            if (live[i]) {
               lcra_add_node_interference(l, node,
                                          bi_writemask(ins, d), i, live[i]);
            }
         }
      }

      /* Valhall needs >= 64-bit staging reads to be pair-aligned */
      if (aligned_sr && bi_count_read_registers(ins, 0) >= 2) {
         unsigned node = bi_get_node(ins->src[0]);

         if (node < node_count)
            l->affinity[node] &= EVEN_BITS_MASK;
      }

      if (!is_blend && ins->op == BI_OPCODE_BLEND) {
         /* Blend shaders might clobber r0-r15, r48. */
         uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48);

         for (unsigned i = 0; i < node_count; ++i) {
            if (live[i])
               l->affinity[i] &= ~clobber;
         }
      }

      /* Update live_in */
      preload_live = bi_postra_liveness_ins(preload_live, ins);
      bi_liveness_ins_update(live, ins, node_count);
   }

   block->reg_live_in = preload_live;
}

static void
bi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs)
{
   unsigned node_count = bi_max_temp(ctx);

   bi_compute_liveness(ctx);
   bi_postra_liveness(ctx);

   bi_foreach_block_rev(ctx, blk) {
      uint8_t *live = mem_dup(blk->live_out, node_count);

      bi_mark_interference(blk, l, live, blk->reg_live_out,
                           node_count, ctx->inputs->is_blend, !full_regs,
                           ctx->arch >= 9);

      free(live);
   }
}

static struct lcra_state *
bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
{
   unsigned node_count = bi_max_temp(ctx);
   struct lcra_state *l = lcra_alloc_equations(node_count);

   /* Blend shaders are restricted to R0-R15. Other shaders at full
    * occupancy also can access R48-R63. At half occupancy they can access
    * the whole file. */

   uint64_t default_affinity =
      ctx->inputs->is_blend ? BITFIELD64_MASK(16) :
      full_regs ? BITFIELD64_MASK(64) :
      (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));

   bi_foreach_instr_global(ctx, ins) {
      bi_foreach_dest(ins, d) {
         unsigned dest = bi_get_node(ins->dest[d]);

         /* Blend shaders expect the src colour to be in r0-r3 */
         if (ins->op == BI_OPCODE_BLEND &&
             !ctx->inputs->is_blend) {
            unsigned node = bi_get_node(ins->src[0]);
            assert(node < node_count);
            l->solutions[node] = 0;
         }

         if (dest < node_count)
            l->affinity[dest] = default_affinity;
      }
   }

   bi_compute_interference(ctx, l, full_regs);

   *success = lcra_solve(l);

   return l;
}

static bi_index
bi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index)
{
   /* Offsets can only be applied when we register allocated an index, or
    * alternatively for FAU's encoding */

   ASSERTED bool is_offset = (index.offset > 0) &&
                             (index.type != BI_INDEX_FAU);
   unsigned node_count = bi_max_temp(ctx);

   /* Did we run RA for this index at all */
   if (bi_get_node(index) >= node_count) {
      assert(!is_offset);
      return index;
   }

   /* LCRA didn't bother solving this index (how lazy!) */
   signed solution = l->solutions[bi_get_node(index)];
   if (solution < 0) {
      assert(!is_offset);
      return index;
   }

   /* todo: do we want to compose with the subword swizzle? */
   bi_index new_index = bi_register(solution + index.offset);
   new_index.swizzle = index.swizzle;
   new_index.abs = index.abs;
   new_index.neg = index.neg;
   return new_index;
}

static void
bi_install_registers(bi_context *ctx, struct lcra_state *l)
{
   bi_foreach_instr_global(ctx, ins) {
      bi_foreach_dest(ins, d)
         ins->dest[d] = bi_reg_from_index(ctx, l, ins->dest[d]);

      bi_foreach_src(ins, s)
         ins->src[s] = bi_reg_from_index(ctx, l, ins->src[s]);
   }
}

static void
bi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new)
{
   bi_foreach_src(ins, i) {
      if (bi_is_equiv(ins->src[i], old)) {
         ins->src[i].type = new.type;
         ins->src[i].reg = new.reg;
         ins->src[i].value = new.value;
      }
   }
}


/* If register allocation fails, find the best spill node */

static signed
bi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
{
   /* Pick a node satisfying bi_spill_register's preconditions */
   BITSET_WORD *no_spill =
      calloc(sizeof(BITSET_WORD), BITSET_WORDS(l->node_count));

   bi_foreach_instr_global(ctx, ins) {
      bi_foreach_dest(ins, d) {
         unsigned node = bi_get_node(ins->dest[d]);

         if (node < l->node_count && ins->no_spill)
            BITSET_SET(no_spill, node);
      }
   }

   unsigned best_benefit = 0;
   signed best_node = -1;

   for (unsigned i = 0; i < l->node_count; ++i) {
      if (BITSET_TEST(no_spill, i)) continue;

      /* Only spill nodes that interfere with the node failing
       * register allocation. It's pointless to spill anything else */
      if (!l->linear[(l->spill_node * l->node_count) + i]) continue;

      unsigned benefit = lcra_count_constraints(l, i);

      if (benefit > best_benefit) {
         best_benefit = benefit;
         best_node = i;
      }
   }

   free(no_spill);
   return best_node;
}

static unsigned
bi_count_read_index(bi_instr *I, bi_index index)
{
   unsigned max = 0;

   bi_foreach_src(I, s) {
      if (bi_is_equiv(I->src[s], index)) {
         unsigned count = bi_count_read_registers(I, s);
         max = MAX2(max, count + I->src[s].offset);
      }
   }

   return max;
}

/* Once we've chosen a spill node, spill it and return the number of bytes
 * spilled */

static unsigned
bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)
{
   bi_builder b = { .shader = ctx };
   unsigned channels = 0;

   /* Spill (store to TLS) after every write, fill (load from TLS) before
    * every read */
   bi_foreach_instr_global_safe(ctx, I) {
      bi_foreach_dest(I, d) {
         if (!bi_is_equiv(I->dest[d], index)) continue;

         unsigned extra = I->dest[d].offset;
         bi_index tmp = bi_temp(ctx);

         I->dest[d] = bi_replace_index(I->dest[d], tmp);
         I->no_spill = true;

         unsigned count = bi_count_write_registers(I, d);
         unsigned bits = count * 32;

         b.cursor = bi_after_instr(I);
         bi_index loc = bi_imm_u32(offset + 4 * extra);
         bi_store(&b, bits, tmp, loc, bi_zero(), BI_SEG_TL);

         ctx->spills++;
         channels = MAX2(channels, extra + count);
      }

      if (bi_has_arg(I, index)) {
         b.cursor = bi_before_instr(I);
         bi_index tmp = bi_temp(ctx);

         unsigned bits = bi_count_read_index(I, index) * 32;
         bi_rewrite_index_src_single(I, index, tmp);

         bi_instr *ld = bi_load_to(&b, bits, tmp,
                                   bi_imm_u32(offset), bi_zero(), BI_SEG_TL);
         ld->no_spill = true;
         ctx->fills++;
      }
   }

   return (channels * 4);
}

void
bi_register_allocate(bi_context *ctx)
{
   struct lcra_state *l = NULL;
   bool success = false;

   unsigned iter_count = 1000; /* max iterations */

   /* Number of bytes of memory we've spilled into */
   unsigned spill_count = ctx->info->tls_size;

   /* Try with reduced register pressure to improve thread count on v7 */
   if (ctx->arch == 7) {
      bi_invalidate_liveness(ctx);
      l = bi_allocate_registers(ctx, &success, false);

      if (success) {
         ctx->info->work_reg_count = 32;
      } else {
         lcra_free(l);
         l = NULL;
      }
   }

   /* Otherwise, use the register file and spill until we succeed */
   while (!success && ((iter_count--) > 0)) {
      bi_invalidate_liveness(ctx);
      l = bi_allocate_registers(ctx, &success, true);

      if (success) {
         ctx->info->work_reg_count = 64;
      } else {
         signed spill_node = bi_choose_spill_node(ctx, l);
         lcra_free(l);
         l = NULL;

         if (spill_node == -1)
            unreachable("Failed to choose spill node\n");

         spill_count += bi_spill_register(ctx,
            bi_node_to_index(spill_node, bi_max_temp(ctx)), spill_count);
      }
   }

   assert(success);
   assert(l != NULL);

   ctx->info->tls_size = spill_count;
   bi_install_registers(ctx, l);

   lcra_free(l);
}