1 /*
2 * Copyright © 2021 Valve Corporation
3 * Copyright © 2014 Rob Clark <robclark@freedesktop.org>
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "ir3_ra.h"
8 #include "ir3_shader.h"
9
10 #include "util/u_math.h"
11
/* Allocating shared registers can pose a challenge, because their live
 * intervals use the physical CFG which has extra edges inserted that are
 * pretty much always critical edges. This causes problems with phi nodes,
 * because copies for phi nodes have to happen "along the edge," and similarly
 * causes problems when reunifying values that have had their live range split.
 * Problematic phi nodes should be relatively rare, so we ban them for now.
 * The solution we choose for live-range splitting is to integrate spilling and
 * register allocation and spill to vector registers rather than split a live
 * range, which negates some of the advantages of SSA-based RA, but it isn't as
 * bad as it seems because the conditions needed (vector shared registers, which
 * only movmsk currently produces, or fixed registers which we don't do) are
 * relatively rare. Spilling is also much cheaper than spilling vector registers
 * to private memory.
 */
26
27 struct ra_interval {
28 struct ir3_reg_interval interval;
29
30 struct rb_node physreg_node;
31 physreg_t physreg_start, physreg_end;
32
33 /* If this interval was spilled, the original physreg_start before spilling.
34 * Used when reloading live outs.
35 */
36 physreg_t physreg_start_orig;
37
38 /* Where the shared register is spilled to. If there were no uses when it's
39 * spilled it could be the original defining instruction.
40 */
41 struct ir3_register *spill_def;
42
43 /* Whether this contains a source of the current instruction that can't be
44 * spilled.
45 */
46 bool src;
47
48 bool needs_reload;
49 };
50
51 struct ra_block_state {
52 bool visited;
53
54 /* For blocks whose successors are visited first (i.e. loop backedges), which
55 * values should be live at the end.
56 */
57 BITSET_WORD *live_out;
58 };
59
60 struct ra_ctx {
61 struct ir3_reg_ctx reg_ctx;
62
63 BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
64
65 struct rb_tree physreg_intervals;
66
67 struct ra_interval *intervals;
68
69 struct ir3_liveness *live;
70
71 struct hash_table *pcopy_src_map;
72
73 struct ra_block_state *blocks;
74
75 unsigned start;
76 };
77
78 static struct ra_interval *
ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
80 {
81 return rb_node_data(struct ra_interval, interval, interval);
82 }
83
84 static struct ra_interval *
rb_node_to_interval(struct rb_node *node)
86 {
87 return rb_node_data(struct ra_interval, node, physreg_node);
88 }
89
90 static const struct ra_interval *
rb_node_to_interval_const(const struct rb_node *node)
92 {
93 return rb_node_data(struct ra_interval, node, physreg_node);
94 }
95
96 static struct ra_interval *
ra_interval_next(struct ra_interval *interval)
98 {
99 struct rb_node *next = rb_node_next(&interval->physreg_node);
100 return next ? rb_node_to_interval(next) : NULL;
101 }
102
103 static struct ra_interval *
ra_interval_next_or_null(struct ra_interval *interval)
105 {
106 return interval ? ra_interval_next(interval) : NULL;
107 }
108
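/* Ordering callback for rb_tree_insert(): the physreg_intervals tree is kept
 * sorted by physreg_start, so walking it with ra_interval_next() moves toward
 * higher physregs in the register file.
 */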
109 static int
ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
111 {
112 const struct ra_interval *a = rb_node_to_interval_const(_a);
113 const struct ra_interval *b = rb_node_to_interval_const(_b);
114 return b->physreg_start - a->physreg_start;
115 }
116
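/* Search callback used by ra_interval_search_sloppy(): returns 0 (a match)
 * when reg falls inside [physreg_start, physreg_end).
 */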
117 static int
ra_interval_cmp(const struct rb_node *node, const void *data)
119 {
120 physreg_t reg = *(const physreg_t *)data;
121 const struct ra_interval *interval = rb_node_to_interval_const(node);
122 if (interval->physreg_start > reg)
123 return -1;
124 else if (interval->physreg_end <= reg)
125 return 1;
126 else
127 return 0;
128 }
129
130 static struct ra_ctx *
ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx)
132 {
133 return rb_node_data(struct ra_ctx, ctx, reg_ctx);
134 }
135
136 static struct ra_interval *
ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
138 {
   struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
140 return node ? rb_node_to_interval(node) : NULL;
141 }
142
143 /* Get the interval covering the reg, or the closest to the right if it
144 * doesn't exist.
145 */
146 static struct ra_interval *
ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
148 {
149 struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
150 if (!interval) {
151 return NULL;
152 } else if (interval->physreg_end > reg) {
153 return interval;
154 } else {
      /* There is no interval covering reg, and ra_interval_search_sloppy()
       * returned the closest range to the left, so the next interval to the
       * right should be the closest to the right.
       */
159 return ra_interval_next_or_null(interval);
160 }
161 }
162
163 static struct ra_interval *
ra_ctx_search_right(struct ra_ctx *ctx, physreg_t reg)
165 {
166 return ra_interval_search_right(&ctx->physreg_intervals, reg);
167 }
168
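/* ir3_reg_ctx callbacks: keep the available bitset and the physreg-ordered
 * interval tree in sync as intervals are inserted and removed.
 */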
169 static void
interval_add(struct ir3_reg_ctx *reg_ctx, struct ir3_reg_interval *_interval)
171 {
172 struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
173 struct ra_ctx *ctx = ir3_reg_ctx_to_ctx(reg_ctx);
174
175 /* We can assume in this case that physreg_start/physreg_end is already
176 * initialized.
177 */
178 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
179 BITSET_CLEAR(ctx->available, i);
180 }
181
182 rb_tree_insert(&ctx->physreg_intervals, &interval->physreg_node,
183 ra_interval_insert_cmp);
184 }
185
186 static void
interval_delete(struct ir3_reg_ctx *reg_ctx, struct ir3_reg_interval *_interval)
188 {
189 struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
190 struct ra_ctx *ctx = ir3_reg_ctx_to_ctx(reg_ctx);
191
192 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
193 BITSET_SET(ctx->available, i);
194 }
195
196 rb_tree_remove(&ctx->physreg_intervals, &interval->physreg_node);
197 }
198
199 static void
interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
               struct ir3_reg_interval *_child)
202 {
203 struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
204 struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);
205
206 child->physreg_start =
207 parent->physreg_start + (child->interval.reg->interval_start -
208 parent->interval.reg->interval_start);
209 child->physreg_end =
210 child->physreg_start +
211 (child->interval.reg->interval_end - child->interval.reg->interval_start);
212
213 interval_add(ctx, _child);
214 }
215
216 static void
ra_ctx_init(struct ra_ctx *ctx)
218 {
219 ctx->reg_ctx.interval_add = interval_add;
220 ctx->reg_ctx.interval_delete = interval_delete;
221 ctx->reg_ctx.interval_readd = interval_readd;
222 }
223
224 static void
ra_ctx_reset_block(struct ra_ctx *ctx)
226 {
227 for (unsigned i = 0; i < RA_SHARED_SIZE; i++) {
228 BITSET_SET(ctx->available, i);
229 }
230
231 rb_tree_init(&ctx->reg_ctx.intervals);
232 rb_tree_init(&ctx->physreg_intervals);
233 }
234
235 static void
ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
237 {
238 ir3_reg_interval_init(&interval->interval, reg);
239 }
240
241 static physreg_t
ra_interval_get_physreg(const struct ra_interval *interval)
243 {
244 unsigned child_start = interval->interval.reg->interval_start;
245
246 while (interval->interval.parent) {
247 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
248 }
249
250 return interval->physreg_start +
251 (child_start - interval->interval.reg->interval_start);
252 }
253
254 static unsigned
ra_interval_get_num(const struct ra_interval *interval)
256 {
257 return ra_physreg_to_num(ra_interval_get_physreg(interval),
258 interval->interval.reg->flags);
259 }
260
261 static void
ra_interval_dump(struct log_stream *stream, struct ra_interval *interval)
263 {
264 mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start);
265
266 ir3_reg_interval_dump(stream, &interval->interval);
267 }
268
269 static void
ra_ctx_dump(struct ra_ctx *ctx)
271 {
272 struct log_stream *stream = mesa_log_streami();
273
274 mesa_log_stream_printf(stream, "shared:\n");
275 rb_tree_foreach (struct ra_interval, interval, &ctx->physreg_intervals,
276 physreg_node) {
277 ra_interval_dump(stream, interval);
278 }
279
280 unsigned start, end;
281 mesa_log_stream_printf(stream, "available:\n");
282 BITSET_FOREACH_RANGE (start, end, ctx->available, RA_SHARED_SIZE) {
283 mesa_log_stream_printf(stream, "%u-%u ", start, end);
284 }
285 mesa_log_stream_printf(stream, "\n");
286 mesa_log_stream_printf(stream, "start: %u\n", ctx->start);
287 }
288
289 static bool
get_reg_specified(struct ra_ctx *ctx, struct ir3_register *reg, physreg_t physreg)
291 {
292 for (unsigned i = 0; i < reg_size(reg); i++) {
293 physreg_t cur_physreg = physreg + i;
294
295 if (!BITSET_TEST(ctx->available, cur_physreg)) {
296 /* If physreg is unavailable, we might still be able to use it if the
297 * value it holds is in the same merge set at the same offset as the
298 * value in reg.
299 */
300 if (!reg->merge_set) {
301 return false;
302 }
303
304 /* Find the interval for the current element of physreg. */
305 struct ra_interval *interval = ra_ctx_search_right(ctx, cur_physreg);
306
307 /* It must exist since the physreg is unavailable. */
308 assert(interval->physreg_start <= cur_physreg);
309
310 struct ir3_register *live_reg = interval->interval.reg;
311
312 if (reg->merge_set != live_reg->merge_set) {
313 return false;
314 }
315
316 /* We want to check if the currently live value at physreg+i (live_reg)
317 * is at the same offset as the new value (reg+i) in their shared merge
318 * set. However, we cannot simply compare their merge set offsets as
319 * live_reg may be larger than reg+i (e.g., a collect) and reg+i may
320 * not be at the start of live_reg's interval. To account for this, we
321 * want to know the merge set offset delta between live_reg's value at
322 * physreg+i and the start of its interval. This always equals the
323 * delta between the physregs within intervals.
324 */
325 unsigned cur_merge_set_offset = reg->merge_set_offset + i;
326 unsigned interval_offset = cur_physreg - interval->physreg_start;
327 if (cur_merge_set_offset !=
328 live_reg->merge_set_offset + interval_offset) {
329 return false;
330 }
331 }
332 }
333
334 return true;
335 }
336
337 static unsigned
reg_file_size(struct ir3_register *reg)
339 {
340 if (reg->flags & IR3_REG_HALF)
341 return RA_SHARED_HALF_SIZE;
342 else
343 return RA_SHARED_SIZE;
344 }
345
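/* Find a free, aligned gap of the given size for dst. The search starts at
 * ctx->start and wraps around, and ctx->start is advanced after a successful
 * allocation, giving a simple round-robin allocation scheme. Returns ~0 if no
 * gap is free.
 */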
346 static physreg_t
find_best_gap(struct ra_ctx *ctx, struct ir3_register *dst, unsigned size,
              unsigned align)
349 {
350 unsigned file_size = reg_file_size(dst);
351
352 /* This can happen if we create a very large merge set. Just bail out in that
353 * case.
354 */
355 if (size > file_size)
356 return (physreg_t) ~0;
357
358 unsigned start = ALIGN(ctx->start, align) % (file_size - size + align);
359 unsigned candidate = start;
360 do {
361 bool is_available = true;
362 for (unsigned i = 0; i < size; i++) {
363 if (!BITSET_TEST(ctx->available, candidate + i)) {
364 is_available = false;
365 break;
366 }
367 }
368
369 if (is_available) {
370 ctx->start = (candidate + size) % file_size;
371 return candidate;
372 }
373
374 candidate += align;
375 if (candidate + size > file_size)
376 candidate = 0;
377 } while (candidate != start);
378
379 return (physreg_t)~0;
380 }
381
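/* Like find_best_gap(), but instead of requiring the range to be free, pick
 * the start reg whose overlapping intervals are cheapest to spill: intervals
 * that already have a spill_def are free to evict, and sources of the current
 * instruction can never be spilled.
 */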
382 static physreg_t
find_best_spill_reg(struct ra_ctx *ctx, struct ir3_register *reg,
                    unsigned size, unsigned align)
385 {
386 unsigned file_size = reg_file_size(reg);
387 unsigned min_cost = UINT_MAX;
388
389 unsigned start = ALIGN(ctx->start, align) % (file_size - size + align);
390 physreg_t candidate = start;
391 physreg_t best_reg = (physreg_t)~0;
392 do {
393 unsigned cost = 0;
394
395 /* Iterate through intervals we'd need to spill to use this reg. */
396 for (struct ra_interval *interval = ra_ctx_search_right(ctx, candidate);
397 interval && interval->physreg_start < candidate + size;
398 interval = ra_interval_next_or_null(interval)) {
399 /* We can't spill sources of the current instruction when reloading
400 * sources.
401 */
402 if (interval->src) {
403 cost = UINT_MAX;
404 break;
405 }
406
407 /* We prefer spilling intervals that already have been spilled, so we
408 * don't have to emit another mov.
409 */
410 if (!interval->spill_def)
411 cost += (interval->physreg_end - interval->physreg_start);
412 }
413
414 if (cost < min_cost) {
415 min_cost = cost;
416 best_reg = candidate;
417 }
418
419 candidate += align;
420 if (candidate + size > file_size)
421 candidate = 0;
422 } while (candidate != start);
423
424 return best_reg;
425 }
426
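/* Helpers used when spilling only part of an interval: split() extracts a
 * single element of a def and extract() rebuilds a contiguous sub-vector via
 * a collect, so that child intervals can get their own (non-shared) spill
 * defs.
 */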
427 static struct ir3_register *
split(struct ir3_register *def, unsigned offset, struct ir3_instruction *before)
429 {
430 if (reg_elems(def) == 1) {
431 assert(offset == 0);
432 return def;
433 }
434
435 struct ir3_instruction *split =
436 ir3_instr_create_at(ir3_after_instr(before), OPC_META_SPLIT, 1, 1);
437 split->split.off = offset;
438 struct ir3_register *dst = __ssa_dst(split);
439 struct ir3_register *src =
440 ir3_src_create(split, INVALID_REG, def->flags & (IR3_REG_HALF | IR3_REG_SSA));
441 src->wrmask = def->wrmask;
442 src->def = def;
443 return dst;
444 }
445
446 static struct ir3_register *
extract(struct ir3_register *parent_def, unsigned offset, unsigned elems,
        struct ir3_instruction *before)
449 {
450 if (offset == 0 && elems == reg_elems(parent_def))
451 return parent_def;
452
453 if (elems == 1)
454 return split(parent_def, offset, before);
455
456 struct ir3_instruction *collect =
457 ir3_instr_create_at(ir3_after_instr(before), OPC_META_COLLECT, 1, elems);
458 struct ir3_register *dst = __ssa_dst(collect);
459 dst->flags |= parent_def->flags & IR3_REG_HALF;
460 dst->wrmask = MASK(elems);
461
462 for (unsigned i = 0; i < elems; i++) {
463 ir3_src_create(collect, INVALID_REG,
464 parent_def->flags & (IR3_REG_HALF | IR3_REG_SSA))->def =
465 split(parent_def, offset + i, before);
466 }
467
468 return dst;
469 }
470
471 static void
spill_interval_children(struct ra_interval *interval,
                        struct ir3_instruction *before)
474 {
475 rb_tree_foreach (struct ra_interval, child, &interval->interval.children,
476 interval.node) {
477 if (!child->spill_def) {
478 child->spill_def = extract(interval->spill_def,
479 (child->interval.reg->interval_start -
480 interval->interval.reg->interval_start) /
481 reg_elem_size(interval->interval.reg),
482 reg_elems(child->interval.reg), before);
         child->physreg_start_orig = child->physreg_start;
484 }
485 spill_interval_children(child, before);
486 }
487 }
488
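/* Spill an interval: unless it already has a spill_def, copy the shared value
 * into a fresh non-shared SSA value right after its definition (or after the
 * last phi/input for phi and input defs), give its children spill defs too,
 * and remove the whole interval tree from the register file.
 */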
489 static void
spill_interval(struct ra_ctx *ctx, struct ra_interval *interval)
491 {
492 struct ir3_instruction *before = interval->interval.reg->instr;
493
494 d("spilling ssa_%u:%u", before->serialno, interval->interval.reg->name);
495
496 if (!interval->spill_def) {
497 /* If this is a phi node or input, we need to insert the demotion to a
498 * regular register after the last phi or input in the block.
499 */
500 if (before->opc == OPC_META_PHI ||
501 before->opc == OPC_META_INPUT) {
502 struct ir3_block *block = before->block;
503 struct ir3_instruction *last_phi_input = NULL;
504 foreach_instr_from (instr, before, &block->instr_list) {
505 if (instr->opc != before->opc)
506 break;
507 last_phi_input = instr;
508 }
509 before = last_phi_input;
510 }
511
512 struct ir3_instruction *mov =
513 ir3_instr_create_at(ir3_after_instr(before), OPC_MOV, 1, 1);
514 mov->flags |= IR3_INSTR_SHARED_SPILL;
515 struct ir3_register *dst = __ssa_dst(mov);
516 dst->flags |= (interval->interval.reg->flags & IR3_REG_HALF);
517 dst->wrmask = interval->interval.reg->wrmask;
518 mov->repeat = reg_elems(dst) - 1;
519 ir3_src_create(mov, interval->interval.reg->num,
520 IR3_REG_SHARED | (mov->repeat ? IR3_REG_R : 0) |
521 (interval->interval.reg->flags & IR3_REG_HALF))->wrmask =
522 interval->interval.reg->wrmask;
523 mov->cat1.src_type = mov->cat1.dst_type =
524 (interval->interval.reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
525
526 interval->spill_def = dst;
527 interval->physreg_start_orig = interval->physreg_start;
528 }
529
530 spill_interval_children(interval, interval->spill_def->instr);
531
532 ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
533 }
534
/* Try to demote a scalar ALU instruction to a normal ALU instruction, using
 * the spilled sources. We have to take into account the restrictions on
 * shared sources that only apply to normal ALU instructions.
 */
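/* The checks below encode the demotion rules per category: cat1 movs need a
 * const/immed source (and movmsk can never be demoted), cat2 with two sources
 * needs at least one that is an immediate or already spilled, cat3 cannot
 * take a shared src1 (although for mad we may swap src0 and src1), and cat4
 * requires its source to have a spill def.
 */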
539 static bool
try_demote_instruction(struct ra_ctx *ctx, struct ir3_instruction *instr)
541 {
542 /* First, check restrictions. */
543 switch (opc_cat(instr->opc)) {
544 case 1:
545 /* MOVMSK is special and can't be demoted. It also has no sources so must
546 * go before the check below.
547 */
548 if (instr->opc == OPC_MOVMSK)
549 return false;
550
551 assert(instr->srcs_count >= 1);
552 if (!(instr->srcs[0]->flags & (IR3_REG_CONST | IR3_REG_IMMED)))
553 return false;
554 break;
555 case 2: {
556 /* We need one source to either be demotable or an immediate. */
557 if (instr->srcs_count > 1) {
         struct ra_interval *src0_interval =
            (instr->srcs[0]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[0]->def->name] : NULL;
         struct ra_interval *src1_interval =
            (instr->srcs[1]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[1]->def->name] : NULL;
562 if (!(src0_interval && src0_interval->spill_def) &&
563 !(src1_interval && src1_interval->spill_def) &&
564 !(instr->srcs[0]->flags & IR3_REG_IMMED) &&
565 !(instr->srcs[1]->flags & IR3_REG_IMMED))
566 return false;
567 }
568 break;
569 }
570 case 3: {
571 struct ra_interval *src0_interval =
572 (instr->srcs[0]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[0]->def->name] : NULL;
573 struct ra_interval *src1_interval =
574 (instr->srcs[1]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[1]->def->name] : NULL;
575
576 /* src1 cannot be shared */
577 if (src1_interval && !src1_interval->spill_def) {
578 /* Try to swap src0 and src1, similar to what copy prop does. */
579 if (!is_mad(instr->opc))
580 return false;
581
582 if ((src0_interval && src0_interval->spill_def) ||
583 (instr->srcs[0]->flags & IR3_REG_IMMED)) {
584 struct ir3_register *src0 = instr->srcs[0];
585 instr->srcs[0] = instr->srcs[1];
586 instr->srcs[1] = src0;
587 } else {
588 return false;
589 }
590 }
591 break;
592 }
593 case 4: {
594 assert(instr->srcs[0]->flags & IR3_REG_SSA);
595 struct ra_interval *src_interval = &ctx->intervals[instr->srcs[0]->def->name];
596 if (!src_interval->spill_def)
597 return false;
598 break;
599 }
600
601 default:
602 return false;
603 }
604
605 d("demoting instruction");
606
607 /* If the instruction is already not a scalar ALU instruction, we should've
608 * skipped reloading and just demoted sources directly, so we should never
609 * get here.
610 */
611 assert(instr->dsts[0]->flags & IR3_REG_SHARED);
612
613 /* Now we actually demote the instruction */
614 ra_foreach_src (src, instr) {
615 assert(src->flags & IR3_REG_SHARED);
616 struct ra_interval *interval = &ctx->intervals[src->def->name];
617 if (interval->spill_def) {
618 src->def = interval->spill_def;
619 src->flags &= ~IR3_REG_SHARED;
620 interval->needs_reload = false;
621 if (interval->interval.inserted)
622 ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
623 while (interval->interval.parent)
624 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
625 interval->src = false;
626 }
627 }
628
629 struct ra_interval *dst_interval = &ctx->intervals[instr->dsts[0]->name];
630 instr->dsts[0]->flags &= ~IR3_REG_SHARED;
631 ra_interval_init(dst_interval, instr->dsts[0]);
632 dst_interval->spill_def = instr->dsts[0];
633
634 instr->flags |= IR3_INSTR_SHARED_SPILL;
635
636 return true;
637 }
638
639 /* Free up [start, start + size) by spilling live intervals.
640 */
641 static void
free_space(struct ra_ctx *ctx, physreg_t start, unsigned size)
643 {
644 struct ra_interval *interval = ra_ctx_search_right(ctx, start);
645 while (interval && interval->physreg_start < start + size) {
646 struct ra_interval *next = ra_interval_next_or_null(interval);
647 spill_interval(ctx, interval);
648 interval = next;
649 }
650 }
651
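/* Pick a physreg for a shared def (or a source being reloaded). We try, in
 * order: the merge set's preferred reg, a gap large enough for the whole
 * merge set if it hasn't been placed yet, reusing one of the instruction's
 * shared source registers for ALU/SFU destinations, and finally any free gap.
 * Returns ~0 if nothing fits without spilling.
 */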
652 static physreg_t
get_reg(struct ra_ctx *ctx, struct ir3_register *reg, bool src)
654 {
655 if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
656 physreg_t preferred_reg =
657 reg->merge_set->preferred_reg + reg->merge_set_offset;
658 if (preferred_reg < reg_file_size(reg) &&
659 preferred_reg % reg_elem_size(reg) == 0 &&
660 get_reg_specified(ctx, reg, preferred_reg))
661 return preferred_reg;
662 }
663
664 /* If this register is a subset of a merge set which we have not picked a
665 * register for, first try to allocate enough space for the entire merge
666 * set.
667 */
668 unsigned size = reg_size(reg);
669 if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
670 size < reg->merge_set->size) {
671 physreg_t best_reg = find_best_gap(ctx, reg, reg->merge_set->size,
672 reg->merge_set->alignment);
673 if (best_reg != (physreg_t)~0u) {
674 best_reg += reg->merge_set_offset;
675 return best_reg;
676 }
677 }
678
   /* For ALU and SFU instructions, if a source's register is available, reuse
    * it for the destination: this doesn't introduce unnecessary dependencies,
    * and it potentially avoids needing (ss) syncs for write-after-read hazards
    * on SFU instructions.
    */
684 if (!src && (is_sfu(reg->instr) || is_alu(reg->instr))) {
685 for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
686 struct ir3_register *src = reg->instr->srcs[i];
687 if (!ra_reg_is_src(src))
688 continue;
689 if ((src->flags & IR3_REG_SHARED) && reg_size(src) >= size) {
690 struct ra_interval *src_interval = &ctx->intervals[src->def->name];
691 physreg_t src_physreg = ra_interval_get_physreg(src_interval);
692 if (src_physreg % reg_elem_size(reg) == 0 &&
693 src_physreg + size <= reg_file_size(reg) &&
694 get_reg_specified(ctx, reg, src_physreg))
695 return src_physreg;
696 }
697 }
698 }
699
700 return find_best_gap(ctx, reg, size, reg_elem_size(reg));
701 }
702
/* The reload process is split in two: first we allocate a register to reload
 * to for every source that needs a reload, and then we actually execute the
 * reloads. This allows us to demote shared ALU instructions to non-shared
 * whenever we would otherwise need to spill in order to reload, without
 * leaving dangling unused reload movs from previously processed sources. For
 * example, we could need to reload both sources of an add, but after
 * reloading the first source we realize that we would need to spill to reload
 * the second source; we should demote the add instead, which means cancelling
 * the first reload.
 */
712 static void
reload_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
           struct ir3_register *src)
715 {
716 struct ir3_register *reg = src->def;
717 struct ra_interval *interval = &ctx->intervals[reg->name];
718 unsigned size = reg_size(reg);
719
720 physreg_t best_reg = get_reg(ctx, reg, true);
721
722 if (best_reg == (physreg_t)~0u) {
723 if (try_demote_instruction(ctx, instr))
724 return;
725
726 best_reg = find_best_spill_reg(ctx, reg, size, reg_elem_size(reg));
727 assert(best_reg != (physreg_t)~0u);
728
729 free_space(ctx, best_reg, size);
730 }
731
732 d("reload src %u physreg %u", reg->name, best_reg);
733 interval->physreg_start = best_reg;
734 interval->physreg_end = best_reg + size;
735 interval->needs_reload = true;
736 ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
737
738 while (interval->interval.parent)
739 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
740
741 interval->src = true;
742 }
743
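/* Emit the mov that copies an interval's spill_def back into its assigned
 * shared registers. Used both for reloading instruction sources and for
 * restoring live-outs at the end of a loop backedge.
 */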
744 static void
reload_interval(struct ra_ctx *ctx, struct ir3_cursor cursor,
                struct ra_interval *interval)
747 {
748 struct ir3_register *def = interval->interval.reg;
749 struct ir3_instruction *mov = ir3_instr_create_at(cursor, OPC_MOV, 1, 1);
750 mov->flags |= IR3_INSTR_SHARED_SPILL;
751 unsigned flags = IR3_REG_SHARED | (def->flags & IR3_REG_HALF);
752 ir3_dst_create(mov, ra_physreg_to_num(interval->physreg_start, flags),
753 flags)->wrmask = def->wrmask;
754 mov->repeat = reg_elems(def) - 1;
755 struct ir3_register *mov_src =
756 ir3_src_create(mov, INVALID_REG, IR3_REG_SSA | (def->flags & IR3_REG_HALF) |
757 (mov->repeat ? IR3_REG_R : 0));
758 assert(interval->spill_def);
759 mov_src->def = interval->spill_def;
760 mov_src->wrmask = def->wrmask;
761 mov->cat1.src_type = mov->cat1.dst_type =
762 (def->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
763 }
764
765 static void
reload_src_finalize(struct ra_ctx *ctx, struct ir3_instruction *instr,
                    struct ir3_register *src)
768 {
769 struct ir3_register *reg = src->def;
770 struct ra_interval *interval = &ctx->intervals[reg->name];
771
772 if (!interval->needs_reload)
773 return;
774
775 reload_interval(ctx, ir3_before_instr(instr), interval);
776
777 interval->needs_reload = false;
778 }
779
780 static bool
can_demote_src(struct ir3_instruction *instr)
782 {
783 switch (instr->opc) {
784 case OPC_SCAN_MACRO:
785 case OPC_META_COLLECT:
786 return false;
787 case OPC_MOV:
788 /* non-shared -> shared floating-point conversions and
789 * 8-bit sign extension don't work.
790 */
791 return (!(instr->dsts[0]->flags & IR3_REG_SHARED) ||
792 !((full_type(instr->cat1.src_type) == TYPE_F32 ||
793 full_type(instr->cat1.dst_type) == TYPE_F32) ||
794 (instr->cat1.src_type == TYPE_U8 &&
795 full_type(instr->cat1.dst_type) == TYPE_S32)));
796 default:
797 return (!is_alu(instr) && !is_sfu(instr)) ||
798 !(instr->dsts[0]->flags & IR3_REG_SHARED);
799 }
800 }
801
802 /* Ensure that this source is never spilled while reloading other sources.
803 */
804 static void
mark_src(struct ra_ctx *ctx, struct ir3_register *src)
806 {
807 if (!(src->flags & IR3_REG_SHARED))
808 return;
809
810 struct ra_interval *interval = &ctx->intervals[src->def->name];
811
812 if (interval->interval.inserted) {
813 while (interval->interval.parent)
814 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
815
816 interval->src = true;
817 }
818 }
819
820 static void
ensure_src_live(struct ra_ctx *ctx, struct ir3_instruction *instr,
                struct ir3_register *src)
823 {
824 if (!(src->flags & IR3_REG_SHARED))
825 return;
826
827 struct ra_interval *interval = &ctx->intervals[src->def->name];
828
829 if (!interval->interval.inserted) {
      /* In some cases we cannot demote a shared reg source to a non-shared
       * reg; then we have to reload it.
       */
833 assert(interval->spill_def);
834 if (!can_demote_src(instr)) {
835 reload_src(ctx, instr, src);
836 } else {
837 if (instr->opc == OPC_META_PARALLEL_COPY) {
838 /* Stash away the original def to use later in case we actually have
839 * to insert a reload.
840 */
841 _mesa_hash_table_insert(ctx->pcopy_src_map, src, src->def);
842 }
843 src->def = interval->spill_def;
844 src->flags &= ~IR3_REG_SHARED;
845 }
846 }
847 }
848
849 static void
assign_src(struct ra_ctx *ctx, struct ir3_register *src)
851 {
852 if (!(src->flags & IR3_REG_SHARED))
853 return;
854
855 struct ra_interval *interval = &ctx->intervals[src->def->name];
856 assert(interval->interval.inserted);
857 src->num = ra_physreg_to_num(ra_interval_get_physreg(interval), src->flags);
858
859 if ((src->flags & IR3_REG_FIRST_KILL) &&
860 !interval->interval.parent &&
861 rb_tree_is_empty(&interval->interval.children))
862 ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
863
864 while (interval->interval.parent)
865 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
866
867 interval->src = false;
868 }
869
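/* Allocate a destination. A tied destination reuses its (killed) source's
 * registers when possible and otherwise gets a parallel copy inserted before
 * the instruction; if no register can be found, we either demote the whole
 * instruction or spill enough intervals to make room.
 */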
870 static void
handle_dst(struct ra_ctx *ctx, struct ir3_instruction *instr,
           struct ir3_register *dst)
873 {
874 if (!(dst->flags & IR3_REG_SHARED))
875 return;
876
877 struct ra_interval *interval = &ctx->intervals[dst->name];
878 ra_interval_init(interval, dst);
879 interval->spill_def = NULL;
880
881 if (dst->tied) {
882 struct ir3_register *tied_def = dst->tied->def;
883 struct ra_interval *tied_interval = &ctx->intervals[tied_def->name];
884 if ((dst->tied->flags & IR3_REG_KILL) &&
885 !tied_interval->interval.parent &&
886 rb_tree_is_empty(&tied_interval->interval.children)) {
887 dst->num = dst->tied->num;
888 interval->physreg_start = tied_interval->physreg_start;
889 interval->physreg_end = tied_interval->physreg_end;
890 ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
891 return;
892 }
893 }
894
895 physreg_t physreg = get_reg(ctx, dst, false);
896 if (physreg == (physreg_t) ~0u) {
897 if (try_demote_instruction(ctx, instr))
898 return;
899
900 unsigned size = reg_size(dst);
901 physreg = find_best_spill_reg(ctx, dst, size, reg_elem_size(dst));
902 assert(physreg != (physreg_t)~0u);
903 free_space(ctx, physreg, size);
904 }
905
906 dst->num = ra_physreg_to_num(physreg, dst->flags);
907
908 /* Non-trivial collects (i.e., ones that will introduce moves because the
    * sources don't line up with the destination) may cause source intervals to
910 * get implicitly moved when they are inserted as children of the destination
911 * interval. Since we don't support moving intervals in shared RA, this may
912 * cause illegal register allocations. Prevent this by making sure
913 * non-trivial collects will not share a merge set with (and will not be a
914 * parent interval of) their components. Detect this by checking if a dst got
915 * a register assignment that does not correspond with the existing preferred
916 * reg of its merge set, as this might cause a future collect covering its
917 * interval to become non-trivial.
918 */
919 if (dst->merge_set && dst->merge_set->preferred_reg != (physreg_t)~0 &&
920 physreg != dst->merge_set->preferred_reg + dst->merge_set_offset) {
921 dst->merge_set = NULL;
922 dst->interval_start = ctx->live->interval_offset;
923 dst->interval_end = dst->interval_start + reg_size(dst);
924 ctx->live->interval_offset = dst->interval_end;
925 }
926
927 ra_update_affinity(reg_file_size(dst), dst, physreg);
928 interval->physreg_start = physreg;
929 interval->physreg_end = physreg + reg_size(dst);
930 ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
931 d("insert dst %u physreg %u", dst->name, physreg);
932
933 if (dst->tied) {
934 struct ir3_instruction *mov = ir3_instr_create_at(
935 ir3_before_instr(instr), OPC_META_PARALLEL_COPY, 1, 1);
936 unsigned flags = IR3_REG_SHARED | (dst->flags & IR3_REG_HALF);
937 ir3_dst_create(mov, dst->num, flags)->wrmask = dst->wrmask;
938 ir3_src_create(mov, dst->tied->num, flags)->wrmask = dst->wrmask;
939 mov->cat1.src_type = mov->cat1.dst_type =
         (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
941 dst->tied->num = dst->num;
942 }
943 }
944
945 static void
handle_src_late(struct ra_ctx *ctx, struct ir3_instruction *instr,
                struct ir3_register *src)
948 {
949 if (!(src->flags & IR3_REG_SHARED))
950 return;
951
952 struct ra_interval *interval = &ctx->intervals[src->def->name];
953 reload_src_finalize(ctx, instr, src);
954
955 /* Remove killed sources that have to be killed late due to being merged with
956 * other defs.
957 */
958 if (!(src->flags & IR3_REG_KILL))
959 return;
960
961 if (interval->interval.inserted)
962 ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
963 }
964
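/* Process a regular instruction: pin shared sources so they can't be spilled
 * while other sources are reloaded, reload or demote spilled sources, assign
 * source registers (freeing killed ones), allocate destinations, and finally
 * emit the reload movs deferred by reload_src().
 */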
965 static void
handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
967 {
968 ra_foreach_src (src, instr)
969 mark_src(ctx, src);
970
971 ra_foreach_src (src, instr)
972 ensure_src_live(ctx, instr, src);
973
974 ra_foreach_src_rev (src, instr)
975 assign_src(ctx, src);
976
977 ra_foreach_dst (dst, instr)
978 handle_dst(ctx, instr, dst);
979
980 ra_foreach_src (src, instr)
981 handle_src_late(ctx, instr, src);
982 }
983
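/* Splits within a merge set don't allocate anything new: the destination
 * either aliases a sub-range of the source's physregs or, if the source was
 * spilled, becomes a non-shared split of the spill def and the original split
 * instruction is deleted.
 */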
984 static void
handle_split(struct ra_ctx *ctx, struct ir3_instruction *split)
986 {
987 struct ir3_register *src = split->srcs[0];
988 struct ir3_register *dst = split->dsts[0];
989
990 if (!(dst->flags & IR3_REG_SHARED))
991 return;
992
993 if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
994 handle_normal_instr(ctx, split);
995 return;
996 }
997
998 struct ra_interval *src_interval = &ctx->intervals[src->def->name];
999 struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1000
1001 ra_interval_init(dst_interval, dst);
1002 dst_interval->spill_def = NULL;
1003
1004 if (src_interval->spill_def) {
1005 struct ir3_instruction *spill_split =
1006 ir3_instr_create_at(ir3_after_instr(split), OPC_META_SPLIT, 1, 1);
1007 struct ir3_register *dst = __ssa_dst(spill_split);
1008 struct ir3_register *src =
1009 ir3_src_create(spill_split, INVALID_REG, IR3_REG_SSA);
1010 src->def = src_interval->spill_def;
1011 src->wrmask = src_interval->spill_def->wrmask;
1012 spill_split->split.off = split->split.off;
1013 dst_interval->spill_def = dst;
1014 list_del(&split->node);
1015 return;
1016 }
1017
1018 dst_interval->physreg_start =
1019 src_interval->physreg_start + dst->merge_set_offset -
1020 src->def->merge_set_offset;
1021 dst_interval->physreg_end = dst_interval->physreg_start + reg_size(dst);
1022 ir3_reg_interval_insert(&ctx->reg_ctx, &dst_interval->interval);
1023 src->num = ra_interval_get_num(src_interval);
1024 dst->num = ra_interval_get_num(dst_interval);
1025 d("insert dst %u physreg %u", dst->name, dst_interval->physreg_start);
1026
1027 if (src->flags & IR3_REG_KILL)
1028 ir3_reg_interval_remove(&ctx->reg_ctx, &src_interval->interval);
1029 }
1030
1031 static void
handle_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
1033 {
1034 struct ir3_register *dst = phi->dsts[0];
1035
1036 if (!(dst->flags & IR3_REG_SHARED))
1037 return;
1038
1039 struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1040 ra_interval_init(dst_interval, dst);
1041
1042 /* In some rare cases, it's possible to have a phi node with a physical-only
1043 * source. Here's a contrived example:
1044 *
1045 * loop {
1046 * if non-uniform {
1047 * if uniform {
1048 * x_1 = ...;
1049 * continue;
1050 * }
1051 * x_2 = ...;
1052 * } else {
1053 * break;
1054 * }
1055 * // continue block
1056 * x_3 = phi(x_1, x_2)
1057 * }
1058 *
1059 * Assuming x_1 and x_2 are uniform, x_3 will also be uniform, because all
1060 * threads that stay in the loop take the same branch to the continue block,
1061 * however execution may fall through from the assignment to x_2 to the
1062 * break statement because the outer if is non-uniform, and then it will fall
1063 * through again to the continue block, so if x_3 is to be in a shared reg
1064 * then the phi needs an extra source pointing to the break statement, which
1065 * itself needs a phi node:
1066 *
1067 * loop {
1068 * if non-uniform {
1069 * if uniform {
1070 * x_1 = ...;
1071 * continue;
1072 * }
1073 * x_2 = ...;
1074 * } else {
1075 * x_4 = phi(undef, x_2)
1076 * break;
1077 * }
1078 * // continue block
1079 * x_3 = phi(x_1, x_2, x_4)
1080 * }
1081 */
1082
1083 /* phi nodes are special because we cannot spill them normally, instead we
1084 * have to spill the parallel copies that their sources point to and make the
1085 * entire phi not shared anymore.
1086 */
1087
1088 physreg_t physreg = get_reg(ctx, dst, false);
1089 if (physreg == (physreg_t) ~0u) {
1090 d("spilling phi destination");
1091 dst->flags &= ~IR3_REG_SHARED;
1092 dst_interval->spill_def = dst;
1093 phi->flags |= IR3_INSTR_SHARED_SPILL;
1094
1095 foreach_src (src, phi) {
1096 src->flags &= ~IR3_REG_SHARED;
1097 if (src->def)
1098 src->def->flags &= ~IR3_REG_SHARED;
1099 }
1100
1101 return;
1102 }
1103
1104 dst->num = ra_physreg_to_num(physreg, dst->flags);
1105 dst_interval->spill_def = NULL;
1106 dst_interval->physreg_start = physreg;
1107 dst_interval->physreg_end = physreg + reg_size(dst);
1108 ir3_reg_interval_insert(&ctx->reg_ctx, &dst_interval->interval);
1109
1110 ra_foreach_src_n (src, i, phi) {
1111 /* We assume that any phis with non-logical sources aren't promoted. */
1112 assert(i < phi->block->predecessors_count);
1113 src->num = dst->num;
1114 src->def->num = dst->num;
1115 }
1116 }
1117
1118 static void
handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *pcopy)
1120 {
1121 /* For parallel copies, we only handle the source. The destination is handled
1122 * later when processing phi nodes.
1123 */
1124
1125 ra_foreach_src (src, pcopy)
1126 mark_src(ctx, src);
1127
1128 ra_foreach_src (src, pcopy)
1129 ensure_src_live(ctx, pcopy, src);
1130
1131 ra_foreach_src_rev (src, pcopy)
1132 assign_src(ctx, src);
1133
1134 ra_foreach_src (src, pcopy)
1135 handle_src_late(ctx, pcopy, src);
1136 }
1137
1138 static void
handle_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1140 {
1141 instr->flags &= ~IR3_INSTR_SHARED_SPILL;
1142
1143 switch (instr->opc) {
1144 case OPC_META_SPLIT:
1145 handle_split(ctx, instr);
1146 break;
1147 case OPC_META_PHI:
1148 handle_phi(ctx, instr);
1149 break;
1150 case OPC_META_PARALLEL_COPY:
1151 handle_pcopy(ctx, instr);
1152 break;
1153 default:
1154 handle_normal_instr(ctx, instr);
1155 }
1156 }
1157
/* If we define a value outside a loop, use it inside the loop, and then spill
 * it later inside the same loop, we could lose the value, so we have to reload
 * it. We have to reload it after any parallel copy instruction, when the live
 * shared registers equal the live-in of the backedge. lower_pcopy() will then
 * move any non-shared parallel copies down past the reload.
 */
1164 static void
reload_live_outs(struct ra_ctx *ctx, struct ir3_block *block)
1166 {
1167 struct ra_block_state *state = &ctx->blocks[block->index];
1168 unsigned name;
1169 BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) {
1170 struct ir3_register *reg = ctx->live->definitions[name];
1171
1172 struct ra_interval *interval = &ctx->intervals[name];
1173 if (!interval->interval.inserted) {
1174 d("reloading %d at end of backedge", reg->name);
1175
1176 /* When this interval was spilled inside the loop, we probably chose a
1177 * different physreg for it than the original physreg when it was
1178 * defined outside the loop. Restore the original physreg so that we
1179 * spill it correctly.
1180 */
1181 unsigned size = interval->physreg_end - interval->physreg_start;
1182 interval->physreg_start = interval->physreg_start_orig;
1183 interval->physreg_end = interval->physreg_start + size;
1184
1185 reload_interval(ctx, ir3_before_terminator(block), interval);
1186 }
1187 }
1188 }
1189
1190 static void
record_pred_live_out(struct ra_ctx *ctx,
                     struct ra_interval *interval,
                     struct ir3_block *pred)
1194 {
1195 struct ra_block_state *state = &ctx->blocks[pred->index];
1196
1197 struct ir3_register *def = interval->interval.reg;
1198 BITSET_SET(state->live_out, def->name);
1199
1200 rb_tree_foreach (struct ra_interval, child,
1201 &interval->interval.children, interval.node) {
1202 record_pred_live_out(ctx, child, pred);
1203 }
1204 }
1205
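/* When we reach a block with multiple predecessors, any predecessor that
 * hasn't been visited yet must be reached through a backedge. Record the
 * current live intervals as that predecessor's live-outs so that
 * reload_live_outs() can make sure they are still in shared registers at the
 * end of it.
 */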
1206 static void
record_pred_live_outs(struct ra_ctx *ctx, struct ir3_block *block)
1208 {
1209 for (unsigned i = 0; i < block->predecessors_count; i++) {
1210 struct ir3_block *pred = block->predecessors[i];
1211 struct ra_block_state *state = &ctx->blocks[pred->index];
1212 if (state->visited)
1213 continue;
1214
1215 state->live_out = rzalloc_array(NULL, BITSET_WORD,
1216 BITSET_WORDS(ctx->live->definitions_count));
1217
1219 rb_tree_foreach (struct ra_interval, interval,
1220 &ctx->reg_ctx.intervals, interval.node) {
1221 record_pred_live_out(ctx, interval, pred);
1222 }
1223 }
1224 }
1225
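/* Allocate a single block. The register file is rebuilt from the live-in set
 * on entry (intervals keep the physregs they were assigned at their
 * definition), instructions are processed in order, and if our successor has
 * already been visited (i.e. we are a loop backedge) any spilled live-outs
 * are reloaded.
 */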
1226 static void
handle_block(struct ra_ctx *ctx, struct ir3_block *block)
1228 {
1229 ra_ctx_reset_block(ctx);
1230
1231 unsigned name;
1232 BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
1233 ctx->live->definitions_count) {
1234 struct ir3_register *def = ctx->live->definitions[name];
1235 struct ra_interval *interval = &ctx->intervals[name];
1236
1237 /* Non-shared definitions may still be definitions we spilled by demoting
1238 * them, so we still need to initialize the interval. But we shouldn't
1239 * make these intervals live.
1240 */
1241 ra_interval_init(interval, def);
1242
1243 if ((def->flags & IR3_REG_SHARED) && !interval->spill_def) {
1244 ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
1245 }
1246 }
1247
1248 if (RA_DEBUG) {
1249 d("after live-in block %u:\n", block->index);
1250 ra_ctx_dump(ctx);
1251 }
1252
1253 if (block->predecessors_count > 1)
1254 record_pred_live_outs(ctx, block);
1255
1256 foreach_instr_safe (instr, &block->instr_list) {
1257 di(instr, "processing");
1258
1259 handle_instr(ctx, instr);
1260
1261 if (RA_DEBUG)
1262 ra_ctx_dump(ctx);
1263 }
1264
1265 if (block->successors[0]) {
1266 struct ra_block_state *state = &ctx->blocks[block->successors[0]->index];
1267
1268 if (state->visited) {
1269 assert(!block->successors[1]);
1270
1271 reload_live_outs(ctx, block);
1272 }
1273 }
1274
1275 ctx->blocks[block->index].visited = true;
1276 }
1277
1278 static void
lower_pcopy(struct ir3 *ir, struct ra_ctx *ctx)
1280 {
1281 foreach_block (block, &ir->block_list) {
1282 foreach_instr_safe (instr, &block->instr_list) {
1283 /* At this point due to spilling there may be parallel copies from
1284 * shared to non-shared registers and vice versa. Lowering these after
1285 * RA may produce cycles involving shared and non-shared registers,
1286 * which would need to be resolved by swapping a shared and non-shared
1287 * register which is something we can't handle. However by lowering
1288 * these to moves now, we can make sure that cycles only involve
1289 * non-shared registers. To avoid illegally moving a shared register
1290 * read or write across the parallel copy, which may have other
1291 * conflicting reads/writes if there's a cycle, we need to move copies
1292 * from non-shared to shared below the shared copies, and we need to
1293 * move copies from shared to non-shared above them. So, we have the
1294 * following order:
1295 *
1296 * 1. shared->non-shared copies (spills)
1297 * 2. shared->shared copies (one parallel copy as there may be cycles)
1298 * 3. non-shared->shared copies (reloads)
1299 * 4. non-shared->non-shared copies
1300 *
1301 * We split out the non-shared->non-shared copies as a separate step.
1302 */
1303 if (instr->opc == OPC_META_PARALLEL_COPY) {
1304 for (unsigned i = 0; i < instr->srcs_count; i++) {
1305 if ((instr->srcs[i]->flags & IR3_REG_SHARED) &&
1306 !(instr->dsts[i]->flags & IR3_REG_SHARED)) {
1307 /* shared->non-shared. Create a spill move and rewrite the
1308 * source to be the destination of the move (so that the
1309 * original shared->non-shared copy becomes a
1310 * non-shared->non-shared copy).
1311 */
1312 struct ir3_instruction *mov = ir3_instr_create_at(
1313 ir3_before_instr(instr), OPC_MOV, 1, 1);
1314 mov->flags |= IR3_INSTR_SHARED_SPILL;
1315 struct ir3_register *dst =
1316 ir3_dst_create(mov, INVALID_REG, instr->dsts[i]->flags);
1317 dst->wrmask = instr->dsts[i]->wrmask;
1318 dst->instr = mov;
1319 mov->repeat = reg_elems(mov->dsts[0]) - 1;
1320 struct ir3_register *src =
1321 ir3_src_create(mov, instr->srcs[i]->num,
1322 instr->srcs[i]->flags |
1323 (mov->repeat ? IR3_REG_R : 0));
1324 src->wrmask = instr->srcs[i]->wrmask;
1325 mov->cat1.dst_type = mov->cat1.src_type =
1326 (mov->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
1327 instr->srcs[i]->flags = mov->dsts[0]->flags;
1328 instr->srcs[i]->def = mov->dsts[0];
1329 }
1330 }
1331
1332 for (unsigned i = 0; i < instr->dsts_count;) {
1333 if ((instr->dsts[i]->flags & IR3_REG_SHARED) &&
1334 (instr->srcs[i]->flags & IR3_REG_SSA) &&
1335 !(instr->srcs[i]->flags & IR3_REG_SHARED)) {
1336 /* non-shared->shared. Create a reload move.
1337 */
1338 struct ir3_instruction *mov =
1339 ir3_instr_create_at(ir3_after_instr(instr), OPC_MOV, 1, 1);
1340 mov->flags |= IR3_INSTR_SHARED_SPILL;
1341 struct ir3_register *dst =
1342 ir3_dst_create(mov, instr->dsts[i]->num,
1343 instr->dsts[i]->flags);
1344 dst->instr = mov;
1345 dst->wrmask = instr->dsts[i]->wrmask;
1346 mov->repeat = reg_elems(mov->dsts[0]) - 1;
1347 struct ir3_register *src =
1348 ir3_src_create(mov, INVALID_REG, instr->srcs[i]->flags |
1349 (mov->repeat ? IR3_REG_R : 0));
1350 src->def = instr->srcs[i]->def;
1351 src->wrmask = instr->srcs[i]->wrmask;
1352 mov->cat1.dst_type = mov->cat1.src_type =
1353 (mov->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
1354
1355 /* When we spill a parallel copy source, we lose the
1356 * information of where it originally points to since we make
1357 * it point to the spill def. If we later decide not to also
1358 * spill the phi associated with it, we have to restore it
1359 * here using the stashed original source so that RA
1360 * validation can check that we did the correct thing.
1361 *
1362 * Because SSA-ness goes away after validation, this is really
1363 * just about validation.
1364 */
1365 struct ir3_block *succ = block->successors[0];
1366 unsigned pred_idx = ir3_block_get_pred_index(succ, block);
1367 foreach_instr (phi, &succ->instr_list) {
1368 if (phi->opc != OPC_META_PHI)
1369 break;
1370
1371 if (phi->srcs[pred_idx]->def == instr->dsts[i]) {
1372 struct ir3_register *def =
1373 _mesa_hash_table_search(ctx->pcopy_src_map,
1374 instr->srcs[i])->data;
1375 phi->srcs[pred_idx]->def = def;
1376 break;
1377 }
1378 }
1379
1380 instr->srcs[i] = instr->srcs[instr->srcs_count - 1];
1381 instr->dsts[i] = instr->dsts[instr->dsts_count - 1];
1382 instr->srcs_count--;
1383 instr->dsts_count--;
1384 continue;
1385 }
1386
1387 i++;
1388 }
1389
1390 /* Move any non-shared copies to a separate parallel copy
1391 * instruction right at the end of the block, after any reloads. At
1392 * this point all copies should be {shared,immediate}->shared or
1393 * {non-shared,immediate}->non-shared.
1394 */
1395 unsigned non_shared_copies = 0;
1396 for (unsigned i = 0; i < instr->dsts_count; i++) {
1397 if (!(instr->dsts[i]->flags & IR3_REG_SHARED))
1398 non_shared_copies++;
1399 }
1400
1401 if (non_shared_copies != 0) {
1402 struct ir3_instruction *pcopy = ir3_instr_create_at(
1403 ir3_before_terminator(block), OPC_META_PARALLEL_COPY,
1404 non_shared_copies, non_shared_copies);
1405
1406 unsigned j = 0;
1407 for (unsigned i = 0; i < instr->dsts_count;) {
1408 if (!(instr->dsts[i]->flags & IR3_REG_SHARED)) {
1409 pcopy->dsts[j] = instr->dsts[i];
1410 pcopy->srcs[j] = instr->srcs[i];
1411 pcopy->dsts[j]->instr = pcopy;
1412 instr->srcs[i] = instr->srcs[instr->srcs_count - 1];
1413 instr->dsts[i] = instr->dsts[instr->dsts_count - 1];
1414 instr->srcs_count--;
1415 instr->dsts_count--;
1416 j++;
1417 continue;
1418 }
1419 i++;
1420 }
1421
1422 pcopy->srcs_count = pcopy->dsts_count = j;
1423 if (instr->dsts_count == 0)
1424 list_del(&instr->node);
1425 }
1426 }
1427 }
1428 }
1429 }
1430
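/* Strip the SSA flag from shared registers now that they have register
 * numbers assigned, so later passes treat them as already-allocated
 * registers.
 */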
1431 static void
finalize(struct ir3 *ir)
1433 {
1434 foreach_block (block, &ir->block_list) {
1435 foreach_instr (instr, &block->instr_list) {
1436 for (unsigned i = 0; i < instr->dsts_count; i++) {
1437 if (instr->dsts[i]->flags & IR3_REG_SHARED) {
1438 instr->dsts[i]->flags &= ~IR3_REG_SSA;
1439 }
1440 }
1441
1442 for (unsigned i = 0; i < instr->srcs_count; i++) {
1443 if (instr->srcs[i]->flags & IR3_REG_SHARED) {
1444 instr->srcs[i]->flags &= ~IR3_REG_SSA;
1445 instr->srcs[i]->def = NULL;
1446 }
1447 }
1448 }
1449 }
1450 }
1451
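/* Entry point: allocate all shared registers for the variant before the rest
 * of RA runs. Liveness is recalculated at the end because spilling and
 * reloading introduce new values.
 */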
1452 void
ir3_ra_shared(struct ir3_shader_variant *v, struct ir3_liveness **live_ptr)
1454 {
1455 struct ra_ctx ctx;
1456 struct ir3_liveness *live = *live_ptr;
1457
1458 ra_ctx_init(&ctx);
1459 ctx.intervals = rzalloc_array(NULL, struct ra_interval,
1460 live->definitions_count);
1461 ctx.blocks = rzalloc_array(NULL, struct ra_block_state,
1462 live->block_count);
1463 ctx.start = 0;
1464 ctx.live = live;
1465 ctx.pcopy_src_map = _mesa_pointer_hash_table_create(NULL);
1466
1467 foreach_block (block, &v->ir->block_list) {
1468 handle_block(&ctx, block);
1469 }
1470
1471 lower_pcopy(v->ir, &ctx);
1472
1473 for (unsigned i = 0; i < live->block_count; i++) {
1474 if (ctx.blocks[i].live_out)
1475 ralloc_free(ctx.blocks[i].live_out);
1476 }
1477
1478 ralloc_free(ctx.intervals);
1479 ralloc_free(ctx.pcopy_src_map);
1480 ralloc_free(ctx.blocks);
1481
1482 ir3_ra_validate(v, RA_FULL_SIZE, RA_HALF_SIZE, live->block_count, true);
1483 finalize(v->ir);
1484
   /* Recalculate liveness and register pressure now that additional values
    * have been added.
    * TODO: we should only do this if any values have been spilled/reloaded.
    * Note: since we don't recreate merge sets, we have to manually copy
    * interval_offset to the new liveness struct.
    */
1491 unsigned interval_offset = live->interval_offset;
1492 void *live_mem_ctx = ralloc_parent(live);
1493 ralloc_free(live);
1494 *live_ptr = ir3_calc_liveness(live_mem_ctx, v->ir);
1495 (*live_ptr)->interval_offset = interval_offset;
1496 }
1497
1498