1 /*
2 * Copyright © 2021 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "util/rb_tree.h"
7 #include "ir3_ra.h"
8 #include "ir3_shader.h"
9
10 /*
11 * This pass does two things:
12 *
13 * 1. Calculates the maximum register pressure. To do this, we need to use the
14 * exact same technique that RA uses for combining meta_split instructions
15 * with their sources, so that our calculation agrees with RA.
16 * 2. Spills when the register pressure exceeds a limit calculated by RA.
17 * The implementation is based on "Register Spilling and Live-Range Splitting
18 * for SSA-Form Programs" by Braun and Hack, although again care has to be
19 * taken to handle combining split/collect instructions.
20 */
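/* Rough sketch of how the two phases appear to fit together, judging from the
 * ctx->spilling flag used throughout this file: the walk is first run with
 * spilling disabled, where handle_instr() only calls update_max_pressure() to
 * record the peak full/half/shared pressure, and presumably run again with
 * ctx->spilling set if that pressure can't be accommodated, in which case
 * limit() is invoked around each instruction to spill/reload values instead.
 */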
21
22 struct reg_or_immed {
23 unsigned flags;
24 union {
25 struct ir3_register *def;
26 uint32_t uimm;
27 unsigned const_num;
28 };
29 };
30
31 struct ra_spill_interval {
32 struct ir3_reg_interval interval;
33
34 struct rb_node node;
35 struct rb_node half_node;
36
37 /* The current SSA value/const/immed this source is mapped to. */
38 struct reg_or_immed dst;
39
40 /* When computing use distances we use the distance relative to the start
41 * of the block. So, for example, a value that's defined in cycle 5 of the
42 * block and used 6 cycles later will always have a next_use_distance of 11
43 * until we reach that use.
44 */
45 unsigned next_use_distance;
46
47 /* Whether this value was reloaded and therefore doesn't need to be
48 * spilled again. Corresponds to the S set in the paper.
49 */
50 bool already_spilled;
51
52 /* We need to add sources early for accounting purposes, but we have to
53 * insert the reload code for them last. Keep track of whether this interval
54 * needs to be reloaded later.
55 */
56 bool needs_reload;
57
58 /* Keep track of whether this interval currently can't be spilled because:
59 * - It or one of its children is a source and we're making space for
60 * sources.
61 * - It is a destination and we're making space for destinations.
62 */
63 bool cant_spill;
64
65 /* Whether this interval can be rematerialized. */
66 bool can_rematerialize;
67 };
68
69 struct ra_spill_block_state {
70 unsigned *next_use_end;
71 unsigned *next_use_start;
72
73 unsigned cycles;
74
75 /* Map from SSA def to reg_or_immed it is mapped to at the end of the block.
76 * This map only contains values which we didn't spill, so it also serves as
77 * a record of the new live-out set for this block.
78 */
79 struct hash_table *remap;
80
81 /* For blocks whose successors are visited first (i.e. loop backedges), which
82 * values should be live at the end.
83 */
84 BITSET_WORD *live_out;
85
86 bool visited;
87 };
88
89 struct ra_spill_ctx {
90 struct ir3_reg_ctx reg_ctx;
91
92 struct ra_spill_interval **intervals;
93 unsigned intervals_count;
94
95 /* rb tree of live intervals that we can spill, ordered by next-use distance.
96 * full_live_intervals also contains the half intervals in the merged_regs
97 * case. We use these trees to determine what to spill.
98 */
99 struct rb_tree full_live_intervals;
100 struct rb_tree half_live_intervals;
101
102 struct ir3_pressure cur_pressure, max_pressure;
103
104 struct ir3_pressure limit_pressure;
105
106 /* When spilling, we need to reserve a register to serve as the zero'd
107 * "base". For simplicity we reserve a register at the beginning so that it's
108 * always available.
109 */
110 struct ir3_register *base_reg;
111
112 /* Current pvtmem offset in bytes. */
113 unsigned spill_slot;
114
115 struct ir3_liveness *live;
116
117 const struct ir3_compiler *compiler;
118
119 struct ra_spill_block_state *blocks;
120
121 bool spilling;
122
123 bool merged_regs;
124 };
125
126 static void
127 add_base_reg(struct ra_spill_ctx *ctx, struct ir3 *ir)
128 {
129 struct ir3_block *start = ir3_start_block(ir);
130 struct ir3_builder build = ir3_builder_at(ir3_after_block(start));
131
132 /* We need to stick it after any meta instructions which need to be first. */
133 struct ir3_instruction *after = NULL;
134 foreach_instr (instr, &start->instr_list) {
135 if (instr->opc != OPC_META_INPUT &&
136 instr->opc != OPC_META_TEX_PREFETCH) {
137 after = instr;
138 break;
139 }
140 }
141
142 struct ir3_instruction *mov = create_immed(&build, 0);
143
144 if (after)
145 ir3_instr_move_before(mov, after);
146
147 ctx->base_reg = mov->dsts[0];
148
149 /* We don't create an interval, etc. for the base reg, so just lower the
150 * register pressure limit to account for it. We assume it's always
151 * available for simplicity.
152 */
153 ctx->limit_pressure.full -= reg_size(ctx->base_reg);
154 }
155
156
157 /* Compute the number of cycles per instruction used for next-use-distance
158 * analysis. This is just approximate, obviously.
159 */
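/* A sketch of the accounting below (hypothetical instructions): a parallel
 * copy whose vec2 source shares a merge set with its destination is expected
 * to coalesce into nothing and counts 0 cycles, one that really has to move a
 * vec2 counts 2, and an ALU instruction with (rpt3) counts 1 + 3 = 4.
 */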
160 static unsigned
161 instr_cycles(struct ir3_instruction *instr)
162 {
163 if (instr->opc == OPC_META_PARALLEL_COPY) {
164 unsigned cycles = 0;
165 for (unsigned i = 0; i < instr->dsts_count; i++) {
166 if (!instr->srcs[i]->def ||
167 instr->srcs[i]->def->merge_set != instr->dsts[i]->merge_set) {
168 cycles += reg_elems(instr->srcs[i]);
169 }
170 }
171
172 return cycles;
173 }
174
175 if (instr->opc == OPC_META_COLLECT) {
176 unsigned cycles = 0;
177 for (unsigned i = 0; i < instr->srcs_count; i++) {
178 if (!instr->srcs[i]->def ||
179 instr->srcs[i]->def->merge_set != instr->dsts[0]->merge_set) {
180 cycles++;
181 }
182 }
183
184 return cycles;
185 }
186
187 if (is_meta(instr))
188 return 0;
189
190 return 1 + instr->repeat;
191 }
192
193 static bool
194 compute_block_next_distance(struct ra_spill_ctx *ctx, struct ir3_block *block,
195 unsigned *tmp_next_use)
196 {
197 struct ra_spill_block_state *state = &ctx->blocks[block->index];
198 memcpy(tmp_next_use, state->next_use_end,
199 ctx->live->definitions_count * sizeof(*tmp_next_use));
200
201 unsigned cycle = state->cycles;
202 foreach_instr_rev (instr, &block->instr_list) {
203 ra_foreach_dst (dst, instr) {
204 dst->next_use = tmp_next_use[dst->name];
205 }
206
207 ra_foreach_src (src, instr) {
208 src->next_use = tmp_next_use[src->def->name];
209 }
210
211 cycle -= instr_cycles(instr);
212
213 if (instr->opc == OPC_META_PARALLEL_COPY) {
214 ra_foreach_src_n (src, i, instr) {
215 if (src->def->merge_set == instr->dsts[i]->merge_set &&
216 src->def->merge_set_offset == instr->dsts[i]->merge_set_offset) {
217 tmp_next_use[src->def->name] =
218 tmp_next_use[instr->dsts[i]->name];
219 } else {
220 tmp_next_use[src->def->name] = cycle;
221 }
222 }
223 } else if (instr->opc != OPC_META_PHI) {
224 ra_foreach_src (src, instr) {
225 tmp_next_use[src->def->name] = cycle;
226 }
227 }
228
229 ra_foreach_dst (dst, instr) {
230 tmp_next_use[dst->name] = UINT_MAX;
231 }
232 }
233
234 memcpy(state->next_use_start, tmp_next_use,
235 ctx->live->definitions_count * sizeof(*tmp_next_use));
236
237 bool progress = false;
238 for (unsigned i = 0; i < block->predecessors_count; i++) {
239 const struct ir3_block *pred = block->predecessors[i];
240 struct ra_spill_block_state *pred_state = &ctx->blocks[pred->index];
241
242 /* Add a large-enough distance in front of edges exiting the loop so that
243 * variables that are live-through the loop but not used inside it are
244 * prioritized for spilling, as per the paper. This just needs to be
245 * larger than the longest path through the loop.
246 */
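      /* Worked example (hypothetical numbers): on a loop-exit edge with
       * pred_state->cycles == 20, a value first used 3 cycles into this block
       * gets next_use_end = 3 + 20 + 100000 in the predecessor, so it sorts
       * after anything that is actually used inside the loop.
       */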
247 bool loop_exit = pred->loop_depth < block->loop_depth;
248 unsigned block_distance = pred_state->cycles + (loop_exit ? 100000 : 0);
249
250 for (unsigned j = 0; j < ctx->live->definitions_count; j++) {
251 if (state->next_use_start[j] < UINT_MAX &&
252 state->next_use_start[j] + block_distance <
253 pred_state->next_use_end[j]) {
254 pred_state->next_use_end[j] = state->next_use_start[j] +
255 block_distance;
256 progress = true;
257 }
258 }
259
260 foreach_instr (phi, &block->instr_list) {
261 if (phi->opc != OPC_META_PHI)
262 break;
263 if (!phi->srcs[i]->def)
264 continue;
265 unsigned src = phi->srcs[i]->def->name;
266 if (phi->dsts[0]->next_use < UINT_MAX &&
267 phi->dsts[0]->next_use + block_distance <
268 pred_state->next_use_end[src]) {
269 pred_state->next_use_end[src] = phi->dsts[0]->next_use +
270 block_distance;
271 progress = true;
272 }
273 }
274 }
275
276 return progress;
277 }
278
279 static void
280 compute_next_distance(struct ra_spill_ctx *ctx, struct ir3 *ir)
281 {
282 for (unsigned i = 0; i < ctx->live->block_count; i++) {
283 ctx->blocks[i].next_use_start =
284 ralloc_array(ctx, unsigned, ctx->live->definitions_count);
285 ctx->blocks[i].next_use_end =
286 ralloc_array(ctx, unsigned, ctx->live->definitions_count);
287
288 for (unsigned j = 0; j < ctx->live->definitions_count; j++) {
289 ctx->blocks[i].next_use_start[j] = UINT_MAX;
290 ctx->blocks[i].next_use_end[j] = UINT_MAX;
291 }
292 }
293
294 foreach_block (block, &ir->block_list) {
295 struct ra_spill_block_state *state = &ctx->blocks[block->index];
296 state->cycles = 0;
297 foreach_instr (instr, &block->instr_list) {
298 state->cycles += instr_cycles(instr);
299 foreach_dst (dst, instr) {
300 dst->spill_slot = ~0;
301 }
302 }
303 }
304
305 unsigned *tmp_next_use =
306 ralloc_array(ctx, unsigned, ctx->live->definitions_count);
307
308 bool progress = true;
309 while (progress) {
310 progress = false;
311 foreach_block_rev (block, &ir->block_list) {
312 progress |= compute_block_next_distance(ctx, block, tmp_next_use);
313 }
314 }
315 }
316
317 static bool
318 can_rematerialize(struct ir3_register *reg)
319 {
320 if (reg->flags & IR3_REG_ARRAY)
321 return false;
322 if (reg->instr->opc != OPC_MOV)
323 return false;
324 if (!(reg->instr->srcs[0]->flags & (IR3_REG_IMMED | IR3_REG_CONST)))
325 return false;
326 if (reg->instr->srcs[0]->flags & IR3_REG_RELATIV)
327 return false;
328 return true;
329 }
330
331 static struct ir3_register *
332 rematerialize(struct ir3_register *reg, struct ir3_cursor cursor)
333 {
334 d("rematerializing ssa_%u:%u", reg->instr->serialno, reg->name);
335
336 struct ir3_instruction *remat =
337 ir3_instr_create_at(cursor, reg->instr->opc, 1, reg->instr->srcs_count);
338 struct ir3_register *dst = __ssa_dst(remat);
339 dst->flags |= reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
340 for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
341 struct ir3_register *src =
342 ir3_src_create(remat, INVALID_REG, reg->instr->srcs[i]->flags);
343 *src = *reg->instr->srcs[i];
344 }
345
346 remat->cat1 = reg->instr->cat1;
347
348 dst->merge_set = reg->merge_set;
349 dst->merge_set_offset = reg->merge_set_offset;
350 dst->interval_start = reg->interval_start;
351 dst->interval_end = reg->interval_end;
352 return dst;
353 }
354
355 static void
356 ra_spill_interval_init(struct ra_spill_interval *interval,
357 struct ir3_register *reg)
358 {
359 ir3_reg_interval_init(&interval->interval, reg);
360 interval->dst.flags = reg->flags;
361 interval->dst.def = reg;
362 interval->already_spilled = false;
363 interval->needs_reload = false;
364 interval->cant_spill = false;
365 interval->can_rematerialize = can_rematerialize(reg);
366 }
367
368 static struct ra_spill_interval *
369 ir3_reg_interval_to_interval(struct ir3_reg_interval *interval)
370 {
371 return rb_node_data(struct ra_spill_interval, interval, interval);
372 }
373
374 static struct ra_spill_interval *
375 ra_spill_interval_root(struct ra_spill_interval *interval)
376 {
377 struct ir3_reg_interval *ir3_interval = &interval->interval;
378 while (ir3_interval->parent)
379 ir3_interval = ir3_interval->parent;
380 return ir3_reg_interval_to_interval(ir3_interval);
381 }
382
383 static struct ra_spill_ctx *
384 ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx)
385 {
386 return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx);
387 }
388
389 static int
390 spill_interval_cmp(const struct ra_spill_interval *a,
391 const struct ra_spill_interval *b)
392 {
393 /* Prioritize intervals that we can rematerialize. */
394 if (a->can_rematerialize && !b->can_rematerialize)
395 return 1;
396 if (!a->can_rematerialize && b->can_rematerialize)
397 return -1;
398
399 return a->next_use_distance - b->next_use_distance;
400 }
401
402 static int
403 ra_spill_interval_cmp(const struct rb_node *_a, const struct rb_node *_b)
404 {
405 const struct ra_spill_interval *a =
406 rb_node_data(const struct ra_spill_interval, _a, node);
407 const struct ra_spill_interval *b =
408 rb_node_data(const struct ra_spill_interval, _b, node);
409 return spill_interval_cmp(a, b);
410 }
411
412 static int
413 ra_spill_interval_half_cmp(const struct rb_node *_a, const struct rb_node *_b)
414 {
415 const struct ra_spill_interval *a =
416 rb_node_data(const struct ra_spill_interval, _a, half_node);
417 const struct ra_spill_interval *b =
418 rb_node_data(const struct ra_spill_interval, _b, half_node);
419 return spill_interval_cmp(a, b);
420 }
421
422 static void
423 interval_add(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
424 {
425 struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
426 struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
427
428 unsigned size = reg_size(interval->interval.reg);
429 if (interval->interval.reg->flags & IR3_REG_SHARED) {
430 ctx->cur_pressure.shared += size;
431 if (interval->interval.reg->flags & IR3_REG_HALF)
432 ctx->cur_pressure.shared_half += size;
433 } else {
434 if (interval->interval.reg->flags & IR3_REG_HALF) {
435 ctx->cur_pressure.half += size;
436 if (ctx->spilling) {
437 rb_tree_insert(&ctx->half_live_intervals, &interval->half_node,
438 ra_spill_interval_half_cmp);
439 }
440 }
441 if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) {
442 ctx->cur_pressure.full += size;
443 if (ctx->spilling) {
444 rb_tree_insert(&ctx->full_live_intervals, &interval->node,
445 ra_spill_interval_cmp);
446 }
447 }
448 }
449 }
450
451 static void
452 interval_delete(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
453 {
454 struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
455 struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
456
457 unsigned size = reg_size(interval->interval.reg);
458 if (interval->interval.reg->flags & IR3_REG_SHARED) {
459 ctx->cur_pressure.shared -= size;
460 if (interval->interval.reg->flags & IR3_REG_HALF)
461 ctx->cur_pressure.shared_half -= size;
462 } else {
463 if (interval->interval.reg->flags & IR3_REG_HALF) {
464 ctx->cur_pressure.half -= size;
465 if (ctx->spilling) {
466 rb_tree_remove(&ctx->half_live_intervals, &interval->half_node);
467 }
468 }
469 if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) {
470 ctx->cur_pressure.full -= size;
471 if (ctx->spilling) {
472 rb_tree_remove(&ctx->full_live_intervals, &interval->node);
473 }
474 }
475 }
476 }
477
478 static void
479 interval_readd(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_parent,
480 struct ir3_reg_interval *_child)
481 {
482 interval_add(_ctx, _child);
483 }
484
485 static void
486 spill_ctx_init(struct ra_spill_ctx *ctx, struct ir3_shader_variant *v,
487 struct ir3_liveness *live)
488 {
489 ctx->live = live;
490 ctx->intervals = ralloc_array(ctx, struct ra_spill_interval *,
491 ctx->live->definitions_count);
492 struct ra_spill_interval *intervals =
493 rzalloc_array(ctx, struct ra_spill_interval,
494 ctx->live->definitions_count);
495 for (unsigned i = 0; i < ctx->live->definitions_count; i++)
496 ctx->intervals[i] = &intervals[i];
497
498 ctx->intervals_count = ctx->live->definitions_count;
499 ctx->compiler = v->compiler;
500 ctx->merged_regs = v->mergedregs;
501
502 rb_tree_init(&ctx->reg_ctx.intervals);
503 ctx->reg_ctx.interval_add = interval_add;
504 ctx->reg_ctx.interval_delete = interval_delete;
505 ctx->reg_ctx.interval_readd = interval_readd;
506 }
507
508 static void
509 ra_spill_ctx_insert(struct ra_spill_ctx *ctx,
510 struct ra_spill_interval *interval)
511 {
512 ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
513 }
514
515 static void
516 ra_spill_ctx_remove(struct ra_spill_ctx *ctx,
517 struct ra_spill_interval *interval)
518 {
519 ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
520 }
521
522 static void
523 init_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
524 {
525 struct ra_spill_interval *interval = ctx->intervals[dst->name];
526 ra_spill_interval_init(interval, dst);
527 if (ctx->spilling) {
528 interval->next_use_distance = dst->next_use;
529
530 /* We only need to keep track of used-ness if this value may be
531 * rematerialized. This also keeps us from nuking things that may be
532 * in the keeps list (e.g. atomics, input splits).
533 */
534 if (interval->can_rematerialize)
535 dst->instr->flags |= IR3_INSTR_UNUSED;
536 }
537 }
538
539 static void
540 insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
541 {
542 struct ra_spill_interval *interval = ctx->intervals[dst->name];
543 if (interval->interval.inserted)
544 return;
545
546 ra_spill_ctx_insert(ctx, interval);
547 interval->cant_spill = true;
548
549 /* For precolored inputs, make sure we leave enough registers to allow for
550 * holes in the inputs. It can happen that the binning shader has a lower
551 * register pressure than the main shader, but the main shader decided to
552 * add holes between the inputs which means that the binning shader has a
553 * higher register demand.
554 */
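   /* E.g. (hypothetical layout) if the main shader placed inputs at r0.x and
    * r0.z with r0.y left as a hole, bumping max_pressure up to the end of
    * r0.z here makes the binning shader reserve the hole as well, so both
    * variants can keep the same input layout.
    */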
555 if (dst->instr->opc == OPC_META_INPUT && dst->num != INVALID_REG) {
556 physreg_t physreg = ra_reg_get_physreg(dst);
557 physreg_t max = physreg + reg_size(dst);
558
559 if (interval->interval.reg->flags & IR3_REG_SHARED) {
560 ctx->max_pressure.shared = MAX2(ctx->max_pressure.shared, max);
561 if (interval->interval.reg->flags & IR3_REG_HALF) {
562 ctx->max_pressure.shared_half = MAX2(ctx->max_pressure.shared_half,
563 max);
564 }
565 } else if (interval->interval.reg->flags & IR3_REG_HALF) {
566 ctx->max_pressure.half = MAX2(ctx->max_pressure.half, max);
567 } else {
568 ctx->max_pressure.full = MAX2(ctx->max_pressure.full, max);
569 }
570 }
571 }
572
573 static void
574 insert_src(struct ra_spill_ctx *ctx, struct ir3_register *src)
575 {
576 struct ra_spill_interval *interval = ctx->intervals[src->def->name];
577
578 if (!interval->interval.inserted) {
579 ra_spill_ctx_insert(ctx, interval);
580 interval->needs_reload = true;
581 interval->already_spilled = true;
582 }
583
584 ra_spill_interval_root(interval)->cant_spill = true;
585
586 }
587
588 static void
589 remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
590 struct ir3_register *src)
591 {
592 struct ra_spill_interval *interval = ctx->intervals[src->def->name];
593
594 if (ctx->spilling) {
595 /* It might happen that a collect that cannot be coalesced with one of its
596 * sources while spilling can be coalesced with it afterwards. In this
597 * case, we might be able to remove it here during spilling but not
598 * afterwards (because it may have a child interval). If this happens, we
599 * could end up with a register pressure that is higher after spilling
600 * than before. Prevent this by never removing collects early while
601 * spilling.
602 */
603 if (src->def->instr->opc == OPC_META_COLLECT)
604 return;
605 }
606
607 if (!interval->interval.inserted || interval->interval.parent ||
608 !rb_tree_is_empty(&interval->interval.children))
609 return;
610
611 ra_spill_ctx_remove(ctx, interval);
612 }
613
614 static void
615 remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
616 struct ir3_register *src)
617 {
618 struct ra_spill_interval *interval = ctx->intervals[src->def->name];
619
620 if (!interval->interval.inserted)
621 return;
622
623 ra_spill_ctx_remove(ctx, interval);
624 }
625
626 static void
627 finish_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
628 {
629 struct ra_spill_interval *interval = ctx->intervals[dst->name];
630 interval->cant_spill = false;
631 }
632
633 static void
634 remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
635 {
636 struct ra_spill_interval *interval = ctx->intervals[dst->name];
637
638 if (!interval->interval.inserted)
639 return;
640
641 ra_spill_ctx_remove(ctx, interval);
642 }
643
644 static void
645 update_src_next_use(struct ra_spill_ctx *ctx, struct ir3_register *src)
646 {
647 struct ra_spill_interval *interval = ctx->intervals[src->def->name];
648
649 assert(interval->interval.inserted);
650
651 interval->next_use_distance = src->next_use;
652
653 /* If this node is inserted in one of the trees, then it needs to be resorted
654 * as its key has changed.
655 */
656 if (!interval->interval.parent && !(src->flags & IR3_REG_SHARED)) {
657 if (src->flags & IR3_REG_HALF) {
658 rb_tree_remove(&ctx->half_live_intervals, &interval->half_node);
659 rb_tree_insert(&ctx->half_live_intervals, &interval->half_node,
660 ra_spill_interval_half_cmp);
661 }
662 if (ctx->merged_regs || !(src->flags & IR3_REG_HALF)) {
663 rb_tree_remove(&ctx->full_live_intervals, &interval->node);
664 rb_tree_insert(&ctx->full_live_intervals, &interval->node,
665 ra_spill_interval_cmp);
666 }
667 }
668 }
669
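/* Worked example for the slot assignment below (assuming, as elsewhere in RA,
 * that reg_size()/reg_elem_size() count half-registers and spill_slot is in
 * bytes): spilling a full vec4 with no merge set rounds spill_slot up to
 * reg_elem_size * 2 = 4 bytes and then advances it by reg_size * 2 = 16 bytes.
 * Members of a merge set share one slot range and index into it with
 * merge_set_offset * 2.
 */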
670 static unsigned
671 get_spill_slot(struct ra_spill_ctx *ctx, struct ir3_register *reg)
672 {
673 if (reg->merge_set) {
674 if (reg->merge_set->spill_slot == ~0) {
675 reg->merge_set->spill_slot = ALIGN_POT(ctx->spill_slot,
676 reg->merge_set->alignment * 2);
677 ctx->spill_slot = reg->merge_set->spill_slot + reg->merge_set->size * 2;
678 }
679 return reg->merge_set->spill_slot + reg->merge_set_offset * 2;
680 } else {
681 if (reg->spill_slot == ~0) {
682 reg->spill_slot = ALIGN_POT(ctx->spill_slot, reg_elem_size(reg) * 2);
683 ctx->spill_slot = reg->spill_slot + reg_size(reg) * 2;
684 }
685 return reg->spill_slot;
686 }
687 }
688
689 static void
690 set_src_val(struct ir3_register *src, const struct reg_or_immed *val)
691 {
692 if (val->flags & IR3_REG_IMMED) {
693 src->flags = IR3_REG_IMMED | (val->flags & IR3_REG_HALF);
694 src->uim_val = val->uimm;
695 src->def = NULL;
696 } else if (val->flags & IR3_REG_CONST) {
697 src->flags = IR3_REG_CONST | (val->flags & IR3_REG_HALF);
698 src->num = val->const_num;
699 src->def = NULL;
700 } else {
701 src->def = val->def;
702 val->def->instr->flags &= ~IR3_INSTR_UNUSED;
703 }
704 }
705
706 static struct ir3_register *
707 materialize_pcopy_src(const struct reg_or_immed *src,
708 struct ir3_builder *builder)
709 {
710 struct ir3_instruction *mov = ir3_build_instr(builder, OPC_MOV, 1, 1);
711 struct ir3_register *dst = __ssa_dst(mov);
712 dst->flags |= src->flags & IR3_REG_HALF;
713 struct ir3_register *mov_src = ir3_src_create(mov, INVALID_REG, src->flags);
714 set_src_val(mov_src, src);
715 mov->cat1.src_type = mov->cat1.dst_type =
716 (src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
717 return dst;
718 }
719
720 static void
721 spill(struct ra_spill_ctx *ctx, const struct reg_or_immed *val,
722 unsigned spill_slot, struct ir3_cursor cursor)
723 {
724 struct ir3_register *reg;
725 struct ir3_builder builder = ir3_builder_at(cursor);
726
727 /* If spilling an immed/const pcopy src, we need to actually materialize it
728 * first with a mov.
729 */
730 if (val->flags & (IR3_REG_CONST | IR3_REG_IMMED)) {
731 reg = materialize_pcopy_src(val, &builder);
732 } else {
733 reg = val->def;
734 reg->instr->flags &= ~IR3_INSTR_UNUSED;
735 }
736
737 d("spilling ssa_%u:%u to %u", reg->instr->serialno, reg->name,
738 spill_slot);
739
740 unsigned elems = reg_elems(reg);
741 struct ir3_instruction *spill =
742 ir3_build_instr(&builder, OPC_SPILL_MACRO, 0, 3);
743 ir3_src_create(spill, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg;
744 unsigned src_flags = reg->flags & (IR3_REG_HALF | IR3_REG_IMMED |
745 IR3_REG_CONST | IR3_REG_SSA |
746 IR3_REG_ARRAY);
747 struct ir3_register *src = ir3_src_create(spill, INVALID_REG, src_flags);
748 ir3_src_create(spill, INVALID_REG, IR3_REG_IMMED)->uim_val = elems;
749 spill->cat6.dst_offset = spill_slot;
750 spill->cat6.type = (reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
751
752 src->def = reg;
753 if (reg->flags & IR3_REG_ARRAY) {
754 src->size = reg->size;
755 src->array.id = reg->array.id;
756 src->array.offset = 0;
757 } else {
758 src->wrmask = reg->wrmask;
759 }
760 }
761
762 static void
763 spill_interval(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval,
764 struct ir3_cursor cursor)
765 {
766 if (interval->can_rematerialize && !interval->interval.reg->merge_set)
767 return;
768
769 spill(ctx, &interval->dst, get_spill_slot(ctx, interval->interval.reg),
770 cursor);
771 }
772
773 /* This is similar to "limit" in the paper. */
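/* Roughly: while the current half/full pressure is above the limit computed
 * by RA, walk the corresponding live-interval tree in the priority order
 * established by spill_interval_cmp() and spill/evict intervals that aren't
 * marked cant_spill until the pressure fits again.
 */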
774 static void
775 limit(struct ra_spill_ctx *ctx, struct ir3_cursor cursor)
776 {
777 if (ctx->cur_pressure.half > ctx->limit_pressure.half) {
778 d("cur half pressure %u exceeds %u", ctx->cur_pressure.half,
779 ctx->limit_pressure.half);
780 rb_tree_foreach_safe (struct ra_spill_interval, interval,
781 &ctx->half_live_intervals, half_node) {
782 d("trying ssa_%u:%u", interval->interval.reg->instr->serialno,
783 interval->interval.reg->name);
784 if (!interval->cant_spill) {
785 if (!interval->already_spilled)
786 spill_interval(ctx, interval, cursor);
787 ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
788 if (ctx->cur_pressure.half <= ctx->limit_pressure.half)
789 break;
790 }
791 }
792
793 assert(ctx->cur_pressure.half <= ctx->limit_pressure.half);
794 }
795
796 if (ctx->cur_pressure.full > ctx->limit_pressure.full) {
797 d("cur full pressure %u exceeds %u", ctx->cur_pressure.full,
798 ctx->limit_pressure.full);
799 rb_tree_foreach_safe (struct ra_spill_interval, interval,
800 &ctx->full_live_intervals, node) {
801 d("trying ssa_%u:%u", interval->interval.reg->instr->serialno,
802 interval->interval.reg->name);
803 if (!interval->cant_spill) {
804 if (!interval->already_spilled)
805 spill_interval(ctx, interval, cursor);
806 ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
807 if (ctx->cur_pressure.full <= ctx->limit_pressure.full)
808 break;
809 } else {
810 d("can't spill");
811 }
812 }
813
814 assert(ctx->cur_pressure.full <= ctx->limit_pressure.full);
815 }
816 }
817
818 /* There's a corner case where we reload a value which has overlapping live
819 * values already reloaded, either because it's the child of some other interval
820 * that was already reloaded or some of its children have already been
821 * reloaded. Because RA only expects overlapping source/dest intervals for meta
822 * instructions (split/collect), and we don't want to add register pressure by
823 * creating an entirely separate value, we need to add splits and collects to
824 * deal with this case. These splits/collects have to also have correct merge
825 * set information, so that it doesn't result in any actual code or register
826 * pressure in practice.
827 */
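/* A hypothetical sketch of that case: a vec2 ssa_1 = {x, y} is spilled, its y
 * component gets reloaded on its own, and later all of ssa_1 has to be
 * reloaded too. The child interval for y then overlaps the fresh reload, so
 * rewrite_src_interval() re-extracts y from the reloaded value with
 * split/collect instructions carrying the original merge set and offsets,
 * which RA should coalesce away so no real code or pressure results.
 */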
828
829 static void
830 add_to_merge_set(struct ir3_merge_set *set, struct ir3_register *def,
831 unsigned offset)
832 {
833 def->merge_set = set;
834 def->merge_set_offset = offset;
835 def->interval_start = set->interval_start + offset;
836 def->interval_end = set->interval_start + offset + reg_size(def);
837 }
838
839 static struct ir3_register *
840 split(struct ir3_register *def, unsigned offset, struct ir3_builder *builder)
841 {
842 if (reg_elems(def) == 1) {
843 assert(offset == 0);
844 return def;
845 }
846
847 assert(!(def->flags & IR3_REG_ARRAY));
848 assert(def->merge_set);
849 struct ir3_instruction *split =
850 ir3_build_instr(builder, OPC_META_SPLIT, 1, 1);
851 split->split.off = offset;
852 struct ir3_register *dst = __ssa_dst(split);
853 dst->flags |= def->flags & IR3_REG_HALF;
854 struct ir3_register *src = ir3_src_create(split, INVALID_REG, def->flags);
855 src->wrmask = def->wrmask;
856 src->def = def;
857 add_to_merge_set(def->merge_set, dst,
858 def->merge_set_offset + offset * reg_elem_size(def));
859 return dst;
860 }
861
862 static struct ir3_register *
863 extract(struct ir3_register *parent_def, unsigned offset, unsigned elems,
864 struct ir3_cursor cursor)
865 {
866 if (offset == 0 && elems == reg_elems(parent_def))
867 return parent_def;
868
869 struct ir3_builder builder = ir3_builder_at(cursor);
870 struct ir3_register *srcs[elems];
871 for (unsigned i = 0; i < elems; i++) {
872 srcs[i] = split(parent_def, offset + i, &builder);
873 }
874
875 struct ir3_instruction *collect =
876 ir3_build_instr(&builder, OPC_META_COLLECT, 1, elems);
877 struct ir3_register *dst = __ssa_dst(collect);
878 dst->flags |= parent_def->flags & IR3_REG_HALF;
879 dst->wrmask = MASK(elems);
880 add_to_merge_set(parent_def->merge_set, dst, parent_def->merge_set_offset);
881
882 for (unsigned i = 0; i < elems; i++) {
883 ir3_src_create(collect, INVALID_REG, parent_def->flags)->def = srcs[i];
884 }
885
886 return dst;
887 }
888
889 static struct ir3_register *
890 reload(struct ra_spill_ctx *ctx, struct ir3_register *reg,
891 struct ir3_cursor cursor)
892 {
893 unsigned spill_slot = get_spill_slot(ctx, reg);
894
895 d("reloading ssa_%u:%u from %u", reg->instr->serialno, reg->name,
896 spill_slot);
897
898 unsigned elems = reg_elems(reg);
899 struct ir3_instruction *reload =
900 ir3_instr_create_at(cursor, OPC_RELOAD_MACRO, 1, 3);
901 struct ir3_register *dst = __ssa_dst(reload);
902 dst->flags |= reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
903 /* The reload may be split into multiple pieces, and if the destination
904 * overlaps with the base register then it could get clobbered before the
905 * last ldp in the sequence. Note that we always reserve space for the base
906 * register throughout the whole program, so effectively extending its live
907 * range past the end of the instruction isn't a problem for our pressure
908 * accounting.
909 */
910 dst->flags |= IR3_REG_EARLY_CLOBBER;
911 ir3_src_create(reload, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg;
912 struct ir3_register *offset_reg =
913 ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED);
914 offset_reg->uim_val = spill_slot;
915 ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED)->uim_val = elems;
916 reload->cat6.type = (reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
917
918 if (reg->flags & IR3_REG_ARRAY) {
919 dst->array.offset = 0;
920 dst->array.id = reg->array.id;
921 dst->size = reg->size;
922 } else {
923 dst->wrmask = reg->wrmask;
924 }
925
926 dst->merge_set = reg->merge_set;
927 dst->merge_set_offset = reg->merge_set_offset;
928 dst->interval_start = reg->interval_start;
929 dst->interval_end = reg->interval_end;
930 return dst;
931 }
932
933 static void
934 rewrite_src_interval(struct ra_spill_ctx *ctx,
935 struct ra_spill_interval *interval,
936 struct ir3_register *def,
937 struct ir3_cursor cursor)
938 {
939 interval->dst.flags = def->flags;
940 interval->dst.def = def;
941 interval->needs_reload = false;
942
943 rb_tree_foreach (struct ra_spill_interval, child,
944 &interval->interval.children, interval.node) {
945 struct ir3_register *child_reg = child->interval.reg;
946 struct ir3_register *child_def =
947 extract(def, (child_reg->interval_start -
948 interval->interval.reg->interval_start) / reg_elem_size(def),
949 reg_elems(child_reg), cursor);
950 rewrite_src_interval(ctx, child, child_def, cursor);
951 }
952 }
953
954 static void
955 reload_def(struct ra_spill_ctx *ctx, struct ir3_register *def,
956 struct ir3_cursor cursor)
957 {
958 unsigned elems = reg_elems(def);
959 struct ra_spill_interval *interval = ctx->intervals[def->name];
960
961 struct ir3_reg_interval *ir3_parent = interval->interval.parent;
962
963 if (ir3_parent) {
964 struct ra_spill_interval *parent =
965 ir3_reg_interval_to_interval(ir3_parent);
966 if (!parent->needs_reload) {
967 interval->dst.flags = def->flags;
968 interval->dst.def = extract(
969 parent->dst.def, (def->interval_start - parent->dst.def->interval_start) /
970 reg_elem_size(def), elems, cursor);
971 return;
972 }
973 }
974
975 struct ir3_register *dst;
976 if (interval->can_rematerialize)
977 dst = rematerialize(def, cursor);
978 else
979 dst = reload(ctx, def, cursor);
980
981 rewrite_src_interval(ctx, interval, dst, cursor);
982 }
983
984 static void
985 reload_src(struct ra_spill_ctx *ctx, struct ir3_cursor cursor,
986 struct ir3_register *src)
987 {
988 struct ra_spill_interval *interval = ctx->intervals[src->def->name];
989
990 if (interval->needs_reload) {
991 reload_def(ctx, src->def, cursor);
992 }
993
994 ra_spill_interval_root(interval)->cant_spill = false;
995 }
996
997 static void
998 rewrite_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
999 struct ir3_register *src)
1000 {
1001 struct ra_spill_interval *interval = ctx->intervals[src->def->name];
1002
1003 set_src_val(src, &interval->dst);
1004 }
1005
1006 static void
1007 update_max_pressure(struct ra_spill_ctx *ctx)
1008 {
1009 d("pressure:");
1010 d("\tfull: %u", ctx->cur_pressure.full);
1011 d("\thalf: %u", ctx->cur_pressure.half);
1012 d("\tshared: %u", ctx->cur_pressure.shared);
1013 d("\tshared half: %u", ctx->cur_pressure.shared_half);
1014
1015 ctx->max_pressure.full =
1016 MAX2(ctx->max_pressure.full, ctx->cur_pressure.full);
1017 ctx->max_pressure.half =
1018 MAX2(ctx->max_pressure.half, ctx->cur_pressure.half);
1019 ctx->max_pressure.shared =
1020 MAX2(ctx->max_pressure.shared, ctx->cur_pressure.shared);
1021 ctx->max_pressure.shared_half =
1022 MAX2(ctx->max_pressure.shared_half, ctx->cur_pressure.shared_half);
1023 }
1024
1025 static void
1026 handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
1027 {
1028 ra_foreach_dst (dst, instr) {
1029 /* No need to handle instructions inserted while spilling. Most are
1030 * ignored automatically by virtue of being inserted before the current
1031 * instruction. However, for live-ins, we may insert extracts after the
1032 * phis. Trying to handle them ends badly as they don't have intervals
1033 * allocated.
1034 * Note: since liveness is calculated before spilling, original
1035 * instruction have a name while new ones don't.
1036 */
1037 if (!dst->name)
1038 return;
1039
1040 init_dst(ctx, dst);
1041 }
1042
1043 if (ctx->spilling) {
1044 ra_foreach_src (src, instr)
1045 insert_src(ctx, src);
1046 }
1047
1048 /* Handle tied and early-kill destinations. If a destination is tied to a
1049 * source and that source is live-through, then we need to allocate a new
1050 * register for the destination which is live-through itself and cannot
1051 * overlap the sources. Similarly early-kill destinations cannot overlap
1052 * sources.
1053 */
1054
1055 ra_foreach_dst (dst, instr) {
1056 struct ir3_register *tied_src = dst->tied;
1057 if ((tied_src && !(tied_src->flags & IR3_REG_FIRST_KILL)) ||
1058 (dst->flags & IR3_REG_EARLY_CLOBBER))
1059 insert_dst(ctx, dst);
1060 }
1061
1062 if (ctx->spilling)
1063 limit(ctx, ir3_before_instr(instr));
1064 else
1065 update_max_pressure(ctx);
1066
1067 if (ctx->spilling) {
1068 ra_foreach_src (src, instr) {
1069 reload_src(ctx, ir3_before_instr(instr), src);
1070 update_src_next_use(ctx, src);
1071 }
1072 }
1073
1074 ra_foreach_src (src, instr) {
1075 if (src->flags & IR3_REG_FIRST_KILL)
1076 remove_src_early(ctx, instr, src);
1077 }
1078
1079 ra_foreach_dst (dst, instr) {
1080 insert_dst(ctx, dst);
1081 }
1082
1083 if (ctx->spilling)
1084 limit(ctx, ir3_before_instr(instr));
1085 else
1086 update_max_pressure(ctx);
1087
1088 /* We have to remove sources before rewriting them so that we can lookup the
1089 * interval to remove before the source itself is changed.
1090 */
1091 ra_foreach_src (src, instr) {
1092 if (src->flags & IR3_REG_FIRST_KILL)
1093 remove_src(ctx, instr, src);
1094 }
1095
1096 if (ctx->spilling) {
1097 ra_foreach_src (src, instr) {
1098 rewrite_src(ctx, instr, src);
1099 }
1100 }
1101
1102 ra_foreach_dst (dst, instr) {
1103 finish_dst(ctx, dst);
1104 }
1105
1106 for (unsigned i = 0; i < instr->dsts_count; i++) {
1107 if (ra_reg_is_dst(instr->dsts[i]) &&
1108 (instr->dsts[i]->flags & IR3_REG_UNUSED))
1109 remove_dst(ctx, instr->dsts[i]);
1110 }
1111 }
1112
1113 static struct ra_spill_interval *
1114 create_temp_interval(struct ra_spill_ctx *ctx, struct ir3_register *def)
1115 {
1116 unsigned name = ctx->intervals_count++;
1117 unsigned offset = ctx->live->interval_offset;
1118
1119 /* This is kinda hacky, but we need to create a fake SSA def here that is
1120 * only used as part of the pcopy accounting. See below.
1121 */
1122 struct ir3_register *reg = rzalloc(ctx, struct ir3_register);
1123 *reg = *def;
1124 reg->name = name;
1125 reg->interval_start = offset;
1126 reg->interval_end = offset + reg_size(def);
1127 reg->merge_set = NULL;
1128
1129 ctx->intervals = reralloc(ctx, ctx->intervals, struct ra_spill_interval *,
1130 ctx->intervals_count);
1131 struct ra_spill_interval *interval = rzalloc(ctx, struct ra_spill_interval);
1132 ra_spill_interval_init(interval, reg);
1133 ctx->intervals[name] = interval;
1134 ctx->live->interval_offset += reg_size(def);
1135 return interval;
1136 }
1137
1138 /* In the sequence of copies generated (see below), would this source be killed?
1139 */
1140 static bool
1141 is_last_pcopy_src(struct ir3_instruction *pcopy, unsigned src_n)
1142 {
1143 struct ir3_register *src = pcopy->srcs[src_n];
1144 if (!(src->flags & IR3_REG_KILL))
1145 return false;
1146 for (unsigned j = src_n + 1; j < pcopy->srcs_count; j++) {
1147 if (pcopy->srcs[j]->def == src->def)
1148 return false;
1149 }
1150 return true;
1151 }
1152
1153 /* Parallel copies are different from normal instructions. The sources together
1154 * may be larger than the entire register file, so we cannot just reload every
1155 * source like normal, and indeed that probably wouldn't be a great idea.
1156 * Instead we essentially need to lower the parallel copy to "copies," just like
1157 * in the normal CSSA construction, although we implement the copies by
1158 * reloading and then possibly spilling values. We essentially just shuffle
1159 * around the sources until each source either (a) is live or (b) has the same
1160 * spill slot as its corresponding destination. We do this by decomposing the
1161 * copy into a series of copies, so:
1162 *
1163 * a, b, c = d, e, f
1164 *
1165 * becomes:
1166 *
1167 * d' = d
1168 * e' = e
1169 * f' = f
1170 * a = d'
1171 * b = e'
1172 * c = f'
1173 *
1174 * the temporary SSA values d', e', and f' never actually show up in the result.
1175 * They are only used for our internal accounting. They may, however, have their
1176 * own spill slot created for them. Similarly, we don't actually emit any copy
1177 * instructions, although we emit the spills/reloads that *would've* been
1178 * required if those copies were there.
1179 *
1180 * TODO: in order to reduce the number of temporaries and therefore spill slots,
1181 * we could instead do a more complicated analysis that considers the location
1182 * transfer graph.
1183 *
1184 * In addition, we actually remove the parallel copy and rewrite all its uses
1185 * (in the phi nodes) rather than rewrite its sources at the end. Recreating it
1186 * later turns out to be easier than keeping it up-to-date throughout this pass,
1187 * since we may have to remove entries for phi sources that are spilled and add
1188 * entries for live-outs that are spilled and reloaded, which can happen here
1189 * and then possibly be undone or done again when processing live-ins of the
1190 * successor block.
1191 */
1192
1193 static void
1194 handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy)
1195 {
1196 ra_foreach_dst (dst, pcopy) {
1197 struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
1198 ra_spill_interval_init(dst_interval, dst);
1199 }
1200
1201 foreach_src_n (src, i, pcopy) {
1202 struct ir3_register *dst = pcopy->dsts[i];
1203 if (!(dst->flags & IR3_REG_SSA))
1204 continue;
1205
1206 d("processing src %u", i);
1207
1208 /* Skip the intermediate copy for cases where the source is merged with
1209 * the destination. Crucially this means that we also don't reload/spill
1210 * it if it's been spilled, because it shares the same spill slot.
1211 */
1212 if ((src->flags & IR3_REG_SSA) && src->def->merge_set &&
1213 src->def->merge_set == dst->merge_set &&
1214 src->def->merge_set_offset == dst->merge_set_offset) {
1215 struct ra_spill_interval *src_interval = ctx->intervals[src->def->name];
1216 struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
1217 if (src_interval->interval.inserted) {
1218 update_src_next_use(ctx, src);
1219 if (is_last_pcopy_src(pcopy, i))
1220 ra_spill_ctx_remove(ctx, src_interval);
1221 dst_interval->cant_spill = true;
1222 ra_spill_ctx_insert(ctx, dst_interval);
1223 limit(ctx, ir3_before_instr(pcopy));
1224 dst_interval->cant_spill = false;
1225 dst_interval->dst = src_interval->dst;
1226 }
1227 } else if (src->flags & IR3_REG_SSA) {
1228 struct ra_spill_interval *temp_interval =
1229 create_temp_interval(ctx, dst);
1230 struct ir3_register *temp = temp_interval->interval.reg;
1231 temp_interval->next_use_distance = src->next_use;
1232
1233 insert_src(ctx, src);
1234 limit(ctx, ir3_before_instr(pcopy));
1235 reload_src(ctx, ir3_before_instr(pcopy), src);
1236 update_src_next_use(ctx, src);
1237 if (is_last_pcopy_src(pcopy, i))
1238 remove_src(ctx, pcopy, src);
1239 struct ra_spill_interval *src_interval =
1240 ctx->intervals[src->def->name];
1241 temp_interval->dst = src_interval->dst;
1242
1243 temp_interval->cant_spill = true;
1244 ra_spill_ctx_insert(ctx, temp_interval);
1245 limit(ctx, ir3_before_instr(pcopy));
1246 temp_interval->cant_spill = false;
1247
1248 src->flags = temp->flags;
1249 src->def = temp;
1250 }
1251 }
1252
1253 d("done with pcopy srcs");
1254
1255 foreach_src_n (src, i, pcopy) {
1256 struct ir3_register *dst = pcopy->dsts[i];
1257 if (!(dst->flags & IR3_REG_SSA))
1258 continue;
1259
1260 if ((src->flags & IR3_REG_SSA) && src->def->merge_set &&
1261 src->def->merge_set == dst->merge_set &&
1262 src->def->merge_set_offset == dst->merge_set_offset)
1263 continue;
1264
1265 struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
1266
1267 if (!(src->flags & IR3_REG_SSA)) {
1268 dst_interval->cant_spill = true;
1269 ra_spill_ctx_insert(ctx, dst_interval);
1270 limit(ctx, ir3_before_instr(pcopy));
1271 dst_interval->cant_spill = false;
1272
1273 assert(src->flags & (IR3_REG_CONST | IR3_REG_IMMED));
1274 if (src->flags & IR3_REG_CONST) {
1275 dst_interval->dst.flags = src->flags;
1276 dst_interval->dst.const_num = src->num;
1277 } else {
1278 dst_interval->dst.flags = src->flags;
1279 dst_interval->dst.uimm = src->uim_val;
1280 }
1281 } else {
1282 struct ra_spill_interval *temp_interval = ctx->intervals[src->def->name];
1283
1284 insert_src(ctx, src);
1285 limit(ctx, ir3_before_instr(pcopy));
1286 reload_src(ctx, ir3_before_instr(pcopy), src);
1287 remove_src(ctx, pcopy, src);
1288
1289 dst_interval->dst = temp_interval->dst;
1290 ra_spill_ctx_insert(ctx, dst_interval);
1291 }
1292 }
1293
1294 pcopy->flags |= IR3_INSTR_UNUSED;
1295 }
1296
1297 static void
1298 handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
1299 {
1300 if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1301 return;
1302
1303 init_dst(ctx, instr->dsts[0]);
1304 insert_dst(ctx, instr->dsts[0]);
1305 finish_dst(ctx, instr->dsts[0]);
1306 }
1307
1308 static void
1309 remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
1310 {
1311 if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1312 return;
1313
1314 if (instr->opc == OPC_META_TEX_PREFETCH) {
1315 ra_foreach_src (src, instr) {
1316 if (src->flags & IR3_REG_FIRST_KILL)
1317 remove_src(ctx, instr, src);
1318 }
1319 }
1320 if (instr->dsts[0]->flags & IR3_REG_UNUSED)
1321 remove_dst(ctx, instr->dsts[0]);
1322 }
1323
1324 static void
1325 handle_live_in(struct ra_spill_ctx *ctx, struct ir3_block *block,
1326 struct ir3_register *def)
1327 {
1328 struct ra_spill_interval *interval = ctx->intervals[def->name];
1329 ra_spill_interval_init(interval, def);
1330 if (ctx->spilling) {
1331 interval->next_use_distance =
1332 ctx->blocks[block->index].next_use_start[def->name];
1333 }
1334
1335 ra_spill_ctx_insert(ctx, interval);
1336 }
1337
1338 static bool
1339 is_live_in_phi(struct ir3_register *def, struct ir3_block *block)
1340 {
1341 return def->instr->opc == OPC_META_PHI && def->instr->block == block;
1342 }
1343
1344 static bool
1345 is_live_in_pred(struct ra_spill_ctx *ctx, struct ir3_register *def,
1346 struct ir3_block *block, unsigned pred_idx)
1347 {
1348 struct ir3_block *pred = block->predecessors[pred_idx];
1349 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1350 if (is_live_in_phi(def, block)) {
1351 def = def->instr->srcs[pred_idx]->def;
1352 if (!def)
1353 return false;
1354 }
1355
1356 return _mesa_hash_table_search(state->remap, def);
1357 }
1358
1359 static bool
1360 is_live_in_undef(struct ir3_register *def,
1361 struct ir3_block *block, unsigned pred_idx)
1362 {
1363 if (!is_live_in_phi(def, block))
1364 return false;
1365
1366 return !def->instr->srcs[pred_idx]->def;
1367 }
1368
1369 static struct reg_or_immed *
1370 read_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
1371 struct ir3_block *block, unsigned pred_idx)
1372 {
1373 struct ir3_block *pred = block->predecessors[pred_idx];
1374 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1375
1376 if (is_live_in_phi(def, block)) {
1377 def = def->instr->srcs[pred_idx]->def;
1378 if (!def)
1379 return NULL;
1380 }
1381
1382 struct hash_entry *entry = _mesa_hash_table_search(state->remap, def);
1383 if (entry)
1384 return entry->data;
1385 else
1386 return NULL;
1387 }
1388
1389 static bool
1390 is_live_in_all_preds(struct ra_spill_ctx *ctx, struct ir3_register *def,
1391 struct ir3_block *block)
1392 {
1393 for (unsigned i = 0; i < block->predecessors_count; i++) {
1394 if (!is_live_in_pred(ctx, def, block, i))
1395 return false;
1396 }
1397
1398 return true;
1399 }
1400
1401 static void
1402 spill_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
1403 struct ir3_block *block)
1404 {
1405 for (unsigned i = 0; i < block->predecessors_count; i++) {
1406 struct ir3_block *pred = block->predecessors[i];
1407 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1408
1409 if (!state->visited)
1410 continue;
1411
1412 struct reg_or_immed *pred_def = read_live_in(ctx, def, block, i);
1413 if (pred_def) {
1414 spill(ctx, pred_def, get_spill_slot(ctx, def),
1415 ir3_before_terminator(pred));
1416 }
1417 }
1418 }
1419
1420 static void
1421 spill_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block)
1422 {
1423 bool all_preds_visited = true;
1424 for (unsigned i = 0; i < block->predecessors_count; i++) {
1425 struct ir3_block *pred = block->predecessors[i];
1426 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1427 if (!state->visited) {
1428 all_preds_visited = false;
1429 break;
1430 }
1431 }
1432
1433 /* Note: in the paper they explicitly spill live-through values first, but we
1434 * should be doing that automatically by virtue of picking the largest
1435 * distance due to the extra distance added to edges out of loops.
1436 *
1437 * TODO: Keep track of pressure in each block and preemptively spill
1438 * live-through values as described in the paper to avoid spilling them
1439 * inside the loop.
1440 */
1441
1442 if (ctx->cur_pressure.half > ctx->limit_pressure.half) {
1443 rb_tree_foreach_safe (struct ra_spill_interval, interval,
1444 &ctx->half_live_intervals, half_node) {
1445 if (all_preds_visited &&
1446 is_live_in_all_preds(ctx, interval->interval.reg, block))
1447 continue;
1448 if (interval->interval.reg->merge_set ||
1449 !interval->can_rematerialize)
1450 spill_live_in(ctx, interval->interval.reg, block);
1451 ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
1452 if (ctx->cur_pressure.half <= ctx->limit_pressure.half)
1453 break;
1454 }
1455 }
1456
1457 if (ctx->cur_pressure.full > ctx->limit_pressure.full) {
1458 rb_tree_foreach_safe (struct ra_spill_interval, interval,
1459 &ctx->full_live_intervals, node) {
1460 if (all_preds_visited &&
1461 is_live_in_all_preds(ctx, interval->interval.reg, block))
1462 continue;
1463 spill_live_in(ctx, interval->interval.reg, block);
1464 ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
1465 if (ctx->cur_pressure.full <= ctx->limit_pressure.full)
1466 break;
1467 }
1468 }
1469 }
1470
1471 static void
1472 live_in_rewrite(struct ra_spill_ctx *ctx,
1473 struct ra_spill_interval *interval,
1474 struct reg_or_immed *new_val,
1475 struct ir3_block *block, unsigned pred_idx)
1476 {
1477 struct ir3_block *pred = block->predecessors[pred_idx];
1478 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1479 struct ir3_register *def = interval->interval.reg;
1480 if (is_live_in_phi(def, block)) {
1481 def = def->instr->srcs[pred_idx]->def;
1482 }
1483
1484 if (def)
1485 _mesa_hash_table_insert(state->remap, def, new_val);
1486 }
1487
1488 static void
1489 reload_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
1490 struct ir3_block *block)
1491 {
1492 struct ra_spill_interval *interval = ctx->intervals[def->name];
1493 for (unsigned i = 0; i < block->predecessors_count; i++) {
1494 struct ir3_block *pred = block->predecessors[i];
1495 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1496 if (!state->visited)
1497 continue;
1498
1499 if (is_live_in_undef(def, block, i))
1500 continue;
1501
1502 struct reg_or_immed *new_val = read_live_in(ctx, def, block, i);
1503
1504 if (!new_val) {
1505 new_val = ralloc(ctx, struct reg_or_immed);
1506 if (interval->can_rematerialize)
1507 new_val->def = rematerialize(def, ir3_before_terminator(pred));
1508 else
1509 new_val->def = reload(ctx, def, ir3_before_terminator(pred));
1510 new_val->flags = new_val->def->flags;
1511 }
1512 live_in_rewrite(ctx, interval, new_val, block, i);
1513 }
1514 }
1515
1516 static void
1517 reload_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block)
1518 {
1519 rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
1520 interval.node) {
1521 reload_live_in(ctx, interval->interval.reg, block);
1522 }
1523 }
1524
1525 static void
1526 add_live_in_phi(struct ra_spill_ctx *ctx, struct ir3_register *def,
1527 struct ir3_register *parent_def, struct ir3_block *block)
1528 {
1529 struct ra_spill_interval *interval = ctx->intervals[def->name];
1530 if (!interval->interval.inserted)
1531 return;
1532
1533 bool needs_phi = false;
1534 struct ir3_register *cur_def = NULL;
1535 for (unsigned i = 0; i < block->predecessors_count; i++) {
1536 struct ir3_block *pred = block->predecessors[i];
1537 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1538
1539 if (!state->visited) {
1540 needs_phi = true;
1541 break;
1542 }
1543
1544 struct hash_entry *entry =
1545 _mesa_hash_table_search(state->remap, def);
1546 assert(entry);
1547 struct reg_or_immed *pred_val = entry->data;
1548 if ((pred_val->flags & (IR3_REG_IMMED | IR3_REG_CONST)) ||
1549 !pred_val->def ||
1550 (cur_def && cur_def != pred_val->def)) {
1551 needs_phi = true;
1552 break;
1553 }
1554 cur_def = pred_val->def;
1555 }
1556
1557 if (!needs_phi) {
1558 interval->dst.def = cur_def;
1559 interval->dst.flags = cur_def->flags;
1560
1561 rb_tree_foreach (struct ra_spill_interval, child,
1562 &interval->interval.children, interval.node) {
1563 add_live_in_phi(ctx, child->interval.reg, cur_def, block);
1564 }
1565
1566 return;
1567 }
1568
1569 if (parent_def) {
1570 /* We have a child interval that needs a phi but whose parent does not
1571 * need one (otherwise parent_def would be NULL). Just extract the child
1572 * from the parent without creating a phi for the child.
1573 */
1574 unsigned offset = (def->interval_start - parent_def->interval_start) /
1575 reg_elem_size(def);
1576 struct ir3_register *extracted =
1577 extract(parent_def, offset, reg_elems(def), ir3_after_phis(block));
1578 rewrite_src_interval(ctx, interval, extracted,
1579 ir3_after_instr(extracted->instr));
1580 return;
1581 }
1582
1583 struct ir3_instruction *phi = ir3_instr_create_at(
1584 ir3_before_block(block), OPC_META_PHI, 1, block->predecessors_count);
1585 struct ir3_register *dst = __ssa_dst(phi);
1586 dst->flags |= def->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
1587 dst->size = def->size;
1588 dst->wrmask = def->wrmask;
1589
1590 dst->interval_start = def->interval_start;
1591 dst->interval_end = def->interval_end;
1592 dst->merge_set = def->merge_set;
1593 dst->merge_set_offset = def->merge_set_offset;
1594
1595 for (unsigned i = 0; i < block->predecessors_count; i++) {
1596 struct ir3_block *pred = block->predecessors[i];
1597 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1598 struct ir3_register *src = ir3_src_create(phi, INVALID_REG, dst->flags);
1599 src->size = def->size;
1600 src->wrmask = def->wrmask;
1601
1602 if (state->visited) {
1603 struct hash_entry *entry =
1604 _mesa_hash_table_search(state->remap, def);
1605 assert(entry);
1606 struct reg_or_immed *new_val = entry->data;
1607 set_src_val(src, new_val);
1608 } else {
1609 src->def = def;
1610 }
1611 }
1612
1613 interval->dst.def = dst;
1614 interval->dst.flags = dst->flags;
1615 rewrite_src_interval(ctx, interval, dst, ir3_after_phis(block));
1616 }
1617
1618 static void
add_live_in_phis(struct ra_spill_ctx * ctx,struct ir3_block * block)1619 add_live_in_phis(struct ra_spill_ctx *ctx, struct ir3_block *block)
1620 {
1621 rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
1622 interval.node) {
1623 if (BITSET_TEST(ctx->live->live_in[block->index],
1624 interval->interval.reg->name)) {
1625 add_live_in_phi(ctx, interval->interval.reg, NULL, block);
1626 }
1627 }
1628 }
1629
1630 /* When spilling a block with a single predecessors, the pred may have other
1631 * successors so we can't choose what's live in and we can't spill/restore
1632 * anything. Just make the inserted intervals exactly match the predecessor. If
1633 * it wasn't live in the predecessor then it must've already been spilled. Also,
1634 * there are no phi nodes and no live-ins.
1635 */
1636 static void
spill_single_pred_live_in(struct ra_spill_ctx * ctx,struct ir3_block * block)1637 spill_single_pred_live_in(struct ra_spill_ctx *ctx,
1638 struct ir3_block *block)
1639 {
1640 unsigned name;
1641 BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
1642 ctx->live->definitions_count) {
1643 struct ir3_register *reg = ctx->live->definitions[name];
1644 struct ra_spill_interval *interval = ctx->intervals[reg->name];
1645 struct reg_or_immed *val = read_live_in(ctx, reg, block, 0);
1646 if (val)
1647 interval->dst = *val;
1648 else
1649 ra_spill_ctx_remove(ctx, interval);
1650 }
1651 }
1652
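/* Rewrite the sources of an existing phi to point at whatever their values
 * were remapped to at the end of each visited predecessor. If the phi's
 * destination was spilled (its interval is no longer inserted), the phi is
 * dead and gets marked unused.
 */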
static void
rewrite_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *phi,
            struct ir3_block *block)
{
   if (!(phi->dsts[0]->flags & IR3_REG_SSA))
      return;

   if (!ctx->intervals[phi->dsts[0]->name]->interval.inserted) {
      phi->flags |= IR3_INSTR_UNUSED;
      return;
   }

   for (unsigned i = 0; i < block->predecessors_count; i++) {
      struct ir3_block *pred = block->predecessors[i];
      struct ra_spill_block_state *state = &ctx->blocks[pred->index];

      if (!state->visited)
         continue;

      struct ir3_register *src = phi->srcs[i];
      if (!src->def)
         continue;

      struct hash_entry *entry =
         _mesa_hash_table_search(state->remap, src->def);
      assert(entry);
      struct reg_or_immed *new_val = entry->data;
      set_src_val(src, new_val);
   }
}

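/* Used when the successor has already been visited (a loop backedge). Values
 * the successor doesn't expect to be live get spilled before the terminator,
 * unless they can simply be rematerialized (and aren't part of a merge set),
 * and are removed from the set of active intervals.
 */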
static void
spill_live_out(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval,
               struct ir3_block *block)
{
   struct ir3_register *def = interval->interval.reg;

   if (interval->interval.reg->merge_set ||
       !interval->can_rematerialize) {
      spill(ctx, &interval->dst, get_spill_slot(ctx, def),
            ir3_before_terminator(block));
   }
   ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
}

static void
spill_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
{
   struct ra_spill_block_state *state = &ctx->blocks[block->index];
   rb_tree_foreach_safe (struct ra_spill_interval, interval,
                         &ctx->reg_ctx.intervals, interval.node) {
      if (!BITSET_TEST(state->live_out, interval->interval.reg->name)) {
         spill_live_out(ctx, interval, block);
      }
   }
}

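/* Reload values that the already-visited successor expects to be live out of
 * this block but that aren't currently in a register, before the terminator.
 */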
static void
reload_live_out(struct ra_spill_ctx *ctx, struct ir3_register *def,
                struct ir3_block *block)
{
   struct ra_spill_interval *interval = ctx->intervals[def->name];
   ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);

   reload_def(ctx, def, ir3_before_terminator(block));
}

static void
reload_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
{
   struct ra_spill_block_state *state = &ctx->blocks[block->index];
   unsigned name;
   BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) {
      struct ir3_register *reg = ctx->live->definitions[name];
      struct ra_spill_interval *interval = ctx->intervals[name];
      if (!interval->interval.inserted)
         reload_live_out(ctx, reg, block);
   }
}

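/* Point the successor's phi sources for this block at the values they are
 * currently mapped to here. Only called when the single successor has already
 * been visited (a loop backedge).
 */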
static void
update_live_out_phis(struct ra_spill_ctx *ctx, struct ir3_block *block)
{
   assert(!block->successors[1]);
   struct ir3_block *succ = block->successors[0];
   unsigned pred_idx = ir3_block_get_pred_index(succ, block);

   foreach_instr (instr, &succ->instr_list) {
      if (instr->opc != OPC_META_PHI)
         break;

      struct ir3_register *def = instr->srcs[pred_idx]->def;
      if (!def)
         continue;

      struct ra_spill_interval *interval = ctx->intervals[def->name];
      if (!interval->interval.inserted)
         continue;
      set_src_val(instr->srcs[pred_idx], &interval->dst);
   }
}

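/* For predecessors that haven't been visited yet (loop backedges), record
 * which values we decided should be live at the end of them, translating
 * live-in phis to the corresponding per-predecessor source. This is what
 * spill_live_outs()/reload_live_outs() consume once we get around to visiting
 * the predecessor.
 */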
static void
record_pred_live_out(struct ra_spill_ctx *ctx,
                     struct ra_spill_interval *interval,
                     struct ir3_block *block, unsigned pred_idx)
{
   struct ir3_block *pred = block->predecessors[pred_idx];
   struct ra_spill_block_state *state = &ctx->blocks[pred->index];

   struct ir3_register *def = interval->interval.reg;
   if (is_live_in_phi(def, block)) {
      def = def->instr->srcs[pred_idx]->def;
   }
   BITSET_SET(state->live_out, def->name);

   rb_tree_foreach (struct ra_spill_interval, child,
                    &interval->interval.children, interval.node) {
      record_pred_live_out(ctx, child, block, pred_idx);
   }
}

static void
record_pred_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      struct ir3_block *pred = block->predecessors[i];
      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
      if (state->visited)
         continue;

      state->live_out = rzalloc_array(ctx, BITSET_WORD,
                                      BITSET_WORDS(ctx->live->definitions_count));

      rb_tree_foreach (struct ra_spill_interval, interval,
                       &ctx->reg_ctx.intervals, interval.node) {
         record_pred_live_out(ctx, interval, block, i);
      }
   }
}

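/* Record, for every value still in a register at the end of the block, the
 * def (or const/immed) it is currently mapped to. Successor blocks use this
 * remap table to rewrite their live-ins and phi sources.
 */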
static void
record_live_out(struct ra_spill_ctx *ctx,
                struct ra_spill_block_state *state,
                struct ra_spill_interval *interval)
{
   if (!(interval->dst.flags & IR3_REG_SSA) ||
       interval->dst.def) {
      struct reg_or_immed *val = ralloc(ctx, struct reg_or_immed);
      *val = interval->dst;
      _mesa_hash_table_insert(state->remap, interval->interval.reg, val);
   }
   rb_tree_foreach (struct ra_spill_interval, child,
                    &interval->interval.children, interval.node) {
      record_live_out(ctx, state, child);
   }
}

static void
record_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
{
   struct ra_spill_block_state *state = &ctx->blocks[block->index];
   state->remap = _mesa_pointer_hash_table_create(ctx);

   rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
                    interval.node) {
      record_live_out(ctx, state, interval);
   }
}

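/* Process a single block: set up the live-in state, handle spilling/reloading
 * across the block boundary, walk the instructions, and record the live-out
 * state for successors. When not spilling, the spill/reload steps are skipped
 * and the walk is used to track register pressure.
 */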
static void
handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block)
{
   memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure));
   rb_tree_init(&ctx->reg_ctx.intervals);
   rb_tree_init(&ctx->full_live_intervals);
   rb_tree_init(&ctx->half_live_intervals);

   unsigned name;
   BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
                       ctx->live->definitions_count) {
      struct ir3_register *reg = ctx->live->definitions[name];
      handle_live_in(ctx, block, reg);
   }

   foreach_instr (instr, &block->instr_list) {
      if (instr->opc != OPC_META_PHI && instr->opc != OPC_META_INPUT &&
          instr->opc != OPC_META_TEX_PREFETCH)
         break;
      handle_input_phi(ctx, instr);
   }

   if (ctx->spilling) {
      if (block->predecessors_count == 1 &&
          block->predecessors[0]->successors[1]) {
         spill_single_pred_live_in(ctx, block);
      } else {
         spill_live_ins(ctx, block);
         reload_live_ins(ctx, block);
         record_pred_live_outs(ctx, block);
         foreach_instr (instr, &block->instr_list) {
            if (instr->opc != OPC_META_PHI)
               break;
            rewrite_phi(ctx, instr, block);
         }
         add_live_in_phis(ctx, block);
      }
   } else {
      update_max_pressure(ctx);
   }

   foreach_instr (instr, &block->instr_list) {
      di(instr, "processing");

      if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT ||
          instr->opc == OPC_META_TEX_PREFETCH)
         remove_input_phi(ctx, instr);
      else if (ctx->spilling && instr->opc == OPC_META_PARALLEL_COPY)
         handle_pcopy(ctx, instr);
      else if (ctx->spilling && instr->opc == OPC_MOV &&
               instr->dsts[0] == ctx->base_reg)
         /* skip */;
      else
         handle_instr(ctx, instr);
   }

   if (ctx->spilling && block->successors[0]) {
      struct ra_spill_block_state *state =
         &ctx->blocks[block->successors[0]->index];
      if (state->visited) {
         assert(!block->successors[1]);

         spill_live_outs(ctx, block);
         reload_live_outs(ctx, block);
         update_live_out_phis(ctx, block);
      }
   }

   if (ctx->spilling) {
      record_live_outs(ctx, block);
      ctx->blocks[block->index].visited = true;
   }
}

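/* If every source of the phi (ignoring sources that point back at the phi
 * itself) is the same def, the phi is redundant: stash that def in phi->data
 * and mark the phi unused. Returns true if the phi was simplified.
 */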
static bool
simplify_phi_node(struct ir3_instruction *phi)
{
   struct ir3_register *def = NULL;
   foreach_src (src, phi) {
      /* Ignore phi sources which point to the phi itself. */
      if (src->def == phi->dsts[0])
         continue;
      /* If it's undef or it doesn't match the previous sources, bail */
      if (!src->def || (def && def != src->def))
         return false;
      def = src->def;
   }

   phi->data = def;
   phi->flags |= IR3_INSTR_UNUSED;
   return true;
}

static struct ir3_register *
simplify_phi_def(struct ir3_register *def)
{
   if (def->instr->opc == OPC_META_PHI) {
      struct ir3_instruction *phi = def->instr;

      /* Note: this function is always called at least once after visiting the
       * phi, so either there has been a simplified phi in the meantime, in
       * which case we will set progress=true and visit the definition again,
       * or phi->data already has the most up-to-date value. Therefore we
       * don't have to recursively check phi->data.
       */
      if (phi->data)
         return phi->data;
   }

   return def;
}

static void
simplify_phi_srcs(struct ir3_instruction *instr)
{
   foreach_src (src, instr) {
      if (src->def)
         src->def = simplify_phi_def(src->def);
   }
}

/* We insert phi nodes for all live-ins of loops in case we need to split the
 * live range. This pass cleans that up for the case where the live range
 * didn't actually need to be split.
 */
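/* For example, a value "x" that is live across a loop but never actually
 * spilled inside it typically ends up with a header phi of the form
 *
 *    x' = phi(x, x')
 *
 * whose backedge source is the phi itself. Every non-self source is just "x",
 * so the phi gets replaced by "x" and marked unused.
 */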
static void
simplify_phi_nodes(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (instr->opc != OPC_META_PHI)
            break;
         instr->data = NULL;
      }
   }

   bool progress;
   do {
      progress = false;
      foreach_block (block, &ir->block_list) {
         foreach_instr (instr, &block->instr_list) {
            if (instr->opc == OPC_META_PHI || (instr->flags & IR3_INSTR_UNUSED))
               continue;

            simplify_phi_srcs(instr);
         }

         /* Visit phi nodes in the successors to make sure that phi sources
          * are always visited at least once after visiting the definition
          * they point to. See note in simplify_phi_def() for why this is
          * necessary.
          */
         for (unsigned i = 0; i < 2; i++) {
            struct ir3_block *succ = block->successors[i];
            if (!succ)
               continue;
            foreach_instr (instr, &succ->instr_list) {
               if (instr->opc != OPC_META_PHI)
                  break;
               if (instr->flags & IR3_INSTR_UNUSED) {
                  if (instr->data)
                     instr->data = simplify_phi_def(instr->data);
               } else {
                  simplify_phi_srcs(instr);
                  progress |= simplify_phi_node(instr);
               }
            }
         }
      }
   } while (progress);
}

static void
unmark_dead(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         instr->flags &= ~IR3_INSTR_UNUSED;
      }
   }
}

/* Simple pass to remove now-dead phi nodes and pcopy instructions. We mark
 * which ones are dead along the way, so there's nothing to compute here.
 */
static void
cleanup_dead(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->flags & IR3_INSTR_UNUSED) {
            if (instr->opc == OPC_META_PARALLEL_COPY) {
               /* There may be non-SSA shared copies; we need to preserve
                * these.
                */
               for (unsigned i = 0; i < instr->dsts_count;) {
                  if (instr->dsts[i]->flags & IR3_REG_SSA) {
                     instr->dsts[i] = instr->dsts[--instr->dsts_count];
                     instr->srcs[i] = instr->srcs[--instr->srcs_count];
                  } else {
                     i++;
                  }
               }

               if (instr->dsts_count == 0)
                  list_delinit(&instr->node);
            } else {
               list_delinit(&instr->node);
            }
         }
      }
   }
}

/* Deal with merge sets after spilling. Spilling generally leaves the merge
 * sets in a mess, and even if we properly cleaned up after ourselves, we
 * would want to recompute the merge sets afterward anyway. That's because
 * spilling/reloading can "break up" phi webs and split/collect webs so that
 * allocating them to the same register no longer gives any benefit. For
 * example, imagine we have this:
 *
 *    if (...) {
 *       foo = ...
 *    } else {
 *       bar = ...
 *    }
 *    baz = phi(foo, bar)
 *
 * and we spill "baz":
 *
 *    if (...) {
 *       foo = ...
 *       spill(foo)
 *    } else {
 *       bar = ...
 *       spill(bar)
 *    }
 *    baz = reload()
 *
 * now foo, bar, and baz don't have to be allocated to the same register. How
 * exactly the merge sets change can be complicated, so it's easier just to
 * recompute them.
 *
 * However, there's a wrinkle in this: those same merge sets determine the
 * register pressure, due to multiple values inhabiting the same register! And
 * we assume that this sharing happens when spilling. Therefore we need a
 * three-step procedure:
 *
 * 1. Drop the original merge sets.
 * 2. Calculate which values *must* be merged, being careful to only use the
 *    interval information which isn't trashed by spilling, and forcibly merge
 *    them.
 * 3. Let ir3_merge_regs() finish the job, including recalculating the
 *    intervals.
 */

static void
fixup_merge_sets(struct ir3_liveness *live, struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         foreach_dst (dst, instr) {
            dst->merge_set = NULL;
            dst->merge_set_offset = 0;
         }
      }
   }

   ir3_index_instrs_for_merge_sets(ir);

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (instr->opc != OPC_META_SPLIT &&
             instr->opc != OPC_META_COLLECT)
            continue;

         struct ir3_register *dst = instr->dsts[0];
         ra_foreach_src (src, instr) {
            if (!(src->flags & IR3_REG_KILL) &&
                src->def->interval_start < dst->interval_end &&
                dst->interval_start < src->def->interval_end) {
               ir3_force_merge(dst, src->def,
                               src->def->interval_start - dst->interval_start);
            }
         }
      }
   }

   ir3_merge_regs(live, ir);
}

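/* Non-spilling entry point: walk the program once to compute the maximum
 * register pressure without modifying the IR.
 */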
void
ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live,
                  struct ir3_pressure *max_pressure)
{
   struct ra_spill_ctx *ctx = rzalloc(NULL, struct ra_spill_ctx);
   spill_ctx_init(ctx, v, live);

   foreach_block (block, &v->ir->block_list) {
      handle_block(ctx, block);
   }

   assert(ctx->cur_pressure.full == 0);
   assert(ctx->cur_pressure.half == 0);
   assert(ctx->cur_pressure.shared == 0);
   assert(ctx->cur_pressure.shared_half == 0);

   *max_pressure = ctx->max_pressure;
   ralloc_free(ctx);
}

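/* Spilling entry point: rewrite the IR so that register pressure stays within
 * limit_pressure, growing the variant's pvtmem to hold the spill slots, and
 * recompute liveness and the merge sets afterwards since both are invalidated
 * by the rewrite.
 */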
bool
ir3_spill(struct ir3 *ir, struct ir3_shader_variant *v,
          struct ir3_liveness **live,
          const struct ir3_pressure *limit_pressure)
{
   void *mem_ctx = ralloc_parent(*live);
   struct ra_spill_ctx *ctx = rzalloc(mem_ctx, struct ra_spill_ctx);
   spill_ctx_init(ctx, v, *live);

   ctx->spilling = true;

   ctx->blocks = rzalloc_array(ctx, struct ra_spill_block_state,
                               ctx->live->block_count);
   rb_tree_init(&ctx->full_live_intervals);
   rb_tree_init(&ctx->half_live_intervals);

   ctx->limit_pressure = *limit_pressure;
   ctx->spill_slot = v->pvtmem_size;

   add_base_reg(ctx, ir);
   compute_next_distance(ctx, ir);

   unmark_dead(ir);

   foreach_block (block, &ir->block_list) {
      handle_block(ctx, block);
   }

   simplify_phi_nodes(ir);

   cleanup_dead(ir);

   ir3_create_parallel_copies(ir);

   /* After this point, we're done mutating the IR. Liveness has been trashed,
    * so recalculate it. We'll need it for recalculating the merge sets.
    */
   ralloc_free(ctx->live);
   *live = ir3_calc_liveness(mem_ctx, ir);

   fixup_merge_sets(*live, ir);

   v->pvtmem_size = ctx->spill_slot;
   ralloc_free(ctx);

   return true;
}