1 /*
2 * Copyright (C) 2021 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include "util/rb_tree.h"
25 #include "ir3_ra.h"
26 #include "ir3_shader.h"
27
28 /*
29 * This pass does two things:
30 *
31 * 1. Calculates the maximum register pressure. To do this, we need to use the
32 * exact same technique that RA uses for combining meta_split instructions
33 * with their sources, so that our calculation agrees with RA.
34 * 2. Spills when the register pressure exceeds a limit calculated by RA.
35 * The implementation is based on "Register Spilling and Live-Range Splitting
36 * for SSA-Form Programs" by Braun and Hack, although again care has to be
37 * taken to handle combining split/collect instructions.
38 */
39
40 struct reg_or_immed {
41 unsigned flags;
42 union {
43 struct ir3_register *def;
44 uint32_t uimm;
45 unsigned const_num;
46 };
47 };
48
49 struct ra_spill_interval {
50 struct ir3_reg_interval interval;
51
52 struct rb_node node;
53 struct rb_node half_node;
54
55 /* The current SSA value/const/immed this source is mapped to. */
56 struct reg_or_immed dst;
57
58 /* When computing use distances we use the distance relative to the start
59 * of the block. So, for example, a value that's defined in cycle 5 of the
60 * block and used 6 cycles later will always have a next_use_distance of 11
61 * until we reach that use.
62 */
63 unsigned next_use_distance;
64
65 /* Whether this value was reloaded and therefore doesn't need to be
66 * spilled again. Corresponds to the S set in the paper.
67 */
68 bool already_spilled;
69
70 /* We need to add sources early for accounting purposes, but we have to
71 * insert the reload code for them last. Keep track of whether this interval
72 * needs to be reloaded later.
73 */
74 bool needs_reload;
75
76 /* Keep track of whether this interval currently can't be spilled because:
77 * - It or one of its children is a source and we're making space for
78 * sources.
79 * - It is a destination and we're making space for destinations.
80 */
81 bool cant_spill;
82
83 /* Whether this interval can be rematerialized. */
84 bool can_rematerialize;
85 };
86
87 struct ra_spill_block_state {
88 unsigned *next_use_end;
89 unsigned *next_use_start;
90
91 unsigned cycles;
92
93 /* Map from SSA def to reg_or_immed it is mapped to at the end of the block.
94 * This map only contains values which we didn't spill, so it also serves as
95 * a record of the new live-out set for this block.
96 */
97 struct hash_table *remap;
98
99 /* For blocks whose successors are visited first (i.e. loop backedges), which
100 * values should be live at the end.
101 */
102 BITSET_WORD *live_out;
103
104 bool visited;
105 };
106
107 struct ra_spill_ctx {
108 struct ir3_reg_ctx reg_ctx;
109
110 struct ra_spill_interval **intervals;
111 unsigned intervals_count;
112
113 /* rb tree of live intervals that we can spill, ordered by next-use distance.
114 * full_live_intervals contains the full and half intervals in the merged_regs
115 * case. We use this list to determine what to spill.
116 */
117 struct rb_tree full_live_intervals;
118 struct rb_tree half_live_intervals;
119
120 struct ir3_pressure cur_pressure, max_pressure;
121
122 struct ir3_pressure limit_pressure;
123
124 /* When spilling, we need to reserve a register to serve as the zero'd
125 * "base". For simplicity we reserve a register at the beginning so that it's
126 * always available.
127 */
128 struct ir3_register *base_reg;
129
130 /* Current pvtmem offset in bytes. */
131 unsigned spill_slot;
132
133 struct ir3_liveness *live;
134
135 const struct ir3_compiler *compiler;
136
137 struct ra_spill_block_state *blocks;
138
139 bool spilling;
140
141 bool merged_regs;
142 };
143
144 static void
add_base_reg(struct ra_spill_ctx *ctx, struct ir3 *ir)
146 {
147 struct ir3_block *start = ir3_start_block(ir);
148
149 /* We need to stick it after any meta instructions which need to be first. */
150 struct ir3_instruction *after = NULL;
151 foreach_instr (instr, &start->instr_list) {
152 if (instr->opc != OPC_META_INPUT &&
153 instr->opc != OPC_META_TEX_PREFETCH) {
154 after = instr;
155 break;
156 }
157 }
158
159 struct ir3_instruction *mov = create_immed(start, 0);
160
161 if (after)
162 ir3_instr_move_before(mov, after);
163
164 ctx->base_reg = mov->dsts[0];
165
166 /* We don't create an interval, etc. for the base reg, so just lower the
167 * register pressure limit to account for it. We assume it's always
168 * available for simplicity.
169 */
170 ctx->limit_pressure.full -= reg_size(ctx->base_reg);
171 }
172
173
174 /* Compute the number of cycles per instruction used for next-use-distance
175 * analysis. This is just approximate, obviously.
176 */
177 static unsigned
instr_cycles(struct ir3_instruction *instr)
179 {
180 if (instr->opc == OPC_META_PARALLEL_COPY) {
181 unsigned cycles = 0;
182 for (unsigned i = 0; i < instr->dsts_count; i++) {
183 if (!instr->srcs[i]->def ||
184 instr->srcs[i]->def->merge_set != instr->dsts[i]->merge_set) {
185 cycles += reg_elems(instr->srcs[i]);
186 }
187 }
188
189 return cycles;
190 }
191
192 if (instr->opc == OPC_META_COLLECT) {
193 unsigned cycles = 0;
194 for (unsigned i = 0; i < instr->srcs_count; i++) {
195 if (!instr->srcs[i]->def ||
196 instr->srcs[i]->def->merge_set != instr->dsts[0]->merge_set) {
197 cycles++;
198 }
199 }
200
201 return cycles;
202 }
203
204 if (is_meta(instr))
205 return 0;
206
207 return 1 + instr->repeat;
208 }
209
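/* Walk one block backwards, recording the next-use distance for each def and
 * src we encounter, then propagate the block-entry distances into each
 * predecessor's next_use_end. Returns true if any predecessor state changed,
 * so the caller can iterate to a fixed point.
 */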
210 static bool
compute_block_next_distance(struct ra_spill_ctx *ctx, struct ir3_block *block,
                            unsigned *tmp_next_use)
213 {
214 struct ra_spill_block_state *state = &ctx->blocks[block->index];
215 memcpy(tmp_next_use, state->next_use_end,
216 ctx->live->definitions_count * sizeof(*tmp_next_use));
217
218 unsigned cycle = state->cycles;
219 foreach_instr_rev (instr, &block->instr_list) {
220 ra_foreach_dst (dst, instr) {
221 dst->next_use = tmp_next_use[dst->name];
222 }
223
224 ra_foreach_src (src, instr) {
225 src->next_use = tmp_next_use[src->def->name];
226 }
227
228 cycle -= instr_cycles(instr);
229
230 if (instr->opc == OPC_META_PARALLEL_COPY) {
231 ra_foreach_src_n (src, i, instr) {
232 if (src->def->merge_set == instr->dsts[i]->merge_set &&
233 src->def->merge_set_offset == instr->dsts[i]->merge_set_offset) {
234 tmp_next_use[src->def->name] =
235 tmp_next_use[instr->dsts[i]->name];
236 } else {
237 tmp_next_use[src->def->name] = cycle;
238 }
239 }
240 } else if (instr->opc != OPC_META_PHI) {
241 ra_foreach_src (src, instr) {
242 tmp_next_use[src->def->name] = cycle;
243 }
244 }
245
246 ra_foreach_dst (dst, instr) {
247 tmp_next_use[dst->name] = UINT_MAX;
248 }
249 }
250
251 memcpy(state->next_use_start, tmp_next_use,
252 ctx->live->definitions_count * sizeof(*tmp_next_use));
253
254 bool progress = false;
255 for (unsigned i = 0; i < block->predecessors_count; i++) {
256 const struct ir3_block *pred = block->predecessors[i];
257 struct ra_spill_block_state *pred_state = &ctx->blocks[pred->index];
258
259 /* Add a large-enough distance in front of edges exiting the loop so that
260 * variables that are live-through the loop but not used inside it are
261 * prioritized for spilling, as per the paper. This just needs to be
262 * larger than the longest path through the loop.
263 */
264 bool loop_exit = pred->loop_depth < block->loop_depth;
265 unsigned block_distance = pred_state->cycles + (loop_exit ? 100000 : 0);
266
267 for (unsigned j = 0; j < ctx->live->definitions_count; j++) {
268 if (state->next_use_start[j] < UINT_MAX &&
269 state->next_use_start[j] + block_distance <
270 pred_state->next_use_end[j]) {
271 pred_state->next_use_end[j] = state->next_use_start[j] +
272 block_distance;
273 progress = true;
274 }
275 }
276
277 foreach_instr (phi, &block->instr_list) {
278 if (phi->opc != OPC_META_PHI)
279 break;
280 if (!phi->srcs[i]->def)
281 continue;
282 unsigned src = phi->srcs[i]->def->name;
283 if (phi->dsts[0]->next_use < UINT_MAX &&
284 phi->dsts[0]->next_use + block_distance <
285 pred_state->next_use_end[src]) {
286 pred_state->next_use_end[src] = phi->dsts[0]->next_use +
287 block_distance;
288 progress = true;
289 }
290 }
291 }
292
293 return progress;
294 }
295
296 static void
compute_next_distance(struct ra_spill_ctx *ctx, struct ir3 *ir)
298 {
299 for (unsigned i = 0; i < ctx->live->block_count; i++) {
300 ctx->blocks[i].next_use_start =
301 ralloc_array(ctx, unsigned, ctx->live->definitions_count);
302 ctx->blocks[i].next_use_end =
303 ralloc_array(ctx, unsigned, ctx->live->definitions_count);
304
305 for (unsigned j = 0; j < ctx->live->definitions_count; j++) {
306 ctx->blocks[i].next_use_start[j] = UINT_MAX;
307 ctx->blocks[i].next_use_end[j] = UINT_MAX;
308 }
309 }
310
311 foreach_block (block, &ir->block_list) {
312 struct ra_spill_block_state *state = &ctx->blocks[block->index];
313 state->cycles = 0;
314 foreach_instr (instr, &block->instr_list) {
315 state->cycles += instr_cycles(instr);
316 foreach_dst (dst, instr) {
317 dst->spill_slot = ~0;
318 }
319 }
320 }
321
322 unsigned *tmp_next_use =
323 ralloc_array(ctx, unsigned, ctx->live->definitions_count);
324
325 bool progress = true;
326 while (progress) {
327 progress = false;
328 foreach_block_rev (block, &ir->block_list) {
329 progress |= compute_block_next_distance(ctx, block, tmp_next_use);
330 }
331 }
332 }
333
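/* A value can be rematerialized if it is a plain (non-relative) mov of an
 * immediate or constant, in which case we can recreate it at its uses instead
 * of spilling it to memory.
 */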
334 static bool
can_rematerialize(struct ir3_register *reg)
336 {
337 if (reg->flags & IR3_REG_ARRAY)
338 return false;
339 if (reg->instr->opc != OPC_MOV)
340 return false;
341 if (!(reg->instr->srcs[0]->flags & (IR3_REG_IMMED | IR3_REG_CONST)))
342 return false;
343 if (reg->instr->srcs[0]->flags & IR3_REG_RELATIV)
344 return false;
345 return true;
346 }
347
348 static struct ir3_register *
rematerialize(struct ir3_register *reg, struct ir3_instruction *after,
              struct ir3_block *block)
351 {
352 d("rematerializing ssa_%u:%u", reg->instr->serialno, reg->name);
353
354 struct ir3_instruction *remat =
355 ir3_instr_create(block, reg->instr->opc, 1, reg->instr->srcs_count);
356 struct ir3_register *dst = __ssa_dst(remat);
357 dst->flags |= reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
358 for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
359 struct ir3_register *src =
360 ir3_src_create(remat, INVALID_REG, reg->instr->srcs[i]->flags);
361 *src = *reg->instr->srcs[i];
362 }
363
364 remat->cat1 = reg->instr->cat1;
365
366 dst->merge_set = reg->merge_set;
367 dst->merge_set_offset = reg->merge_set_offset;
368 dst->interval_start = reg->interval_start;
369 dst->interval_end = reg->interval_end;
370
371 if (after)
372 ir3_instr_move_before(remat, after);
373
374 return dst;
375 }
376
377 static void
ra_spill_interval_init(struct ra_spill_interval *interval,
                       struct ir3_register *reg)
380 {
381 ir3_reg_interval_init(&interval->interval, reg);
382 interval->dst.flags = reg->flags;
383 interval->dst.def = reg;
384 interval->already_spilled = false;
385 interval->needs_reload = false;
386 interval->cant_spill = false;
387 interval->can_rematerialize = can_rematerialize(reg);
388 }
389
390 static struct ra_spill_interval *
ir3_reg_interval_to_interval(struct ir3_reg_interval *interval)
392 {
393 return rb_node_data(struct ra_spill_interval, interval, interval);
394 }
395
396 static struct ra_spill_interval *
ra_spill_interval_root(struct ra_spill_interval *interval)
398 {
399 struct ir3_reg_interval *ir3_interval = &interval->interval;
400 while (ir3_interval->parent)
401 ir3_interval = ir3_interval->parent;
402 return ir3_reg_interval_to_interval(ir3_interval);
403 }
404
405 static struct ra_spill_ctx *
ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx)
407 {
408 return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx);
409 }
410
411 static int
spill_interval_cmp(const struct ra_spill_interval *a,
                   const struct ra_spill_interval *b)
414 {
415 /* Prioritize intervals that we can rematerialize. */
416 if (a->can_rematerialize && !b->can_rematerialize)
417 return 1;
418 if (!a->can_rematerialize && b->can_rematerialize)
419 return -1;
420
421 return a->next_use_distance - b->next_use_distance;
422 }
423
424 static int
ra_spill_interval_cmp(const struct rb_node *_a, const struct rb_node *_b)
426 {
427 const struct ra_spill_interval *a =
428 rb_node_data(const struct ra_spill_interval, _a, node);
429 const struct ra_spill_interval *b =
430 rb_node_data(const struct ra_spill_interval, _b, node);
431 return spill_interval_cmp(a, b);
432 }
433
434 static int
ra_spill_interval_half_cmp(const struct rb_node *_a, const struct rb_node *_b)
436 {
437 const struct ra_spill_interval *a =
438 rb_node_data(const struct ra_spill_interval, _a, half_node);
439 const struct ra_spill_interval *b =
440 rb_node_data(const struct ra_spill_interval, _b, half_node);
441 return spill_interval_cmp(a, b);
442 }
443
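/* Callbacks for the ir3_reg_ctx interval tree. These keep cur_pressure up to
 * date and, when spilling, maintain the rb-trees of spill candidates sorted by
 * next-use distance.
 */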
444 static void
interval_add(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
446 {
447 struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
448 struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
449
450 unsigned size = reg_size(interval->interval.reg);
451 if (interval->interval.reg->flags & IR3_REG_SHARED) {
452 ctx->cur_pressure.shared += size;
453 } else {
454 if (interval->interval.reg->flags & IR3_REG_HALF) {
455 ctx->cur_pressure.half += size;
456 if (ctx->spilling) {
457 rb_tree_insert(&ctx->half_live_intervals, &interval->half_node,
458 ra_spill_interval_half_cmp);
459 }
460 }
461 if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) {
462 ctx->cur_pressure.full += size;
463 if (ctx->spilling) {
464 rb_tree_insert(&ctx->full_live_intervals, &interval->node,
465 ra_spill_interval_cmp);
466 }
467 }
468 }
469 }
470
471 static void
interval_delete(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
473 {
474 struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
475 struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
476
477 unsigned size = reg_size(interval->interval.reg);
478 if (interval->interval.reg->flags & IR3_REG_SHARED) {
479 ctx->cur_pressure.shared -= size;
480 } else {
481 if (interval->interval.reg->flags & IR3_REG_HALF) {
482 ctx->cur_pressure.half -= size;
483 if (ctx->spilling) {
484 rb_tree_remove(&ctx->half_live_intervals, &interval->half_node);
485 }
486 }
487 if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) {
488 ctx->cur_pressure.full -= size;
489 if (ctx->spilling) {
490 rb_tree_remove(&ctx->full_live_intervals, &interval->node);
491 }
492 }
493 }
494 }
495
496 static void
interval_readd(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_parent,
               struct ir3_reg_interval *_child)
499 {
500 interval_add(_ctx, _child);
501 }
502
503 static void
spill_ctx_init(struct ra_spill_ctx *ctx, struct ir3_shader_variant *v,
               struct ir3_liveness *live)
506 {
507 ctx->live = live;
508 ctx->intervals = ralloc_array(ctx, struct ra_spill_interval *,
509 ctx->live->definitions_count);
510 struct ra_spill_interval *intervals =
511 rzalloc_array(ctx, struct ra_spill_interval,
512 ctx->live->definitions_count);
513 for (unsigned i = 0; i < ctx->live->definitions_count; i++)
514 ctx->intervals[i] = &intervals[i];
515
516 ctx->intervals_count = ctx->live->definitions_count;
517 ctx->compiler = v->compiler;
518 ctx->merged_regs = v->mergedregs;
519
520 rb_tree_init(&ctx->reg_ctx.intervals);
521 ctx->reg_ctx.interval_add = interval_add;
522 ctx->reg_ctx.interval_delete = interval_delete;
523 ctx->reg_ctx.interval_readd = interval_readd;
524 }
525
526 static void
ra_spill_ctx_insert(struct ra_spill_ctx *ctx,
                    struct ra_spill_interval *interval)
529 {
530 ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
531 }
532
533 static void
ra_spill_ctx_remove(struct ra_spill_ctx *ctx,
                    struct ra_spill_interval *interval)
536 {
537 ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
538 }
539
540 static void
init_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
542 {
543 struct ra_spill_interval *interval = ctx->intervals[dst->name];
544 ra_spill_interval_init(interval, dst);
545 if (ctx->spilling) {
546 interval->next_use_distance = dst->next_use;
547
548 /* We only need to keep track of used-ness if this value may be
549 * rematerialized. This also keeps us from nuking things that may be
550 * in the keeps list (e.g. atomics, input splits).
551 */
552 if (interval->can_rematerialize)
553 dst->instr->flags |= IR3_INSTR_UNUSED;
554 }
555 }
556
557 static void
insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
559 {
560 struct ra_spill_interval *interval = ctx->intervals[dst->name];
561 if (interval->interval.inserted)
562 return;
563
564 ra_spill_ctx_insert(ctx, interval);
565 interval->cant_spill = true;
566
567 /* For precolored inputs, make sure we leave enough registers to allow for
568 * holes in the inputs. It can happen that the binning shader has a lower
569 * register pressure than the main shader, but the main shader decided to
570 * add holes between the inputs which means that the binning shader has a
571 * higher register demand.
572 */
573 if (dst->instr->opc == OPC_META_INPUT && dst->num != INVALID_REG) {
574 physreg_t physreg = ra_reg_get_physreg(dst);
575 physreg_t max = physreg + reg_size(dst);
576
577 if (interval->interval.reg->flags & IR3_REG_SHARED)
578 ctx->max_pressure.shared = MAX2(ctx->max_pressure.shared, max);
579 else if (interval->interval.reg->flags & IR3_REG_HALF)
580 ctx->max_pressure.half = MAX2(ctx->max_pressure.half, max);
581 else
582 ctx->max_pressure.full = MAX2(ctx->max_pressure.full, max);
583 }
584 }
585
586 static void
insert_src(struct ra_spill_ctx *ctx, struct ir3_register *src)
588 {
589 struct ra_spill_interval *interval = ctx->intervals[src->def->name];
590
591 if (!interval->interval.inserted) {
592 ra_spill_ctx_insert(ctx, interval);
593 interval->needs_reload = true;
594 interval->already_spilled = true;
595 }
596
597 ra_spill_interval_root(interval)->cant_spill = true;
598
599 }
600
601 static void
remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
                 struct ir3_register *src)
604 {
605 struct ra_spill_interval *interval = ctx->intervals[src->def->name];
606
607 if (!interval->interval.inserted || interval->interval.parent ||
608 !rb_tree_is_empty(&interval->interval.children))
609 return;
610
611 ra_spill_ctx_remove(ctx, interval);
612 }
613
614 static void
remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
           struct ir3_register *src)
617 {
618 struct ra_spill_interval *interval = ctx->intervals[src->def->name];
619
620 if (!interval->interval.inserted)
621 return;
622
623 ra_spill_ctx_remove(ctx, interval);
624 }
625
626 static void
finish_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
628 {
629 struct ra_spill_interval *interval = ctx->intervals[dst->name];
630 interval->cant_spill = false;
631 }
632
633 static void
remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
635 {
636 struct ra_spill_interval *interval = ctx->intervals[dst->name];
637
638 if (!interval->interval.inserted)
639 return;
640
641 ra_spill_ctx_remove(ctx, interval);
642 }
643
644 static void
update_src_next_use(struct ra_spill_ctx *ctx, struct ir3_register *src)
646 {
647 struct ra_spill_interval *interval = ctx->intervals[src->def->name];
648
649 assert(interval->interval.inserted);
650
651 interval->next_use_distance = src->next_use;
652
653 /* If this node is inserted in one of the trees, then it needs to be resorted
654 * as its key has changed.
655 */
656 if (!interval->interval.parent && !(src->flags & IR3_REG_SHARED)) {
657 if (src->flags & IR3_REG_HALF) {
658 rb_tree_remove(&ctx->half_live_intervals, &interval->half_node);
659 rb_tree_insert(&ctx->half_live_intervals, &interval->half_node,
660 ra_spill_interval_half_cmp);
661 }
662 if (ctx->merged_regs || !(src->flags & IR3_REG_HALF)) {
663 rb_tree_remove(&ctx->full_live_intervals, &interval->node);
664 rb_tree_insert(&ctx->full_live_intervals, &interval->node,
665 ra_spill_interval_cmp);
666 }
667 }
668 }
669
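/* Assign (or look up) the pvtmem slot for a value. Defs in the same merge set
 * share a single slot so that overlapping split/collect components spill to
 * and reload from consistent offsets; otherwise a slot is carved out of the
 * running spill_slot offset.
 */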
670 static unsigned
get_spill_slot(struct ra_spill_ctx *ctx, struct ir3_register *reg)
672 {
673 if (reg->merge_set) {
674 if (reg->merge_set->spill_slot == ~0) {
675 reg->merge_set->spill_slot = ALIGN_POT(ctx->spill_slot,
676 reg->merge_set->alignment);
677 ctx->spill_slot = reg->merge_set->spill_slot + reg->merge_set->size * 2;
678 }
679 return reg->merge_set->spill_slot + reg->merge_set_offset * 2;
680 } else {
681 if (reg->spill_slot == ~0) {
682 reg->spill_slot = ALIGN_POT(ctx->spill_slot, reg_elem_size(reg));
683 ctx->spill_slot = reg->spill_slot + reg_size(reg) * 2;
684 }
685 return reg->spill_slot;
686 }
687 }
688
689 static void
set_src_val(struct ir3_register *src, const struct reg_or_immed *val)
691 {
692 if (val->flags & IR3_REG_IMMED) {
693 src->flags = IR3_REG_IMMED | (val->flags & IR3_REG_HALF);
694 src->uim_val = val->uimm;
695 src->def = NULL;
696 } else if (val->flags & IR3_REG_CONST) {
697 src->flags = IR3_REG_CONST | (val->flags & IR3_REG_HALF);
698 src->num = val->const_num;
699 src->def = NULL;
700 } else {
701 src->def = val->def;
702 val->def->instr->flags &= ~IR3_INSTR_UNUSED;
703 }
704 }
705
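/* Immediate/const parallel-copy sources have no register to spill, so create a
 * mov to materialize the value into an SSA register first.
 */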
706 static struct ir3_register *
materialize_pcopy_src(const struct reg_or_immed *src,
                      struct ir3_instruction *instr,
                      struct ir3_block *block)
710 {
711 struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
712 struct ir3_register *dst = __ssa_dst(mov);
713 dst->flags |= src->flags & IR3_REG_HALF;
714 struct ir3_register *mov_src = ir3_src_create(mov, INVALID_REG, src->flags);
715 set_src_val(mov_src, src);
716 mov->cat1.src_type = mov->cat1.dst_type =
717 (src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
718
719 if (instr)
720 ir3_instr_move_before(mov, instr);
721 return dst;
722 }
723
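/* Emit a SPILL_MACRO storing val to the given pvtmem slot, materializing
 * const/immed values first. The store is inserted before instr, or appended to
 * block when instr is NULL.
 */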
724 static void
spill(struct ra_spill_ctx *ctx, const struct reg_or_immed *val,
      unsigned spill_slot, struct ir3_instruction *instr, struct ir3_block *block)
727 {
728 struct ir3_register *reg;
729
730 /* If spilling an immed/const pcopy src, we need to actually materialize it
731 * first with a mov.
732 */
733 if (val->flags & (IR3_REG_CONST | IR3_REG_IMMED)) {
734 reg = materialize_pcopy_src(val, instr, block);
735 } else {
736 reg = val->def;
737 reg->instr->flags &= ~IR3_INSTR_UNUSED;
738 }
739
740 d("spilling ssa_%u:%u to %u", reg->instr->serialno, reg->name,
741 spill_slot);
742
743 unsigned elems = reg_elems(reg);
744 struct ir3_instruction *spill =
745 ir3_instr_create(block, OPC_SPILL_MACRO, 0, 3);
746 ir3_src_create(spill, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg;
747 unsigned src_flags = reg->flags & (IR3_REG_HALF | IR3_REG_IMMED |
748 IR3_REG_CONST | IR3_REG_SSA |
749 IR3_REG_ARRAY);
750 struct ir3_register *src = ir3_src_create(spill, INVALID_REG, src_flags);
751 ir3_src_create(spill, INVALID_REG, IR3_REG_IMMED)->uim_val = elems;
752 spill->cat6.dst_offset = spill_slot;
753 spill->cat6.type = (reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
754
755 src->def = reg;
756 if (reg->flags & IR3_REG_ARRAY) {
757 src->size = reg->size;
758 src->array.id = reg->array.id;
759 src->array.offset = 0;
760 } else {
761 src->wrmask = reg->wrmask;
762 }
763
764 if (instr)
765 ir3_instr_move_before(spill, instr);
766 }
767
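/* Spill an interval to its slot, unless it can be rematerialized and isn't
 * part of a merge set (in which case reloading will simply recreate the mov).
 */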
768 static void
spill_interval(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval,
               struct ir3_instruction *instr, struct ir3_block *block)
771 {
772 if (interval->can_rematerialize && !interval->interval.reg->merge_set)
773 return;
774
775 spill(ctx, &interval->dst, get_spill_slot(ctx, interval->interval.reg),
776 instr, block);
777 }
778
779 /* This is similar to "limit" in the paper. */
780 static void
limit(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
782 {
783 if (ctx->cur_pressure.half > ctx->limit_pressure.half) {
784 d("cur half pressure %u exceeds %u", ctx->cur_pressure.half,
785 ctx->limit_pressure.half);
786 rb_tree_foreach_safe (struct ra_spill_interval, interval,
787 &ctx->half_live_intervals, half_node) {
788 d("trying ssa_%u:%u", interval->interval.reg->instr->serialno,
789 interval->interval.reg->name);
790 if (!interval->cant_spill) {
791 if (!interval->already_spilled)
792 spill_interval(ctx, interval, instr, instr->block);
793 ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
794 if (ctx->cur_pressure.half <= ctx->limit_pressure.half)
795 break;
796 }
797 }
798
799 assert(ctx->cur_pressure.half <= ctx->limit_pressure.half);
800 }
801
802 if (ctx->cur_pressure.full > ctx->limit_pressure.full) {
803 d("cur full pressure %u exceeds %u", ctx->cur_pressure.full,
804 ctx->limit_pressure.full);
805 rb_tree_foreach_safe (struct ra_spill_interval, interval,
806 &ctx->full_live_intervals, node) {
807 d("trying ssa_%u:%u", interval->interval.reg->instr->serialno,
808 interval->interval.reg->name);
809 if (!interval->cant_spill) {
810 if (!interval->already_spilled)
811 spill_interval(ctx, interval, instr, instr->block);
812 ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
813 if (ctx->cur_pressure.full <= ctx->limit_pressure.full)
814 break;
815 } else {
816 d("can't spill");
817 }
818 }
819
820 assert(ctx->cur_pressure.full <= ctx->limit_pressure.full);
821 }
822 }
823
824 /* There's a corner case where we reload a value which has overlapping live
825 * values already reloaded, either because it's the child of some other interval
826 * that was already reloaded or some of its children have already been
827 * reloaded. Because RA only expects overlapping source/dest intervals for meta
828 * instructions (split/collect), and we don't want to add register pressure by
829 * creating an entirely separate value, we need to add splits and collects to
830 * deal with this case. These splits/collects have to also have correct merge
831 * set information, so that it doesn't result in any actual code or register
832 * pressure in practice.
833 */
834
835 static void
add_to_merge_set(struct ir3_merge_set *set, struct ir3_register *def,
                 unsigned offset)
838 {
839 def->merge_set = set;
840 def->merge_set_offset = offset;
841 def->interval_start = set->interval_start + offset;
842 def->interval_end = set->interval_start + offset + reg_size(def);
843 }
844
845 static struct ir3_register *
split(struct ir3_register *def, unsigned offset,
      struct ir3_instruction *after, struct ir3_block *block)
848 {
849 if (reg_elems(def) == 1) {
850 assert(offset == 0);
851 return def;
852 }
853
854 assert(!(def->flags & IR3_REG_ARRAY));
855 assert(def->merge_set);
856 struct ir3_instruction *split =
857 ir3_instr_create(block, OPC_META_SPLIT, 1, 1);
858 struct ir3_register *dst = __ssa_dst(split);
859 dst->flags |= def->flags & IR3_REG_HALF;
860 struct ir3_register *src = ir3_src_create(split, INVALID_REG, def->flags);
861 src->wrmask = def->wrmask;
862 src->def = def;
863 add_to_merge_set(def->merge_set, dst,
864 def->merge_set_offset + offset * reg_elem_size(def));
865 if (after)
866 ir3_instr_move_before(split, after);
867 return dst;
868 }
869
870 static struct ir3_register *
extract(struct ir3_register *parent_def, unsigned offset, unsigned elems,
        struct ir3_instruction *after, struct ir3_block *block)
873 {
874 if (offset == 0 && elems == reg_elems(parent_def))
875 return parent_def;
876
877 struct ir3_register *srcs[elems];
878 for (unsigned i = 0; i < elems; i++) {
879 srcs[i] = split(parent_def, offset + i, after, block);
880 }
881
882 struct ir3_instruction *collect =
883 ir3_instr_create(block, OPC_META_COLLECT, 1, elems);
884 struct ir3_register *dst = __ssa_dst(collect);
885 dst->flags |= parent_def->flags & IR3_REG_HALF;
886 dst->wrmask = MASK(elems);
887 add_to_merge_set(parent_def->merge_set, dst, parent_def->merge_set_offset);
888
889 for (unsigned i = 0; i < elems; i++) {
890 ir3_src_create(collect, INVALID_REG, parent_def->flags)->def = srcs[i];
891 }
892
893 if (after)
894 ir3_instr_move_before(collect, after);
895 return dst;
896 }
897
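/* Emit a RELOAD_MACRO loading a value back from its pvtmem slot. The
 * destination inherits the original def's merge set and interval information
 * so that RA still sees the same layout.
 */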
898 static struct ir3_register *
reload(struct ra_spill_ctx *ctx, struct ir3_register *reg,
       struct ir3_instruction *after, struct ir3_block *block)
901 {
902 unsigned spill_slot = get_spill_slot(ctx, reg);
903
904 d("reloading ssa_%u:%u from %u", reg->instr->serialno, reg->name,
905 spill_slot);
906
907 unsigned elems = reg_elems(reg);
908 struct ir3_instruction *reload =
909 ir3_instr_create(block, OPC_RELOAD_MACRO, 1, 3);
910 struct ir3_register *dst = __ssa_dst(reload);
911 dst->flags |= reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
912 /* The reload may be split into multiple pieces, and if the destination
913 * overlaps with the base register then it could get clobbered before the
914 * last ldp in the sequence. Note that we always reserve space for the base
915 * register throughout the whole program, so effectively extending its live
916 * range past the end of the instruction isn't a problem for our pressure
917 * accounting.
918 */
919 dst->flags |= IR3_REG_EARLY_CLOBBER;
920 ir3_src_create(reload, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg;
921 struct ir3_register *offset_reg =
922 ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED);
923 offset_reg->uim_val = spill_slot;
924 ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED)->uim_val = elems;
925 reload->cat6.type = (reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
926
927 if (reg->flags & IR3_REG_ARRAY) {
928 dst->array.offset = 0;
929 dst->array.id = reg->array.id;
930 dst->size = reg->size;
931 } else {
932 dst->wrmask = MASK(elems);
933 }
934
935 dst->merge_set = reg->merge_set;
936 dst->merge_set_offset = reg->merge_set_offset;
937 dst->interval_start = reg->interval_start;
938 dst->interval_end = reg->interval_end;
939
940 if (after)
941 ir3_instr_move_before(reload, after);
942
943 return dst;
944 }
945
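/* Record the reloaded def for an interval and recursively rebuild the mapping
 * for its child intervals by extracting their components from it, so that
 * sources referring to sub-ranges of the reloaded value can be rewritten.
 */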
946 static void
rewrite_src_interval(struct ra_spill_ctx *ctx,
                     struct ra_spill_interval *interval,
                     struct ir3_register *def,
                     struct ir3_instruction *instr,
                     struct ir3_block *block)
952 {
953 interval->dst.flags = def->flags;
954 interval->dst.def = def;
955 interval->needs_reload = false;
956
957 rb_tree_foreach (struct ra_spill_interval, child,
958 &interval->interval.children, interval.node) {
959 struct ir3_register *child_reg = child->interval.reg;
960 struct ir3_register *child_def =
961 extract(def, (child_reg->interval_start -
962 interval->interval.reg->interval_start) / reg_elem_size(def),
963 reg_elems(child_reg), instr, block);
964 rewrite_src_interval(ctx, child, child_def, instr, block);
965 }
966 }
967
968 static void
reload_def(struct ra_spill_ctx *ctx, struct ir3_register *def,
           struct ir3_instruction *instr, struct ir3_block *block)
971 {
972 unsigned elems = reg_elems(def);
973 struct ra_spill_interval *interval = ctx->intervals[def->name];
974
975 struct ir3_reg_interval *ir3_parent = interval->interval.parent;
976
977 if (ir3_parent) {
978 struct ra_spill_interval *parent =
979 ir3_reg_interval_to_interval(ir3_parent);
980 if (!parent->needs_reload) {
981 interval->dst.flags = def->flags;
982 interval->dst.def = extract(
983 parent->dst.def, (def->interval_start - parent->dst.def->interval_start) /
984 reg_elem_size(def), elems, instr, block);
985 return;
986 }
987 }
988
989 struct ir3_register *dst;
990 if (interval->can_rematerialize)
991 dst = rematerialize(def, instr, block);
992 else
993 dst = reload(ctx, def, instr, block);
994
995 rewrite_src_interval(ctx, interval, dst, instr, block);
996 }
997
998 static void
reload_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
           struct ir3_register *src)
1001 {
1002 struct ra_spill_interval *interval = ctx->intervals[src->def->name];
1003
1004 if (interval->needs_reload) {
1005 reload_def(ctx, src->def, instr, instr->block);
1006 }
1007
1008 ra_spill_interval_root(interval)->cant_spill = false;
1009 }
1010
1011 static void
rewrite_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
            struct ir3_register *src)
1014 {
1015 struct ra_spill_interval *interval = ctx->intervals[src->def->name];
1016
1017 set_src_val(src, &interval->dst);
1018 }
1019
1020 static void
update_max_pressure(struct ra_spill_ctx *ctx)
1022 {
1023 d("pressure:");
1024 d("\tfull: %u", ctx->cur_pressure.full);
1025 d("\thalf: %u", ctx->cur_pressure.half);
1026 d("\tshared: %u", ctx->cur_pressure.shared);
1027
1028 ctx->max_pressure.full =
1029 MAX2(ctx->max_pressure.full, ctx->cur_pressure.full);
1030 ctx->max_pressure.half =
1031 MAX2(ctx->max_pressure.half, ctx->cur_pressure.half);
1032 ctx->max_pressure.shared =
1033 MAX2(ctx->max_pressure.shared, ctx->cur_pressure.shared);
1034 }
1035
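/* Process a single instruction: account for its destinations, make room for
 * and reload its sources when spilling, then retire killed sources and unused
 * destinations, calling limit() to keep pressure within the limit.
 */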
1036 static void
handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
1038 {
1039 ra_foreach_dst (dst, instr) {
1040 init_dst(ctx, dst);
1041 }
1042
1043 if (ctx->spilling) {
1044 ra_foreach_src (src, instr)
1045 insert_src(ctx, src);
1046 }
1047
1048 /* Handle tied and early-kill destinations. If a destination is tied to a
1049 * source and that source is live-through, then we need to allocate a new
1050 * register for the destination which is live-through itself and cannot
1051 * overlap the sources. Similarly early-kill destinations cannot overlap
1052 * sources.
1053 */
1054
1055 ra_foreach_dst (dst, instr) {
1056 struct ir3_register *tied_src = dst->tied;
1057 if ((tied_src && !(tied_src->flags & IR3_REG_FIRST_KILL)) ||
1058 (dst->flags & IR3_REG_EARLY_CLOBBER))
1059 insert_dst(ctx, dst);
1060 }
1061
1062 if (ctx->spilling)
1063 limit(ctx, instr);
1064 else
1065 update_max_pressure(ctx);
1066
1067 if (ctx->spilling) {
1068 ra_foreach_src (src, instr) {
1069 reload_src(ctx, instr, src);
1070 update_src_next_use(ctx, src);
1071 }
1072 }
1073
1074 ra_foreach_src (src, instr) {
1075 if (src->flags & IR3_REG_FIRST_KILL)
1076 remove_src_early(ctx, instr, src);
1077 }
1078
1079 ra_foreach_dst (dst, instr) {
1080 insert_dst(ctx, dst);
1081 }
1082
1083 if (ctx->spilling)
1084 limit(ctx, instr);
1085 else
1086 update_max_pressure(ctx);
1087
1088 /* We have to remove sources before rewriting them so that we can lookup the
1089 * interval to remove before the source itself is changed.
1090 */
1091 ra_foreach_src (src, instr) {
1092 if (src->flags & IR3_REG_FIRST_KILL)
1093 remove_src(ctx, instr, src);
1094 }
1095
1096 if (ctx->spilling) {
1097 ra_foreach_src (src, instr) {
1098 rewrite_src(ctx, instr, src);
1099 }
1100 }
1101
1102 ra_foreach_dst (dst, instr) {
1103 finish_dst(ctx, dst);
1104 }
1105
1106 for (unsigned i = 0; i < instr->dsts_count; i++) {
1107 if (ra_reg_is_dst(instr->dsts[i]) &&
1108 (instr->dsts[i]->flags & IR3_REG_UNUSED))
1109 remove_dst(ctx, instr->dsts[i]);
1110 }
1111 }
1112
1113 static struct ra_spill_interval *
create_temp_interval(struct ra_spill_ctx *ctx, struct ir3_register *def)
1115 {
1116 unsigned name = ctx->intervals_count++;
1117 unsigned offset = ctx->live->interval_offset;
1118
1119 /* This is kinda hacky, but we need to create a fake SSA def here that is
1120 * only used as part of the pcopy accounting. See below.
1121 */
1122 struct ir3_register *reg = rzalloc(ctx, struct ir3_register);
1123 *reg = *def;
1124 reg->name = name;
1125 reg->interval_start = offset;
1126 reg->interval_end = offset + reg_size(def);
1127 reg->merge_set = NULL;
1128
1129 ctx->intervals = reralloc(ctx, ctx->intervals, struct ra_spill_interval *,
1130 ctx->intervals_count);
1131 struct ra_spill_interval *interval = rzalloc(ctx, struct ra_spill_interval);
1132 ra_spill_interval_init(interval, reg);
1133 ctx->intervals[name] = interval;
1134 ctx->live->interval_offset += reg_size(def);
1135 return interval;
1136 }
1137
1138 /* In the sequence of copies generated (see below), would this source be killed?
1139 */
1140 static bool
is_last_pcopy_src(struct ir3_instruction *pcopy, unsigned src_n)
1142 {
1143 struct ir3_register *src = pcopy->srcs[src_n];
1144 if (!(src->flags & IR3_REG_KILL))
1145 return false;
1146 for (unsigned j = src_n + 1; j < pcopy->srcs_count; j++) {
1147 if (pcopy->srcs[j]->def == src->def)
1148 return false;
1149 }
1150 return true;
1151 }
1152
1153 /* Parallel copies are different from normal instructions. The sources together
1154 * may be larger than the entire register file, so we cannot just reload every
1155 * source like normal, and indeed that probably wouldn't be a great idea.
1156 * Instead we essentially need to lower the parallel copy to "copies," just like
1157 * in the normal CSSA construction, although we implement the copies by
1158 * reloading and then possibly spilling values. We essentially just shuffle
1159 * around the sources until each source either (a) is live or (b) has the same
1160 * spill slot as its corresponding destination. We do this by decomposing the
1161 * copy into a series of copies, so:
1162 *
1163 * a, b, c = d, e, f
1164 *
1165 * becomes:
1166 *
1167 * d' = d
1168 * e' = e
1169 * f' = f
1170 * a = d'
1171 * b = e'
1172 * c = f'
1173 *
1174 * the temporary SSA values d', e', and f' never actually show up in the result.
1175 * They are only used for our internal accounting. They may, however, have their
1176 * own spill slot created for them. Similarly, we don't actually emit any copy
1177 * instructions, although we emit the spills/reloads that *would've* been
1178 * required if those copies were there.
1179 *
1180 * TODO: in order to reduce the number of temporaries and therefore spill slots,
1181 * we could instead do a more complicated analysis that considers the location
1182 * transfer graph.
1183 *
1184 * In addition, we actually remove the parallel copy and rewrite all its uses
1185 * (in the phi nodes) rather than rewrite its sources at the end. Recreating it
1186 * later turns out to be easier than keeping it up-to-date throughout this pass,
1187 * since we may have to remove entries for phi sources that are spilled and add
1188 * entries for live-outs that are spilled and reloaded, which can happen here
1189 * and then possibly be undone or done again when processing live-ins of the
1190 * successor block.
1191 */
1192
1193 static void
handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy)
1195 {
1196 ra_foreach_dst (dst, pcopy) {
1197 struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
1198 ra_spill_interval_init(dst_interval, dst);
1199 }
1200
1201 foreach_src_n (src, i, pcopy) {
1202 struct ir3_register *dst = pcopy->dsts[i];
1203 if (!(dst->flags & IR3_REG_SSA))
1204 continue;
1205
1206 d("processing src %u", i);
1207
1208 /* Skip the intermediate copy for cases where the source is merged with
1209 * the destination. Crucially this means that we also don't reload/spill
1210 * it if it's been spilled, because it shares the same spill slot.
1211 */
1212 if ((src->flags & IR3_REG_SSA) && src->def->merge_set &&
1213 src->def->merge_set == dst->merge_set &&
1214 src->def->merge_set_offset == dst->merge_set_offset) {
1215 struct ra_spill_interval *src_interval = ctx->intervals[src->def->name];
1216 struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
1217 if (src_interval->interval.inserted) {
1218 update_src_next_use(ctx, src);
1219 if (is_last_pcopy_src(pcopy, i))
1220 ra_spill_ctx_remove(ctx, src_interval);
1221 dst_interval->cant_spill = true;
1222 ra_spill_ctx_insert(ctx, dst_interval);
1223 limit(ctx, pcopy);
1224 dst_interval->cant_spill = false;
1225 dst_interval->dst = src_interval->dst;
1226 }
1227 } else if (src->flags & IR3_REG_SSA) {
1228 struct ra_spill_interval *temp_interval =
1229 create_temp_interval(ctx, dst);
1230 struct ir3_register *temp = temp_interval->interval.reg;
1231 temp_interval->next_use_distance = src->next_use;
1232
1233 insert_src(ctx, src);
1234 limit(ctx, pcopy);
1235 reload_src(ctx, pcopy, src);
1236 update_src_next_use(ctx, src);
1237 if (is_last_pcopy_src(pcopy, i))
1238 remove_src(ctx, pcopy, src);
1239 struct ra_spill_interval *src_interval =
1240 ctx->intervals[src->def->name];
1241 temp_interval->dst = src_interval->dst;
1242
1243 temp_interval->cant_spill = true;
1244 ra_spill_ctx_insert(ctx, temp_interval);
1245 limit(ctx, pcopy);
1246 temp_interval->cant_spill = false;
1247
1248 src->flags = temp->flags;
1249 src->def = temp;
1250 }
1251 }
1252
1253 d("done with pcopy srcs");
1254
1255 foreach_src_n (src, i, pcopy) {
1256 struct ir3_register *dst = pcopy->dsts[i];
1257 if (!(dst->flags & IR3_REG_SSA))
1258 continue;
1259
1260 if ((src->flags & IR3_REG_SSA) && src->def->merge_set &&
1261 src->def->merge_set == dst->merge_set &&
1262 src->def->merge_set_offset == dst->merge_set_offset)
1263 continue;
1264
1265 struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
1266
1267 if (!(src->flags & IR3_REG_SSA)) {
1268 dst_interval->cant_spill = true;
1269 ra_spill_ctx_insert(ctx, dst_interval);
1270 limit(ctx, pcopy);
1271 dst_interval->cant_spill = false;
1272
1273 assert(src->flags & (IR3_REG_CONST | IR3_REG_IMMED));
1274 if (src->flags & IR3_REG_CONST) {
1275 dst_interval->dst.flags = src->flags;
1276 dst_interval->dst.const_num = src->num;
1277 } else {
1278 dst_interval->dst.flags = src->flags;
1279 dst_interval->dst.uimm = src->uim_val;
1280 }
1281 } else {
1282 struct ra_spill_interval *temp_interval = ctx->intervals[src->def->name];
1283
1284 insert_src(ctx, src);
1285 limit(ctx, pcopy);
1286 reload_src(ctx, pcopy, src);
1287 remove_src(ctx, pcopy, src);
1288
1289 dst_interval->dst = temp_interval->dst;
1290 ra_spill_ctx_insert(ctx, dst_interval);
1291 }
1292 }
1293
1294 pcopy->flags |= IR3_INSTR_UNUSED;
1295 }
1296
1297 static void
handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
1299 {
1300 if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1301 return;
1302
1303 init_dst(ctx, instr->dsts[0]);
1304 insert_dst(ctx, instr->dsts[0]);
1305 finish_dst(ctx, instr->dsts[0]);
1306 }
1307
1308 static void
remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
1310 {
1311 if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1312 return;
1313
1314 if (instr->opc == OPC_META_TEX_PREFETCH) {
1315 ra_foreach_src (src, instr)
1316 remove_src(ctx, instr, src);
1317 }
1318 if (instr->dsts[0]->flags & IR3_REG_UNUSED)
1319 remove_dst(ctx, instr->dsts[0]);
1320 }
1321
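/* Make a live-in value live at the top of the block, seeding its next-use
 * distance from the precomputed per-block next_use_start table.
 */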
1322 static void
handle_live_in(struct ra_spill_ctx *ctx, struct ir3_block *block,
               struct ir3_register *def)
1325 {
1326 struct ra_spill_interval *interval = ctx->intervals[def->name];
1327 ra_spill_interval_init(interval, def);
1328 if (ctx->spilling) {
1329 interval->next_use_distance =
1330 ctx->blocks[block->index].next_use_start[def->name];
1331 }
1332
1333 ra_spill_ctx_insert(ctx, interval);
1334 }
1335
1336 static bool
is_live_in_phi(struct ir3_register *def, struct ir3_block *block)
1338 {
1339 return def->instr->opc == OPC_META_PHI && def->instr->block == block;
1340 }
1341
1342 static bool
is_live_in_pred(struct ra_spill_ctx *ctx, struct ir3_register *def,
                struct ir3_block *block, unsigned pred_idx)
1345 {
1346 struct ir3_block *pred = block->predecessors[pred_idx];
1347 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1348 if (is_live_in_phi(def, block)) {
1349 def = def->instr->srcs[pred_idx]->def;
1350 if (!def)
1351 return false;
1352 }
1353
1354 return _mesa_hash_table_search(state->remap, def);
1355 }
1356
1357 static bool
is_live_in_undef(struct ir3_register *def,
                 struct ir3_block *block, unsigned pred_idx)
1360 {
1361 if (!is_live_in_phi(def, block))
1362 return false;
1363
1364 return !def->instr->srcs[pred_idx]->def;
1365 }
1366
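/* Look up what a live-in value (or the corresponding phi source) was remapped
 * to at the end of the given predecessor, or NULL if it isn't live there.
 */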
1367 static struct reg_or_immed *
read_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
             struct ir3_block *block, unsigned pred_idx)
1370 {
1371 struct ir3_block *pred = block->predecessors[pred_idx];
1372 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1373
1374 if (is_live_in_phi(def, block)) {
1375 def = def->instr->srcs[pred_idx]->def;
1376 if (!def)
1377 return NULL;
1378 }
1379
1380 struct hash_entry *entry = _mesa_hash_table_search(state->remap, def);
1381 if (entry)
1382 return entry->data;
1383 else
1384 return NULL;
1385 }
1386
1387 static bool
is_live_in_all_preds(struct ra_spill_ctx *ctx, struct ir3_register *def,
                     struct ir3_block *block)
1390 {
1391 for (unsigned i = 0; i < block->predecessors_count; i++) {
1392 if (!is_live_in_pred(ctx, def, block, i))
1393 return false;
1394 }
1395
1396 return true;
1397 }
1398
1399 static void
spill_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
              struct ir3_block *block)
1402 {
1403 for (unsigned i = 0; i < block->predecessors_count; i++) {
1404 struct ir3_block *pred = block->predecessors[i];
1405 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1406
1407 if (!state->visited)
1408 continue;
1409
1410 struct reg_or_immed *pred_def = read_live_in(ctx, def, block, i);
1411 if (pred_def) {
1412 spill(ctx, pred_def, get_spill_slot(ctx, def), NULL, pred);
1413 }
1414 }
1415 }
1416
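/* On entry to a block, shed live-ins until the register pressure fits within
 * the limit, emitting the spills at the end of the already-visited
 * predecessors.
 */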
1417 static void
spill_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block)
1419 {
1420 bool all_preds_visited = true;
1421 for (unsigned i = 0; i < block->predecessors_count; i++) {
1422 struct ir3_block *pred = block->predecessors[i];
1423 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1424 if (!state->visited) {
1425 all_preds_visited = false;
1426 break;
1427 }
1428 }
1429
1430 /* Note: in the paper they explicitly spill live-through values first, but we
1431 * should be doing that automatically by virtue of picking the largest
1432 * distance due to the extra distance added to edges out of loops.
1433 *
1434 * TODO: Keep track of pressure in each block and preemptively spill
1435 * live-through values as described in the paper to avoid spilling them
1436 * inside the loop.
1437 */
1438
1439 if (ctx->cur_pressure.half > ctx->limit_pressure.half) {
1440 rb_tree_foreach_safe (struct ra_spill_interval, interval,
1441 &ctx->half_live_intervals, half_node) {
1442 if (all_preds_visited &&
1443 is_live_in_all_preds(ctx, interval->interval.reg, block))
1444 continue;
1445 if (interval->interval.reg->merge_set ||
1446 !interval->can_rematerialize)
1447 spill_live_in(ctx, interval->interval.reg, block);
1448 ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
1449 if (ctx->cur_pressure.half <= ctx->limit_pressure.half)
1450 break;
1451 }
1452 }
1453
1454 if (ctx->cur_pressure.full > ctx->limit_pressure.full) {
1455 rb_tree_foreach_safe (struct ra_spill_interval, interval,
1456 &ctx->full_live_intervals, node) {
1457 if (all_preds_visited &&
1458 is_live_in_all_preds(ctx, interval->interval.reg, block))
1459 continue;
1460 spill_live_in(ctx, interval->interval.reg, block);
1461 ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
1462 if (ctx->cur_pressure.full <= ctx->limit_pressure.full)
1463 break;
1464 }
1465 }
1466 }
1467
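/* Record in the predecessor's remap table what a live-in value maps to, and
 * recursively do the same for its child intervals by extracting their
 * components from the new value.
 */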
1468 static void
live_in_rewrite(struct ra_spill_ctx *ctx,
                struct ra_spill_interval *interval,
                struct reg_or_immed *new_val,
                struct ir3_block *block, unsigned pred_idx)
1473 {
1474 struct ir3_block *pred = block->predecessors[pred_idx];
1475 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1476 struct ir3_register *def = interval->interval.reg;
1477 if (is_live_in_phi(def, block)) {
1478 def = def->instr->srcs[pred_idx]->def;
1479 }
1480
1481 if (def)
1482 _mesa_hash_table_insert(state->remap, def, new_val);
1483
1484 rb_tree_foreach (struct ra_spill_interval, child,
1485 &interval->interval.children, interval.node) {
1486 assert(new_val->flags & IR3_REG_SSA);
1487 struct ir3_register *child_def =
1488 extract(new_val->def,
1489 (child->interval.reg->interval_start - def->interval_start) /
1490 reg_elem_size(def), reg_elems(child->interval.reg),
1491 NULL, pred);
1492 struct reg_or_immed *child_val = ralloc(ctx, struct reg_or_immed);
1493 child_val->def = child_def;
1494 child_val->flags = child_def->flags;
1495 live_in_rewrite(ctx, child, child_val, block, pred_idx);
1496 }
1497 }
1498
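/* Make sure a value we decided to keep live across the block boundary is
 * actually available in every visited predecessor, reloading or
 * rematerializing it at the end of the predecessor if it was spilled there.
 */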
1499 static void
reload_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
               struct ir3_block *block)
1502 {
1503 struct ra_spill_interval *interval = ctx->intervals[def->name];
1504 for (unsigned i = 0; i < block->predecessors_count; i++) {
1505 struct ir3_block *pred = block->predecessors[i];
1506 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1507 if (!state->visited)
1508 continue;
1509
1510 if (is_live_in_undef(def, block, i))
1511 continue;
1512
1513 struct reg_or_immed *new_val = read_live_in(ctx, def, block, i);
1514
1515 if (!new_val) {
1516 new_val = ralloc(ctx, struct reg_or_immed);
1517 if (interval->can_rematerialize)
1518 new_val->def = rematerialize(def, NULL, pred);
1519 else
1520 new_val->def = reload(ctx, def, NULL, pred);
1521 new_val->flags = new_val->def->flags;
1522 }
1523 live_in_rewrite(ctx, interval, new_val, block, i);
1524 }
1525 }
1526
1527 static void
reload_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block)
1529 {
1530 rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
1531 interval.node) {
1532 reload_live_in(ctx, interval->interval.reg, block);
1533 }
1534 }
1535
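/* A live-in that stayed live may be held in a different register (or be a
 * const/immed) in each predecessor. If every visited predecessor agrees on a
 * single def we can use it directly, otherwise insert a phi merging the
 * per-predecessor values.
 */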
1536 static void
add_live_in_phi(struct ra_spill_ctx *ctx, struct ir3_register *def,
                struct ir3_block *block)
1539 {
1540 struct ra_spill_interval *interval = ctx->intervals[def->name];
1541 if (!interval->interval.inserted)
1542 return;
1543
1544 bool needs_phi = false;
1545 struct ir3_register *cur_def = NULL;
1546 for (unsigned i = 0; i < block->predecessors_count; i++) {
1547 struct ir3_block *pred = block->predecessors[i];
1548 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1549
1550 if (!state->visited) {
1551 needs_phi = true;
1552 break;
1553 }
1554
1555 struct hash_entry *entry =
1556 _mesa_hash_table_search(state->remap, def);
1557 assert(entry);
1558 struct reg_or_immed *pred_val = entry->data;
1559 if ((pred_val->flags & (IR3_REG_IMMED | IR3_REG_CONST)) ||
1560 !pred_val->def ||
1561 (cur_def && cur_def != pred_val->def)) {
1562 needs_phi = true;
1563 break;
1564 }
1565 cur_def = pred_val->def;
1566 }
1567
1568 if (!needs_phi) {
1569 interval->dst.def = cur_def;
1570 interval->dst.flags = cur_def->flags;
1571 return;
1572 }
1573
1574 struct ir3_instruction *phi =
1575 ir3_instr_create(block, OPC_META_PHI, 1, block->predecessors_count);
1576 struct ir3_register *dst = __ssa_dst(phi);
1577 dst->flags |= def->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
1578 dst->size = def->size;
1579 dst->wrmask = def->wrmask;
1580
1581 dst->interval_start = def->interval_start;
1582 dst->interval_end = def->interval_end;
1583 dst->merge_set = def->merge_set;
1584 dst->merge_set_offset = def->merge_set_offset;
1585
1586 for (unsigned i = 0; i < block->predecessors_count; i++) {
1587 struct ir3_block *pred = block->predecessors[i];
1588 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1589 struct ir3_register *src = ir3_src_create(phi, INVALID_REG, dst->flags);
1590 src->size = def->size;
1591 src->wrmask = def->wrmask;
1592
1593 if (state->visited) {
1594 struct hash_entry *entry =
1595 _mesa_hash_table_search(state->remap, def);
1596 assert(entry);
1597 struct reg_or_immed *new_val = entry->data;
1598 set_src_val(src, new_val);
1599 } else {
1600 src->def = def;
1601 }
1602 }
1603
1604 interval->dst.def = dst;
1605 interval->dst.flags = dst->flags;
1606
1607 ir3_instr_move_before_block(phi, block);
1608 }

/* When spilling a block with a single predecessor, the pred may have other
 * successors so we can't choose what's live in and we can't spill/restore
 * anything. Just make the inserted intervals exactly match the predecessor.
 * If a value wasn't live in the predecessor then it must've already been
 * spilled. Also, there are no phi nodes and no live-ins.
 */
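/* For illustration (hypothetical CFG, not taken from a real shader): if A
 * branches to both B and C, and A is B's only predecessor,
 *
 *        A
 *       / \
 *      B   C
 *
 * then any spill or reload appended to A on B's behalf would also execute on
 * the path to C. So B simply inherits A's end-of-block state, and anything B
 * needs that A didn't keep in a register was already spilled and will be
 * reloaded at its use.
 */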
static void
spill_single_pred_live_in(struct ra_spill_ctx *ctx,
                          struct ir3_block *block)
{
   unsigned name;
   BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
                       ctx->live->definitions_count) {
      struct ir3_register *reg = ctx->live->definitions[name];
      struct ra_spill_interval *interval = ctx->intervals[reg->name];
      struct reg_or_immed *val = read_live_in(ctx, reg, block, 0);
      if (val)
         interval->dst = *val;
      else
         ra_spill_ctx_remove(ctx, interval);
   }
}

static void
rewrite_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *phi,
            struct ir3_block *block)
{
   if (!(phi->dsts[0]->flags & IR3_REG_SSA))
      return;

   if (!ctx->intervals[phi->dsts[0]->name]->interval.inserted) {
      phi->flags |= IR3_INSTR_UNUSED;
      return;
   }

   for (unsigned i = 0; i < block->predecessors_count; i++) {
      struct ir3_block *pred = block->predecessors[i];
      struct ra_spill_block_state *state = &ctx->blocks[pred->index];

      if (!state->visited)
         continue;

      struct ir3_register *src = phi->srcs[i];
      if (!src->def)
         continue;

      struct hash_entry *entry =
         _mesa_hash_table_search(state->remap, src->def);
      assert(entry);
      struct reg_or_immed *new_val = entry->data;
      set_src_val(src, new_val);
   }
}

static void
spill_live_out(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval,
               struct ir3_block *block)
{
   struct ir3_register *def = interval->interval.reg;

   if (interval->interval.reg->merge_set ||
       !interval->can_rematerialize)
      spill(ctx, &interval->dst, get_spill_slot(ctx, def), NULL, block);
   ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
}

static void
spill_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
{
   struct ra_spill_block_state *state = &ctx->blocks[block->index];
   rb_tree_foreach_safe (struct ra_spill_interval, interval,
                         &ctx->reg_ctx.intervals, interval.node) {
      if (!BITSET_TEST(state->live_out, interval->interval.reg->name)) {
         spill_live_out(ctx, interval, block);
      }
   }
}

static void
reload_live_out(struct ra_spill_ctx *ctx, struct ir3_register *def,
                struct ir3_block *block)
{
   struct ra_spill_interval *interval = ctx->intervals[def->name];
   ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);

   reload_def(ctx, def, NULL, block);
}

static void
reload_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
{
   struct ra_spill_block_state *state = &ctx->blocks[block->index];
   unsigned name;
   BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) {
      struct ir3_register *reg = ctx->live->definitions[name];
      struct ra_spill_interval *interval = ctx->intervals[name];
      if (!interval->interval.inserted)
         reload_live_out(ctx, reg, block);
   }
}

static void
update_live_out_phis(struct ra_spill_ctx *ctx, struct ir3_block *block)
{
   assert(!block->successors[1]);
   struct ir3_block *succ = block->successors[0];
   unsigned pred_idx = ir3_block_get_pred_index(succ, block);

   foreach_instr (instr, &succ->instr_list) {
      if (instr->opc != OPC_META_PHI)
         break;

      struct ir3_register *def = instr->srcs[pred_idx]->def;
      if (!def)
         continue;

      struct ra_spill_interval *interval = ctx->intervals[def->name];
      if (!interval->interval.inserted)
         continue;
      set_src_val(instr->srcs[pred_idx], &interval->dst);
   }
}

static void
record_pred_live_out(struct ra_spill_ctx *ctx,
                     struct ra_spill_interval *interval,
                     struct ir3_block *block, unsigned pred_idx)
{
   struct ir3_block *pred = block->predecessors[pred_idx];
   struct ra_spill_block_state *state = &ctx->blocks[pred->index];

   struct ir3_register *def = interval->interval.reg;
   if (is_live_in_phi(def, block)) {
      def = def->instr->srcs[pred_idx]->def;
   }
   BITSET_SET(state->live_out, def->name);

   rb_tree_foreach (struct ra_spill_interval, child,
                    &interval->interval.children, interval.node) {
      record_pred_live_out(ctx, child, block, pred_idx);
   }
}

static void
record_pred_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      struct ir3_block *pred = block->predecessors[i];
      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
      if (state->visited)
         continue;

      state->live_out = rzalloc_array(ctx, BITSET_WORD,
                                      BITSET_WORDS(ctx->live->definitions_count));

      rb_tree_foreach (struct ra_spill_interval, interval,
                       &ctx->reg_ctx.intervals, interval.node) {
         record_pred_live_out(ctx, interval, block, i);
      }
   }
}

static void
record_live_out(struct ra_spill_ctx *ctx,
                struct ra_spill_block_state *state,
                struct ra_spill_interval *interval)
{
   if (!(interval->dst.flags & IR3_REG_SSA) ||
       interval->dst.def) {
      struct reg_or_immed *val = ralloc(ctx, struct reg_or_immed);
      *val = interval->dst;
      _mesa_hash_table_insert(state->remap, interval->interval.reg, val);
   }
   rb_tree_foreach (struct ra_spill_interval, child,
                    &interval->interval.children, interval.node) {
      record_live_out(ctx, state, child);
   }
}

static void
record_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
{
   struct ra_spill_block_state *state = &ctx->blocks[block->index];
   state->remap = _mesa_pointer_hash_table_create(ctx);

   rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
                    interval.node) {
      record_live_out(ctx, state, interval);
   }
}

static void
handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block)
{
   memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure));
   rb_tree_init(&ctx->reg_ctx.intervals);
   rb_tree_init(&ctx->full_live_intervals);
   rb_tree_init(&ctx->half_live_intervals);

   unsigned name;
   BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
                       ctx->live->definitions_count) {
      struct ir3_register *reg = ctx->live->definitions[name];
      handle_live_in(ctx, block, reg);
   }

   foreach_instr (instr, &block->instr_list) {
      if (instr->opc != OPC_META_PHI && instr->opc != OPC_META_INPUT &&
          instr->opc != OPC_META_TEX_PREFETCH)
         break;
      handle_input_phi(ctx, instr);
   }

   if (ctx->spilling) {
      if (block->predecessors_count == 1) {
         spill_single_pred_live_in(ctx, block);
      } else {
         spill_live_ins(ctx, block);
         reload_live_ins(ctx, block);
         record_pred_live_outs(ctx, block);
         foreach_instr (instr, &block->instr_list) {
            if (instr->opc != OPC_META_PHI)
               break;
            rewrite_phi(ctx, instr, block);
         }
         BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
                             ctx->live->definitions_count) {
            struct ir3_register *reg = ctx->live->definitions[name];
            add_live_in_phi(ctx, reg, block);
         }
      }
   } else {
      update_max_pressure(ctx);
   }

   foreach_instr (instr, &block->instr_list) {
      di(instr, "processing");

      if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT ||
          instr->opc == OPC_META_TEX_PREFETCH)
         remove_input_phi(ctx, instr);
      else if (ctx->spilling && instr->opc == OPC_META_PARALLEL_COPY)
         handle_pcopy(ctx, instr);
      else if (ctx->spilling && instr->opc == OPC_MOV &&
               instr->dsts[0] == ctx->base_reg)
         /* skip */;
      else
         handle_instr(ctx, instr);
   }

   if (ctx->spilling && block->successors[0]) {
      struct ra_spill_block_state *state =
         &ctx->blocks[block->successors[0]->index];
      if (state->visited) {
         assert(!block->successors[1]);

         spill_live_outs(ctx, block);
         reload_live_outs(ctx, block);
         update_live_out_phis(ctx, block);
      }
   }

   if (ctx->spilling) {
      record_live_outs(ctx, block);
      ctx->blocks[block->index].visited = true;
   }
}

static bool
simplify_phi_node(struct ir3_instruction *phi)
{
   struct ir3_register *def = NULL;
   foreach_src (src, phi) {
      /* Ignore phi sources which point to the phi itself. */
      if (src->def == phi->dsts[0])
         continue;
      /* If it's undef or it doesn't match the previous sources, bail. */
      if (!src->def || (def && def != src->def))
         return false;
      def = src->def;
   }

   phi->data = def;
   phi->flags |= IR3_INSTR_UNUSED;
   return true;
}

static struct ir3_register *
simplify_phi_def(struct ir3_register *def)
{
   if (def->instr->opc == OPC_META_PHI) {
      struct ir3_instruction *phi = def->instr;

      /* Note: this function is always called at least once after visiting the
       * phi, so either there has been a simplified phi in the meantime, in
       * which case we will set progress=true and visit the definition again,
       * or phi->data already has the most up-to-date value. Therefore we
       * don't have to recursively check phi->data.
       */
      if (phi->data)
         return phi->data;
   }

   return def;
}

static void
simplify_phi_srcs(struct ir3_instruction *instr)
{
   foreach_src (src, instr) {
      if (src->def)
         src->def = simplify_phi_def(src->def);
   }
}

/* We insert phi nodes for all live-ins of loops in case we need to split the
 * live range. This pass cleans that up for the case where the live range
 * didn't actually need to be split.
 */
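/* For example (illustrative, made-up IR syntax): a loop-header phi inserted
 * "just in case",
 *
 *    x' = phi(x, x')
 *
 * whose back-edge source is the phi itself and whose only other source is x,
 * never actually split anything. simplify_phi_node() marks it unused and
 * records x in phi->data, and the fixpoint loop below rewrites any source
 * that pointed at x' to point at x instead.
 */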
static void
simplify_phi_nodes(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (instr->opc != OPC_META_PHI)
            break;
         instr->data = NULL;
      }
   }

   bool progress;
   do {
      progress = false;
      foreach_block (block, &ir->block_list) {
         foreach_instr (instr, &block->instr_list) {
            if (instr->opc == OPC_META_PHI || (instr->flags & IR3_INSTR_UNUSED))
               continue;

            simplify_phi_srcs(instr);
         }

         /* Visit phi nodes in the successors to make sure that phi sources
          * are always visited at least once after visiting the definition
          * they point to. See note in simplify_phi_def() for why this is
          * necessary.
          */
         for (unsigned i = 0; i < 2; i++) {
            struct ir3_block *succ = block->successors[i];
            if (!succ)
               continue;
            foreach_instr (instr, &succ->instr_list) {
               if (instr->opc != OPC_META_PHI)
                  break;
               if (instr->flags & IR3_INSTR_UNUSED) {
                  if (instr->data)
                     instr->data = simplify_phi_def(instr->data);
               } else {
                  simplify_phi_srcs(instr);
                  progress |= simplify_phi_node(instr);
               }
            }
         }
      }
   } while (progress);
}

static void
unmark_dead(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         instr->flags &= ~IR3_INSTR_UNUSED;
      }
   }
}

/* Simple pass to remove now-dead phi nodes and pcopy instructions. We mark
 * which ones are dead along the way, so there's nothing to compute here.
 */
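/* For example (illustrative, made-up IR syntax): if a dead parallel copy
 * writes both an SSA destination and a non-SSA shared register,
 *
 *    ssa_5, <shared reg> = parallel_copy(ssa_4, <shared reg>)
 *
 * only the SSA half is dropped below, leaving a copy with just the shared
 * destination, and the instruction itself is deleted only once no
 * destinations remain.
 */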
static void
cleanup_dead(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->flags & IR3_INSTR_UNUSED) {
            if (instr->opc == OPC_META_PARALLEL_COPY) {
               /* There may be non-SSA shared copies; we need to preserve
                * these.
                */
               for (unsigned i = 0; i < instr->dsts_count;) {
                  if (instr->dsts[i]->flags & IR3_REG_SSA) {
                     instr->dsts[i] = instr->dsts[--instr->dsts_count];
                     instr->srcs[i] = instr->srcs[--instr->srcs_count];
                  } else {
                     i++;
                  }
               }

               if (instr->dsts_count == 0)
                  list_delinit(&instr->node);
            } else {
               list_delinit(&instr->node);
            }
         }
      }
   }
}

/* Deal with merge sets after spilling. Spilling generally leaves the merge
 * sets in a mess, and even if we properly cleaned up after ourselves, we
 * would want to recompute them afterward anyway. That's because
 * spilling/reloading can "break up" phi webs and split/collect webs so that
 * allocating them to the same register no longer gives any benefit. For
 * example, imagine we have this:
 *
 * if (...) {
 *    foo = ...
 * } else {
 *    bar = ...
 * }
 * baz = phi(foo, bar)
 *
 * and we spill "baz":
 *
 * if (...) {
 *    foo = ...
 *    spill(foo)
 * } else {
 *    bar = ...
 *    spill(bar)
 * }
 * baz = reload()
 *
 * Now foo, bar, and baz no longer have to be allocated to the same register.
 * How exactly the merge sets change can be complicated, so it's easier just
 * to recompute them.
 *
 * However, there's a wrinkle in this: those same merge sets determine the
 * register pressure, due to multiple values inhabiting the same register! And
 * we assume that this sharing happens when spilling. Therefore we need a
 * three-step procedure:
 *
 * 1. Drop the original merge sets.
 * 2. Calculate which values *must* be merged, being careful to only use the
 *    interval information which isn't trashed by spilling, and forcibly merge
 *    them.
 * 3. Let ir3_merge_regs() finish the job, including recalculating the
 *    intervals.
 */
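/* As a rough sketch of step 2 (illustrative, made-up IR syntax): given a
 * collect such as
 *
 *    vec = collect(a, b)
 *
 * where "a" stays live past the collect, the pressure accounting during
 * spilling assumed that "a" occupies part of "vec"'s register range rather
 * than a register of its own, so the two are forcibly merged below. Sources
 * that die at the split/collect are left for ir3_merge_regs() to coalesce
 * opportunistically in step 3.
 */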

static void
fixup_merge_sets(struct ir3_liveness *live, struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         ra_foreach_dst (dst, instr) {
            dst->merge_set = NULL;
            dst->merge_set_offset = 0;
         }
      }
   }

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (instr->opc != OPC_META_SPLIT &&
             instr->opc != OPC_META_COLLECT)
            continue;

         struct ir3_register *dst = instr->dsts[0];
         ra_foreach_src (src, instr) {
            if (!(src->flags & IR3_REG_KILL) &&
                src->def->interval_start < dst->interval_end &&
                dst->interval_start < src->def->interval_end) {
               ir3_force_merge(dst, src->def,
                               src->def->interval_start - dst->interval_start);
            }
         }
      }
   }

   ir3_merge_regs(live, ir);
}

void
ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live,
                  struct ir3_pressure *max_pressure)
{
   struct ra_spill_ctx *ctx = rzalloc(NULL, struct ra_spill_ctx);
   spill_ctx_init(ctx, v, live);

   foreach_block (block, &v->ir->block_list) {
      handle_block(ctx, block);
   }

   assert(ctx->cur_pressure.full == 0);
   assert(ctx->cur_pressure.half == 0);
   assert(ctx->cur_pressure.shared == 0);

   *max_pressure = ctx->max_pressure;
   ralloc_free(ctx);
}

bool
ir3_spill(struct ir3 *ir, struct ir3_shader_variant *v,
          struct ir3_liveness **live,
          const struct ir3_pressure *limit_pressure)
{
   void *mem_ctx = ralloc_parent(*live);
   struct ra_spill_ctx *ctx = rzalloc(mem_ctx, struct ra_spill_ctx);
   spill_ctx_init(ctx, v, *live);

   ctx->spilling = true;

   ctx->blocks = rzalloc_array(ctx, struct ra_spill_block_state,
                               ctx->live->block_count);
   rb_tree_init(&ctx->full_live_intervals);
   rb_tree_init(&ctx->half_live_intervals);

   ctx->limit_pressure = *limit_pressure;
   ctx->spill_slot = v->pvtmem_size;

   add_base_reg(ctx, ir);
   compute_next_distance(ctx, ir);

   unmark_dead(ir);

   foreach_block (block, &ir->block_list) {
      handle_block(ctx, block);
   }

   simplify_phi_nodes(ir);

   cleanup_dead(ir);

   ir3_create_parallel_copies(ir);

   /* After this point, we're done mutating the IR. Liveness has been trashed,
    * so recalculate it. We'll need it for recalculating the merge sets.
    */
   ralloc_free(ctx->live);
   *live = ir3_calc_liveness(mem_ctx, ir);

   fixup_merge_sets(*live, ir);

   v->pvtmem_size = ctx->spill_slot;
   ralloc_free(ctx);

   return true;
}