1 /*
2  * Copyright © 2021 Valve Corporation
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "util/rb_tree.h"
7 #include "ir3_ra.h"
8 #include "ir3_shader.h"
9 
10 /*
11  * This pass does two things:
12  *
13  * 1. Calculates the maximum register pressure. To do this, we need to use the
14  *    exact same technique that RA uses for combining meta_split instructions
15  *    with their sources, so that our calculation agrees with RA.
16  * 2. Spills when the register pressure exceeds a limit calculated by RA.
17  *    The implementation is based on "Register Spilling and Live-Range Splitting
18  *    for SSA-Form Programs" by Braun and Hack, although again care has to be
19  *    taken to handle combining split/collect instructions.
20  */
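/* Illustrative sketch, not taken from the original sources: when a value has
 * to be evicted, the pass stores it to private memory with OPC_SPILL_MACRO and
 * brings it back with OPC_RELOAD_MACRO right before its next use, roughly:
 *
 *    ssa_5 = ...
 *    @spill_macro [base], ssa_5, #elems      ; cat6.dst_offset = spill slot
 *    ...                                     ; ssa_5 no longer held in a reg
 *    ssa_6 = @reload_macro [base], #slot, #elems
 *    ... = ssa_6
 *
 * The operands shown are only meant to convey the shape of the transformation;
 * the exact form is whatever spill()/reload() below emit.
 */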
21 
22 struct reg_or_immed {
23    unsigned flags;
24    union {
25       struct ir3_register *def;
26       uint32_t uimm;
27       unsigned const_num;
28    };
29 };
30 
31 struct ra_spill_interval {
32    struct ir3_reg_interval interval;
33 
34    struct rb_node node;
35    struct rb_node half_node;
36 
37    /* The current SSA value/const/immed this source is mapped to. */
38    struct reg_or_immed dst;
39 
40    /* When computing use distances we use the distance relative to the start
41     * of the block. So, for example, a value that's defined in cycle 5 of the
42     * block and used 6 cycles later will always have a next_use_distance of 11
43     * until we reach that use.
44     */
45    unsigned next_use_distance;
46 
47    /* Whether this value was reloaded and therefore doesn't need to be
48     * spilled again. Corresponds to the S set in the paper.
49     */
50    bool already_spilled;
51 
52    /* We need to add sources early for accounting purposes, but we have to
53     * insert the reload code for them last. Keep track of whether this interval
54     * needs to be reloaded later.
55     */
56    bool needs_reload;
57 
58    /* Keep track of whether this interval currently can't be spilled because:
59     * - It or one of its children is a source and we're making space for
60     *   sources.
61     * - It is a destination and we're making space for destinations.
62     */
63    bool cant_spill;
64 
65    /* Whether this interval can be rematerialized. */
66    bool can_rematerialize;
67 };
68 
69 struct ra_spill_block_state {
70    unsigned *next_use_end;
71    unsigned *next_use_start;
72 
73    unsigned cycles;
74 
75    /* Map from SSA def to reg_or_immed it is mapped to at the end of the block.
76     * This map only contains values which we didn't spill, so it also serves as
77     * a record of the new live-out set for this block.
78     */
79    struct hash_table *remap;
80 
81    /* For blocks whose successors are visited first (i.e. loop backedges), which
82     * values should be live at the end.
83     */
84    BITSET_WORD *live_out;
85 
86    bool visited;
87 };
88 
89 struct ra_spill_ctx {
90    struct ir3_reg_ctx reg_ctx;
91 
92    struct ra_spill_interval **intervals;
93    unsigned intervals_count;
94 
95    /* rb tree of live intervals that we can spill, ordered by next-use distance.
96  * full_live_intervals contains the full+half intervals in the merged_regs
97     * case. We use this list to determine what to spill.
98     */
99    struct rb_tree full_live_intervals;
100    struct rb_tree half_live_intervals;
101 
102    struct ir3_pressure cur_pressure, max_pressure;
103 
104    struct ir3_pressure limit_pressure;
105 
106    /* When spilling, we need to reserve a register to serve as the zero'd
107     * "base". For simplicity we reserve a register at the beginning so that it's
108     * always available.
109     */
110    struct ir3_register *base_reg;
111 
112    /* Current pvtmem offset in bytes. */
113    unsigned spill_slot;
114 
115    struct ir3_liveness *live;
116 
117    const struct ir3_compiler *compiler;
118 
119    struct ra_spill_block_state *blocks;
120 
121    bool spilling;
122 
123    bool merged_regs;
124 };
125 
126 static void
127 add_base_reg(struct ra_spill_ctx *ctx, struct ir3 *ir)
128 {
129    struct ir3_block *start = ir3_start_block(ir);
130    struct ir3_builder build = ir3_builder_at(ir3_after_block(start));
131 
132    /* We need to stick it after any meta instructions which need to be first. */
133    struct ir3_instruction *after = NULL;
134    foreach_instr (instr, &start->instr_list) {
135       if (instr->opc != OPC_META_INPUT &&
136           instr->opc != OPC_META_TEX_PREFETCH) {
137          after = instr;
138          break;
139       }
140    }
141 
142    struct ir3_instruction *mov = create_immed(&build, 0);
143 
144    if (after)
145       ir3_instr_move_before(mov, after);
146 
147    ctx->base_reg = mov->dsts[0];
148 
149    /* We don't create an interval, etc. for the base reg, so just lower the
150     * register pressure limit to account for it. We assume it's always
151     * available for simplicity.
152     */
153    ctx->limit_pressure.full -= reg_size(ctx->base_reg);
154 }
155 
156 
157 /* Compute the number of cycles per instruction used for next-use-distance
158  * analysis. This is just approximate, obviously.
159  */
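/* For example (reading the cases handled below): a plain ALU instruction with
 * (rpt3) counts as 1 + 3 = 4 cycles, meta instructions count as 0, and a
 * collect or parallel copy only counts the sources that actually need a copy
 * because they aren't coalesced into the destination's merge set.
 */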
160 static unsigned
161 instr_cycles(struct ir3_instruction *instr)
162 {
163    if (instr->opc == OPC_META_PARALLEL_COPY) {
164       unsigned cycles = 0;
165       for (unsigned i = 0; i < instr->dsts_count; i++) {
166          if (!instr->srcs[i]->def ||
167              instr->srcs[i]->def->merge_set != instr->dsts[i]->merge_set) {
168             cycles += reg_elems(instr->srcs[i]);
169          }
170       }
171 
172       return cycles;
173    }
174 
175    if (instr->opc == OPC_META_COLLECT) {
176       unsigned cycles = 0;
177       for (unsigned i = 0; i < instr->srcs_count; i++) {
178          if (!instr->srcs[i]->def ||
179              instr->srcs[i]->def->merge_set != instr->dsts[0]->merge_set) {
180             cycles++;
181          }
182       }
183 
184       return cycles;
185    }
186 
187    if (is_meta(instr))
188       return 0;
189 
190    return 1 + instr->repeat;
191 }
192 
193 static bool
194 compute_block_next_distance(struct ra_spill_ctx *ctx, struct ir3_block *block,
195                             unsigned *tmp_next_use)
196 {
197    struct ra_spill_block_state *state = &ctx->blocks[block->index];
198    memcpy(tmp_next_use, state->next_use_end,
199           ctx->live->definitions_count * sizeof(*tmp_next_use));
200 
201    unsigned cycle = state->cycles;
202    foreach_instr_rev (instr, &block->instr_list) {
203       ra_foreach_dst (dst, instr) {
204          dst->next_use = tmp_next_use[dst->name];
205       }
206 
207       ra_foreach_src (src, instr) {
208          src->next_use = tmp_next_use[src->def->name];
209       }
210 
211       cycle -= instr_cycles(instr);
212 
213       if (instr->opc == OPC_META_PARALLEL_COPY) {
214          ra_foreach_src_n (src, i, instr) {
215             if (src->def->merge_set == instr->dsts[i]->merge_set &&
216                 src->def->merge_set_offset == instr->dsts[i]->merge_set_offset) {
217                tmp_next_use[src->def->name] =
218                   tmp_next_use[instr->dsts[i]->name];
219             } else {
220                tmp_next_use[src->def->name] = cycle;
221             }
222          }
223       } else if (instr->opc != OPC_META_PHI) {
224          ra_foreach_src (src, instr) {
225             tmp_next_use[src->def->name] = cycle;
226          }
227       }
228 
229       ra_foreach_dst (dst, instr) {
230          tmp_next_use[dst->name] = UINT_MAX;
231       }
232    }
233 
234    memcpy(state->next_use_start, tmp_next_use,
235           ctx->live->definitions_count * sizeof(*tmp_next_use));
236 
237    bool progress = false;
238    for (unsigned i = 0; i < block->predecessors_count; i++) {
239       const struct ir3_block *pred = block->predecessors[i];
240       struct ra_spill_block_state *pred_state = &ctx->blocks[pred->index];
241 
242       /* Add a large-enough distance in front of edges exiting the loop so that
243        * variables that are live-through the loop but not used inside it are
244        * prioritized for spilling, as per the paper. This just needs to be
245        * larger than the longest path through the loop.
246        */
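      /* For instance, with the constant below a use that is 10 cycles past the
       * loop appears at least 100010 cycles away when seen from inside the
       * loop, dwarfing the distance to any use within the loop itself.
       */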
247       bool loop_exit = pred->loop_depth > block->loop_depth;
248       unsigned block_distance = pred_state->cycles + (loop_exit ? 100000 : 0);
249 
250       for (unsigned j = 0; j < ctx->live->definitions_count; j++) {
251          if (state->next_use_start[j] < UINT_MAX &&
252              state->next_use_start[j] + block_distance <
253              pred_state->next_use_end[j]) {
254             pred_state->next_use_end[j] = state->next_use_start[j] +
255                block_distance;
256             progress = true;
257          }
258       }
259 
260       foreach_instr (phi, &block->instr_list) {
261          if (phi->opc != OPC_META_PHI)
262             break;
263          if (!phi->srcs[i]->def)
264             continue;
265          unsigned src = phi->srcs[i]->def->name;
266          if (phi->dsts[0]->next_use < UINT_MAX &&
267              phi->dsts[0]->next_use + block_distance <
268              pred_state->next_use_end[src]) {
269             pred_state->next_use_end[src] = phi->dsts[0]->next_use +
270                block_distance;
271             progress = true;
272          }
273       }
274    }
275 
276    return progress;
277 }
278 
279 static void
280 compute_next_distance(struct ra_spill_ctx *ctx, struct ir3 *ir)
281 {
282    for (unsigned i = 0; i < ctx->live->block_count; i++) {
283       ctx->blocks[i].next_use_start =
284          ralloc_array(ctx, unsigned, ctx->live->definitions_count);
285       ctx->blocks[i].next_use_end =
286          ralloc_array(ctx, unsigned, ctx->live->definitions_count);
287 
288       for (unsigned j = 0; j < ctx->live->definitions_count; j++) {
289          ctx->blocks[i].next_use_start[j] = UINT_MAX;
290          ctx->blocks[i].next_use_end[j] = UINT_MAX;
291       }
292    }
293 
294    foreach_block (block, &ir->block_list) {
295       struct ra_spill_block_state *state = &ctx->blocks[block->index];
296       state->cycles = 0;
297       foreach_instr (instr, &block->instr_list) {
298          state->cycles += instr_cycles(instr);
299          foreach_dst (dst, instr) {
300             dst->spill_slot = ~0;
301          }
302       }
303    }
304 
305    unsigned *tmp_next_use =
306       ralloc_array(ctx, unsigned, ctx->live->definitions_count);
307 
308    bool progress = true;
309    while (progress) {
310       progress = false;
311       foreach_block_rev (block, &ir->block_list) {
312          progress |= compute_block_next_distance(ctx, block, tmp_next_use);
313       }
314    }
315 }
316 
317 static bool
318 can_rematerialize(struct ir3_register *reg)
319 {
320    if (reg->flags & IR3_REG_ARRAY)
321       return false;
322    if (reg->instr->opc != OPC_MOV)
323       return false;
324    if (!(reg->instr->srcs[0]->flags & (IR3_REG_IMMED | IR3_REG_CONST)))
325       return false;
326    if (reg->instr->srcs[0]->flags & IR3_REG_RELATIV)
327       return false;
328    return true;
329 }
330 
331 static struct ir3_register *
332 rematerialize(struct ir3_register *reg, struct ir3_cursor cursor)
333 {
334    d("rematerializing ssa_%u:%u", reg->instr->serialno, reg->name);
335 
336    struct ir3_instruction *remat =
337       ir3_instr_create_at(cursor, reg->instr->opc, 1, reg->instr->srcs_count);
338    struct ir3_register *dst = __ssa_dst(remat);
339    dst->flags |= reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
340    for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
341       struct ir3_register *src =
342          ir3_src_create(remat, INVALID_REG, reg->instr->srcs[i]->flags);
343       *src = *reg->instr->srcs[i];
344    }
345 
346    remat->cat1 = reg->instr->cat1;
347 
348    dst->merge_set = reg->merge_set;
349    dst->merge_set_offset = reg->merge_set_offset;
350    dst->interval_start = reg->interval_start;
351    dst->interval_end = reg->interval_end;
352    return dst;
353 }
354 
355 static void
356 ra_spill_interval_init(struct ra_spill_interval *interval,
357                        struct ir3_register *reg)
358 {
359    ir3_reg_interval_init(&interval->interval, reg);
360    interval->dst.flags = reg->flags;
361    interval->dst.def = reg;
362    interval->already_spilled = false;
363    interval->needs_reload = false;
364    interval->cant_spill = false;
365    interval->can_rematerialize = can_rematerialize(reg);
366 }
367 
368 static struct ra_spill_interval *
369 ir3_reg_interval_to_interval(struct ir3_reg_interval *interval)
370 {
371    return rb_node_data(struct ra_spill_interval, interval, interval);
372 }
373 
374 static struct ra_spill_interval *
375 ra_spill_interval_root(struct ra_spill_interval *interval)
376 {
377    struct ir3_reg_interval *ir3_interval = &interval->interval;
378    while (ir3_interval->parent)
379       ir3_interval = ir3_interval->parent;
380    return ir3_reg_interval_to_interval(ir3_interval);
381 }
382 
383 static struct ra_spill_ctx *
384 ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx)
385 {
386    return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx);
387 }
388 
389 static int
390 spill_interval_cmp(const struct ra_spill_interval *a,
391                    const struct ra_spill_interval *b)
392 {
393    /* Prioritize intervals that we can rematerialize. */
394    if (a->can_rematerialize && !b->can_rematerialize)
395       return 1;
396    if (!a->can_rematerialize && b->can_rematerialize)
397       return -1;
398 
399    return a->next_use_distance - b->next_use_distance;
400 }
401 
402 static int
403 ra_spill_interval_cmp(const struct rb_node *_a, const struct rb_node *_b)
404 {
405    const struct ra_spill_interval *a =
406       rb_node_data(const struct ra_spill_interval, _a, node);
407    const struct ra_spill_interval *b =
408       rb_node_data(const struct ra_spill_interval, _b, node);
409    return spill_interval_cmp(a, b);
410 }
411 
412 static int
413 ra_spill_interval_half_cmp(const struct rb_node *_a, const struct rb_node *_b)
414 {
415    const struct ra_spill_interval *a =
416       rb_node_data(const struct ra_spill_interval, _a, half_node);
417    const struct ra_spill_interval *b =
418       rb_node_data(const struct ra_spill_interval, _b, half_node);
419    return spill_interval_cmp(a, b);
420 }
421 
422 static void
423 interval_add(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
424 {
425    struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
426    struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
427 
428    unsigned size = reg_size(interval->interval.reg);
429    if (interval->interval.reg->flags & IR3_REG_SHARED) {
430       ctx->cur_pressure.shared += size;
431       if (interval->interval.reg->flags & IR3_REG_HALF)
432          ctx->cur_pressure.shared_half += size;
433    } else {
434       if (interval->interval.reg->flags & IR3_REG_HALF) {
435          ctx->cur_pressure.half += size;
436          if (ctx->spilling) {
437             rb_tree_insert(&ctx->half_live_intervals, &interval->half_node,
438                            ra_spill_interval_half_cmp);
439          }
440       }
441       if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) {
442          ctx->cur_pressure.full += size;
443          if (ctx->spilling) {
444             rb_tree_insert(&ctx->full_live_intervals, &interval->node,
445                            ra_spill_interval_cmp);
446          }
447       }
448    }
449 }
450 
451 static void
452 interval_delete(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
453 {
454    struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
455    struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
456 
457    unsigned size = reg_size(interval->interval.reg);
458    if (interval->interval.reg->flags & IR3_REG_SHARED) {
459       ctx->cur_pressure.shared -= size;
460       if (interval->interval.reg->flags & IR3_REG_HALF)
461          ctx->cur_pressure.shared_half -= size;
462    } else {
463       if (interval->interval.reg->flags & IR3_REG_HALF) {
464          ctx->cur_pressure.half -= size;
465          if (ctx->spilling) {
466             rb_tree_remove(&ctx->half_live_intervals, &interval->half_node);
467          }
468       }
469       if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) {
470          ctx->cur_pressure.full -= size;
471          if (ctx->spilling) {
472             rb_tree_remove(&ctx->full_live_intervals, &interval->node);
473          }
474       }
475    }
476 }
477 
478 static void
479 interval_readd(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_parent,
480                struct ir3_reg_interval *_child)
481 {
482    interval_add(_ctx, _child);
483 }
484 
485 static void
486 spill_ctx_init(struct ra_spill_ctx *ctx, struct ir3_shader_variant *v,
487                struct ir3_liveness *live)
488 {
489    ctx->live = live;
490    ctx->intervals = ralloc_array(ctx, struct ra_spill_interval *,
491                                  ctx->live->definitions_count);
492    struct ra_spill_interval *intervals =
493       rzalloc_array(ctx, struct ra_spill_interval,
494                     ctx->live->definitions_count);
495    for (unsigned i = 0; i < ctx->live->definitions_count; i++)
496       ctx->intervals[i] = &intervals[i];
497 
498    ctx->intervals_count = ctx->live->definitions_count;
499    ctx->compiler = v->compiler;
500    ctx->merged_regs = v->mergedregs;
501 
502    rb_tree_init(&ctx->reg_ctx.intervals);
503    ctx->reg_ctx.interval_add = interval_add;
504    ctx->reg_ctx.interval_delete = interval_delete;
505    ctx->reg_ctx.interval_readd = interval_readd;
506 }
507 
508 static void
509 ra_spill_ctx_insert(struct ra_spill_ctx *ctx,
510                     struct ra_spill_interval *interval)
511 {
512    ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
513 }
514 
515 static void
516 ra_spill_ctx_remove(struct ra_spill_ctx *ctx,
517                     struct ra_spill_interval *interval)
518 {
519    ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
520 }
521 
522 static void
523 init_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
524 {
525    struct ra_spill_interval *interval = ctx->intervals[dst->name];
526    ra_spill_interval_init(interval, dst);
527    if (ctx->spilling) {
528       interval->next_use_distance = dst->next_use;
529 
530       /* We only need to keep track of used-ness if this value may be
531        * rematerialized. This also keeps us from nuking things that may be
532        * in the keeps list (e.g. atomics, input splits).
533        */
534       if (interval->can_rematerialize)
535          dst->instr->flags |= IR3_INSTR_UNUSED;
536    }
537 }
538 
539 static void
540 insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
541 {
542    struct ra_spill_interval *interval = ctx->intervals[dst->name];
543    if (interval->interval.inserted)
544       return;
545 
546    ra_spill_ctx_insert(ctx, interval);
547    interval->cant_spill = true;
548 
549    /* For precolored inputs, make sure we leave enough registers to allow for
550     * holes in the inputs. It can happen that the binning shader has a lower
551     * register pressure than the main shader, but the main shader decided to
552     * add holes between the inputs which means that the binning shader has a
553     * higher register demand.
554     */
555    if (dst->instr->opc == OPC_META_INPUT && dst->num != INVALID_REG) {
556       physreg_t physreg = ra_reg_get_physreg(dst);
557       physreg_t max = physreg + reg_size(dst);
558 
559       if (interval->interval.reg->flags & IR3_REG_SHARED) {
560          ctx->max_pressure.shared = MAX2(ctx->max_pressure.shared, max);
561          if (interval->interval.reg->flags & IR3_REG_HALF) {
562             ctx->max_pressure.shared_half = MAX2(ctx->max_pressure.shared_half,
563                                                  max);
564          }
565       } else if (interval->interval.reg->flags & IR3_REG_HALF) {
566          ctx->max_pressure.half = MAX2(ctx->max_pressure.half, max);
567       } else {
568          ctx->max_pressure.full = MAX2(ctx->max_pressure.full, max);
569       }
570    }
571 }
572 
573 static void
574 insert_src(struct ra_spill_ctx *ctx, struct ir3_register *src)
575 {
576    struct ra_spill_interval *interval = ctx->intervals[src->def->name];
577 
578    if (!interval->interval.inserted) {
579       ra_spill_ctx_insert(ctx, interval);
580       interval->needs_reload = true;
581       interval->already_spilled = true;
582    }
583 
584    ra_spill_interval_root(interval)->cant_spill = true;
585 
586 }
587 
588 static void
589 remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
590                  struct ir3_register *src)
591 {
592    struct ra_spill_interval *interval = ctx->intervals[src->def->name];
593 
594    if (ctx->spilling) {
595       /* It might happen that a collect that cannot be coalesced with one of its
596        * sources while spilling can be coalesced with it afterwards. In this
597        * case, we might be able to remove it here during spilling but not
598        * afterwards (because it may have a child interval). If this happens, we
599        * could end up with a register pressure that is higher after spilling
600        * than before. Prevent this by never removing collects early while
601        * spilling.
602        */
603       if (src->def->instr->opc == OPC_META_COLLECT)
604          return;
605    }
606 
607    if (!interval->interval.inserted || interval->interval.parent ||
608        !rb_tree_is_empty(&interval->interval.children))
609       return;
610 
611    ra_spill_ctx_remove(ctx, interval);
612 }
613 
614 static void
615 remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
616            struct ir3_register *src)
617 {
618    struct ra_spill_interval *interval = ctx->intervals[src->def->name];
619 
620    if (!interval->interval.inserted)
621       return;
622 
623    ra_spill_ctx_remove(ctx, interval);
624 }
625 
626 static void
627 finish_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
628 {
629    struct ra_spill_interval *interval = ctx->intervals[dst->name];
630    interval->cant_spill = false;
631 }
632 
633 static void
634 remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
635 {
636    struct ra_spill_interval *interval = ctx->intervals[dst->name];
637 
638    if (!interval->interval.inserted)
639       return;
640 
641    ra_spill_ctx_remove(ctx, interval);
642 }
643 
644 static void
645 update_src_next_use(struct ra_spill_ctx *ctx, struct ir3_register *src)
646 {
647    struct ra_spill_interval *interval = ctx->intervals[src->def->name];
648 
649    assert(interval->interval.inserted);
650 
651    interval->next_use_distance = src->next_use;
652 
653    /* If this node is inserted in one of the trees, then it needs to be resorted
654     * as its key has changed.
655     */
656    if (!interval->interval.parent && !(src->flags & IR3_REG_SHARED)) {
657       if (src->flags & IR3_REG_HALF) {
658          rb_tree_remove(&ctx->half_live_intervals, &interval->half_node);
659          rb_tree_insert(&ctx->half_live_intervals, &interval->half_node,
660                         ra_spill_interval_half_cmp);
661       }
662       if (ctx->merged_regs || !(src->flags & IR3_REG_HALF)) {
663          rb_tree_remove(&ctx->full_live_intervals, &interval->node);
664          rb_tree_insert(&ctx->full_live_intervals, &interval->node,
665                         ra_spill_interval_cmp);
666       }
667    }
668 }
669 
670 static unsigned
671 get_spill_slot(struct ra_spill_ctx *ctx, struct ir3_register *reg)
672 {
673    if (reg->merge_set) {
674       if (reg->merge_set->spill_slot == ~0) {
675          reg->merge_set->spill_slot = ALIGN_POT(ctx->spill_slot,
676                                                 reg->merge_set->alignment * 2);
677          ctx->spill_slot = reg->merge_set->spill_slot + reg->merge_set->size * 2;
678       }
679       return reg->merge_set->spill_slot + reg->merge_set_offset * 2;
680    } else {
681       if (reg->spill_slot == ~0) {
682          reg->spill_slot = ALIGN_POT(ctx->spill_slot, reg_elem_size(reg) * 2);
683          ctx->spill_slot = reg->spill_slot + reg_size(reg) * 2;
684       }
685       return reg->spill_slot;
686    }
687 }
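/* A note on the arithmetic above, assuming reg_size()/reg_elem_size() count in
 * half-register (16-bit) units as elsewhere in RA: the "* 2" converts those
 * units to the byte offsets used for pvtmem. E.g. a full-precision vec4 with
 * no merge set gets a slot aligned to 2 * 2 = 4 bytes and advances
 * ctx->spill_slot by 8 * 2 = 16 bytes.
 */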
688 
689 static void
690 set_src_val(struct ir3_register *src, const struct reg_or_immed *val)
691 {
692    if (val->flags & IR3_REG_IMMED) {
693       src->flags = IR3_REG_IMMED | (val->flags & IR3_REG_HALF);
694       src->uim_val = val->uimm;
695       src->def = NULL;
696    } else if (val->flags & IR3_REG_CONST) {
697       src->flags = IR3_REG_CONST | (val->flags & IR3_REG_HALF);
698       src->num = val->const_num;
699       src->def = NULL;
700    } else {
701       src->def = val->def;
702       val->def->instr->flags &= ~IR3_INSTR_UNUSED;
703    }
704 }
705 
706 static struct ir3_register *
707 materialize_pcopy_src(const struct reg_or_immed *src,
708                       struct ir3_builder *builder)
709 {
710    struct ir3_instruction *mov = ir3_build_instr(builder, OPC_MOV, 1, 1);
711    struct ir3_register *dst = __ssa_dst(mov);
712    dst->flags |= src->flags & IR3_REG_HALF;
713    struct ir3_register *mov_src = ir3_src_create(mov, INVALID_REG, src->flags);
714    set_src_val(mov_src, src);
715    mov->cat1.src_type = mov->cat1.dst_type =
716       (src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
717    return dst;
718 }
719 
720 static void
721 spill(struct ra_spill_ctx *ctx, const struct reg_or_immed *val,
722       unsigned spill_slot, struct ir3_cursor cursor)
723 {
724    struct ir3_register *reg;
725    struct ir3_builder builder = ir3_builder_at(cursor);
726 
727    /* If spilling an immed/const pcopy src, we need to actually materialize it
728     * first with a mov.
729     */
730    if (val->flags & (IR3_REG_CONST | IR3_REG_IMMED)) {
731       reg = materialize_pcopy_src(val, &builder);
732    } else {
733       reg = val->def;
734       reg->instr->flags &= ~IR3_INSTR_UNUSED;
735    }
736 
737    d("spilling ssa_%u:%u to %u", reg->instr->serialno, reg->name,
738      spill_slot);
739 
740    unsigned elems = reg_elems(reg);
741    struct ir3_instruction *spill =
742       ir3_build_instr(&builder, OPC_SPILL_MACRO, 0, 3);
743    ir3_src_create(spill, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg;
744    unsigned src_flags = reg->flags & (IR3_REG_HALF | IR3_REG_IMMED |
745                                       IR3_REG_CONST | IR3_REG_SSA |
746                                       IR3_REG_ARRAY);
747    struct ir3_register *src = ir3_src_create(spill, INVALID_REG, src_flags);
748    ir3_src_create(spill, INVALID_REG, IR3_REG_IMMED)->uim_val = elems;
749    spill->cat6.dst_offset = spill_slot;
750    spill->cat6.type = (reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
751 
752    src->def = reg;
753    if (reg->flags & IR3_REG_ARRAY) {
754       src->size = reg->size;
755       src->array.id = reg->array.id;
756       src->array.offset = 0;
757    } else {
758       src->wrmask = reg->wrmask;
759    }
760 }
761 
762 static void
763 spill_interval(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval,
764                struct ir3_cursor cursor)
765 {
766    if (interval->can_rematerialize && !interval->interval.reg->merge_set)
767       return;
768 
769    spill(ctx, &interval->dst, get_spill_slot(ctx, interval->interval.reg),
770          cursor);
771 }
772 
773 /* This is similar to "limit" in the paper. */
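/* Sketch of the contract, as implemented below: while cur_pressure exceeds
 * limit_pressure, walk the live intervals in spill priority order (preferring
 * rematerializable values and far-away next uses), spill each one that isn't
 * currently pinned by cant_spill, and remove it from the live tree until the
 * pressure fits again.
 */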
774 static void
775 limit(struct ra_spill_ctx *ctx, struct ir3_cursor cursor)
776 {
777    if (ctx->cur_pressure.half > ctx->limit_pressure.half) {
778       d("cur half pressure %u exceeds %u", ctx->cur_pressure.half,
779         ctx->limit_pressure.half);
780       rb_tree_foreach_safe (struct ra_spill_interval, interval,
781                             &ctx->half_live_intervals, half_node) {
782          d("trying ssa_%u:%u", interval->interval.reg->instr->serialno,
783            interval->interval.reg->name);
784          if (!interval->cant_spill) {
785             if (!interval->already_spilled)
786                spill_interval(ctx, interval, cursor);
787             ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
788             if (ctx->cur_pressure.half <= ctx->limit_pressure.half)
789                break;
790          }
791       }
792 
793       assert(ctx->cur_pressure.half <= ctx->limit_pressure.half);
794    }
795 
796    if (ctx->cur_pressure.full > ctx->limit_pressure.full) {
797       d("cur full pressure %u exceeds %u", ctx->cur_pressure.full,
798         ctx->limit_pressure.full);
799       rb_tree_foreach_safe (struct ra_spill_interval, interval,
800                             &ctx->full_live_intervals, node) {
801          d("trying ssa_%u:%u", interval->interval.reg->instr->serialno,
802            interval->interval.reg->name);
803          if (!interval->cant_spill) {
804             if (!interval->already_spilled)
805                spill_interval(ctx, interval, cursor);
806             ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
807             if (ctx->cur_pressure.full <= ctx->limit_pressure.full)
808                break;
809          } else {
810             d("can't spill");
811          }
812       }
813 
814       assert(ctx->cur_pressure.full <= ctx->limit_pressure.full);
815    }
816 }
817 
818 /* There's a corner case where we reload a value which has overlapping live
819  * values already reloaded, either because it's the child of some other interval
820  * that was already reloaded or some of its children have already been
821  * reloaded. Because RA only expects overlapping source/dest intervals for meta
822  * instructions (split/collect), and we don't want to add register pressure by
823  * creating an entirely separate value, we need to add splits and collects to
824  * deal with this case. These splits/collects have to also have correct merge
825  * set information, so that it doesn't result in any actual code or register
826  * pressure in practice.
827  */
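/* Hypothetical illustration (the names are made up): suppose a vec2 ssa_10 was
 * spilled, but one of its components had already been reloaded as ssa_11 and
 * is still live. When ssa_10 itself is reloaded, its interval overlaps ssa_11,
 * so rather than creating an unrelated value we split()/extract() the pieces
 * out of the reloaded parent and give them the parent's merge set info, which
 * lets RA coalesce everything into the same registers with no extra moves.
 */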
828 
829 static void
830 add_to_merge_set(struct ir3_merge_set *set, struct ir3_register *def,
831                  unsigned offset)
832 {
833    def->merge_set = set;
834    def->merge_set_offset = offset;
835    def->interval_start = set->interval_start + offset;
836    def->interval_end = set->interval_start + offset + reg_size(def);
837 }
838 
839 static struct ir3_register *
840 split(struct ir3_register *def, unsigned offset, struct ir3_builder *builder)
841 {
842    if (reg_elems(def) == 1) {
843       assert(offset == 0);
844       return def;
845    }
846 
847    assert(!(def->flags & IR3_REG_ARRAY));
848    assert(def->merge_set);
849    struct ir3_instruction *split =
850       ir3_build_instr(builder, OPC_META_SPLIT, 1, 1);
851    split->split.off = offset;
852    struct ir3_register *dst = __ssa_dst(split);
853    dst->flags |= def->flags & IR3_REG_HALF;
854    struct ir3_register *src = ir3_src_create(split, INVALID_REG, def->flags);
855    src->wrmask = def->wrmask;
856    src->def = def;
857    add_to_merge_set(def->merge_set, dst,
858                     def->merge_set_offset + offset * reg_elem_size(def));
859    return dst;
860 }
861 
862 static struct ir3_register *
863 extract(struct ir3_register *parent_def, unsigned offset, unsigned elems,
864         struct ir3_cursor cursor)
865 {
866    if (offset == 0 && elems == reg_elems(parent_def))
867       return parent_def;
868 
869    struct ir3_builder builder = ir3_builder_at(cursor);
870    struct ir3_register *srcs[elems];
871    for (unsigned i = 0; i < elems; i++) {
872       srcs[i] = split(parent_def, offset + i, &builder);
873    }
874 
875    struct ir3_instruction *collect =
876       ir3_build_instr(&builder, OPC_META_COLLECT, 1, elems);
877    struct ir3_register *dst = __ssa_dst(collect);
878    dst->flags |= parent_def->flags & IR3_REG_HALF;
879    dst->wrmask = MASK(elems);
880    add_to_merge_set(parent_def->merge_set, dst, parent_def->merge_set_offset);
881 
882    for (unsigned i = 0; i < elems; i++) {
883       ir3_src_create(collect, INVALID_REG, parent_def->flags)->def = srcs[i];
884    }
885 
886    return dst;
887 }
888 
889 static struct ir3_register *
890 reload(struct ra_spill_ctx *ctx, struct ir3_register *reg,
891        struct ir3_cursor cursor)
892 {
893    unsigned spill_slot = get_spill_slot(ctx, reg);
894 
895    d("reloading ssa_%u:%u from %u", reg->instr->serialno, reg->name,
896      spill_slot);
897 
898    unsigned elems = reg_elems(reg);
899    struct ir3_instruction *reload =
900       ir3_instr_create_at(cursor, OPC_RELOAD_MACRO, 1, 3);
901    struct ir3_register *dst = __ssa_dst(reload);
902    dst->flags |= reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
903    /* The reload may be split into multiple pieces, and if the destination
904     * overlaps with the base register then it could get clobbered before the
905     * last ldp in the sequence. Note that we always reserve space for the base
906     * register throughout the whole program, so effectively extending its live
907     * range past the end of the instruction isn't a problem for our pressure
908     * accounting.
909     */
910    dst->flags |= IR3_REG_EARLY_CLOBBER;
911    ir3_src_create(reload, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg;
912    struct ir3_register *offset_reg =
913       ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED);
914    offset_reg->uim_val = spill_slot;
915    ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED)->uim_val = elems;
916    reload->cat6.type = (reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
917 
918    if (reg->flags & IR3_REG_ARRAY) {
919       dst->array.offset = 0;
920       dst->array.id = reg->array.id;
921       dst->size = reg->size;
922    } else {
923       dst->wrmask = reg->wrmask;
924    }
925 
926    dst->merge_set = reg->merge_set;
927    dst->merge_set_offset = reg->merge_set_offset;
928    dst->interval_start = reg->interval_start;
929    dst->interval_end = reg->interval_end;
930    return dst;
931 }
932 
933 static void
934 rewrite_src_interval(struct ra_spill_ctx *ctx,
935                     struct ra_spill_interval *interval,
936                     struct ir3_register *def,
937                     struct ir3_cursor cursor)
938 {
939    interval->dst.flags = def->flags;
940    interval->dst.def = def;
941    interval->needs_reload = false;
942 
943    rb_tree_foreach (struct ra_spill_interval, child,
944                     &interval->interval.children, interval.node) {
945       struct ir3_register *child_reg = child->interval.reg;
946       struct ir3_register *child_def =
947          extract(def, (child_reg->interval_start -
948                        interval->interval.reg->interval_start) / reg_elem_size(def),
949                  reg_elems(child_reg), cursor);
950       rewrite_src_interval(ctx, child, child_def, cursor);
951    }
952 }
953 
954 static void
955 reload_def(struct ra_spill_ctx *ctx, struct ir3_register *def,
956            struct ir3_cursor cursor)
957 {
958    unsigned elems = reg_elems(def);
959    struct ra_spill_interval *interval = ctx->intervals[def->name];
960 
961    struct ir3_reg_interval *ir3_parent = interval->interval.parent;
962 
963    if (ir3_parent) {
964       struct ra_spill_interval *parent =
965          ir3_reg_interval_to_interval(ir3_parent);
966       if (!parent->needs_reload) {
967          interval->dst.flags = def->flags;
968          interval->dst.def = extract(
969             parent->dst.def, (def->interval_start - parent->dst.def->interval_start) /
970             reg_elem_size(def), elems, cursor);
971          return;
972       }
973    }
974 
975    struct ir3_register *dst;
976    if (interval->can_rematerialize)
977       dst = rematerialize(def, cursor);
978    else
979       dst = reload(ctx, def, cursor);
980 
981    rewrite_src_interval(ctx, interval, dst, cursor);
982 }
983 
984 static void
985 reload_src(struct ra_spill_ctx *ctx, struct ir3_cursor cursor,
986             struct ir3_register *src)
987 {
988    struct ra_spill_interval *interval = ctx->intervals[src->def->name];
989 
990    if (interval->needs_reload) {
991       reload_def(ctx, src->def, cursor);
992    }
993 
994    ra_spill_interval_root(interval)->cant_spill = false;
995 }
996 
997 static void
998 rewrite_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
999             struct ir3_register *src)
1000 {
1001    struct ra_spill_interval *interval = ctx->intervals[src->def->name];
1002 
1003    set_src_val(src, &interval->dst);
1004 }
1005 
1006 static void
1007 update_max_pressure(struct ra_spill_ctx *ctx)
1008 {
1009    d("pressure:");
1010    d("\tfull: %u", ctx->cur_pressure.full);
1011    d("\thalf: %u", ctx->cur_pressure.half);
1012    d("\tshared: %u", ctx->cur_pressure.shared);
1013    d("\tshared half: %u", ctx->cur_pressure.shared_half);
1014 
1015    ctx->max_pressure.full =
1016       MAX2(ctx->max_pressure.full, ctx->cur_pressure.full);
1017    ctx->max_pressure.half =
1018       MAX2(ctx->max_pressure.half, ctx->cur_pressure.half);
1019    ctx->max_pressure.shared =
1020       MAX2(ctx->max_pressure.shared, ctx->cur_pressure.shared);
1021    ctx->max_pressure.shared_half =
1022       MAX2(ctx->max_pressure.shared_half, ctx->cur_pressure.shared_half);
1023 }
1024 
1025 static void
1026 handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
1027 {
1028    ra_foreach_dst (dst, instr) {
1029       /* No need to handle instructions inserted while spilling. Most are
1030        * ignored automatically by virtue of being inserted before the current
1031        * instruction. However, for live-ins, we may insert extracts after the
1032        * phis. Trying to handle them ends badly as they don't have intervals
1033        * allocated.
1034        * Note: since liveness is calculated before spilling, original
1035        * instructions have a name while new ones don't.
1036        */
1037       if (!dst->name)
1038          return;
1039 
1040       init_dst(ctx, dst);
1041    }
1042 
1043    if (ctx->spilling) {
1044       ra_foreach_src (src, instr)
1045          insert_src(ctx, src);
1046    }
1047 
1048    /* Handle tied and early-kill destinations. If a destination is tied to a
1049     * source and that source is live-through, then we need to allocate a new
1050     * register for the destination which is live-through itself and cannot
1051     * overlap the sources. Similarly early-kill destinations cannot overlap
1052     * sources.
1053     */
1054 
1055    ra_foreach_dst (dst, instr) {
1056       struct ir3_register *tied_src = dst->tied;
1057       if ((tied_src && !(tied_src->flags & IR3_REG_FIRST_KILL)) ||
1058           (dst->flags & IR3_REG_EARLY_CLOBBER))
1059          insert_dst(ctx, dst);
1060    }
1061 
1062    if (ctx->spilling)
1063       limit(ctx, ir3_before_instr(instr));
1064    else
1065       update_max_pressure(ctx);
1066 
1067    if (ctx->spilling) {
1068       ra_foreach_src (src, instr) {
1069          reload_src(ctx, ir3_before_instr(instr), src);
1070          update_src_next_use(ctx, src);
1071       }
1072    }
1073 
1074    ra_foreach_src (src, instr) {
1075       if (src->flags & IR3_REG_FIRST_KILL)
1076          remove_src_early(ctx, instr, src);
1077    }
1078 
1079    ra_foreach_dst (dst, instr) {
1080       insert_dst(ctx, dst);
1081    }
1082 
1083    if (ctx->spilling)
1084       limit(ctx, ir3_before_instr(instr));
1085    else
1086       update_max_pressure(ctx);
1087 
1088    /* We have to remove sources before rewriting them so that we can lookup the
1089     * interval to remove before the source itself is changed.
1090     */
1091    ra_foreach_src (src, instr) {
1092       if (src->flags & IR3_REG_FIRST_KILL)
1093          remove_src(ctx, instr, src);
1094    }
1095 
1096    if (ctx->spilling) {
1097       ra_foreach_src (src, instr) {
1098          rewrite_src(ctx, instr, src);
1099       }
1100    }
1101 
1102    ra_foreach_dst (dst, instr) {
1103       finish_dst(ctx, dst);
1104    }
1105 
1106    for (unsigned i = 0; i < instr->dsts_count; i++) {
1107       if (ra_reg_is_dst(instr->dsts[i]) &&
1108           (instr->dsts[i]->flags & IR3_REG_UNUSED))
1109          remove_dst(ctx, instr->dsts[i]);
1110    }
1111 }
1112 
1113 static struct ra_spill_interval *
1114 create_temp_interval(struct ra_spill_ctx *ctx, struct ir3_register *def)
1115 {
1116    unsigned name = ctx->intervals_count++;
1117    unsigned offset = ctx->live->interval_offset;
1118 
1119    /* This is kinda hacky, but we need to create a fake SSA def here that is
1120     * only used as part of the pcopy accounting. See below.
1121     */
1122    struct ir3_register *reg = rzalloc(ctx, struct ir3_register);
1123    *reg = *def;
1124    reg->name = name;
1125    reg->interval_start = offset;
1126    reg->interval_end = offset + reg_size(def);
1127    reg->merge_set = NULL;
1128 
1129    ctx->intervals = reralloc(ctx, ctx->intervals, struct ra_spill_interval *,
1130                              ctx->intervals_count);
1131    struct ra_spill_interval *interval = rzalloc(ctx, struct ra_spill_interval);
1132    ra_spill_interval_init(interval, reg);
1133    ctx->intervals[name] = interval;
1134    ctx->live->interval_offset += reg_size(def);
1135    return interval;
1136 }
1137 
1138 /* In the sequence of copies generated (see below), would this source be killed?
1139  */
1140 static bool
1141 is_last_pcopy_src(struct ir3_instruction *pcopy, unsigned src_n)
1142 {
1143    struct ir3_register *src = pcopy->srcs[src_n];
1144    if (!(src->flags & IR3_REG_KILL))
1145       return false;
1146    for (unsigned j = src_n + 1; j < pcopy->srcs_count; j++) {
1147       if (pcopy->srcs[j]->def == src->def)
1148          return false;
1149    }
1150    return true;
1151 }
1152 
1153 /* Parallel copies are different from normal instructions. The sources together
1154  * may be larger than the entire register file, so we cannot just reload every
1155  * source like normal, and indeed that probably wouldn't be a great idea.
1156  * Instead we essentially need to lower the parallel copy to "copies," just like
1157  * in the normal CSSA construction, although we implement the copies by
1158  * reloading and then possibly spilling values. We essentially just shuffle
1159  * around the sources until each source either (a) is live or (b) has the same
1160  * spill slot as its corresponding destination. We do this by decomposing the
1161  * copy into a series of copies, so:
1162  *
1163  * a, b, c = d, e, f
1164  *
1165  * becomes:
1166  *
1167  * d' = d
1168  * e' = e
1169  * f' = f
1170  * a = d'
1171  * b = e'
1172  * c = f'
1173  *
1174  * the temporary SSA values d', e', and f' never actually show up in the result.
1175  * They are only used for our internal accounting. They may, however, have their
1176  * own spill slot created for them. Similarly, we don't actually emit any copy
1177  * instructions, although we emit the spills/reloads that *would've* been
1178  * required if those copies were there.
1179  *
1180  * TODO: in order to reduce the number of temporaries and therefore spill slots,
1181  * we could instead do a more complicated analysis that considers the location
1182  * transfer graph.
1183  *
1184  * In addition, we actually remove the parallel copy and rewrite all its uses
1185  * (in the phi nodes) rather than rewrite its sources at the end. Recreating it
1186  * later turns out to be easier than keeping it up-to-date throughout this pass,
1187  * since we may have to remove entries for phi sources that are spilled and add
1188  * entries for live-outs that are spilled and reloaded, which can happen here
1189  * and then possibly be undone or done again when processing live-ins of the
1190  * successor block.
1191  */
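/* Roughly, for the example above (nothing is emitted for the copies
 * themselves): when "d' = d" is processed, d' simply takes over d's current
 * location and may later be spilled by limit() like any other value, possibly
 * into a slot of its own. When "a = d'" is processed, d' is reloaded (or
 * rematerialized) if it had been spilled, and a then inherits its location.
 * The temporaries live only in ctx->intervals, via create_temp_interval().
 */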
1192 
1193 static void
1194 handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy)
1195 {
1196    ra_foreach_dst (dst, pcopy) {
1197       struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
1198       ra_spill_interval_init(dst_interval, dst);
1199    }
1200 
1201    foreach_src_n (src, i, pcopy) {
1202       struct ir3_register *dst = pcopy->dsts[i];
1203       if (!(dst->flags & IR3_REG_SSA))
1204          continue;
1205 
1206       d("processing src %u", i);
1207 
1208       /* Skip the intermediate copy for cases where the source is merged with
1209        * the destination. Crucially this means that we also don't reload/spill
1210        * it if it's been spilled, because it shares the same spill slot.
1211        */
1212       if ((src->flags & IR3_REG_SSA) && src->def->merge_set &&
1213           src->def->merge_set == dst->merge_set &&
1214           src->def->merge_set_offset == dst->merge_set_offset) {
1215          struct ra_spill_interval *src_interval = ctx->intervals[src->def->name];
1216          struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
1217          if (src_interval->interval.inserted) {
1218             update_src_next_use(ctx, src);
1219             if (is_last_pcopy_src(pcopy, i))
1220                ra_spill_ctx_remove(ctx, src_interval);
1221             dst_interval->cant_spill = true;
1222             ra_spill_ctx_insert(ctx, dst_interval);
1223             limit(ctx, ir3_before_instr(pcopy));
1224             dst_interval->cant_spill = false;
1225             dst_interval->dst = src_interval->dst;
1226          }
1227       } else if (src->flags & IR3_REG_SSA) {
1228          struct ra_spill_interval *temp_interval =
1229             create_temp_interval(ctx, dst);
1230          struct ir3_register *temp = temp_interval->interval.reg;
1231          temp_interval->next_use_distance = src->next_use;
1232 
1233          insert_src(ctx, src);
1234          limit(ctx, ir3_before_instr(pcopy));
1235          reload_src(ctx, ir3_before_instr(pcopy), src);
1236          update_src_next_use(ctx, src);
1237          if (is_last_pcopy_src(pcopy, i))
1238             remove_src(ctx, pcopy, src);
1239          struct ra_spill_interval *src_interval =
1240             ctx->intervals[src->def->name];
1241          temp_interval->dst = src_interval->dst;
1242 
1243          temp_interval->cant_spill = true;
1244          ra_spill_ctx_insert(ctx, temp_interval);
1245          limit(ctx, ir3_before_instr(pcopy));
1246          temp_interval->cant_spill = false;
1247 
1248          src->flags = temp->flags;
1249          src->def = temp;
1250       }
1251    }
1252 
1253    d("done with pcopy srcs");
1254 
1255    foreach_src_n (src, i, pcopy) {
1256       struct ir3_register *dst = pcopy->dsts[i];
1257       if (!(dst->flags & IR3_REG_SSA))
1258          continue;
1259 
1260       if ((src->flags & IR3_REG_SSA) && src->def->merge_set &&
1261           src->def->merge_set == dst->merge_set &&
1262           src->def->merge_set_offset == dst->merge_set_offset)
1263          continue;
1264 
1265       struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
1266 
1267       if (!(src->flags & IR3_REG_SSA)) {
1268          dst_interval->cant_spill = true;
1269          ra_spill_ctx_insert(ctx, dst_interval);
1270          limit(ctx, ir3_before_instr(pcopy));
1271          dst_interval->cant_spill = false;
1272 
1273          assert(src->flags & (IR3_REG_CONST | IR3_REG_IMMED));
1274          if (src->flags & IR3_REG_CONST) {
1275             dst_interval->dst.flags = src->flags;
1276             dst_interval->dst.const_num = src->num;
1277          } else {
1278             dst_interval->dst.flags = src->flags;
1279             dst_interval->dst.uimm = src->uim_val;
1280          }
1281       } else {
1282          struct ra_spill_interval *temp_interval = ctx->intervals[src->def->name];
1283 
1284          insert_src(ctx, src);
1285          limit(ctx, ir3_before_instr(pcopy));
1286          reload_src(ctx, ir3_before_instr(pcopy), src);
1287          remove_src(ctx, pcopy, src);
1288 
1289          dst_interval->dst = temp_interval->dst;
1290          ra_spill_ctx_insert(ctx, dst_interval);
1291       }
1292    }
1293 
1294    pcopy->flags |= IR3_INSTR_UNUSED;
1295 }
1296 
1297 static void
1298 handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
1299 {
1300    if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1301       return;
1302 
1303    init_dst(ctx, instr->dsts[0]);
1304    insert_dst(ctx, instr->dsts[0]);
1305    finish_dst(ctx, instr->dsts[0]);
1306 }
1307 
1308 static void
1309 remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
1310 {
1311    if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1312       return;
1313 
1314    if (instr->opc == OPC_META_TEX_PREFETCH) {
1315       ra_foreach_src (src, instr) {
1316          if (src->flags & IR3_REG_FIRST_KILL)
1317             remove_src(ctx, instr, src);
1318       }
1319    }
1320    if (instr->dsts[0]->flags & IR3_REG_UNUSED)
1321       remove_dst(ctx, instr->dsts[0]);
1322 }
1323 
1324 static void
1325 handle_live_in(struct ra_spill_ctx *ctx, struct ir3_block *block,
1326                struct ir3_register *def)
1327 {
1328    struct ra_spill_interval *interval = ctx->intervals[def->name];
1329    ra_spill_interval_init(interval, def);
1330    if (ctx->spilling) {
1331       interval->next_use_distance =
1332          ctx->blocks[block->index].next_use_start[def->name];
1333    }
1334 
1335    ra_spill_ctx_insert(ctx, interval);
1336 }
1337 
1338 static bool
1339 is_live_in_phi(struct ir3_register *def, struct ir3_block *block)
1340 {
1341    return def->instr->opc == OPC_META_PHI && def->instr->block == block;
1342 }
1343 
1344 static bool
1345 is_live_in_pred(struct ra_spill_ctx *ctx, struct ir3_register *def,
1346                 struct ir3_block *block, unsigned pred_idx)
1347 {
1348    struct ir3_block *pred = block->predecessors[pred_idx];
1349    struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1350    if (is_live_in_phi(def, block)) {
1351       def = def->instr->srcs[pred_idx]->def;
1352       if (!def)
1353          return false;
1354    }
1355 
1356    return _mesa_hash_table_search(state->remap, def);
1357 }
1358 
1359 static bool
1360 is_live_in_undef(struct ir3_register *def,
1361                  struct ir3_block *block, unsigned pred_idx)
1362 {
1363    if (!is_live_in_phi(def, block))
1364       return false;
1365 
1366    return !def->instr->srcs[pred_idx]->def;
1367 }
1368 
1369 static struct reg_or_immed *
1370 read_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
1371              struct ir3_block *block, unsigned pred_idx)
1372 {
1373    struct ir3_block *pred = block->predecessors[pred_idx];
1374    struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1375 
1376    if (is_live_in_phi(def, block)) {
1377       def = def->instr->srcs[pred_idx]->def;
1378       if (!def)
1379          return NULL;
1380    }
1381 
1382    struct hash_entry *entry = _mesa_hash_table_search(state->remap, def);
1383    if (entry)
1384       return entry->data;
1385    else
1386       return NULL;
1387 }
1388 
1389 static bool
1390 is_live_in_all_preds(struct ra_spill_ctx *ctx, struct ir3_register *def,
1391                      struct ir3_block *block)
1392 {
1393    for (unsigned i = 0; i < block->predecessors_count; i++) {
1394       if (!is_live_in_pred(ctx, def, block, i))
1395          return false;
1396    }
1397 
1398    return true;
1399 }
1400 
1401 static void
1402 spill_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
1403               struct ir3_block *block)
1404 {
1405    for (unsigned i = 0; i < block->predecessors_count; i++) {
1406       struct ir3_block *pred = block->predecessors[i];
1407       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1408 
1409       if (!state->visited)
1410          continue;
1411 
1412       struct reg_or_immed *pred_def = read_live_in(ctx, def, block, i);
1413       if (pred_def) {
1414          spill(ctx, pred_def, get_spill_slot(ctx, def),
1415                ir3_before_terminator(pred));
1416       }
1417    }
1418 }
1419 
1420 static void
1421 spill_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block)
1422 {
1423    bool all_preds_visited = true;
1424    for (unsigned i = 0; i < block->predecessors_count; i++) {
1425       struct ir3_block *pred = block->predecessors[i];
1426       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1427       if (!state->visited) {
1428          all_preds_visited = false;
1429          break;
1430       }
1431    }
1432 
1433    /* Note: in the paper they explicitly spill live-through values first, but we
1434     * should be doing that automatically by virtue of picking the largest
1435     * distance due to the extra distance added to edges out of loops.
1436     *
1437     * TODO: Keep track of pressure in each block and preemptively spill
1438     * live-through values as described in the paper to avoid spilling them
1439     * inside the loop.
1440     */
1441 
1442    if (ctx->cur_pressure.half > ctx->limit_pressure.half) {
1443       rb_tree_foreach_safe (struct ra_spill_interval, interval,
1444                             &ctx->half_live_intervals, half_node) {
1445          if (all_preds_visited &&
1446              is_live_in_all_preds(ctx, interval->interval.reg, block))
1447             continue;
1448          if (interval->interval.reg->merge_set ||
1449              !interval->can_rematerialize)
1450             spill_live_in(ctx, interval->interval.reg, block);
1451          ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
1452          if (ctx->cur_pressure.half <= ctx->limit_pressure.half)
1453             break;
1454       }
1455    }
1456 
1457    if (ctx->cur_pressure.full > ctx->limit_pressure.full) {
1458       rb_tree_foreach_safe (struct ra_spill_interval, interval,
1459                             &ctx->full_live_intervals, node) {
1460          if (all_preds_visited &&
1461              is_live_in_all_preds(ctx, interval->interval.reg, block))
1462             continue;
1463          spill_live_in(ctx, interval->interval.reg, block);
1464          ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
1465          if (ctx->cur_pressure.full <= ctx->limit_pressure.full)
1466             break;
1467       }
1468    }
1469 }
1470 
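/* Record in the predecessor's remap table which value this live-in (or its
 * phi source) now corresponds to at the end of that predecessor.
 */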
1471 static void
1472 live_in_rewrite(struct ra_spill_ctx *ctx,
1473                 struct ra_spill_interval *interval,
1474                 struct reg_or_immed *new_val,
1475                 struct ir3_block *block, unsigned pred_idx)
1476 {
1477    struct ir3_block *pred = block->predecessors[pred_idx];
1478    struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1479    struct ir3_register *def = interval->interval.reg;
1480    if (is_live_in_phi(def, block)) {
1481       def = def->instr->srcs[pred_idx]->def;
1482    }
1483 
1484    if (def)
1485       _mesa_hash_table_insert(state->remap, def, new_val);
1486 }
1487 
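/* Make sure this live-in is available at the end of every already-visited
 * predecessor, rematerializing or reloading it before the terminator where
 * it isn't, and record the result in the predecessor's remap.
 */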
1488 static void
1489 reload_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
1490                struct ir3_block *block)
1491 {
1492    struct ra_spill_interval *interval = ctx->intervals[def->name];
1493    for (unsigned i = 0; i < block->predecessors_count; i++) {
1494       struct ir3_block *pred = block->predecessors[i];
1495       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1496       if (!state->visited)
1497          continue;
1498 
1499       if (is_live_in_undef(def, block, i))
1500          continue;
1501 
1502       struct reg_or_immed *new_val = read_live_in(ctx, def, block, i);
1503 
1504       if (!new_val) {
1505          new_val = ralloc(ctx, struct reg_or_immed);
1506          if (interval->can_rematerialize)
1507             new_val->def = rematerialize(def, ir3_before_terminator(pred));
1508          else
1509             new_val->def = reload(ctx, def, ir3_before_terminator(pred));
1510          new_val->flags = new_val->def->flags;
1511       }
1512       live_in_rewrite(ctx, interval, new_val, block, i);
1513    }
1514 }
1515 
1516 static void
1517 reload_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block)
1518 {
1519    rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
1520                     interval.node) {
1521       reload_live_in(ctx, interval->interval.reg, block);
1522    }
1523 }
1524 
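/* After reloading, a live-in may be defined by different registers (or a
 * const/immediate) in different predecessors, in which case we have to
 * stitch the values together with a new phi. If every visited predecessor
 * provides the same SSA def we just reuse it and recurse into child
 * intervals; a child that needs a phi while its parent already has a def is
 * extracted from the parent instead of getting its own phi.
 */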
1525 static void
1526 add_live_in_phi(struct ra_spill_ctx *ctx, struct ir3_register *def,
1527                 struct ir3_register *parent_def, struct ir3_block *block)
1528 {
1529    struct ra_spill_interval *interval = ctx->intervals[def->name];
1530    if (!interval->interval.inserted)
1531       return;
1532 
1533    bool needs_phi = false;
1534    struct ir3_register *cur_def = NULL;
1535    for (unsigned i = 0; i < block->predecessors_count; i++) {
1536       struct ir3_block *pred = block->predecessors[i];
1537       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1538 
1539       if (!state->visited) {
1540          needs_phi = true;
1541          break;
1542       }
1543 
1544       struct hash_entry *entry =
1545          _mesa_hash_table_search(state->remap, def);
1546       assert(entry);
1547       struct reg_or_immed *pred_val = entry->data;
1548       if ((pred_val->flags & (IR3_REG_IMMED | IR3_REG_CONST)) ||
1549           !pred_val->def ||
1550           (cur_def && cur_def != pred_val->def)) {
1551          needs_phi = true;
1552          break;
1553       }
1554       cur_def = pred_val->def;
1555    }
1556 
1557    if (!needs_phi) {
1558       interval->dst.def = cur_def;
1559       interval->dst.flags = cur_def->flags;
1560 
1561       rb_tree_foreach (struct ra_spill_interval, child,
1562                        &interval->interval.children, interval.node) {
1563          add_live_in_phi(ctx, child->interval.reg, cur_def, block);
1564       }
1565 
1566       return;
1567    }
1568 
1569    if (parent_def) {
1570       /* We have a child interval that needs a phi but whose parent does not
1571        * need one (otherwise parent_def would be NULL). Just extract the child
1572        * from the parent without creating a phi for the child.
1573        */
1574       unsigned offset = (def->interval_start - parent_def->interval_start) /
1575                         reg_elem_size(def);
1576       struct ir3_register *extracted =
1577          extract(parent_def, offset, reg_elems(def), ir3_after_phis(block));
1578       rewrite_src_interval(ctx, interval, extracted,
1579                            ir3_after_instr(extracted->instr));
1580       return;
1581    }
1582 
1583    struct ir3_instruction *phi = ir3_instr_create_at(
1584       ir3_before_block(block), OPC_META_PHI, 1, block->predecessors_count);
1585    struct ir3_register *dst = __ssa_dst(phi);
1586    dst->flags |= def->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
1587    dst->size = def->size;
1588    dst->wrmask = def->wrmask;
1589 
1590    dst->interval_start = def->interval_start;
1591    dst->interval_end = def->interval_end;
1592    dst->merge_set = def->merge_set;
1593    dst->merge_set_offset = def->merge_set_offset;
1594 
1595    for (unsigned i = 0; i < block->predecessors_count; i++) {
1596       struct ir3_block *pred = block->predecessors[i];
1597       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1598       struct ir3_register *src = ir3_src_create(phi, INVALID_REG, dst->flags);
1599       src->size = def->size;
1600       src->wrmask = def->wrmask;
1601 
1602       if (state->visited) {
1603          struct hash_entry *entry =
1604             _mesa_hash_table_search(state->remap, def);
1605          assert(entry);
1606          struct reg_or_immed *new_val = entry->data;
1607          set_src_val(src, new_val);
1608       } else {
1609          src->def = def;
1610       }
1611    }
1612 
1613    interval->dst.def = dst;
1614    interval->dst.flags = dst->flags;
1615    rewrite_src_interval(ctx, interval, dst, ir3_after_phis(block));
1616 }
1617 
1618 static void
1619 add_live_in_phis(struct ra_spill_ctx *ctx, struct ir3_block *block)
1620 {
1621    rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
1622                     interval.node) {
1623       if (BITSET_TEST(ctx->live->live_in[block->index],
1624                       interval->interval.reg->name)) {
1625          add_live_in_phi(ctx, interval->interval.reg, NULL, block);
1626       }
1627    }
1628 }
1629 
1630 /* When spilling a block with a single predecessor, the pred may have other
1631  * successors, so we can't choose what's live in and we can't spill/restore
1632  * anything. Just make the inserted intervals exactly match the predecessor. If
1633  * it wasn't live in the predecessor then it must've already been spilled. Also,
1634  * there are no phi nodes and no live-ins.
1635  */
1636 static void
1637 spill_single_pred_live_in(struct ra_spill_ctx *ctx,
1638                           struct ir3_block *block)
1639 {
1640    unsigned name;
1641    BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
1642                        ctx->live->definitions_count) {
1643       struct ir3_register *reg = ctx->live->definitions[name];
1644       struct ra_spill_interval *interval = ctx->intervals[reg->name];
1645       struct reg_or_immed *val = read_live_in(ctx, reg, block, 0);
1646       if (val)
1647          interval->dst = *val;
1648       else
1649          ra_spill_ctx_remove(ctx, interval);
1650    }
1651 }
1652 
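/* Rewrite the sources of an existing phi to point at whatever each incoming
 * value was remapped to in the corresponding visited predecessor. A phi
 * whose destination interval was not kept live is marked unused instead.
 */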
1653 static void
1654 rewrite_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *phi,
1655             struct ir3_block *block)
1656 {
1657    if (!(phi->dsts[0]->flags & IR3_REG_SSA))
1658       return;
1659 
1660    if (!ctx->intervals[phi->dsts[0]->name]->interval.inserted) {
1661       phi->flags |= IR3_INSTR_UNUSED;
1662       return;
1663    }
1664 
1665    for (unsigned i = 0; i < block->predecessors_count; i++) {
1666       struct ir3_block *pred = block->predecessors[i];
1667       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1668 
1669       if (!state->visited)
1670          continue;
1671 
1672       struct ir3_register *src = phi->srcs[i];
1673       if (!src->def)
1674          continue;
1675 
1676       struct hash_entry *entry =
1677          _mesa_hash_table_search(state->remap, src->def);
1678       assert(entry);
1679       struct reg_or_immed *new_val = entry->data;
1680       set_src_val(src, new_val);
1681    }
1682 }
1683 
1684 static void
1685 spill_live_out(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval,
1686                struct ir3_block *block)
1687 {
1688    struct ir3_register *def = interval->interval.reg;
1689 
1690    if (interval->interval.reg->merge_set ||
1691        !interval->can_rematerialize) {
1692       spill(ctx, &interval->dst, get_spill_slot(ctx, def),
1693             ir3_before_terminator(block));
1694    }
1695    ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
1696 }
1697 
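/* If our single successor was visited before us (a loop backedge), it has
 * already fixed which values it expects to be live across the edge,
 * recorded in our live_out bitset. Spill everything else that we still have
 * in registers.
 */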
1698 static void
1699 spill_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
1700 {
1701    struct ra_spill_block_state *state = &ctx->blocks[block->index];
1702    rb_tree_foreach_safe (struct ra_spill_interval, interval,
1703                          &ctx->reg_ctx.intervals, interval.node) {
1704       if (!BITSET_TEST(state->live_out, interval->interval.reg->name)) {
1705          spill_live_out(ctx, interval, block);
1706       }
1707    }
1708 }
1709 
1710 static void
1711 reload_live_out(struct ra_spill_ctx *ctx, struct ir3_register *def,
1712                 struct ir3_block *block)
1713 {
1714    struct ra_spill_interval *interval = ctx->intervals[def->name];
1715    ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
1716 
1717    reload_def(ctx, def, ir3_before_terminator(block));
1718 }
1719 
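/* Conversely, reload any value the successor expects to be live across the
 * edge that we don't currently have in a register.
 */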
1720 static void
1721 reload_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
1722 {
1723    struct ra_spill_block_state *state = &ctx->blocks[block->index];
1724    unsigned name;
1725    BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) {
1726       struct ir3_register *reg = ctx->live->definitions[name];
1727       struct ra_spill_interval *interval = ctx->intervals[name];
1728       if (!interval->interval.inserted)
1729          reload_live_out(ctx, reg, block);
1730    }
1731 }
1732 
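/* Point the successor's phi sources for this incoming edge at whatever we
 * currently have the corresponding values mapped to.
 */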
1733 static void
1734 update_live_out_phis(struct ra_spill_ctx *ctx, struct ir3_block *block)
1735 {
1736    assert(!block->successors[1]);
1737    struct ir3_block *succ = block->successors[0];
1738    unsigned pred_idx = ir3_block_get_pred_index(succ, block);
1739 
1740    foreach_instr (instr, &succ->instr_list) {
1741       if (instr->opc != OPC_META_PHI)
1742          break;
1743 
1744       struct ir3_register *def = instr->srcs[pred_idx]->def;
1745       if (!def)
1746          continue;
1747 
1748       struct ra_spill_interval *interval = ctx->intervals[def->name];
1749       if (!interval->interval.inserted)
1750          continue;
1751       set_src_val(instr->srcs[pred_idx], &interval->dst);
1752    }
1753 }
1754 
1755 static void
1756 record_pred_live_out(struct ra_spill_ctx *ctx,
1757                      struct ra_spill_interval *interval,
1758                      struct ir3_block *block, unsigned pred_idx)
1759 {
1760    struct ir3_block *pred = block->predecessors[pred_idx];
1761    struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1762 
1763    struct ir3_register *def = interval->interval.reg;
1764    if (is_live_in_phi(def, block)) {
1765       def = def->instr->srcs[pred_idx]->def;
1766    }
1767    BITSET_SET(state->live_out, def->name);
1768 
1769    rb_tree_foreach (struct ra_spill_interval, child,
1770                     &interval->interval.children, interval.node) {
1771       record_pred_live_out(ctx, child, block, pred_idx);
1772    }
1773 }
1774 
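/* For predecessors that haven't been visited yet (e.g. the back edge of a
 * loop whose header we're processing), record which values we've decided to
 * keep in registers, so that the predecessor knows what must be live at its
 * end when it is eventually visited.
 */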
1775 static void
1776 record_pred_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
1777 {
1778    for (unsigned i = 0; i < block->predecessors_count; i++) {
1779       struct ir3_block *pred = block->predecessors[i];
1780       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1781       if (state->visited)
1782          continue;
1783 
1784       state->live_out = rzalloc_array(ctx, BITSET_WORD,
1785                                       BITSET_WORDS(ctx->live->definitions_count));
1786 
1787 
1788       rb_tree_foreach (struct ra_spill_interval, interval,
1789                        &ctx->reg_ctx.intervals, interval.node) {
1790          record_pred_live_out(ctx, interval, block, i);
1791       }
1792    }
1793 }
1794 
1795 static void
1796 record_live_out(struct ra_spill_ctx *ctx,
1797                 struct ra_spill_block_state *state,
1798                 struct ra_spill_interval *interval)
1799 {
1800    if (!(interval->dst.flags & IR3_REG_SSA) ||
1801        interval->dst.def) {
1802       struct reg_or_immed *val = ralloc(ctx, struct reg_or_immed);
1803       *val = interval->dst;
1804       _mesa_hash_table_insert(state->remap, interval->interval.reg, val);
1805    }
1806    rb_tree_foreach (struct ra_spill_interval, child,
1807                     &interval->interval.children, interval.node) {
1808       record_live_out(ctx, state, child);
1809    }
1810 }
1811 
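/* Record what every value still live at the end of the block is mapped to,
 * so that our successors can look up their live-ins.
 */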
1812 static void
1813 record_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
1814 {
1815    struct ra_spill_block_state *state = &ctx->blocks[block->index];
1816    state->remap = _mesa_pointer_hash_table_create(ctx);
1817 
1818    rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
1819                     interval.node) {
1820       record_live_out(ctx, state, interval);
1821    }
1822 }
1823 
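/* Process one block: set up the live-in intervals, fix up live-ins and phis
 * when spilling (or just update the maximum pressure when not), walk the
 * instructions, handle backedge live-outs, and record the end-of-block
 * state for our successors.
 */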
1824 static void
1825 handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block)
1826 {
1827    memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure));
1828    rb_tree_init(&ctx->reg_ctx.intervals);
1829    rb_tree_init(&ctx->full_live_intervals);
1830    rb_tree_init(&ctx->half_live_intervals);
1831 
1832    unsigned name;
1833    BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
1834                        ctx->live->definitions_count) {
1835       struct ir3_register *reg = ctx->live->definitions[name];
1836       handle_live_in(ctx, block, reg);
1837    }
1838 
1839    foreach_instr (instr, &block->instr_list) {
1840       if (instr->opc != OPC_META_PHI && instr->opc != OPC_META_INPUT &&
1841           instr->opc != OPC_META_TEX_PREFETCH)
1842          break;
1843       handle_input_phi(ctx, instr);
1844    }
1845 
1846    if (ctx->spilling) {
1847       if (block->predecessors_count == 1 &&
1848           block->predecessors[0]->successors[1]) {
1849          spill_single_pred_live_in(ctx, block);
1850       } else {
1851          spill_live_ins(ctx, block);
1852          reload_live_ins(ctx, block);
1853          record_pred_live_outs(ctx, block);
1854          foreach_instr (instr, &block->instr_list) {
1855             if (instr->opc != OPC_META_PHI)
1856                break;
1857             rewrite_phi(ctx, instr, block);
1858          }
1859          add_live_in_phis(ctx, block);
1860       }
1861    } else {
1862       update_max_pressure(ctx);
1863    }
1864 
1865    foreach_instr (instr, &block->instr_list) {
1866       di(instr, "processing");
1867 
1868       if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT ||
1869           instr->opc == OPC_META_TEX_PREFETCH)
1870          remove_input_phi(ctx, instr);
1871       else if (ctx->spilling && instr->opc == OPC_META_PARALLEL_COPY)
1872          handle_pcopy(ctx, instr);
1873       else if (ctx->spilling && instr->opc == OPC_MOV &&
1874                instr->dsts[0] == ctx->base_reg)
1875          /* skip */;
1876       else
1877          handle_instr(ctx, instr);
1878    }
1879 
1880    if (ctx->spilling && block->successors[0]) {
1881       struct ra_spill_block_state *state =
1882          &ctx->blocks[block->successors[0]->index];
1883       if (state->visited) {
1884          assert(!block->successors[1]);
1885 
1886          spill_live_outs(ctx, block);
1887          reload_live_outs(ctx, block);
1888          update_live_out_phis(ctx, block);
1889       }
1890    }
1891 
1892    if (ctx->spilling) {
1893       record_live_outs(ctx, block);
1894       ctx->blocks[block->index].visited = true;
1895    }
1896 }
1897 
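/* If all of a phi's sources (ignoring self-references) are the same def,
 * the phi can be removed: stash the replacement def in phi->data and mark
 * the phi unused. Returns true if the phi was simplified.
 */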
1898 static bool
1899 simplify_phi_node(struct ir3_instruction *phi)
1900 {
1901    struct ir3_register *def = NULL;
1902    foreach_src (src, phi) {
1903       /* Ignore phi sources which point to the phi itself. */
1904       if (src->def == phi->dsts[0])
1905          continue;
1906       /* If it's undef or it doesn't match the previous sources, bail */
1907       if (!src->def || (def && def != src->def))
1908          return false;
1909       def = src->def;
1910    }
1911 
1912    phi->data = def;
1913    phi->flags |= IR3_INSTR_UNUSED;
1914    return true;
1915 }
1916 
1917 static struct ir3_register *
1918 simplify_phi_def(struct ir3_register *def)
1919 {
1920    if (def->instr->opc == OPC_META_PHI) {
1921       struct ir3_instruction *phi = def->instr;
1922 
1923       /* Note: this function is always called at least once after visiting the
1924        * phi, so either there has been a simplified phi in the meantime, in
1925        * which case we will set progress=true and visit the definition again, or
1926        * phi->data already has the most up-to-date value. Therefore we don't
1927        * have to recursively check phi->data.
1928        */
1929       if (phi->data)
1930          return phi->data;
1931    }
1932 
1933    return def;
1934 }
1935 
1936 static void
1937 simplify_phi_srcs(struct ir3_instruction *instr)
1938 {
1939    foreach_src (src, instr) {
1940       if (src->def)
1941          src->def = simplify_phi_def(src->def);
1942    }
1943 }
1944 
1945 /* We insert phi nodes for all live-ins of loops in case we need to split the
1946  * live range. This pass cleans that up for the case where the live range didn't
1947  * actually need to be split.
1948  */
1949 static void
1950 simplify_phi_nodes(struct ir3 *ir)
1951 {
1952    foreach_block (block, &ir->block_list) {
1953       foreach_instr (instr, &block->instr_list) {
1954          if (instr->opc != OPC_META_PHI)
1955             break;
1956          instr->data = NULL;
1957       }
1958    }
1959 
1960    bool progress;
1961    do {
1962       progress = false;
1963       foreach_block (block, &ir->block_list) {
1964          foreach_instr (instr, &block->instr_list) {
1965             if (instr->opc == OPC_META_PHI || (instr->flags & IR3_INSTR_UNUSED))
1966                continue;
1967 
1968             simplify_phi_srcs(instr);
1969          }
1970 
1971          /* Visit phi nodes in the successors to make sure that phi sources are
1972           * always visited at least once after visiting the definition they
1973           * point to. See note in simplify_phi_def() for why this is necessary.
1974           */
1975          for (unsigned i = 0; i < 2; i++) {
1976             struct ir3_block *succ = block->successors[i];
1977             if (!succ)
1978                continue;
1979             foreach_instr (instr, &succ->instr_list) {
1980                if (instr->opc != OPC_META_PHI)
1981                   break;
1982                if (instr->flags & IR3_INSTR_UNUSED) {
1983                   if (instr->data)
1984                      instr->data = simplify_phi_def(instr->data);
1985                } else {
1986                   simplify_phi_srcs(instr);
1987                   progress |= simplify_phi_node(instr);
1988                }
1989             }
1990          }
1991       }
1992    } while (progress);
1993 }
1994 
1995 static void
1996 unmark_dead(struct ir3 *ir)
1997 {
1998    foreach_block (block, &ir->block_list) {
1999       foreach_instr (instr, &block->instr_list) {
2000          instr->flags &= ~IR3_INSTR_UNUSED;
2001       }
2002    }
2003 }
2004 
2005 /* Simple pass to remove now-dead phi nodes and pcopy instructions. We mark
2006  * which ones are dead along the way, so there's nothing to compute here.
2007  */
2008 static void
2009 cleanup_dead(struct ir3 *ir)
2010 {
2011    foreach_block (block, &ir->block_list) {
2012       foreach_instr_safe (instr, &block->instr_list) {
2013          if (instr->flags & IR3_INSTR_UNUSED) {
2014             if (instr->opc == OPC_META_PARALLEL_COPY) {
2015                /* There may be non-SSA shared copies; we need to preserve these.
2016                 */
2017                for (unsigned i = 0; i < instr->dsts_count;) {
2018                   if (instr->dsts[i]->flags & IR3_REG_SSA) {
2019                      instr->dsts[i] = instr->dsts[--instr->dsts_count];
2020                      instr->srcs[i] = instr->srcs[--instr->srcs_count];
2021                   } else {
2022                      i++;
2023                   }
2024                }
2025 
2026                if (instr->dsts_count == 0)
2027                   list_delinit(&instr->node);
2028             } else {
2029                list_delinit(&instr->node);
2030             }
2031          }
2032       }
2033    }
2034 }
2035 
2036 /* Deal with merge sets after spilling. Spilling generally leaves the merge sets
2037  * in a mess, and even if we properly cleaned up after ourselves, we would want
2038  * to recompute the merge sets afterward anyway. That's because
2039  * spilling/reloading can "break up" phi webs and split/collect webs so that
2040  * allocating them to the same register no longer gives any benefit. For
2041  * example, imagine we have this:
2042  *
2043  * if (...) {
2044  *    foo = ...
2045  * } else {
2046  *    bar = ...
2047  * }
2048  * baz = phi(foo, bar)
2049  *
2050  * and we spill "baz":
2051  *
2052  * if (...) {
2053  *    foo = ...
2054  *    spill(foo)
2055  * } else {
2056  *    bar = ...
2057  *    spill(bar)
2058  * }
2059  * baz = reload()
2060  *
2061  * now foo, bar, and baz don't have to be allocated to the same register. How
2062  * exactly the merge sets change can be complicated, so it's easier just to
2063  * recompute them.
2064  *
2065  * However, there's a wrinkle in this: those same merge sets determine the
2066  * register pressure, due to multiple values inhabiting the same register! And
2067  * we assume that this sharing happens when spilling. Therefore we need a
2068  * three-step procedure:
2069  *
2070  * 1. Drop the original merge sets.
2071  * 2. Calculate which values *must* be merged, being careful to only use the
2072  *    interval information which isn't trashed by spilling, and forcibly merge
2073  *    them.
2074  * 3. Let ir3_merge_regs() finish the job, including recalculating the
2075  *    intervals.
2076  */
2077 
2078 static void
2079 fixup_merge_sets(struct ir3_liveness *live, struct ir3 *ir)
2080 {
2081    foreach_block (block, &ir->block_list) {
2082       foreach_instr (instr, &block->instr_list) {
2083          foreach_dst (dst, instr) {
2084             dst->merge_set = NULL;
2085             dst->merge_set_offset = 0;
2086          }
2087       }
2088    }
2089 
2090    ir3_index_instrs_for_merge_sets(ir);
2091 
2092    foreach_block (block, &ir->block_list) {
2093       foreach_instr (instr, &block->instr_list) {
2094          if (instr->opc != OPC_META_SPLIT &&
2095              instr->opc != OPC_META_COLLECT)
2096             continue;
2097 
2098          struct ir3_register *dst = instr->dsts[0];
2099          ra_foreach_src (src, instr) {
2100             if (!(src->flags & IR3_REG_KILL) &&
2101                 src->def->interval_start < dst->interval_end &&
2102                 dst->interval_start < src->def->interval_end) {
2103                ir3_force_merge(dst, src->def,
2104                                src->def->interval_start - dst->interval_start);
2105             }
2106          }
2107       }
2108    }
2109 
2110    ir3_merge_regs(live, ir);
2111 }
2112 
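/* Pressure-only mode: walk the blocks without spilling and report the
 * maximum register pressure encountered.
 */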
2113 void
2114 ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live,
2115                   struct ir3_pressure *max_pressure)
2116 {
2117    struct ra_spill_ctx *ctx = rzalloc(NULL, struct ra_spill_ctx);
2118    spill_ctx_init(ctx, v, live);
2119 
2120    foreach_block (block, &v->ir->block_list) {
2121       handle_block(ctx, block);
2122    }
2123 
2124    assert(ctx->cur_pressure.full == 0);
2125    assert(ctx->cur_pressure.half == 0);
2126    assert(ctx->cur_pressure.shared == 0);
2127    assert(ctx->cur_pressure.shared_half == 0);
2128 
2129    *max_pressure = ctx->max_pressure;
2130    ralloc_free(ctx);
2131 }
2132 
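/* Spilling mode: walk the blocks inserting spills/reloads, clean up the
 * phis and dead instructions this creates, then recompute liveness and the
 * merge sets for the register allocator.
 */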
2133 bool
2134 ir3_spill(struct ir3 *ir, struct ir3_shader_variant *v,
2135           struct ir3_liveness **live,
2136           const struct ir3_pressure *limit_pressure)
2137 {
2138    void *mem_ctx = ralloc_parent(*live);
2139    struct ra_spill_ctx *ctx = rzalloc(mem_ctx, struct ra_spill_ctx);
2140    spill_ctx_init(ctx, v, *live);
2141 
2142    ctx->spilling = true;
2143 
2144    ctx->blocks = rzalloc_array(ctx, struct ra_spill_block_state,
2145                                ctx->live->block_count);
2146    rb_tree_init(&ctx->full_live_intervals);
2147    rb_tree_init(&ctx->half_live_intervals);
2148 
2149    ctx->limit_pressure = *limit_pressure;
2150    ctx->spill_slot = v->pvtmem_size;
2151 
2152    add_base_reg(ctx, ir);
2153    compute_next_distance(ctx, ir);
2154 
2155    unmark_dead(ir);
2156 
2157    foreach_block (block, &ir->block_list) {
2158       handle_block(ctx, block);
2159    }
2160 
2161    simplify_phi_nodes(ir);
2162 
2163    cleanup_dead(ir);
2164 
2165    ir3_create_parallel_copies(ir);
2166 
2167    /* After this point, we're done mutating the IR. Liveness has been trashed,
2168     * so recalculate it. We'll need it for recalculating the merge sets.
2169     */
2170    ralloc_free(ctx->live);
2171    *live = ir3_calc_liveness(mem_ctx, ir);
2172 
2173    fixup_merge_sets(*live, ir);
2174 
2175    v->pvtmem_size = ctx->spill_slot;
2176    ralloc_free(ctx);
2177 
2178    return true;
2179 }
2180