1 /*
2  * Copyright (C) 2021 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "util/rb_tree.h"
25 #include "ir3_ra.h"
26 #include "ir3_shader.h"
27 
28 /*
29  * This pass does two things:
30  *
31  * 1. Calculates the maximum register pressure. To do this, we need to use the
32  *    exact same technique that RA uses for combining meta_split instructions
33  *    with their sources, so that our calculation agrees with RA.
35  * 2. Spills when the register pressure exceeds a limit calculated by RA.
35  *    The implementation is based on "Register Spilling and Live-Range Splitting
36  *    for SSA-Form Programs" by Braun and Hack, although again care has to be
37  *    taken to handle combining split/collect instructions.
38  */
39 
40 struct reg_or_immed {
41    unsigned flags;
42    union {
43       struct ir3_register *def;
44       uint32_t uimm;
45       unsigned const_num;
46    };
47 };
48 
49 struct ra_spill_interval {
50    struct ir3_reg_interval interval;
51 
52    struct rb_node node;
53    struct rb_node half_node;
54 
55    /* The current SSA value/const/immed this source is mapped to. */
56    struct reg_or_immed dst;
57 
58    /* When computing use distances we use the distance relative to the start
59     * of the block. So, for example, a value that's defined in cycle 5 of the
60     * block and used 6 cycles later will always have a next_use_distance of 11
61     * until we reach that use.
62     */
63    unsigned next_use_distance;
64 
65    /* Whether this value was reloaded and therefore doesn't need to be
66     * spilled again. Corresponds to the S set in the paper.
67     */
68    bool already_spilled;
69 
70    /* We need to add sources early for accounting purposes, but we have to
71     * insert the reload code for them last. Keep track of whether this interval
72     * needs to be reloaded later.
73     */
74    bool needs_reload;
75 
76    /* Keep track of whether this interval currently can't be spilled because:
77     * - It or one of its children is a source and we're making space for
78     *   sources.
79     * - It is a destination and we're making space for destinations.
80     */
81    bool cant_spill;
82 
83    /* Whether this interval can be rematerialized. */
84    bool can_rematerialize;
85 };
86 
87 struct ra_spill_block_state {
88    unsigned *next_use_end;
89    unsigned *next_use_start;
90 
91    unsigned cycles;
92 
93    /* Map from SSA def to the reg_or_immed it is mapped to at the end of the block.
94     * This map only contains values which we didn't spill, so it also serves as
95     * a record of the new live-out set for this block.
96     */
97    struct hash_table *remap;
98 
99    /* For blocks whose successors are visited first (i.e. loop backedges), which
100     * values should be live at the end.
101     */
102    BITSET_WORD *live_out;
103 
104    bool visited;
105 };
106 
107 struct ra_spill_ctx {
108    struct ir3_reg_ctx reg_ctx;
109 
110    struct ra_spill_interval **intervals;
111    unsigned intervals_count;
112 
113    /* rb tree of live intervals that we can spill, ordered by next-use distance.
114     * full_live_intervals contains the full+half intervals in the merged_regs
115     * case. We use this list to determine what to spill.
116     */
117    struct rb_tree full_live_intervals;
118    struct rb_tree half_live_intervals;
119 
120    struct ir3_pressure cur_pressure, max_pressure;
121 
122    struct ir3_pressure limit_pressure;
123 
124    /* When spilling, we need to reserve a register to serve as the zero'd
125     * "base". For simplicity we reserve a register at the beginning so that it's
126     * always available.
127     */
128    struct ir3_register *base_reg;
129 
130    /* Current pvtmem offset in bytes. */
131    unsigned spill_slot;
132 
133    struct ir3_liveness *live;
134 
135    const struct ir3_compiler *compiler;
136 
137    struct ra_spill_block_state *blocks;
138 
139    bool spilling;
140 
141    bool merged_regs;
142 };
143 
144 static void
145 add_base_reg(struct ra_spill_ctx *ctx, struct ir3 *ir)
146 {
147    struct ir3_block *start = ir3_start_block(ir);
148 
149    /* We need to stick it after any meta instructions which need to be first. */
150    struct ir3_instruction *after = NULL;
151    foreach_instr (instr, &start->instr_list) {
152       if (instr->opc != OPC_META_INPUT &&
153           instr->opc != OPC_META_TEX_PREFETCH) {
154          after = instr;
155          break;
156       }
157    }
158 
159    struct ir3_instruction *mov = create_immed(start, 0);
160 
161    if (after)
162       ir3_instr_move_before(mov, after);
163 
164    ctx->base_reg = mov->dsts[0];
165 
166    /* We don't create an interval, etc. for the base reg, so just lower the
167     * register pressure limit to account for it. We assume it's always
168     * available for simplicity.
169     */
170    ctx->limit_pressure.full -= reg_size(ctx->base_reg);
171 }
172 
173 
174 /* Compute the number of cycles per instruction used for next-use-distance
175  * analysis. This is just approximate, obviously.
176  */
177 static unsigned
178 instr_cycles(struct ir3_instruction *instr)
179 {
180    if (instr->opc == OPC_META_PARALLEL_COPY) {
181       unsigned cycles = 0;
182       for (unsigned i = 0; i < instr->dsts_count; i++) {
183          if (!instr->srcs[i]->def ||
184              instr->srcs[i]->def->merge_set != instr->dsts[i]->merge_set) {
185             cycles += reg_elems(instr->srcs[i]);
186          }
187       }
188 
189       return cycles;
190    }
191 
192    if (instr->opc == OPC_META_COLLECT) {
193       unsigned cycles = 0;
194       for (unsigned i = 0; i < instr->srcs_count; i++) {
195          if (!instr->srcs[i]->def ||
196              instr->srcs[i]->def->merge_set != instr->dsts[0]->merge_set) {
197             cycles++;
198          }
199       }
200 
201       return cycles;
202    }
203 
204    if (is_meta(instr))
205       return 0;
206 
207    return 1 + instr->repeat;
208 }
209 
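/* One backward pass over a single block: record next-use distances (relative
 * to the block start) on each instruction's defs and srcs, then propagate the
 * resulting live-in distances (next_use_start) into each predecessor's
 * next_use_end, giving phi sources the distance of the phi destination.
 * Returns whether any predecessor distance changed, so the caller can iterate
 * to a fixed point.
 */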
210 static bool
211 compute_block_next_distance(struct ra_spill_ctx *ctx, struct ir3_block *block,
212                             unsigned *tmp_next_use)
213 {
214    struct ra_spill_block_state *state = &ctx->blocks[block->index];
215    memcpy(tmp_next_use, state->next_use_end,
216           ctx->live->definitions_count * sizeof(*tmp_next_use));
217 
218    unsigned cycle = state->cycles;
219    foreach_instr_rev (instr, &block->instr_list) {
220       ra_foreach_dst (dst, instr) {
221          dst->next_use = tmp_next_use[dst->name];
222       }
223 
224       ra_foreach_src (src, instr) {
225          src->next_use = tmp_next_use[src->def->name];
226       }
227 
228       cycle -= instr_cycles(instr);
229 
230       if (instr->opc == OPC_META_PARALLEL_COPY) {
231          ra_foreach_src_n (src, i, instr) {
232             if (src->def->merge_set == instr->dsts[i]->merge_set &&
233                 src->def->merge_set_offset == instr->dsts[i]->merge_set_offset) {
234                tmp_next_use[src->def->name] =
235                   tmp_next_use[instr->dsts[i]->name];
236             } else {
237                tmp_next_use[src->def->name] = cycle;
238             }
239          }
240       } else if (instr->opc != OPC_META_PHI) {
241          ra_foreach_src (src, instr) {
242             tmp_next_use[src->def->name] = cycle;
243          }
244       }
245 
246       ra_foreach_dst (dst, instr) {
247          tmp_next_use[dst->name] = UINT_MAX;
248       }
249    }
250 
251    memcpy(state->next_use_start, tmp_next_use,
252           ctx->live->definitions_count * sizeof(*tmp_next_use));
253 
254    bool progress = false;
255    for (unsigned i = 0; i < block->predecessors_count; i++) {
256       const struct ir3_block *pred = block->predecessors[i];
257       struct ra_spill_block_state *pred_state = &ctx->blocks[pred->index];
258 
259       /* Add a large-enough distance in front of edges exiting the loop so that
260        * variables that are live-through the loop but not used inside it are
261        * prioritized for spilling, as per the paper. This just needs to be
262        * larger than the longest path through the loop.
263        */
264       bool loop_exit = pred->loop_depth < block->loop_depth;
265       unsigned block_distance = pred_state->cycles + (loop_exit ? 100000 : 0);
266 
267       for (unsigned j = 0; j < ctx->live->definitions_count; j++) {
268          if (state->next_use_start[j] < UINT_MAX &&
269              state->next_use_start[j] + block_distance <
270              pred_state->next_use_end[j]) {
271             pred_state->next_use_end[j] = state->next_use_start[j] +
272                block_distance;
273             progress = true;
274          }
275       }
276 
277       foreach_instr (phi, &block->instr_list) {
278          if (phi->opc != OPC_META_PHI)
279             break;
280          if (!phi->srcs[i]->def)
281             continue;
282          unsigned src = phi->srcs[i]->def->name;
283          if (phi->dsts[0]->next_use < UINT_MAX &&
284              phi->dsts[0]->next_use + block_distance <
285              pred_state->next_use_end[src]) {
286             pred_state->next_use_end[src] = phi->dsts[0]->next_use +
287                block_distance;
288             progress = true;
289          }
290       }
291    }
292 
293    return progress;
294 }
295 
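/* Driver for the next-use-distance analysis: initialize every distance to
 * UINT_MAX, count each block's length in cycles (and reset spill slots), then
 * repeat the per-block backward walk over the blocks in reverse order until
 * no distance changes.
 */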
296 static void
297 compute_next_distance(struct ra_spill_ctx *ctx, struct ir3 *ir)
298 {
299    for (unsigned i = 0; i < ctx->live->block_count; i++) {
300       ctx->blocks[i].next_use_start =
301          ralloc_array(ctx, unsigned, ctx->live->definitions_count);
302       ctx->blocks[i].next_use_end =
303          ralloc_array(ctx, unsigned, ctx->live->definitions_count);
304 
305       for (unsigned j = 0; j < ctx->live->definitions_count; j++) {
306          ctx->blocks[i].next_use_start[j] = UINT_MAX;
307          ctx->blocks[i].next_use_end[j] = UINT_MAX;
308       }
309    }
310 
311    foreach_block (block, &ir->block_list) {
312       struct ra_spill_block_state *state = &ctx->blocks[block->index];
313       state->cycles = 0;
314       foreach_instr (instr, &block->instr_list) {
315          state->cycles += instr_cycles(instr);
316          foreach_dst (dst, instr) {
317             dst->spill_slot = ~0;
318          }
319       }
320    }
321 
322    unsigned *tmp_next_use =
323       ralloc_array(ctx, unsigned, ctx->live->definitions_count);
324 
325    bool progress = true;
326    while (progress) {
327       progress = false;
328       foreach_block_rev (block, &ir->block_list) {
329          progress |= compute_block_next_distance(ctx, block, tmp_next_use);
330       }
331    }
332 }
333 
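/* A value can be rematerialized if it is a plain (non-array, non-relative)
 * mov of an immediate or constant, i.e. it can be recreated with a single
 * instruction instead of being spilled to memory.
 */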
334 static bool
335 can_rematerialize(struct ir3_register *reg)
336 {
337    if (reg->flags & IR3_REG_ARRAY)
338       return false;
339    if (reg->instr->opc != OPC_MOV)
340       return false;
341    if (!(reg->instr->srcs[0]->flags & (IR3_REG_IMMED | IR3_REG_CONST)))
342       return false;
343    if (reg->instr->srcs[0]->flags & IR3_REG_RELATIV)
344       return false;
345    return true;
346 }
347 
348 static struct ir3_register *
349 rematerialize(struct ir3_register *reg, struct ir3_instruction *after,
350               struct ir3_block *block)
351 {
352    d("rematerializing ssa_%u:%u", reg->instr->serialno, reg->name);
353 
354    struct ir3_instruction *remat =
355       ir3_instr_create(block, reg->instr->opc, 1, reg->instr->srcs_count);
356    struct ir3_register *dst = __ssa_dst(remat);
357    dst->flags |= reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
358    for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
359       struct ir3_register *src =
360          ir3_src_create(remat, INVALID_REG, reg->instr->srcs[i]->flags);
361       *src = *reg->instr->srcs[i];
362    }
363 
364    remat->cat1 = reg->instr->cat1;
365 
366    dst->merge_set = reg->merge_set;
367    dst->merge_set_offset = reg->merge_set_offset;
368    dst->interval_start = reg->interval_start;
369    dst->interval_end = reg->interval_end;
370 
371    if (after)
372       ir3_instr_move_before(remat, after);
373 
374    return dst;
375 }
376 
377 static void
378 ra_spill_interval_init(struct ra_spill_interval *interval,
379                        struct ir3_register *reg)
380 {
381    ir3_reg_interval_init(&interval->interval, reg);
382    interval->dst.flags = reg->flags;
383    interval->dst.def = reg;
384    interval->already_spilled = false;
385    interval->needs_reload = false;
386    interval->cant_spill = false;
387    interval->can_rematerialize = can_rematerialize(reg);
388 }
389 
390 static struct ra_spill_interval *
391 ir3_reg_interval_to_interval(struct ir3_reg_interval *interval)
392 {
393    return rb_node_data(struct ra_spill_interval, interval, interval);
394 }
395 
396 static struct ra_spill_interval *
397 ra_spill_interval_root(struct ra_spill_interval *interval)
398 {
399    struct ir3_reg_interval *ir3_interval = &interval->interval;
400    while (ir3_interval->parent)
401       ir3_interval = ir3_interval->parent;
402    return ir3_reg_interval_to_interval(ir3_interval);
403 }
404 
405 static struct ra_spill_ctx *
406 ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx)
407 {
408    return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx);
409 }
410 
411 static int
412 spill_interval_cmp(const struct ra_spill_interval *a,
413                    const struct ra_spill_interval *b)
414 {
415    /* Prioritize intervals that we can rematerialize. */
416    if (a->can_rematerialize && !b->can_rematerialize)
417       return 1;
418    if (!a->can_rematerialize && b->can_rematerialize)
419       return -1;
420 
421    return a->next_use_distance - b->next_use_distance;
422 }
423 
424 static int
425 ra_spill_interval_cmp(const struct rb_node *_a, const struct rb_node *_b)
426 {
427    const struct ra_spill_interval *a =
428       rb_node_data(const struct ra_spill_interval, _a, node);
429    const struct ra_spill_interval *b =
430       rb_node_data(const struct ra_spill_interval, _b, node);
431    return spill_interval_cmp(a, b);
432 }
433 
434 static int
435 ra_spill_interval_half_cmp(const struct rb_node *_a, const struct rb_node *_b)
436 {
437    const struct ra_spill_interval *a =
438       rb_node_data(const struct ra_spill_interval, _a, half_node);
439    const struct ra_spill_interval *b =
440       rb_node_data(const struct ra_spill_interval, _b, half_node);
441    return spill_interval_cmp(a, b);
442 }
443 
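/* ir3_reg_ctx callbacks: keep cur_pressure in sync as top-level intervals are
 * inserted and removed, and, when spilling, keep the rb-trees of spill
 * candidates up to date. Note that half intervals also count against full
 * pressure in the merged-registers case.
 */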
444 static void
445 interval_add(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
446 {
447    struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
448    struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
449 
450    unsigned size = reg_size(interval->interval.reg);
451    if (interval->interval.reg->flags & IR3_REG_SHARED) {
452       ctx->cur_pressure.shared += size;
453    } else {
454       if (interval->interval.reg->flags & IR3_REG_HALF) {
455          ctx->cur_pressure.half += size;
456          if (ctx->spilling) {
457             rb_tree_insert(&ctx->half_live_intervals, &interval->half_node,
458                            ra_spill_interval_half_cmp);
459          }
460       }
461       if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) {
462          ctx->cur_pressure.full += size;
463          if (ctx->spilling) {
464             rb_tree_insert(&ctx->full_live_intervals, &interval->node,
465                            ra_spill_interval_cmp);
466          }
467       }
468    }
469 }
470 
471 static void
472 interval_delete(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
473 {
474    struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
475    struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
476 
477    unsigned size = reg_size(interval->interval.reg);
478    if (interval->interval.reg->flags & IR3_REG_SHARED) {
479       ctx->cur_pressure.shared -= size;
480    } else {
481       if (interval->interval.reg->flags & IR3_REG_HALF) {
482          ctx->cur_pressure.half -= size;
483          if (ctx->spilling) {
484             rb_tree_remove(&ctx->half_live_intervals, &interval->half_node);
485          }
486       }
487       if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) {
488          ctx->cur_pressure.full -= size;
489          if (ctx->spilling) {
490             rb_tree_remove(&ctx->full_live_intervals, &interval->node);
491          }
492       }
493    }
494 }
495 
496 static void
497 interval_readd(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_parent,
498                struct ir3_reg_interval *_child)
499 {
500    interval_add(_ctx, _child);
501 }
502 
503 static void
504 spill_ctx_init(struct ra_spill_ctx *ctx, struct ir3_shader_variant *v,
505                struct ir3_liveness *live)
506 {
507    ctx->live = live;
508    ctx->intervals = ralloc_array(ctx, struct ra_spill_interval *,
509                                  ctx->live->definitions_count);
510    struct ra_spill_interval *intervals =
511       rzalloc_array(ctx, struct ra_spill_interval,
512                     ctx->live->definitions_count);
513    for (unsigned i = 0; i < ctx->live->definitions_count; i++)
514       ctx->intervals[i] = &intervals[i];
515 
516    ctx->intervals_count = ctx->live->definitions_count;
517    ctx->compiler = v->compiler;
518    ctx->merged_regs = v->mergedregs;
519 
520    rb_tree_init(&ctx->reg_ctx.intervals);
521    ctx->reg_ctx.interval_add = interval_add;
522    ctx->reg_ctx.interval_delete = interval_delete;
523    ctx->reg_ctx.interval_readd = interval_readd;
524 }
525 
526 static void
527 ra_spill_ctx_insert(struct ra_spill_ctx *ctx,
528                     struct ra_spill_interval *interval)
529 {
530    ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
531 }
532 
533 static void
534 ra_spill_ctx_remove(struct ra_spill_ctx *ctx,
535                     struct ra_spill_interval *interval)
536 {
537    ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
538 }
539 
540 static void
541 init_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
542 {
543    struct ra_spill_interval *interval = ctx->intervals[dst->name];
544    ra_spill_interval_init(interval, dst);
545    if (ctx->spilling) {
546       interval->next_use_distance = dst->next_use;
547 
548       /* We only need to keep track of used-ness if this value may be
549        * rematerialized. This also keeps us from nuking things that may be
550        * in the keeps list (e.g. atomics, input splits).
551        */
552       if (interval->can_rematerialize)
553          dst->instr->flags |= IR3_INSTR_UNUSED;
554    }
555 }
556 
557 static void
558 insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
559 {
560    struct ra_spill_interval *interval = ctx->intervals[dst->name];
561    if (interval->interval.inserted)
562       return;
563 
564    ra_spill_ctx_insert(ctx, interval);
565    interval->cant_spill = true;
566 
567    /* For precolored inputs, make sure we leave enough registers to allow for
568     * holes in the inputs. It can happen that the binning shader has a lower
569     * register pressure than the main shader, but the main shader decided to
570     * add holes between the inputs which means that the binning shader has a
571     * higher register demand.
572     */
573    if (dst->instr->opc == OPC_META_INPUT && dst->num != INVALID_REG) {
574       physreg_t physreg = ra_reg_get_physreg(dst);
575       physreg_t max = physreg + reg_size(dst);
576 
577       if (interval->interval.reg->flags & IR3_REG_SHARED)
578          ctx->max_pressure.shared = MAX2(ctx->max_pressure.shared, max);
579       else if (interval->interval.reg->flags & IR3_REG_HALF)
580          ctx->max_pressure.half = MAX2(ctx->max_pressure.half, max);
581       else
582          ctx->max_pressure.full = MAX2(ctx->max_pressure.full, max);
583    }
584 }
585 
586 static void
587 insert_src(struct ra_spill_ctx *ctx, struct ir3_register *src)
588 {
589    struct ra_spill_interval *interval = ctx->intervals[src->def->name];
590 
591    if (!interval->interval.inserted) {
592       ra_spill_ctx_insert(ctx, interval);
593       interval->needs_reload = true;
594       interval->already_spilled = true;
595    }
596 
597    ra_spill_interval_root(interval)->cant_spill = true;
598 
599 }
600 
601 static void
602 remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
603                  struct ir3_register *src)
604 {
605    struct ra_spill_interval *interval = ctx->intervals[src->def->name];
606 
607    if (!interval->interval.inserted || interval->interval.parent ||
608        !rb_tree_is_empty(&interval->interval.children))
609       return;
610 
611    ra_spill_ctx_remove(ctx, interval);
612 }
613 
614 static void
615 remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
616            struct ir3_register *src)
617 {
618    struct ra_spill_interval *interval = ctx->intervals[src->def->name];
619 
620    if (!interval->interval.inserted)
621       return;
622 
623    ra_spill_ctx_remove(ctx, interval);
624 }
625 
626 static void
627 finish_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
628 {
629    struct ra_spill_interval *interval = ctx->intervals[dst->name];
630    interval->cant_spill = false;
631 }
632 
633 static void
634 remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
635 {
636    struct ra_spill_interval *interval = ctx->intervals[dst->name];
637 
638    if (!interval->interval.inserted)
639       return;
640 
641    ra_spill_ctx_remove(ctx, interval);
642 }
643 
644 static void
645 update_src_next_use(struct ra_spill_ctx *ctx, struct ir3_register *src)
646 {
647    struct ra_spill_interval *interval = ctx->intervals[src->def->name];
648 
649    assert(interval->interval.inserted);
650 
651    interval->next_use_distance = src->next_use;
652 
653    /* If this node is inserted in one of the trees, then it needs to be resorted
654     * as its key has changed.
655     */
656    if (!interval->interval.parent && !(src->flags & IR3_REG_SHARED)) {
657       if (src->flags & IR3_REG_HALF) {
658          rb_tree_remove(&ctx->half_live_intervals, &interval->half_node);
659          rb_tree_insert(&ctx->half_live_intervals, &interval->half_node,
660                         ra_spill_interval_half_cmp);
661       }
662       if (ctx->merged_regs || !(src->flags & IR3_REG_HALF)) {
663          rb_tree_remove(&ctx->full_live_intervals, &interval->node);
664          rb_tree_insert(&ctx->full_live_intervals, &interval->node,
665                         ra_spill_interval_cmp);
666       }
667    }
668 }
669 
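/* Allocate (on first use) and return the pvtmem offset for this value. Values
 * in the same merge set share a single slot addressed by merge_set_offset, so
 * overlapping defs spill to the same location. The factor of 2 converts the
 * half-register-based sizes/offsets into bytes, since ctx->spill_slot is in
 * bytes.
 */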
670 static unsigned
671 get_spill_slot(struct ra_spill_ctx *ctx, struct ir3_register *reg)
672 {
673    if (reg->merge_set) {
674       if (reg->merge_set->spill_slot == ~0) {
675          reg->merge_set->spill_slot = ALIGN_POT(ctx->spill_slot,
676                                                 reg->merge_set->alignment);
677          ctx->spill_slot = reg->merge_set->spill_slot + reg->merge_set->size * 2;
678       }
679       return reg->merge_set->spill_slot + reg->merge_set_offset * 2;
680    } else {
681       if (reg->spill_slot == ~0) {
682          reg->spill_slot = ALIGN_POT(ctx->spill_slot, reg_elem_size(reg));
683          ctx->spill_slot = reg->spill_slot + reg_size(reg) * 2;
684       }
685       return reg->spill_slot;
686    }
687 }
688 
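/* Point src at wherever the value currently lives: an immediate, a const
 * register, or an SSA def (whose defining instruction is then marked used
 * again).
 */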
689 static void
690 set_src_val(struct ir3_register *src, const struct reg_or_immed *val)
691 {
692    if (val->flags & IR3_REG_IMMED) {
693       src->flags = IR3_REG_IMMED | (val->flags & IR3_REG_HALF);
694       src->uim_val = val->uimm;
695       src->def = NULL;
696    } else if (val->flags & IR3_REG_CONST) {
697       src->flags = IR3_REG_CONST | (val->flags & IR3_REG_HALF);
698       src->num = val->const_num;
699       src->def = NULL;
700    } else {
701       src->def = val->def;
702       val->def->instr->flags &= ~IR3_INSTR_UNUSED;
703    }
704 }
705 
706 static struct ir3_register *
707 materialize_pcopy_src(const struct reg_or_immed *src,
708                       struct ir3_instruction *instr,
709                       struct ir3_block *block)
710 {
711    struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
712    struct ir3_register *dst = __ssa_dst(mov);
713    dst->flags |= src->flags & IR3_REG_HALF;
714    struct ir3_register *mov_src = ir3_src_create(mov, INVALID_REG, src->flags);
715    set_src_val(mov_src, src);
716    mov->cat1.src_type = mov->cat1.dst_type =
717       (src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
718 
719    if (instr)
720       ir3_instr_move_before(mov, instr);
721    return dst;
722 }
723 
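/* Emit a SPILL_MACRO storing val to the given pvtmem slot, relative to the
 * reserved base register. The store is placed right before instr, or at the
 * end of block if instr is NULL. Immediate/const pcopy sources are first
 * materialized into a register with a mov.
 */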
724 static void
725 spill(struct ra_spill_ctx *ctx, const struct reg_or_immed *val,
726       unsigned spill_slot, struct ir3_instruction *instr, struct ir3_block *block)
727 {
728    struct ir3_register *reg;
729 
730    /* If spilling an immed/const pcopy src, we need to actually materialize it
731     * first with a mov.
732     */
733    if (val->flags & (IR3_REG_CONST | IR3_REG_IMMED)) {
734       reg = materialize_pcopy_src(val, instr, block);
735    } else {
736       reg = val->def;
737       reg->instr->flags &= ~IR3_INSTR_UNUSED;
738    }
739 
740    d("spilling ssa_%u:%u to %u", reg->instr->serialno, reg->name,
741      spill_slot);
742 
743    unsigned elems = reg_elems(reg);
744    struct ir3_instruction *spill =
745       ir3_instr_create(block, OPC_SPILL_MACRO, 0, 3);
746    ir3_src_create(spill, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg;
747    unsigned src_flags = reg->flags & (IR3_REG_HALF | IR3_REG_IMMED |
748                                       IR3_REG_CONST | IR3_REG_SSA |
749                                       IR3_REG_ARRAY);
750    struct ir3_register *src = ir3_src_create(spill, INVALID_REG, src_flags);
751    ir3_src_create(spill, INVALID_REG, IR3_REG_IMMED)->uim_val = elems;
752    spill->cat6.dst_offset = spill_slot;
753    spill->cat6.type = (reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
754 
755    src->def = reg;
756    if (reg->flags & IR3_REG_ARRAY) {
757       src->size = reg->size;
758       src->array.id = reg->array.id;
759       src->array.offset = 0;
760    } else {
761       src->wrmask = reg->wrmask;
762    }
763 
764    if (instr)
765       ir3_instr_move_before(spill, instr);
766 }
767 
768 static void
769 spill_interval(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval,
770                struct ir3_instruction *instr, struct ir3_block *block)
771 {
772    if (interval->can_rematerialize && !interval->interval.reg->merge_set)
773       return;
774 
775    spill(ctx, &interval->dst, get_spill_slot(ctx, interval->interval.reg),
776          instr, block);
777 }
778 
779 /* This is similar to "limit" in the paper. */
780 static void
781 limit(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
782 {
783    if (ctx->cur_pressure.half > ctx->limit_pressure.half) {
784       d("cur half pressure %u exceeds %u", ctx->cur_pressure.half,
785         ctx->limit_pressure.half);
786       rb_tree_foreach_safe (struct ra_spill_interval, interval,
787                             &ctx->half_live_intervals, half_node) {
788          d("trying ssa_%u:%u", interval->interval.reg->instr->serialno,
789            interval->interval.reg->name);
790          if (!interval->cant_spill) {
791             if (!interval->already_spilled)
792                spill_interval(ctx, interval, instr, instr->block);
793             ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
794             if (ctx->cur_pressure.half <= ctx->limit_pressure.half)
795                break;
796          }
797       }
798 
799       assert(ctx->cur_pressure.half <= ctx->limit_pressure.half);
800    }
801 
802    if (ctx->cur_pressure.full > ctx->limit_pressure.full) {
803       d("cur full pressure %u exceeds %u", ctx->cur_pressure.full,
804         ctx->limit_pressure.full);
805       rb_tree_foreach_safe (struct ra_spill_interval, interval,
806                             &ctx->full_live_intervals, node) {
807          d("trying ssa_%u:%u", interval->interval.reg->instr->serialno,
808            interval->interval.reg->name);
809          if (!interval->cant_spill) {
810             if (!interval->already_spilled)
811                spill_interval(ctx, interval, instr, instr->block);
812             ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
813             if (ctx->cur_pressure.full <= ctx->limit_pressure.full)
814                break;
815          } else {
816             d("can't spill");
817          }
818       }
819 
820       assert(ctx->cur_pressure.full <= ctx->limit_pressure.full);
821    }
822 }
823 
824 /* There's a corner case where we reload a value which has overlapping live
825  * values already reloaded, either because it's the child of some other interval
826  * that was already reloaded or some of its children have already been
827  * reloaded. Because RA only expects overlapping source/dest intervals for meta
828  * instructions (split/collect), and we don't want to add register pressure by
829  * creating an entirely separate value, we need to add splits and collects to
830  * deal with this case. These splits/collects have to also have correct merge
831  * set information, so that it doesn't result in any actual code or register
832  * pressure in practice.
833  */
834 
835 static void
836 add_to_merge_set(struct ir3_merge_set *set, struct ir3_register *def,
837                  unsigned offset)
838 {
839    def->merge_set = set;
840    def->merge_set_offset = offset;
841    def->interval_start = set->interval_start + offset;
842    def->interval_end = set->interval_start + offset + reg_size(def);
843 }
844 
845 static struct ir3_register *
846 split(struct ir3_register *def, unsigned offset,
847       struct ir3_instruction *after, struct ir3_block *block)
848 {
849    if (reg_elems(def) == 1) {
850       assert(offset == 0);
851       return def;
852    }
853 
854    assert(!(def->flags & IR3_REG_ARRAY));
855    assert(def->merge_set);
856    struct ir3_instruction *split =
857       ir3_instr_create(block, OPC_META_SPLIT, 1, 1);
858    struct ir3_register *dst = __ssa_dst(split);
859    dst->flags |= def->flags & IR3_REG_HALF;
860    struct ir3_register *src = ir3_src_create(split, INVALID_REG, def->flags);
861    src->wrmask = def->wrmask;
862    src->def = def;
863    add_to_merge_set(def->merge_set, dst,
864                     def->merge_set_offset + offset * reg_elem_size(def));
865    if (after)
866       ir3_instr_move_before(split, after);
867    return dst;
868 }
869 
870 static struct ir3_register *
871 extract(struct ir3_register *parent_def, unsigned offset, unsigned elems,
872         struct ir3_instruction *after, struct ir3_block *block)
873 {
874    if (offset == 0 && elems == reg_elems(parent_def))
875       return parent_def;
876 
877    struct ir3_register *srcs[elems];
878    for (unsigned i = 0; i < elems; i++) {
879       srcs[i] = split(parent_def, offset + i, after, block);
880    }
881 
882    struct ir3_instruction *collect =
883       ir3_instr_create(block, OPC_META_COLLECT, 1, elems);
884    struct ir3_register *dst = __ssa_dst(collect);
885    dst->flags |= parent_def->flags & IR3_REG_HALF;
886    dst->wrmask = MASK(elems);
887    add_to_merge_set(parent_def->merge_set, dst, parent_def->merge_set_offset);
888 
889    for (unsigned i = 0; i < elems; i++) {
890       ir3_src_create(collect, INVALID_REG, parent_def->flags)->def = srcs[i];
891    }
892 
893    if (after)
894       ir3_instr_move_before(collect, after);
895    return dst;
896 }
897 
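/* Emit a RELOAD_MACRO loading reg's value back from its spill slot into a
 * fresh def that inherits reg's merge set and interval information, so that
 * RA treats the reloaded value exactly like the original.
 */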
898 static struct ir3_register *
899 reload(struct ra_spill_ctx *ctx, struct ir3_register *reg,
900        struct ir3_instruction *after, struct ir3_block *block)
901 {
902    unsigned spill_slot = get_spill_slot(ctx, reg);
903 
904    d("reloading ssa_%u:%u from %u", reg->instr->serialno, reg->name,
905      spill_slot);
906 
907    unsigned elems = reg_elems(reg);
908    struct ir3_instruction *reload =
909       ir3_instr_create(block, OPC_RELOAD_MACRO, 1, 3);
910    struct ir3_register *dst = __ssa_dst(reload);
911    dst->flags |= reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
912    /* The reload may be split into multiple pieces, and if the destination
913     * overlaps with the base register then it could get clobbered before the
914     * last ldp in the sequence. Note that we always reserve space for the base
915     * register throughout the whole program, so effectively extending its live
916     * range past the end of the instruction isn't a problem for our pressure
917     * accounting.
918     */
919    dst->flags |= IR3_REG_EARLY_CLOBBER;
920    ir3_src_create(reload, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg;
921    struct ir3_register *offset_reg =
922       ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED);
923    offset_reg->uim_val = spill_slot;
924    ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED)->uim_val = elems;
925    reload->cat6.type = (reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
926 
927    if (reg->flags & IR3_REG_ARRAY) {
928       dst->array.offset = 0;
929       dst->array.id = reg->array.id;
930       dst->size = reg->size;
931    } else {
932       dst->wrmask = MASK(elems);
933    }
934 
935    dst->merge_set = reg->merge_set;
936    dst->merge_set_offset = reg->merge_set_offset;
937    dst->interval_start = reg->interval_start;
938    dst->interval_end = reg->interval_end;
939 
940    if (after)
941       ir3_instr_move_before(reload, after);
942 
943    return dst;
944 }
945 
946 static void
947 rewrite_src_interval(struct ra_spill_ctx *ctx,
948                     struct ra_spill_interval *interval,
949                     struct ir3_register *def,
950                     struct ir3_instruction *instr,
951                     struct ir3_block *block)
952 {
953    interval->dst.flags = def->flags;
954    interval->dst.def = def;
955    interval->needs_reload = false;
956 
957    rb_tree_foreach (struct ra_spill_interval, child,
958                     &interval->interval.children, interval.node) {
959       struct ir3_register *child_reg = child->interval.reg;
960       struct ir3_register *child_def =
961          extract(def, (child_reg->interval_start -
962                        interval->interval.reg->interval_start) / reg_elem_size(def),
963                  reg_elems(child_reg), instr, block);
964       rewrite_src_interval(ctx, child, child_def, instr, block);
965    }
966 }
967 
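/* Make def live again. If a parent interval is already live (and is not
 * itself waiting on a reload), extract the value from the parent instead of
 * touching memory; otherwise rematerialize or reload it and fix up any
 * already-live child intervals via rewrite_src_interval().
 */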
968 static void
969 reload_def(struct ra_spill_ctx *ctx, struct ir3_register *def,
970            struct ir3_instruction *instr, struct ir3_block *block)
971 {
972    unsigned elems = reg_elems(def);
973    struct ra_spill_interval *interval = ctx->intervals[def->name];
974 
975    struct ir3_reg_interval *ir3_parent = interval->interval.parent;
976 
977    if (ir3_parent) {
978       struct ra_spill_interval *parent =
979          ir3_reg_interval_to_interval(ir3_parent);
980       if (!parent->needs_reload) {
981          interval->dst.flags = def->flags;
982          interval->dst.def = extract(
983             parent->dst.def, (def->interval_start - parent->dst.def->interval_start) /
984             reg_elem_size(def), elems, instr, block);
985          return;
986       }
987    }
988 
989    struct ir3_register *dst;
990    if (interval->can_rematerialize)
991       dst = rematerialize(def, instr, block);
992    else
993       dst = reload(ctx, def, instr, block);
994 
995    rewrite_src_interval(ctx, interval, dst, instr, block);
996 }
997 
998 static void
999 reload_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
1000             struct ir3_register *src)
1001 {
1002    struct ra_spill_interval *interval = ctx->intervals[src->def->name];
1003 
1004    if (interval->needs_reload) {
1005       reload_def(ctx, src->def, instr, instr->block);
1006    }
1007 
1008    ra_spill_interval_root(interval)->cant_spill = false;
1009 }
1010 
1011 static void
1012 rewrite_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
1013             struct ir3_register *src)
1014 {
1015    struct ra_spill_interval *interval = ctx->intervals[src->def->name];
1016 
1017    set_src_val(src, &interval->dst);
1018 }
1019 
1020 static void
1021 update_max_pressure(struct ra_spill_ctx *ctx)
1022 {
1023    d("pressure:");
1024    d("\tfull: %u", ctx->cur_pressure.full);
1025    d("\thalf: %u", ctx->cur_pressure.half);
1026    d("\tshared: %u", ctx->cur_pressure.shared);
1027 
1028    ctx->max_pressure.full =
1029       MAX2(ctx->max_pressure.full, ctx->cur_pressure.full);
1030    ctx->max_pressure.half =
1031       MAX2(ctx->max_pressure.half, ctx->cur_pressure.half);
1032    ctx->max_pressure.shared =
1033       MAX2(ctx->max_pressure.shared, ctx->cur_pressure.shared);
1034 }
1035 
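/* Process a single instruction. The ordering is important: sources are
 * inserted (and, when spilling, reloaded) before destinations so that limit()
 * never spills a value the instruction is about to use, and killed sources
 * are removed before being rewritten to their current locations.
 */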
1036 static void
1037 handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
1038 {
1039    ra_foreach_dst (dst, instr) {
1040       init_dst(ctx, dst);
1041    }
1042 
1043    if (ctx->spilling) {
1044       ra_foreach_src (src, instr)
1045          insert_src(ctx, src);
1046    }
1047 
1048    /* Handle tied and early-kill destinations. If a destination is tied to a
1049     * source and that source is live-through, then we need to allocate a new
1050     * register for the destination which is live-through itself and cannot
1051     * overlap the sources. Similarly early-kill destinations cannot overlap
1052     * sources.
1053     */
1054 
1055    ra_foreach_dst (dst, instr) {
1056       struct ir3_register *tied_src = dst->tied;
1057       if ((tied_src && !(tied_src->flags & IR3_REG_FIRST_KILL)) ||
1058           (dst->flags & IR3_REG_EARLY_CLOBBER))
1059          insert_dst(ctx, dst);
1060    }
1061 
1062    if (ctx->spilling)
1063       limit(ctx, instr);
1064    else
1065       update_max_pressure(ctx);
1066 
1067    if (ctx->spilling) {
1068       ra_foreach_src (src, instr) {
1069          reload_src(ctx, instr, src);
1070          update_src_next_use(ctx, src);
1071       }
1072    }
1073 
1074    ra_foreach_src (src, instr) {
1075       if (src->flags & IR3_REG_FIRST_KILL)
1076          remove_src_early(ctx, instr, src);
1077    }
1078 
1079    ra_foreach_dst (dst, instr) {
1080       insert_dst(ctx, dst);
1081    }
1082 
1083    if (ctx->spilling)
1084       limit(ctx, instr);
1085    else
1086       update_max_pressure(ctx);
1087 
1088    /* We have to remove sources before rewriting them so that we can lookup the
1089     * interval to remove before the source itself is changed.
1090     */
1091    ra_foreach_src (src, instr) {
1092       if (src->flags & IR3_REG_FIRST_KILL)
1093          remove_src(ctx, instr, src);
1094    }
1095 
1096    if (ctx->spilling) {
1097       ra_foreach_src (src, instr) {
1098          rewrite_src(ctx, instr, src);
1099       }
1100    }
1101 
1102    ra_foreach_dst (dst, instr) {
1103       finish_dst(ctx, dst);
1104    }
1105 
1106    for (unsigned i = 0; i < instr->dsts_count; i++) {
1107       if (ra_reg_is_dst(instr->dsts[i]) &&
1108           (instr->dsts[i]->flags & IR3_REG_UNUSED))
1109          remove_dst(ctx, instr->dsts[i]);
1110    }
1111 }
1112 
1113 static struct ra_spill_interval *
1114 create_temp_interval(struct ra_spill_ctx *ctx, struct ir3_register *def)
1115 {
1116    unsigned name = ctx->intervals_count++;
1117    unsigned offset = ctx->live->interval_offset;
1118 
1119    /* This is kinda hacky, but we need to create a fake SSA def here that is
1120     * only used as part of the pcopy accounting. See below.
1121     */
1122    struct ir3_register *reg = rzalloc(ctx, struct ir3_register);
1123    *reg = *def;
1124    reg->name = name;
1125    reg->interval_start = offset;
1126    reg->interval_end = offset + reg_size(def);
1127    reg->merge_set = NULL;
1128 
1129    ctx->intervals = reralloc(ctx, ctx->intervals, struct ra_spill_interval *,
1130                              ctx->intervals_count);
1131    struct ra_spill_interval *interval = rzalloc(ctx, struct ra_spill_interval);
1132    ra_spill_interval_init(interval, reg);
1133    ctx->intervals[name] = interval;
1134    ctx->live->interval_offset += reg_size(def);
1135    return interval;
1136 }
1137 
1138 /* In the sequence of copies generated (see below), would this source be killed?
1139  */
1140 static bool
1141 is_last_pcopy_src(struct ir3_instruction *pcopy, unsigned src_n)
1142 {
1143    struct ir3_register *src = pcopy->srcs[src_n];
1144    if (!(src->flags & IR3_REG_KILL))
1145       return false;
1146    for (unsigned j = src_n + 1; j < pcopy->srcs_count; j++) {
1147       if (pcopy->srcs[j]->def == src->def)
1148          return false;
1149    }
1150    return true;
1151 }
1152 
1153 /* Parallel copies are different from normal instructions. The sources together
1154  * may be larger than the entire register file, so we cannot just reload every
1155  * source like normal, and indeed that probably wouldn't be a great idea.
1156  * Instead we essentially need to lower the parallel copy to "copies," just like
1157  * in the normal CSSA construction, although we implement the copies by
1158  * reloading and then possibly spilling values. We essentially just shuffle
1159  * around the sources until each source either (a) is live or (b) has the same
1160  * spill slot as its corresponding destination. We do this by decomposing the
1161  * copy into a series of copies, so:
1162  *
1163  * a, b, c = d, e, f
1164  *
1165  * becomes:
1166  *
1167  * d' = d
1168  * e' = e
1169  * f' = f
1170  * a = d'
1171  * b = e'
1172  * c = f'
1173  *
1174  * the temporary SSA values d', e', and f' never actually show up in the result.
1175  * They are only used for our internal accounting. They may, however, have their
1176  * own spill slot created for them. Similarly, we don't actually emit any copy
1177  * instructions, although we emit the spills/reloads that *would've* been
1178  * required if those copies were there.
1179  *
1180  * TODO: in order to reduce the number of temporaries and therefore spill slots,
1181  * we could instead do a more complicated analysis that considers the location
1182  * transfer graph.
1183  *
1184  * In addition, we actually remove the parallel copy and rewrite all its uses
1185  * (in the phi nodes) rather than rewrite its sources at the end. Recreating it
1186  * later turns out to be easier than keeping it up-to-date throughout this pass,
1187  * since we may have to remove entries for phi sources that are spilled and add
1188  * entries for live-outs that are spilled and reloaded, which can happen here
1189  * and then possibly be undone or done again when processing live-ins of the
1190  * successor block.
1191  */
1192 
1193 static void
1194 handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy)
1195 {
1196    ra_foreach_dst (dst, pcopy) {
1197       struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
1198       ra_spill_interval_init(dst_interval, dst);
1199    }
1200 
1201    foreach_src_n (src, i, pcopy) {
1202       struct ir3_register *dst = pcopy->dsts[i];
1203       if (!(dst->flags & IR3_REG_SSA))
1204          continue;
1205 
1206       d("processing src %u", i);
1207 
1208       /* Skip the intermediate copy for cases where the source is merged with
1209        * the destination. Crucially this means that we also don't reload/spill
1210        * it if it's been spilled, because it shares the same spill slot.
1211        */
1212       if ((src->flags & IR3_REG_SSA) && src->def->merge_set &&
1213           src->def->merge_set == dst->merge_set &&
1214           src->def->merge_set_offset == dst->merge_set_offset) {
1215          struct ra_spill_interval *src_interval = ctx->intervals[src->def->name];
1216          struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
1217          if (src_interval->interval.inserted) {
1218             update_src_next_use(ctx, src);
1219             if (is_last_pcopy_src(pcopy, i))
1220                ra_spill_ctx_remove(ctx, src_interval);
1221             dst_interval->cant_spill = true;
1222             ra_spill_ctx_insert(ctx, dst_interval);
1223             limit(ctx, pcopy);
1224             dst_interval->cant_spill = false;
1225             dst_interval->dst = src_interval->dst;
1226          }
1227       } else if (src->flags & IR3_REG_SSA) {
1228          struct ra_spill_interval *temp_interval =
1229             create_temp_interval(ctx, dst);
1230          struct ir3_register *temp = temp_interval->interval.reg;
1231          temp_interval->next_use_distance = src->next_use;
1232 
1233          insert_src(ctx, src);
1234          limit(ctx, pcopy);
1235          reload_src(ctx, pcopy, src);
1236          update_src_next_use(ctx, src);
1237          if (is_last_pcopy_src(pcopy, i))
1238             remove_src(ctx, pcopy, src);
1239          struct ra_spill_interval *src_interval =
1240             ctx->intervals[src->def->name];
1241          temp_interval->dst = src_interval->dst;
1242 
1243          temp_interval->cant_spill = true;
1244          ra_spill_ctx_insert(ctx, temp_interval);
1245          limit(ctx, pcopy);
1246          temp_interval->cant_spill = false;
1247 
1248          src->flags = temp->flags;
1249          src->def = temp;
1250       }
1251    }
1252 
1253    d("done with pcopy srcs");
1254 
1255    foreach_src_n (src, i, pcopy) {
1256       struct ir3_register *dst = pcopy->dsts[i];
1257       if (!(dst->flags & IR3_REG_SSA))
1258          continue;
1259 
1260       if ((src->flags & IR3_REG_SSA) && src->def->merge_set &&
1261           src->def->merge_set == dst->merge_set &&
1262           src->def->merge_set_offset == dst->merge_set_offset)
1263          continue;
1264 
1265       struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
1266 
1267       if (!(src->flags & IR3_REG_SSA)) {
1268          dst_interval->cant_spill = true;
1269          ra_spill_ctx_insert(ctx, dst_interval);
1270          limit(ctx, pcopy);
1271          dst_interval->cant_spill = false;
1272 
1273          assert(src->flags & (IR3_REG_CONST | IR3_REG_IMMED));
1274          if (src->flags & IR3_REG_CONST) {
1275             dst_interval->dst.flags = src->flags;
1276             dst_interval->dst.const_num = src->num;
1277          } else {
1278             dst_interval->dst.flags = src->flags;
1279             dst_interval->dst.uimm = src->uim_val;
1280          }
1281       } else {
1282          struct ra_spill_interval *temp_interval = ctx->intervals[src->def->name];
1283 
1284          insert_src(ctx, src);
1285          limit(ctx, pcopy);
1286          reload_src(ctx, pcopy, src);
1287          remove_src(ctx, pcopy, src);
1288 
1289          dst_interval->dst = temp_interval->dst;
1290          ra_spill_ctx_insert(ctx, dst_interval);
1291       }
1292    }
1293 
1294    pcopy->flags |= IR3_INSTR_UNUSED;
1295 }
1296 
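/* Inputs and phis only define a value at the top of the block, so just make
 * the destination interval live; sources (for tex prefetches) are only
 * removed later in remove_input_phi().
 */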
1297 static void
1298 handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
1299 {
1300    if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1301       return;
1302 
1303    init_dst(ctx, instr->dsts[0]);
1304    insert_dst(ctx, instr->dsts[0]);
1305    finish_dst(ctx, instr->dsts[0]);
1306 }
1307 
1308 static void
1309 remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
1310 {
1311    if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1312       return;
1313 
1314    if (instr->opc == OPC_META_TEX_PREFETCH) {
1315       ra_foreach_src (src, instr)
1316          remove_src(ctx, instr, src);
1317    }
1318    if (instr->dsts[0]->flags & IR3_REG_UNUSED)
1319       remove_dst(ctx, instr->dsts[0]);
1320 }
1321 
1322 static void
1323 handle_live_in(struct ra_spill_ctx *ctx, struct ir3_block *block,
1324                struct ir3_register *def)
1325 {
1326    struct ra_spill_interval *interval = ctx->intervals[def->name];
1327    ra_spill_interval_init(interval, def);
1328    if (ctx->spilling) {
1329       interval->next_use_distance =
1330          ctx->blocks[block->index].next_use_start[def->name];
1331    }
1332 
1333    ra_spill_ctx_insert(ctx, interval);
1334 }
1335 
1336 static bool
1337 is_live_in_phi(struct ir3_register *def, struct ir3_block *block)
1338 {
1339    return def->instr->opc == OPC_META_PHI && def->instr->block == block;
1340 }
1341 
1342 static bool
1343 is_live_in_pred(struct ra_spill_ctx *ctx, struct ir3_register *def,
1344                 struct ir3_block *block, unsigned pred_idx)
1345 {
1346    struct ir3_block *pred = block->predecessors[pred_idx];
1347    struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1348    if (is_live_in_phi(def, block)) {
1349       def = def->instr->srcs[pred_idx]->def;
1350       if (!def)
1351          return false;
1352    }
1353 
1354    return _mesa_hash_table_search(state->remap, def);
1355 }
1356 
1357 static bool
1358 is_live_in_undef(struct ir3_register *def,
1359                  struct ir3_block *block, unsigned pred_idx)
1360 {
1361    if (!is_live_in_phi(def, block))
1362       return false;
1363 
1364    return !def->instr->srcs[pred_idx]->def;
1365 }
1366 
1367 static struct reg_or_immed *
1368 read_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
1369              struct ir3_block *block, unsigned pred_idx)
1370 {
1371    struct ir3_block *pred = block->predecessors[pred_idx];
1372    struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1373 
1374    if (is_live_in_phi(def, block)) {
1375       def = def->instr->srcs[pred_idx]->def;
1376       if (!def)
1377          return NULL;
1378    }
1379 
1380    struct hash_entry *entry = _mesa_hash_table_search(state->remap, def);
1381    if (entry)
1382       return entry->data;
1383    else
1384       return NULL;
1385 }
1386 
1387 static bool
1388 is_live_in_all_preds(struct ra_spill_ctx *ctx, struct ir3_register *def,
1389                      struct ir3_block *block)
1390 {
1391    for (unsigned i = 0; i < block->predecessors_count; i++) {
1392       if (!is_live_in_pred(ctx, def, block, i))
1393          return false;
1394    }
1395 
1396    return true;
1397 }
1398 
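/* A live-in value that we decided not to keep in registers must be spilled on
 * each incoming edge where it is available, i.e. at the end of every
 * already-visited predecessor that still has a mapping for it.
 */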
1399 static void
1400 spill_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
1401               struct ir3_block *block)
1402 {
1403    for (unsigned i = 0; i < block->predecessors_count; i++) {
1404       struct ir3_block *pred = block->predecessors[i];
1405       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1406 
1407       if (!state->visited)
1408          continue;
1409 
1410       struct reg_or_immed *pred_def = read_live_in(ctx, def, block, i);
1411       if (pred_def) {
1412          spill(ctx, pred_def, get_spill_slot(ctx, def), NULL, pred);
1413       }
1414    }
1415 }
1416 
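/* If the live-in set already exceeds the pressure limits, trim it before
 * processing the block body: candidates that are live at the end of every
 * visited predecessor are preferred to stay, everything else is spilled on
 * the incoming edges and removed from the live-in set.
 */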
1417 static void
1418 spill_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block)
1419 {
1420    bool all_preds_visited = true;
1421    for (unsigned i = 0; i < block->predecessors_count; i++) {
1422       struct ir3_block *pred = block->predecessors[i];
1423       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1424       if (!state->visited) {
1425          all_preds_visited = false;
1426          break;
1427       }
1428    }
1429 
1430    /* Note: in the paper they explicitly spill live-through values first, but we
1431     * should get that automatically by spilling values with the largest next-use
1432     * distance first, thanks to the extra distance added to edges out of loops.
1433     *
1434     * TODO: Keep track of pressure in each block and preemptively spill
1435     * live-through values as described in the paper to avoid spilling them
1436     * inside the loop.
1437     */
1438 
1439    if (ctx->cur_pressure.half > ctx->limit_pressure.half) {
1440       rb_tree_foreach_safe (struct ra_spill_interval, interval,
1441                             &ctx->half_live_intervals, half_node) {
1442          if (all_preds_visited &&
1443              is_live_in_all_preds(ctx, interval->interval.reg, block))
1444             continue;
1445          if (interval->interval.reg->merge_set ||
1446              !interval->can_rematerialize)
1447             spill_live_in(ctx, interval->interval.reg, block);
1448          ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
1449          if (ctx->cur_pressure.half <= ctx->limit_pressure.half)
1450             break;
1451       }
1452    }
1453 
1454    if (ctx->cur_pressure.full > ctx->limit_pressure.full) {
1455       rb_tree_foreach_safe (struct ra_spill_interval, interval,
1456                             &ctx->full_live_intervals, node) {
1457          if (all_preds_visited &&
1458              is_live_in_all_preds(ctx, interval->interval.reg, block))
1459             continue;
1460          spill_live_in(ctx, interval->interval.reg, block);
1461          ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
1462          if (ctx->cur_pressure.full <= ctx->limit_pressure.full)
1463             break;
1464       }
1465    }
1466 }
1467 
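/* Record in the predecessor's remap table that this live-in maps to new_val on
 * the edge from that predecessor, and recursively do the same for its child
 * intervals using sub-registers extracted from new_val.
 */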
1468 static void
1469 live_in_rewrite(struct ra_spill_ctx *ctx,
1470                 struct ra_spill_interval *interval,
1471                 struct reg_or_immed *new_val,
1472                 struct ir3_block *block, unsigned pred_idx)
1473 {
1474    struct ir3_block *pred = block->predecessors[pred_idx];
1475    struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1476    struct ir3_register *def = interval->interval.reg;
1477    if (is_live_in_phi(def, block)) {
1478       def = def->instr->srcs[pred_idx]->def;
1479    }
1480 
1481    if (def)
1482       _mesa_hash_table_insert(state->remap, def, new_val);
1483 
1484    rb_tree_foreach (struct ra_spill_interval, child,
1485                     &interval->interval.children, interval.node) {
1486       assert(new_val->flags & IR3_REG_SSA);
1487       struct ir3_register *child_def =
1488          extract(new_val->def,
1489                  (child->interval.reg->interval_start - def->interval_start) /
1490                  reg_elem_size(def), reg_elems(child->interval.reg),
1491                  NULL, pred);
1492       struct reg_or_immed *child_val = ralloc(ctx, struct reg_or_immed);
1493       child_val->def = child_def;
1494       child_val->flags = child_def->flags;
1495       live_in_rewrite(ctx, child, child_val, block, pred_idx);
1496    }
1497 }
1498 
1499 static void
1500 reload_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
1501                struct ir3_block *block)
1502 {
1503    struct ra_spill_interval *interval = ctx->intervals[def->name];
1504    for (unsigned i = 0; i < block->predecessors_count; i++) {
1505       struct ir3_block *pred = block->predecessors[i];
1506       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1507       if (!state->visited)
1508          continue;
1509 
1510       if (is_live_in_undef(def, block, i))
1511          continue;
1512 
1513       struct reg_or_immed *new_val = read_live_in(ctx, def, block, i);
1514 
1515       if (!new_val) {
1516          new_val = ralloc(ctx, struct reg_or_immed);
1517          if (interval->can_rematerialize)
1518             new_val->def = rematerialize(def, NULL, pred);
1519          else
1520             new_val->def = reload(ctx, def, NULL, pred);
1521          new_val->flags = new_val->def->flags;
1522       }
1523       live_in_rewrite(ctx, interval, new_val, block, i);
1524    }
1525 }
1526 
1527 static void
1528 reload_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block)
1529 {
1530    rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
1531                     interval.node) {
1532       reload_live_in(ctx, interval->interval.reg, block);
1533    }
1534 }
1535 
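/* After reloading live-ins, a value may end up with different copies in
 * different predecessors, or some predecessor may not have been visited yet
 * (e.g. across a loop back-edge). In that case insert a phi at the top of the
 * block to merge them; otherwise just forward the single common definition.
 */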
1536 static void
1537 add_live_in_phi(struct ra_spill_ctx *ctx, struct ir3_register *def,
1538                 struct ir3_block *block)
1539 {
1540    struct ra_spill_interval *interval = ctx->intervals[def->name];
1541    if (!interval->interval.inserted)
1542       return;
1543 
1544    bool needs_phi = false;
1545    struct ir3_register *cur_def = NULL;
1546    for (unsigned i = 0; i < block->predecessors_count; i++) {
1547       struct ir3_block *pred = block->predecessors[i];
1548       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1549 
1550       if (!state->visited) {
1551          needs_phi = true;
1552          break;
1553       }
1554 
1555       struct hash_entry *entry =
1556          _mesa_hash_table_search(state->remap, def);
1557       assert(entry);
1558       struct reg_or_immed *pred_val = entry->data;
1559       if ((pred_val->flags & (IR3_REG_IMMED | IR3_REG_CONST)) ||
1560           !pred_val->def ||
1561           (cur_def && cur_def != pred_val->def)) {
1562          needs_phi = true;
1563          break;
1564       }
1565       cur_def = pred_val->def;
1566    }
1567 
1568    if (!needs_phi) {
1569       interval->dst.def = cur_def;
1570       interval->dst.flags = cur_def->flags;
1571       return;
1572    }
1573 
1574    struct ir3_instruction *phi =
1575       ir3_instr_create(block, OPC_META_PHI, 1, block->predecessors_count);
1576    struct ir3_register *dst = __ssa_dst(phi);
1577    dst->flags |= def->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
1578    dst->size = def->size;
1579    dst->wrmask = def->wrmask;
1580 
1581    dst->interval_start = def->interval_start;
1582    dst->interval_end = def->interval_end;
1583    dst->merge_set = def->merge_set;
1584    dst->merge_set_offset = def->merge_set_offset;
1585 
1586    for (unsigned i = 0; i < block->predecessors_count; i++) {
1587       struct ir3_block *pred = block->predecessors[i];
1588       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1589       struct ir3_register *src = ir3_src_create(phi, INVALID_REG, dst->flags);
1590       src->size = def->size;
1591       src->wrmask = def->wrmask;
1592 
1593       if (state->visited) {
1594          struct hash_entry *entry =
1595             _mesa_hash_table_search(state->remap, def);
1596          assert(entry);
1597          struct reg_or_immed *new_val = entry->data;
1598          set_src_val(src, new_val);
1599       } else {
1600          src->def = def;
1601       }
1602    }
1603 
1604    interval->dst.def = dst;
1605    interval->dst.flags = dst->flags;
1606 
1607    ir3_instr_move_before_block(phi, block);
1608 }
1609 
1610 /* When spilling a block with a single predecessor, the pred may have other
1611  * successors, so we can't choose what's live in and we can't spill/restore
1612  * anything. Just make the inserted intervals exactly match the predecessor. If
1613  * it wasn't live in the predecessor then it must've already been spilled. Also,
1614  * there are no phi nodes and no live-ins.
1615  */
1616 static void
1617 spill_single_pred_live_in(struct ra_spill_ctx *ctx,
1618                           struct ir3_block *block)
1619 {
1620    unsigned name;
1621    BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
1622                        ctx->live->definitions_count) {
1623       struct ir3_register *reg = ctx->live->definitions[name];
1624       struct ra_spill_interval *interval = ctx->intervals[reg->name];
1625       struct reg_or_immed *val = read_live_in(ctx, reg, block, 0);
1626       if (val)
1627          interval->dst = *val;
1628       else
1629          ra_spill_ctx_remove(ctx, interval);
1630    }
1631 }
1632 
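/* Rewrite the sources of an existing phi to whatever value each one maps to at
 * the end of its (visited) predecessor. Phis whose destination interval is no
 * longer inserted are dead at this point and get marked unused.
 */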
1633 static void
1634 rewrite_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *phi,
1635             struct ir3_block *block)
1636 {
1637    if (!(phi->dsts[0]->flags & IR3_REG_SSA))
1638       return;
1639 
1640    if (!ctx->intervals[phi->dsts[0]->name]->interval.inserted) {
1641       phi->flags |= IR3_INSTR_UNUSED;
1642       return;
1643    }
1644 
1645    for (unsigned i = 0; i < block->predecessors_count; i++) {
1646       struct ir3_block *pred = block->predecessors[i];
1647       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1648 
1649       if (!state->visited)
1650          continue;
1651 
1652       struct ir3_register *src = phi->srcs[i];
1653       if (!src->def)
1654          continue;
1655 
1656       struct hash_entry *entry =
1657          _mesa_hash_table_search(state->remap, src->def);
1658       assert(entry);
1659       struct reg_or_immed *new_val = entry->data;
1660       set_src_val(src, new_val);
1661    }
1662 }
1663 
1664 static void
1665 spill_live_out(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval,
1666                struct ir3_block *block)
1667 {
1668    struct ir3_register *def = interval->interval.reg;
1669 
1670    if (interval->interval.reg->merge_set ||
1671        !interval->can_rematerialize)
1672       spill(ctx, &interval->dst, get_spill_slot(ctx, def), NULL, block);
1673    ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
1674 }
1675 
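/* Values still in the register file that an already-visited successor does not
 * expect as live-ins have to be dropped at the end of the block, spilling them
 * unless they are rematerializable and not part of a merge set.
 */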
1676 static void
1677 spill_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
1678 {
1679    struct ra_spill_block_state *state = &ctx->blocks[block->index];
1680    rb_tree_foreach_safe (struct ra_spill_interval, interval,
1681                          &ctx->reg_ctx.intervals, interval.node) {
1682       if (!BITSET_TEST(state->live_out, interval->interval.reg->name)) {
1683          spill_live_out(ctx, interval, block);
1684       }
1685    }
1686 }
1687 
1688 static void
1689 reload_live_out(struct ra_spill_ctx *ctx, struct ir3_register *def,
1690                 struct ir3_block *block)
1691 {
1692    struct ra_spill_interval *interval = ctx->intervals[def->name];
1693    ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
1694 
1695    reload_def(ctx, def, NULL, block);
1696 }
1697 
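/* Reload at the end of the block any value an already-visited successor
 * expects in a register (recorded in this block's live_out set) but that isn't
 * currently in the register file.
 */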
1698 static void
1699 reload_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
1700 {
1701    struct ra_spill_block_state *state = &ctx->blocks[block->index];
1702    unsigned name;
1703    BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) {
1704       struct ir3_register *reg = ctx->live->definitions[name];
1705       struct ra_spill_interval *interval = ctx->intervals[name];
1706       if (!interval->interval.inserted)
1707          reload_live_out(ctx, reg, block);
1708    }
1709 }
1710 
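/* Patch up the phi sources in an already-visited successor (which, given the
 * single-successor assert below, is presumably the target of a loop back-edge)
 * so that they point at the values actually live at the end of this block.
 */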
1711 static void
1712 update_live_out_phis(struct ra_spill_ctx *ctx, struct ir3_block *block)
1713 {
1714    assert(!block->successors[1]);
1715    struct ir3_block *succ = block->successors[0];
1716    unsigned pred_idx = ir3_block_get_pred_index(succ, block);
1717 
1718    foreach_instr (instr, &succ->instr_list) {
1719       if (instr->opc != OPC_META_PHI)
1720          break;
1721 
1722       struct ir3_register *def = instr->srcs[pred_idx]->def;
1723       if (!def)
1724          continue;
1725 
1726       struct ra_spill_interval *interval = ctx->intervals[def->name];
1727       if (!interval->interval.inserted)
1728          continue;
1729       set_src_val(instr->srcs[pred_idx], &interval->dst);
1730    }
1731 }
1732 
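/* Mark this interval (and, recursively, its children) as live-out of the given
 * not-yet-visited predecessor, so that the predecessor knows to keep or reload
 * the value when it is eventually processed (see reload_live_outs).
 */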
1733 static void
1734 record_pred_live_out(struct ra_spill_ctx *ctx,
1735                      struct ra_spill_interval *interval,
1736                      struct ir3_block *block, unsigned pred_idx)
1737 {
1738    struct ir3_block *pred = block->predecessors[pred_idx];
1739    struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1740 
1741    struct ir3_register *def = interval->interval.reg;
1742    if (is_live_in_phi(def, block)) {
1743       def = def->instr->srcs[pred_idx]->def;
1744    }
1745    BITSET_SET(state->live_out, def->name);
1746 
1747    rb_tree_foreach (struct ra_spill_interval, child,
1748                     &interval->interval.children, interval.node) {
1749       record_pred_live_out(ctx, child, block, pred_idx);
1750    }
1751 }
1752 
1753 static void
1754 record_pred_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
1755 {
1756    for (unsigned i = 0; i < block->predecessors_count; i++) {
1757       struct ir3_block *pred = block->predecessors[i];
1758       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1759       if (state->visited)
1760          continue;
1761 
1762       state->live_out = rzalloc_array(ctx, BITSET_WORD,
1763                                       BITSET_WORDS(ctx->live->definitions_count));
1764 
1765 
1766       rb_tree_foreach (struct ra_spill_interval, interval,
1767                        &ctx->reg_ctx.intervals, interval.node) {
1768          record_pred_live_out(ctx, interval, block, i);
1769       }
1770    }
1771 }
1772 
1773 static void
1774 record_live_out(struct ra_spill_ctx *ctx,
1775                 struct ra_spill_block_state *state,
1776                 struct ra_spill_interval *interval)
1777 {
1778    if (!(interval->dst.flags & IR3_REG_SSA) ||
1779        interval->dst.def) {
1780       struct reg_or_immed *val = ralloc(ctx, struct reg_or_immed);
1781       *val = interval->dst;
1782       _mesa_hash_table_insert(state->remap, interval->interval.reg, val);
1783    }
1784    rb_tree_foreach (struct ra_spill_interval, child,
1785                     &interval->interval.children, interval.node) {
1786       record_live_out(ctx, state, child);
1787    }
1788 }
1789 
1790 static void
1791 record_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
1792 {
1793    struct ra_spill_block_state *state = &ctx->blocks[block->index];
1794    state->remap = _mesa_pointer_hash_table_create(ctx);
1795 
1796    rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
1797                     interval.node) {
1798       record_live_out(ctx, state, interval);
1799    }
1800 }
1801 
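/* Process a single block: seed the interval state with its live-ins, handle
 * live-in spilling/reloading and phis when spilling is enabled, walk the
 * instructions, then fix up live-outs for any already-visited successor and
 * record this block's remap/live-out state for blocks processed later.
 */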
1802 static void
1803 handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block)
1804 {
1805    memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure));
1806    rb_tree_init(&ctx->reg_ctx.intervals);
1807    rb_tree_init(&ctx->full_live_intervals);
1808    rb_tree_init(&ctx->half_live_intervals);
1809 
1810    unsigned name;
1811    BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
1812                        ctx->live->definitions_count) {
1813       struct ir3_register *reg = ctx->live->definitions[name];
1814       handle_live_in(ctx, block, reg);
1815    }
1816 
1817    foreach_instr (instr, &block->instr_list) {
1818       if (instr->opc != OPC_META_PHI && instr->opc != OPC_META_INPUT &&
1819           instr->opc != OPC_META_TEX_PREFETCH)
1820          break;
1821       handle_input_phi(ctx, instr);
1822    }
1823 
1824    if (ctx->spilling) {
1825       if (block->predecessors_count == 1) {
1826          spill_single_pred_live_in(ctx, block);
1827       } else {
1828          spill_live_ins(ctx, block);
1829          reload_live_ins(ctx, block);
1830          record_pred_live_outs(ctx, block);
1831          foreach_instr (instr, &block->instr_list) {
1832             if (instr->opc != OPC_META_PHI)
1833                break;
1834             rewrite_phi(ctx, instr, block);
1835          }
1836          BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
1837                              ctx->live->definitions_count) {
1838             struct ir3_register *reg = ctx->live->definitions[name];
1839             add_live_in_phi(ctx, reg, block);
1840          }
1841       }
1842    } else {
1843       update_max_pressure(ctx);
1844    }
1845 
1846    foreach_instr (instr, &block->instr_list) {
1847       di(instr, "processing");
1848 
1849       if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT ||
1850           instr->opc == OPC_META_TEX_PREFETCH)
1851          remove_input_phi(ctx, instr);
1852       else if (ctx->spilling && instr->opc == OPC_META_PARALLEL_COPY)
1853          handle_pcopy(ctx, instr);
1854       else if (ctx->spilling && instr->opc == OPC_MOV &&
1855                instr->dsts[0] == ctx->base_reg)
1856          /* skip */;
1857       else
1858          handle_instr(ctx, instr);
1859    }
1860 
1861    if (ctx->spilling && block->successors[0]) {
1862       struct ra_spill_block_state *state =
1863          &ctx->blocks[block->successors[0]->index];
1864       if (state->visited) {
1865          assert(!block->successors[1]);
1866 
1867          spill_live_outs(ctx, block);
1868          reload_live_outs(ctx, block);
1869          update_live_out_phis(ctx, block);
1870       }
1871    }
1872 
1873    if (ctx->spilling) {
1874       record_live_outs(ctx, block);
1875       ctx->blocks[block->index].visited = true;
1876    }
1877 }
1878 
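/* A phi whose sources (ignoring self-references) all resolve to the same
 * non-undef definition is redundant: remember that definition in phi->data,
 * mark the phi unused, and report progress.
 */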
1879 static bool
1880 simplify_phi_node(struct ir3_instruction *phi)
1881 {
1882    struct ir3_register *def = NULL;
1883    foreach_src (src, phi) {
1884       /* Ignore phi sources which point to the phi itself. */
1885       if (src->def == phi->dsts[0])
1886          continue;
1887       /* If it's undef or it doesn't match the previous sources, bail */
1888       if (!src->def || (def && def != src->def))
1889          return false;
1890       def = src->def;
1891    }
1892 
1893    phi->data = def;
1894    phi->flags |= IR3_INSTR_UNUSED;
1895    return true;
1896 }
1897 
1898 static struct ir3_register *
1899 simplify_phi_def(struct ir3_register *def)
1900 {
1901    if (def->instr->opc == OPC_META_PHI) {
1902       struct ir3_instruction *phi = def->instr;
1903 
1904       /* Note: this function is always called at least once after visiting the
1905        * phi, so either a phi has been simplified in the meantime, in
1906        * which case we will set progress=true and visit the definition again, or
1907        * phi->data already has the most up-to-date value. Therefore we don't
1908        * have to recursively check phi->data.
1909        */
1910       if (phi->data)
1911          return phi->data;
1912    }
1913 
1914    return def;
1915 }
1916 
1917 static void
1918 simplify_phi_srcs(struct ir3_instruction *instr)
1919 {
1920    foreach_src (src, instr) {
1921       if (src->def)
1922          src->def = simplify_phi_def(src->def);
1923    }
1924 }
1925 
1926 /* We insert phi nodes for all live-ins of loops in case we need to split the
1927  * live range. This pass cleans that up for the case where the live range didn't
1928  * actually need to be split.
1929  */
1930 static void
1931 simplify_phi_nodes(struct ir3 *ir)
1932 {
1933    foreach_block (block, &ir->block_list) {
1934       foreach_instr (instr, &block->instr_list) {
1935          if (instr->opc != OPC_META_PHI)
1936             break;
1937          instr->data = NULL;
1938       }
1939    }
1940 
1941    bool progress;
1942    do {
1943       progress = false;
1944       foreach_block (block, &ir->block_list) {
1945          foreach_instr (instr, &block->instr_list) {
1946             if (instr->opc == OPC_META_PHI || (instr->flags & IR3_INSTR_UNUSED))
1947                continue;
1948 
1949             simplify_phi_srcs(instr);
1950          }
1951 
1952          /* Visit phi nodes in the successors to make sure that phi sources are
1953           * always visited at least once after visiting the definition they
1954           * point to. See note in simplify_phi_def() for why this is necessary.
1955           */
1956          for (unsigned i = 0; i < 2; i++) {
1957             struct ir3_block *succ = block->successors[i];
1958             if (!succ)
1959                continue;
1960             foreach_instr (instr, &succ->instr_list) {
1961                if (instr->opc != OPC_META_PHI)
1962                   break;
1963                if (instr->flags & IR3_INSTR_UNUSED) {
1964                   if (instr->data)
1965                      instr->data = simplify_phi_def(instr->data);
1966                } else {
1967                   simplify_phi_srcs(instr);
1968                   progress |= simplify_phi_node(instr);
1969                }
1970             }
1971          }
1972       }
1973    } while (progress);
1974 }
1975 
1976 static void
1977 unmark_dead(struct ir3 *ir)
1978 {
1979    foreach_block (block, &ir->block_list) {
1980       foreach_instr (instr, &block->instr_list) {
1981          instr->flags &= ~IR3_INSTR_UNUSED;
1982       }
1983    }
1984 }
1985 
1986 /* Simple pass to remove now-dead phi nodes and pcopy instructions. We mark
1987  * which ones are dead along the way, so there's nothing to compute here.
1988  */
1989 static void
1990 cleanup_dead(struct ir3 *ir)
1991 {
1992    foreach_block (block, &ir->block_list) {
1993       foreach_instr_safe (instr, &block->instr_list) {
1994          if (instr->flags & IR3_INSTR_UNUSED) {
1995             if (instr->opc == OPC_META_PARALLEL_COPY) {
1996                /* There may be non-SSA shared copies, we need to preserve these.
1997                 */
1998                for (unsigned i = 0; i < instr->dsts_count;) {
1999                   if (instr->dsts[i]->flags & IR3_REG_SSA) {
2000                      instr->dsts[i] = instr->dsts[--instr->dsts_count];
2001                      instr->srcs[i] = instr->srcs[--instr->srcs_count];
2002                   } else {
2003                      i++;
2004                   }
2005                }
2006 
2007                if (instr->dsts_count == 0)
2008                   list_delinit(&instr->node);
2009             } else {
2010                list_delinit(&instr->node);
2011             }
2012          }
2013       }
2014    }
2015 }
2016 
2017 /* Deal with merge sets after spilling. Spilling generally leaves the merge sets
2018  * in a mess, and even if we properly cleaned up after ourselves, we would want
2019  * to recompute the merge sets afterward anyway. That's because
2020  * spilling/reloading can "break up" phi webs and split/collect webs so that
2021  * allocating them to the same register no longer gives any benefit. For
2022  * example, imagine we have this:
2023  *
2024  * if (...) {
2025  *    foo = ...
2026  * } else {
2027  *    bar = ...
2028  * }
2029  * baz = phi(foo, bar)
2030  *
2031  * and we spill "baz":
2032  *
2033  * if (...) {
2034  *    foo = ...
2035  *    spill(foo)
2036  * } else {
2037  *    bar = ...
2038  *    spill(bar)
2039  * }
2040  * baz = reload()
2041  *
2042  * now foo, bar, and baz don't have to be allocated to the same register. How
2043  * exactly the merge sets change can be complicated, so it's easier just to
2044  * recompute them.
2045  *
2046  * However, there's a wrinkle in this: those same merge sets determine the
2047  * register pressure, due to multiple values inhabiting the same register! And
2048  * we assume that this sharing happens when spilling. Therefore we need a
2049  * three-step procedure:
2050  *
2051  * 1. Drop the original merge sets.
2052  * 2. Calculate which values *must* be merged, being careful to only use the
2053  *    interval information which isn't trashed by spilling, and forcibly merge
2054  *    them.
2055  * 3. Let ir3_merge_regs() finish the job, including recalculating the
2056  *    intervals.
2057  */
2058 
2059 static void
2060 fixup_merge_sets(struct ir3_liveness *live, struct ir3 *ir)
2061 {
2062    foreach_block (block, &ir->block_list) {
2063       foreach_instr (instr, &block->instr_list) {
2064          ra_foreach_dst (dst, instr) {
2065             dst->merge_set = NULL;
2066             dst->merge_set_offset = 0;
2067          }
2068       }
2069    }
2070 
2071    foreach_block (block, &ir->block_list) {
2072       foreach_instr (instr, &block->instr_list) {
2073          if (instr->opc != OPC_META_SPLIT &&
2074              instr->opc != OPC_META_COLLECT)
2075             continue;
2076 
2077          struct ir3_register *dst = instr->dsts[0];
2078          ra_foreach_src (src, instr) {
2079             if (!(src->flags & IR3_REG_KILL) &&
2080                 src->def->interval_start < dst->interval_end &&
2081                 dst->interval_start < src->def->interval_end) {
2082                ir3_force_merge(dst, src->def,
2083                                src->def->interval_start - dst->interval_start);
2084             }
2085          }
2086       }
2087    }
2088 
2089    ir3_merge_regs(live, ir);
2090 }
2091 
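/* Entry point for the pressure-only mode: walk the program with spilling
 * disabled, tracking register pressure with the same interval/merging rules RA
 * uses, and report the maximum half/full/shared pressure seen.
 */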
2092 void
2093 ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live,
2094                   struct ir3_pressure *max_pressure)
2095 {
2096    struct ra_spill_ctx *ctx = rzalloc(NULL, struct ra_spill_ctx);
2097    spill_ctx_init(ctx, v, live);
2098 
2099    foreach_block (block, &v->ir->block_list) {
2100       handle_block(ctx, block);
2101    }
2102 
2103    assert(ctx->cur_pressure.full == 0);
2104    assert(ctx->cur_pressure.half == 0);
2105    assert(ctx->cur_pressure.shared == 0);
2106 
2107    *max_pressure = ctx->max_pressure;
2108    ralloc_free(ctx);
2109 }
2110 
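/* Entry point for spilling proper: insert spills and reloads to private memory
 * until the register pressure fits below limit_pressure, clean up the phis and
 * parallel copies this leaves behind, recompute liveness, and rebuild the merge
 * sets. The updated spill-slot usage is reflected in v->pvtmem_size and the new
 * liveness is returned through *live.
 */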
2111 bool
2112 ir3_spill(struct ir3 *ir, struct ir3_shader_variant *v,
2113           struct ir3_liveness **live,
2114           const struct ir3_pressure *limit_pressure)
2115 {
2116    void *mem_ctx = ralloc_parent(*live);
2117    struct ra_spill_ctx *ctx = rzalloc(mem_ctx, struct ra_spill_ctx);
2118    spill_ctx_init(ctx, v, *live);
2119 
2120    ctx->spilling = true;
2121 
2122    ctx->blocks = rzalloc_array(ctx, struct ra_spill_block_state,
2123                                ctx->live->block_count);
2124    rb_tree_init(&ctx->full_live_intervals);
2125    rb_tree_init(&ctx->half_live_intervals);
2126 
2127    ctx->limit_pressure = *limit_pressure;
2128    ctx->spill_slot = v->pvtmem_size;
2129 
2130    add_base_reg(ctx, ir);
2131    compute_next_distance(ctx, ir);
2132 
2133    unmark_dead(ir);
2134 
2135    foreach_block (block, &ir->block_list) {
2136       handle_block(ctx, block);
2137    }
2138 
2139    simplify_phi_nodes(ir);
2140 
2141    cleanup_dead(ir);
2142 
2143    ir3_create_parallel_copies(ir);
2144 
2145    /* After this point, we're done mutating the IR. Liveness has been trashed,
2146     * so recalculate it. We'll need it for recalculating the merge sets.
2147     */
2148    ralloc_free(ctx->live);
2149    *live = ir3_calc_liveness(mem_ctx, ir);
2150 
2151    fixup_merge_sets(*live, ir);
2152 
2153    v->pvtmem_size = ctx->spill_slot;
2154    ralloc_free(ctx);
2155 
2156    return true;
2157 }
2158