1 /*
2  * Copyright © 2021 Valve Corporation
3  * Copyright © 2014 Rob Clark <robclark@freedesktop.org>
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "ir3_ra.h"
8 #include "ir3_shader.h"
9 
10 #include "util/u_math.h"
11 
12 /* Allocating shared registers can pose a challenge, because their live
13  * intervals use the physical CFG which has extra edges inserted that are
14  * pretty much always critical edges. This causes problems with phi nodes,
15  * because copies for phi nodes have to happen "along the edge," and similarly
16  * causes problems when reunifying values that have had their live range split.
17  * Problematic phi nodes should be relatively rare, so we ban them for now.
18  * The solution we choose for live-range splitting is to integrate spilling and
19  * register allocation and spill to vector registers rather than split a live
20  * range, which negates some of the advantages of SSA-based RA, but it isn't as
21  * bad as it seems because the conditions needed (vector shared registers, which
22  * only movmsk currently produces, or fixed registers which we don't do) are
23  * relatively rare. Spilling a shared register to a vector register is also
24  * much cheaper than spilling a vector register to private memory.
25  */
26 
27 struct ra_interval {
28    struct ir3_reg_interval interval;
29 
30    struct rb_node physreg_node;
31    physreg_t physreg_start, physreg_end;
32 
33    /* If this interval was spilled, the original physreg_start before spilling.
34     * Used when reloading live outs.
35     */
36    physreg_t physreg_start_orig;
37 
38    /* Where the shared register is spilled to. If there were no uses when it
39     * was spilled, this may be the original defining instruction.
40     */
41    struct ir3_register *spill_def;
42 
43    /* Whether this contains a source of the current instruction that can't be
44     * spilled.
45     */
46    bool src;
47 
48    bool needs_reload;
49 };
50 
51 struct ra_block_state {
52    bool visited;
53 
54    /* For blocks processed after one of their successors (i.e. the sources of
55     * loop backedges), the values that should be live at the end of the block.
56     */
57    BITSET_WORD *live_out;
58 };
59 
60 struct ra_ctx {
61    struct ir3_reg_ctx reg_ctx;
62 
63    BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
64 
65    struct rb_tree physreg_intervals;
66 
67    struct ra_interval *intervals;
68 
69    struct ir3_liveness *live;
70 
71    struct hash_table *pcopy_src_map;
72 
73    struct ra_block_state *blocks;
74 
75    unsigned start;
76 };
77 
78 static struct ra_interval *
79 ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
80 {
81    return rb_node_data(struct ra_interval, interval, interval);
82 }
83 
84 static struct ra_interval *
85 rb_node_to_interval(struct rb_node *node)
86 {
87    return rb_node_data(struct ra_interval, node, physreg_node);
88 }
89 
90 static const struct ra_interval *
91 rb_node_to_interval_const(const struct rb_node *node)
92 {
93    return rb_node_data(struct ra_interval, node, physreg_node);
94 }
95 
96 static struct ra_interval *
97 ra_interval_next(struct ra_interval *interval)
98 {
99    struct rb_node *next = rb_node_next(&interval->physreg_node);
100    return next ? rb_node_to_interval(next) : NULL;
101 }
102 
103 static struct ra_interval *
104 ra_interval_next_or_null(struct ra_interval *interval)
105 {
106    return interval ? ra_interval_next(interval) : NULL;
107 }
108 
109 static int
110 ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
111 {
112    const struct ra_interval *a = rb_node_to_interval_const(_a);
113    const struct ra_interval *b = rb_node_to_interval_const(_b);
114    return b->physreg_start - a->physreg_start;
115 }
116 
117 static int
118 ra_interval_cmp(const struct rb_node *node, const void *data)
119 {
120    physreg_t reg = *(const physreg_t *)data;
121    const struct ra_interval *interval = rb_node_to_interval_const(node);
122    if (interval->physreg_start > reg)
123       return -1;
124    else if (interval->physreg_end <= reg)
125       return 1;
126    else
127       return 0;
128 }
129 
130 static struct ra_ctx *
131 ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx)
132 {
133    return rb_node_data(struct ra_ctx, ctx, reg_ctx);
134 }
135 
136 static struct ra_interval *
137 ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
138 {
139    struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
140    return node ? rb_node_to_interval(node) : NULL;
141 }
142 
143 /* Get the interval covering the reg, or the closest to the right if it
144  * doesn't exist.
145  */
146 static struct ra_interval *
147 ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
148 {
149    struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
150    if (!interval) {
151       return NULL;
152    } else if (interval->physreg_end > reg) {
153       return interval;
154    } else {
155       /* There is no interval covering reg, and ra_file_search_sloppy()
156        * returned the closest range to the left, so the next interval to the
157        * right should be the closest to the right.
158        */
159       return ra_interval_next_or_null(interval);
160    }
161 }
162 
163 static struct ra_interval *
164 ra_ctx_search_right(struct ra_ctx *ctx, physreg_t reg)
165 {
166    return ra_interval_search_right(&ctx->physreg_intervals, reg);
167 }
168 
169 static void
170 interval_add(struct ir3_reg_ctx *reg_ctx, struct ir3_reg_interval *_interval)
171 {
172    struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
173    struct ra_ctx *ctx = ir3_reg_ctx_to_ctx(reg_ctx);
174 
175    /* We can assume in this case that physreg_start/physreg_end is already
176     * initialized.
177     */
178    for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
179       BITSET_CLEAR(ctx->available, i);
180    }
181 
182    rb_tree_insert(&ctx->physreg_intervals, &interval->physreg_node,
183                   ra_interval_insert_cmp);
184 }
185 
186 static void
187 interval_delete(struct ir3_reg_ctx *reg_ctx, struct ir3_reg_interval *_interval)
188 {
189    struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
190    struct ra_ctx *ctx = ir3_reg_ctx_to_ctx(reg_ctx);
191 
192    for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
193       BITSET_SET(ctx->available, i);
194    }
195 
196    rb_tree_remove(&ctx->physreg_intervals, &interval->physreg_node);
197 }
198 
199 static void
200 interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
201                struct ir3_reg_interval *_child)
202 {
203    struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
204    struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);
205 
206    child->physreg_start =
207       parent->physreg_start + (child->interval.reg->interval_start -
208                                parent->interval.reg->interval_start);
209    child->physreg_end =
210       child->physreg_start +
211       (child->interval.reg->interval_end - child->interval.reg->interval_start);
212 
213    interval_add(ctx, _child);
214 }
215 
216 static void
217 ra_ctx_init(struct ra_ctx *ctx)
218 {
219    ctx->reg_ctx.interval_add = interval_add;
220    ctx->reg_ctx.interval_delete = interval_delete;
221    ctx->reg_ctx.interval_readd = interval_readd;
222 }
223 
224 static void
225 ra_ctx_reset_block(struct ra_ctx *ctx)
226 {
227    for (unsigned i = 0; i < RA_SHARED_SIZE; i++) {
228       BITSET_SET(ctx->available, i);
229    }
230 
231    rb_tree_init(&ctx->reg_ctx.intervals);
232    rb_tree_init(&ctx->physreg_intervals);
233 }
234 
235 static void
236 ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
237 {
238    ir3_reg_interval_init(&interval->interval, reg);
239 }
240 
241 static physreg_t
242 ra_interval_get_physreg(const struct ra_interval *interval)
243 {
244    unsigned child_start = interval->interval.reg->interval_start;
245 
246    while (interval->interval.parent) {
247       interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
248    }
249 
250    return interval->physreg_start +
251           (child_start - interval->interval.reg->interval_start);
252 }
253 
254 static unsigned
255 ra_interval_get_num(const struct ra_interval *interval)
256 {
257    return ra_physreg_to_num(ra_interval_get_physreg(interval),
258                             interval->interval.reg->flags);
259 }
260 
261 static void
262 ra_interval_dump(struct log_stream *stream, struct ra_interval *interval)
263 {
264    mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start);
265 
266    ir3_reg_interval_dump(stream, &interval->interval);
267 }
268 
269 static void
270 ra_ctx_dump(struct ra_ctx *ctx)
271 {
272    struct log_stream *stream = mesa_log_streami();
273 
274    mesa_log_stream_printf(stream, "shared:\n");
275    rb_tree_foreach (struct ra_interval, interval, &ctx->physreg_intervals,
276                     physreg_node) {
277       ra_interval_dump(stream, interval);
278    }
279 
280    unsigned start, end;
281    mesa_log_stream_printf(stream, "available:\n");
282    BITSET_FOREACH_RANGE (start, end, ctx->available, RA_SHARED_SIZE) {
283       mesa_log_stream_printf(stream, "%u-%u ", start, end);
284    }
285    mesa_log_stream_printf(stream, "\n");
286    mesa_log_stream_printf(stream, "start: %u\n", ctx->start);
287 }
288 
289 static bool
290 get_reg_specified(struct ra_ctx *ctx, struct ir3_register *reg, physreg_t physreg)
291 {
292    for (unsigned i = 0; i < reg_size(reg); i++) {
293       physreg_t cur_physreg = physreg + i;
294 
295       if (!BITSET_TEST(ctx->available, cur_physreg)) {
296          /* If physreg is unavailable, we might still be able to use it if the
297           * value it holds is in the same merge set at the same offset as the
298           * value in reg.
299           */
300          if (!reg->merge_set) {
301             return false;
302          }
303 
304          /* Find the interval for the current element of physreg. */
305          struct ra_interval *interval = ra_ctx_search_right(ctx, cur_physreg);
306 
307          /* It must exist since the physreg is unavailable. */
308          assert(interval->physreg_start <= cur_physreg);
309 
310          struct ir3_register *live_reg = interval->interval.reg;
311 
312          if (reg->merge_set != live_reg->merge_set) {
313             return false;
314          }
315 
316          /* We want to check if the currently live value at physreg+i (live_reg)
317           * is at the same offset as the new value (reg+i) in their shared merge
318           * set. However, we cannot simply compare their merge set offsets as
319           * live_reg may be larger than reg+i (e.g., a collect) and reg+i may
320           * not be at the start of live_reg's interval. To account for this, we
321           * want to know the merge set offset delta between live_reg's value at
322           * physreg+i and the start of its interval. This always equals the
323           * delta between the physregs within intervals.
324           */
325          unsigned cur_merge_set_offset = reg->merge_set_offset + i;
326          unsigned interval_offset = cur_physreg - interval->physreg_start;
327          if (cur_merge_set_offset !=
328              live_reg->merge_set_offset + interval_offset) {
329             return false;
330          }
331       }
332    }
333 
334    return true;
335 }
336 
337 static unsigned
338 reg_file_size(struct ir3_register *reg)
339 {
340    if (reg->flags & IR3_REG_HALF)
341       return RA_SHARED_HALF_SIZE;
342    else
343       return RA_SHARED_SIZE;
344 }
345 
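/* Find a gap of "size" contiguous free registers for "dst", scanning from the
 * rotating ctx->start position in steps of "align" and wrapping around the
 * register file. Returns ~0 if there is no such gap (or if the request is
 * larger than the file itself).
 */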
346 static physreg_t
347 find_best_gap(struct ra_ctx *ctx, struct ir3_register *dst, unsigned size,
348               unsigned align)
349 {
350    unsigned file_size = reg_file_size(dst);
351 
352    /* This can happen if we create a very large merge set. Just bail out in that
353     * case.
354     */
355    if (size > file_size)
356       return (physreg_t) ~0;
357 
358    unsigned start = ALIGN(ctx->start, align) % (file_size - size + align);
359    unsigned candidate = start;
360    do {
361       bool is_available = true;
362       for (unsigned i = 0; i < size; i++) {
363          if (!BITSET_TEST(ctx->available, candidate + i)) {
364             is_available = false;
365             break;
366          }
367       }
368 
369       if (is_available) {
370          ctx->start = (candidate + size) % file_size;
371          return candidate;
372       }
373 
374       candidate += align;
375       if (candidate + size > file_size)
376          candidate = 0;
377    } while (candidate != start);
378 
379    return (physreg_t)~0;
380 }
381 
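/* Choose the candidate range that is cheapest to free up by spilling: ranges
 * occupied by a source of the current instruction are rejected outright, and
 * intervals that already have a spill_def add no extra cost since no new mov
 * is needed for them.
 */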
382 static physreg_t
383 find_best_spill_reg(struct ra_ctx *ctx, struct ir3_register *reg,
384                     unsigned size, unsigned align)
385 {
386    unsigned file_size = reg_file_size(reg);
387    unsigned min_cost = UINT_MAX;
388 
389    unsigned start = ALIGN(ctx->start, align) % (file_size - size + align);
390    physreg_t candidate = start;
391    physreg_t best_reg = (physreg_t)~0;
392    do {
393       unsigned cost = 0;
394 
395       /* Iterate through intervals we'd need to spill to use this reg. */
396       for (struct ra_interval *interval = ra_ctx_search_right(ctx, candidate);
397            interval && interval->physreg_start < candidate + size;
398            interval = ra_interval_next_or_null(interval)) {
399          /* We can't spill sources of the current instruction when reloading
400           * sources.
401           */
402          if (interval->src) {
403             cost = UINT_MAX;
404             break;
405          }
406 
407          /* We prefer spilling intervals that already have been spilled, so we
408           * don't have to emit another mov.
409           */
410          if (!interval->spill_def)
411             cost += (interval->physreg_end - interval->physreg_start);
412       }
413 
414       if (cost < min_cost) {
415          min_cost = cost;
416          best_reg = candidate;
417       }
418 
419       candidate += align;
420       if (candidate + size > file_size)
421          candidate = 0;
422    } while (candidate != start);
423 
424    return best_reg;
425 }
426 
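/* Helpers for carving pieces out of a spilled (non-shared) def: split() pulls
 * out a single element and extract() re-collects a contiguous range of
 * elements, inserting the new instructions right after "before".
 */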
427 static struct ir3_register *
428 split(struct ir3_register *def, unsigned offset, struct ir3_instruction *before)
429 {
430    if (reg_elems(def) == 1) {
431       assert(offset == 0);
432       return def;
433    }
434 
435    struct ir3_instruction *split =
436       ir3_instr_create_at(ir3_after_instr(before), OPC_META_SPLIT, 1, 1);
437    split->split.off = offset;
438    struct ir3_register *dst = __ssa_dst(split);
439    struct ir3_register *src =
440       ir3_src_create(split, INVALID_REG, def->flags & (IR3_REG_HALF | IR3_REG_SSA));
441    src->wrmask = def->wrmask;
442    src->def = def;
443    return dst;
444 }
445 
446 static struct ir3_register *
447 extract(struct ir3_register *parent_def, unsigned offset, unsigned elems,
448         struct ir3_instruction *before)
449 {
450    if (offset == 0 && elems == reg_elems(parent_def))
451       return parent_def;
452 
453    if (elems == 1)
454       return split(parent_def, offset, before);
455 
456    struct ir3_instruction *collect =
457       ir3_instr_create_at(ir3_after_instr(before), OPC_META_COLLECT, 1, elems);
458    struct ir3_register *dst = __ssa_dst(collect);
459    dst->flags |= parent_def->flags & IR3_REG_HALF;
460    dst->wrmask = MASK(elems);
461 
462    for (unsigned i = 0; i < elems; i++) {
463       ir3_src_create(collect, INVALID_REG,
464                      parent_def->flags & (IR3_REG_HALF | IR3_REG_SSA))->def =
465          split(parent_def, offset + i, before);
466    }
467 
468    return dst;
469 }
470 
471 static void
472 spill_interval_children(struct ra_interval *interval,
473                         struct ir3_instruction *before)
474 {
475    rb_tree_foreach (struct ra_interval, child, &interval->interval.children,
476                     interval.node) {
477       if (!child->spill_def) {
478          child->spill_def = extract(interval->spill_def,
479                                     (child->interval.reg->interval_start -
480                                      interval->interval.reg->interval_start) /
481                                     reg_elem_size(interval->interval.reg),
482                                     reg_elems(child->interval.reg), before);
483          child->physreg_start_orig = child->physreg_start;
484       }
485       spill_interval_children(child, before);
486    }
487 }
488 
489 static void
490 spill_interval(struct ra_ctx *ctx, struct ra_interval *interval)
491 {
492    struct ir3_instruction *before = interval->interval.reg->instr;
493 
494    d("spilling ssa_%u:%u", before->serialno, interval->interval.reg->name);
495 
496    if (!interval->spill_def) {
497       /* If this is a phi node or input, we need to insert the demotion to a
498        * regular register after the last phi or input in the block.
499        */
500       if (before->opc == OPC_META_PHI ||
501           before->opc == OPC_META_INPUT) {
502          struct ir3_block *block = before->block;
503          struct ir3_instruction *last_phi_input = NULL;
504          foreach_instr_from (instr, before, &block->instr_list) {
505             if (instr->opc != before->opc)
506                break;
507             last_phi_input = instr;
508          }
509          before = last_phi_input;
510       }
511 
512       struct ir3_instruction *mov =
513          ir3_instr_create_at(ir3_after_instr(before), OPC_MOV, 1, 1);
514       mov->flags |= IR3_INSTR_SHARED_SPILL;
515       struct ir3_register *dst = __ssa_dst(mov);
516       dst->flags |= (interval->interval.reg->flags & IR3_REG_HALF);
517       dst->wrmask = interval->interval.reg->wrmask;
518       mov->repeat = reg_elems(dst) - 1;
519       ir3_src_create(mov, interval->interval.reg->num,
520                      IR3_REG_SHARED | (mov->repeat ? IR3_REG_R : 0) |
521                      (interval->interval.reg->flags & IR3_REG_HALF))->wrmask =
522                      interval->interval.reg->wrmask;
523       mov->cat1.src_type = mov->cat1.dst_type =
524          (interval->interval.reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
525 
526       interval->spill_def = dst;
527       interval->physreg_start_orig = interval->physreg_start;
528    }
529 
530    spill_interval_children(interval, interval->spill_def->instr);
531 
532    ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
533 }
534 
535 /* Try to demote a scalar ALU instruction to a normal ALU instruction, using the
536  * spilled sources. We have to take into account restrictions on the number of
537  * shared sources that only exist for normal ALU instructions.
538  */
539 static bool
540 try_demote_instruction(struct ra_ctx *ctx, struct ir3_instruction *instr)
541 {
542    /* First, check restrictions. */
543    switch (opc_cat(instr->opc)) {
544    case 1:
545       /* MOVMSK is special and can't be demoted. It also has no sources so must
546        * go before the check below.
547        */
548       if (instr->opc == OPC_MOVMSK)
549          return false;
550 
551       assert(instr->srcs_count >= 1);
552       if (!(instr->srcs[0]->flags & (IR3_REG_CONST | IR3_REG_IMMED)))
553          return false;
554       break;
555    case 2: {
556       /* We need one source to either be demotable or an immediate. */
557       if (instr->srcs_count > 1) {
558          struct ra_interval *src0_interval =
559             (instr->srcs[0]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[0]->def->name] : NULL;
560          struct ra_interval *src1_interval =
561             (instr->srcs[1]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[1]->def->name] : NULL;
562          if (!(src0_interval && src0_interval->spill_def) &&
563              !(src1_interval && src1_interval->spill_def) &&
564              !(instr->srcs[0]->flags & IR3_REG_IMMED) &&
565              !(instr->srcs[1]->flags & IR3_REG_IMMED))
566             return false;
567       }
568       break;
569    }
570    case 3: {
571       struct ra_interval *src0_interval =
572          (instr->srcs[0]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[0]->def->name] : NULL;
573       struct ra_interval *src1_interval =
574          (instr->srcs[1]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[1]->def->name] : NULL;
575 
576       /* src1 cannot be shared */
577       if (src1_interval && !src1_interval->spill_def) {
578          /* Try to swap src0 and src1, similar to what copy prop does. */
579          if (!is_mad(instr->opc))
580             return false;
581 
582          if ((src0_interval && src0_interval->spill_def) ||
583              (instr->srcs[0]->flags & IR3_REG_IMMED)) {
584             struct ir3_register *src0 = instr->srcs[0];
585             instr->srcs[0] = instr->srcs[1];
586             instr->srcs[1] = src0;
587          } else {
588             return false;
589          }
590       }
591       break;
592    }
593    case 4: {
594       assert(instr->srcs[0]->flags & IR3_REG_SSA);
595       struct ra_interval *src_interval = &ctx->intervals[instr->srcs[0]->def->name];
596       if (!src_interval->spill_def)
597          return false;
598       break;
599    }
600 
601    default:
602       return false;
603    }
604 
605    d("demoting instruction");
606 
607    /* If the instruction is already not a scalar ALU instruction, we should've
608     * skipped reloading and just demoted sources directly, so we should never
609     * get here.
610     */
611    assert(instr->dsts[0]->flags & IR3_REG_SHARED);
612 
613    /* Now we actually demote the instruction */
614    ra_foreach_src (src, instr) {
615       assert(src->flags & IR3_REG_SHARED);
616       struct ra_interval *interval = &ctx->intervals[src->def->name];
617       if (interval->spill_def) {
618          src->def = interval->spill_def;
619          src->flags &= ~IR3_REG_SHARED;
620          interval->needs_reload = false;
621          if (interval->interval.inserted)
622             ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
623          while (interval->interval.parent)
624             interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
625          interval->src = false;
626       }
627    }
628 
629    struct ra_interval *dst_interval = &ctx->intervals[instr->dsts[0]->name];
630    instr->dsts[0]->flags &= ~IR3_REG_SHARED;
631    ra_interval_init(dst_interval, instr->dsts[0]);
632    dst_interval->spill_def = instr->dsts[0];
633 
634    instr->flags |= IR3_INSTR_SHARED_SPILL;
635 
636    return true;
637 }
638 
639 /* Free up [start, start + size) by spilling live intervals.
640  */
641 static void
642 free_space(struct ra_ctx *ctx, physreg_t start, unsigned size)
643 {
644    struct ra_interval *interval = ra_ctx_search_right(ctx, start);
645    while (interval && interval->physreg_start < start + size) {
646       struct ra_interval *next = ra_interval_next_or_null(interval);
647       spill_interval(ctx, interval);
648       interval = next;
649    }
650 }
651 
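/* Try to find a physreg for "reg" without spilling anything: first the merge
 * set's preferred reg, then a gap large enough for the whole (not yet placed)
 * merge set, then the physreg of one of the instruction's shared sources (for
 * ALU/SFU destinations), and finally any suitably aligned gap. Returns ~0 on
 * failure.
 */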
652 static physreg_t
653 get_reg(struct ra_ctx *ctx, struct ir3_register *reg, bool src)
654 {
655    if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
656       physreg_t preferred_reg =
657          reg->merge_set->preferred_reg + reg->merge_set_offset;
658       if (preferred_reg < reg_file_size(reg) &&
659           preferred_reg % reg_elem_size(reg) == 0 &&
660           get_reg_specified(ctx, reg, preferred_reg))
661          return preferred_reg;
662    }
663 
664    /* If this register is a subset of a merge set which we have not picked a
665     * register for, first try to allocate enough space for the entire merge
666     * set.
667     */
668    unsigned size = reg_size(reg);
669    if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
670        size < reg->merge_set->size) {
671       physreg_t best_reg = find_best_gap(ctx, reg, reg->merge_set->size,
672                                          reg->merge_set->alignment);
673       if (best_reg != (physreg_t)~0u) {
674          best_reg += reg->merge_set_offset;
675          return best_reg;
676       }
677    }
678 
679    /* For ALU and SFU instructions, if the src reg is available to pick, use
680     * it, because this doesn't introduce unnecessary dependencies, and it
681     * potentially avoids needing (ss) syncs for write after read hazards for
682     * SFU instructions:
683     */
684    if (!src && (is_sfu(reg->instr) || is_alu(reg->instr))) {
685       for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
686          struct ir3_register *src = reg->instr->srcs[i];
687          if (!ra_reg_is_src(src))
688             continue;
689          if ((src->flags & IR3_REG_SHARED) && reg_size(src) >= size) {
690             struct ra_interval *src_interval = &ctx->intervals[src->def->name];
691             physreg_t src_physreg = ra_interval_get_physreg(src_interval);
692             if (src_physreg % reg_elem_size(reg) == 0 &&
693                 src_physreg + size <= reg_file_size(reg) &&
694                 get_reg_specified(ctx, reg, src_physreg))
695                return src_physreg;
696          }
697       }
698    }
699 
700    return find_best_gap(ctx, reg, size, reg_elem_size(reg));
701 }
702 
703 /* The reload process is split in two, first we allocate a register to reload to
704  * for all sources that need a reload and then we actually execute the reload.
705  * This is to allow us to demote shared ALU instructions to non-shared whenever
706  * we would otherwise need to spill to reload, without leaving dangling unused
707  * reload mov's from previously processed sources. So, for example, we could
708  * need to reload both sources of an add, but after reloading the first source
709  * we realize that we would need to spill to reload the second source and we
710  * should demote the add instead, which means cancelling the first reload.
711  */
712 static void
713 reload_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
714            struct ir3_register *src)
715 {
716    struct ir3_register *reg = src->def;
717    struct ra_interval *interval = &ctx->intervals[reg->name];
718    unsigned size = reg_size(reg);
719 
720    physreg_t best_reg = get_reg(ctx, reg, true);
721 
722    if (best_reg == (physreg_t)~0u) {
723       if (try_demote_instruction(ctx, instr))
724          return;
725 
726       best_reg = find_best_spill_reg(ctx, reg, size, reg_elem_size(reg));
727       assert(best_reg != (physreg_t)~0u);
728 
729       free_space(ctx, best_reg, size);
730    }
731 
732    d("reload src %u physreg %u", reg->name, best_reg);
733    interval->physreg_start = best_reg;
734    interval->physreg_end = best_reg + size;
735    interval->needs_reload = true;
736    ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
737 
738    while (interval->interval.parent)
739       interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
740 
741    interval->src = true;
742 }
743 
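/* Emit the mov that copies a spilled value (interval->spill_def) back into the
 * shared register assigned to the interval, at the given cursor.
 */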
744 static void
745 reload_interval(struct ra_ctx *ctx, struct ir3_cursor cursor,
746                 struct ra_interval *interval)
747 {
748    struct ir3_register *def = interval->interval.reg;
749    struct ir3_instruction *mov = ir3_instr_create_at(cursor, OPC_MOV, 1, 1);
750    mov->flags |= IR3_INSTR_SHARED_SPILL;
751    unsigned flags = IR3_REG_SHARED | (def->flags & IR3_REG_HALF);
752    ir3_dst_create(mov, ra_physreg_to_num(interval->physreg_start, flags),
753                   flags)->wrmask = def->wrmask;
754    mov->repeat = reg_elems(def) - 1;
755    struct ir3_register *mov_src =
756       ir3_src_create(mov, INVALID_REG, IR3_REG_SSA | (def->flags & IR3_REG_HALF) |
757                      (mov->repeat ? IR3_REG_R : 0));
758    assert(interval->spill_def);
759    mov_src->def = interval->spill_def;
760    mov_src->wrmask = def->wrmask;
761    mov->cat1.src_type = mov->cat1.dst_type =
762       (def->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
763 }
764 
765 static void
766 reload_src_finalize(struct ra_ctx *ctx, struct ir3_instruction *instr,
767                     struct ir3_register *src)
768 {
769    struct ir3_register *reg = src->def;
770    struct ra_interval *interval = &ctx->intervals[reg->name];
771 
772    if (!interval->needs_reload)
773       return;
774 
775    reload_interval(ctx, ir3_before_instr(instr), interval);
776 
777    interval->needs_reload = false;
778 }
779 
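/* Whether "instr" can consume a spilled (now non-shared) source directly,
 * rather than requiring a reload back into a shared register.
 */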
780 static bool
781 can_demote_src(struct ir3_instruction *instr)
782 {
783    switch (instr->opc) {
784    case OPC_SCAN_MACRO:
785    case OPC_META_COLLECT:
786       return false;
787    case OPC_MOV:
788       /* non-shared -> shared floating-point conversions and
789        * 8-bit sign extension don't work.
790        */
791       return (!(instr->dsts[0]->flags & IR3_REG_SHARED) ||
792               !((full_type(instr->cat1.src_type) == TYPE_F32 ||
793                  full_type(instr->cat1.dst_type) == TYPE_F32) ||
794                 (instr->cat1.src_type == TYPE_U8 &&
795                  full_type(instr->cat1.dst_type) == TYPE_S32)));
796    default:
797       return (!is_alu(instr) && !is_sfu(instr)) ||
798          !(instr->dsts[0]->flags & IR3_REG_SHARED);
799    }
800 }
801 
802 /* Ensure that this source is never spilled while reloading other sources.
803  */
804 static void
805 mark_src(struct ra_ctx *ctx, struct ir3_register *src)
806 {
807    if (!(src->flags & IR3_REG_SHARED))
808       return;
809 
810    struct ra_interval *interval = &ctx->intervals[src->def->name];
811 
812    if (interval->interval.inserted) {
813       while (interval->interval.parent)
814          interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
815 
816       interval->src = true;
817    }
818 }
819 
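/* If a shared source was spilled and its interval is no longer live, either
 * rewrite the source to point at the spill def directly (demotion) or, when
 * the instruction can't consume a non-shared source, reload it.
 */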
820 static void
821 ensure_src_live(struct ra_ctx *ctx, struct ir3_instruction *instr,
822                 struct ir3_register *src)
823 {
824    if (!(src->flags & IR3_REG_SHARED))
825       return;
826 
827    struct ra_interval *interval = &ctx->intervals[src->def->name];
828 
829    if (!interval->interval.inserted) {
830       /* In some cases we cannot demote a shared reg source to a non-shared
831        * reg, and then we have to reload it instead.
832        */
833       assert(interval->spill_def);
834       if (!can_demote_src(instr)) {
835          reload_src(ctx, instr, src);
836       } else {
837          if (instr->opc == OPC_META_PARALLEL_COPY) {
838             /* Stash away the original def to use later in case we actually have
839              * to insert a reload.
840              */
841             _mesa_hash_table_insert(ctx->pcopy_src_map, src, src->def);
842          }
843          src->def = interval->spill_def;
844          src->flags &= ~IR3_REG_SHARED;
845       }
846    }
847 }
848 
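/* Rewrite a shared source to the register assigned to its interval, removing
 * the interval on its last use unless a parent or child interval keeps it
 * alive.
 */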
849 static void
850 assign_src(struct ra_ctx *ctx, struct ir3_register *src)
851 {
852    if (!(src->flags & IR3_REG_SHARED))
853       return;
854 
855    struct ra_interval *interval = &ctx->intervals[src->def->name];
856    assert(interval->interval.inserted);
857    src->num = ra_physreg_to_num(ra_interval_get_physreg(interval), src->flags);
858 
859    if ((src->flags & IR3_REG_FIRST_KILL) &&
860        !interval->interval.parent &&
861        rb_tree_is_empty(&interval->interval.children))
862       ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
863 
864    while (interval->interval.parent)
865       interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
866 
867    interval->src = false;
868 }
869 
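/* Allocate a register for a shared destination. Tied destinations reuse the
 * tied source's register when it dies here; otherwise we try get_reg() and, if
 * the file is full, either demote the instruction or spill whatever occupies
 * the chosen range.
 */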
870 static void
871 handle_dst(struct ra_ctx *ctx, struct ir3_instruction *instr,
872            struct ir3_register *dst)
873 {
874    if (!(dst->flags & IR3_REG_SHARED))
875       return;
876 
877    struct ra_interval *interval = &ctx->intervals[dst->name];
878    ra_interval_init(interval, dst);
879    interval->spill_def = NULL;
880 
881    if (dst->tied) {
882       struct ir3_register *tied_def = dst->tied->def;
883       struct ra_interval *tied_interval = &ctx->intervals[tied_def->name];
884       if ((dst->tied->flags & IR3_REG_KILL) &&
885           !tied_interval->interval.parent &&
886           rb_tree_is_empty(&tied_interval->interval.children)) {
887          dst->num = dst->tied->num;
888          interval->physreg_start = tied_interval->physreg_start;
889          interval->physreg_end = tied_interval->physreg_end;
890          ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
891          return;
892       }
893    }
894 
895    physreg_t physreg = get_reg(ctx, dst, false);
896    if (physreg == (physreg_t) ~0u) {
897       if (try_demote_instruction(ctx, instr))
898          return;
899 
900       unsigned size = reg_size(dst);
901       physreg = find_best_spill_reg(ctx, dst, size, reg_elem_size(dst));
902       assert(physreg != (physreg_t)~0u);
903       free_space(ctx, physreg, size);
904    }
905 
906    dst->num = ra_physreg_to_num(physreg, dst->flags);
907 
908    /* Non-trivial collects (i.e., ones that will introduce moves because the
909     * sources don't line-up with the destination) may cause source intervals to
910     * get implicitly moved when they are inserted as children of the destination
911     * interval. Since we don't support moving intervals in shared RA, this may
912     * cause illegal register allocations. Prevent this by making sure
913     * non-trivial collects will not share a merge set with (and will not be a
914     * parent interval of) their components. Detect this by checking if a dst got
915     * a register assignment that does not correspond with the existing preferred
916     * reg of its merge set, as this might cause a future collect covering its
917     * interval to become non-trivial.
918     */
919    if (dst->merge_set && dst->merge_set->preferred_reg != (physreg_t)~0 &&
920        physreg != dst->merge_set->preferred_reg + dst->merge_set_offset) {
921       dst->merge_set = NULL;
922       dst->interval_start = ctx->live->interval_offset;
923       dst->interval_end = dst->interval_start + reg_size(dst);
924       ctx->live->interval_offset = dst->interval_end;
925    }
926 
927    ra_update_affinity(reg_file_size(dst), dst, physreg);
928    interval->physreg_start = physreg;
929    interval->physreg_end = physreg + reg_size(dst);
930    ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
931    d("insert dst %u physreg %u", dst->name, physreg);
932 
933    if (dst->tied) {
934       struct ir3_instruction *mov = ir3_instr_create_at(
935          ir3_before_instr(instr), OPC_META_PARALLEL_COPY, 1, 1);
936       unsigned flags = IR3_REG_SHARED | (dst->flags & IR3_REG_HALF);
937       ir3_dst_create(mov, dst->num, flags)->wrmask = dst->wrmask;
938       ir3_src_create(mov, dst->tied->num, flags)->wrmask = dst->wrmask;
939       mov->cat1.src_type = mov->cat1.dst_type =
940          (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
941       dst->tied->num = dst->num;
942    }
943 }
944 
945 static void
946 handle_src_late(struct ra_ctx *ctx, struct ir3_instruction *instr,
947                 struct ir3_register *src)
948 {
949    if (!(src->flags & IR3_REG_SHARED))
950       return;
951 
952    struct ra_interval *interval = &ctx->intervals[src->def->name];
953    reload_src_finalize(ctx, instr, src);
954 
955    /* Remove killed sources that have to be killed late due to being merged with
956     * other defs.
957     */
958    if (!(src->flags & IR3_REG_KILL))
959       return;
960 
961    if (interval->interval.inserted)
962       ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
963 }
964 
965 static void
966 handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
967 {
968    ra_foreach_src (src, instr)
969       mark_src(ctx, src);
970 
971    ra_foreach_src (src, instr)
972       ensure_src_live(ctx, instr, src);
973 
974    ra_foreach_src_rev (src, instr)
975       assign_src(ctx, src);
976 
977    ra_foreach_dst (dst, instr)
978       handle_dst(ctx, instr, dst);
979 
980    ra_foreach_src (src, instr)
981       handle_src_late(ctx, instr, src);
982 }
983 
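/* Splits that stay within their source's merge set don't need a new
 * allocation: the destination's physreg is derived from the source interval,
 * or, if the source was spilled, from a split of its spill def.
 */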
984 static void
985 handle_split(struct ra_ctx *ctx, struct ir3_instruction *split)
986 {
987    struct ir3_register *src = split->srcs[0];
988    struct ir3_register *dst = split->dsts[0];
989 
990    if (!(dst->flags & IR3_REG_SHARED))
991       return;
992 
993    if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
994       handle_normal_instr(ctx, split);
995       return;
996    }
997 
998    struct ra_interval *src_interval = &ctx->intervals[src->def->name];
999    struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1000 
1001    ra_interval_init(dst_interval, dst);
1002    dst_interval->spill_def = NULL;
1003 
1004    if (src_interval->spill_def) {
1005       struct ir3_instruction *spill_split =
1006          ir3_instr_create_at(ir3_after_instr(split), OPC_META_SPLIT, 1, 1);
1007       struct ir3_register *dst = __ssa_dst(spill_split);
1008       struct ir3_register *src =
1009          ir3_src_create(spill_split, INVALID_REG, IR3_REG_SSA);
1010       src->def = src_interval->spill_def;
1011       src->wrmask = src_interval->spill_def->wrmask;
1012       spill_split->split.off = split->split.off;
1013       dst_interval->spill_def = dst;
1014       list_del(&split->node);
1015       return;
1016    }
1017 
1018    dst_interval->physreg_start =
1019       src_interval->physreg_start + dst->merge_set_offset -
1020       src->def->merge_set_offset;
1021    dst_interval->physreg_end = dst_interval->physreg_start + reg_size(dst);
1022    ir3_reg_interval_insert(&ctx->reg_ctx, &dst_interval->interval);
1023    src->num = ra_interval_get_num(src_interval);
1024    dst->num = ra_interval_get_num(dst_interval);
1025    d("insert dst %u physreg %u", dst->name, dst_interval->physreg_start);
1026 
1027    if (src->flags & IR3_REG_KILL)
1028       ir3_reg_interval_remove(&ctx->reg_ctx, &src_interval->interval);
1029 }
1030 
1031 static void
1032 handle_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
1033 {
1034    struct ir3_register *dst = phi->dsts[0];
1035 
1036    if (!(dst->flags & IR3_REG_SHARED))
1037       return;
1038 
1039    struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1040    ra_interval_init(dst_interval, dst);
1041 
1042    /* In some rare cases, it's possible to have a phi node with a physical-only
1043     * source. Here's a contrived example:
1044     *
1045     * loop {
1046     *    if non-uniform {
1047     *       if uniform {
1048     *          x_1 = ...;
1049     *          continue;
1050     *       }
1051     *       x_2 = ...;
1052     *    } else {
1053     *       break;
1054     *    }
1055     *    // continue block
1056     *    x_3 = phi(x_1, x_2)
1057     * }
1058     *
1059     * Assuming x_1 and x_2 are uniform, x_3 will also be uniform, because all
1060     * threads that stay in the loop take the same branch to the continue block,
1061     * however execution may fall through from the assignment to x_2 to the
1062     * break statement because the outer if is non-uniform, and then it will fall
1063     * through again to the continue block, so if x_3 is to be in a shared reg
1064     * then the phi needs an extra source pointing to the break statement, which
1065     * itself needs a phi node:
1066     *
1067     * loop {
1068     *    if non-uniform {
1069     *       if uniform {
1070     *          x_1 = ...;
1071     *          continue;
1072     *       }
1073     *       x_2 = ...;
1074     *    } else {
1075     *       x_4 = phi(undef, x_2)
1076     *       break;
1077     *    }
1078     *    // continue block
1079     *    x_3 = phi(x_1, x_2, x_4)
1080     * }
1081     */
1082 
1083    /* phi nodes are special because we cannot spill them normally; instead we
1084     * have to spill the parallel copies that their sources point to and make the
1085     * entire phi not shared anymore.
1086     */
1087 
1088    physreg_t physreg = get_reg(ctx, dst, false);
1089    if (physreg == (physreg_t) ~0u) {
1090       d("spilling phi destination");
1091       dst->flags &= ~IR3_REG_SHARED;
1092       dst_interval->spill_def = dst;
1093       phi->flags |= IR3_INSTR_SHARED_SPILL;
1094 
1095       foreach_src (src, phi) {
1096          src->flags &= ~IR3_REG_SHARED;
1097          if (src->def)
1098             src->def->flags &= ~IR3_REG_SHARED;
1099       }
1100 
1101       return;
1102    }
1103 
1104    dst->num = ra_physreg_to_num(physreg, dst->flags);
1105    dst_interval->spill_def = NULL;
1106    dst_interval->physreg_start = physreg;
1107    dst_interval->physreg_end = physreg + reg_size(dst);
1108    ir3_reg_interval_insert(&ctx->reg_ctx, &dst_interval->interval);
1109 
1110    ra_foreach_src_n (src, i, phi) {
1111       /* We assume that any phis with non-logical sources aren't promoted. */
1112       assert(i < phi->block->predecessors_count);
1113       src->num = dst->num;
1114       src->def->num = dst->num;
1115    }
1116 }
1117 
1118 static void
1119 handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *pcopy)
1120 {
1121    /* For parallel copies, we only handle the source. The destination is handled
1122     * later when processing phi nodes.
1123     */
1124 
1125    ra_foreach_src (src, pcopy)
1126       mark_src(ctx, src);
1127 
1128    ra_foreach_src (src, pcopy)
1129       ensure_src_live(ctx, pcopy, src);
1130 
1131    ra_foreach_src_rev (src, pcopy)
1132       assign_src(ctx, src);
1133 
1134    ra_foreach_src (src, pcopy)
1135       handle_src_late(ctx, pcopy, src);
1136 }
1137 
1138 static void
1139 handle_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1140 {
1141    instr->flags &= ~IR3_INSTR_SHARED_SPILL;
1142 
1143    switch (instr->opc) {
1144    case OPC_META_SPLIT:
1145       handle_split(ctx, instr);
1146       break;
1147    case OPC_META_PHI:
1148       handle_phi(ctx, instr);
1149       break;
1150    case OPC_META_PARALLEL_COPY:
1151       handle_pcopy(ctx, instr);
1152       break;
1153    default:
1154       handle_normal_instr(ctx, instr);
1155    }
1156 }
1157 
1158 /* If we define a value outside a loop, use it inside the loop, and then spill
1159  * it later inside the same loop, we would lose the value, so we have to
1160  * reload it. We have to reload it after any parallel copy instruction, when the
1161  * live shared registers equal the live-in of the backedge. lower_pcopy() will
1162  * then move any non-shared parallel copies down past the reload.
1163  */
1164 static void
1165 reload_live_outs(struct ra_ctx *ctx, struct ir3_block *block)
1166 {
1167    struct ra_block_state *state = &ctx->blocks[block->index];
1168    unsigned name;
1169    BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) {
1170       struct ir3_register *reg = ctx->live->definitions[name];
1171 
1172       struct ra_interval *interval = &ctx->intervals[name];
1173       if (!interval->interval.inserted) {
1174          d("reloading %d at end of backedge", reg->name);
1175 
1176          /* When this interval was spilled inside the loop, we probably chose a
1177           * different physreg for it than the original physreg when it was
1178           * defined outside the loop. Restore the original physreg so that we
1179           * spill it correctly.
1180           */
1181          unsigned size = interval->physreg_end - interval->physreg_start;
1182          interval->physreg_start = interval->physreg_start_orig;
1183          interval->physreg_end = interval->physreg_start + size;
1184 
1185          reload_interval(ctx, ir3_before_terminator(block), interval);
1186       }
1187    }
1188 }
1189 
1190 static void
1191 record_pred_live_out(struct ra_ctx *ctx,
1192                      struct ra_interval *interval,
1193                      struct ir3_block *pred)
1194 {
1195    struct ra_block_state *state = &ctx->blocks[pred->index];
1196 
1197    struct ir3_register *def = interval->interval.reg;
1198    BITSET_SET(state->live_out, def->name);
1199 
1200    rb_tree_foreach (struct ra_interval, child,
1201                     &interval->interval.children, interval.node) {
1202       record_pred_live_out(ctx, child, pred);
1203    }
1204 }
1205 
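/* At a merge point, snapshot the currently-live intervals as the live-outs of
 * every not-yet-visited predecessor (i.e. the source block of a backedge), so
 * that reload_live_outs() knows which values must be resident at its end.
 */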
1206 static void
1207 record_pred_live_outs(struct ra_ctx *ctx, struct ir3_block *block)
1208 {
1209    for (unsigned i = 0; i < block->predecessors_count; i++) {
1210       struct ir3_block *pred = block->predecessors[i];
1211       struct ra_block_state *state = &ctx->blocks[pred->index];
1212       if (state->visited)
1213          continue;
1214 
1215       state->live_out = rzalloc_array(NULL, BITSET_WORD,
1216                                       BITSET_WORDS(ctx->live->definitions_count));
1217 
1218 
1219       rb_tree_foreach (struct ra_interval, interval,
1220                        &ctx->reg_ctx.intervals, interval.node) {
1221          record_pred_live_out(ctx, interval, pred);
1222       }
1223    }
1224 }
1225 
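/* Shared RA runs one block at a time: rebuild the interval tree from the
 * block's live-ins, process each instruction, and if a successor has already
 * been visited (a loop backedge) reload any of its expected live values that
 * were spilled in this block.
 */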
1226 static void
1227 handle_block(struct ra_ctx *ctx, struct ir3_block *block)
1228 {
1229    ra_ctx_reset_block(ctx);
1230 
1231    unsigned name;
1232    BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
1233                        ctx->live->definitions_count) {
1234       struct ir3_register *def = ctx->live->definitions[name];
1235       struct ra_interval *interval = &ctx->intervals[name];
1236 
1237       /* Non-shared definitions may still be definitions we spilled by demoting
1238        * them, so we still need to initialize the interval. But we shouldn't
1239        * make these intervals live.
1240        */
1241       ra_interval_init(interval, def);
1242 
1243       if ((def->flags & IR3_REG_SHARED) && !interval->spill_def) {
1244          ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
1245       }
1246    }
1247 
1248    if (RA_DEBUG) {
1249       d("after live-in block %u:\n", block->index);
1250       ra_ctx_dump(ctx);
1251    }
1252 
1253    if (block->predecessors_count > 1)
1254       record_pred_live_outs(ctx, block);
1255 
1256    foreach_instr_safe (instr, &block->instr_list) {
1257       di(instr, "processing");
1258 
1259       handle_instr(ctx, instr);
1260 
1261       if (RA_DEBUG)
1262          ra_ctx_dump(ctx);
1263    }
1264 
1265    if (block->successors[0]) {
1266       struct ra_block_state *state = &ctx->blocks[block->successors[0]->index];
1267 
1268       if (state->visited) {
1269          assert(!block->successors[1]);
1270 
1271          reload_live_outs(ctx, block);
1272       }
1273    }
1274 
1275    ctx->blocks[block->index].visited = true;
1276 }
1277 
1278 static void
1279 lower_pcopy(struct ir3 *ir, struct ra_ctx *ctx)
1280 {
1281    foreach_block (block, &ir->block_list) {
1282       foreach_instr_safe (instr, &block->instr_list) {
1283          /* At this point due to spilling there may be parallel copies from
1284           * shared to non-shared registers and vice versa. Lowering these after
1285           * RA may produce cycles involving shared and non-shared registers,
1286           * which would need to be resolved by swapping a shared and non-shared
1287           * register which is something we can't handle. However by lowering
1288           * these to moves now, we can make sure that cycles only involve
1289           * non-shared registers. To avoid illegally moving a shared register
1290           * read or write across the parallel copy, which may have other
1291           * conflicting reads/writes if there's a cycle, we need to move copies
1292           * from non-shared to shared below the shared copies, and we need to
1293           * move copies from shared to non-shared above them. So, we have the
1294           * following order:
1295           *
1296           * 1. shared->non-shared copies (spills)
1297           * 2. shared->shared copies (one parallel copy as there may be cycles)
1298           * 3. non-shared->shared copies (reloads)
1299           * 4. non-shared->non-shared copies
1300           *
1301           * We split out the non-shared->non-shared copies as a separate step.
1302           */
1303          if (instr->opc == OPC_META_PARALLEL_COPY) {
1304             for (unsigned i = 0; i < instr->srcs_count; i++) {
1305                if ((instr->srcs[i]->flags & IR3_REG_SHARED) &&
1306                    !(instr->dsts[i]->flags & IR3_REG_SHARED)) {
1307                   /* shared->non-shared. Create a spill move and rewrite the
1308                    * source to be the destination of the move (so that the
1309                    * original shared->non-shared copy becomes a
1310                    * non-shared->non-shared copy).
1311                    */
1312                   struct ir3_instruction *mov = ir3_instr_create_at(
1313                      ir3_before_instr(instr), OPC_MOV, 1, 1);
1314                   mov->flags |= IR3_INSTR_SHARED_SPILL;
1315                   struct ir3_register *dst =
1316                      ir3_dst_create(mov, INVALID_REG, instr->dsts[i]->flags);
1317                   dst->wrmask = instr->dsts[i]->wrmask;
1318                   dst->instr = mov;
1319                   mov->repeat = reg_elems(mov->dsts[0]) - 1;
1320                   struct ir3_register *src =
1321                      ir3_src_create(mov, instr->srcs[i]->num,
1322                                     instr->srcs[i]->flags |
1323                                     (mov->repeat ? IR3_REG_R : 0));
1324                   src->wrmask = instr->srcs[i]->wrmask;
1325                   mov->cat1.dst_type = mov->cat1.src_type =
1326                      (mov->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
1327                   instr->srcs[i]->flags = mov->dsts[0]->flags;
1328                   instr->srcs[i]->def = mov->dsts[0];
1329                }
1330             }
1331 
1332             for (unsigned i = 0; i < instr->dsts_count;) {
1333                if ((instr->dsts[i]->flags & IR3_REG_SHARED) &&
1334                    (instr->srcs[i]->flags & IR3_REG_SSA) &&
1335                    !(instr->srcs[i]->flags & IR3_REG_SHARED)) {
1336                   /* non-shared->shared. Create a reload move.
1337                    */
1338                   struct ir3_instruction *mov =
1339                      ir3_instr_create_at(ir3_after_instr(instr), OPC_MOV, 1, 1);
1340                   mov->flags |= IR3_INSTR_SHARED_SPILL;
1341                   struct ir3_register *dst =
1342                      ir3_dst_create(mov, instr->dsts[i]->num,
1343                                     instr->dsts[i]->flags);
1344                   dst->instr = mov;
1345                   dst->wrmask = instr->dsts[i]->wrmask;
1346                   mov->repeat = reg_elems(mov->dsts[0]) - 1;
1347                   struct ir3_register *src =
1348                      ir3_src_create(mov, INVALID_REG, instr->srcs[i]->flags |
1349                                     (mov->repeat ? IR3_REG_R : 0));
1350                   src->def = instr->srcs[i]->def;
1351                   src->wrmask = instr->srcs[i]->wrmask;
1352                   mov->cat1.dst_type = mov->cat1.src_type =
1353                      (mov->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
1354 
1355                   /* When we spill a parallel copy source, we lose the
1356                    * information of where it originally points to since we make
1357                    * it point to the spill def. If we later decide not to also
1358                    * spill the phi associated with it, we have to restore it
1359                    * here using the stashed original source so that RA
1360                    * validation can check that we did the correct thing.
1361                    *
1362                    * Because SSA-ness goes away after validation, this is really
1363                    * just about validation.
1364                    */
1365                   struct ir3_block *succ = block->successors[0];
1366                   unsigned pred_idx = ir3_block_get_pred_index(succ, block);
1367                   foreach_instr (phi, &succ->instr_list) {
1368                      if (phi->opc != OPC_META_PHI)
1369                         break;
1370 
1371                      if (phi->srcs[pred_idx]->def == instr->dsts[i]) {
1372                         struct ir3_register *def =
1373                            _mesa_hash_table_search(ctx->pcopy_src_map,
1374                                                    instr->srcs[i])->data;
1375                         phi->srcs[pred_idx]->def = def;
1376                         break;
1377                      }
1378                   }
1379 
1380                   instr->srcs[i] = instr->srcs[instr->srcs_count - 1];
1381                   instr->dsts[i] = instr->dsts[instr->dsts_count - 1];
1382                   instr->srcs_count--;
1383                   instr->dsts_count--;
1384                   continue;
1385                }
1386 
1387                i++;
1388             }
1389 
1390             /* Move any non-shared copies to a separate parallel copy
1391              * instruction right at the end of the block, after any reloads. At
1392              * this point all copies should be {shared,immediate}->shared or
1393              * {non-shared,immediate}->non-shared.
1394              */
1395             unsigned non_shared_copies = 0;
1396             for (unsigned i = 0; i < instr->dsts_count; i++) {
1397                if (!(instr->dsts[i]->flags & IR3_REG_SHARED))
1398                   non_shared_copies++;
1399             }
1400 
1401             if (non_shared_copies != 0) {
1402                struct ir3_instruction *pcopy = ir3_instr_create_at(
1403                   ir3_before_terminator(block), OPC_META_PARALLEL_COPY,
1404                   non_shared_copies, non_shared_copies);
1405 
1406                unsigned j = 0;
1407                for (unsigned i = 0; i < instr->dsts_count;) {
1408                   if (!(instr->dsts[i]->flags & IR3_REG_SHARED)) {
1409                      pcopy->dsts[j] = instr->dsts[i];
1410                      pcopy->srcs[j] = instr->srcs[i];
1411                      pcopy->dsts[j]->instr = pcopy;
1412                      instr->srcs[i] = instr->srcs[instr->srcs_count - 1];
1413                      instr->dsts[i] = instr->dsts[instr->dsts_count - 1];
1414                      instr->srcs_count--;
1415                      instr->dsts_count--;
1416                      j++;
1417                      continue;
1418                   }
1419                   i++;
1420                }
1421 
1422                pcopy->srcs_count = pcopy->dsts_count = j;
1423                if (instr->dsts_count == 0)
1424                   list_del(&instr->node);
1425             }
1426          }
1427       }
1428    }
1429 }
1430 
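/* RA is done: strip the SSA information from shared registers so that later
 * passes only see the assigned register numbers.
 */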
1431 static void
1432 finalize(struct ir3 *ir)
1433 {
1434    foreach_block (block, &ir->block_list) {
1435       foreach_instr (instr, &block->instr_list) {
1436          for (unsigned i = 0; i < instr->dsts_count; i++) {
1437             if (instr->dsts[i]->flags & IR3_REG_SHARED) {
1438                instr->dsts[i]->flags &= ~IR3_REG_SSA;
1439             }
1440          }
1441 
1442          for (unsigned i = 0; i < instr->srcs_count; i++) {
1443             if (instr->srcs[i]->flags & IR3_REG_SHARED) {
1444                instr->srcs[i]->flags &= ~IR3_REG_SSA;
1445                instr->srcs[i]->def = NULL;
1446             }
1447          }
1448       }
1449    }
1450 }
1451 
1452 void
1453 ir3_ra_shared(struct ir3_shader_variant *v, struct ir3_liveness **live_ptr)
1454 {
1455    struct ra_ctx ctx;
1456    struct ir3_liveness *live = *live_ptr;
1457 
1458    ra_ctx_init(&ctx);
1459    ctx.intervals = rzalloc_array(NULL, struct ra_interval,
1460                                  live->definitions_count);
1461    ctx.blocks = rzalloc_array(NULL, struct ra_block_state,
1462                               live->block_count);
1463    ctx.start = 0;
1464    ctx.live = live;
1465    ctx.pcopy_src_map = _mesa_pointer_hash_table_create(NULL);
1466 
1467    foreach_block (block, &v->ir->block_list) {
1468       handle_block(&ctx, block);
1469    }
1470 
1471    lower_pcopy(v->ir, &ctx);
1472 
1473    for (unsigned i = 0; i < live->block_count; i++) {
1474       if (ctx.blocks[i].live_out)
1475          ralloc_free(ctx.blocks[i].live_out);
1476    }
1477 
1478    ralloc_free(ctx.intervals);
1479    ralloc_free(ctx.pcopy_src_map);
1480    ralloc_free(ctx.blocks);
1481 
1482    ir3_ra_validate(v, RA_FULL_SIZE, RA_HALF_SIZE, live->block_count, true);
1483    finalize(v->ir);
1484 
1485    /* Recalculate liveness and register pressure now that additional values have
1486     * been added.
1487     * TODO we should only do this if any values have been spilled/reloaded.
1488     * Note: since we don't recreate the merge sets, we have to manually copy
1489     * interval_offset over to the new liveness struct.
1490     */
1491    unsigned interval_offset = live->interval_offset;
1492    void *live_mem_ctx = ralloc_parent(live);
1493    ralloc_free(live);
1494    *live_ptr = ir3_calc_liveness(live_mem_ctx, v->ir);
1495    (*live_ptr)->interval_offset = interval_offset;
1496 }
1497 
1498