1 /*
2  * Copyright © 2021 Valve Corporation
3  * Copyright © 2014 Rob Clark <robclark@freedesktop.org>
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "ir3_ra.h"
8 #include "ir3_shader.h"
9 
10 #include "util/u_math.h"
11 
12 /* Allocating shared registers can pose a challenge, because their live
13  * intervals use the physical CFG which has extra edges inserted that are
14  * pretty much always critical edges. This causes problems with phi nodes,
15  * because copies for phi nodes have to happen "along the edge," and similarly
16  * causes problems when reunifying values that have had their live range split.
17  * Problematic phi nodes should be relatively rare, so we ban them for now.
18  * The solution we choose for live-range splitting is to integrate spilling and
19  * register allocation and spill to vector registers rather than split a live
20  * range, which negates some of the advantages of SSA-based RA, but it isn't as
21  * bad as it seems because the conditions needed (vector shared registers, which
22  * only movmsk currently produces, or fixed registers which we don't do) are
23  * relatively rare. Spilling to a vector register is also much cheaper than
24  * spilling a vector register to private memory.
25  */
26 
27 struct ra_interval {
28    struct ir3_reg_interval interval;
29 
30    struct rb_node physreg_node;
31    physreg_t physreg_start, physreg_end;
32 
33    /* If this interval was spilled, the original physreg_start before spilling.
34     * Used when reloading live outs.
35     */
36    physreg_t physreg_start_orig;
37 
38    /* Where the shared register is spilled to. If it had no uses when it was
39     * spilled, this may simply be the original defining instruction.
40     */
41    struct ir3_register *spill_def;
42 
43    /* Whether this contains a source of the current instruction that can't be
44     * spilled.
45     */
46    bool src;
47 
48    bool needs_reload;
49 };
50 
51 struct ra_block_state {
52    bool visited;
53 
54    /* For blocks whose successors are visited first (i.e. sources of loop
55     * backedges), the values that should be live at the end of the block.
56     */
57    BITSET_WORD *live_out;
58 };
59 
60 struct ra_ctx {
61    struct ir3_reg_ctx reg_ctx;
62 
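   /* Bitset of physical shared registers that are currently free. */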
63    BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
64 
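   /* Currently-allocated intervals, ordered by physreg_start. */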
65    struct rb_tree physreg_intervals;
66 
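   /* Per-definition interval state, indexed by ir3_register::name. */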
67    struct ra_interval *intervals;
68 
69    struct ir3_liveness *live;
70 
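   /* Stashed original defs of parallel-copy sources that were rewritten to
    * point at their spill def, so lower_pcopy() can restore them.
    */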
71    struct hash_table *pcopy_src_map;
72 
73    struct ra_block_state *blocks;
74 
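   /* Round-robin starting point for find_best_gap()/find_best_spill_reg(). */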
75    unsigned start;
76 };
77 
78 static struct ra_interval *
79 ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
80 {
81    return rb_node_data(struct ra_interval, interval, interval);
82 }
83 
84 static struct ra_interval *
85 rb_node_to_interval(struct rb_node *node)
86 {
87    return rb_node_data(struct ra_interval, node, physreg_node);
88 }
89 
90 static const struct ra_interval *
91 rb_node_to_interval_const(const struct rb_node *node)
92 {
93    return rb_node_data(struct ra_interval, node, physreg_node);
94 }
95 
96 static struct ra_interval *
97 ra_interval_next(struct ra_interval *interval)
98 {
99    struct rb_node *next = rb_node_next(&interval->physreg_node);
100    return next ? rb_node_to_interval(next) : NULL;
101 }
102 
103 static struct ra_interval *
104 ra_interval_next_or_null(struct ra_interval *interval)
105 {
106    return interval ? ra_interval_next(interval) : NULL;
107 }
108 
109 static int
110 ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
111 {
112    const struct ra_interval *a = rb_node_to_interval_const(_a);
113    const struct ra_interval *b = rb_node_to_interval_const(_b);
114    return b->physreg_start - a->physreg_start;
115 }
116 
117 static int
118 ra_interval_cmp(const struct rb_node *node, const void *data)
119 {
120    physreg_t reg = *(const physreg_t *)data;
121    const struct ra_interval *interval = rb_node_to_interval_const(node);
122    if (interval->physreg_start > reg)
123       return -1;
124    else if (interval->physreg_end <= reg)
125       return 1;
126    else
127       return 0;
128 }
129 
130 static struct ra_ctx *
131 ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx)
132 {
133    return rb_node_data(struct ra_ctx, ctx, reg_ctx);
134 }
135 
136 static struct ra_interval *
137 ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
138 {
139    struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
140    return node ? rb_node_to_interval(node) : NULL;
141 }
142 
143 /* Get the interval covering the reg, or the closest to the right if it
144  * doesn't exist.
145  */
146 static struct ra_interval *
147 ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
148 {
149    struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
150    if (!interval) {
151       return NULL;
152    } else if (interval->physreg_end > reg) {
153       return interval;
154    } else {
155       /* There is no interval covering reg, and ra_file_search_sloppy()
156        * returned the closest range to the left, so the next interval to the
157        * right should be the closest to the right.
158        */
159       return ra_interval_next_or_null(interval);
160    }
161 }
162 
163 static struct ra_interval *
164 ra_ctx_search_right(struct ra_ctx *ctx, physreg_t reg)
165 {
166    return ra_interval_search_right(&ctx->physreg_intervals, reg);
167 }
168 
169 static void
170 interval_add(struct ir3_reg_ctx *reg_ctx, struct ir3_reg_interval *_interval)
171 {
172    struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
173    struct ra_ctx *ctx = ir3_reg_ctx_to_ctx(reg_ctx);
174 
175    /* We can assume in this case that physreg_start/physreg_end is already
176     * initialized.
177     */
178    for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
179       BITSET_CLEAR(ctx->available, i);
180    }
181 
182    rb_tree_insert(&ctx->physreg_intervals, &interval->physreg_node,
183                   ra_interval_insert_cmp);
184 }
185 
186 static void
187 interval_delete(struct ir3_reg_ctx *reg_ctx, struct ir3_reg_interval *_interval)
188 {
189    struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
190    struct ra_ctx *ctx = ir3_reg_ctx_to_ctx(reg_ctx);
191 
192    for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
193       BITSET_SET(ctx->available, i);
194    }
195 
196    rb_tree_remove(&ctx->physreg_intervals, &interval->physreg_node);
197 }
198 
199 static void
200 interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
201                struct ir3_reg_interval *_child)
202 {
203    struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
204    struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);
205 
206    child->physreg_start =
207       parent->physreg_start + (child->interval.reg->interval_start -
208                                parent->interval.reg->interval_start);
209    child->physreg_end =
210       child->physreg_start +
211       (child->interval.reg->interval_end - child->interval.reg->interval_start);
212 
213    interval_add(ctx, _child);
214 }
215 
216 static void
217 ra_ctx_init(struct ra_ctx *ctx)
218 {
219    ctx->reg_ctx.interval_add = interval_add;
220    ctx->reg_ctx.interval_delete = interval_delete;
221    ctx->reg_ctx.interval_readd = interval_readd;
222 }
223 
224 static void
225 ra_ctx_reset_block(struct ra_ctx *ctx)
226 {
227    for (unsigned i = 0; i < RA_SHARED_SIZE; i++) {
228       BITSET_SET(ctx->available, i);
229    }
230 
231    rb_tree_init(&ctx->reg_ctx.intervals);
232    rb_tree_init(&ctx->physreg_intervals);
233 }
234 
235 static void
236 ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
237 {
238    ir3_reg_interval_init(&interval->interval, reg);
239 }
240 
241 static physreg_t
242 ra_interval_get_physreg(const struct ra_interval *interval)
243 {
244    unsigned child_start = interval->interval.reg->interval_start;
245 
246    while (interval->interval.parent) {
247       interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
248    }
249 
250    return interval->physreg_start +
251           (child_start - interval->interval.reg->interval_start);
252 }
253 
254 static unsigned
255 ra_interval_get_num(const struct ra_interval *interval)
256 {
257    return ra_physreg_to_num(ra_interval_get_physreg(interval),
258                             interval->interval.reg->flags);
259 }
260 
261 static void
262 ra_interval_dump(struct log_stream *stream, struct ra_interval *interval)
263 {
264    mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start);
265 
266    ir3_reg_interval_dump(stream, &interval->interval);
267 }
268 
269 static void
270 ra_ctx_dump(struct ra_ctx *ctx)
271 {
272    struct log_stream *stream = mesa_log_streami();
273 
274    mesa_log_stream_printf(stream, "shared:\n");
275    rb_tree_foreach (struct ra_interval, interval, &ctx->physreg_intervals,
276                     physreg_node) {
277       ra_interval_dump(stream, interval);
278    }
279 
280    unsigned start, end;
281    mesa_log_stream_printf(stream, "available:\n");
282    BITSET_FOREACH_RANGE (start, end, ctx->available, RA_SHARED_SIZE) {
283       mesa_log_stream_printf(stream, "%u-%u ", start, end);
284    }
285    mesa_log_stream_printf(stream, "\n");
286    mesa_log_stream_printf(stream, "start: %u\n", ctx->start);
287 }
288 
289 static bool
290 get_reg_specified(struct ra_ctx *ctx, struct ir3_register *reg, physreg_t physreg)
291 {
292    for (unsigned i = 0; i < reg_size(reg); i++) {
293       if (!BITSET_TEST(ctx->available, physreg + i))
294          return false;
295    }
296 
297    return true;
298 }
299 
300 static unsigned
301 reg_file_size(struct ir3_register *reg)
302 {
303    if (reg->flags & IR3_REG_HALF)
304       return RA_SHARED_HALF_SIZE;
305    else
306       return RA_SHARED_SIZE;
307 }
308 
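/* Find a free, aligned gap of "size" registers, scanning round-robin from
 * ctx->start. Returns ~0 if there is no free gap.
 */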
309 static physreg_t
310 find_best_gap(struct ra_ctx *ctx, struct ir3_register *dst, unsigned size,
311               unsigned align)
312 {
313    unsigned file_size = reg_file_size(dst);
314 
315    /* This can happen if we create a very large merge set. Just bail out in that
316     * case.
317     */
318    if (size > file_size)
319       return (physreg_t) ~0;
320 
321    unsigned start = ALIGN(ctx->start, align) % (file_size - size + align);
322    unsigned candidate = start;
323    do {
324       bool is_available = true;
325       for (unsigned i = 0; i < size; i++) {
326          if (!BITSET_TEST(ctx->available, candidate + i)) {
327             is_available = false;
328             break;
329          }
330       }
331 
332       if (is_available) {
333          ctx->start = (candidate + size) % file_size;
334          return candidate;
335       }
336 
337       candidate += align;
338       if (candidate + size > file_size)
339          candidate = 0;
340    } while (candidate != start);
341 
342    return (physreg_t)~0;
343 }
344 
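/* Pick the register range that is cheapest to free up by spilling. Intervals
 * that already have a spill_def cost nothing extra, while ranges overlapping a
 * source of the current instruction can never be chosen.
 */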
345 static physreg_t
346 find_best_spill_reg(struct ra_ctx *ctx, struct ir3_register *reg,
347                     unsigned size, unsigned align)
348 {
349    unsigned file_size = reg_file_size(reg);
350    unsigned min_cost = UINT_MAX;
351 
352    unsigned start = ALIGN(ctx->start, align) % (file_size - size + align);
353    physreg_t candidate = start;
354    physreg_t best_reg = (physreg_t)~0;
355    do {
356       unsigned cost = 0;
357 
358       /* Iterate through intervals we'd need to spill to use this reg. */
359       for (struct ra_interval *interval = ra_ctx_search_right(ctx, candidate);
360            interval && interval->physreg_start < candidate + size;
361            interval = ra_interval_next_or_null(interval)) {
362          /* We can't spill sources of the current instruction when reloading
363           * sources.
364           */
365          if (interval->src) {
366             cost = UINT_MAX;
367             break;
368          }
369 
370          /* We prefer spilling intervals that already have been spilled, so we
371           * don't have to emit another mov.
372           */
373          if (!interval->spill_def)
374             cost += (interval->physreg_end - interval->physreg_start);
375       }
376 
377       if (cost < min_cost) {
378          min_cost = cost;
379          best_reg = candidate;
380       }
381 
382       candidate += align;
383       if (candidate + size > file_size)
384          candidate = 0;
385    } while (candidate != start);
386 
387    return best_reg;
388 }
389 
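/* Extract a single element of "def" at "offset" by emitting a meta split
 * instruction after "before". Returns "def" itself for scalar defs.
 */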
390 static struct ir3_register *
391 split(struct ir3_register *def, unsigned offset, struct ir3_instruction *before)
392 {
393    if (reg_elems(def) == 1) {
394       assert(offset == 0);
395       return def;
396    }
397 
398    struct ir3_instruction *split =
399       ir3_instr_create_at(ir3_after_instr(before), OPC_META_SPLIT, 1, 1);
400    split->split.off = offset;
401    struct ir3_register *dst = __ssa_dst(split);
402    struct ir3_register *src =
403       ir3_src_create(split, INVALID_REG, def->flags & (IR3_REG_HALF | IR3_REG_SSA));
404    src->wrmask = def->wrmask;
405    src->def = def;
406    return dst;
407 }
408 
409 static struct ir3_register *
410 extract(struct ir3_register *parent_def, unsigned offset, unsigned elems,
411         struct ir3_instruction *before)
412 {
413    if (offset == 0 && elems == reg_elems(parent_def))
414       return parent_def;
415 
416    if (elems == 1)
417       return split(parent_def, offset, before);
418 
419    struct ir3_instruction *collect =
420       ir3_instr_create_at(ir3_after_instr(before), OPC_META_COLLECT, 1, elems);
421    struct ir3_register *dst = __ssa_dst(collect);
422    dst->flags |= parent_def->flags & IR3_REG_HALF;
423    dst->wrmask = MASK(elems);
424 
425    for (unsigned i = 0; i < elems; i++) {
426       ir3_src_create(collect, INVALID_REG,
427                      parent_def->flags & (IR3_REG_HALF | IR3_REG_SSA))->def =
428          split(parent_def, offset + i, before);
429    }
430 
431    return dst;
432 }
433 
434 static void
435 spill_interval_children(struct ra_interval *interval,
436                         struct ir3_instruction *before)
437 {
438    rb_tree_foreach (struct ra_interval, child, &interval->interval.children,
439                     interval.node) {
440       if (!child->spill_def) {
441          child->spill_def = extract(interval->spill_def,
442                                     (child->interval.reg->interval_start -
443                                      interval->interval.reg->interval_start) /
444                                     reg_elem_size(interval->interval.reg),
445                                     reg_elems(child->interval.reg), before);
446          child->physreg_start_orig = child->physreg_start;
447       }
448       spill_interval_children(child, before);
449    }
450 }
451 
452 static void
453 spill_interval(struct ra_ctx *ctx, struct ra_interval *interval)
454 {
455    struct ir3_instruction *before = interval->interval.reg->instr;
456 
457    d("spilling ssa_%u:%u", before->serialno, interval->interval.reg->name);
458 
459    if (!interval->spill_def) {
460       /* If this is a phi node or input, we need to insert the demotion to a
461        * regular register after the last phi or input in the block.
462        */
463       if (before->opc == OPC_META_PHI ||
464           before->opc == OPC_META_INPUT) {
465          struct ir3_block *block = before->block;
466          struct ir3_instruction *last_phi_input = NULL;
467          foreach_instr_from (instr, before, &block->instr_list) {
468             if (instr->opc != before->opc)
469                break;
470             last_phi_input = instr;
471          }
472          before = last_phi_input;
473       }
474 
475       struct ir3_instruction *mov =
476          ir3_instr_create_at(ir3_after_instr(before), OPC_MOV, 1, 1);
477       mov->flags |= IR3_INSTR_SHARED_SPILL;
478       struct ir3_register *dst = __ssa_dst(mov);
479       dst->flags |= (interval->interval.reg->flags & IR3_REG_HALF);
480       dst->wrmask = interval->interval.reg->wrmask;
481       mov->repeat = reg_elems(dst) - 1;
482       ir3_src_create(mov, interval->interval.reg->num,
483                      IR3_REG_SHARED | (mov->repeat ? IR3_REG_R : 0) |
484                      (interval->interval.reg->flags & IR3_REG_HALF))->wrmask =
485                      interval->interval.reg->wrmask;
486       mov->cat1.src_type = mov->cat1.dst_type =
487          (interval->interval.reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
488 
489       interval->spill_def = dst;
490       interval->physreg_start_orig = interval->physreg_start;
491    }
492 
493    spill_interval_children(interval, interval->spill_def->instr);
494 
495    ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
496 }
497 
498 /* Try to demote a scalar ALU instruction to a normal ALU instruction, using the
499  * spilled sources. We have to take into account restrictions on the number of
500  * shared sources that only exist for normal ALU instructions.
501  */
502 static bool
503 try_demote_instruction(struct ra_ctx *ctx, struct ir3_instruction *instr)
504 {
505    /* First, check restrictions. */
506    switch (opc_cat(instr->opc)) {
507    case 1:
508       /* MOVMSK is special and can't be demoted. It also has no sources so must
509        * go before the check below.
510        */
511       if (instr->opc == OPC_MOVMSK)
512          return false;
513 
514       assert(instr->srcs_count >= 1);
515       if (!(instr->srcs[0]->flags & (IR3_REG_CONST | IR3_REG_IMMED)))
516          return false;
517       break;
518    case 2: {
519       /* We need one source to either be demotable or an immediate. */
520       if (instr->srcs_count > 1) {
521          struct ra_interval *src0_interval =
522             (instr->srcs[0]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[0]->def->name] : NULL;
523          struct ra_interval *src1_interval =
524             (instr->srcs[1]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[1]->def->name] : NULL;
525          if (!(src0_interval && src0_interval->spill_def) &&
526              !(src1_interval && src1_interval->spill_def) &&
527              !(instr->srcs[0]->flags & IR3_REG_IMMED) &&
528              !(instr->srcs[1]->flags & IR3_REG_IMMED))
529             return false;
530       }
531       break;
532    }
533    case 3: {
534       struct ra_interval *src0_interval =
535          (instr->srcs[0]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[0]->def->name] : NULL;
536       struct ra_interval *src1_interval =
537          (instr->srcs[1]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[1]->def->name] : NULL;
538 
539       /* src1 cannot be shared */
540       if (src1_interval && !src1_interval->spill_def) {
541          /* Try to swap src0 and src1, similar to what copy prop does. */
542          if (!is_mad(instr->opc))
543             return false;
544 
545          if ((src0_interval && src0_interval->spill_def) ||
546              (instr->srcs[0]->flags & IR3_REG_IMMED)) {
547             struct ir3_register *src0 = instr->srcs[0];
548             instr->srcs[0] = instr->srcs[1];
549             instr->srcs[1] = src0;
550          } else {
551             return false;
552          }
553       }
554       break;
555    }
556    case 4: {
557       assert(instr->srcs[0]->flags & IR3_REG_SSA);
558       struct ra_interval *src_interval = &ctx->intervals[instr->srcs[0]->def->name];
559       if (!src_interval->spill_def)
560          return false;
561       break;
562    }
563 
564    default:
565       return false;
566    }
567 
568    d("demoting instruction");
569 
570    /* If the instruction is already not a scalar ALU instruction, we should've
571     * skipped reloading and just demoted sources directly, so we should never
572     * get here.
573     */
574    assert(instr->dsts[0]->flags & IR3_REG_SHARED);
575 
576    /* Now we actually demote the instruction */
577    ra_foreach_src (src, instr) {
578       assert(src->flags & IR3_REG_SHARED);
579       struct ra_interval *interval = &ctx->intervals[src->def->name];
580       if (interval->spill_def) {
581          src->def = interval->spill_def;
582          src->flags &= ~IR3_REG_SHARED;
583          interval->needs_reload = false;
584          if (interval->interval.inserted)
585             ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
586          while (interval->interval.parent)
587             interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
588          interval->src = false;
589       }
590    }
591 
592    struct ra_interval *dst_interval = &ctx->intervals[instr->dsts[0]->name];
593    instr->dsts[0]->flags &= ~IR3_REG_SHARED;
594    ra_interval_init(dst_interval, instr->dsts[0]);
595    dst_interval->spill_def = instr->dsts[0];
596 
597    instr->flags |= IR3_INSTR_SHARED_SPILL;
598 
599    return true;
600 }
601 
602 /* Free up [start, start + size) by spilling live intervals.
603  */
604 static void
605 free_space(struct ra_ctx *ctx, physreg_t start, unsigned size)
606 {
607    struct ra_interval *interval = ra_ctx_search_right(ctx, start);
608    while (interval && interval->physreg_start < start + size) {
609       struct ra_interval *next = ra_interval_next_or_null(interval);
610       spill_interval(ctx, interval);
611       interval = next;
612    }
613 }
614 
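/* Try to find a free register for reg without spilling: prefer the merge set's
 * preferred position, then try to reuse the register of a freed source for
 * ALU/SFU instructions, then fall back to any free gap. Returns ~0 on failure.
 */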
615 static physreg_t
616 get_reg(struct ra_ctx *ctx, struct ir3_register *reg, bool src)
617 {
618    if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
619       physreg_t preferred_reg =
620          reg->merge_set->preferred_reg + reg->merge_set_offset;
621       if (preferred_reg < reg_file_size(reg) &&
622           preferred_reg % reg_elem_size(reg) == 0 &&
623           get_reg_specified(ctx, reg, preferred_reg))
624          return preferred_reg;
625    }
626 
627    /* If this register is a subset of a merge set which we have not picked a
628     * register for, first try to allocate enough space for the entire merge
629     * set.
630     */
631    unsigned size = reg_size(reg);
632    if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
633        size < reg->merge_set->size) {
634       physreg_t best_reg = find_best_gap(ctx, reg, reg->merge_set->size,
635                                          reg->merge_set->alignment);
636       if (best_reg != (physreg_t)~0u) {
637          best_reg += reg->merge_set_offset;
638          return best_reg;
639       }
640    }
641 
642    /* For ALU and SFU instructions, if the src reg is available to pick, use it,
643     * because this doesn't introduce unnecessary dependencies, and it
644     * potentially avoids needing (ss) syncs for write after read hazards for
645     * SFU instructions:
646     */
647    if (!src && (is_sfu(reg->instr) || is_alu(reg->instr))) {
648       for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
649          struct ir3_register *src = reg->instr->srcs[i];
650          if (!ra_reg_is_src(src))
651             continue;
652          if ((src->flags & IR3_REG_SHARED) && reg_size(src) >= size) {
653             struct ra_interval *src_interval = &ctx->intervals[src->def->name];
654             physreg_t src_physreg = ra_interval_get_physreg(src_interval);
655             if (src_physreg % reg_elem_size(reg) == 0 &&
656                 src_physreg + size <= reg_file_size(reg) &&
657                 get_reg_specified(ctx, reg, src_physreg))
658                return src_physreg;
659          }
660       }
661    }
662 
663    return find_best_gap(ctx, reg, size, reg_elem_size(reg));
664 }
665 
666 /* The reload process is split in two, first we allocate a register to reload to
667  * for all sources that need a reload and then we actually execute the reload.
668  * This is to allow us to demote shared ALU instructions to non-shared whenever
669  * we would otherwise need to spill to reload, without leaving dangling unused
670  * reload mov's from previously processed sources. So, for example, we could
671  * need to reload both sources of an add, but after reloading the first source
672  * we realize that we would need to spill to reload the second source and we
673  * should demote the add instead, which means cancelling the first reload.
674  */
675 static void
676 reload_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
677            struct ir3_register *src)
678 {
679    struct ir3_register *reg = src->def;
680    struct ra_interval *interval = &ctx->intervals[reg->name];
681    unsigned size = reg_size(reg);
682 
683    physreg_t best_reg = get_reg(ctx, reg, true);
684 
685    if (best_reg == (physreg_t)~0u) {
686       if (try_demote_instruction(ctx, instr))
687          return;
688 
689       best_reg = find_best_spill_reg(ctx, reg, size, reg_elem_size(reg));
690       assert(best_reg != (physreg_t)~0u);
691 
692       free_space(ctx, best_reg, size);
693    }
694 
695    d("reload src %u physreg %u", reg->name, best_reg);
696    interval->physreg_start = best_reg;
697    interval->physreg_end = best_reg + size;
698    interval->needs_reload = true;
699    ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
700    interval->src = true;
701 }
702 
703 static void
704 reload_interval(struct ra_ctx *ctx, struct ir3_cursor cursor,
705                 struct ra_interval *interval)
706 {
707    struct ir3_register *def = interval->interval.reg;
708    struct ir3_instruction *mov = ir3_instr_create_at(cursor, OPC_MOV, 1, 1);
709    mov->flags |= IR3_INSTR_SHARED_SPILL;
710    unsigned flags = IR3_REG_SHARED | (def->flags & IR3_REG_HALF);
711    ir3_dst_create(mov, ra_physreg_to_num(interval->physreg_start, flags),
712                   flags)->wrmask = def->wrmask;
713    mov->repeat = reg_elems(def) - 1;
714    struct ir3_register *mov_src =
715       ir3_src_create(mov, INVALID_REG, IR3_REG_SSA | (def->flags & IR3_REG_HALF) |
716                      (mov->repeat ? IR3_REG_R : 0));
717    assert(interval->spill_def);
718    mov_src->def = interval->spill_def;
719    mov_src->wrmask = def->wrmask;
720    mov->cat1.src_type = mov->cat1.dst_type =
721       (def->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
722 }
723 
724 static void
725 reload_src_finalize(struct ra_ctx *ctx, struct ir3_instruction *instr,
726                     struct ir3_register *src)
727 {
728    struct ir3_register *reg = src->def;
729    struct ra_interval *interval = &ctx->intervals[reg->name];
730 
731    if (!interval->needs_reload)
732       return;
733 
734    reload_interval(ctx, ir3_before_instr(instr), interval);
735 
736    interval->needs_reload = false;
737 }
738 
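/* Whether an instruction can directly consume the non-shared spill def of a
 * source instead of requiring the source to be reloaded into a shared reg.
 */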
739 static bool
740 can_demote_src(struct ir3_instruction *instr)
741 {
742    switch (instr->opc) {
743    case OPC_SCAN_MACRO:
744    case OPC_META_COLLECT:
745       return false;
746    case OPC_MOV:
747       /* non-shared -> shared floating-point conversions and
748        * 8-bit sign extension don't work.
749        */
750       return (!(instr->dsts[0]->flags & IR3_REG_SHARED) ||
751               !((full_type(instr->cat1.src_type) == TYPE_F32 ||
752                  full_type(instr->cat1.dst_type) == TYPE_F32) ||
753                 (instr->cat1.src_type == TYPE_U8 &&
754                  full_type(instr->cat1.dst_type) == TYPE_S32)));
755    default:
756       return (!is_alu(instr) && !is_sfu(instr)) ||
757          !(instr->dsts[0]->flags & IR3_REG_SHARED);
758    }
759 }
760 
761 /* Ensure that this source is never spilled while reloading other sources.
762  */
763 static void
764 mark_src(struct ra_ctx *ctx, struct ir3_register *src)
765 {
766    if (!(src->flags & IR3_REG_SHARED))
767       return;
768 
769    struct ra_interval *interval = &ctx->intervals[src->def->name];
770 
771    if (interval->interval.inserted) {
772       while (interval->interval.parent)
773          interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
774 
775       interval->src = true;
776    }
777 }
778 
779 static void
780 ensure_src_live(struct ra_ctx *ctx, struct ir3_instruction *instr,
781                 struct ir3_register *src)
782 {
783    if (!(src->flags & IR3_REG_SHARED))
784       return;
785 
786    struct ra_interval *interval = &ctx->intervals[src->def->name];
787 
788    if (!interval->interval.inserted) {
789       /* In some cases we cannot demote shared reg sources to non-shared regs,
790        * in which case we have to reload them instead.
791        */
792       assert(interval->spill_def);
793       if (!can_demote_src(instr)) {
794          reload_src(ctx, instr, src);
795       } else {
796          if (instr->opc == OPC_META_PARALLEL_COPY) {
797             /* Stash away the original def to use later in case we actually have
798              * to insert a reload.
799              */
800             _mesa_hash_table_insert(ctx->pcopy_src_map, src, src->def);
801          }
802          src->def = interval->spill_def;
803          src->flags &= ~IR3_REG_SHARED;
804       }
805    }
806 }
807 
808 static void
809 assign_src(struct ra_ctx *ctx, struct ir3_register *src)
810 {
811    if (!(src->flags & IR3_REG_SHARED))
812       return;
813 
814    struct ra_interval *interval = &ctx->intervals[src->def->name];
815    assert(interval->interval.inserted);
816    src->num = ra_physreg_to_num(ra_interval_get_physreg(interval), src->flags);
817 
818    if ((src->flags & IR3_REG_FIRST_KILL) &&
819        !interval->interval.parent &&
820        rb_tree_is_empty(&interval->interval.children))
821       ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
822 
823    while (interval->interval.parent)
824       interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
825 
826    interval->src = false;
827 }
828 
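/* A collect is non-trivial if at least one source isn't already located at its
 * offset within the destination, so that RA will have to insert moves.
 */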
829 static bool
830 is_nontrivial_collect(struct ir3_instruction *collect)
831 {
832    if (collect->opc != OPC_META_COLLECT) {
833       return false;
834    }
835 
836    struct ir3_register *dst = collect->dsts[0];
837 
838    foreach_src_n (src, src_n, collect) {
839       if (src->num != dst->num + src_n) {
840          return true;
841       }
842    }
843 
844    return false;
845 }
846 
847 static void
848 handle_dst(struct ra_ctx *ctx, struct ir3_instruction *instr,
849            struct ir3_register *dst)
850 {
851    if (!(dst->flags & IR3_REG_SHARED))
852       return;
853 
854    struct ra_interval *interval = &ctx->intervals[dst->name];
855    ra_interval_init(interval, dst);
856    interval->spill_def = NULL;
857 
858    if (dst->tied) {
859       struct ir3_register *tied_def = dst->tied->def;
860       struct ra_interval *tied_interval = &ctx->intervals[tied_def->name];
861       if ((dst->tied->flags & IR3_REG_KILL) &&
862           !tied_interval->interval.parent &&
863           rb_tree_is_empty(&tied_interval->interval.children)) {
864          dst->num = dst->tied->num;
865          interval->physreg_start = tied_interval->physreg_start;
866          interval->physreg_end = tied_interval->physreg_end;
867          ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
868          return;
869       }
870    }
871 
872    physreg_t physreg = get_reg(ctx, dst, false);
873    if (physreg == (physreg_t) ~0u) {
874       if (try_demote_instruction(ctx, instr))
875          return;
876 
877       unsigned size = reg_size(dst);
878       physreg = find_best_spill_reg(ctx, dst, size, reg_elem_size(dst));
879       assert(physreg != (physreg_t)~0u);
880       free_space(ctx, physreg, size);
881    }
882 
883    dst->num = ra_physreg_to_num(physreg, dst->flags);
884 
885    /* Non-trivial collects (i.e., ones that will introduce moves because the
886     * sources don't line up with the destination) may cause source intervals to
887     * get implicitly moved when they are inserted as children of the destination
888     * interval. Since we don't support moving intervals in shared RA, this may
889     * cause illegal register allocations. Prevent this by creating a new
890     * top-level interval for the destination so that the source intervals will
891     * be left alone.
892     */
893    if (is_nontrivial_collect(instr)) {
894       dst->merge_set = NULL;
895       dst->interval_start = ctx->live->interval_offset;
896       dst->interval_end = dst->interval_start + reg_size(dst);
897       ctx->live->interval_offset = dst->interval_end;
898    }
899 
900    ra_update_affinity(reg_file_size(dst), dst, physreg);
901    interval->physreg_start = physreg;
902    interval->physreg_end = physreg + reg_size(dst);
903    ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
904    d("insert dst %u physreg %u", dst->name, physreg);
905 
906    if (dst->tied) {
907       struct ir3_instruction *mov = ir3_instr_create_at(
908          ir3_before_instr(instr), OPC_META_PARALLEL_COPY, 1, 1);
909       unsigned flags = IR3_REG_SHARED | (dst->flags & IR3_REG_HALF);
910       ir3_dst_create(mov, dst->num, flags)->wrmask = dst->wrmask;
911       ir3_src_create(mov, dst->tied->num, flags)->wrmask = dst->wrmask;
912       mov->cat1.src_type = mov->cat1.dst_type =
913          (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
914       dst->tied->num = dst->num;
915    }
916 }
917 
918 static void
919 handle_src_late(struct ra_ctx *ctx, struct ir3_instruction *instr,
920                 struct ir3_register *src)
921 {
922    if (!(src->flags & IR3_REG_SHARED))
923       return;
924 
925    struct ra_interval *interval = &ctx->intervals[src->def->name];
926    reload_src_finalize(ctx, instr, src);
927 
928    /* Remove killed sources that have to be killed late due to being merged with
929     * other defs.
930     */
931    if (!(src->flags & IR3_REG_KILL))
932       return;
933 
934    if (interval->interval.inserted)
935       ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
936 }
937 
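/* Process a generic instruction: mark sources so they can't be spilled, make
 * sure spilled sources are live again (or demoted), assign sources, allocate
 * destinations, and finally materialize any pending reloads.
 */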
938 static void
939 handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
940 {
941    ra_foreach_src (src, instr)
942       mark_src(ctx, src);
943 
944    ra_foreach_src (src, instr)
945       ensure_src_live(ctx, instr, src);
946 
947    ra_foreach_src_rev (src, instr)
948       assign_src(ctx, src);
949 
950    ra_foreach_dst (dst, instr)
951       handle_dst(ctx, instr, dst);
952 
953    ra_foreach_src (src, instr)
954       handle_src_late(ctx, instr, src);
955 }
956 
957 static void
958 handle_split(struct ra_ctx *ctx, struct ir3_instruction *split)
959 {
960    struct ir3_register *src = split->srcs[0];
961    struct ir3_register *dst = split->dsts[0];
962 
963    if (!(dst->flags & IR3_REG_SHARED))
964       return;
965 
966    if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
967       handle_normal_instr(ctx, split);
968       return;
969    }
970 
971    struct ra_interval *src_interval = &ctx->intervals[src->def->name];
972    struct ra_interval *dst_interval = &ctx->intervals[dst->name];
973 
974    ra_interval_init(dst_interval, dst);
975    dst_interval->spill_def = NULL;
976 
977    if (src_interval->spill_def) {
978       struct ir3_instruction *spill_split =
979          ir3_instr_create_at(ir3_after_instr(split), OPC_META_SPLIT, 1, 1);
980       struct ir3_register *dst = __ssa_dst(spill_split);
981       struct ir3_register *src =
982          ir3_src_create(spill_split, INVALID_REG, IR3_REG_SSA);
983       src->def = src_interval->spill_def;
984       src->wrmask = src_interval->spill_def->wrmask;
985       spill_split->split.off = split->split.off;
986       dst_interval->spill_def = dst;
987       list_del(&split->node);
988       return;
989    }
990 
991    dst_interval->physreg_start =
992       src_interval->physreg_start + dst->merge_set_offset -
993       src->def->merge_set_offset;
994    dst_interval->physreg_end = dst_interval->physreg_start + reg_size(dst);
995    ir3_reg_interval_insert(&ctx->reg_ctx, &dst_interval->interval);
996    src->num = ra_interval_get_num(src_interval);
997    dst->num = ra_interval_get_num(dst_interval);
998    d("insert dst %u physreg %u", dst->name, dst_interval->physreg_start);
999 
1000    if (src->flags & IR3_REG_KILL)
1001       ir3_reg_interval_remove(&ctx->reg_ctx, &src_interval->interval);
1002 }
1003 
1004 static void
1005 handle_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
1006 {
1007    struct ir3_register *dst = phi->dsts[0];
1008 
1009    if (!(dst->flags & IR3_REG_SHARED))
1010       return;
1011 
1012    struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1013    ra_interval_init(dst_interval, dst);
1014 
1015    /* In some rare cases, it's possible to have a phi node with a physical-only
1016     * source. Here's a contrived example:
1017     *
1018     * loop {
1019     *    if non-uniform {
1020     *       if uniform {
1021     *          x_1 = ...;
1022     *          continue;
1023     *       }
1024     *       x_2 = ...;
1025     *    } else {
1026     *       break;
1027     *    }
1028     *    // continue block
1029     *    x_3 = phi(x_1, x_2)
1030     * }
1031     *
1032     * Assuming x_1 and x_2 are uniform, x_3 will also be uniform, because all
1033     * threads that stay in the loop take the same branch to the continue block,
1034     * however execution may fall through from the assignment to x_2 to the
1035     * break statement because the outer if is non-uniform, and then it will fall
1036     * through again to the continue block, so if x_3 is to be in a shared reg
1037     * then the phi needs an extra source pointing to the break statement, which
1038     * itself needs a phi node:
1039     *
1040     * loop {
1041     *    if non-uniform {
1042     *       if uniform {
1043     *          x_1 = ...;
1044     *          continue;
1045     *       }
1046     *       x_2 = ...;
1047     *    } else {
1048     *       x_4 = phi(undef, x_2)
1049     *       break;
1050     *    }
1051     *    // continue block
1052     *    x_3 = phi(x_1, x_2, x_4)
1053     * }
1054     */
1055 
1056    /* phi nodes are special because we cannot spill them normally, instead we
1057     * have to spill the parallel copies that their sources point to and make the
1058     * entire phi not shared anymore.
1059     */
1060 
1061    physreg_t physreg = get_reg(ctx, dst, false);
1062    if (physreg == (physreg_t) ~0u) {
1063       d("spilling phi destination");
1064       dst->flags &= ~IR3_REG_SHARED;
1065       dst_interval->spill_def = dst;
1066       phi->flags |= IR3_INSTR_SHARED_SPILL;
1067 
1068       foreach_src (src, phi) {
1069          src->flags &= ~IR3_REG_SHARED;
1070          if (src->def)
1071             src->def->flags &= ~IR3_REG_SHARED;
1072       }
1073 
1074       return;
1075    }
1076 
1077    dst->num = ra_physreg_to_num(physreg, dst->flags);
1078    dst_interval->spill_def = NULL;
1079    dst_interval->physreg_start = physreg;
1080    dst_interval->physreg_end = physreg + reg_size(dst);
1081    ir3_reg_interval_insert(&ctx->reg_ctx, &dst_interval->interval);
1082 
1083    ra_foreach_src_n (src, i, phi) {
1084       /* We assume that any phis with non-logical sources aren't promoted. */
1085       assert(i < phi->block->predecessors_count);
1086       src->num = dst->num;
1087       src->def->num = dst->num;
1088    }
1089 }
1090 
1091 static void
1092 handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *pcopy)
1093 {
1094    /* For parallel copies, we only handle the source. The destination is handled
1095     * later when processing phi nodes.
1096     */
1097 
1098    ra_foreach_src (src, pcopy)
1099       mark_src(ctx, src);
1100 
1101    ra_foreach_src (src, pcopy)
1102       ensure_src_live(ctx, pcopy, src);
1103 
1104    ra_foreach_src_rev (src, pcopy)
1105       assign_src(ctx, src);
1106 
1107    ra_foreach_src (src, pcopy)
1108       handle_src_late(ctx, pcopy, src);
1109 }
1110 
1111 static void
1112 handle_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1113 {
1114    instr->flags &= ~IR3_INSTR_SHARED_SPILL;
1115 
1116    switch (instr->opc) {
1117    case OPC_META_SPLIT:
1118       handle_split(ctx, instr);
1119       break;
1120    case OPC_META_PHI:
1121       handle_phi(ctx, instr);
1122       break;
1123    case OPC_META_PARALLEL_COPY:
1124       handle_pcopy(ctx, instr);
1125       break;
1126    default:
1127       handle_normal_instr(ctx, instr);
1128    }
1129 }
1130 
1131 /* If we define a value outside a loop, use it inside the loop, and then spill
1132  * it later inside the same loop, we would lose the value, so we have to
1133  * reload it. The reload has to happen after any parallel copy instruction, when
1134  * the live shared registers equal the live-in of the backedge. lower_pcopy()
1135  * will then move any non-shared parallel copies down past the reload.
1136  */
1137 static void
1138 reload_live_outs(struct ra_ctx *ctx, struct ir3_block *block)
1139 {
1140    struct ra_block_state *state = &ctx->blocks[block->index];
1141    unsigned name;
1142    BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) {
1143       struct ir3_register *reg = ctx->live->definitions[name];
1144 
1145       struct ra_interval *interval = &ctx->intervals[name];
1146       if (!interval->interval.inserted) {
1147          d("reloading %d at end of backedge", reg->name);
1148 
1149          /* When this interval was spilled inside the loop, we probably chose a
1150           * different physreg for it than the original physreg when it was
1151           * defined outside the loop. Restore the original physreg so that we
1152           * spill it correctly.
1153           */
1154          unsigned size = interval->physreg_end - interval->physreg_start;
1155          interval->physreg_start = interval->physreg_start_orig;
1156          interval->physreg_end = interval->physreg_start + size;
1157 
1158          reload_interval(ctx, ir3_before_terminator(block), interval);
1159       }
1160    }
1161 }
1162 
1163 static void
1164 record_pred_live_out(struct ra_ctx *ctx,
1165                      struct ra_interval *interval,
1166                      struct ir3_block *pred)
1167 {
1168    struct ra_block_state *state = &ctx->blocks[pred->index];
1169 
1170    struct ir3_register *def = interval->interval.reg;
1171    BITSET_SET(state->live_out, def->name);
1172 
1173    rb_tree_foreach (struct ra_interval, child,
1174                     &interval->interval.children, interval.node) {
1175       record_pred_live_out(ctx, child, pred);
1176    }
1177 }
1178 
1179 static void
1180 record_pred_live_outs(struct ra_ctx *ctx, struct ir3_block *block)
1181 {
1182    for (unsigned i = 0; i < block->predecessors_count; i++) {
1183       struct ir3_block *pred = block->predecessors[i];
1184       struct ra_block_state *state = &ctx->blocks[pred->index];
1185       if (state->visited)
1186          continue;
1187 
1188       state->live_out = rzalloc_array(NULL, BITSET_WORD,
1189                                       BITSET_WORDS(ctx->live->definitions_count));
1190 
1191 
1192       rb_tree_foreach (struct ra_interval, interval,
1193                        &ctx->reg_ctx.intervals, interval.node) {
1194          record_pred_live_out(ctx, interval, pred);
1195       }
1196    }
1197 }
1198 
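/* Allocate shared registers for a single block: rebuild intervals for the
 * live-in values, then process each instruction in order.
 */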
1199 static void
1200 handle_block(struct ra_ctx *ctx, struct ir3_block *block)
1201 {
1202    ra_ctx_reset_block(ctx);
1203 
1204    unsigned name;
1205    BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
1206                        ctx->live->definitions_count) {
1207       struct ir3_register *def = ctx->live->definitions[name];
1208       struct ra_interval *interval = &ctx->intervals[name];
1209 
1210       /* Non-shared definitions may still be definitions we spilled by demoting
1211        * them, so we still need to initialize the interval. But we shouldn't
1212        * make these intervals live.
1213        */
1214       ra_interval_init(interval, def);
1215 
1216       if ((def->flags & IR3_REG_SHARED) && !interval->spill_def) {
1217          ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
1218       }
1219    }
1220 
1221    if (RA_DEBUG) {
1222       d("after live-in block %u:\n", block->index);
1223       ra_ctx_dump(ctx);
1224    }
1225 
1226    if (block->predecessors_count > 1)
1227       record_pred_live_outs(ctx, block);
1228 
1229    foreach_instr_safe (instr, &block->instr_list) {
1230       di(instr, "processing");
1231 
1232       handle_instr(ctx, instr);
1233 
1234       if (RA_DEBUG)
1235          ra_ctx_dump(ctx);
1236    }
1237 
1238    if (block->successors[0]) {
1239       struct ra_block_state *state = &ctx->blocks[block->successors[0]->index];
1240 
1241       if (state->visited) {
1242          assert(!block->successors[1]);
1243 
1244          reload_live_outs(ctx, block);
1245       }
1246    }
1247 
1248    ctx->blocks[block->index].visited = true;
1249 }
1250 
1251 static void
1252 lower_pcopy(struct ir3 *ir, struct ra_ctx *ctx)
1253 {
1254    foreach_block (block, &ir->block_list) {
1255       foreach_instr_safe (instr, &block->instr_list) {
1256          /* At this point due to spilling there may be parallel copies from
1257           * shared to non-shared registers and vice versa. Lowering these after
1258           * RA may produce cycles involving shared and non-shared registers,
1259           * which would need to be resolved by swapping a shared and non-shared
1260           * register which is something we can't handle. However by lowering
1261           * these to moves now, we can make sure that cycles only involve
1262           * non-shared registers. To avoid illegally moving a shared register
1263           * read or write across the parallel copy, which may have other
1264           * conflicting reads/writes if there's a cycle, we need to move copies
1265           * from non-shared to shared below the shared copies, and we need to
1266           * move copies from shared to non-shared above them. So, we have the
1267           * following order:
1268           *
1269           * 1. shared->non-shared copies (spills)
1270           * 2. shared->shared copies (one parallel copy as there may be cycles)
1271           * 3. non-shared->shared copies (reloads)
1272           * 4. non-shared->non-shared copies
1273           *
1274           * We split out the non-shared->non-shared copies as a separate step.
1275           */
1276          if (instr->opc == OPC_META_PARALLEL_COPY) {
1277             for (unsigned i = 0; i < instr->srcs_count; i++) {
1278                if ((instr->srcs[i]->flags & IR3_REG_SHARED) &&
1279                    !(instr->dsts[i]->flags & IR3_REG_SHARED)) {
1280                   /* shared->non-shared. Create a spill move and rewrite the
1281                    * source to be the destination of the move (so that the
1282                    * original shared->non-shared copy becomes a
1283                    * non-shared->non-shared copy).
1284                    */
1285                   struct ir3_instruction *mov = ir3_instr_create_at(
1286                      ir3_before_instr(instr), OPC_MOV, 1, 1);
1287                   mov->flags |= IR3_INSTR_SHARED_SPILL;
1288                   struct ir3_register *dst =
1289                      ir3_dst_create(mov, INVALID_REG, instr->dsts[i]->flags);
1290                   dst->wrmask = instr->dsts[i]->wrmask;
1291                   dst->instr = mov;
1292                   mov->repeat = reg_elems(mov->dsts[0]) - 1;
1293                   struct ir3_register *src =
1294                      ir3_src_create(mov, instr->srcs[i]->num,
1295                                     instr->srcs[i]->flags |
1296                                     (mov->repeat ? IR3_REG_R : 0));
1297                   src->wrmask = instr->srcs[i]->wrmask;
1298                   mov->cat1.dst_type = mov->cat1.src_type =
1299                      (mov->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
1300                   instr->srcs[i]->flags = mov->dsts[0]->flags;
1301                   instr->srcs[i]->def = mov->dsts[0];
1302                }
1303             }
1304 
1305             for (unsigned i = 0; i < instr->dsts_count;) {
1306                if ((instr->dsts[i]->flags & IR3_REG_SHARED) &&
1307                    (instr->srcs[i]->flags & IR3_REG_SSA) &&
1308                    !(instr->srcs[i]->flags & IR3_REG_SHARED)) {
1309                   /* non-shared->shared. Create a reload move.
1310                    */
1311                   struct ir3_instruction *mov =
1312                      ir3_instr_create_at(ir3_after_instr(instr), OPC_MOV, 1, 1);
1313                   mov->flags |= IR3_INSTR_SHARED_SPILL;
1314                   struct ir3_register *dst =
1315                      ir3_dst_create(mov, instr->dsts[i]->num,
1316                                     instr->dsts[i]->flags);
1317                   dst->instr = mov;
1318                   dst->wrmask = instr->dsts[i]->wrmask;
1319                   mov->repeat = reg_elems(mov->dsts[0]) - 1;
1320                   struct ir3_register *src =
1321                      ir3_src_create(mov, INVALID_REG, instr->srcs[i]->flags |
1322                                     (mov->repeat ? IR3_REG_R : 0));
1323                   src->def = instr->srcs[i]->def;
1324                   src->wrmask = instr->srcs[i]->wrmask;
1325                   mov->cat1.dst_type = mov->cat1.src_type =
1326                      (mov->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
1327 
1328                   /* When we spill a parallel copy source, we lose the
1329                    * information of where it originally points to since we make
1330                    * it point to the spill def. If we later decide not to also
1331                    * spill the phi associated with it, we have to restore it
1332                    * here using the stashed original source so that RA
1333                    * validation can check that we did the correct thing.
1334                    *
1335                    * Because SSA-ness goes away after validation, this is really
1336                    * just about validation.
1337                    */
1338                   struct ir3_block *succ = block->successors[0];
1339                   unsigned pred_idx = ir3_block_get_pred_index(succ, block);
1340                   foreach_instr (phi, &succ->instr_list) {
1341                      if (phi->opc != OPC_META_PHI)
1342                         break;
1343 
1344                      if (phi->srcs[pred_idx]->def == instr->dsts[i]) {
1345                         struct ir3_register *def =
1346                            _mesa_hash_table_search(ctx->pcopy_src_map,
1347                                                    instr->srcs[i])->data;
1348                         phi->srcs[pred_idx]->def = def;
1349                         break;
1350                      }
1351                   }
1352 
1353                   instr->srcs[i] = instr->srcs[instr->srcs_count - 1];
1354                   instr->dsts[i] = instr->dsts[instr->dsts_count - 1];
1355                   instr->srcs_count--;
1356                   instr->dsts_count--;
1357                   continue;
1358                }
1359 
1360                i++;
1361             }
1362 
1363             /* Move any non-shared copies to a separate parallel copy
1364              * instruction right at the end of the block, after any reloads. At
1365              * this point all copies should be {shared,immediate}->shared or
1366              * {non-shared,immediate}->non-shared.
1367              */
1368             unsigned non_shared_copies = 0;
1369             for (unsigned i = 0; i < instr->dsts_count; i++) {
1370                if (!(instr->dsts[i]->flags & IR3_REG_SHARED))
1371                   non_shared_copies++;
1372             }
1373 
1374             if (non_shared_copies != 0) {
1375                struct ir3_instruction *pcopy = ir3_instr_create_at(
1376                   ir3_before_terminator(block), OPC_META_PARALLEL_COPY,
1377                   non_shared_copies, non_shared_copies);
1378 
1379                unsigned j = 0;
1380                for (unsigned i = 0; i < instr->dsts_count;) {
1381                   if (!(instr->dsts[i]->flags & IR3_REG_SHARED)) {
1382                      pcopy->dsts[j] = instr->dsts[i];
1383                      pcopy->srcs[j] = instr->srcs[i];
1384                      pcopy->dsts[j]->instr = pcopy;
1385                      instr->srcs[i] = instr->srcs[instr->srcs_count - 1];
1386                      instr->dsts[i] = instr->dsts[instr->dsts_count - 1];
1387                      instr->srcs_count--;
1388                      instr->dsts_count--;
1389                      j++;
1390                      continue;
1391                   }
1392                   i++;
1393                }
1394 
1395                pcopy->srcs_count = pcopy->dsts_count = j;
1396                if (instr->dsts_count == 0)
1397                   list_del(&instr->node);
1398             }
1399          }
1400       }
1401    }
1402 }
1403 
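/* Now that shared registers have fixed assignments, drop their SSA information
 * so later passes treat them like any other fixed register.
 */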
1404 static void
1405 finalize(struct ir3 *ir)
1406 {
1407    foreach_block (block, &ir->block_list) {
1408       foreach_instr (instr, &block->instr_list) {
1409          for (unsigned i = 0; i < instr->dsts_count; i++) {
1410             if (instr->dsts[i]->flags & IR3_REG_SHARED) {
1411                instr->dsts[i]->flags &= ~IR3_REG_SSA;
1412             }
1413          }
1414 
1415          for (unsigned i = 0; i < instr->srcs_count; i++) {
1416             if (instr->srcs[i]->flags & IR3_REG_SHARED) {
1417                instr->srcs[i]->flags &= ~IR3_REG_SSA;
1418                instr->srcs[i]->def = NULL;
1419             }
1420          }
1421       }
1422    }
1423 }
1424 
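/* Entry point: allocate all shared registers for the variant, lower parallel
 * copies involving spilled values, validate the result, and recompute liveness.
 */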
1425 void
1426 ir3_ra_shared(struct ir3_shader_variant *v, struct ir3_liveness **live_ptr)
1427 {
1428    struct ra_ctx ctx;
1429    struct ir3_liveness *live = *live_ptr;
1430 
1431    ra_ctx_init(&ctx);
1432    ctx.intervals = rzalloc_array(NULL, struct ra_interval,
1433                                  live->definitions_count);
1434    ctx.blocks = rzalloc_array(NULL, struct ra_block_state,
1435                               live->block_count);
1436    ctx.start = 0;
1437    ctx.live = live;
1438    ctx.pcopy_src_map = _mesa_pointer_hash_table_create(NULL);
1439 
1440    foreach_block (block, &v->ir->block_list) {
1441       handle_block(&ctx, block);
1442    }
1443 
1444    lower_pcopy(v->ir, &ctx);
1445 
1446    for (unsigned i = 0; i < live->block_count; i++) {
1447       if (ctx.blocks[i].live_out)
1448          ralloc_free(ctx.blocks[i].live_out);
1449    }
1450 
1451    ralloc_free(ctx.intervals);
1452    ralloc_free(ctx.pcopy_src_map);
1453    ralloc_free(ctx.blocks);
1454 
1455    ir3_ra_validate(v, RA_FULL_SIZE, RA_HALF_SIZE, live->block_count, true);
1456    finalize(v->ir);
1457 
1458    /* Recalculate liveness and register pressure now that additional values have
1459     * been added.
1460     * TODO we should only do this if any values have been spilled/reloaded.
1461     * Note: since we don't recreate the merge sets, we have to manually copy
1462     * interval_offset over to the new liveness struct.
1463     */
1464    unsigned interval_offset = live->interval_offset;
1465    void *live_mem_ctx = ralloc_parent(live);
1466    ralloc_free(live);
1467    *live_ptr = ir3_calc_liveness(live_mem_ctx, v->ir);
1468    (*live_ptr)->interval_offset = interval_offset;
1469 }
1470 
1471