1 /*
2 * Copyright © 2021 Valve Corporation
3 * Copyright © 2014 Rob Clark <robclark@freedesktop.org>
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "ir3_ra.h"
8 #include "ir3_shader.h"
9
10 #include "util/u_math.h"
11
12 /* Allocating shared registers can pose a challenge, because their live
13 * intervals use the physical CFG which has extra edges inserted that are
14 * pretty much always critical edges. This causes problems with phi nodes,
15 * because copies for phi nodes have to happen "along the edge," and similarly
16 * causes problems when reunifying values that have had their live range split.
17 * Problematic phi nodes should be relatively rare, so we ban them for now.
18 * The solution we choose for live-range splitting is to integrate spilling and
19 * register allocation and spill to vector registers rather than split a live
20 * range, which negates some of the advantages of SSA-based RA, but it isn't as
21 * bad as it seems because the conditions needed (vector shared registers, which
22 * only movmsk currently produces, or fixed registers which we don't do) are
23 * relatively rare. Spilling a shared register to a normal register is also much
24 * cheaper than spilling a vector register to private memory.
25 */
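/* A rough illustration of the approach (the register names below are invented,
 * not real ir3 disassembly): when the shared file runs out of space, a value is
 * "spilled" by copying it into an ordinary vector register and later "reloaded"
 * by copying it back, rather than splitting its live range within the shared
 * file:
 *
 *    mov  rN.x, sharedM.x    ; spill:  shared -> non-shared
 *    ...
 *    mov  sharedK.x, rN.x    ; reload: non-shared -> shared
 */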
26
27 struct ra_interval {
28 struct ir3_reg_interval interval;
29
30 struct rb_node physreg_node;
31 physreg_t physreg_start, physreg_end;
32
33 /* If this interval was spilled, the original physreg_start before spilling.
34 * Used when reloading live outs.
35 */
36 physreg_t physreg_start_orig;
37
38 /* Where the shared register is spilled to. If it had no uses when it was
39 * spilled, this may be the original defining instruction.
40 */
41 struct ir3_register *spill_def;
42
43 /* Whether this contains a source of the current instruction that can't be
44 * spilled.
45 */
46 bool src;
47
48 bool needs_reload;
49 };
50
51 struct ra_block_state {
52 bool visited;
53
54 /* For blocks whose successors are visited first (i.e. loop backedges), which
55 * values should be live at the end.
56 */
57 BITSET_WORD *live_out;
58 };
59
60 struct ra_ctx {
61 struct ir3_reg_ctx reg_ctx;
62
63 BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
64
65 struct rb_tree physreg_intervals;
66
67 struct ra_interval *intervals;
68
69 struct ir3_liveness *live;
70
71 struct hash_table *pcopy_src_map;
72
73 struct ra_block_state *blocks;
74
75 unsigned start;
76 };
77
78 static struct ra_interval *
79 ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
80 {
81 return rb_node_data(struct ra_interval, interval, interval);
82 }
83
84 static struct ra_interval *
85 rb_node_to_interval(struct rb_node *node)
86 {
87 return rb_node_data(struct ra_interval, node, physreg_node);
88 }
89
90 static const struct ra_interval *
91 rb_node_to_interval_const(const struct rb_node *node)
92 {
93 return rb_node_data(struct ra_interval, node, physreg_node);
94 }
95
96 static struct ra_interval *
97 ra_interval_next(struct ra_interval *interval)
98 {
99 struct rb_node *next = rb_node_next(&interval->physreg_node);
100 return next ? rb_node_to_interval(next) : NULL;
101 }
102
103 static struct ra_interval *
104 ra_interval_next_or_null(struct ra_interval *interval)
105 {
106 return interval ? ra_interval_next(interval) : NULL;
107 }
108
109 static int
110 ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
111 {
112 const struct ra_interval *a = rb_node_to_interval_const(_a);
113 const struct ra_interval *b = rb_node_to_interval_const(_b);
114 return b->physreg_start - a->physreg_start;
115 }
116
117 static int
118 ra_interval_cmp(const struct rb_node *node, const void *data)
119 {
120 physreg_t reg = *(const physreg_t *)data;
121 const struct ra_interval *interval = rb_node_to_interval_const(node);
122 if (interval->physreg_start > reg)
123 return -1;
124 else if (interval->physreg_end <= reg)
125 return 1;
126 else
127 return 0;
128 }
129
130 static struct ra_ctx *
131 ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx)
132 {
133 return rb_node_data(struct ra_ctx, ctx, reg_ctx);
134 }
135
136 static struct ra_interval *
137 ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
138 {
139 struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
140 return node ? rb_node_to_interval(node) : NULL;
141 }
142
143 /* Get the interval covering the reg, or the closest to the right if it
144 * doesn't exist.
145 */
146 static struct ra_interval *
147 ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
148 {
149 struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
150 if (!interval) {
151 return NULL;
152 } else if (interval->physreg_end > reg) {
153 return interval;
154 } else {
155 /* There is no interval covering reg, and ra_interval_search_sloppy()
156 * returned the closest range to the left, so the next interval to the
157 * right should be the closest to the right.
158 */
159 return ra_interval_next_or_null(interval);
160 }
161 }
162
163 static struct ra_interval *
164 ra_ctx_search_right(struct ra_ctx *ctx, physreg_t reg)
165 {
166 return ra_interval_search_right(&ctx->physreg_intervals, reg);
167 }
168
169 static void
170 interval_add(struct ir3_reg_ctx *reg_ctx, struct ir3_reg_interval *_interval)
171 {
172 struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
173 struct ra_ctx *ctx = ir3_reg_ctx_to_ctx(reg_ctx);
174
175 /* We can assume in this case that physreg_start/physreg_end is already
176 * initialized.
177 */
178 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
179 BITSET_CLEAR(ctx->available, i);
180 }
181
182 rb_tree_insert(&ctx->physreg_intervals, &interval->physreg_node,
183 ra_interval_insert_cmp);
184 }
185
186 static void
187 interval_delete(struct ir3_reg_ctx *reg_ctx, struct ir3_reg_interval *_interval)
188 {
189 struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
190 struct ra_ctx *ctx = ir3_reg_ctx_to_ctx(reg_ctx);
191
192 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
193 BITSET_SET(ctx->available, i);
194 }
195
196 rb_tree_remove(&ctx->physreg_intervals, &interval->physreg_node);
197 }
198
199 static void
200 interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
201 struct ir3_reg_interval *_child)
202 {
203 struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
204 struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);
205
206 child->physreg_start =
207 parent->physreg_start + (child->interval.reg->interval_start -
208 parent->interval.reg->interval_start);
209 child->physreg_end =
210 child->physreg_start +
211 (child->interval.reg->interval_end - child->interval.reg->interval_start);
212
213 interval_add(ctx, _child);
214 }
215
216 static void
217 ra_ctx_init(struct ra_ctx *ctx)
218 {
219 ctx->reg_ctx.interval_add = interval_add;
220 ctx->reg_ctx.interval_delete = interval_delete;
221 ctx->reg_ctx.interval_readd = interval_readd;
222 }
223
224 static void
225 ra_ctx_reset_block(struct ra_ctx *ctx)
226 {
227 for (unsigned i = 0; i < RA_SHARED_SIZE; i++) {
228 BITSET_SET(ctx->available, i);
229 }
230
231 rb_tree_init(&ctx->reg_ctx.intervals);
232 rb_tree_init(&ctx->physreg_intervals);
233 }
234
235 static void
236 ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
237 {
238 ir3_reg_interval_init(&interval->interval, reg);
239 }
240
241 static physreg_t
242 ra_interval_get_physreg(const struct ra_interval *interval)
243 {
244 unsigned child_start = interval->interval.reg->interval_start;
245
246 while (interval->interval.parent) {
247 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
248 }
249
250 return interval->physreg_start +
251 (child_start - interval->interval.reg->interval_start);
252 }
253
254 static unsigned
255 ra_interval_get_num(const struct ra_interval *interval)
256 {
257 return ra_physreg_to_num(ra_interval_get_physreg(interval),
258 interval->interval.reg->flags);
259 }
260
261 static void
262 ra_interval_dump(struct log_stream *stream, struct ra_interval *interval)
263 {
264 mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start);
265
266 ir3_reg_interval_dump(stream, &interval->interval);
267 }
268
269 static void
270 ra_ctx_dump(struct ra_ctx *ctx)
271 {
272 struct log_stream *stream = mesa_log_streami();
273
274 mesa_log_stream_printf(stream, "shared:\n");
275 rb_tree_foreach (struct ra_interval, interval, &ctx->physreg_intervals,
276 physreg_node) {
277 ra_interval_dump(stream, interval);
278 }
279
280 unsigned start, end;
281 mesa_log_stream_printf(stream, "available:\n");
282 BITSET_FOREACH_RANGE (start, end, ctx->available, RA_SHARED_SIZE) {
283 mesa_log_stream_printf(stream, "%u-%u ", start, end);
284 }
285 mesa_log_stream_printf(stream, "\n");
286 mesa_log_stream_printf(stream, "start: %u\n", ctx->start);
287 }
288
289 static bool
290 get_reg_specified(struct ra_ctx *ctx, struct ir3_register *reg, physreg_t physreg)
291 {
292 for (unsigned i = 0; i < reg_size(reg); i++) {
293 if (!BITSET_TEST(ctx->available, physreg + i))
294 return false;
295 }
296
297 return true;
298 }
299
300 static unsigned
301 reg_file_size(struct ir3_register *reg)
302 {
303 if (reg->flags & IR3_REG_HALF)
304 return RA_SHARED_HALF_SIZE;
305 else
306 return RA_SHARED_SIZE;
307 }
308
309 static physreg_t
310 find_best_gap(struct ra_ctx *ctx, struct ir3_register *dst, unsigned size,
311 unsigned align)
312 {
313 unsigned file_size = reg_file_size(dst);
314
315 /* This can happen if we create a very large merge set. Just bail out in that
316 * case.
317 */
318 if (size > file_size)
319 return (physreg_t) ~0;
320
321 unsigned start = ALIGN(ctx->start, align) % (file_size - size + align);
322 unsigned candidate = start;
323 do {
324 bool is_available = true;
325 for (unsigned i = 0; i < size; i++) {
326 if (!BITSET_TEST(ctx->available, candidate + i)) {
327 is_available = false;
328 break;
329 }
330 }
331
332 if (is_available) {
333 ctx->start = (candidate + size) % file_size;
334 return candidate;
335 }
336
337 candidate += align;
338 if (candidate + size > file_size)
339 candidate = 0;
340 } while (candidate != start);
341
342 return (physreg_t)~0;
343 }
344
345 static physreg_t
346 find_best_spill_reg(struct ra_ctx *ctx, struct ir3_register *reg,
347 unsigned size, unsigned align)
348 {
349 unsigned file_size = reg_file_size(reg);
350 unsigned min_cost = UINT_MAX;
351
352 unsigned start = ALIGN(ctx->start, align) % (file_size - size + align);
353 physreg_t candidate = start;
354 physreg_t best_reg = (physreg_t)~0;
355 do {
356 unsigned cost = 0;
357
358 /* Iterate through intervals we'd need to spill to use this reg. */
359 for (struct ra_interval *interval = ra_ctx_search_right(ctx, candidate);
360 interval && interval->physreg_start < candidate + size;
361 interval = ra_interval_next_or_null(interval)) {
362 /* We can't spill sources of the current instruction when reloading
363 * sources.
364 */
365 if (interval->src) {
366 cost = UINT_MAX;
367 break;
368 }
369
370 /* We prefer spilling intervals that already have been spilled, so we
371 * don't have to emit another mov.
372 */
373 if (!interval->spill_def)
374 cost += (interval->physreg_end - interval->physreg_start);
375 }
376
377 if (cost < min_cost) {
378 min_cost = cost;
379 best_reg = candidate;
380 }
381
382 candidate += align;
383 if (candidate + size > file_size)
384 candidate = 0;
385 } while (candidate != start);
386
387 return best_reg;
388 }
389
390 static struct ir3_register *
391 split(struct ir3_register *def, unsigned offset, struct ir3_instruction *before)
392 {
393 if (reg_elems(def) == 1) {
394 assert(offset == 0);
395 return def;
396 }
397
398 struct ir3_instruction *split =
399 ir3_instr_create_at(ir3_after_instr(before), OPC_META_SPLIT, 1, 1);
400 split->split.off = offset;
401 struct ir3_register *dst = __ssa_dst(split);
402 struct ir3_register *src =
403 ir3_src_create(split, INVALID_REG, def->flags & (IR3_REG_HALF | IR3_REG_SSA));
404 src->wrmask = def->wrmask;
405 src->def = def;
406 return dst;
407 }
408
409 static struct ir3_register *
410 extract(struct ir3_register *parent_def, unsigned offset, unsigned elems,
411 struct ir3_instruction *before)
412 {
413 if (offset == 0 && elems == reg_elems(parent_def))
414 return parent_def;
415
416 if (elems == 1)
417 return split(parent_def, offset, before);
418
419 struct ir3_instruction *collect =
420 ir3_instr_create_at(ir3_after_instr(before), OPC_META_COLLECT, 1, elems);
421 struct ir3_register *dst = __ssa_dst(collect);
422 dst->flags |= parent_def->flags & IR3_REG_HALF;
423 dst->wrmask = MASK(elems);
424
425 for (unsigned i = 0; i < elems; i++) {
426 ir3_src_create(collect, INVALID_REG,
427 parent_def->flags & (IR3_REG_HALF | IR3_REG_SSA))->def =
428 split(parent_def, offset + i, before);
429 }
430
431 return dst;
432 }
433
434 static void
435 spill_interval_children(struct ra_interval *interval,
436 struct ir3_instruction *before)
437 {
438 rb_tree_foreach (struct ra_interval, child, &interval->interval.children,
439 interval.node) {
440 if (!child->spill_def) {
441 child->spill_def = extract(interval->spill_def,
442 (child->interval.reg->interval_start -
443 interval->interval.reg->interval_start) /
444 reg_elem_size(interval->interval.reg),
445 reg_elems(child->interval.reg), before);
446 child->physreg_start_orig = child->physreg_start;
447 }
448 spill_interval_children(child, before);
449 }
450 }
451
452 static void
453 spill_interval(struct ra_ctx *ctx, struct ra_interval *interval)
454 {
455 struct ir3_instruction *before = interval->interval.reg->instr;
456
457 d("spilling ssa_%u:%u", before->serialno, interval->interval.reg->name);
458
459 if (!interval->spill_def) {
460 /* If this is a phi node or input, we need to insert the demotion to a
461 * regular register after the last phi or input in the block.
462 */
463 if (before->opc == OPC_META_PHI ||
464 before->opc == OPC_META_INPUT) {
465 struct ir3_block *block = before->block;
466 struct ir3_instruction *last_phi_input = NULL;
467 foreach_instr_from (instr, before, &block->instr_list) {
468 if (instr->opc != before->opc)
469 break;
470 last_phi_input = instr;
471 }
472 before = last_phi_input;
473 }
474
475 struct ir3_instruction *mov =
476 ir3_instr_create_at(ir3_after_instr(before), OPC_MOV, 1, 1);
477 mov->flags |= IR3_INSTR_SHARED_SPILL;
478 struct ir3_register *dst = __ssa_dst(mov);
479 dst->flags |= (interval->interval.reg->flags & IR3_REG_HALF);
480 dst->wrmask = interval->interval.reg->wrmask;
481 mov->repeat = reg_elems(dst) - 1;
482 ir3_src_create(mov, interval->interval.reg->num,
483 IR3_REG_SHARED | (mov->repeat ? IR3_REG_R : 0) |
484 (interval->interval.reg->flags & IR3_REG_HALF))->wrmask =
485 interval->interval.reg->wrmask;
486 mov->cat1.src_type = mov->cat1.dst_type =
487 (interval->interval.reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
488
489 interval->spill_def = dst;
490 interval->physreg_start_orig = interval->physreg_start;
491 }
492
493 spill_interval_children(interval, interval->spill_def->instr);
494
495 ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
496 }
497
498 /* Try to demote a scalar ALU instruction to a normal ALU instruction, using the
499 * spilled sources. We have to take into account restrictions on the number of
500 * shared sources that only exist for normal ALU instructions.
501 */
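/* For example (illustrative only, not actual ir3 syntax): if both sources of a
 * scalar add already have spill defs in normal registers, we can rewrite
 *
 *    add.u sharedD.x, sharedA.x, sharedB.x
 * as
 *    add.u rD.x, rA.x, rB.x
 *
 * reading the spilled copies directly, so no reload movs are needed at all.
 */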
502 static bool
503 try_demote_instruction(struct ra_ctx *ctx, struct ir3_instruction *instr)
504 {
505 /* First, check restrictions. */
506 switch (opc_cat(instr->opc)) {
507 case 1:
508 /* MOVMSK is special and can't be demoted. It also has no sources so must
509 * go before the check below.
510 */
511 if (instr->opc == OPC_MOVMSK)
512 return false;
513
514 assert(instr->srcs_count >= 1);
515 if (!(instr->srcs[0]->flags & (IR3_REG_CONST | IR3_REG_IMMED)))
516 return false;
517 break;
518 case 2: {
519 /* We need one source to either be demotable or an immediate. */
520 if (instr->srcs_count > 1) {
521 struct ra_interval *src0_interval =
522 (instr->srcs[0]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[0]->def->name] : NULL;
523 struct ra_interval *src1_interval =
524 (instr->srcs[1]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[1]->def->name] : NULL;
525 if (!(src0_interval && src0_interval->spill_def) &&
526 !(src1_interval && src1_interval->spill_def) &&
527 !(instr->srcs[0]->flags & IR3_REG_IMMED) &&
528 !(instr->srcs[1]->flags & IR3_REG_IMMED))
529 return false;
530 }
531 break;
532 }
533 case 3: {
534 struct ra_interval *src0_interval =
535 (instr->srcs[0]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[0]->def->name] : NULL;
536 struct ra_interval *src1_interval =
537 (instr->srcs[1]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[1]->def->name] : NULL;
538
539 /* src1 cannot be shared */
540 if (src1_interval && !src1_interval->spill_def) {
541 /* Try to swap src0 and src1, similar to what copy prop does. */
542 if (!is_mad(instr->opc))
543 return false;
544
545 if ((src0_interval && src0_interval->spill_def) ||
546 (instr->srcs[0]->flags & IR3_REG_IMMED)) {
547 struct ir3_register *src0 = instr->srcs[0];
548 instr->srcs[0] = instr->srcs[1];
549 instr->srcs[1] = src0;
550 } else {
551 return false;
552 }
553 }
554 break;
555 }
556 case 4: {
557 assert(instr->srcs[0]->flags & IR3_REG_SSA);
558 struct ra_interval *src_interval = &ctx->intervals[instr->srcs[0]->def->name];
559 if (!src_interval->spill_def)
560 return false;
561 break;
562 }
563
564 default:
565 return false;
566 }
567
568 d("demoting instruction");
569
570 /* If the instruction is already not a scalar ALU instruction, we should've
571 * skipped reloading and just demoted sources directly, so we should never
572 * get here.
573 */
574 assert(instr->dsts[0]->flags & IR3_REG_SHARED);
575
576 /* Now we actually demote the instruction */
577 ra_foreach_src (src, instr) {
578 assert(src->flags & IR3_REG_SHARED);
579 struct ra_interval *interval = &ctx->intervals[src->def->name];
580 if (interval->spill_def) {
581 src->def = interval->spill_def;
582 src->flags &= ~IR3_REG_SHARED;
583 interval->needs_reload = false;
584 if (interval->interval.inserted)
585 ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
586 while (interval->interval.parent)
587 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
588 interval->src = false;
589 }
590 }
591
592 struct ra_interval *dst_interval = &ctx->intervals[instr->dsts[0]->name];
593 instr->dsts[0]->flags &= ~IR3_REG_SHARED;
594 ra_interval_init(dst_interval, instr->dsts[0]);
595 dst_interval->spill_def = instr->dsts[0];
596
597 instr->flags |= IR3_INSTR_SHARED_SPILL;
598
599 return true;
600 }
601
602 /* Free up [start, start + size) by spilling live intervals.
603 */
604 static void
605 free_space(struct ra_ctx *ctx, physreg_t start, unsigned size)
606 {
607 struct ra_interval *interval = ra_ctx_search_right(ctx, start);
608 while (interval && interval->physreg_start < start + size) {
609 struct ra_interval *next = ra_interval_next_or_null(interval);
610 spill_interval(ctx, interval);
611 interval = next;
612 }
613 }
614
615 static physreg_t
616 get_reg(struct ra_ctx *ctx, struct ir3_register *reg, bool src)
617 {
618 if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
619 physreg_t preferred_reg =
620 reg->merge_set->preferred_reg + reg->merge_set_offset;
621 if (preferred_reg < reg_file_size(reg) &&
622 preferred_reg % reg_elem_size(reg) == 0 &&
623 get_reg_specified(ctx, reg, preferred_reg))
624 return preferred_reg;
625 }
626
627 /* If this register is a subset of a merge set which we have not picked a
628 * register for, first try to allocate enough space for the entire merge
629 * set.
630 */
631 unsigned size = reg_size(reg);
632 if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
633 size < reg->merge_set->size) {
634 physreg_t best_reg = find_best_gap(ctx, reg, reg->merge_set->size,
635 reg->merge_set->alignment);
636 if (best_reg != (physreg_t)~0u) {
637 best_reg += reg->merge_set_offset;
638 return best_reg;
639 }
640 }
641
642 /* For ALU and SFU instructions, if the source's physreg is available, reuse it
643 * for the destination. This doesn't introduce unnecessary dependencies, and it
644 * potentially avoids needing (ss) syncs for write-after-read hazards with SFU
645 * instructions:
646 */
647 if (!src && (is_sfu(reg->instr) || is_alu(reg->instr))) {
648 for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
649 struct ir3_register *src = reg->instr->srcs[i];
650 if (!ra_reg_is_src(src))
651 continue;
652 if ((src->flags & IR3_REG_SHARED) && reg_size(src) >= size) {
653 struct ra_interval *src_interval = &ctx->intervals[src->def->name];
654 physreg_t src_physreg = ra_interval_get_physreg(src_interval);
655 if (src_physreg % reg_elem_size(reg) == 0 &&
656 src_physreg + size <= reg_file_size(reg) &&
657 get_reg_specified(ctx, reg, src_physreg))
658 return src_physreg;
659 }
660 }
661 }
662
663 return find_best_gap(ctx, reg, size, reg_elem_size(reg));
664 }
665
666 /* The reload process is split in two, first we allocate a register to reload to
667 * for all sources that need a reload and then we actually execute the reload.
668 * This is to allow us to demote shared ALU instructions to non-shared whenever
669 * we would otherwise need to spill to reload, without leaving dangling unused
670 * reload mov's from previously processed sources. So, for example, we could
671 * need to reload both sources of an add, but after reloading the first source
672 * we realize that we would need to spill to reload the second source and we
673 * should demote the add instead, which means cancelling the first reload.
674 */
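/* Hypothetical walk-through for a scalar add with both sources spilled:
 * reload_src() only picks a physreg and sets needs_reload for the first source;
 * if picking a physreg for the second source would itself require spilling, we
 * demote the add via try_demote_instruction() instead, and because the reload
 * movs are only emitted later in reload_src_finalize(), cancelling the first
 * reload is just a matter of clearing needs_reload.
 */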
675 static void
676 reload_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
677 struct ir3_register *src)
678 {
679 struct ir3_register *reg = src->def;
680 struct ra_interval *interval = &ctx->intervals[reg->name];
681 unsigned size = reg_size(reg);
682
683 physreg_t best_reg = get_reg(ctx, reg, true);
684
685 if (best_reg == (physreg_t)~0u) {
686 if (try_demote_instruction(ctx, instr))
687 return;
688
689 best_reg = find_best_spill_reg(ctx, reg, size, reg_elem_size(reg));
690 assert(best_reg != (physreg_t)~0u);
691
692 free_space(ctx, best_reg, size);
693 }
694
695 d("reload src %u physreg %u", reg->name, best_reg);
696 interval->physreg_start = best_reg;
697 interval->physreg_end = best_reg + size;
698 interval->needs_reload = true;
699 ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
700 interval->src = true;
701 }
702
703 static void
704 reload_interval(struct ra_ctx *ctx, struct ir3_cursor cursor,
705 struct ra_interval *interval)
706 {
707 struct ir3_register *def = interval->interval.reg;
708 struct ir3_instruction *mov = ir3_instr_create_at(cursor, OPC_MOV, 1, 1);
709 mov->flags |= IR3_INSTR_SHARED_SPILL;
710 unsigned flags = IR3_REG_SHARED | (def->flags & IR3_REG_HALF);
711 ir3_dst_create(mov, ra_physreg_to_num(interval->physreg_start, flags),
712 flags)->wrmask = def->wrmask;
713 mov->repeat = reg_elems(def) - 1;
714 struct ir3_register *mov_src =
715 ir3_src_create(mov, INVALID_REG, IR3_REG_SSA | (def->flags & IR3_REG_HALF) |
716 (mov->repeat ? IR3_REG_R : 0));
717 assert(interval->spill_def);
718 mov_src->def = interval->spill_def;
719 mov_src->wrmask = def->wrmask;
720 mov->cat1.src_type = mov->cat1.dst_type =
721 (def->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
722 }
723
724 static void
725 reload_src_finalize(struct ra_ctx *ctx, struct ir3_instruction *instr,
726 struct ir3_register *src)
727 {
728 struct ir3_register *reg = src->def;
729 struct ra_interval *interval = &ctx->intervals[reg->name];
730
731 if (!interval->needs_reload)
732 return;
733
734 reload_interval(ctx, ir3_before_instr(instr), interval);
735
736 interval->needs_reload = false;
737 }
738
739 static bool
740 can_demote_src(struct ir3_instruction *instr)
741 {
742 switch (instr->opc) {
743 case OPC_SCAN_MACRO:
744 case OPC_META_COLLECT:
745 return false;
746 case OPC_MOV:
747 /* non-shared -> shared floating-point conversions and
748 * 8-bit sign extension don't work.
749 */
750 return (!(instr->dsts[0]->flags & IR3_REG_SHARED) ||
751 !((full_type(instr->cat1.src_type) == TYPE_F32 ||
752 full_type(instr->cat1.dst_type) == TYPE_F32) ||
753 (instr->cat1.src_type == TYPE_U8 &&
754 full_type(instr->cat1.dst_type) == TYPE_S32)));
755 default:
756 return (!is_alu(instr) && !is_sfu(instr)) ||
757 !(instr->dsts[0]->flags & IR3_REG_SHARED);
758 }
759 }
760
761 /* Ensure that this source is never spilled while reloading other sources.
762 */
763 static void
764 mark_src(struct ra_ctx *ctx, struct ir3_register *src)
765 {
766 if (!(src->flags & IR3_REG_SHARED))
767 return;
768
769 struct ra_interval *interval = &ctx->intervals[src->def->name];
770
771 if (interval->interval.inserted) {
772 while (interval->interval.parent)
773 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
774
775 interval->src = true;
776 }
777 }
778
779 static void
780 ensure_src_live(struct ra_ctx *ctx, struct ir3_instruction *instr,
781 struct ir3_register *src)
782 {
783 if (!(src->flags & IR3_REG_SHARED))
784 return;
785
786 struct ra_interval *interval = &ctx->intervals[src->def->name];
787
788 if (!interval->interval.inserted) {
789 /* In some cases we cannot demote a shared reg source to a non-shared reg,
790 * in which case we have to reload it instead.
791 */
792 assert(interval->spill_def);
793 if (!can_demote_src(instr)) {
794 reload_src(ctx, instr, src);
795 } else {
796 if (instr->opc == OPC_META_PARALLEL_COPY) {
797 /* Stash away the original def to use later in case we actually have
798 * to insert a reload.
799 */
800 _mesa_hash_table_insert(ctx->pcopy_src_map, src, src->def);
801 }
802 src->def = interval->spill_def;
803 src->flags &= ~IR3_REG_SHARED;
804 }
805 }
806 }
807
808 static void
809 assign_src(struct ra_ctx *ctx, struct ir3_register *src)
810 {
811 if (!(src->flags & IR3_REG_SHARED))
812 return;
813
814 struct ra_interval *interval = &ctx->intervals[src->def->name];
815 assert(interval->interval.inserted);
816 src->num = ra_physreg_to_num(ra_interval_get_physreg(interval), src->flags);
817
818 if ((src->flags & IR3_REG_FIRST_KILL) &&
819 !interval->interval.parent &&
820 rb_tree_is_empty(&interval->interval.children))
821 ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
822
823 while (interval->interval.parent)
824 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
825
826 interval->src = false;
827 }
828
829 static bool
830 is_nontrivial_collect(struct ir3_instruction *collect)
831 {
832 if (collect->opc != OPC_META_COLLECT) {
833 return false;
834 }
835
836 struct ir3_register *dst = collect->dsts[0];
837
838 foreach_src_n (src, src_n, collect) {
839 if (src->num != dst->num + src_n) {
840 return true;
841 }
842 }
843
844 return false;
845 }
846
847 static void
848 handle_dst(struct ra_ctx *ctx, struct ir3_instruction *instr,
849 struct ir3_register *dst)
850 {
851 if (!(dst->flags & IR3_REG_SHARED))
852 return;
853
854 struct ra_interval *interval = &ctx->intervals[dst->name];
855 ra_interval_init(interval, dst);
856 interval->spill_def = NULL;
857
858 if (dst->tied) {
859 struct ir3_register *tied_def = dst->tied->def;
860 struct ra_interval *tied_interval = &ctx->intervals[tied_def->name];
861 if ((dst->tied->flags & IR3_REG_KILL) &&
862 !tied_interval->interval.parent &&
863 rb_tree_is_empty(&tied_interval->interval.children)) {
864 dst->num = dst->tied->num;
865 interval->physreg_start = tied_interval->physreg_start;
866 interval->physreg_end = tied_interval->physreg_end;
867 ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
868 return;
869 }
870 }
871
872 physreg_t physreg = get_reg(ctx, dst, false);
873 if (physreg == (physreg_t) ~0u) {
874 if (try_demote_instruction(ctx, instr))
875 return;
876
877 unsigned size = reg_size(dst);
878 physreg = find_best_spill_reg(ctx, dst, size, reg_elem_size(dst));
879 assert(physreg != (physreg_t)~0u);
880 free_space(ctx, physreg, size);
881 }
882
883 dst->num = ra_physreg_to_num(physreg, dst->flags);
884
885 /* Non-trivial collects (i.e., ones that will introduce moves because the
886 * sources don't line-up with the destination) may cause source intervals to
887 * get implicitly moved when they are inserted as children of the destination
888 * interval. Since we don't support moving intervals in shared RA, this may
889 * cause illegal register allocations. Prevent this by creating a new
890 * top-level interval for the destination so that the source intervals will
891 * be left alone.
892 */
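/* For example (numbering purely illustrative): a collect whose destination gets
 * assigned to sharedN but whose sources currently live at sharedN+2 and
 * sharedN+5 is non-trivial; making those sources children of the destination
 * interval would implicitly "move" them to sharedN/sharedN+1, which shared RA
 * cannot do.
 */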
893 if (is_nontrivial_collect(instr)) {
894 dst->merge_set = NULL;
895 dst->interval_start = ctx->live->interval_offset;
896 dst->interval_end = dst->interval_start + reg_size(dst);
897 ctx->live->interval_offset = dst->interval_end;
898 }
899
900 ra_update_affinity(reg_file_size(dst), dst, physreg);
901 interval->physreg_start = physreg;
902 interval->physreg_end = physreg + reg_size(dst);
903 ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
904 d("insert dst %u physreg %u", dst->name, physreg);
905
906 if (dst->tied) {
907 struct ir3_instruction *mov = ir3_instr_create_at(
908 ir3_before_instr(instr), OPC_META_PARALLEL_COPY, 1, 1);
909 unsigned flags = IR3_REG_SHARED | (dst->flags & IR3_REG_HALF);
910 ir3_dst_create(mov, dst->num, flags)->wrmask = dst->wrmask;
911 ir3_src_create(mov, dst->tied->num, flags)->wrmask = dst->wrmask;
912 mov->cat1.src_type = mov->cat1.dst_type =
913 (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
914 dst->tied->num = dst->num;
915 }
916 }
917
918 static void
919 handle_src_late(struct ra_ctx *ctx, struct ir3_instruction *instr,
920 struct ir3_register *src)
921 {
922 if (!(src->flags & IR3_REG_SHARED))
923 return;
924
925 struct ra_interval *interval = &ctx->intervals[src->def->name];
926 reload_src_finalize(ctx, instr, src);
927
928 /* Remove killed sources that have to be killed late due to being merged with
929 * other defs.
930 */
931 if (!(src->flags & IR3_REG_KILL))
932 return;
933
934 if (interval->interval.inserted)
935 ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
936 }
937
938 static void
939 handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
940 {
941 ra_foreach_src (src, instr)
942 mark_src(ctx, src);
943
944 ra_foreach_src (src, instr)
945 ensure_src_live(ctx, instr, src);
946
947 ra_foreach_src_rev (src, instr)
948 assign_src(ctx, src);
949
950 ra_foreach_dst (dst, instr)
951 handle_dst(ctx, instr, dst);
952
953 ra_foreach_src (src, instr)
954 handle_src_late(ctx, instr, src);
955 }
956
957 static void
958 handle_split(struct ra_ctx *ctx, struct ir3_instruction *split)
959 {
960 struct ir3_register *src = split->srcs[0];
961 struct ir3_register *dst = split->dsts[0];
962
963 if (!(dst->flags & IR3_REG_SHARED))
964 return;
965
966 if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
967 handle_normal_instr(ctx, split);
968 return;
969 }
970
971 struct ra_interval *src_interval = &ctx->intervals[src->def->name];
972 struct ra_interval *dst_interval = &ctx->intervals[dst->name];
973
974 ra_interval_init(dst_interval, dst);
975 dst_interval->spill_def = NULL;
976
977 if (src_interval->spill_def) {
978 struct ir3_instruction *spill_split =
979 ir3_instr_create_at(ir3_after_instr(split), OPC_META_SPLIT, 1, 1);
980 struct ir3_register *dst = __ssa_dst(spill_split);
981 struct ir3_register *src =
982 ir3_src_create(spill_split, INVALID_REG, IR3_REG_SSA);
983 src->def = src_interval->spill_def;
984 src->wrmask = src_interval->spill_def->wrmask;
985 spill_split->split.off = split->split.off;
986 dst_interval->spill_def = dst;
987 list_del(&split->node);
988 return;
989 }
990
991 dst_interval->physreg_start =
992 src_interval->physreg_start + dst->merge_set_offset -
993 src->def->merge_set_offset;
994 dst_interval->physreg_end = dst_interval->physreg_start + reg_size(dst);
995 ir3_reg_interval_insert(&ctx->reg_ctx, &dst_interval->interval);
996 src->num = ra_interval_get_num(src_interval);
997 dst->num = ra_interval_get_num(dst_interval);
998 d("insert dst %u physreg %u", dst->name, dst_interval->physreg_start);
999
1000 if (src->flags & IR3_REG_KILL)
1001 ir3_reg_interval_remove(&ctx->reg_ctx, &src_interval->interval);
1002 }
1003
1004 static void
1005 handle_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
1006 {
1007 struct ir3_register *dst = phi->dsts[0];
1008
1009 if (!(dst->flags & IR3_REG_SHARED))
1010 return;
1011
1012 struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1013 ra_interval_init(dst_interval, dst);
1014
1015 /* In some rare cases, it's possible to have a phi node with a physical-only
1016 * source. Here's a contrived example:
1017 *
1018 * loop {
1019 * if non-uniform {
1020 * if uniform {
1021 * x_1 = ...;
1022 * continue;
1023 * }
1024 * x_2 = ...;
1025 * } else {
1026 * break;
1027 * }
1028 * // continue block
1029 * x_3 = phi(x_1, x_2)
1030 * }
1031 *
1032 * Assuming x_1 and x_2 are uniform, x_3 will also be uniform, because all
1033 * threads that stay in the loop take the same branch to the continue block,
1034 * however execution may fall through from the assignment to x_2 to the
1035 * break statement because the outer if is non-uniform, and then it will fall
1036 * through again to the continue block, so if x_3 is to be in a shared reg
1037 * then the phi needs an extra source pointing to the break statement, which
1038 * itself needs a phi node:
1039 *
1040 * loop {
1041 * if non-uniform {
1042 * if uniform {
1043 * x_1 = ...;
1044 * continue;
1045 * }
1046 * x_2 = ...;
1047 * } else {
1048 * x_4 = phi(undef, x_2)
1049 * break;
1050 * }
1051 * // continue block
1052 * x_3 = phi(x_1, x_2, x_4)
1053 * }
1054 */
1055
1056 /* phi nodes are special because we cannot spill them normally, instead we
1057 * have to spill the parallel copies that their sources point to and make the
1058 * entire phi not shared anymore.
1059 */
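/* Illustrative sketch (SSA names invented): if no shared register can be found
 * for
 *
 *    x_3 = phi(x_1, x_2)
 *
 * we clear IR3_REG_SHARED on the phi destination and on all of its sources (and
 * on the parallel-copy defs those sources point to), demoting the whole phi web
 * to normal registers instead of spilling it piecemeal.
 */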
1060
1061 physreg_t physreg = get_reg(ctx, dst, false);
1062 if (physreg == (physreg_t) ~0u) {
1063 d("spilling phi destination");
1064 dst->flags &= ~IR3_REG_SHARED;
1065 dst_interval->spill_def = dst;
1066 phi->flags |= IR3_INSTR_SHARED_SPILL;
1067
1068 foreach_src (src, phi) {
1069 src->flags &= ~IR3_REG_SHARED;
1070 if (src->def)
1071 src->def->flags &= ~IR3_REG_SHARED;
1072 }
1073
1074 return;
1075 }
1076
1077 dst->num = ra_physreg_to_num(physreg, dst->flags);
1078 dst_interval->spill_def = NULL;
1079 dst_interval->physreg_start = physreg;
1080 dst_interval->physreg_end = physreg + reg_size(dst);
1081 ir3_reg_interval_insert(&ctx->reg_ctx, &dst_interval->interval);
1082
1083 ra_foreach_src_n (src, i, phi) {
1084 /* We assume that any phis with non-logical sources aren't promoted. */
1085 assert(i < phi->block->predecessors_count);
1086 src->num = dst->num;
1087 src->def->num = dst->num;
1088 }
1089 }
1090
1091 static void
1092 handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *pcopy)
1093 {
1094 /* For parallel copies, we only handle the source. The destination is handled
1095 * later when processing phi nodes.
1096 */
1097
1098 ra_foreach_src (src, pcopy)
1099 mark_src(ctx, src);
1100
1101 ra_foreach_src (src, pcopy)
1102 ensure_src_live(ctx, pcopy, src);
1103
1104 ra_foreach_src_rev (src, pcopy)
1105 assign_src(ctx, src);
1106
1107 ra_foreach_src (src, pcopy)
1108 handle_src_late(ctx, pcopy, src);
1109 }
1110
1111 static void
1112 handle_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1113 {
1114 instr->flags &= ~IR3_INSTR_SHARED_SPILL;
1115
1116 switch (instr->opc) {
1117 case OPC_META_SPLIT:
1118 handle_split(ctx, instr);
1119 break;
1120 case OPC_META_PHI:
1121 handle_phi(ctx, instr);
1122 break;
1123 case OPC_META_PARALLEL_COPY:
1124 handle_pcopy(ctx, instr);
1125 break;
1126 default:
1127 handle_normal_instr(ctx, instr);
1128 }
1129 }
1130
1131 /* If we define a value outside a loop, use it inside the loop, and then spill
1132 * it afterwards inside the same loop, we would lose the value on the backedge,
1133 * so we have to reload it. We reload it after any parallel copy instruction,
1134 * when the live shared registers equal the live-in of the backedge. lower_pcopy()
1135 * will then move any non-shared parallel copies down past the reload.
1136 */
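/* A hypothetical backedge block after this runs (register numbers invented):
 *
 *    parallel_copy ...          ; shared phi copies for the loop header
 *    mov  sharedM.x, rN.x       ; reload of the spilled live-out
 *    br   header
 *
 * with lower_pcopy() later making sure any non-shared copies end up after the
 * reload.
 */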
1137 static void
1138 reload_live_outs(struct ra_ctx *ctx, struct ir3_block *block)
1139 {
1140 struct ra_block_state *state = &ctx->blocks[block->index];
1141 unsigned name;
1142 BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) {
1143 struct ir3_register *reg = ctx->live->definitions[name];
1144
1145 struct ra_interval *interval = &ctx->intervals[name];
1146 if (!interval->interval.inserted) {
1147 d("reloading %d at end of backedge", reg->name);
1148
1149 /* When this interval was spilled inside the loop, we probably chose a
1150 * different physreg for it than the original physreg when it was
1151 * defined outside the loop. Restore the original physreg so that we
1152 * spill it correctly.
1153 */
1154 unsigned size = interval->physreg_end - interval->physreg_start;
1155 interval->physreg_start = interval->physreg_start_orig;
1156 interval->physreg_end = interval->physreg_start + size;
1157
1158 reload_interval(ctx, ir3_before_terminator(block), interval);
1159 }
1160 }
1161 }
1162
1163 static void
1164 record_pred_live_out(struct ra_ctx *ctx,
1165 struct ra_interval *interval,
1166 struct ir3_block *pred)
1167 {
1168 struct ra_block_state *state = &ctx->blocks[pred->index];
1169
1170 struct ir3_register *def = interval->interval.reg;
1171 BITSET_SET(state->live_out, def->name);
1172
1173 rb_tree_foreach (struct ra_interval, child,
1174 &interval->interval.children, interval.node) {
1175 record_pred_live_out(ctx, child, pred);
1176 }
1177 }
1178
1179 static void
1180 record_pred_live_outs(struct ra_ctx *ctx, struct ir3_block *block)
1181 {
1182 for (unsigned i = 0; i < block->predecessors_count; i++) {
1183 struct ir3_block *pred = block->predecessors[i];
1184 struct ra_block_state *state = &ctx->blocks[pred->index];
1185 if (state->visited)
1186 continue;
1187
1188 state->live_out = rzalloc_array(NULL, BITSET_WORD,
1189 BITSET_WORDS(ctx->live->definitions_count));
1190
1191
1192 rb_tree_foreach (struct ra_interval, interval,
1193 &ctx->reg_ctx.intervals, interval.node) {
1194 record_pred_live_out(ctx, interval, pred);
1195 }
1196 }
1197 }
1198
1199 static void
1200 handle_block(struct ra_ctx *ctx, struct ir3_block *block)
1201 {
1202 ra_ctx_reset_block(ctx);
1203
1204 unsigned name;
1205 BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
1206 ctx->live->definitions_count) {
1207 struct ir3_register *def = ctx->live->definitions[name];
1208 struct ra_interval *interval = &ctx->intervals[name];
1209
1210 /* Non-shared definitions may still be definitions we spilled by demoting
1211 * them, so we still need to initialize the interval. But we shouldn't
1212 * make these intervals live.
1213 */
1214 ra_interval_init(interval, def);
1215
1216 if ((def->flags & IR3_REG_SHARED) && !interval->spill_def) {
1217 ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
1218 }
1219 }
1220
1221 if (RA_DEBUG) {
1222 d("after live-in block %u:\n", block->index);
1223 ra_ctx_dump(ctx);
1224 }
1225
1226 if (block->predecessors_count > 1)
1227 record_pred_live_outs(ctx, block);
1228
1229 foreach_instr_safe (instr, &block->instr_list) {
1230 di(instr, "processing");
1231
1232 handle_instr(ctx, instr);
1233
1234 if (RA_DEBUG)
1235 ra_ctx_dump(ctx);
1236 }
1237
1238 if (block->successors[0]) {
1239 struct ra_block_state *state = &ctx->blocks[block->successors[0]->index];
1240
1241 if (state->visited) {
1242 assert(!block->successors[1]);
1243
1244 reload_live_outs(ctx, block);
1245 }
1246 }
1247
1248 ctx->blocks[block->index].visited = true;
1249 }
1250
1251 static void
1252 lower_pcopy(struct ir3 *ir, struct ra_ctx *ctx)
1253 {
1254 foreach_block (block, &ir->block_list) {
1255 foreach_instr_safe (instr, &block->instr_list) {
1256 /* At this point due to spilling there may be parallel copies from
1257 * shared to non-shared registers and vice versa. Lowering these after
1258 * RA may produce cycles involving shared and non-shared registers,
1259 * which would need to be resolved by swapping a shared and non-shared
1260 * register which is something we can't handle. However by lowering
1261 * these to moves now, we can make sure that cycles only involve
1262 * non-shared registers. To avoid illegally moving a shared register
1263 * read or write across the parallel copy, which may have other
1264 * conflicting reads/writes if there's a cycle, we need to move copies
1265 * from non-shared to shared below the shared copies, and we need to
1266 * move copies from shared to non-shared above them. So, we have the
1267 * following order:
1268 *
1269 * 1. shared->non-shared copies (spills)
1270 * 2. shared->shared copies (one parallel copy as there may be cycles)
1271 * 3. non-shared->shared copies (reloads)
1272 * 4. non-shared->non-shared copies
1273 *
1274 * We split out the non-shared->non-shared copies as a separate step.
1275 */
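/* Concretely (illustrative), a mixed parallel copy ends up lowered so that the
 * block looks like:
 *
 *    mov  rA.x, sharedX.x                  ; 1. spill copies, before the pcopy
 *    parallel_copy sharedY.x, sharedZ.x    ; 2. remaining shared->shared copies
 *    mov  sharedW.x, rB.x                  ; 3. reload copies, after the pcopy
 *    parallel_copy rC.x, rD.x              ; 4. non-shared copies, at the end
 */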
1276 if (instr->opc == OPC_META_PARALLEL_COPY) {
1277 for (unsigned i = 0; i < instr->srcs_count; i++) {
1278 if ((instr->srcs[i]->flags & IR3_REG_SHARED) &&
1279 !(instr->dsts[i]->flags & IR3_REG_SHARED)) {
1280 /* shared->non-shared. Create a spill move and rewrite the
1281 * source to be the destination of the move (so that the
1282 * original shared->non-shared copy becomes a
1283 * non-shared->non-shared copy).
1284 */
1285 struct ir3_instruction *mov = ir3_instr_create_at(
1286 ir3_before_instr(instr), OPC_MOV, 1, 1);
1287 mov->flags |= IR3_INSTR_SHARED_SPILL;
1288 struct ir3_register *dst =
1289 ir3_dst_create(mov, INVALID_REG, instr->dsts[i]->flags);
1290 dst->wrmask = instr->dsts[i]->wrmask;
1291 dst->instr = mov;
1292 mov->repeat = reg_elems(mov->dsts[0]) - 1;
1293 struct ir3_register *src =
1294 ir3_src_create(mov, instr->srcs[i]->num,
1295 instr->srcs[i]->flags |
1296 (mov->repeat ? IR3_REG_R : 0));
1297 src->wrmask = instr->srcs[i]->wrmask;
1298 mov->cat1.dst_type = mov->cat1.src_type =
1299 (mov->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
1300 instr->srcs[i]->flags = mov->dsts[0]->flags;
1301 instr->srcs[i]->def = mov->dsts[0];
1302 }
1303 }
1304
1305 for (unsigned i = 0; i < instr->dsts_count;) {
1306 if ((instr->dsts[i]->flags & IR3_REG_SHARED) &&
1307 (instr->srcs[i]->flags & IR3_REG_SSA) &&
1308 !(instr->srcs[i]->flags & IR3_REG_SHARED)) {
1309 /* non-shared->shared. Create a reload move.
1310 */
1311 struct ir3_instruction *mov =
1312 ir3_instr_create_at(ir3_after_instr(instr), OPC_MOV, 1, 1);
1313 mov->flags |= IR3_INSTR_SHARED_SPILL;
1314 struct ir3_register *dst =
1315 ir3_dst_create(mov, instr->dsts[i]->num,
1316 instr->dsts[i]->flags);
1317 dst->instr = mov;
1318 dst->wrmask = instr->dsts[i]->wrmask;
1319 mov->repeat = reg_elems(mov->dsts[0]) - 1;
1320 struct ir3_register *src =
1321 ir3_src_create(mov, INVALID_REG, instr->srcs[i]->flags |
1322 (mov->repeat ? IR3_REG_R : 0));
1323 src->def = instr->srcs[i]->def;
1324 src->wrmask = instr->srcs[i]->wrmask;
1325 mov->cat1.dst_type = mov->cat1.src_type =
1326 (mov->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
1327
1328 /* When we spill a parallel copy source, we lose the
1329 * information of where it originally points to since we make
1330 * it point to the spill def. If we later decide not to also
1331 * spill the phi associated with it, we have to restore it
1332 * here using the stashed original source so that RA
1333 * validation can check that we did the correct thing.
1334 *
1335 * Because SSA-ness goes away after validation, this is really
1336 * just about validation.
1337 */
1338 struct ir3_block *succ = block->successors[0];
1339 unsigned pred_idx = ir3_block_get_pred_index(succ, block);
1340 foreach_instr (phi, &succ->instr_list) {
1341 if (phi->opc != OPC_META_PHI)
1342 break;
1343
1344 if (phi->srcs[pred_idx]->def == instr->dsts[i]) {
1345 struct ir3_register *def =
1346 _mesa_hash_table_search(ctx->pcopy_src_map,
1347 instr->srcs[i])->data;
1348 phi->srcs[pred_idx]->def = def;
1349 break;
1350 }
1351 }
1352
1353 instr->srcs[i] = instr->srcs[instr->srcs_count - 1];
1354 instr->dsts[i] = instr->dsts[instr->dsts_count - 1];
1355 instr->srcs_count--;
1356 instr->dsts_count--;
1357 continue;
1358 }
1359
1360 i++;
1361 }
1362
1363 /* Move any non-shared copies to a separate parallel copy
1364 * instruction right at the end of the block, after any reloads. At
1365 * this point all copies should be {shared,immediate}->shared or
1366 * {non-shared,immediate}->non-shared.
1367 */
1368 unsigned non_shared_copies = 0;
1369 for (unsigned i = 0; i < instr->dsts_count; i++) {
1370 if (!(instr->dsts[i]->flags & IR3_REG_SHARED))
1371 non_shared_copies++;
1372 }
1373
1374 if (non_shared_copies != 0) {
1375 struct ir3_instruction *pcopy = ir3_instr_create_at(
1376 ir3_before_terminator(block), OPC_META_PARALLEL_COPY,
1377 non_shared_copies, non_shared_copies);
1378
1379 unsigned j = 0;
1380 for (unsigned i = 0; i < instr->dsts_count;) {
1381 if (!(instr->dsts[i]->flags & IR3_REG_SHARED)) {
1382 pcopy->dsts[j] = instr->dsts[i];
1383 pcopy->srcs[j] = instr->srcs[i];
1384 pcopy->dsts[j]->instr = pcopy;
1385 instr->srcs[i] = instr->srcs[instr->srcs_count - 1];
1386 instr->dsts[i] = instr->dsts[instr->dsts_count - 1];
1387 instr->srcs_count--;
1388 instr->dsts_count--;
1389 j++;
1390 continue;
1391 }
1392 i++;
1393 }
1394
1395 pcopy->srcs_count = pcopy->dsts_count = j;
1396 if (instr->dsts_count == 0)
1397 list_del(&instr->node);
1398 }
1399 }
1400 }
1401 }
1402 }
1403
1404 static void
1405 finalize(struct ir3 *ir)
1406 {
1407 foreach_block (block, &ir->block_list) {
1408 foreach_instr (instr, &block->instr_list) {
1409 for (unsigned i = 0; i < instr->dsts_count; i++) {
1410 if (instr->dsts[i]->flags & IR3_REG_SHARED) {
1411 instr->dsts[i]->flags &= ~IR3_REG_SSA;
1412 }
1413 }
1414
1415 for (unsigned i = 0; i < instr->srcs_count; i++) {
1416 if (instr->srcs[i]->flags & IR3_REG_SHARED) {
1417 instr->srcs[i]->flags &= ~IR3_REG_SSA;
1418 instr->srcs[i]->def = NULL;
1419 }
1420 }
1421 }
1422 }
1423 }
1424
1425 void
1426 ir3_ra_shared(struct ir3_shader_variant *v, struct ir3_liveness **live_ptr)
1427 {
1428 struct ra_ctx ctx;
1429 struct ir3_liveness *live = *live_ptr;
1430
1431 ra_ctx_init(&ctx);
1432 ctx.intervals = rzalloc_array(NULL, struct ra_interval,
1433 live->definitions_count);
1434 ctx.blocks = rzalloc_array(NULL, struct ra_block_state,
1435 live->block_count);
1436 ctx.start = 0;
1437 ctx.live = live;
1438 ctx.pcopy_src_map = _mesa_pointer_hash_table_create(NULL);
1439
1440 foreach_block (block, &v->ir->block_list) {
1441 handle_block(&ctx, block);
1442 }
1443
1444 lower_pcopy(v->ir, &ctx);
1445
1446 for (unsigned i = 0; i < live->block_count; i++) {
1447 if (ctx.blocks[i].live_out)
1448 ralloc_free(ctx.blocks[i].live_out);
1449 }
1450
1451 ralloc_free(ctx.intervals);
1452 ralloc_free(ctx.pcopy_src_map);
1453 ralloc_free(ctx.blocks);
1454
1455 ir3_ra_validate(v, RA_FULL_SIZE, RA_HALF_SIZE, live->block_count, true);
1456 finalize(v->ir);
1457
1458 /* Recalculate liveness and register pressure now that additional values have
1459 * been added.
1460 * TODO we should only do this if any values have been spilled/reloaded.
1461 * Note: since we don't recreate merge sets, we have to manually copy
1462 * interval_offset to the new liveness struct.
1463 */
1464 unsigned interval_offset = live->interval_offset;
1465 void *live_mem_ctx = ralloc_parent(live);
1466 ralloc_free(live);
1467 *live_ptr = ir3_calc_liveness(live_mem_ctx, v->ir);
1468 (*live_ptr)->interval_offset = interval_offset;
1469 }
1470
1471