/*
 * Copyright (C) 2021 Valve Corporation
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3_ra.h"
#include "ir3_shader.h"

#include "util/u_math.h"

/* Allocating shared registers can pose a challenge, because their live
 * intervals use the physical CFG which has extra edges inserted that are
 * pretty much always critical edges. This causes problems with phi nodes,
 * because copies for phi nodes have to happen "along the edge," and similarly
 * causes problems when reunifying values that have had their live range split.
 * Problematic phi nodes should be relatively rare, so we ban them for now.
 * The solution we choose for live-range splitting is to integrate spilling and
 * register allocation and spill to vector registers rather than split a live
 * range, which negates some of the advantages of SSA-based RA, but it isn't as
 * bad as it seems because the conditions needed (vector shared registers, which
 * only movmsk currently produces, or fixed registers which we don't do) are
 * relatively rare. Spilling a shared register to a vector register is also much
 * cheaper than spilling a vector register to private memory.
 */

struct ra_interval {
   struct ir3_reg_interval interval;

   struct rb_node physreg_node;
   physreg_t physreg_start, physreg_end;

   /* Where the shared register is spilled to. If there were no uses yet when
    * it was spilled, this may just be the original def itself rather than a
    * copy.
    */
   struct ir3_register *spill_def;

   /* Whether this contains a source of the current instruction that can't be
    * spilled.
    */
   bool src;

   bool needs_reload;
};

struct ra_block_state {
   bool visited;

   /* For blocks whose successors are visited first (i.e. loop backedges), which
    * values should be live at the end.
    */
   BITSET_WORD *live_out;
};

struct ra_ctx {
   struct ir3_reg_ctx reg_ctx;

   BITSET_DECLARE(available, RA_MAX_FILE_SIZE);

   struct rb_tree physreg_intervals;

   struct ra_interval *intervals;

   struct ir3_liveness *live;

   struct hash_table *pcopy_src_map;

   struct ra_block_state *blocks;

   unsigned start;
};

static struct ra_interval *
ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
{
   return rb_node_data(struct ra_interval, interval, interval);
}

static struct ra_interval *
rb_node_to_interval(struct rb_node *node)
{
   return rb_node_data(struct ra_interval, node, physreg_node);
}

static const struct ra_interval *
rb_node_to_interval_const(const struct rb_node *node)
{
   return rb_node_data(struct ra_interval, node, physreg_node);
}

static struct ra_interval *
ra_interval_next(struct ra_interval *interval)
{
   struct rb_node *next = rb_node_next(&interval->physreg_node);
   return next ? rb_node_to_interval(next) : NULL;
}

static struct ra_interval *
ra_interval_next_or_null(struct ra_interval *interval)
{
   return interval ? ra_interval_next(interval) : NULL;
}

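/* Comparators for the physreg_intervals tree. Intervals are kept ordered by
 * physreg_start, so that walking with rb_node_next() visits intervals from
 * left to right; ra_interval_cmp() is the lookup comparator used to find the
 * interval containing a given physreg.
 */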
static int
ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
{
   const struct ra_interval *a = rb_node_to_interval_const(_a);
   const struct ra_interval *b = rb_node_to_interval_const(_b);
   return b->physreg_start - a->physreg_start;
}

static int
ra_interval_cmp(const struct rb_node *node, const void *data)
{
   physreg_t reg = *(const physreg_t *)data;
   const struct ra_interval *interval = rb_node_to_interval_const(node);
   if (interval->physreg_start > reg)
      return -1;
   else if (interval->physreg_end <= reg)
      return 1;
   else
      return 0;
}

static struct ra_ctx *
ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx)
{
   return rb_node_data(struct ra_ctx, ctx, reg_ctx);
}

static struct ra_interval *
ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
{
   struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
   return node ? rb_node_to_interval(node) : NULL;
}

/* Get the interval covering the reg, or the closest to the right if it
 * doesn't exist.
 */
static struct ra_interval *
ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
{
   struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
   if (!interval) {
      return NULL;
   } else if (interval->physreg_end > reg) {
      return interval;
   } else {
      /* There is no interval covering reg, and ra_interval_search_sloppy()
       * returned the closest range to the left, so the next interval to the
       * right should be the closest to the right.
       */
      return ra_interval_next_or_null(interval);
   }
}

static struct ra_interval *
ra_ctx_search_right(struct ra_ctx *ctx, physreg_t reg)
{
   return ra_interval_search_right(&ctx->physreg_intervals, reg);
}

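/* Callbacks for the generic ir3_reg_interval machinery. They keep the
 * available-physreg bitset and the physreg-sorted interval tree in sync as
 * intervals enter and leave the register file.
 */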
static void
interval_add(struct ir3_reg_ctx *reg_ctx, struct ir3_reg_interval *_interval)
{
   struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
   struct ra_ctx *ctx = ir3_reg_ctx_to_ctx(reg_ctx);

   /* We can assume in this case that physreg_start/physreg_end is already
    * initialized.
    */
   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
      BITSET_CLEAR(ctx->available, i);
   }

   rb_tree_insert(&ctx->physreg_intervals, &interval->physreg_node,
                  ra_interval_insert_cmp);
}

static void
interval_delete(struct ir3_reg_ctx *reg_ctx, struct ir3_reg_interval *_interval)
{
   struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
   struct ra_ctx *ctx = ir3_reg_ctx_to_ctx(reg_ctx);

   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
      BITSET_SET(ctx->available, i);
   }

   rb_tree_remove(&ctx->physreg_intervals, &interval->physreg_node);
}

static void
interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
               struct ir3_reg_interval *_child)
{
   struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
   struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);

   child->physreg_start =
      parent->physreg_start + (child->interval.reg->interval_start -
                               parent->interval.reg->interval_start);
   child->physreg_end =
      child->physreg_start +
      (child->interval.reg->interval_end - child->interval.reg->interval_start);

   interval_add(ctx, _child);
}

static void
ra_ctx_init(struct ra_ctx *ctx)
{
   ctx->reg_ctx.interval_add = interval_add;
   ctx->reg_ctx.interval_delete = interval_delete;
   ctx->reg_ctx.interval_readd = interval_readd;
}

static void
ra_ctx_reset_block(struct ra_ctx *ctx)
{
   for (unsigned i = 0; i < RA_SHARED_SIZE; i++) {
      BITSET_SET(ctx->available, i);
   }

   rb_tree_init(&ctx->reg_ctx.intervals);
   rb_tree_init(&ctx->physreg_intervals);
}

static void
ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
{
   ir3_reg_interval_init(&interval->interval, reg);
}

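/* Compute an interval's physreg from the top-level interval that occupies the
 * file, offset by this interval's position within it.
 */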
static physreg_t
ra_interval_get_physreg(const struct ra_interval *interval)
{
   unsigned child_start = interval->interval.reg->interval_start;

   while (interval->interval.parent) {
      interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
   }

   return interval->physreg_start +
          (child_start - interval->interval.reg->interval_start);
}

static unsigned
ra_interval_get_num(const struct ra_interval *interval)
{
   return ra_physreg_to_num(ra_interval_get_physreg(interval),
                            interval->interval.reg->flags);
}

static void
ra_interval_dump(struct log_stream *stream, struct ra_interval *interval)
{
   mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start);

   ir3_reg_interval_dump(stream, &interval->interval);
}

static void
ra_ctx_dump(struct ra_ctx *ctx)
{
   struct log_stream *stream = mesa_log_streami();

   mesa_log_stream_printf(stream, "shared:\n");
   rb_tree_foreach (struct ra_interval, interval, &ctx->physreg_intervals,
                    physreg_node) {
      ra_interval_dump(stream, interval);
   }

   unsigned start, end;
   mesa_log_stream_printf(stream, "available:\n");
   BITSET_FOREACH_RANGE (start, end, ctx->available, RA_SHARED_SIZE) {
      mesa_log_stream_printf(stream, "%u-%u ", start, end);
   }
   mesa_log_stream_printf(stream, "\n");
   mesa_log_stream_printf(stream, "start: %u\n", ctx->start);
}

static bool
get_reg_specified(struct ra_ctx *ctx, struct ir3_register *reg, physreg_t physreg)
{
   for (unsigned i = 0; i < reg_size(reg); i++) {
      if (!BITSET_TEST(ctx->available, physreg + i))
         return false;
   }

   return true;
}

static unsigned
reg_file_size(struct ir3_register *reg)
{
   return RA_SHARED_SIZE;
}

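/* Find a free, aligned gap of `size` registers, scanning round-robin from
 * ctx->start and wrapping around so that successive allocations rotate through
 * the file. Returns ~0 if there is no free gap.
 */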
static physreg_t
find_best_gap(struct ra_ctx *ctx, struct ir3_register *dst, unsigned size,
              unsigned align)
{
   unsigned file_size = reg_file_size(dst);

   /* This can happen if we create a very large merge set. Just bail out in that
    * case.
    */
   if (size > file_size)
      return (physreg_t) ~0;

   unsigned start = ALIGN(ctx->start, align) % (file_size - size + align);
   unsigned candidate = start;
   do {
      bool is_available = true;
      for (unsigned i = 0; i < size; i++) {
         if (!BITSET_TEST(ctx->available, candidate + i)) {
            is_available = false;
            break;
         }
      }

      if (is_available) {
         ctx->start = (candidate + size) % file_size;
         return candidate;
      }

      candidate += align;
      if (candidate + size > file_size)
         candidate = 0;
   } while (candidate != start);

   return (physreg_t)~0;
}

static physreg_t
find_best_spill_reg(struct ra_ctx *ctx, struct ir3_register *reg,
                    unsigned size, unsigned align)
{
   unsigned file_size = reg_file_size(reg);
   unsigned min_cost = UINT_MAX;

   unsigned start = ALIGN(ctx->start, align) % (file_size - size + align);
   physreg_t candidate = start;
   physreg_t best_reg = (physreg_t)~0;
   do {
      unsigned cost = 0;

      /* Iterate through intervals we'd need to spill to use this reg. */
      for (struct ra_interval *interval = ra_ctx_search_right(ctx, candidate);
           interval && interval->physreg_start < candidate + size;
           interval = ra_interval_next_or_null(interval)) {
         /* We can't spill sources of the current instruction when reloading
          * sources.
          */
         if (interval->src) {
            cost = UINT_MAX;
            break;
         }

         /* We prefer spilling intervals that already have been spilled, so we
          * don't have to emit another mov.
          */
         if (!interval->spill_def)
            cost += (interval->physreg_end - interval->physreg_start);
      }

      if (cost < min_cost) {
         min_cost = cost;
         best_reg = candidate;
      }

      candidate += align;
      if (candidate + size > file_size)
         candidate = 0;
   } while (candidate != start);

   return best_reg;
}

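/* Extract a single element from a (possibly vector) SSA def by inserting an
 * OPC_META_SPLIT after `before`. Returns the def itself for scalar defs.
 */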
static struct ir3_register *
split(struct ir3_register *def, unsigned offset, struct ir3_instruction *before)
{
   if (reg_elems(def) == 1) {
      assert(offset == 0);
      return def;
   }

   struct ir3_instruction *split =
      ir3_instr_create(before->block, OPC_META_SPLIT, 1, 1);
   split->split.off = offset;
   struct ir3_register *dst = __ssa_dst(split);
   struct ir3_register *src =
      ir3_src_create(split, INVALID_REG, def->flags & (IR3_REG_HALF | IR3_REG_SSA));
   src->wrmask = def->wrmask;
   src->def = def;
   ir3_instr_move_after(split, before);
   return dst;
}

static struct ir3_register *
extract(struct ir3_register *parent_def, unsigned offset, unsigned elems,
        struct ir3_instruction *before)
{
   if (offset == 0 && elems == reg_elems(parent_def))
      return parent_def;

   if (elems == 1)
      return split(parent_def, offset, before);

   struct ir3_instruction *collect =
      ir3_instr_create(before->block, OPC_META_COLLECT, 1, elems);
   struct ir3_register *dst = __ssa_dst(collect);
   dst->flags |= parent_def->flags & IR3_REG_HALF;
   dst->wrmask = MASK(elems);

   ir3_instr_move_after(collect, before);

   for (unsigned i = 0; i < elems; i++) {
      ir3_src_create(collect, INVALID_REG,
                     parent_def->flags & (IR3_REG_HALF | IR3_REG_SSA))->def =
         split(parent_def, offset + i, before);
   }

   return dst;
}

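/* Make sure every child interval has a spill_def of its own by extracting the
 * corresponding elements from the parent's spill_def, recursing into nested
 * children.
 */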
static void
spill_interval_children(struct ra_interval *interval,
                        struct ir3_instruction *before)
{
   rb_tree_foreach (struct ra_interval, child, &interval->interval.children,
                    interval.node) {
      if (!child->spill_def) {
         child->spill_def = extract(interval->spill_def,
                                    (child->interval.reg->interval_start -
                                     interval->interval.reg->interval_start) /
                                    reg_elem_size(interval->interval.reg),
                                    reg_elems(child->interval.reg), before);
      }
      spill_interval_children(child, before);
   }
}

static void
spill_interval(struct ra_ctx *ctx, struct ra_interval *interval)
{
   struct ir3_instruction *before = interval->interval.reg->instr;

   d("spilling ssa_%u:%u", before->serialno, interval->interval.reg->name);

   if (!interval->spill_def) {
      /* If this is a phi node or input, we need to insert the demotion to a
       * regular register after the last phi or input in the block.
       */
      if (before->opc == OPC_META_PHI ||
          before->opc == OPC_META_INPUT) {
         struct ir3_block *block = before->block;
         struct ir3_instruction *last_phi_input = NULL;
         foreach_instr_from (instr, before, &block->instr_list) {
            if (instr->opc != before->opc)
               break;
            last_phi_input = instr;
         }
         before = last_phi_input;
      }

      struct ir3_instruction *mov = ir3_instr_create(before->block, OPC_MOV, 1, 1);
      mov->flags |= IR3_INSTR_SHARED_SPILL;
      struct ir3_register *dst = __ssa_dst(mov);
      dst->flags |= (interval->interval.reg->flags & IR3_REG_HALF);
      dst->wrmask = interval->interval.reg->wrmask;
      mov->repeat = reg_elems(dst) - 1;
      ir3_src_create(mov, interval->interval.reg->num,
                     IR3_REG_SHARED | (mov->repeat ? IR3_REG_R : 0) |
                     (interval->interval.reg->flags & IR3_REG_HALF))->wrmask =
         interval->interval.reg->wrmask;
      mov->cat1.src_type = mov->cat1.dst_type =
         (interval->interval.reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;

      ir3_instr_move_after(mov, before);
      interval->spill_def = dst;
   }

   spill_interval_children(interval, interval->spill_def->instr);

   ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
}

/* Try to demote a scalar ALU instruction to a normal ALU instruction, using the
 * spilled sources. We have to take into account restrictions on the number of
 * shared sources that only exist for normal ALU instructions.
 */
static bool
try_demote_instruction(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   /* First, check restrictions. */
   switch (opc_cat(instr->opc)) {
   case 1:
      if (!(instr->srcs[0]->flags & (IR3_REG_CONST | IR3_REG_IMMED)))
         return false;
      break;
   case 2: {
      /* We need one source to either be demotable or an immediate. */
      if (instr->srcs_count > 1) {
         struct ra_interval *src0_interval =
            (instr->srcs[0]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[0]->def->name] : NULL;
         struct ra_interval *src1_interval =
            (instr->srcs[1]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[1]->def->name] : NULL;
         if (!(src0_interval && src0_interval->spill_def) &&
             !(src1_interval && src1_interval->spill_def) &&
             !(instr->srcs[0]->flags & IR3_REG_IMMED) &&
             !(instr->srcs[1]->flags & IR3_REG_IMMED))
            return false;
      }
      break;
   }
   case 3: {
      struct ra_interval *src0_interval =
         (instr->srcs[0]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[0]->def->name] : NULL;
      struct ra_interval *src1_interval =
         (instr->srcs[1]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[1]->def->name] : NULL;

      /* src1 cannot be shared */
      if (src1_interval && !src1_interval->spill_def) {
         /* Try to swap src0 and src1, similar to what copy prop does. */
         if (!is_mad(instr->opc))
            return false;

         if ((src0_interval && src0_interval->spill_def) ||
             (instr->srcs[0]->flags & IR3_REG_IMMED)) {
            struct ir3_register *src0 = instr->srcs[0];
            instr->srcs[0] = instr->srcs[1];
            instr->srcs[1] = src0;
         } else {
            return false;
         }
      }
      break;
   }
   case 4: {
      assert(instr->srcs[0]->flags & IR3_REG_SSA);
      struct ra_interval *src_interval = &ctx->intervals[instr->srcs[0]->def->name];
      if (!src_interval->spill_def)
         return false;
      break;
   }

   default:
      return false;
   }

   d("demoting instruction");

   /* If the instruction is already not a scalar ALU instruction, we should've
    * skipped reloading and just demoted sources directly, so we should never
    * get here.
    */
   assert(instr->dsts[0]->flags & IR3_REG_SHARED);

   /* Now we actually demote the instruction */
   ra_foreach_src (src, instr) {
      assert(src->flags & IR3_REG_SHARED);
      struct ra_interval *interval = &ctx->intervals[src->def->name];
      if (interval->spill_def) {
         src->def = interval->spill_def;
         src->flags &= ~IR3_REG_SHARED;
         interval->needs_reload = false;
         if (interval->interval.inserted)
            ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
         while (interval->interval.parent)
            interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
         interval->src = false;
      }
   }

   struct ra_interval *dst_interval = &ctx->intervals[instr->dsts[0]->name];
   instr->dsts[0]->flags &= ~IR3_REG_SHARED;
   ra_interval_init(dst_interval, instr->dsts[0]);
   dst_interval->spill_def = instr->dsts[0];

   instr->flags |= IR3_INSTR_SHARED_SPILL;

   return true;
}

/* Free up [start, start + size) by spilling live intervals.
 */
static void
free_space(struct ra_ctx *ctx, physreg_t start, unsigned size)
{
   struct ra_interval *interval = ra_ctx_search_right(ctx, start);
   while (interval && interval->physreg_start < start + size) {
      struct ra_interval *next = ra_interval_next_or_null(interval);
      spill_interval(ctx, interval);
      interval = next;
   }
}

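/* Pick a physreg for `reg`, either for its defining instruction or for a
 * reload (src == true). Preference order: the merge set's preferred reg, a gap
 * large enough for the whole (not yet allocated) merge set, one of the
 * instruction's own source registers for ALU/SFU defs, and finally any
 * suitably aligned gap. Returns ~0 if nothing is free.
 */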
static physreg_t
get_reg(struct ra_ctx *ctx, struct ir3_register *reg, bool src)
{
   if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
      physreg_t preferred_reg =
         reg->merge_set->preferred_reg + reg->merge_set_offset;
      if (preferred_reg < reg_file_size(reg) &&
          preferred_reg % reg_elem_size(reg) == 0 &&
          get_reg_specified(ctx, reg, preferred_reg))
         return preferred_reg;
   }

   /* If this register is a subset of a merge set which we have not picked a
    * register for, first try to allocate enough space for the entire merge
    * set.
    */
   unsigned size = reg_size(reg);
   if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
       size < reg->merge_set->size) {
      physreg_t best_reg = find_best_gap(ctx, reg, reg->merge_set->size,
                                         reg->merge_set->alignment);
      if (best_reg != (physreg_t)~0u) {
         best_reg += reg->merge_set_offset;
         return best_reg;
      }
   }

   /* For ALU and SFU instructions, if a src reg is available to pick, use it,
    * because this doesn't introduce unnecessary dependencies and it
    * potentially avoids needing (ss) syncs for write after read hazards for
    * SFU instructions:
    */
   if (!src && (is_sfu(reg->instr) || is_alu(reg->instr))) {
      for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
         struct ir3_register *src = reg->instr->srcs[i];
         if (!ra_reg_is_src(src))
            continue;
         if ((src->flags & IR3_REG_SHARED) && reg_size(src) >= size) {
            struct ra_interval *src_interval = &ctx->intervals[src->def->name];
            physreg_t src_physreg = ra_interval_get_physreg(src_interval);
            if (src_physreg % reg_elem_size(reg) == 0 &&
                src_physreg + size <= reg_file_size(reg) &&
                get_reg_specified(ctx, reg, src_physreg))
               return src_physreg;
         }
      }
   }

   return find_best_gap(ctx, reg, size, reg_elem_size(reg));
}

/* The reload process is split in two: first we allocate a register to reload to
 * for all sources that need a reload, and then we actually execute the reload.
 * This is to allow us to demote shared ALU instructions to non-shared whenever
 * we would otherwise need to spill to reload, without leaving dangling unused
 * reload movs from previously processed sources. So, for example, we could
 * need to reload both sources of an add, but after reloading the first source
 * we realize that we would need to spill to reload the second source and we
 * should demote the add instead, which means cancelling the first reload.
 */
static void
reload_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
           struct ir3_register *src)
{
   struct ir3_register *reg = src->def;
   struct ra_interval *interval = &ctx->intervals[reg->name];
   unsigned size = reg_size(reg);

   physreg_t best_reg = get_reg(ctx, reg, true);

   if (best_reg == (physreg_t)~0u) {
      if (try_demote_instruction(ctx, instr))
         return;

      best_reg = find_best_spill_reg(ctx, reg, size, reg_elem_size(reg));
      assert(best_reg != (physreg_t)~0u);

      free_space(ctx, best_reg, size);
   }

   d("reload src %u physreg %u", reg->name, best_reg);
   interval->physreg_start = best_reg;
   interval->physreg_end = best_reg + size;
   interval->needs_reload = true;
   ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
   interval->src = true;
}

static void
reload_interval(struct ra_ctx *ctx, struct ir3_instruction *instr,
                struct ir3_block *block, struct ra_interval *interval)
{
   struct ir3_register *def = interval->interval.reg;
   struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
   mov->flags |= IR3_INSTR_SHARED_SPILL;
   unsigned flags = IR3_REG_SHARED | (def->flags & IR3_REG_HALF);
   ir3_dst_create(mov, ra_physreg_to_num(interval->physreg_start, flags),
                  flags)->wrmask = def->wrmask;
   mov->repeat = reg_elems(def) - 1;
   struct ir3_register *mov_src =
      ir3_src_create(mov, INVALID_REG, IR3_REG_SSA | (def->flags & IR3_REG_HALF) |
                     (mov->repeat ? IR3_REG_R : 0));
   assert(interval->spill_def);
   mov_src->def = interval->spill_def;
   mov_src->wrmask = def->wrmask;
   mov->cat1.src_type = mov->cat1.dst_type =
      (def->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;

   if (instr)
      ir3_instr_move_before(mov, instr);
}

static void
reload_src_finalize(struct ra_ctx *ctx, struct ir3_instruction *instr,
                    struct ir3_register *src)
{
   struct ir3_register *reg = src->def;
   struct ra_interval *interval = &ctx->intervals[reg->name];

   if (!interval->needs_reload)
      return;

   reload_interval(ctx, instr, instr->block, interval);

   interval->needs_reload = false;
}

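/* Whether `instr` can consume a spilled (i.e. non-shared) source directly,
 * instead of requiring a reload back into a shared register.
 */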
static bool
can_demote_src(struct ir3_instruction *instr)
{
   switch (instr->opc) {
   case OPC_SCAN_MACRO:
   case OPC_META_COLLECT:
      return false;
   case OPC_MOV:
      /* non-shared -> shared floating-point conversions don't work */
      return (!(instr->dsts[0]->flags & IR3_REG_SHARED) ||
              (full_type(instr->cat1.src_type) != TYPE_F32 &&
               full_type(instr->cat1.dst_type) != TYPE_F32));
   default:
      return (!is_alu(instr) && !is_sfu(instr)) ||
             !(instr->dsts[0]->flags & IR3_REG_SHARED);
   }
}

/* Ensure that this source is never spilled while reloading other sources.
 */
static void
mark_src(struct ra_ctx *ctx, struct ir3_register *src)
{
   if (!(src->flags & IR3_REG_SHARED))
      return;

   struct ra_interval *interval = &ctx->intervals[src->def->name];

   if (interval->interval.inserted) {
      while (interval->interval.parent)
         interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);

      interval->src = true;
   }
}

static void
ensure_src_live(struct ra_ctx *ctx, struct ir3_instruction *instr,
                struct ir3_register *src)
{
   if (!(src->flags & IR3_REG_SHARED))
      return;

   struct ra_interval *interval = &ctx->intervals[src->def->name];

   if (!interval->interval.inserted) {
      /* In some cases we cannot demote a shared reg source to a non-shared
       * reg, and then we have to reload it.
       */
      assert(interval->spill_def);
      if (!can_demote_src(instr)) {
         reload_src(ctx, instr, src);
      } else {
         if (instr->opc == OPC_META_PARALLEL_COPY) {
            /* Stash away the original def to use later in case we actually have
             * to insert a reload.
             */
            _mesa_hash_table_insert(ctx->pcopy_src_map, src, src->def);
         }
         src->def = interval->spill_def;
         src->flags &= ~IR3_REG_SHARED;
      }
   }
}

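/* Write the allocated register number into the source, free the interval on
 * its last use, and clear the in-use-as-source flag on the enclosing top-level
 * interval.
 */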
static void
assign_src(struct ra_ctx *ctx, struct ir3_register *src)
{
   if (!(src->flags & IR3_REG_SHARED))
      return;

   struct ra_interval *interval = &ctx->intervals[src->def->name];
   assert(interval->interval.inserted);
   src->num = ra_physreg_to_num(ra_interval_get_physreg(interval), src->flags);

   if ((src->flags & IR3_REG_FIRST_KILL) &&
       !interval->interval.parent &&
       rb_tree_is_empty(&interval->interval.children))
      ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);

   while (interval->interval.parent)
      interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);

   interval->src = false;
}

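/* Allocate a physreg for a shared destination, reusing a killed tied source
 * when possible and demoting the instruction or spilling other intervals when
 * the register file is full.
 */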
static void
handle_dst(struct ra_ctx *ctx, struct ir3_instruction *instr,
           struct ir3_register *dst)
{
   if (!(dst->flags & IR3_REG_SHARED))
      return;

   struct ra_interval *interval = &ctx->intervals[dst->name];
   ra_interval_init(interval, dst);
   interval->spill_def = NULL;

   if (dst->tied) {
      struct ir3_register *tied_def = dst->tied->def;
      struct ra_interval *tied_interval = &ctx->intervals[tied_def->name];
      if ((dst->tied->flags & IR3_REG_KILL) &&
          !tied_interval->interval.parent &&
          rb_tree_is_empty(&tied_interval->interval.children)) {
         dst->num = dst->tied->num;
         interval->physreg_start = tied_interval->physreg_start;
         interval->physreg_end = tied_interval->physreg_end;
         ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
         return;
      }
   }

   physreg_t physreg = get_reg(ctx, dst, false);
   if (physreg == (physreg_t) ~0u) {
      if (try_demote_instruction(ctx, instr))
         return;

      unsigned size = reg_size(dst);
      physreg = find_best_spill_reg(ctx, dst, size, reg_elem_size(dst));
      assert(physreg != (physreg_t)~0u);
      free_space(ctx, physreg, size);
   }

   interval->physreg_start = physreg;
   interval->physreg_end = physreg + reg_size(dst);
   dst->num = ra_physreg_to_num(physreg, dst->flags);
   ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
   d("insert dst %u physreg %u", dst->name, physreg);

   if (dst->tied) {
      struct ir3_instruction *mov = ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY, 1, 1);
      unsigned flags = IR3_REG_SHARED | (dst->flags & IR3_REG_HALF);
      ir3_dst_create(mov, dst->num, flags)->wrmask = dst->wrmask;
      ir3_src_create(mov, dst->tied->num, flags)->wrmask = dst->wrmask;
      mov->cat1.src_type = mov->cat1.dst_type =
         (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      ir3_instr_move_before(mov, instr);
      dst->tied->num = dst->num;
   }
}

static void
handle_src_late(struct ra_ctx *ctx, struct ir3_instruction *instr,
                struct ir3_register *src)
{
   if (!(src->flags & IR3_REG_SHARED))
      return;

   struct ra_interval *interval = &ctx->intervals[src->def->name];
   reload_src_finalize(ctx, instr, src);

   /* Remove killed sources that have to be killed late due to being merged with
    * other defs.
    */
   if (!(src->flags & IR3_REG_KILL))
      return;

   if (interval->interval.inserted)
      ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
}

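/* Per-instruction driver: mark sources so they can't be spilled, make sure
 * each source is live (demoting or reloading as needed), assign source
 * registers, allocate destinations, and finally emit any pending reload movs
 * and late kills.
 */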
static void
handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   ra_foreach_src (src, instr)
      mark_src(ctx, src);

   ra_foreach_src (src, instr)
      ensure_src_live(ctx, instr, src);

   ra_foreach_src_rev (src, instr)
      assign_src(ctx, src);

   ra_foreach_dst (dst, instr)
      handle_dst(ctx, instr, dst);

   ra_foreach_src (src, instr)
      handle_src_late(ctx, instr, src);
}

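/* Splits whose source and destination are in the same merge set don't get a
 * fresh register: the dst inherits its position within the parent's allocation
 * (or a split of the parent's spill def if the parent was spilled).
 */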
static void
handle_split(struct ra_ctx *ctx, struct ir3_instruction *split)
{
   struct ir3_register *src = split->srcs[0];
   struct ir3_register *dst = split->dsts[0];

   if (!(dst->flags & IR3_REG_SHARED))
      return;

   if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
      handle_normal_instr(ctx, split);
      return;
   }

   struct ra_interval *src_interval = &ctx->intervals[src->def->name];
   struct ra_interval *dst_interval = &ctx->intervals[dst->name];

   ra_interval_init(dst_interval, dst);
   dst_interval->spill_def = NULL;

   if (src_interval->spill_def) {
      struct ir3_instruction *spill_split =
         ir3_instr_create(split->block, OPC_META_SPLIT, 1, 1);
      struct ir3_register *dst = __ssa_dst(spill_split);
      ir3_src_create(spill_split, INVALID_REG, IR3_REG_SSA)->def =
         src_interval->spill_def;
      spill_split->split.off = split->split.off;
      ir3_instr_move_after(spill_split, split);
      dst_interval->spill_def = dst;
      return;
   }

   dst_interval->physreg_start =
      src_interval->physreg_start + dst->merge_set_offset -
      src->def->merge_set_offset;
   dst_interval->physreg_end = dst_interval->physreg_start + reg_size(dst);
   ir3_reg_interval_insert(&ctx->reg_ctx, &dst_interval->interval);
   src->num = ra_interval_get_num(src_interval);
   dst->num = ra_interval_get_num(dst_interval);
   d("insert dst %u physreg %u", dst->name, dst_interval->physreg_start);

   if (src->flags & IR3_REG_KILL)
      ir3_reg_interval_remove(&ctx->reg_ctx, &src_interval->interval);
}

static void
handle_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
{
   struct ir3_register *dst = phi->dsts[0];

   if (!(dst->flags & IR3_REG_SHARED))
      return;

   struct ra_interval *dst_interval = &ctx->intervals[dst->name];
   ra_interval_init(dst_interval, dst);

   /* In some rare cases, it's possible to have a phi node with a physical-only
    * source. Here's a contrived example:
    *
    * loop {
    *    if non-uniform {
    *       if uniform {
    *          x_1 = ...;
    *          continue;
    *       }
    *       x_2 = ...;
    *    } else {
    *       break;
    *    }
    *    // continue block
    *    x_3 = phi(x_1, x_2)
    * }
    *
    * Assuming x_1 and x_2 are uniform, x_3 will also be uniform, because all
    * threads that stay in the loop take the same branch to the continue block.
    * However, execution may fall through from the assignment to x_2 to the
    * break statement because the outer if is non-uniform, and then it will fall
    * through again to the continue block, so if x_3 is to be in a shared reg
    * then the phi needs an extra source pointing to the break statement, which
    * itself needs a phi node:
    *
    * loop {
    *    if non-uniform {
    *       if uniform {
    *          x_1 = ...;
    *          continue;
    *       }
    *       x_2 = ...;
    *    } else {
    *       x_4 = phi(undef, x_2)
    *       break;
    *    }
    *    // continue block
    *    x_3 = phi(x_1, x_2, x_4)
    * }
    */

   /* phi nodes are special because we cannot spill them normally. Instead we
    * have to spill the parallel copies that their sources point to and make the
    * entire phi not shared anymore.
    */

   physreg_t physreg = get_reg(ctx, dst, false);
   if (physreg == (physreg_t) ~0u) {
      d("spilling phi destination");
      dst->flags &= ~IR3_REG_SHARED;
      dst_interval->spill_def = dst;
      phi->flags |= IR3_INSTR_SHARED_SPILL;

      foreach_src (src, phi) {
         src->flags &= ~IR3_REG_SHARED;
         if (src->def)
            src->def->flags &= ~IR3_REG_SHARED;
      }

      return;
   }

   dst->num = ra_physreg_to_num(physreg, dst->flags);
   dst_interval->spill_def = NULL;
   dst_interval->physreg_start = physreg;
   dst_interval->physreg_end = physreg + reg_size(dst);
   ir3_reg_interval_insert(&ctx->reg_ctx, &dst_interval->interval);

   ra_foreach_src_n (src, i, phi) {
      /* We assume that any phis with non-logical sources aren't promoted. */
      assert(i < phi->block->predecessors_count);
      src->num = dst->num;
      src->def->num = dst->num;
   }
}

static void
handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *pcopy)
{
   /* For parallel copies, we only handle the sources. The destination is
    * handled later when processing phi nodes.
    */

   ra_foreach_src (src, pcopy)
      mark_src(ctx, src);

   ra_foreach_src (src, pcopy)
      ensure_src_live(ctx, pcopy, src);

   ra_foreach_src_rev (src, pcopy)
      assign_src(ctx, src);

   ra_foreach_src (src, pcopy)
      handle_src_late(ctx, pcopy, src);
}

static void
handle_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
{
   instr->flags &= ~IR3_INSTR_SHARED_SPILL;

   switch (instr->opc) {
   case OPC_META_SPLIT:
      handle_split(ctx, instr);
      break;
   case OPC_META_PHI:
      handle_phi(ctx, instr);
      break;
   case OPC_META_PARALLEL_COPY:
      handle_pcopy(ctx, instr);
      break;
   default:
      handle_normal_instr(ctx, instr);
   }
}

/* If we define a value outside a loop, use it inside the loop, and then spill
 * it later inside the same loop, we would lose the value on the next
 * iteration, so we have to reload it. We have to reload it after any parallel
 * copy instruction, when the live shared registers equal the live-in of the
 * backedge. lower_pcopy() will then move any non-shared parallel copies down
 * past the reload.
 */
static void
reload_live_outs(struct ra_ctx *ctx, struct ir3_block *block)
{
   struct ra_block_state *state = &ctx->blocks[block->index];
   unsigned name;
   BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) {
      struct ir3_register *reg = ctx->live->definitions[name];

      struct ra_interval *interval = &ctx->intervals[name];
      if (!interval->interval.inserted) {
         d("reloading %d at end of backedge", reg->name);
         reload_interval(ctx, NULL, block, interval);
      }
   }
}

static void
record_pred_live_out(struct ra_ctx *ctx,
                     struct ra_interval *interval,
                     struct ir3_block *pred)
{
   struct ra_block_state *state = &ctx->blocks[pred->index];

   struct ir3_register *def = interval->interval.reg;
   BITSET_SET(state->live_out, def->name);

   rb_tree_foreach (struct ra_interval, child,
                    &interval->interval.children, interval.node) {
      record_pred_live_out(ctx, child, pred);
   }
}

static void
record_pred_live_outs(struct ra_ctx *ctx, struct ir3_block *block)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      struct ir3_block *pred = block->predecessors[i];
      struct ra_block_state *state = &ctx->blocks[pred->index];
      if (state->visited)
         continue;

      state->live_out = rzalloc_array(NULL, BITSET_WORD,
                                      BITSET_WORDS(ctx->live->definitions_count));

      rb_tree_foreach (struct ra_interval, interval,
                       &ctx->reg_ctx.intervals, interval.node) {
         record_pred_live_out(ctx, interval, pred);
      }
   }
}

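/* Allocate one block: make live-in shared defs live again, record which values
 * must be live at the end of any not-yet-visited predecessor (used for loop
 * backedges), process each instruction, and reload anything that must be live
 * across a backedge.
 */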
static void
handle_block(struct ra_ctx *ctx, struct ir3_block *block)
{
   ra_ctx_reset_block(ctx);

   unsigned name;
   BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
                       ctx->live->definitions_count) {
      struct ir3_register *def = ctx->live->definitions[name];
      struct ra_interval *interval = &ctx->intervals[name];

      /* Non-shared definitions may still be definitions we spilled by demoting
       * them, so we still need to initialize the interval. But we shouldn't
       * make these intervals live.
       */
      ra_interval_init(interval, def);

      if ((def->flags & IR3_REG_SHARED) && !interval->spill_def) {
         ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
      }
   }

   if (RA_DEBUG) {
      d("after live-in block %u:\n", block->index);
      ra_ctx_dump(ctx);
   }

   if (block->predecessors_count > 1)
      record_pred_live_outs(ctx, block);

   foreach_instr (instr, &block->instr_list) {
      di(instr, "processing");

      handle_instr(ctx, instr);

      if (RA_DEBUG)
         ra_ctx_dump(ctx);
   }

   if (block->successors[0]) {
      struct ra_block_state *state = &ctx->blocks[block->successors[0]->index];

      if (state->visited) {
         assert(!block->successors[1]);

         reload_live_outs(ctx, block);
      }
   }

   ctx->blocks[block->index].visited = true;
}

static void
lower_pcopy(struct ir3 *ir, struct ra_ctx *ctx)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         /* At this point due to spilling there may be parallel copies from
          * shared to non-shared registers and vice versa. Lowering these after
          * RA may produce cycles involving shared and non-shared registers,
          * which would need to be resolved by swapping a shared and non-shared
          * register which is something we can't handle. However by lowering
          * these to moves now, we can make sure that cycles only involve
          * non-shared registers. To avoid illegally moving a shared register
          * read or write across the parallel copy, which may have other
          * conflicting reads/writes if there's a cycle, we need to move copies
          * from non-shared to shared below the shared copies, and we need to
          * move copies from shared to non-shared above them. So, we have the
          * following order:
          *
          * 1. shared->non-shared copies (spills)
          * 2. shared->shared copies (one parallel copy as there may be cycles)
          * 3. non-shared->shared copies (reloads)
          * 4. non-shared->non-shared copies
          *
          * We split out the non-shared->non-shared copies as a separate step.
          */
         if (instr->opc == OPC_META_PARALLEL_COPY) {
            for (unsigned i = 0; i < instr->srcs_count; i++) {
               if ((instr->srcs[i]->flags & IR3_REG_SHARED) &&
                   !(instr->dsts[i]->flags & IR3_REG_SHARED)) {
                  /* shared->non-shared. Create a spill move and rewrite the
                   * source to be the destination of the move (so that the
                   * original shared->non-shared copy becomes a
                   * non-shared->non-shared copy).
                   */
                  struct ir3_instruction *mov =
                     ir3_instr_create(block, OPC_MOV, 1, 1);
                  mov->flags |= IR3_INSTR_SHARED_SPILL;
                  struct ir3_register *dst =
                     ir3_dst_create(mov, INVALID_REG, instr->dsts[i]->flags);
                  dst->wrmask = instr->dsts[i]->wrmask;
                  dst->instr = mov;
                  mov->repeat = reg_elems(mov->dsts[0]) - 1;
                  struct ir3_register *src =
                     ir3_src_create(mov, instr->srcs[i]->num,
                                    instr->srcs[i]->flags |
                                    (mov->repeat ? IR3_REG_R : 0));
                  src->wrmask = instr->srcs[i]->wrmask;
                  mov->cat1.dst_type = mov->cat1.src_type =
                     (mov->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
                  instr->srcs[i]->flags = mov->dsts[0]->flags;
                  instr->srcs[i]->def = mov->dsts[0];
                  ir3_instr_move_before(mov, instr);
               }
            }

            for (unsigned i = 0; i < instr->dsts_count;) {
               if ((instr->dsts[i]->flags & IR3_REG_SHARED) &&
                   (instr->srcs[i]->flags & IR3_REG_SSA) &&
                   !(instr->srcs[i]->flags & IR3_REG_SHARED)) {
                  /* non-shared->shared. Create a reload move.
                   */
                  struct ir3_instruction *mov =
                     ir3_instr_create(block, OPC_MOV, 1, 1);
                  mov->flags |= IR3_INSTR_SHARED_SPILL;
                  struct ir3_register *dst =
                     ir3_dst_create(mov, instr->dsts[i]->num,
                                    instr->dsts[i]->flags);
                  dst->instr = mov;
                  dst->wrmask = instr->dsts[i]->wrmask;
                  mov->repeat = reg_elems(mov->dsts[0]) - 1;
                  struct ir3_register *src =
                     ir3_src_create(mov, INVALID_REG, instr->srcs[i]->flags |
                                    (mov->repeat ? IR3_REG_R : 0));
                  src->def = instr->srcs[i]->def;
                  src->wrmask = instr->srcs[i]->wrmask;
                  mov->cat1.dst_type = mov->cat1.src_type =
                     (mov->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;

                  /* When we spill a parallel copy source, we lose the
                   * information of where it originally points to since we make
                   * it point to the spill def. If we later decide not to also
                   * spill the phi associated with it, we have to restore it
                   * here using the stashed original source so that RA
                   * validation can check that we did the correct thing.
                   *
                   * Because SSA-ness goes away after validation, this is really
                   * just about validation.
                   */
                  struct ir3_block *succ = block->successors[0];
                  unsigned pred_idx = ir3_block_get_pred_index(succ, block);
                  foreach_instr (phi, &succ->instr_list) {
                     if (phi->opc != OPC_META_PHI)
                        break;

                     if (phi->srcs[pred_idx]->def == instr->dsts[i]) {
                        struct ir3_register *def =
                           _mesa_hash_table_search(ctx->pcopy_src_map,
                                                   instr->srcs[i])->data;
                        phi->srcs[pred_idx]->def = def;
                        break;
                     }
                  }

                  instr->srcs[i] = instr->srcs[instr->srcs_count - 1];
                  instr->dsts[i] = instr->dsts[instr->dsts_count - 1];
                  instr->srcs_count--;
                  instr->dsts_count--;
                  ir3_instr_move_after(mov, instr);
                  continue;
               }

               i++;
            }

            /* Move any non-shared copies to a separate parallel copy
             * instruction right at the end of the block, after any reloads. At
             * this point all copies should be {shared,immediate}->shared or
             * {non-shared,immediate}->non-shared.
             */
            unsigned non_shared_copies = 0;
            for (unsigned i = 0; i < instr->dsts_count; i++) {
               if (!(instr->dsts[i]->flags & IR3_REG_SHARED))
                  non_shared_copies++;
            }

            if (non_shared_copies != 0) {
               struct ir3_instruction *pcopy =
                  ir3_instr_create(block, OPC_META_PARALLEL_COPY,
                                   non_shared_copies, non_shared_copies);

               unsigned j = 0;
               for (unsigned i = 0; i < instr->dsts_count;) {
                  if (!(instr->dsts[i]->flags & IR3_REG_SHARED)) {
                     pcopy->dsts[j] = instr->dsts[i];
                     pcopy->srcs[j] = instr->srcs[i];
                     pcopy->dsts[j]->instr = pcopy;
                     instr->srcs[i] = instr->srcs[instr->srcs_count - 1];
                     instr->dsts[i] = instr->dsts[instr->dsts_count - 1];
                     instr->srcs_count--;
                     instr->dsts_count--;
                     j++;
                     continue;
                  }
                  i++;
               }

               pcopy->srcs_count = pcopy->dsts_count = j;
               if (instr->dsts_count == 0)
                  list_del(&instr->node);
            }
         }
      }
   }
}

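/* Shared registers have been assigned physical register numbers at this point,
 * so drop the leftover SSA information on them.
 */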
static void
finalize(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            if (instr->dsts[i]->flags & IR3_REG_SHARED) {
               instr->dsts[i]->flags &= ~IR3_REG_SSA;
            }
         }

         for (unsigned i = 0; i < instr->srcs_count; i++) {
            if (instr->srcs[i]->flags & IR3_REG_SHARED) {
               instr->srcs[i]->flags &= ~IR3_REG_SSA;
               instr->srcs[i]->def = NULL;
            }
         }
      }
   }
}

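/* Entry point: allocate the shared register file block by block, lower the
 * parallel copies produced by spilling, run RA validation, and strip SSA info
 * from shared registers.
 */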
void
ir3_ra_shared(struct ir3_shader_variant *v, struct ir3_liveness *live)
{
   struct ra_ctx ctx;

   ra_ctx_init(&ctx);
   ctx.intervals = rzalloc_array(NULL, struct ra_interval,
                                 live->definitions_count);
   ctx.blocks = rzalloc_array(NULL, struct ra_block_state,
                              live->block_count);
   ctx.start = 0;
   ctx.live = live;
   ctx.pcopy_src_map = _mesa_pointer_hash_table_create(NULL);

   foreach_block (block, &v->ir->block_list) {
      handle_block(&ctx, block);
   }

   lower_pcopy(v->ir, &ctx);

   for (unsigned i = 0; i < live->block_count; i++) {
      if (ctx.blocks[i].live_out)
         ralloc_free(ctx.blocks[i].live_out);
   }

   ralloc_free(ctx.intervals);
   ralloc_free(ctx.pcopy_src_map);
   ralloc_free(ctx.blocks);

   ir3_ra_validate(v, RA_FULL_SIZE, RA_HALF_SIZE, live->block_count, true);
   finalize(v->ir);
}