1 /*
2 * Copyright (C) 2021 Valve Corporation
3 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "ir3_ra.h"
26 #include "util/rb_tree.h"
27 #include "util/u_math.h"
28 #include "ir3_shader.h"
29
30 /* This file implements an SSA-based register allocator. Unlike other
31 * SSA-based allocators, it handles vector split/collect "smartly," meaning
32 * that multiple values may share the same register interval. From the
33 * perspective of the allocator itself, only the top-level intervals matter,
34 * and the allocator is only concerned with allocating top-level intervals,
35 * which may mean moving other top-level intervals around. Other intervals,
36 * like the destination of a split instruction or the source of a collect
37 * instruction, are "locked" to their parent interval. The details of this are
38 * mostly handled by ir3_merge_regs and ir3_reg_ctx.
39 *
40 * We currently don't do any backtracking, but we do use the merge sets as a
41 * form of affinity to try to avoid moves from phis/splits/collects. Each
42 * merge set is what a more "classic" graph-coloring or live-range based
43 * allocator would consider a single register, but here we use it as merely a
44 * hint, except when multiple overlapping values are live at the same time.
45 * Each merge set has a "preferred" register, and we try to honor that when
46 * allocating values in the merge set.
47 */
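/* For illustration, a rough sketch of how split/collect values end up sharing
 * one interval (SSA names and offsets below are made up):
 *
 *    ssa_1 = collect ssa_2, ssa_3
 *
 * ir3_merge_regs() folds ssa_2 and ssa_3 into ssa_1's interval, so this
 * allocator only ever places the top-level interval for ssa_1; ssa_2 and
 * ssa_3 stay locked at fixed offsets inside whatever registers ssa_1 gets
 * and are never moved independently.
 */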
48
49 /* ir3_reg_ctx implementation. */
50
51 static int
52 ir3_reg_interval_cmp(const struct rb_node *node, const void *data)
53 {
54 unsigned reg = *(const unsigned *)data;
55 const struct ir3_reg_interval *interval =
56 ir3_rb_node_to_interval_const(node);
57 if (interval->reg->interval_start > reg)
58 return -1;
59 else if (interval->reg->interval_end <= reg)
60 return 1;
61 else
62 return 0;
63 }
64
65 static struct ir3_reg_interval *
66 ir3_reg_interval_search(struct rb_tree *tree, unsigned offset)
67 {
68 struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp);
69 return node ? ir3_rb_node_to_interval(node) : NULL;
70 }
71
72 static struct ir3_reg_interval *
73 ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset)
74 {
75 struct rb_node *node =
76 rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp);
77 return node ? ir3_rb_node_to_interval(node) : NULL;
78 }
79
80 /* Get the interval covering the reg, or the closest to the right if it
81 * doesn't exist.
82 */
83 static struct ir3_reg_interval *
84 ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset)
85 {
86 struct ir3_reg_interval *interval =
87 ir3_reg_interval_search_sloppy(tree, offset);
88 if (!interval) {
89 return NULL;
90 } else if (interval->reg->interval_end > offset) {
91 return interval;
92 } else {
93 /* There is no interval covering reg, and ir3_reg_interval_search_sloppy()
94 * returned the closest range to the left, so the next interval to the
95 * right should be the closest to the right.
96 */
97 return ir3_reg_interval_next_or_null(interval);
98 }
99 }
100
101 static int
102 ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
103 {
104 const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a);
105 const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b);
106 return b->reg->interval_start - a->reg->interval_start;
107 }
108
109 static void
110 interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree,
111 struct ir3_reg_interval *interval)
112 {
113 struct ir3_reg_interval *right =
114 ir3_reg_interval_search_right(tree, interval->reg->interval_start);
115 if (right && right->reg->interval_start < interval->reg->interval_end) {
116 /* We disallow trees where different members have different half-ness.
117 * This means that we can't treat bitcasts as copies like normal
118 * split/collect, so something like this would require an extra copy
119 * in mergedregs mode, and count as 4 half-units of register pressure
120 * instead of 2:
121 *
122 * f16vec2 foo = unpackFloat2x16(bar)
123 * ... = foo.x
124 * ... = bar
125 *
126 * However, relaxing this rule would open a huge can of worms. What
127 * happens when there's a vector of 16 things, and the fifth element
128 * has been bitcasted as a half-reg? Would that element alone have to
129 * be small enough to be used as a half-reg source? Let's keep that
130 * can of worms firmly shut for now.
131 */
132 assert((interval->reg->flags & IR3_REG_HALF) ==
133 (right->reg->flags & IR3_REG_HALF));
134
135 if (right->reg->interval_end <= interval->reg->interval_end &&
136 right->reg->interval_start >= interval->reg->interval_start) {
137 /* Check if we're inserting something that's already inserted */
138 assert(interval != right);
139
140 /* "right" is contained in "interval" and must become a child of
141 * it. There may be further children too.
142 */
143 for (struct ir3_reg_interval *next = ir3_reg_interval_next(right);
144 right && right->reg->interval_start < interval->reg->interval_end;
145 right = next, next = ir3_reg_interval_next_or_null(next)) {
146 /* "right" must be contained in "interval." */
147 assert(right->reg->interval_end <= interval->reg->interval_end);
148 assert((interval->reg->flags & IR3_REG_HALF) ==
149 (right->reg->flags & IR3_REG_HALF));
150 if (!right->parent)
151 ctx->interval_delete(ctx, right);
152 right->parent = interval;
153 rb_tree_remove(tree, &right->node);
154 rb_tree_insert(&interval->children, &right->node,
155 ir3_reg_interval_insert_cmp);
156 }
157 } else {
158 /* "right" must contain "interval," since intervals must form a
159 * tree.
160 */
161 assert(right->reg->interval_start <= interval->reg->interval_start);
162 interval->parent = right;
163 interval_insert(ctx, &right->children, interval);
164 return;
165 }
166 }
167
168 if (!interval->parent)
169 ctx->interval_add(ctx, interval);
170 rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp);
171 interval->inserted = true;
172 }
173
174 void
175 ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,
176 struct ir3_reg_interval *interval)
177 {
178 rb_tree_init(&interval->children);
179 interval->parent = NULL;
180 interval_insert(ctx, &ctx->intervals, interval);
181 }
182
183 /* Call after ir3_reg_interval_remove_temp() to reinsert the interval */
184 static void
185 ir3_reg_interval_reinsert(struct ir3_reg_ctx *ctx,
186 struct ir3_reg_interval *interval)
187 {
188 interval->parent = NULL;
189 interval_insert(ctx, &ctx->intervals, interval);
190 }
191
192 void
193 ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
194 struct ir3_reg_interval *interval)
195 {
196 assert(interval->inserted);
197
198 if (interval->parent) {
199 rb_tree_remove(&interval->parent->children, &interval->node);
200 } else {
201 ctx->interval_delete(ctx, interval);
202 rb_tree_remove(&ctx->intervals, &interval->node);
203 }
204
205 rb_tree_foreach_safe (struct ir3_reg_interval, child, &interval->children,
206 node) {
207 rb_tree_remove(&interval->children, &child->node);
208 child->parent = interval->parent;
209
210 if (interval->parent) {
211 rb_tree_insert(&child->parent->children, &child->node,
212 ir3_reg_interval_insert_cmp);
213 } else {
214 ctx->interval_readd(ctx, interval, child);
215 rb_tree_insert(&ctx->intervals, &child->node,
216 ir3_reg_interval_insert_cmp);
217 }
218 }
219
220 interval->inserted = false;
221 }
222
223 static void
224 _mark_free(struct ir3_reg_interval *interval)
225 {
226 interval->inserted = false;
227 rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
228 _mark_free(child);
229 }
230 }
231
232 /* Remove an interval and all its children from the tree. */
233 void
234 ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,
235 struct ir3_reg_interval *interval)
236 {
237 assert(!interval->parent);
238
239 ctx->interval_delete(ctx, interval);
240 rb_tree_remove(&ctx->intervals, &interval->node);
241 _mark_free(interval);
242 }
243
244 /* Used when popping an interval to be shuffled around. Don't disturb children
245 * so that it can be later reinserted.
246 */
247 static void
248 ir3_reg_interval_remove_temp(struct ir3_reg_ctx *ctx,
249 struct ir3_reg_interval *interval)
250 {
251 assert(!interval->parent);
252
253 ctx->interval_delete(ctx, interval);
254 rb_tree_remove(&ctx->intervals, &interval->node);
255 }
256
257 static void
258 interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval,
259 unsigned indent)
260 {
261 for (unsigned i = 0; i < indent; i++)
262 mesa_log_stream_printf(stream, "\t");
263 mesa_log_stream_printf(stream, "reg %u start %u\n", interval->reg->name,
264 interval->reg->interval_start);
265
266 rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
267 interval_dump(stream, child, indent + 1);
268 }
269
270 for (unsigned i = 0; i < indent; i++)
271 mesa_log_stream_printf(stream, "\t");
272 mesa_log_stream_printf(stream, "reg %u end %u\n", interval->reg->name,
273 interval->reg->interval_end);
274 }
275
276 void
277 ir3_reg_interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval)
278 {
279 interval_dump(stream, interval, 0);
280 }
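/* A minimal sketch of how a client is expected to drive ir3_reg_ctx (ra_file
 * below is the real user; the names here are placeholders):
 *
 *    struct ir3_reg_ctx ctx;
 *    rb_tree_init(&ctx.intervals);
 *    ctx.interval_add = my_add;       // a top-level interval appears
 *    ctx.interval_delete = my_delete; // a top-level interval disappears
 *    ctx.interval_readd = my_readd;   // a child becomes top-level again
 *
 *    ir3_reg_interval_init(&iv, reg);
 *    ir3_reg_interval_insert(&ctx, &iv);  // may adopt or nest under others
 *    ...
 *    ir3_reg_interval_remove(&ctx, &iv);  // children are re-parented, not lost
 */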
281
282 /* These are the core data structures used by the register allocator. First
283 * ra_interval and ra_file, which are used for intra-block tracking and use
284 * the ir3_reg_ctx infrastructure:
285 */
286
287 struct ra_interval {
288 struct ir3_reg_interval interval;
289
290 struct rb_node physreg_node;
291 physreg_t physreg_start, physreg_end;
292
293 /* True if this is a source of the current instruction which is entirely
294 * killed. This means we can allocate the dest over it, but we can't break
295 * it up.
296 */
297 bool is_killed;
298
299 /* True if this interval cannot be moved from its position. This is only
300 * used for precolored inputs to ensure that other inputs don't get
301 * allocated on top of them.
302 */
303 bool frozen;
304 };
305
306 struct ra_file {
307 struct ir3_reg_ctx reg_ctx;
308
309 BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
310 BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
311
312 struct rb_tree physreg_intervals;
313
314 unsigned size;
315 unsigned start;
316 };
317
318 /* State for inter-block tracking. When we split a live range to make space
319 * for a vector, we may need to insert fixup code when a block has multiple
320 * predecessors that have moved the same live value to different registers.
321 * This keeps track of state required to do that.
322 */
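/* For example (register numbers invented): if block C has predecessors A and
 * B, and A left ssa_5 in r1.x while shuffling in B moved it to r3.x, fixup
 * moves are emitted so that C sees ssa_5 in a single agreed-upon register.
 */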
323
324 struct ra_block_state {
325 /* Map of defining ir3_register -> physreg it was allocated to at the end
326 * of the block.
327 */
328 struct hash_table *renames;
329
330 /* For loops, we need to process a block before all its predecessors have
331 * been processed. In particular, we need to pick registers for values
332 * without knowing if all the predecessors have been renamed. This keeps
333 * track of the registers we chose so that when we visit the back-edge we
334 * can move them appropriately. If all predecessors have been visited
335 * before this block is visited then we don't need to fill this out. This
336 * is a map from ir3_register -> physreg.
337 */
338 struct hash_table *entry_regs;
339
340 /* True if the block has been visited and "renames" is complete.
341 */
342 bool visited;
343 };
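/* Example of entry_regs in action (values invented): when a loop header is
 * allocated we might pick r2.x for ssa_7 before the back-edge predecessor has
 * been processed. entry_regs records ssa_7 -> r2.x, so when the back-edge
 * block is eventually visited and ssa_7 turns out to live in r5.x there, we
 * know to move it back into r2.x on that edge.
 */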
344
345 struct ra_parallel_copy {
346 struct ra_interval *interval;
347 physreg_t src;
348 };
349
350 /* The main context: */
351
352 struct ra_ctx {
353 /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom
354 * half of this file too.
355 */
356 struct ra_file full;
357
358 /* hr0.x - hr63.w, only used without merged-regs. */
359 struct ra_file half;
360
361 /* Shared regs. */
362 struct ra_file shared;
363
364 struct ir3_liveness *live;
365
366 struct ir3_block *block;
367
368 const struct ir3_compiler *compiler;
369 gl_shader_stage stage;
370
371 /* Pending moves of top-level intervals that will be emitted once we're
372 * finished:
373 */
374 DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies);
375
376 struct ra_interval *intervals;
377 struct ra_block_state *blocks;
378
379 bool merged_regs;
380 };
381
382 #define foreach_interval(interval, file) \
383 rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals, \
384 physreg_node)
385 #define foreach_interval_rev(interval, file) \
386 rb_tree_foreach_rev (struct ra_interval, interval, &(file)->physreg_intervals, \
387 physreg_node)
388 #define foreach_interval_safe(interval, file) \
389 rb_tree_foreach_safe (struct ra_interval, interval, \
390 &(file)->physreg_intervals, physreg_node)
391 #define foreach_interval_rev_safe(interval, file) \
392 rb_tree_foreach_rev_safe(struct ra_interval, interval, \
393 &(file)->physreg_intervals, physreg_node)
394
395 static struct ra_interval *
396 rb_node_to_interval(struct rb_node *node)
397 {
398 return rb_node_data(struct ra_interval, node, physreg_node);
399 }
400
401 static const struct ra_interval *
402 rb_node_to_interval_const(const struct rb_node *node)
403 {
404 return rb_node_data(struct ra_interval, node, physreg_node);
405 }
406
407 static struct ra_interval *
408 ra_interval_next(struct ra_interval *interval)
409 {
410 struct rb_node *next = rb_node_next(&interval->physreg_node);
411 return next ? rb_node_to_interval(next) : NULL;
412 }
413
414 static struct ra_interval *
415 ra_interval_next_or_null(struct ra_interval *interval)
416 {
417 return interval ? ra_interval_next(interval) : NULL;
418 }
419
420 static int
421 ra_interval_cmp(const struct rb_node *node, const void *data)
422 {
423 physreg_t reg = *(const physreg_t *)data;
424 const struct ra_interval *interval = rb_node_to_interval_const(node);
425 if (interval->physreg_start > reg)
426 return -1;
427 else if (interval->physreg_end <= reg)
428 return 1;
429 else
430 return 0;
431 }
432
433 static struct ra_interval *
434 ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
435 {
436 struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
437 return node ? rb_node_to_interval(node) : NULL;
438 }
439
440 /* Get the interval covering the reg, or the closest to the right if it
441 * doesn't exist.
442 */
443 static struct ra_interval *
444 ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
445 {
446 struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
447 if (!interval) {
448 return NULL;
449 } else if (interval->physreg_end > reg) {
450 return interval;
451 } else {
452 /* There is no interval covering reg, and ra_file_search_sloppy()
453 * returned the closest range to the left, so the next interval to the
454 * right should be the closest to the right.
455 */
456 return ra_interval_next_or_null(interval);
457 }
458 }
459
460 static struct ra_interval *
461 ra_file_search_right(struct ra_file *file, physreg_t reg)
462 {
463 return ra_interval_search_right(&file->physreg_intervals, reg);
464 }
465
466 static int
467 ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
468 {
469 const struct ra_interval *a = rb_node_to_interval_const(_a);
470 const struct ra_interval *b = rb_node_to_interval_const(_b);
471 return b->physreg_start - a->physreg_start;
472 }
473
474 static struct ra_interval *
475 ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
476 {
477 return rb_node_data(struct ra_interval, interval, interval);
478 }
479
480 static struct ra_file *
481 ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx)
482 {
483 return rb_node_data(struct ra_file, ctx, reg_ctx);
484 }
485
486 static void
487 interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
488 {
489 struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
490 struct ra_file *file = ir3_reg_ctx_to_file(ctx);
491
492 /* We can assume in this case that physreg_start/physreg_end is already
493 * initialized.
494 */
495 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
496 BITSET_CLEAR(file->available, i);
497 BITSET_CLEAR(file->available_to_evict, i);
498 }
499
500 rb_tree_insert(&file->physreg_intervals, &interval->physreg_node,
501 ra_interval_insert_cmp);
502 }
503
504 static void
505 interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
506 {
507 struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
508 struct ra_file *file = ir3_reg_ctx_to_file(ctx);
509
510 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
511 BITSET_SET(file->available, i);
512 BITSET_SET(file->available_to_evict, i);
513 }
514
515 rb_tree_remove(&file->physreg_intervals, &interval->physreg_node);
516 }
517
518 static void
519 interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
520 struct ir3_reg_interval *_child)
521 {
522 struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
523 struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);
524
525 child->physreg_start =
526 parent->physreg_start + (child->interval.reg->interval_start -
527 parent->interval.reg->interval_start);
528 child->physreg_end =
529 child->physreg_start +
530 (child->interval.reg->interval_end - child->interval.reg->interval_start);
531
532 interval_add(ctx, _child);
533 }
534
535 static void
536 ra_file_init(struct ra_file *file)
537 {
538 for (unsigned i = 0; i < file->size; i++) {
539 BITSET_SET(file->available, i);
540 BITSET_SET(file->available_to_evict, i);
541 }
542
543 rb_tree_init(&file->reg_ctx.intervals);
544 rb_tree_init(&file->physreg_intervals);
545
546 file->reg_ctx.interval_add = interval_add;
547 file->reg_ctx.interval_delete = interval_delete;
548 file->reg_ctx.interval_readd = interval_readd;
549 }
550
551 static void
552 ra_file_insert(struct ra_file *file, struct ra_interval *interval)
553 {
554 assert(interval->physreg_start < interval->physreg_end);
555 assert(interval->physreg_end <= file->size);
556 if (interval->interval.reg->flags & IR3_REG_HALF)
557 assert(interval->physreg_end <= RA_HALF_SIZE);
558
559 ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
560 }
561
562 static void
563 ra_file_remove(struct ra_file *file, struct ra_interval *interval)
564 {
565 ir3_reg_interval_remove(&file->reg_ctx, &interval->interval);
566 }
567
568 static void
569 ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval)
570 {
571 assert(!interval->interval.parent);
572
573 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
574 BITSET_SET(file->available, i);
575 }
576
577 interval->is_killed = true;
578 }
579
580 static void
581 ra_file_unmark_killed(struct ra_file *file, struct ra_interval *interval)
582 {
583 assert(!interval->interval.parent);
584
585 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
586 BITSET_CLEAR(file->available, i);
587 }
588
589 interval->is_killed = false;
590 }
591
592 static physreg_t
593 ra_interval_get_physreg(const struct ra_interval *interval)
594 {
595 unsigned child_start = interval->interval.reg->interval_start;
596
597 while (interval->interval.parent) {
598 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
599 }
600
601 return interval->physreg_start +
602 (child_start - interval->interval.reg->interval_start);
603 }
604
605 static unsigned
606 ra_interval_get_num(const struct ra_interval *interval)
607 {
608 return ra_physreg_to_num(ra_interval_get_physreg(interval),
609 interval->interval.reg->flags);
610 }
611
612 static void
613 ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
614 {
615 ir3_reg_interval_init(&interval->interval, reg);
616 interval->is_killed = false;
617 interval->frozen = false;
618 }
619
620 static void
621 ra_interval_dump(struct log_stream *stream, struct ra_interval *interval)
622 {
623 mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start);
624
625 ir3_reg_interval_dump(stream, &interval->interval);
626 }
627
628 static void
629 ra_file_dump(struct log_stream *stream, struct ra_file *file)
630 {
631 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
632 physreg_node) {
633 ra_interval_dump(stream, interval);
634 }
635
636 unsigned start, end;
637 mesa_log_stream_printf(stream, "available:\n");
638 BITSET_FOREACH_RANGE (start, end, file->available, file->size) {
639 mesa_log_stream_printf(stream, "%u-%u ", start, end);
640 }
641 mesa_log_stream_printf(stream, "\n");
642
643 mesa_log_stream_printf(stream, "available to evict:\n");
644 BITSET_FOREACH_RANGE (start, end, file->available_to_evict, file->size) {
645 mesa_log_stream_printf(stream, "%u-%u ", start, end);
646 }
647 mesa_log_stream_printf(stream, "\n");
648 mesa_log_stream_printf(stream, "start: %u\n", file->start);
649 }
650
651 static void
652 ra_ctx_dump(struct ra_ctx *ctx)
653 {
654 struct log_stream *stream = mesa_log_streami();
655 mesa_log_stream_printf(stream, "full:\n");
656 ra_file_dump(stream, &ctx->full);
657 mesa_log_stream_printf(stream, "half:\n");
658 ra_file_dump(stream, &ctx->half);
659 mesa_log_stream_printf(stream, "shared:");
660 ra_file_dump(stream, &ctx->shared);
661 mesa_log_stream_destroy(stream);
662 }
663
664 static unsigned
665 reg_file_size(struct ra_file *file, struct ir3_register *reg)
666 {
667 /* Half-regs can only take up the first half of the combined regfile */
668 if (reg->flags & IR3_REG_HALF)
669 return MIN2(file->size, RA_HALF_SIZE);
670 else
671 return file->size;
672 }
673
674 /* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple
675 * top-level intervals at once. Pop multiple intervals, then push them back in
676 * any order.
677 */
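/* A sketch of the intended usage, mirroring the swap performed in
 * try_evict_regs() below (variable names are illustrative):
 *
 *    physreg_t start_a = a->physreg_start, start_b = b->physreg_start;
 *    struct ra_removed_interval rem_a = ra_pop_interval(ctx, file, a);
 *    struct ra_removed_interval rem_b = ra_pop_interval(ctx, file, b);
 *    ra_push_interval(ctx, file, &rem_a, start_b);
 *    ra_push_interval(ctx, file, &rem_b, start_a);
 *
 * Popping also records the original location in ctx->parallel_copies, so the
 * shuffle eventually becomes part of a single parallel-copy instruction.
 */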
678
679 struct ra_removed_interval {
680 struct ra_interval *interval;
681 unsigned size;
682 };
683
684 static struct ra_removed_interval
685 ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,
686 struct ra_interval *interval)
687 {
688 assert(!interval->interval.parent);
689 /* shared live splitting is not allowed! */
690 assert(!(interval->interval.reg->flags & IR3_REG_SHARED));
691
692 /* Check if we've already moved this reg before */
693 unsigned pcopy_index;
694 for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count;
695 pcopy_index++) {
696 if (ctx->parallel_copies[pcopy_index].interval == interval)
697 break;
698 }
699
700 if (pcopy_index == ctx->parallel_copies_count) {
701 array_insert(ctx, ctx->parallel_copies,
702 (struct ra_parallel_copy){
703 .interval = interval,
704 .src = interval->physreg_start,
705 });
706 }
707
708 ir3_reg_interval_remove_temp(&file->reg_ctx, &interval->interval);
709
710 return (struct ra_removed_interval){
711 .interval = interval,
712 .size = interval->physreg_end - interval->physreg_start,
713 };
714 }
715
716 static void
717 ra_push_interval(struct ra_ctx *ctx, struct ra_file *file,
718 const struct ra_removed_interval *removed, physreg_t dst)
719 {
720 struct ra_interval *interval = removed->interval;
721
722 interval->physreg_start = dst;
723 interval->physreg_end = dst + removed->size;
724
725 assert(interval->physreg_end <= file->size);
726 if (interval->interval.reg->flags & IR3_REG_HALF)
727 assert(interval->physreg_end <= RA_HALF_SIZE);
728
729 ir3_reg_interval_reinsert(&file->reg_ctx, &interval->interval);
730 }
731
732 /* Pick up the interval and place it at "dst". */
733 static void
734 ra_move_interval(struct ra_ctx *ctx, struct ra_file *file,
735 struct ra_interval *interval, physreg_t dst)
736 {
737 struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval);
738 ra_push_interval(ctx, file, &temp, dst);
739 }
740
741 static struct ra_file *
742 ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
743 {
744 if (reg->flags & IR3_REG_SHARED)
745 return &ctx->shared;
746 else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
747 return &ctx->full;
748 else
749 return &ctx->half;
750 }
751
752
753 /* Returns true if the proposed spot for "dst" or a killed source overlaps a
754 * destination that's been allocated.
755 */
756 static bool
757 check_dst_overlap(struct ra_ctx *ctx, struct ra_file *file,
758 struct ir3_register *dst, physreg_t start,
759 physreg_t end)
760 {
761 struct ir3_instruction *instr = dst->instr;
762
763 ra_foreach_dst (other_dst, instr) {
764 /* We assume only destinations before the current one have been allocated.
765 */
766 if (other_dst == dst)
767 break;
768
769 if (ra_get_file(ctx, other_dst) != file)
770 continue;
771
772 struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
773 assert(!other_interval->interval.parent);
774 physreg_t other_start = other_interval->physreg_start;
775 physreg_t other_end = other_interval->physreg_end;
776
777 if (other_end > start && end > other_start)
778 return true;
779 }
780
781 return false;
782 }
783
784 /* True if the destination is "early-clobber," meaning that it cannot be
785 * allocated over killed sources. Some destinations always require it, but it
786 * also is implicitly true for tied destinations whose source is live-through.
787 * If the source is killed, then we skip allocating a register for the
788 * destination altogether so we don't need to worry about that case here.
789 */
790 static bool
791 is_early_clobber(struct ir3_register *reg)
792 {
793 return (reg->flags & IR3_REG_EARLY_CLOBBER) || reg->tied;
794 }
795
796 static bool
797 get_reg_specified(struct ra_ctx *ctx, struct ra_file *file,
798 struct ir3_register *reg, physreg_t physreg, bool is_source)
799 {
800 for (unsigned i = 0; i < reg_size(reg); i++) {
801 if (!BITSET_TEST(is_early_clobber(reg) || is_source ?
802 file->available_to_evict : file->available,
803 physreg + i))
804 return false;
805 }
806
807 if (!is_source &&
808 check_dst_overlap(ctx, file, reg, physreg, physreg + reg_size(reg)))
809 return false;
810
811 return true;
812 }
813
814 /* Try to evict any registers conflicting with the proposed spot "physreg" for
815 * "reg". That is, move them to other places so that we can allocate "physreg"
816 * here.
817 */
818
819 static bool
820 try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
821 struct ir3_register *reg, physreg_t physreg,
822 unsigned *_eviction_count, bool is_source, bool speculative)
823 {
824 BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
825 memcpy(available_to_evict, file->available_to_evict,
826 sizeof(available_to_evict));
827
828 BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
829 memcpy(available, file->available, sizeof(available));
830
831 for (unsigned i = 0; i < reg_size(reg); i++) {
832 BITSET_CLEAR(available_to_evict, physreg + i);
833 BITSET_CLEAR(available, physreg + i);
834 }
835
836 unsigned eviction_count = 0;
837 /* Iterate over each range conflicting with physreg */
838 for (struct ra_interval *conflicting = ra_file_search_right(file, physreg),
839 *next = ra_interval_next_or_null(conflicting);
840 conflicting != NULL &&
841 conflicting->physreg_start < physreg + reg_size(reg);
842 conflicting = next, next = ra_interval_next_or_null(next)) {
843 if (!is_early_clobber(reg) && !is_source && conflicting->is_killed)
844 continue;
845
846 if (conflicting->frozen) {
847 assert(speculative);
848 return false;
849 }
850
851 unsigned conflicting_file_size =
852 reg_file_size(file, conflicting->interval.reg);
853 unsigned avail_start, avail_end;
854 bool evicted = false;
855 BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict,
856 conflicting_file_size) {
857 unsigned size = avail_end - avail_start;
858
859 /* non-half registers must be aligned */
860 if (!(conflicting->interval.reg->flags & IR3_REG_HALF) &&
861 avail_start % 2 == 1) {
862 avail_start++;
863 size--;
864 }
865
866 unsigned conflicting_size =
867 conflicting->physreg_end - conflicting->physreg_start;
868 if (size >= conflicting_size &&
869 !check_dst_overlap(ctx, file, reg, avail_start, avail_start +
870 conflicting_size)) {
871 for (unsigned i = 0;
872 i < conflicting->physreg_end - conflicting->physreg_start; i++)
873 BITSET_CLEAR(available_to_evict, avail_start + i);
874 eviction_count +=
875 conflicting->physreg_end - conflicting->physreg_start;
876 if (!speculative)
877 ra_move_interval(ctx, file, conflicting, avail_start);
878 evicted = true;
879 break;
880 }
881 }
882
883 if (evicted)
884 continue;
885
886 /* If we couldn't evict this range, but the register we're allocating is
887 * allowed to overlap with a killed range, then we may be able to swap it
888 * with a killed range to achieve the same effect.
889 */
890 if (is_early_clobber(reg) || is_source)
891 return false;
892
893 foreach_interval (killed, file) {
894 if (!killed->is_killed)
895 continue;
896
897 if (killed->physreg_end - killed->physreg_start !=
898 conflicting->physreg_end - conflicting->physreg_start)
899 continue;
900
901 if (killed->physreg_end > conflicting_file_size ||
902 conflicting->physreg_end > reg_file_size(file, killed->interval.reg))
903 continue;
904
905 /* We can't swap the killed range if it partially/fully overlaps the
906 * space we're trying to allocate or (in speculative mode) if it's
907 * already been swapped and will overlap when we actually evict.
908 */
909 bool killed_available = true;
910 for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
911 if (!BITSET_TEST(available, i)) {
912 killed_available = false;
913 break;
914 }
915 }
916
917 if (!killed_available)
918 continue;
919
920 if (check_dst_overlap(ctx, file, reg, killed->physreg_start,
921 killed->physreg_end))
922 continue;
923
924 /* Check for alignment if one is a full reg */
925 if ((!(killed->interval.reg->flags & IR3_REG_HALF) ||
926 !(conflicting->interval.reg->flags & IR3_REG_HALF)) &&
927 (killed->physreg_start % 2 != 0 ||
928 conflicting->physreg_start % 2 != 0))
929 continue;
930
931 for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
932 BITSET_CLEAR(available, i);
933 }
934 /* Because this will generate swaps instead of moves, multiply the
935 * cost by 2.
936 */
937 eviction_count += (killed->physreg_end - killed->physreg_start) * 2;
938 if (!speculative) {
939 physreg_t killed_start = killed->physreg_start,
940 conflicting_start = conflicting->physreg_start;
941 struct ra_removed_interval killed_removed =
942 ra_pop_interval(ctx, file, killed);
943 struct ra_removed_interval conflicting_removed =
944 ra_pop_interval(ctx, file, conflicting);
945 ra_push_interval(ctx, file, &killed_removed, conflicting_start);
946 ra_push_interval(ctx, file, &conflicting_removed, killed_start);
947 }
948
949 evicted = true;
950 break;
951 }
952
953 if (!evicted)
954 return false;
955 }
956
957 *_eviction_count = eviction_count;
958 return true;
959 }
960
961 static int
962 removed_interval_cmp(const void *_i1, const void *_i2)
963 {
964 const struct ra_removed_interval *i1 = _i1;
965 const struct ra_removed_interval *i2 = _i2;
966
967 /* We sort the registers as follows:
968 *
969 * |------------------------------------------------------------------------------------------|
970 * | | | | | | |
971 * | Half | Half early-clobber | Half | Full | Full early-clobber | Full |
972 * | live-through | destination | killed | killed | destination | live-through |
973 * | | | | | | |
974 * |------------------------------------------------------------------------------------------|
975 * | |
976 * | Destination |
977 * | |
978 * |-----------------|
979 *
980 * Half-registers have to be first so that they stay in the low half of
981 * the register file. Then half and full killed must stay together so that
982 * there's a contiguous range where we can put the register. With this
983 * structure we should be able to accommodate any collection of intervals
984 * such that the total number of half components is within the half limit
985 * and the combined components are within the full limit.
986 */
987
988 unsigned i1_align = reg_elem_size(i1->interval->interval.reg);
989 unsigned i2_align = reg_elem_size(i2->interval->interval.reg);
990 if (i1_align > i2_align)
991 return 1;
992 if (i1_align < i2_align)
993 return -1;
994
995 if (i1_align == 1) {
996 if (i2->interval->is_killed)
997 return -1;
998 if (i1->interval->is_killed)
999 return 1;
1000 } else {
1001 if (i2->interval->is_killed)
1002 return 1;
1003 if (i1->interval->is_killed)
1004 return -1;
1005 }
1006
1007 return 0;
1008 }
1009
1010 static int
1011 dsts_cmp(const void *_i1, const void *_i2)
1012 {
1013 struct ir3_register *i1 = *(struct ir3_register *const *) _i1;
1014 struct ir3_register *i2 = *(struct ir3_register *const *) _i2;
1015
1016 /* Treat tied destinations as-if they are live-through sources, and normal
1017 * destinations as killed sources.
1018 */
1019 unsigned i1_align = reg_elem_size(i1);
1020 unsigned i2_align = reg_elem_size(i2);
1021 if (i1_align > i2_align)
1022 return 1;
1023 if (i1_align < i2_align)
1024 return -1;
1025
1026 if (i1_align == 1) {
1027 if (!is_early_clobber(i2))
1028 return -1;
1029 if (!is_early_clobber(i1))
1030 return 1;
1031 } else {
1032 if (!is_early_clobber(i2))
1033 return 1;
1034 if (!is_early_clobber(i1))
1035 return -1;
1036 }
1037
1038 return 0;
1039 }
1040
1041 /* "Compress" all the live intervals so that there is enough space for the
1042 * destination register. As there can be gaps when a more-aligned interval
1043 * follows a less-aligned interval, this also sorts them to remove such
1044 * "padding", which may be required when space is very tight. This isn't
1045 * amazing, but should be used only as a last resort in case the register file
1046 * is almost full and badly fragmented.
1047 *
1048 * Return the physreg to use.
1049 */
1050 static physreg_t
1051 compress_regs_left(struct ra_ctx *ctx, struct ra_file *file,
1052 struct ir3_register *reg)
1053 {
1054 unsigned reg_align = reg_elem_size(reg);
1055 DECLARE_ARRAY(struct ra_removed_interval, intervals);
1056 intervals_count = intervals_sz = 0;
1057 intervals = NULL;
1058
1059 DECLARE_ARRAY(struct ir3_register *, dsts);
1060 dsts_count = dsts_sz = 0;
1061 dsts = NULL;
1062 array_insert(ctx, dsts, reg);
1063 bool dst_inserted[reg->instr->dsts_count];
/* The "already inserted" flags must start out false before they are tested
 * in the loop below.
 */
memset(dst_inserted, 0, sizeof(dst_inserted));
1064
1065 unsigned dst_size = reg->tied ? 0 : reg_size(reg);
1066 unsigned ec_dst_size = is_early_clobber(reg) ? reg_size(reg) : 0;
1067 unsigned half_dst_size = 0, ec_half_dst_size = 0;
1068 if (reg_align == 1) {
1069 half_dst_size = dst_size;
1070 ec_half_dst_size = ec_dst_size;
1071 }
1072
1073 unsigned removed_size = 0, removed_half_size = 0;
1074 unsigned removed_killed_size = 0, removed_killed_half_size = 0;
1075 unsigned file_size =
1076 reg_align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;
1077 physreg_t start_reg = 0;
1078
1079 foreach_interval_rev_safe (interval, file) {
1080 /* We'll check if we can compact the intervals starting here. */
1081 physreg_t candidate_start = interval->physreg_end;
1082
1083 /* Check if there are any other destinations we need to compact. */
1084 ra_foreach_dst_n (other_dst, n, reg->instr) {
1085 if (other_dst == reg)
1086 break;
1087 if (ra_get_file(ctx, other_dst) != file)
1088 continue;
1089 if (dst_inserted[n])
1090 continue;
1091
1092 struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
1093 /* if the destination partially overlaps this interval, we need to
1094 * extend candidate_start to the end.
1095 */
1096 if (other_interval->physreg_start < candidate_start) {
1097 candidate_start = MAX2(candidate_start,
1098 other_interval->physreg_end);
1099 continue;
1100 }
1101
1102 dst_inserted[n] = true;
1103
1104 /* dst intervals with a tied killed source are considered attached to
1105 * that source. Don't actually insert them. This means we have to
1106 * update them below if their tied source moves.
1107 */
1108 if (other_dst->tied) {
1109 struct ra_interval *tied_interval =
1110 &ctx->intervals[other_dst->tied->def->name];
1111 if (tied_interval->is_killed)
1112 continue;
1113 }
1114
1115 d("popping destination %u physreg %u\n",
1116 other_interval->interval.reg->name,
1117 other_interval->physreg_start);
1118
1119 array_insert(ctx, dsts, other_dst);
1120 unsigned interval_size = reg_size(other_dst);
1121 if (is_early_clobber(other_dst)) {
1122 ec_dst_size += interval_size;
1123 if (other_interval->interval.reg->flags & IR3_REG_HALF)
1124 ec_half_dst_size += interval_size;
1125 } else {
1126 dst_size += interval_size;
1127 if (other_interval->interval.reg->flags & IR3_REG_HALF)
1128 half_dst_size += interval_size;
1129 }
1130 }
1131
1132 /* Check if we can sort the intervals *after* this one and have enough
1133 * space leftover to accommodate all intervals, keeping in mind that killed
1134 * sources overlap non-tied destinations. Also check that we have enough
1135 * space leftover for half-registers, if we're inserting a half-register
1136 * (otherwise we only shift any half-registers down so they should be
1137 * safe).
1138 */
1139 if (candidate_start + removed_size + ec_dst_size +
1140 MAX2(removed_killed_size, dst_size) <= file->size &&
1141 (reg_align != 1 ||
1142 candidate_start + removed_half_size + ec_half_dst_size +
1143 MAX2(removed_killed_half_size, half_dst_size) <= file_size)) {
1144 start_reg = candidate_start;
1145 break;
1146 }
1147
1148 /* We assume that all frozen intervals are at the start and that we
1149 * can avoid popping them.
1150 */
1151 assert(!interval->frozen);
1152
1153 /* Killed sources are different because they go at the end and can
1154 * overlap the register we're trying to add.
1155 */
1156 unsigned interval_size = interval->physreg_end - interval->physreg_start;
1157 if (interval->is_killed) {
1158 removed_killed_size += interval_size;
1159 if (interval->interval.reg->flags & IR3_REG_HALF)
1160 removed_killed_half_size += interval_size;
1161 } else {
1162 removed_size += interval_size;
1163 if (interval->interval.reg->flags & IR3_REG_HALF)
1164 removed_half_size += interval_size;
1165 }
1166
1167 /* Now that we've done the accounting, pop this off */
1168 d("popping interval %u physreg %u%s\n", interval->interval.reg->name,
1169 interval->physreg_start, interval->is_killed ? ", killed" : "");
1170 array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));
1171 }
1172
1173 /* TODO: In addition to skipping registers at the beginning that are
1174 * well-packed, we should try to skip registers at the end.
1175 */
1176
1177 qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);
1178 qsort(dsts, dsts_count, sizeof(*dsts), dsts_cmp);
1179
1180 physreg_t live_reg = start_reg;
1181 physreg_t dst_reg = (physreg_t)~0;
1182 physreg_t ret_reg = (physreg_t)~0;
1183 unsigned dst_index = 0;
1184 unsigned live_index = 0;
1185
1186 /* We have two lists of intervals to process, live intervals and destination
1187 * intervals. Process them in the order of the diagram in removed_interval_cmp().
1188 */
1189 while (live_index < intervals_count || dst_index < dsts_count) {
1190 bool process_dst;
1191 if (live_index == intervals_count) {
1192 process_dst = true;
1193 } else if (dst_index == dsts_count) {
1194 process_dst = false;
1195 } else {
1196 struct ir3_register *dst = dsts[dst_index];
1197 struct ra_interval *live_interval = intervals[live_index].interval;
1198
1199 bool live_half = live_interval->interval.reg->flags & IR3_REG_HALF;
1200 bool live_killed = live_interval->is_killed;
1201 bool dst_half = dst->flags & IR3_REG_HALF;
1202 bool dst_early_clobber = is_early_clobber(dst);
1203
1204 if (live_half && !live_killed) {
1205 /* far-left of diagram. */
1206 process_dst = false;
1207 } else if (dst_half && dst_early_clobber) {
1208 /* mid-left of diagram. */
1209 process_dst = true;
1210 } else if (!dst_early_clobber) {
1211 /* bottom of diagram. */
1212 process_dst = true;
1213 } else if (live_killed) {
1214 /* middle of diagram. */
1215 process_dst = false;
1216 } else if (!dst_half && dst_early_clobber) {
1217 /* mid-right of diagram. */
1218 process_dst = true;
1219 } else {
1220 /* far right of diagram. */
1221 assert(!live_killed && !live_half);
1222 process_dst = false;
1223 }
1224 }
1225
1226 struct ir3_register *cur_reg =
1227 process_dst ? dsts[dst_index] :
1228 intervals[live_index].interval->interval.reg;
1229
1230 physreg_t physreg;
1231 if (process_dst && !is_early_clobber(cur_reg)) {
1232 if (dst_reg == (physreg_t)~0)
1233 dst_reg = live_reg;
1234 physreg = dst_reg;
1235 } else {
1236 physreg = live_reg;
1237 struct ra_interval *live_interval = intervals[live_index].interval;
1238 bool live_killed = live_interval->is_killed;
1239 /* If this is live-through and we've processed the destinations, we
1240 * need to make sure we take into account any overlapping destinations.
1241 */
1242 if (!live_killed && dst_reg != (physreg_t)~0)
1243 physreg = MAX2(physreg, dst_reg);
1244 }
1245
1246 if (!(cur_reg->flags & IR3_REG_HALF))
1247 physreg = ALIGN(physreg, 2);
1248
1249 d("pushing reg %u physreg %u\n", cur_reg->name, physreg);
1250
1251 unsigned interval_size = reg_size(cur_reg);
1252 if (physreg + interval_size >
1253 reg_file_size(file, cur_reg)) {
1254 d("ran out of room for interval %u!\n",
1255 cur_reg->name);
1256 unreachable("reg pressure calculation was wrong!");
1257 return 0;
1258 }
1259
1260 if (process_dst) {
1261 if (cur_reg == reg) {
1262 ret_reg = physreg;
1263 } else {
1264 struct ra_interval *interval = &ctx->intervals[cur_reg->name];
1265 interval->physreg_start = physreg;
1266 interval->physreg_end = physreg + interval_size;
1267 }
1268 dst_index++;
1269 } else {
1270 ra_push_interval(ctx, file, &intervals[live_index], physreg);
1271 live_index++;
1272 }
1273
1274 physreg += interval_size;
1275
1276 if (process_dst && !is_early_clobber(cur_reg)) {
1277 dst_reg = physreg;
1278 } else {
1279 live_reg = physreg;
1280 }
1281 }
1282
1283 /* If we shuffled around a tied source that is killed, we may have to update
1284 * its corresponding destination since we didn't insert it above.
1285 */
1286 ra_foreach_dst (dst, reg->instr) {
1287 if (dst == reg)
1288 break;
1289
1290 struct ir3_register *tied = dst->tied;
1291 if (!tied)
1292 continue;
1293
1294 struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1295 if (!tied_interval->is_killed)
1296 continue;
1297
1298 struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1299 unsigned dst_size = reg_size(dst);
1300 dst_interval->physreg_start = ra_interval_get_physreg(tied_interval);
1301 dst_interval->physreg_end = dst_interval->physreg_start + dst_size;
1302 }
1303
1304 return ret_reg;
1305 }
1306
1307 static void
1308 update_affinity(struct ra_file *file, struct ir3_register *reg,
1309 physreg_t physreg)
1310 {
1311 if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0)
1312 return;
1313
1314 if (physreg < reg->merge_set_offset)
1315 return;
1316
1317 if ((physreg - reg->merge_set_offset + reg->merge_set->size) > file->size)
1318 return;
1319
1320 reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;
1321 }
1322
1323 /* Try to find free space for a register without shuffling anything. This uses
1324 * a round-robin algorithm to reduce false dependencies.
1325 */
1326 static physreg_t
1327 find_best_gap(struct ra_ctx *ctx, struct ra_file *file,
1328 struct ir3_register *dst, unsigned file_size, unsigned size,
1329 unsigned alignment)
1330 {
1331 /* This can happen if we create a very large merge set. Just bail out in that
1332 * case.
1333 */
1334 if (size > file_size)
1335 return (physreg_t) ~0;
1336
1337 BITSET_WORD *available =
1338 is_early_clobber(dst) ? file->available_to_evict : file->available;
1339
1340 unsigned start = ALIGN(file->start, alignment) % (file_size - size + alignment);
1341 unsigned candidate = start;
1342 do {
1343 bool is_available = true;
1344 for (unsigned i = 0; i < size; i++) {
1345 if (!BITSET_TEST(available, candidate + i)) {
1346 is_available = false;
1347 break;
1348 }
1349 }
1350
1351 if (is_available) {
1352 is_available =
1353 !check_dst_overlap(ctx, file, dst, candidate, candidate + size);
1354 }
1355
1356 if (is_available) {
1357 file->start = (candidate + size) % file_size;
1358 return candidate;
1359 }
1360
1361 candidate += alignment;
1362 if (candidate + size > file_size)
1363 candidate = 0;
1364 } while (candidate != start);
1365
1366 return (physreg_t)~0;
1367 }
1368
1369 /* This is the main entrypoint for picking a register. Pick a free register
1370 * for "reg", shuffling around sources if necessary. In the normal case where
1371 * "is_source" is false, this register can overlap with killed sources
1372 * (intervals with "is_killed == true"). If "is_source" is true, then
1373 * is_killed is ignored and the register returned must not overlap with killed
1374 * sources. This must be used for tied registers, because we're actually
1375 * allocating the destination and the tied source at the same time.
1376 */
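/* Roughly, the fallback cascade implemented below is:
 *
 *    if (the merge set's preferred reg is free)             use it;
 *    else if (a gap fits the whole unallocated merge set)   find_best_gap(set);
 *    else if (an ALU/SFU source's registers can be reused)  use that physreg;
 *    else if (a gap fits just this reg)                     find_best_gap(reg);
 *    else if (conflicting intervals can be evicted)         try_evict_regs();
 *    else                                                   compress_regs_left();
 */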
1377
1378 static physreg_t
1379 get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg)
1380 {
1381 unsigned file_size = reg_file_size(file, reg);
1382 if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
1383 physreg_t preferred_reg =
1384 reg->merge_set->preferred_reg + reg->merge_set_offset;
1385 if (preferred_reg + reg_size(reg) <= file_size &&
1386 preferred_reg % reg_elem_size(reg) == 0 &&
1387 get_reg_specified(ctx, file, reg, preferred_reg, false))
1388 return preferred_reg;
1389 }
1390
1391 /* If this register is a subset of a merge set which we have not picked a
1392 * register for, first try to allocate enough space for the entire merge
1393 * set.
1394 */
1395 unsigned size = reg_size(reg);
1396 if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
1397 size < reg->merge_set->size) {
1398 physreg_t best_reg = find_best_gap(ctx, file, reg, file_size,
1399 reg->merge_set->size,
1400 reg->merge_set->alignment);
1401 if (best_reg != (physreg_t)~0u) {
1402 best_reg += reg->merge_set_offset;
1403 return best_reg;
1404 }
1405 }
1406
1407 /* For ALU and SFU instructions, if the src reg is avail to pick, use it.
1408 * Because this doesn't introduce unnecessary dependencies, and it
1409 * potentially avoids needing (ss) syncs for write after read hazards for
1410 * SFU instructions:
1411 */
1412 if (is_sfu(reg->instr) || is_alu(reg->instr)) {
1413 for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
1414 struct ir3_register *src = reg->instr->srcs[i];
1415 if (!ra_reg_is_src(src))
1416 continue;
1417 if (ra_get_file(ctx, src) == file && reg_size(src) >= size) {
1418 struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1419 physreg_t src_physreg = ra_interval_get_physreg(src_interval);
1420 if (src_physreg % reg_elem_size(reg) == 0 &&
1421 src_physreg + size <= file_size &&
1422 get_reg_specified(ctx, file, reg, src_physreg, false))
1423 return src_physreg;
1424 }
1425 }
1426 }
1427
1428 physreg_t best_reg =
1429 find_best_gap(ctx, file, reg, file_size, size, reg_elem_size(reg));
1430 if (best_reg != (physreg_t)~0u) {
1431 return best_reg;
1432 }
1433
1434 /* Ok, we couldn't find anything that fits. Here is where we have to start
1435 * moving things around to make stuff fit. First try solely evicting
1436 * registers in the way.
1437 */
1438 unsigned best_eviction_count = ~0;
1439 for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {
1440 unsigned eviction_count;
1441 if (try_evict_regs(ctx, file, reg, i, &eviction_count, false, true)) {
1442 if (eviction_count < best_eviction_count) {
1443 best_eviction_count = eviction_count;
1444 best_reg = i;
1445 }
1446 }
1447 }
1448
1449 if (best_eviction_count != ~0) {
1450 ASSERTED bool result = try_evict_regs(
1451 ctx, file, reg, best_reg, &best_eviction_count, false, false);
1452 assert(result);
1453 return best_reg;
1454 }
1455
1456 /* Use the dumb fallback only if try_evict_regs() fails. */
1457 return compress_regs_left(ctx, file, reg);
1458 }
1459
1460 static void
1461 assign_reg(struct ir3_instruction *instr, struct ir3_register *reg,
1462 unsigned num)
1463 {
1464 if (reg->flags & IR3_REG_ARRAY) {
1465 reg->array.base = num;
1466 if (reg->flags & IR3_REG_RELATIV)
1467 reg->array.offset += num;
1468 else
1469 reg->num = num + reg->array.offset;
1470 } else {
1471 reg->num = num;
1472 }
1473 }
1474
1475 static void
1476 mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src)
1477 {
1478 struct ra_interval *interval = &ctx->intervals[src->def->name];
1479
1480 if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed ||
1481 interval->interval.parent ||
1482 !rb_tree_is_empty(&interval->interval.children))
1483 return;
1484
1485 ra_file_mark_killed(ra_get_file(ctx, src), interval);
1486 }
1487
1488 static void
1489 insert_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1490 {
1491 struct ra_file *file = ra_get_file(ctx, dst);
1492 struct ra_interval *interval = &ctx->intervals[dst->name];
1493
1494 d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval));
1495
1496 if (!(dst->flags & IR3_REG_UNUSED))
1497 ra_file_insert(file, interval);
1498
1499 assign_reg(dst->instr, dst, ra_interval_get_num(interval));
1500 }
1501
1502 static void
1503 allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst,
1504 physreg_t physreg)
1505 {
1506 struct ra_file *file = ra_get_file(ctx, dst);
1507 struct ra_interval *interval = &ctx->intervals[dst->name];
1508 update_affinity(file, dst, physreg);
1509
1510 ra_interval_init(interval, dst);
1511 interval->physreg_start = physreg;
1512 interval->physreg_end = physreg + reg_size(dst);
1513 }
1514
1515 /* If a tied destination interferes with its source register, we have to insert
1516 * a copy beforehand to copy the source to the destination. Because we are using
1517 * the parallel_copies array and not creating a separate copy, this copy will
1518 * happen in parallel with any shuffling around of the tied source, so we have
1519 * to copy the source *as it exists before it is shuffled around*. We do this by
1520 * inserting the copy early, before any other copies are inserted. We don't
1521 * actually know the destination of the copy, but that's ok because the
1522 * dst_interval will be filled out later.
1523 */
1524 static void
1525 insert_tied_dst_copy(struct ra_ctx *ctx, struct ir3_register *dst)
1526 {
1527 struct ir3_register *tied = dst->tied;
1528
1529 if (!tied)
1530 return;
1531
1532 struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1533 struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1534
1535 if (tied_interval->is_killed)
1536 return;
1537
1538 physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
1539
1540 array_insert(ctx, ctx->parallel_copies,
1541 (struct ra_parallel_copy){
1542 .interval = dst_interval,
1543 .src = tied_physreg,
1544 });
1545 }
1546
1547 static void
1548 allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1549 {
1550 struct ra_file *file = ra_get_file(ctx, dst);
1551
1552 struct ir3_register *tied = dst->tied;
1553 if (tied) {
1554 struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1555 if (tied_interval->is_killed) {
1556 /* The easy case: the source is killed, so we can just reuse it
1557 * for the destination.
1558 */
1559 allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));
1560 return;
1561 }
1562 }
1563
1564 /* All the hard work is done by get_reg here. */
1565 physreg_t physreg = get_reg(ctx, file, dst);
1566
1567 allocate_dst_fixed(ctx, dst, physreg);
1568 }
1569
1570 static void
1571 assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
1572 struct ir3_register *src)
1573 {
1574 struct ra_interval *interval = &ctx->intervals[src->def->name];
1575 struct ra_file *file = ra_get_file(ctx, src);
1576
1577 struct ir3_register *tied = src->tied;
1578 physreg_t physreg;
1579 if (tied) {
1580 struct ra_interval *tied_interval = &ctx->intervals[tied->name];
1581 physreg = ra_interval_get_physreg(tied_interval);
1582 } else {
1583 physreg = ra_interval_get_physreg(interval);
1584 }
1585
1586 assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags));
1587
1588 if (src->flags & IR3_REG_FIRST_KILL)
1589 ra_file_remove(file, interval);
1590 }
1591
1592 /* Insert a parallel copy instruction before the instruction with the parallel
1593 * copy entries we've built up.
1594 */
1595 static void
1596 insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1597 {
1598 if (ctx->parallel_copies_count == 0)
1599 return;
1600
1601 struct ir3_instruction *pcopy =
1602 ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY,
1603 ctx->parallel_copies_count, ctx->parallel_copies_count);
1604
1605 for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1606 struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1607 struct ir3_register *reg =
1608 ir3_dst_create(pcopy, INVALID_REG,
1609 entry->interval->interval.reg->flags &
1610 (IR3_REG_HALF | IR3_REG_ARRAY | IR3_REG_SHARED));
1611 reg->size = entry->interval->interval.reg->size;
1612 reg->wrmask = entry->interval->interval.reg->wrmask;
1613 assign_reg(pcopy, reg, ra_interval_get_num(entry->interval));
1614 }
1615
1616 for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1617 struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1618 struct ir3_register *reg =
1619 ir3_src_create(pcopy, INVALID_REG,
1620 entry->interval->interval.reg->flags &
1621 (IR3_REG_HALF | IR3_REG_ARRAY | IR3_REG_SHARED));
1622 reg->size = entry->interval->interval.reg->size;
1623 reg->wrmask = entry->interval->interval.reg->wrmask;
1624 assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags));
1625 }
1626
1627 list_del(&pcopy->node);
1628 list_addtail(&pcopy->node, &instr->node);
1629 ctx->parallel_copies_count = 0;
1630 }
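
/* As a rough illustration (made-up register numbers): if two entries were
 * queued, the instruction created above might end up as
 *
 *    r2.x, r3.y = parallel_copy r5.w, r1.z
 *
 * where all sources are conceptually read before any destination is written,
 * so that swaps and cycles can be resolved later (see ir3_lower_copies() at
 * the end of ir3_ra()).
 */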
1631
1632 static void
1633 handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1634 {
1635 /* First, mark sources as going-to-be-killed while allocating the dest. */
1636 ra_foreach_src (src, instr) {
1637 mark_src_killed(ctx, src);
1638 }
1639
1640 /* Pre-insert tied dst copies. */
1641 ra_foreach_dst (dst, instr) {
1642 insert_tied_dst_copy(ctx, dst);
1643 }
1644
1645 /* Allocate the destination. */
1646 ra_foreach_dst (dst, instr) {
1647 allocate_dst(ctx, dst);
1648 }
1649
1650 /* Now handle sources. Go backwards so that, if there are multiple
1651 * sources with the same def and that def is killed, we only remove it
1652 * once, at the end.
1653 */
1654 ra_foreach_src_rev (src, instr) {
1655 assign_src(ctx, instr, src);
1656 }
1657
1658 /* Now finally insert the destination into the map. */
1659 ra_foreach_dst (dst, instr) {
1660 insert_dst(ctx, dst);
1661 }
1662
1663 insert_parallel_copy_instr(ctx, instr);
1664 }
1665
1666 static void
1667 handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
1668 {
1669 struct ir3_register *dst = instr->dsts[0];
1670 struct ir3_register *src = instr->srcs[0];
1671
1672 if (!(dst->flags & IR3_REG_SSA))
1673 return;
1674
1675 if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
1676 handle_normal_instr(ctx, instr);
1677 return;
1678 }
1679
1680 struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1681
1682 physreg_t physreg = ra_interval_get_physreg(src_interval);
1683 assign_src(ctx, instr, src);
1684
1685 allocate_dst_fixed(
1686 ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset);
1687 insert_dst(ctx, dst);
1688 }
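
/* Illustration (hypothetical): if ssa_1 occupies r4.x..r4.w and
 * "ssa_2 = split ssa_1 off:2" belongs to the same merge set, with a
 * merge_set_offset two components past ssa_1's, then the arithmetic above
 * pins ssa_2 to r4.z and no value actually moves; the split is effectively
 * free.
 */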
1689
1690 static void
1691 handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
1692 {
1693 if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1694 return;
1695
1696 struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;
1697 unsigned dst_offset = instr->dsts[0]->merge_set_offset;
1698
1699 if (!dst_set || dst_set->regs_count == 1) {
1700 handle_normal_instr(ctx, instr);
1701 return;
1702 }
1703
1704 /* We need to check if any of the sources are contained in an interval
1705 * that is at least as large as the vector. In this case, we should put
1706 * the vector inside that larger interval. (There should be one
1707 * unambiguous place to put it, because values sharing the same merge set
1708 * should be allocated together.) This can happen in a case like:
1709 *
1710 * ssa_1 (wrmask=0xf) = ...
1711 * ssa_2 = split ssa_1 off:0
1712 * ssa_3 = split ssa_1 off:1
1713 * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3
1714 * ... = (kill)ssa_1
1715 * ... = (kill)ssa_4
1716 *
1717 * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it.
1718 */
1719 physreg_t dst_fixed = (physreg_t)~0u;
1720
1721 ra_foreach_src (src, instr) {
1722 if (src->flags & IR3_REG_FIRST_KILL) {
1723 mark_src_killed(ctx, src);
1724 }
1725
1726 struct ra_interval *interval = &ctx->intervals[src->def->name];
1727
1728 /* We only need special handling if the source's interval overlaps with
1729 * the destination's interval.
1730 */
1731 if (src->def->interval_start >= instr->dsts[0]->interval_end ||
1732 instr->dsts[0]->interval_start >= src->def->interval_end ||
1733 interval->is_killed)
1734 continue;
1735
1736 while (interval->interval.parent != NULL) {
1737 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1738 }
1739 if (reg_size(interval->interval.reg) >= reg_size(instr->dsts[0])) {
1740 dst_fixed = interval->physreg_start -
1741 interval->interval.reg->merge_set_offset + dst_offset;
1742 } else {
1743 /* For sources whose root interval is smaller than the
1744 * destination (i.e. the normal case), we will shuffle them
1745 * around after allocating the destination. Mark them killed so
1746 * that the destination can be allocated over them, even if they
1747 * aren't actually killed.
1748 */
1749 ra_file_mark_killed(ra_get_file(ctx, src), interval);
1750 }
1751 }
1752
1753 if (dst_fixed != (physreg_t)~0u)
1754 allocate_dst_fixed(ctx, instr->dsts[0], dst_fixed);
1755 else
1756 allocate_dst(ctx, instr->dsts[0]);
1757
1758 /* Remove the temporary is_killed we added */
1759 ra_foreach_src (src, instr) {
1760 struct ra_interval *interval = &ctx->intervals[src->def->name];
1761 while (interval->interval.parent != NULL) {
1762 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1763 }
1764
1765 /* Filter out cases where it actually should be killed */
1766 if (interval != &ctx->intervals[src->def->name] ||
1767 !(src->flags & IR3_REG_KILL)) {
1768 ra_file_unmark_killed(ra_get_file(ctx, src), interval);
1769 }
1770 }
1771
1772 ra_foreach_src_rev (src, instr) {
1773 assign_src(ctx, instr, src);
1774 }
1775
1776 /* We need to do this before insert_dst(), so that children of the
1777 * destination which got marked as killed and then shuffled around to make
1778 * space for the destination have the correct pcopy destination that
1779 * matches what we assign the source of the collect to in assign_src().
1780 *
1781 * TODO: In this case we'll wind up copying the value in the pcopy and
1782 * then again in the collect. We could avoid one of those by updating the
1783 * pcopy destination to match up with the final location of the source
1784 * after the collect and making the collect a no-op. However this doesn't
1785 * seem to happen often.
1786 */
1787 insert_parallel_copy_instr(ctx, instr);
1788
1789 /* Note: insert_dst will automatically shuffle around any intervals that
1790 * fall inside the collect's destination interval, by making them children
1791 * of the collect's interval. */
1792
1793 insert_dst(ctx, instr->dsts[0]);
1794 }
1795
1796 /* Parallel copies before RA should only be at the end of the block, for
1797 * phi's. For these we only need to fill in the sources, and then we fill in
1798 * the destinations in the successor block.
1799 */
1800 static void
1801 handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)
1802 {
1803 ra_foreach_src_rev (src, instr) {
1804 assign_src(ctx, instr, src);
1805 }
1806 }
1807
1808 /* Some inputs may need to be precolored. We need to handle those first, so
1809 * that other non-precolored inputs don't accidentally get allocated over
1810 * them. Inputs are the very first thing in the shader, so it shouldn't be a
1811 * problem to allocate them to a specific physreg.
1812 */
1813
1814 static void
1815 handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1816 {
1817 if (instr->dsts[0]->num == INVALID_REG ||
1818 !(instr->dsts[0]->flags & IR3_REG_SSA))
1819 return;
1820
1821 struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1822 struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1823 physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]);
1824 allocate_dst_fixed(ctx, instr->dsts[0], physreg);
1825
1826 d("insert precolored dst %u physreg %u", instr->dsts[0]->name,
1827 ra_interval_get_physreg(interval));
1828
1829 ra_file_insert(file, interval);
1830 interval->frozen = true;
1831 }
1832
1833 static void
1834 handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1835 {
1836 if (instr->dsts[0]->num != INVALID_REG)
1837 return;
1838
1839 allocate_dst(ctx, instr->dsts[0]);
1840
1841 struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1842 struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1843 ra_file_insert(file, interval);
1844 }
1845
1846 static void
1847 assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1848 {
1849 if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1850 return;
1851
1852 struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1853 struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1854
1855 if (instr->dsts[0]->num == INVALID_REG) {
1856 assign_reg(instr, instr->dsts[0], ra_interval_get_num(interval));
1857 } else {
1858 interval->frozen = false;
1859 }
1860
1861 if (instr->dsts[0]->flags & IR3_REG_UNUSED)
1862 ra_file_remove(file, interval);
1863
1864 ra_foreach_src_rev (src, instr)
1865 assign_src(ctx, instr, src);
1866 }
1867
1868 /* chmask is a bit weird, because it has pre-colored sources due to the need
1869 * to pass some registers to the next stage. Fortunately there are only at
1870 * most two, and there should be no other live values by the time we get to
1871 * this instruction, so we only have to do the minimum and don't need any
1872 * fancy fallbacks.
1873 *
1874 * TODO: Add more complete handling of precolored sources, e.g. for function
1875 * argument handling. We'd need a way to mark sources as fixed so that they
1876 * don't get moved around when placing other sources in the fallback case, and
1877 * a duplication of much of the logic in get_reg(). This also opens another
1878 * can of worms, e.g. what if the precolored source is a split of a vector
1879 * which is still live -- this breaks our assumption that splits don't incur
1880 * any "extra" register requirements and we'd have to break it out of the
1881 * parent ra_interval.
1882 */
1883
1884 static void
1885 handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src)
1886 {
1887 struct ra_file *file = ra_get_file(ctx, src);
1888 struct ra_interval *interval = &ctx->intervals[src->def->name];
1889 physreg_t physreg = ra_reg_get_physreg(src);
1890
1891 if (ra_interval_get_num(interval) == src->num)
1892 return;
1893
1894 /* Try evicting stuff in our way if it isn't free. This won't move
1895 * anything unless it overlaps with our precolored physreg, so we don't
1896 * have to worry about evicting other precolored sources.
1897 */
1898 if (!get_reg_specified(ctx, file, src, physreg, true)) {
1899 unsigned eviction_count;
1900 if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true,
1901 false)) {
1902 unreachable("failed to evict for precolored source!");
1903 return;
1904 }
1905 }
1906
1907 ra_move_interval(ctx, file, interval, physreg);
1908 }
1909
1910 static void
1911 handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr)
1912 {
1913 /* Note: we purposely don't mark sources as killed, so that we can reuse
1914 * some of the get_reg() machinery as if the source were a destination.
1915 * Marking it as killed would make e.g. get_reg_specified() not work
1916 * correctly.
1917 */
1918 ra_foreach_src (src, instr) {
1919 assert(src->num != INVALID_REG);
1920 handle_precolored_source(ctx, src);
1921 }
1922
1923 ra_foreach_src (src, instr) {
1924 struct ra_file *file = ra_get_file(ctx, src);
1925 struct ra_interval *interval = &ctx->intervals[src->def->name];
1926 if (src->flags & IR3_REG_FIRST_KILL)
1927 ra_file_remove(file, interval);
1928 }
1929
1930 insert_parallel_copy_instr(ctx, instr);
1931 }
1932
1933 static physreg_t
1934 read_register(struct ra_ctx *ctx, struct ir3_block *block,
1935 struct ir3_register *def)
1936 {
1937 struct ra_block_state *state = &ctx->blocks[block->index];
1938 if (state->renames) {
1939 struct hash_entry *entry = _mesa_hash_table_search(state->renames, def);
1940 if (entry) {
1941 return (physreg_t)(uintptr_t)entry->data;
1942 }
1943 }
1944
1945 return ra_reg_get_physreg(def);
1946 }
1947
1948 static void
1949 handle_live_in(struct ra_ctx *ctx, struct ir3_register *def)
1950 {
1951 physreg_t physreg = ~0;
1952 for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
1953 struct ir3_block *pred = ctx->block->predecessors[i];
1954 struct ra_block_state *pred_state = &ctx->blocks[pred->index];
1955
1956 if (!pred_state->visited)
1957 continue;
1958
1959 physreg = read_register(ctx, pred, def);
1960 break;
1961 }
1962
1963 assert(physreg != (physreg_t)~0);
1964
1965 struct ra_interval *interval = &ctx->intervals[def->name];
1966 struct ra_file *file = ra_get_file(ctx, def);
1967 ra_interval_init(interval, def);
1968 interval->physreg_start = physreg;
1969 interval->physreg_end = physreg + reg_size(def);
1970 ra_file_insert(file, interval);
1971 }
1972
1973 static void
1974 handle_live_out(struct ra_ctx *ctx, struct ir3_register *def)
1975 {
1976 /* Skip parallelcopy's which in the original program are only used as phi
1977 * arguments. Even though phi arguments are live out, they are only
1978 * assigned when the phi is.
1979 */
1980 if (def->instr->opc == OPC_META_PARALLEL_COPY)
1981 return;
1982
1983 struct ra_block_state *state = &ctx->blocks[ctx->block->index];
1984 struct ra_interval *interval = &ctx->intervals[def->name];
1985 physreg_t physreg = ra_interval_get_physreg(interval);
1986 if (physreg != ra_reg_get_physreg(def)) {
1987 if (!state->renames)
1988 state->renames = _mesa_pointer_hash_table_create(ctx);
1989 _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg);
1990 }
1991 }
1992
1993 static void
1994 handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
1995 {
1996 if (!(def->flags & IR3_REG_SSA))
1997 return;
1998
1999 struct ra_file *file = ra_get_file(ctx, def);
2000 struct ra_interval *interval = &ctx->intervals[def->name];
2001
2002 /* phis are always scalar, so they should already be the smallest possible
2003 * size. However they may be coalesced with other live-in values/phi
2004 * nodes, so check for that here.
2005 */
2006 struct ir3_reg_interval *parent_ir3 =
2007 ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start);
2008 physreg_t physreg;
2009 if (parent_ir3) {
2010 struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3);
2011 physreg = ra_interval_get_physreg(parent) +
2012 (def->interval_start - parent_ir3->reg->interval_start);
2013 } else {
2014 physreg = get_reg(ctx, file, def);
2015 }
2016
2017 allocate_dst_fixed(ctx, def, physreg);
2018
2019 ra_file_insert(file, interval);
2020 }
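
/* Illustration (hypothetical): if a live-in ssa_5 was already inserted at
 * r2.x..r2.w and a phi coalesced with it starts at the same interval offset,
 * the search above finds ssa_5's interval and the phi is pinned inside it
 * rather than being given a fresh register by get_reg().
 */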
2021
2022 static void
2023 assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
2024 {
2025 if (!(phi->dsts[0]->flags & IR3_REG_SSA))
2026 return;
2027
2028 struct ra_file *file = ra_get_file(ctx, phi->dsts[0]);
2029 struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name];
2030 assert(!interval->interval.parent);
2031 unsigned num = ra_interval_get_num(interval);
2032 assign_reg(phi, phi->dsts[0], num);
2033
2034 /* Assign the parallelcopy sources of this phi */
2035 for (unsigned i = 0; i < phi->srcs_count; i++) {
2036 if (phi->srcs[i]->def) {
2037 assign_reg(phi, phi->srcs[i], num);
2038 assign_reg(phi, phi->srcs[i]->def, num);
2039 }
2040 }
2041
2042 if (phi->dsts[0]->flags & IR3_REG_UNUSED)
2043 ra_file_remove(file, interval);
2044 }
2045
2046 /* When we split a live range, we sometimes need to emit fixup code at the end
2047 * of a block. For example, something like:
2048 *
2049 * a = ...
2050 * if (...) {
2051 * ...
2052 * a' = a
2053 * b = ... // a evicted to make room for b
2054 * ...
2055 * }
2056 * ... = a
2057 *
2058 * When we insert the copy to a' in insert_parallel_copy_instr(), this forces
2059 * us to insert another copy "a = a'" at the end of the if. Normally this would
2060 * also entail adding a phi node, but since we're about to go out of SSA
2061 * anyway we just insert an extra move. Note, however, that "b" might be used
2062 * in a phi node at the end of the if and share registers with "a", so we
2063 * have to be careful to extend any preexisting parallelcopy instruction
2064 * instead of creating our own in order to guarantee that they properly get
2065 * swapped.
2066 */
2067
2068 static void
2069 insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
2070 struct ir3_register *reg)
2071 {
2072 struct ir3_instruction *old_pcopy = NULL;
2073 if (!list_is_empty(&block->instr_list)) {
2074 struct ir3_instruction *last =
2075 list_entry(block->instr_list.prev, struct ir3_instruction, node);
2076 if (last->opc == OPC_META_PARALLEL_COPY)
2077 old_pcopy = last;
2078 }
2079
2080 unsigned old_pcopy_srcs = old_pcopy ? old_pcopy->srcs_count : 0;
2081 struct ir3_instruction *pcopy = ir3_instr_create(
2082 block, OPC_META_PARALLEL_COPY, old_pcopy_srcs + 1, old_pcopy_srcs + 1);
2083
2084 for (unsigned i = 0; i < old_pcopy_srcs; i++) {
2085 old_pcopy->dsts[i]->instr = pcopy;
2086 pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i];
2087 }
2088
2089 unsigned flags = reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
2090
2091 struct ir3_register *dst_reg = ir3_dst_create(pcopy, INVALID_REG, flags);
2092 dst_reg->wrmask = reg->wrmask;
2093 dst_reg->size = reg->size;
2094 assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags));
2095
2096 for (unsigned i = 0; i < old_pcopy_srcs; i++) {
2097 pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i];
2098 }
2099
2100 struct ir3_register *src_reg = ir3_src_create(pcopy, INVALID_REG, flags);
2101 src_reg->wrmask = reg->wrmask;
2102 src_reg->size = reg->size;
2103 assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags));
2104
2105 if (old_pcopy)
2106 list_del(&old_pcopy->node);
2107 }
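
/* Illustration (made-up registers): if the block already ended in
 *
 *    r0.x, r1.y = parallel_copy r2.z, r3.w
 *
 * and a live-out fixup copy r5.x <- r4.x is needed, the code above rebuilds
 * the whole thing as
 *
 *    r0.x, r1.y, r5.x = parallel_copy r2.z, r3.w, r4.x
 *
 * so that all the entries are still resolved (and, if necessary, swapped)
 * together.
 */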
2108
2109 static void
2110 insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
2111 {
2112 physreg_t physreg = ra_interval_get_physreg(interval);
2113
2114 for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
2115 struct ir3_block *pred = ctx->block->predecessors[i];
2116 struct ra_block_state *pred_state = &ctx->blocks[pred->index];
2117
2118 if (!pred_state->visited)
2119 continue;
2120
2121 physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);
2122 if (pred_reg != physreg) {
2123 assert(!(interval->interval.reg->flags & IR3_REG_SHARED));
2124 insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);
2125 }
2126 }
2127 }
2128
2129 static void
2130 insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file)
2131 {
2132 BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index];
2133 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2134 physreg_node) {
2135 /* Skip phi nodes. This needs to happen after phi nodes are allocated,
2136 * because we may have to move live-ins around to make space for phi
2137 * nodes, but we shouldn't be handling phi nodes here.
2138 */
2139 if (BITSET_TEST(live_in, interval->interval.reg->name))
2140 insert_live_in_move(ctx, interval);
2141 }
2142 }
2143
2144 static void
2145 insert_entry_regs(struct ra_block_state *state, struct ra_file *file)
2146 {
2147 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2148 physreg_node) {
2149 _mesa_hash_table_insert(state->entry_regs, interval->interval.reg,
2150 (void *)(uintptr_t)interval->physreg_start);
2151 }
2152 }
2153
2154 static void
2155 insert_live_in_moves(struct ra_ctx *ctx)
2156 {
2157 insert_file_live_in_moves(ctx, &ctx->full);
2158 insert_file_live_in_moves(ctx, &ctx->half);
2159 insert_file_live_in_moves(ctx, &ctx->shared);
2160
2161 /* If not all predecessors have been visited yet, record where everything
2162 * live at the entry of this block ended up, so that insert_live_out_moves()
2163 * can consult it when those predecessors are processed later. */
2164 bool all_preds_visited = true;
2165 for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
2166 if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) {
2167 all_preds_visited = false;
2168 break;
2169 }
2170 }
2171
2172 if (!all_preds_visited) {
2173 struct ra_block_state *state = &ctx->blocks[ctx->block->index];
2174 state->entry_regs = _mesa_pointer_hash_table_create(ctx);
2175
2176 insert_entry_regs(state, &ctx->full);
2177 insert_entry_regs(state, &ctx->half);
2178 insert_entry_regs(state, &ctx->shared);
2179 }
2180 }
2181
2182 static void
2183 insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval)
2184 {
2185 for (unsigned i = 0; i < 2; i++) {
2186 if (!ctx->block->successors[i])
2187 continue;
2188
2189 struct ir3_block *succ = ctx->block->successors[i];
2190 struct ra_block_state *succ_state = &ctx->blocks[succ->index];
2191
2192 if (!succ_state->visited)
2193 continue;
2194
2195 struct hash_entry *entry = _mesa_hash_table_search(
2196 succ_state->entry_regs, interval->interval.reg);
2197 if (!entry)
2198 continue;
2199
2200 physreg_t new_reg = (physreg_t)(uintptr_t)entry->data;
2201 if (new_reg != interval->physreg_start) {
2202 insert_liveout_copy(ctx->block, new_reg, interval->physreg_start,
2203 interval->interval.reg);
2204 }
2205 }
2206 }
2207
2208 static void
2209 insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file)
2210 {
2211 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2212 physreg_node) {
2213 insert_live_out_move(ctx, interval);
2214 }
2215 }
2216
2217 static void
2218 insert_live_out_moves(struct ra_ctx *ctx)
2219 {
2220 insert_file_live_out_moves(ctx, &ctx->full);
2221 insert_file_live_out_moves(ctx, &ctx->half);
2222 insert_file_live_out_moves(ctx, &ctx->shared);
2223 }
2224
2225 static void
2226 handle_block(struct ra_ctx *ctx, struct ir3_block *block)
2227 {
2228 ctx->block = block;
2229
2230 /* Reset the register files from the last block */
2231 ra_file_init(&ctx->full);
2232 ra_file_init(&ctx->half);
2233 ra_file_init(&ctx->shared);
2234
2235 /* Handle live-ins, phis, and input meta-instructions. These all appear
2236 * live at the beginning of the block and interfere with each other, and
2237 * therefore need to be allocated "in parallel". This means that we
2238 * have to allocate all of them, inserting them into the file, and then
2239 * delay updating the IR until all of them are allocated.
2240 *
2241 * Handle precolored inputs first, because we need to make sure that other
2242 * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes
2243 * and inputs at the same time, because the first block doesn't have
2244 * predecessors. Therefore handle_live_in doesn't have to worry about
2245 * them.
2246 */
2247
2248 foreach_instr (instr, &block->instr_list) {
2249 if (instr->opc == OPC_META_INPUT)
2250 handle_precolored_input(ctx, instr);
2251 else
2252 break;
2253 }
2254
2255 unsigned name;
2256 BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
2257 ctx->live->definitions_count) {
2258 struct ir3_register *reg = ctx->live->definitions[name];
2259 handle_live_in(ctx, reg);
2260 }
2261
2262 foreach_instr (instr, &block->instr_list) {
2263 if (instr->opc == OPC_META_PHI)
2264 handle_phi(ctx, instr->dsts[0]);
2265 else if (instr->opc == OPC_META_INPUT ||
2266 instr->opc == OPC_META_TEX_PREFETCH)
2267 handle_input(ctx, instr);
2268 else
2269 break;
2270 }
2271
2272 /* After this point, every live-in/phi/input has an interval assigned to
2273 * it. We delay actually assigning values until everything has been
2274 * allocated, so we can simply ignore any parallel copy entries created
2275 * when shuffling them around.
2276 */
2277 ctx->parallel_copies_count = 0;
2278
2279 insert_live_in_moves(ctx);
2280
2281 if (RA_DEBUG) {
2282 d("after live-in block %u:\n", block->index);
2283 ra_ctx_dump(ctx);
2284 }
2285
2286 /* Now we're done with processing live-ins, and can handle the body of the
2287 * block.
2288 */
2289 foreach_instr (instr, &block->instr_list) {
2290 di(instr, "processing");
2291
2292 if (instr->opc == OPC_META_PHI)
2293 assign_phi(ctx, instr);
2294 else if (instr->opc == OPC_META_INPUT ||
2295 instr->opc == OPC_META_TEX_PREFETCH)
2296 assign_input(ctx, instr);
2297 else if (instr->opc == OPC_META_SPLIT)
2298 handle_split(ctx, instr);
2299 else if (instr->opc == OPC_META_COLLECT)
2300 handle_collect(ctx, instr);
2301 else if (instr->opc == OPC_META_PARALLEL_COPY)
2302 handle_pcopy(ctx, instr);
2303 else if (instr->opc == OPC_CHMASK)
2304 handle_chmask(ctx, instr);
2305 else
2306 handle_normal_instr(ctx, instr);
2307
2308 if (RA_DEBUG)
2309 ra_ctx_dump(ctx);
2310 }
2311
2312 insert_live_out_moves(ctx);
2313
2314 BITSET_FOREACH_SET (name, ctx->live->live_out[block->index],
2315 ctx->live->definitions_count) {
2316 struct ir3_register *reg = ctx->live->definitions[name];
2317 handle_live_out(ctx, reg);
2318 }
2319
2320 ctx->blocks[block->index].visited = true;
2321 }
2322
2323 static unsigned
2324 calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure)
2325 {
2326 /* Registers are allocated in units of vec4, so switch from units of
2327 * half-regs to vec4.
2328 */
2329 unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4);
2330
2331 bool double_threadsize = ir3_should_double_threadsize(v, reg_count);
2332
2333 unsigned target = reg_count;
2334 unsigned reg_independent_max_waves =
2335 ir3_get_reg_independent_max_waves(v, double_threadsize);
2336 unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
2337 v->compiler, reg_count, double_threadsize);
2338 unsigned target_waves =
2339 MIN2(reg_independent_max_waves, reg_dependent_max_waves);
2340
2341 while (target <= RA_FULL_SIZE / (2 * 4) &&
2342 ir3_should_double_threadsize(v, target) == double_threadsize &&
2343 ir3_get_reg_dependent_max_waves(v->compiler, target,
2344 double_threadsize) >= target_waves)
2345 target++;
2346
2347 return (target - 1) * 2 * 4;
2348 }
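
/* Rough worked example (numbers are illustrative only): with a maximum
 * pressure of 27 half-regs, reg_count = DIV_ROUND_UP(27, 8) = 4 vec4. If a
 * target of 5 vec4 still gives the same threadsize decision and at least as
 * many waves, but 6 does not, the loop exits with target == 6 and we return
 * (6 - 1) * 2 * 4 = 40 half-regs, i.e. the allocator gets extra breathing
 * room without costing any waves.
 */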
2349
2350 static void
2351 add_pressure(struct ir3_pressure *pressure, struct ir3_register *reg,
2352 bool merged_regs)
2353 {
2354 unsigned size = reg_size(reg);
2355 if (reg->flags & IR3_REG_HALF)
2356 pressure->half += size;
2357 if (!(reg->flags & IR3_REG_HALF) || merged_regs)
2358 pressure->full += size;
2359 }
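
/* For example (illustrative): a half-precision vec2 has reg_size() == 2, so
 * it always adds 2 to pressure->half, and with merged registers it also adds
 * 2 to pressure->full, since half regs then occupy the same physical file.
 */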
2360
2361 static void
2362 dummy_interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
2363 {
2364 }
2365
2366 static void
2367 dummy_interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
2368 {
2369 }
2370
2371 static void
2372 dummy_interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *parent,
2373 struct ir3_reg_interval *child)
2374 {
2375 }
2376
2377 /* Calculate the minimum possible limit on register pressure so that spilling
2378 * still succeeds. Used to implement IR3_SHADER_DEBUG=spillall.
2379 */
2380
2381 static void
2382 calc_min_limit_pressure(struct ir3_shader_variant *v,
2383 struct ir3_liveness *live,
2384 struct ir3_pressure *limit)
2385 {
2386 struct ir3_block *start = ir3_start_block(v->ir);
2387 struct ir3_reg_ctx *ctx = ralloc(NULL, struct ir3_reg_ctx);
2388 struct ir3_reg_interval *intervals =
2389 rzalloc_array(ctx, struct ir3_reg_interval, live->definitions_count);
2390
2391 ctx->interval_add = dummy_interval_add;
2392 ctx->interval_delete = dummy_interval_delete;
2393 ctx->interval_readd = dummy_interval_readd;
2394
2395 limit->full = limit->half = 0;
2396
2397 struct ir3_pressure cur_pressure = {0};
2398 foreach_instr (input, &start->instr_list) {
2399 if (input->opc != OPC_META_INPUT &&
2400 input->opc != OPC_META_TEX_PREFETCH)
2401 break;
2402
2403 add_pressure(&cur_pressure, input->dsts[0], v->mergedregs);
2404 }
2405
2406 limit->full = MAX2(limit->full, cur_pressure.full);
2407 limit->half = MAX2(limit->half, cur_pressure.half);
2408
2409 foreach_instr (input, &start->instr_list) {
2410 if (input->opc != OPC_META_INPUT &&
2411 input->opc != OPC_META_TEX_PREFETCH)
2412 break;
2413
2414 /* Pre-colored inputs may have holes, which increase the pressure. */
2415 struct ir3_register *dst = input->dsts[0];
2416 if (dst->num != INVALID_REG) {
2417 unsigned physreg = ra_reg_get_physreg(dst) + reg_size(dst);
2418 if (dst->flags & IR3_REG_HALF)
2419 limit->half = MAX2(limit->half, physreg);
2420 if (!(dst->flags & IR3_REG_HALF) || v->mergedregs)
2421 limit->full = MAX2(limit->full, physreg);
2422 }
2423 }
2424
2425 foreach_block (block, &v->ir->block_list) {
2426 rb_tree_init(&ctx->intervals);
2427
2428 unsigned name;
2429 BITSET_FOREACH_SET (name, live->live_in[block->index],
2430 live->definitions_count) {
2431 struct ir3_register *reg = live->definitions[name];
2432 ir3_reg_interval_init(&intervals[reg->name], reg);
2433 ir3_reg_interval_insert(ctx, &intervals[reg->name]);
2434 }
2435
2436 foreach_instr (instr, &block->instr_list) {
2437 ra_foreach_dst (dst, instr) {
2438 ir3_reg_interval_init(&intervals[dst->name], dst);
2439 }
2440 /* phis and parallel copies can be deleted via spilling */
2441
2442 if (instr->opc == OPC_META_PHI) {
2443 ir3_reg_interval_insert(ctx, &intervals[instr->dsts[0]->name]);
2444 continue;
2445 }
2446
2447 if (instr->opc == OPC_META_PARALLEL_COPY)
2448 continue;
2449
2450 cur_pressure = (struct ir3_pressure) {0};
2451
2452 ra_foreach_dst (dst, instr) {
2453 if (dst->tied && !(dst->tied->flags & IR3_REG_KILL))
2454 add_pressure(&cur_pressure, dst, v->mergedregs);
2455 }
2456
2457 ra_foreach_src_rev (src, instr) {
2458 /* We currently don't support spilling the parent of a source when
2459 * making space for sources, so we have to keep track of the
2460 * intervals and figure out the root of the tree to figure out how
2461 * much space we need.
2462 *
2463 * TODO: We should probably support this in the spiller.
2464 */
2465 struct ir3_reg_interval *interval = &intervals[src->def->name];
2466 while (interval->parent)
2467 interval = interval->parent;
2468 add_pressure(&cur_pressure, interval->reg, v->mergedregs);
2469
2470 if (src->flags & IR3_REG_FIRST_KILL)
2471 ir3_reg_interval_remove(ctx, &intervals[src->def->name]);
2472 }
2473
2474 limit->full = MAX2(limit->full, cur_pressure.full);
2475 limit->half = MAX2(limit->half, cur_pressure.half);
2476
2477 cur_pressure = (struct ir3_pressure) {0};
2478
2479 ra_foreach_dst (dst, instr) {
2480 ir3_reg_interval_init(&intervals[dst->name], dst);
2481 ir3_reg_interval_insert(ctx, &intervals[dst->name]);
2482 add_pressure(&cur_pressure, dst, v->mergedregs);
2483 }
2484
2485 limit->full = MAX2(limit->full, cur_pressure.full);
2486 limit->half = MAX2(limit->half, cur_pressure.half);
2487 }
2488 }
2489
2490 /* Account for the base register, which needs to be available everywhere. */
2491 limit->full += 2;
2492
2493 ralloc_free(ctx);
2494 }
2495
2496 /*
2497 * If barriers are used, it must be possible for all waves in the workgroup
2498 * to execute concurrently. Thus we may have to reduce the registers limit.
2499 */
2500 static void
2501 calc_limit_pressure_for_cs_with_barrier(struct ir3_shader_variant *v,
2502 struct ir3_pressure *limit_pressure)
2503 {
2504 const struct ir3_compiler *compiler = v->compiler;
2505
2506 unsigned threads_per_wg;
2507 if (v->local_size_variable) {
2508 /* We have to expect the worst case. */
2509 threads_per_wg = compiler->max_variable_workgroup_size;
2510 } else {
2511 threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
2512 }
2513
2514 /* The register file is divided into reg_size_vec4 parts. Each part has
2515 * enough registers to give every thread of a single-sized wave-pair one
2516 * vec4 register. With double threadsize, each wave-pair consumes two
2517 * parts of the register file to get a single vec4 per thread. The more
2518 * active wave-pairs there are, the fewer parts each one can get.
2519 */
2521
2522 bool double_threadsize = ir3_should_double_threadsize(v, 0);
2523 unsigned waves_per_wg = DIV_ROUND_UP(
2524 threads_per_wg, compiler->threadsize_base * (double_threadsize ? 2 : 1) *
2525 compiler->wave_granularity);
2526
2527 uint32_t vec4_regs_per_thread =
2528 compiler->reg_size_vec4 / (waves_per_wg * (double_threadsize ? 2 : 1));
2529 assert(vec4_regs_per_thread > 0);
2530
2531 uint32_t half_regs_per_thread = vec4_regs_per_thread * 4 * 2;
2532
2533 if (limit_pressure->full > half_regs_per_thread) {
2534 if (v->mergedregs) {
2535 limit_pressure->full = half_regs_per_thread;
2536 } else {
2537 /* TODO: Handle !mergedregs case, probably we would have to do this
2538 * after the first register pressure pass.
2539 */
2540 }
2541 }
2542 }
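
/* Worked example (all device numbers are illustrative assumptions): with
 * reg_size_vec4 = 96, threadsize_base = 64, wave_granularity = 2, a
 * 1024-thread workgroup, and single threadsize, waves_per_wg =
 * DIV_ROUND_UP(1024, 64 * 2) = 8, so vec4_regs_per_thread = 96 / 8 = 12 and,
 * with mergedregs, the full limit drops to 12 * 4 * 2 = 96 half-regs.
 */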
2543
2544 int
2545 ir3_ra(struct ir3_shader_variant *v)
2546 {
2547 ir3_calc_dominance(v->ir);
2548
2549 ir3_create_parallel_copies(v->ir);
2550
2551 struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);
2552
2553 ctx->merged_regs = v->mergedregs;
2554 ctx->compiler = v->compiler;
2555 ctx->stage = v->type;
2556
2557 struct ir3_liveness *live = ir3_calc_liveness(ctx, v->ir);
2558
2559 ir3_debug_print(v->ir, "AFTER: create_parallel_copies");
2560
2561 ir3_merge_regs(live, v->ir);
2562
2563 bool has_shared_vectors = false;
2564 foreach_block (block, &v->ir->block_list) {
2565 foreach_instr (instr, &block->instr_list) {
2566 ra_foreach_dst (dst, instr) {
2567 if ((dst->flags & IR3_REG_SHARED) && reg_elems(dst) > 1) {
2568 has_shared_vectors = true;
2569 break;
2570 }
2571 }
2572 }
2573 }
2574
2575 struct ir3_pressure max_pressure;
2576 ir3_calc_pressure(v, live, &max_pressure);
2577 d("max pressure:");
2578 d("\tfull: %u", max_pressure.full);
2579 d("\thalf: %u", max_pressure.half);
2580 d("\tshared: %u", max_pressure.shared);
2581
2582 struct ir3_pressure limit_pressure;
2583 limit_pressure.full = RA_FULL_SIZE;
2584 limit_pressure.half = RA_HALF_SIZE;
2585 limit_pressure.shared = RA_SHARED_SIZE;
2586
2587 if (gl_shader_stage_is_compute(v->type) && v->has_barrier) {
2588 calc_limit_pressure_for_cs_with_barrier(v, &limit_pressure);
2589 }
2590
2591 /* If the user forces a doubled threadsize, we may have to lower the limit
2592 * because on some gens the register file is not big enough to hold a
2593 * double-size wave with all 48 registers in use.
2594 */
2595 if (v->shader_options.real_wavesize == IR3_DOUBLE_ONLY) {
2596 limit_pressure.full =
2597 MIN2(limit_pressure.full, ctx->compiler->reg_size_vec4 / 2 * 16);
2598 }
2599
2600 /* If requested, lower the limit so that spilling happens more often. */
2601 if (ir3_shader_debug & IR3_DBG_SPILLALL)
2602 calc_min_limit_pressure(v, live, &limit_pressure);
2603
2604 if (max_pressure.shared > limit_pressure.shared || has_shared_vectors) {
2605 ir3_ra_shared(v, live);
2606
2607 /* Recalculate liveness and register pressure now that additional values
2608 * have been added.
2609 */
2610 ralloc_free(live);
2611 live = ir3_calc_liveness(ctx, v->ir);
2612 ir3_calc_pressure(v, live, &max_pressure);
2613
2614 ir3_debug_print(v->ir, "AFTER: shared register allocation");
2615 }
2616
2617 bool spilled = false;
2618 if (max_pressure.full > limit_pressure.full ||
2619 max_pressure.half > limit_pressure.half) {
2620 if (!v->compiler->has_pvtmem) {
2621 d("max pressure exceeded!");
2622 goto fail;
2623 }
2624 d("max pressure exceeded, spilling!");
2625 IR3_PASS(v->ir, ir3_spill, v, &live, &limit_pressure);
2626 ir3_calc_pressure(v, live, &max_pressure);
2627 assert(max_pressure.full <= limit_pressure.full &&
2628 max_pressure.half <= limit_pressure.half);
2629 spilled = true;
2630 }
2631
2632 ctx->live = live;
2633 ctx->intervals =
2634 rzalloc_array(ctx, struct ra_interval, live->definitions_count);
2635 ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count);
2636
2637 ctx->full.size = calc_target_full_pressure(v, max_pressure.full);
2638 d("full size: %u", ctx->full.size);
2639
2640 if (!v->mergedregs)
2641 ctx->half.size = RA_HALF_SIZE;
2642
2643 ctx->shared.size = RA_SHARED_SIZE;
2644
2645 ctx->full.start = ctx->half.start = ctx->shared.start = 0;
2646
2647 foreach_block (block, &v->ir->block_list)
2648 handle_block(ctx, block);
2649
2650 ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count, false);
2651
2652 /* Strip array-ness and SSA-ness at the end, because various helpers still
2653 * need to work even on definitions that have already been assigned. For
2654 * example, we need to preserve array-ness so that array live-ins have the
2655 * right size.
2656 */
2657 foreach_block (block, &v->ir->block_list) {
2658 foreach_instr (instr, &block->instr_list) {
2659 for (unsigned i = 0; i < instr->dsts_count; i++) {
2660 instr->dsts[i]->flags &= ~IR3_REG_SSA;
2661
2662 /* Parallel copies of array registers copy the whole register, and
2663 * we need some way to let the parallel copy code know that this was
2664 * an array whose size is determined by reg->size. So keep the array
2665 * flag on those. spill/reload also need to work on the entire
2666 * array.
2667 */
2668 if (!is_meta(instr) && instr->opc != OPC_RELOAD_MACRO)
2669 instr->dsts[i]->flags &= ~IR3_REG_ARRAY;
2670 }
2671
2672 for (unsigned i = 0; i < instr->srcs_count; i++) {
2673 instr->srcs[i]->flags &= ~IR3_REG_SSA;
2674
2675 if (!is_meta(instr) && instr->opc != OPC_SPILL_MACRO)
2676 instr->srcs[i]->flags &= ~IR3_REG_ARRAY;
2677 }
2678 }
2679 }
2680
2681 ir3_debug_print(v->ir, "AFTER: register allocation");
2682
2683 if (spilled) {
2684 IR3_PASS(v->ir, ir3_lower_spill);
2685 }
2686
2687 ir3_lower_copies(v);
2688
2689 ir3_debug_print(v->ir, "AFTER: ir3_lower_copies");
2690
2691 ralloc_free(ctx);
2692
2693 return 0;
2694 fail:
2695 ralloc_free(ctx);
2696 return -1;
2697 }
2698