1 /*
2 * Copyright (C) 2021 Valve Corporation
3 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "ir3_ra.h"
26 #include "util/rb_tree.h"
27 #include "util/u_math.h"
28 #include "ir3_shader.h"
29
30 /* This file implements an SSA-based register allocator. Unlike other
31 * SSA-based allocators, it handles vector split/collect "smartly," meaning
32 * that multiple values may share the same register interval. From the
33 * perspective of the allocator itself, only the top-level intervals matter,
34 * and the allocator is only concerned with allocating top-level intervals,
35 * which may mean moving other top-level intervals around. Other intervals,
36 * like the destination of a split instruction or the source of a collect
37 * instruction, are "locked" to their parent interval. The details of this are
38 * mostly handled by ir3_merge_regs and ir3_reg_ctx.
39 *
40 * We currently don't do any backtracking, but we do use the merge sets as a
41 * form of affinity to try to avoid moves from phis/splits/collects. Each
42 * merge set is what a more "classic" graph-coloring or live-range based
43 * allocator would consider a single register, but here we use it as merely a
44 * hint, except when multiple overlapping values are live at the same time.
45 * Each merge set has a "preferred" register, and we try to honor that when
46 * allocating values in the merge set.
47 */
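/* As a purely illustrative sketch (the SSA names here are made up), a
 * sequence like:
 *
 *    ssa_1 (wrmask=0x3) = ...
 *    ssa_2 = split ssa_1 off:0
 *    ssa_3 = split ssa_1 off:1
 *
 * leaves ssa_2 and ssa_3 "locked" as child intervals of ssa_1's top-level
 * interval: the allocator only ever moves ssa_1, and the physregs of
 * ssa_2/ssa_3 are derived from their offsets inside it.
 */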
48
49 /* ir3_reg_ctx implementation. */
50
51 static int
ir3_reg_interval_cmp(const struct rb_node *node, const void *data)
53 {
54 unsigned reg = *(const unsigned *)data;
55 const struct ir3_reg_interval *interval =
56 ir3_rb_node_to_interval_const(node);
57 if (interval->reg->interval_start > reg)
58 return -1;
59 else if (interval->reg->interval_end <= reg)
60 return 1;
61 else
62 return 0;
63 }
64
65 static struct ir3_reg_interval *
ir3_reg_interval_search(struct rb_tree *tree, unsigned offset)
67 {
68 struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp);
69 return node ? ir3_rb_node_to_interval(node) : NULL;
70 }
71
72 static struct ir3_reg_interval *
ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset)
74 {
75 struct rb_node *node =
76 rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp);
77 return node ? ir3_rb_node_to_interval(node) : NULL;
78 }
79
80 /* Get the interval covering the reg, or the closest to the right if it
81 * doesn't exist.
82 */
83 static struct ir3_reg_interval *
ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset)
85 {
86 struct ir3_reg_interval *interval =
87 ir3_reg_interval_search_sloppy(tree, offset);
88 if (!interval) {
89 return NULL;
90 } else if (interval->reg->interval_end > offset) {
91 return interval;
92 } else {
93 /* There is no interval covering reg, and ra_file_search_sloppy()
94 * returned the closest range to the left, so the next interval to the
95 * right should be the closest to the right.
96 */
97 return ir3_reg_interval_next_or_null(interval);
98 }
99 }
100
101 static int
ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
103 {
104 const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a);
105 const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b);
106 return b->reg->interval_start - a->reg->interval_start;
107 }
108
109 static void
interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree,
                struct ir3_reg_interval *interval)
112 {
113 struct ir3_reg_interval *right =
114 ir3_reg_interval_search_right(tree, interval->reg->interval_start);
115 if (right && right->reg->interval_start < interval->reg->interval_end) {
116 /* We disallow trees where different members have different half-ness.
117 * This means that we can't treat bitcasts as copies like normal
118 * split/collect, so something like this would require an extra copy
119 * in mergedregs mode, and count as 4 half-units of register pressure
120 * instead of 2:
121 *
122 * f16vec2 foo = unpackFloat2x16(bar)
123 * ... = foo.x
124 * ... = bar
125 *
126 * However, relaxing this rule would open a huge can of worms. What
127 * happens when there's a vector of 16 things, and the fifth element
128 * has been bitcasted as a half-reg? Would that element alone have to
129 * be small enough to be used as a half-reg source? Let's keep that
130 * can of worms firmly shut for now.
131 */
132 assert((interval->reg->flags & IR3_REG_HALF) ==
133 (right->reg->flags & IR3_REG_HALF));
134
135 if (right->reg->interval_end <= interval->reg->interval_end &&
136 right->reg->interval_start >= interval->reg->interval_start) {
137 /* Check if we're inserting something that's already inserted */
138 assert(interval != right);
139
140 /* "right" is contained in "interval" and must become a child of
141 * it. There may be further children too.
142 */
143 for (struct ir3_reg_interval *next = ir3_reg_interval_next(right);
144 right && right->reg->interval_start < interval->reg->interval_end;
145 right = next, next = ir3_reg_interval_next_or_null(next)) {
146 /* "right" must be contained in "interval." */
147 assert(right->reg->interval_end <= interval->reg->interval_end);
148 assert((interval->reg->flags & IR3_REG_HALF) ==
149 (right->reg->flags & IR3_REG_HALF));
150 if (!right->parent)
151 ctx->interval_delete(ctx, right);
152 right->parent = interval;
153 rb_tree_remove(tree, &right->node);
154 rb_tree_insert(&interval->children, &right->node,
155 ir3_reg_interval_insert_cmp);
156 }
157 } else {
158 /* "right" must contain "interval," since intervals must form a
159 * tree.
160 */
161 assert(right->reg->interval_start <= interval->reg->interval_start);
162 interval->parent = right;
163 interval_insert(ctx, &right->children, interval);
164 return;
165 }
166 }
167
168 if (!interval->parent)
169 ctx->interval_add(ctx, interval);
170 rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp);
171 interval->inserted = true;
172 }
173
174 void
ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,
                        struct ir3_reg_interval *interval)
177 {
178 rb_tree_init(&interval->children);
179 interval->parent = NULL;
180 interval_insert(ctx, &ctx->intervals, interval);
181 }
182
183 /* Call after ir3_reg_interval_remove_temp() to reinsert the interval */
184 static void
ir3_reg_interval_reinsert(struct ir3_reg_ctx *ctx,
                          struct ir3_reg_interval *interval)
187 {
188 interval->parent = NULL;
189 interval_insert(ctx, &ctx->intervals, interval);
190 }
191
192 void
ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
                        struct ir3_reg_interval *interval)
195 {
196 if (interval->parent) {
197 rb_tree_remove(&interval->parent->children, &interval->node);
198 } else {
199 ctx->interval_delete(ctx, interval);
200 rb_tree_remove(&ctx->intervals, &interval->node);
201 }
202
203 rb_tree_foreach_safe (struct ir3_reg_interval, child, &interval->children,
204 node) {
205 rb_tree_remove(&interval->children, &child->node);
206 child->parent = interval->parent;
207
208 if (interval->parent) {
209 rb_tree_insert(&child->parent->children, &child->node,
210 ir3_reg_interval_insert_cmp);
211 } else {
212 ctx->interval_readd(ctx, interval, child);
213 rb_tree_insert(&ctx->intervals, &child->node,
214 ir3_reg_interval_insert_cmp);
215 }
216 }
217
218 interval->inserted = false;
219 }
220
221 static void
_mark_free(struct ir3_reg_interval *interval)
223 {
224 interval->inserted = false;
225 rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
226 _mark_free(child);
227 }
228 }
229
230 /* Remove an interval and all its children from the tree. */
231 void
ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,
                            struct ir3_reg_interval *interval)
234 {
235 assert(!interval->parent);
236
237 ctx->interval_delete(ctx, interval);
238 rb_tree_remove(&ctx->intervals, &interval->node);
239 _mark_free(interval);
240 }
241
242 /* Used when popping an interval to be shuffled around. Don't disturb children
243 * so that it can be later reinserted.
244 */
245 static void
ir3_reg_interval_remove_temp(struct ir3_reg_ctx *ctx,
                             struct ir3_reg_interval *interval)
248 {
249 assert(!interval->parent);
250
251 ctx->interval_delete(ctx, interval);
252 rb_tree_remove(&ctx->intervals, &interval->node);
253 }
254
255 static void
interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval,
              unsigned indent)
258 {
259 for (unsigned i = 0; i < indent; i++)
260 mesa_log_stream_printf(stream, "\t");
261 mesa_log_stream_printf(stream, "reg %u start %u\n", interval->reg->name,
262 interval->reg->interval_start);
263
264 rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
265 interval_dump(stream, child, indent + 1);
266 }
267
268 for (unsigned i = 0; i < indent; i++)
269 mesa_log_stream_printf(stream, "\t");
270 mesa_log_stream_printf(stream, "reg %u end %u\n", interval->reg->name,
271 interval->reg->interval_end);
272 }
273
274 void
ir3_reg_interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval)
276 {
277 interval_dump(stream, interval, 0);
278 }
279
280 /* These are the core datastructures used by the register allocator. First
281 * ra_interval and ra_file, which are used for intra-block tracking and use
282 * the ir3_reg_ctx infrastructure:
283 */
284
285 struct ra_interval {
286 struct ir3_reg_interval interval;
287
288 struct rb_node physreg_node;
289 physreg_t physreg_start, physreg_end;
290
291 /* True if this is a source of the current instruction which is entirely
292 * killed. This means we can allocate the dest over it, but we can't break
293 * it up.
294 */
295 bool is_killed;
296
297 /* True if this interval cannot be moved from its position. This is only
298 * used for precolored inputs to ensure that other inputs don't get
299 * allocated on top of them.
300 */
301 bool frozen;
302 };
303
304 struct ra_file {
305 struct ir3_reg_ctx reg_ctx;
306
307 BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
308 BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
309
310 struct rb_tree physreg_intervals;
311
312 unsigned size;
313 unsigned start;
314 };
315
316 /* State for inter-block tracking. When we split a live range to make space
317 * for a vector, we may need to insert fixup code when a block has multiple
318 * predecessors that have moved the same live value to different registers.
319 * This keeps track of state required to do that.
320 */
321
322 struct ra_block_state {
323 /* Map of defining ir3_register -> physreg it was allocated to at the end
324 * of the block.
325 */
326 struct hash_table *renames;
327
328 /* For loops, we need to process a block before all its predecessors have
329 * been processed. In particular, we need to pick registers for values
330 * without knowing if all the predecessors have been renamed. This keeps
331 * track of the registers we chose so that when we visit the back-edge we
332 * can move them appropriately. If all predecessors have been visited
333 * before this block is visited then we don't need to fill this out. This
334 * is a map from ir3_register -> physreg.
335 */
336 struct hash_table *entry_regs;
337
338 /* True if the block has been visited and "renames" is complete.
339 */
340 bool visited;
341 };
342
343 struct ra_parallel_copy {
344 struct ra_interval *interval;
345 physreg_t src;
346 };
347
348 /* The main context: */
349
350 struct ra_ctx {
351 /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom
352 * half of this file too.
353 */
354 struct ra_file full;
355
356 /* hr0.x - hr63.w, only used without merged-regs. */
357 struct ra_file half;
358
359 /* Shared regs. */
360 struct ra_file shared;
361
362 struct ir3_liveness *live;
363
364 struct ir3_block *block;
365
366 const struct ir3_compiler *compiler;
367 gl_shader_stage stage;
368
369 /* Pending moves of top-level intervals that will be emitted once we're
370 * finished:
371 */
372 DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies);
373
374 struct ra_interval *intervals;
375 struct ra_block_state *blocks;
376
377 bool merged_regs;
378 };
379
380 #define foreach_interval(interval, file) \
381 rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals, \
382 physreg_node)
#define foreach_interval_rev(interval, file) \
 rb_tree_foreach_rev (struct ra_interval, interval, &(file)->physreg_intervals, \
 physreg_node)
386 #define foreach_interval_safe(interval, file) \
387 rb_tree_foreach_safe (struct ra_interval, interval, \
388 &(file)->physreg_intervals, physreg_node)
389 #define foreach_interval_rev_safe(interval, file) \
390 rb_tree_foreach_rev_safe(struct ra_interval, interval, \
391 &(file)->physreg_intervals, physreg_node)
392
393 static struct ra_interval *
rb_node_to_interval(struct rb_node *node)
395 {
396 return rb_node_data(struct ra_interval, node, physreg_node);
397 }
398
399 static const struct ra_interval *
rb_node_to_interval_const(const struct rb_node *node)
401 {
402 return rb_node_data(struct ra_interval, node, physreg_node);
403 }
404
405 static struct ra_interval *
ra_interval_next(struct ra_interval *interval)
407 {
408 struct rb_node *next = rb_node_next(&interval->physreg_node);
409 return next ? rb_node_to_interval(next) : NULL;
410 }
411
412 static struct ra_interval *
ra_interval_next_or_null(struct ra_interval *interval)
414 {
415 return interval ? ra_interval_next(interval) : NULL;
416 }
417
418 static int
ra_interval_cmp(const struct rb_node *node, const void *data)
420 {
421 physreg_t reg = *(const physreg_t *)data;
422 const struct ra_interval *interval = rb_node_to_interval_const(node);
423 if (interval->physreg_start > reg)
424 return -1;
425 else if (interval->physreg_end <= reg)
426 return 1;
427 else
428 return 0;
429 }
430
431 static struct ra_interval *
ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
433 {
struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
435 return node ? rb_node_to_interval(node) : NULL;
436 }
437
438 /* Get the interval covering the reg, or the closest to the right if it
439 * doesn't exist.
440 */
441 static struct ra_interval *
ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
443 {
444 struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
445 if (!interval) {
446 return NULL;
447 } else if (interval->physreg_end > reg) {
448 return interval;
449 } else {
450 /* There is no interval covering reg, and ra_file_search_sloppy()
451 * returned the closest range to the left, so the next interval to the
452 * right should be the closest to the right.
453 */
454 return ra_interval_next_or_null(interval);
455 }
456 }
457
458 static struct ra_interval *
ra_file_search_right(struct ra_file *file, physreg_t reg)
460 {
461 return ra_interval_search_right(&file->physreg_intervals, reg);
462 }
463
464 static int
ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
466 {
467 const struct ra_interval *a = rb_node_to_interval_const(_a);
468 const struct ra_interval *b = rb_node_to_interval_const(_b);
469 return b->physreg_start - a->physreg_start;
470 }
471
472 static struct ra_interval *
ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
474 {
475 return rb_node_data(struct ra_interval, interval, interval);
476 }
477
478 static struct ra_file *
ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx)
480 {
481 return rb_node_data(struct ra_file, ctx, reg_ctx);
482 }
483
484 static void
interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
486 {
487 struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
488 struct ra_file *file = ir3_reg_ctx_to_file(ctx);
489
490 /* We can assume in this case that physreg_start/physreg_end is already
491 * initialized.
492 */
493 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
494 BITSET_CLEAR(file->available, i);
495 BITSET_CLEAR(file->available_to_evict, i);
496 }
497
498 rb_tree_insert(&file->physreg_intervals, &interval->physreg_node,
499 ra_interval_insert_cmp);
500 }
501
502 static void
interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
504 {
505 struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
506 struct ra_file *file = ir3_reg_ctx_to_file(ctx);
507
508 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
509 BITSET_SET(file->available, i);
510 BITSET_SET(file->available_to_evict, i);
511 }
512
513 rb_tree_remove(&file->physreg_intervals, &interval->physreg_node);
514 }
515
516 static void
interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
               struct ir3_reg_interval *_child)
519 {
520 struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
521 struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);
522
523 child->physreg_start =
524 parent->physreg_start + (child->interval.reg->interval_start -
525 parent->interval.reg->interval_start);
526 child->physreg_end =
527 child->physreg_start +
528 (child->interval.reg->interval_end - child->interval.reg->interval_start);
529
530 interval_add(ctx, _child);
531 }
532
533 static void
ra_file_init(struct ra_file *file)
535 {
536 for (unsigned i = 0; i < file->size; i++) {
537 BITSET_SET(file->available, i);
538 BITSET_SET(file->available_to_evict, i);
539 }
540
541 rb_tree_init(&file->reg_ctx.intervals);
542 rb_tree_init(&file->physreg_intervals);
543
544 file->reg_ctx.interval_add = interval_add;
545 file->reg_ctx.interval_delete = interval_delete;
546 file->reg_ctx.interval_readd = interval_readd;
547 }
548
549 static void
ra_file_insert(struct ra_file *file, struct ra_interval *interval)
551 {
552 assert(interval->physreg_start < interval->physreg_end);
553 assert(interval->physreg_end <= file->size);
554 if (interval->interval.reg->flags & IR3_REG_HALF)
555 assert(interval->physreg_end <= RA_HALF_SIZE);
556
557 ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
558 }
559
560 static void
ra_file_remove(struct ra_file *file, struct ra_interval *interval)
562 {
563 ir3_reg_interval_remove(&file->reg_ctx, &interval->interval);
564 }
565
566 static void
ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval)
568 {
569 assert(!interval->interval.parent);
570
571 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
572 BITSET_SET(file->available, i);
573 }
574
575 interval->is_killed = true;
576 }
577
578 static void
ra_file_unmark_killed(struct ra_file *file, struct ra_interval *interval)
580 {
581 assert(!interval->interval.parent);
582
583 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
584 BITSET_CLEAR(file->available, i);
585 }
586
587 interval->is_killed = false;
588 }
589
590 static physreg_t
ra_interval_get_physreg(const struct ra_interval *interval)
592 {
593 unsigned child_start = interval->interval.reg->interval_start;
594
595 while (interval->interval.parent) {
596 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
597 }
598
599 return interval->physreg_start +
600 (child_start - interval->interval.reg->interval_start);
601 }
602
603 static unsigned
ra_interval_get_num(const struct ra_interval *interval)
605 {
606 return ra_physreg_to_num(ra_interval_get_physreg(interval),
607 interval->interval.reg->flags);
608 }
609
610 static void
ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
612 {
613 ir3_reg_interval_init(&interval->interval, reg);
614 interval->is_killed = false;
615 interval->frozen = false;
616 }
617
618 static void
ra_interval_dump(struct log_stream *stream, struct ra_interval *interval)
620 {
621 mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start);
622
623 ir3_reg_interval_dump(stream, &interval->interval);
624 }
625
626 static void
ra_file_dump(struct log_stream *stream, struct ra_file *file)
628 {
629 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
630 physreg_node) {
631 ra_interval_dump(stream, interval);
632 }
633
634 unsigned start, end;
635 mesa_log_stream_printf(stream, "available:\n");
636 BITSET_FOREACH_RANGE (start, end, file->available, file->size) {
637 mesa_log_stream_printf(stream, "%u-%u ", start, end);
638 }
639 mesa_log_stream_printf(stream, "\n");
640
641 mesa_log_stream_printf(stream, "available to evict:\n");
642 BITSET_FOREACH_RANGE (start, end, file->available_to_evict, file->size) {
643 mesa_log_stream_printf(stream, "%u-%u ", start, end);
644 }
645 mesa_log_stream_printf(stream, "\n");
646 mesa_log_stream_printf(stream, "start: %u\n", file->start);
647 }
648
649 static void
ra_ctx_dump(struct ra_ctx *ctx)
651 {
652 struct log_stream *stream = mesa_log_streami();
653 mesa_log_stream_printf(stream, "full:\n");
654 ra_file_dump(stream, &ctx->full);
655 mesa_log_stream_printf(stream, "half:\n");
656 ra_file_dump(stream, &ctx->half);
mesa_log_stream_printf(stream, "shared:\n");
658 ra_file_dump(stream, &ctx->shared);
659 mesa_log_stream_destroy(stream);
660 }
661
662 static unsigned
reg_file_size(struct ra_file *file, struct ir3_register *reg)
664 {
665 /* Half-regs can only take up the first half of the combined regfile */
666 if (reg->flags & IR3_REG_HALF)
667 return MIN2(file->size, RA_HALF_SIZE);
668 else
669 return file->size;
670 }
671
672 /* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple
673 * top-level intervals at once. Pop multiple intervals, then push them back in
674 * any order.
675 */
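/* For example, try_evict_regs() below swaps a conflicting interval with a
 * killed interval of the same size roughly like this (a sketch of the code
 * further down, with made-up variable names):
 *
 *    struct ra_removed_interval ra = ra_pop_interval(ctx, file, a);
 *    struct ra_removed_interval rb = ra_pop_interval(ctx, file, b);
 *    ra_push_interval(ctx, file, &ra, b_start);
 *    ra_push_interval(ctx, file, &rb, a_start);
 */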
676
677 struct ra_removed_interval {
678 struct ra_interval *interval;
679 unsigned size;
680 };
681
682 static struct ra_removed_interval
ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,
                struct ra_interval *interval)
685 {
686 assert(!interval->interval.parent);
687
688 /* Check if we've already moved this reg before */
689 unsigned pcopy_index;
690 for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count;
691 pcopy_index++) {
692 if (ctx->parallel_copies[pcopy_index].interval == interval)
693 break;
694 }
695
696 if (pcopy_index == ctx->parallel_copies_count) {
697 array_insert(ctx, ctx->parallel_copies,
698 (struct ra_parallel_copy){
699 .interval = interval,
700 .src = interval->physreg_start,
701 });
702 }
703
704 ir3_reg_interval_remove_temp(&file->reg_ctx, &interval->interval);
705
706 return (struct ra_removed_interval){
707 .interval = interval,
708 .size = interval->physreg_end - interval->physreg_start,
709 };
710 }
711
712 static void
ra_push_interval(struct ra_ctx *ctx, struct ra_file *file,
                 const struct ra_removed_interval *removed, physreg_t dst)
715 {
716 struct ra_interval *interval = removed->interval;
717
718 interval->physreg_start = dst;
719 interval->physreg_end = dst + removed->size;
720
721 ir3_reg_interval_reinsert(&file->reg_ctx, &interval->interval);
722 }
723
724 /* Pick up the interval and place it at "dst". */
725 static void
ra_move_interval(struct ra_ctx *ctx, struct ra_file *file,
                 struct ra_interval *interval, physreg_t dst)
728 {
729 struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval);
730 ra_push_interval(ctx, file, &temp, dst);
731 }
732
733 static bool
get_reg_specified(struct ra_file *file, struct ir3_register *reg,
                  physreg_t physreg, bool is_source)
736 {
737 for (unsigned i = 0; i < reg_size(reg); i++) {
738 if (!BITSET_TEST(is_source ? file->available_to_evict : file->available,
739 physreg + i))
740 return false;
741 }
742
743 return true;
744 }
745
746 /* Try to evict any registers conflicting with the proposed spot "physreg" for
747 * "reg". That is, move them to other places so that we can allocate "physreg"
748 * here.
749 */
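/* get_reg() first calls this with speculative=true for each suitably aligned
 * candidate physreg to find the cheapest eviction (measured roughly in moved
 * components), and then calls it again with speculative=false on the winner
 * to actually move the conflicting intervals.
 */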
750
751 static bool
try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
               struct ir3_register *reg, physreg_t physreg,
               unsigned *_eviction_count, bool is_source, bool speculative)
755 {
756 BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
757 memcpy(available_to_evict, file->available_to_evict,
758 sizeof(available_to_evict));
759
760 BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
761 memcpy(available, file->available, sizeof(available));
762
763 for (unsigned i = 0; i < reg_size(reg); i++) {
764 BITSET_CLEAR(available_to_evict, physreg + i);
765 BITSET_CLEAR(available, physreg + i);
766 }
767
768 unsigned eviction_count = 0;
769 /* Iterate over each range conflicting with physreg */
770 for (struct ra_interval *conflicting = ra_file_search_right(file, physreg),
771 *next = ra_interval_next_or_null(conflicting);
772 conflicting != NULL &&
773 conflicting->physreg_start < physreg + reg_size(reg);
774 conflicting = next, next = ra_interval_next_or_null(next)) {
775 if (!is_source && conflicting->is_killed)
776 continue;
777
778 if (conflicting->frozen) {
779 assert(speculative);
780 return false;
781 }
782
783 unsigned conflicting_file_size =
784 reg_file_size(file, conflicting->interval.reg);
785 unsigned avail_start, avail_end;
786 bool evicted = false;
787 BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict,
788 conflicting_file_size) {
789 unsigned size = avail_end - avail_start;
790
791 /* non-half registers must be aligned */
792 if (!(conflicting->interval.reg->flags & IR3_REG_HALF) &&
793 avail_start % 2 == 1) {
794 avail_start++;
795 size--;
796 }
797
798 if (size >= conflicting->physreg_end - conflicting->physreg_start) {
799 for (unsigned i = 0;
800 i < conflicting->physreg_end - conflicting->physreg_start; i++)
801 BITSET_CLEAR(available_to_evict, avail_start + i);
802 eviction_count +=
803 conflicting->physreg_end - conflicting->physreg_start;
804 if (!speculative)
805 ra_move_interval(ctx, file, conflicting, avail_start);
806 evicted = true;
807 break;
808 }
809 }
810
811 if (evicted)
812 continue;
813
814 /* If we couldn't evict this range, we may be able to swap it with a
 * killed range to achieve the same effect.
816 */
817 foreach_interval (killed, file) {
818 if (!killed->is_killed)
819 continue;
820
821 if (killed->physreg_end - killed->physreg_start !=
822 conflicting->physreg_end - conflicting->physreg_start)
823 continue;
824
825 if (killed->physreg_end > conflicting_file_size ||
826 conflicting->physreg_end > reg_file_size(file, killed->interval.reg))
827 continue;
828
829 /* We can't swap the killed range if it partially/fully overlaps the
830 * space we're trying to allocate or (in speculative mode) if it's
831 * already been swapped and will overlap when we actually evict.
832 */
833 bool killed_available = true;
834 for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
835 if (!BITSET_TEST(available, i)) {
836 killed_available = false;
837 break;
838 }
839 }
840
841 if (!killed_available)
842 continue;
843
844 /* Check for alignment if one is a full reg */
845 if ((!(killed->interval.reg->flags & IR3_REG_HALF) ||
846 !(conflicting->interval.reg->flags & IR3_REG_HALF)) &&
847 (killed->physreg_start % 2 != 0 ||
848 conflicting->physreg_start % 2 != 0))
849 continue;
850
851 for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
852 BITSET_CLEAR(available, i);
853 }
854 /* Because this will generate swaps instead of moves, multiply the
855 * cost by 2.
856 */
857 eviction_count += (killed->physreg_end - killed->physreg_start) * 2;
858 if (!speculative) {
859 physreg_t killed_start = killed->physreg_start,
860 conflicting_start = conflicting->physreg_start;
861 struct ra_removed_interval killed_removed =
862 ra_pop_interval(ctx, file, killed);
863 struct ra_removed_interval conflicting_removed =
864 ra_pop_interval(ctx, file, conflicting);
865 ra_push_interval(ctx, file, &killed_removed, conflicting_start);
866 ra_push_interval(ctx, file, &conflicting_removed, killed_start);
867 }
868
869 evicted = true;
870 break;
871 }
872
873 if (!evicted)
874 return false;
875 }
876
877 *_eviction_count = eviction_count;
878 return true;
879 }
880
881 static int
removed_interval_cmp(const void *_i1, const void *_i2)
883 {
884 const struct ra_removed_interval *i1 = _i1;
885 const struct ra_removed_interval *i2 = _i2;
886
887 /* We sort the registers as follows:
888 *
889 * |--------------------------------------------------------------------|
890 * | | | | |
891 * | Half live-through | Half killed | Full killed | Full live-through |
892 * | | | | |
893 * |--------------------------------------------------------------------|
894 * | |
895 * | Destination |
896 * | |
897 * |-----------------|
898 *
899 * Half-registers have to be first so that they stay in the low half of
900 * the register file. Then half and full killed must stay together so that
901 * there's a contiguous range where we can put the register. With this
 * structure we should be able to accommodate any collection of intervals
903 * such that the total number of half components is within the half limit
904 * and the combined components are within the full limit.
905 */
906
907 unsigned i1_align = reg_elem_size(i1->interval->interval.reg);
908 unsigned i2_align = reg_elem_size(i2->interval->interval.reg);
909 if (i1_align > i2_align)
910 return 1;
911 if (i1_align < i2_align)
912 return -1;
913
914 if (i1_align == 1) {
915 if (i2->interval->is_killed)
916 return -1;
917 if (i1->interval->is_killed)
918 return 1;
919 } else {
920 if (i2->interval->is_killed)
921 return 1;
922 if (i1->interval->is_killed)
923 return -1;
924 }
925
926 return 0;
927 }
928
929 /* "Compress" all the live intervals so that there is enough space for the
930 * destination register. As there can be gaps when a more-aligned interval
931 * follows a less-aligned interval, this also sorts them to remove such
932 * "padding", which may be required when space is very tight. This isn't
933 * amazing, but should be used only as a last resort in case the register file
934 * is almost full and badly fragmented.
935 *
936 * Return the physreg to use.
937 */
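/* A hypothetical trace (made-up numbers, in components): with file->size == 8
 * and live-through full intervals at [0, 2) and [4, 6), a request for a
 * 4-component destination (align == 2) pops the interval at [4, 6), keeps
 * [0, 2) where it is (start_reg == 2), pushes the popped interval back at
 * [6, 8), and returns 2, leaving [2, 6) free for the destination.
 */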
938 static physreg_t
compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size,
                   unsigned align, bool is_source)
941 {
942 DECLARE_ARRAY(struct ra_removed_interval, intervals);
943 intervals_count = intervals_sz = 0;
944 intervals = NULL;
945
946 unsigned removed_size = 0, removed_half_size = 0;
947 unsigned file_size =
948 align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;
949 physreg_t start_reg = 0;
950
951 foreach_interval_rev_safe (interval, file) {
952 /* Check if we can sort the intervals *after* this one and have enough
 * space left over to accommodate "size" units. Also check that we have
 * enough space left over for half-registers, if we're inserting a
955 * half-register (otherwise we only shift any half-registers down so they
956 * should be safe).
957 */
958 if (interval->physreg_end + size + removed_size <= file->size &&
959 (align != 1 ||
960 interval->physreg_end + size + removed_half_size <= file_size)) {
961 start_reg = interval->physreg_end;
962 break;
963 }
964
965 /* We assume that all frozen intervals are at the start and that we
966 * can avoid popping them.
967 */
968 assert(!interval->frozen);
969
970 /* Killed sources don't count because they go at the end and can
971 * overlap the register we're trying to add.
972 */
973 if (!interval->is_killed && !is_source) {
974 removed_size += interval->physreg_end - interval->physreg_start;
975 if (interval->interval.reg->flags & IR3_REG_HALF) {
976 removed_half_size += interval->physreg_end -
977 interval->physreg_start;
978 }
979 }
980
981 /* Now that we've done the accounting, pop this off */
982 d("popping interval %u physreg %u\n", interval->interval.reg->name,
983 interval->physreg_start);
984 array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));
985 }
986
987 /* TODO: In addition to skipping registers at the beginning that are
988 * well-packed, we should try to skip registers at the end.
989 */
990
991 qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);
992
993 physreg_t physreg = start_reg;
994 physreg_t ret_reg = (physreg_t)~0;
995 for (unsigned i = 0; i < intervals_count; i++) {
996 if (ret_reg == (physreg_t)~0 &&
997 ((intervals[i].interval->is_killed && !is_source) ||
998 !(intervals[i].interval->interval.reg->flags & IR3_REG_HALF))) {
999 ret_reg = ALIGN(physreg, align);
1000 }
1001
1002 if (ret_reg != (physreg_t)~0 &&
1003 (is_source || !intervals[i].interval->is_killed)) {
1004 physreg = MAX2(physreg, ret_reg + size);
1005 }
1006
1007 if (!(intervals[i].interval->interval.reg->flags & IR3_REG_HALF)) {
1008 physreg = ALIGN(physreg, 2);
1009 }
1010
1011 if (physreg + intervals[i].size >
1012 reg_file_size(file, intervals[i].interval->interval.reg)) {
1013 d("ran out of room for interval %u!\n",
1014 intervals[i].interval->interval.reg->name);
1015 unreachable("reg pressure calculation was wrong!");
1016 return 0;
1017 }
1018
1019 d("pushing interval %u physreg %u\n",
1020 intervals[i].interval->interval.reg->name, physreg);
1021 ra_push_interval(ctx, file, &intervals[i], physreg);
1022
1023 physreg += intervals[i].size;
1024 }
1025
1026 if (ret_reg == (physreg_t)~0)
1027 ret_reg = physreg;
1028
1029 ret_reg = ALIGN(ret_reg, align);
1030 if (ret_reg + size > file_size) {
1031 d("ran out of room for the new interval!\n");
1032 unreachable("reg pressure calculation was wrong!");
1033 return 0;
1034 }
1035
1036 return ret_reg;
1037 }
1038
1039 static void
update_affinity(struct ra_file *file, struct ir3_register *reg,
                physreg_t physreg)
1042 {
1043 if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0)
1044 return;
1045
1046 if (physreg < reg->merge_set_offset)
1047 return;
1048
1049 if ((physreg - reg->merge_set_offset + reg->merge_set->size) > file->size)
1050 return;
1051
1052 reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;
1053 }
1054
1055 /* Try to find free space for a register without shuffling anything. This uses
1056 * a round-robin algorithm to reduce false dependencies.
1057 */
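/* The round-robin part: the search starts at file->start, which is bumped to
 * just past each successful allocation, instead of always starting at physreg
 * 0. Consecutive instructions therefore tend to land in different registers,
 * which is what avoids the false dependencies mentioned above.
 */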
1058 static physreg_t
find_best_gap(struct ra_file *file, unsigned file_size, unsigned size,
              unsigned align, bool is_source)
1061 {
1062 /* This can happen if we create a very large merge set. Just bail out in that
1063 * case.
1064 */
1065 if (size > file_size)
1066 return (physreg_t) ~0;
1067
1068 BITSET_WORD *available =
1069 is_source ? file->available_to_evict : file->available;
1070
1071 unsigned start = ALIGN(file->start, align) % (file_size - size + align);
1072 unsigned candidate = start;
1073 do {
1074 bool is_available = true;
1075 for (unsigned i = 0; i < size; i++) {
1076 if (!BITSET_TEST(available, candidate + i)) {
1077 is_available = false;
1078 break;
1079 }
1080 }
1081
1082 if (is_available) {
1083 file->start = (candidate + size) % file_size;
1084 return candidate;
1085 }
1086
1087 candidate += align;
1088 if (candidate + size > file_size)
1089 candidate = 0;
1090 } while (candidate != start);
1091
1092 return (physreg_t)~0;
1093 }
1094
1095 static struct ra_file *
ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
1097 {
1098 if (reg->flags & IR3_REG_SHARED)
1099 return &ctx->shared;
1100 else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
1101 return &ctx->full;
1102 else
1103 return &ctx->half;
1104 }
1105
1106 /* This is the main entrypoint for picking a register. Pick a free register
1107 * for "reg", shuffling around sources if necessary. In the normal case where
1108 * "is_source" is false, this register can overlap with killed sources
1109 * (intervals with "is_killed == true"). If "is_source" is true, then
1110 * is_killed is ignored and the register returned must not overlap with killed
1111 * sources. This must be used for tied registers, because we're actually
1112 * allocating the destination and the tied source at the same time.
1113 */
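/* For instance, allocate_dst() below passes is_source=true for a tied
 * destination whose source is live-through: the tied source is copied into
 * the returned register and has to co-exist with the instruction's other
 * (killed) sources, so the register must not overlap them.
 */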
1114
1115 static physreg_t
get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg,
        bool is_source)
1118 {
1119 unsigned file_size = reg_file_size(file, reg);
1120 if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
1121 physreg_t preferred_reg =
1122 reg->merge_set->preferred_reg + reg->merge_set_offset;
1123 if (preferred_reg < file_size &&
1124 preferred_reg % reg_elem_size(reg) == 0 &&
1125 get_reg_specified(file, reg, preferred_reg, is_source))
1126 return preferred_reg;
1127 }
1128
1129 /* If this register is a subset of a merge set which we have not picked a
1130 * register for, first try to allocate enough space for the entire merge
1131 * set.
1132 */
1133 unsigned size = reg_size(reg);
1134 if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
1135 size < reg->merge_set->size) {
1136 physreg_t best_reg = find_best_gap(file, file_size, reg->merge_set->size,
1137 reg->merge_set->alignment, is_source);
1138 if (best_reg != (physreg_t)~0u) {
1139 best_reg += reg->merge_set_offset;
1140 return best_reg;
1141 }
1142 }
1143
/* For ALU and SFU instructions, if the src reg is available to pick, use it.
 * This doesn't introduce unnecessary dependencies, and it potentially avoids
 * needing (ss) syncs for write-after-read hazards on SFU instructions:
 */
1149 if (is_sfu(reg->instr) || is_alu(reg->instr)) {
1150 for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
1151 struct ir3_register *src = reg->instr->srcs[i];
1152 if (!ra_reg_is_src(src))
1153 continue;
1154 if (ra_get_file(ctx, src) == file && reg_size(src) >= size) {
1155 struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1156 physreg_t src_physreg = ra_interval_get_physreg(src_interval);
1157 if (src_physreg % reg_elem_size(reg) == 0 &&
1158 src_physreg + size <= file_size &&
1159 get_reg_specified(file, reg, src_physreg, is_source))
1160 return src_physreg;
1161 }
1162 }
1163 }
1164
1165 physreg_t best_reg =
1166 find_best_gap(file, file_size, size, reg_elem_size(reg), is_source);
1167 if (best_reg != (physreg_t)~0u) {
1168 return best_reg;
1169 }
1170
1171 /* Ok, we couldn't find anything that fits. Here is where we have to start
1172 * moving things around to make stuff fit. First try solely evicting
1173 * registers in the way.
1174 */
1175 unsigned best_eviction_count = ~0;
1176 for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {
1177 unsigned eviction_count;
1178 if (try_evict_regs(ctx, file, reg, i, &eviction_count, is_source, true)) {
1179 if (eviction_count < best_eviction_count) {
1180 best_eviction_count = eviction_count;
1181 best_reg = i;
1182 }
1183 }
1184 }
1185
1186 if (best_eviction_count != ~0) {
1187 ASSERTED bool result = try_evict_regs(
1188 ctx, file, reg, best_reg, &best_eviction_count, is_source, false);
1189 assert(result);
1190 return best_reg;
1191 }
1192
1193 /* Use the dumb fallback only if try_evict_regs() fails. */
1194 return compress_regs_left(ctx, file, reg_size(reg), reg_elem_size(reg),
1195 is_source);
1196 }
1197
1198 static void
assign_reg(struct ir3_instruction *instr, struct ir3_register *reg,
           unsigned num)
1201 {
1202 if (reg->flags & IR3_REG_ARRAY) {
1203 reg->array.base = num;
1204 if (reg->flags & IR3_REG_RELATIV)
1205 reg->array.offset += num;
1206 else
1207 reg->num = num + reg->array.offset;
1208 } else {
1209 reg->num = num;
1210 }
1211 }
1212
1213 static void
mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src)
1215 {
1216 struct ra_interval *interval = &ctx->intervals[src->def->name];
1217
1218 if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed ||
1219 interval->interval.parent ||
1220 !rb_tree_is_empty(&interval->interval.children))
1221 return;
1222
1223 ra_file_mark_killed(ra_get_file(ctx, src), interval);
1224 }
1225
1226 static void
insert_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1228 {
1229 struct ra_file *file = ra_get_file(ctx, dst);
1230 struct ra_interval *interval = &ctx->intervals[dst->name];
1231
1232 d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval));
1233
1234 if (!(dst->flags & IR3_REG_UNUSED))
1235 ra_file_insert(file, interval);
1236
1237 assign_reg(dst->instr, dst, ra_interval_get_num(interval));
1238 }
1239
1240 static void
allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst,
                   physreg_t physreg)
1243 {
1244 struct ra_file *file = ra_get_file(ctx, dst);
1245 struct ra_interval *interval = &ctx->intervals[dst->name];
1246 update_affinity(file, dst, physreg);
1247
1248 ra_interval_init(interval, dst);
1249 interval->physreg_start = physreg;
1250 interval->physreg_end = physreg + reg_size(dst);
1251 }
1252
1253 static void
allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1255 {
1256 struct ra_file *file = ra_get_file(ctx, dst);
1257
1258 struct ir3_register *tied = dst->tied;
1259 if (tied) {
1260 struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1261 struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1262 physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
1263 if (tied_interval->is_killed) {
1264 /* The easy case: the source is killed, so we can just reuse it
1265 * for the destination.
1266 */
1267 allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));
1268 } else {
1269 /* The source is live-through, so we need to get a free register
1270 * (which is free for both the source and destination!), copy the
1271 * original source to it, then use that for the source and
1272 * destination.
1273 */
1274 physreg_t physreg = get_reg(ctx, file, dst, true);
1275 allocate_dst_fixed(ctx, dst, physreg);
1276 array_insert(ctx, ctx->parallel_copies,
1277 (struct ra_parallel_copy){
1278 .interval = dst_interval,
1279 .src = tied_physreg,
1280 });
1281 }
1282
1283 return;
1284 }
1285
1286 /* All the hard work is done by get_reg here. */
1287 physreg_t physreg = get_reg(ctx, file, dst, false);
1288
1289 allocate_dst_fixed(ctx, dst, physreg);
1290 }
1291
1292 static void
assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
           struct ir3_register *src)
1295 {
1296 struct ra_interval *interval = &ctx->intervals[src->def->name];
1297 struct ra_file *file = ra_get_file(ctx, src);
1298
1299 struct ir3_register *tied = src->tied;
1300 physreg_t physreg;
1301 if (tied) {
1302 struct ra_interval *tied_interval = &ctx->intervals[tied->name];
1303 physreg = ra_interval_get_physreg(tied_interval);
1304 } else {
1305 physreg = ra_interval_get_physreg(interval);
1306 }
1307
1308 assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags));
1309
1310 if (src->flags & IR3_REG_FIRST_KILL)
1311 ra_file_remove(file, interval);
1312 }
1313
1314 /* Insert a parallel copy instruction before the instruction with the parallel
1315 * copy entries we've built up.
1316 */
1317 static void
insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1319 {
1320 if (ctx->parallel_copies_count == 0)
1321 return;
1322
1323 struct ir3_instruction *pcopy =
1324 ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY,
1325 ctx->parallel_copies_count, ctx->parallel_copies_count);
1326
1327 for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1328 struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1329 struct ir3_register *reg =
1330 ir3_dst_create(pcopy, INVALID_REG,
1331 entry->interval->interval.reg->flags & ~IR3_REG_SSA);
1332 reg->size = entry->interval->interval.reg->size;
1333 reg->wrmask = entry->interval->interval.reg->wrmask;
1334 assign_reg(pcopy, reg, ra_interval_get_num(entry->interval));
1335 }
1336
1337 for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1338 struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1339 struct ir3_register *reg =
1340 ir3_src_create(pcopy, INVALID_REG,
1341 entry->interval->interval.reg->flags & ~IR3_REG_SSA);
1342 reg->size = entry->interval->interval.reg->size;
1343 reg->wrmask = entry->interval->interval.reg->wrmask;
1344 assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags));
1345 }
1346
1347 list_del(&pcopy->node);
1348 list_addtail(&pcopy->node, &instr->node);
1349 ctx->parallel_copies_count = 0;
1350 }
1351
1352 static void
handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1354 {
1355 /* First, mark sources as going-to-be-killed while allocating the dest. */
1356 ra_foreach_src (src, instr) {
1357 mark_src_killed(ctx, src);
1358 }
1359
1360 /* Allocate the destination. */
1361 ra_foreach_dst (dst, instr) {
1362 allocate_dst(ctx, dst);
1363 }
1364
1365 /* Now handle sources. Go backward so that in case there are multiple
1366 * sources with the same def and that def is killed we only remove it at
1367 * the end.
1368 */
1369 ra_foreach_src_rev (src, instr) {
1370 assign_src(ctx, instr, src);
1371 }
1372
1373 /* Now finally insert the destination into the map. */
1374 ra_foreach_dst (dst, instr) {
1375 insert_dst(ctx, dst);
1376 }
1377
1378 insert_parallel_copy_instr(ctx, instr);
1379 }
1380
1381 static void
handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
1383 {
1384 struct ir3_register *dst = instr->dsts[0];
1385 struct ir3_register *src = instr->srcs[0];
1386
1387 if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
1388 handle_normal_instr(ctx, instr);
1389 return;
1390 }
1391
1392 struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1393
1394 physreg_t physreg = ra_interval_get_physreg(src_interval);
1395 assign_src(ctx, instr, src);
1396
1397 allocate_dst_fixed(
1398 ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset);
1399 insert_dst(ctx, dst);
1400 }
1401
1402 static void
handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
1404 {
1405 struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;
1406 unsigned dst_offset = instr->dsts[0]->merge_set_offset;
1407
1408 if (!dst_set || dst_set->regs_count == 1) {
1409 handle_normal_instr(ctx, instr);
1410 return;
1411 }
1412
1413 /* We need to check if any of the sources are contained in an interval
1414 * that is at least as large as the vector. In this case, we should put
1415 * the vector inside that larger interval. (There should be one
1416 * unambiguous place to put it, because values sharing the same merge set
1417 * should be allocated together.) This can happen in a case like:
1418 *
1419 * ssa_1 (wrmask=0xf) = ...
1420 * ssa_2 = split ssa_1 off:0
1421 * ssa_3 = split ssa_1 off:1
1422 * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3
1423 * ... = (kill)ssa_1
1424 * ... = (kill)ssa_4
1425 *
1426 * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it.
1427 */
1428 physreg_t dst_fixed = (physreg_t)~0u;
1429
1430 ra_foreach_src (src, instr) {
1431 if (src->flags & IR3_REG_FIRST_KILL) {
1432 mark_src_killed(ctx, src);
1433 }
1434
1435 struct ra_interval *interval = &ctx->intervals[src->def->name];
1436
1437 if (src->def->merge_set != dst_set || interval->is_killed)
1438 continue;
1439 while (interval->interval.parent != NULL) {
1440 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1441 }
1442 if (reg_size(interval->interval.reg) >= reg_size(instr->dsts[0])) {
1443 dst_fixed = interval->physreg_start -
1444 interval->interval.reg->merge_set_offset + dst_offset;
1445 } else {
1446 /* For sources whose root interval is smaller than the
1447 * destination (i.e. the normal case), we will shuffle them
1448 * around after allocating the destination. Mark them killed so
1449 * that the destination can be allocated over them, even if they
1450 * aren't actually killed.
1451 */
1452 ra_file_mark_killed(ra_get_file(ctx, src), interval);
1453 }
1454 }
1455
1456 if (dst_fixed != (physreg_t)~0u)
1457 allocate_dst_fixed(ctx, instr->dsts[0], dst_fixed);
1458 else
1459 allocate_dst(ctx, instr->dsts[0]);
1460
1461 /* Remove the temporary is_killed we added */
1462 ra_foreach_src (src, instr) {
1463 struct ra_interval *interval = &ctx->intervals[src->def->name];
1464 while (interval->interval.parent != NULL) {
1465 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1466 }
1467
1468 /* Filter out cases where it actually should be killed */
1469 if (interval != &ctx->intervals[src->def->name] ||
1470 !(src->flags & IR3_REG_KILL)) {
1471 ra_file_unmark_killed(ra_get_file(ctx, src), interval);
1472 }
1473 }
1474
1475 ra_foreach_src_rev (src, instr) {
1476 assign_src(ctx, instr, src);
1477 }
1478
1479 /* We need to do this before insert_dst(), so that children of the
1480 * destination which got marked as killed and then shuffled around to make
1481 * space for the destination have the correct pcopy destination that
1482 * matches what we assign the source of the collect to in assign_src().
1483 *
1484 * TODO: In this case we'll wind up copying the value in the pcopy and
1485 * then again in the collect. We could avoid one of those by updating the
1486 * pcopy destination to match up with the final location of the source
1487 * after the collect and making the collect a no-op. However this doesn't
1488 * seem to happen often.
1489 */
1490 insert_parallel_copy_instr(ctx, instr);
1491
/* Note: insert_dst() will automatically shuffle around any intervals that
 * now fall inside the collect's destination by making them children of its
 * interval.
 */
1495
1496 insert_dst(ctx, instr->dsts[0]);
1497 }
1498
1499 /* Parallel copies before RA should only be at the end of the block, for
1500 * phi's. For these we only need to fill in the sources, and then we fill in
1501 * the destinations in the successor block.
1502 */
1503 static void
handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)
1505 {
1506 ra_foreach_src_rev (src, instr) {
1507 assign_src(ctx, instr, src);
1508 }
1509 }
1510
1511 /* Some inputs may need to be precolored. We need to handle those first, so
1512 * that other non-precolored inputs don't accidentally get allocated over
1513 * them. Inputs are the very first thing in the shader, so it shouldn't be a
1514 * problem to allocate them to a specific physreg.
1515 */
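/* Precolored inputs are inserted with interval->frozen set, which makes
 * try_evict_regs() refuse to move them (this only ever happens in speculative
 * mode); the flag is cleared again in assign_input() once all inputs have
 * been assigned.
 */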
1516
1517 static void
handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1519 {
1520 if (instr->dsts[0]->num == INVALID_REG)
1521 return;
1522
1523 struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1524 physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]);
1525 allocate_dst_fixed(ctx, instr->dsts[0], physreg);
1526 insert_dst(ctx, instr->dsts[0]);
1527 interval->frozen = true;
1528 }
1529
1530 static void
handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1532 {
1533 if (instr->dsts[0]->num != INVALID_REG)
1534 return;
1535
1536 allocate_dst(ctx, instr->dsts[0]);
1537
1538 struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1539 struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1540 ra_file_insert(file, interval);
1541 }
1542
1543 static void
assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1545 {
1546 struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1547 struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1548
1549 if (instr->dsts[0]->num == INVALID_REG) {
1550 assign_reg(instr, instr->dsts[0], ra_interval_get_num(interval));
1551 } else {
1552 interval->frozen = false;
1553 }
1554
1555 if (instr->dsts[0]->flags & IR3_REG_UNUSED)
1556 ra_file_remove(file, interval);
1557
1558 ra_foreach_src_rev (src, instr)
1559 assign_src(ctx, instr, src);
1560 }
1561
1562 /* chmask is a bit weird, because it has pre-colored sources due to the need
1563 * to pass some registers to the next stage. Fortunately there are at
1564 * most two, and there should be no other live values by the time we get to
1565 * this instruction, so we only have to do the minimum and don't need any
1566 * fancy fallbacks.
1567 *
1568 * TODO: Add more complete handling of precolored sources, e.g. for function
1569 * argument handling. We'd need a way to mark sources as fixed so that they
1570 * don't get moved around when placing other sources in the fallback case, and
1571 * a duplication of much of the logic in get_reg(). This also opens another
1572 * can of worms, e.g. what if the precolored source is a split of a vector
1573 * which is still live -- this breaks our assumption that splits don't incur
1574 * any "extra" register requirements and we'd have to break it out of the
1575 * parent ra_interval.
1576 */
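/* Illustrative flow, with hypothetical register numbers: if a chmask source
 * must end up in r0.x but its interval currently sits in r3.y, then
 * handle_precolored_source() below evicts whatever occupies r0.x (if
 * anything) and moves the interval there; the actual moves are materialized
 * by the parallel copy emitted at the end of handle_chmask().
 */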
1577
1578 static void
1579 handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src)
1580 {
1581 struct ra_file *file = ra_get_file(ctx, src);
1582 struct ra_interval *interval = &ctx->intervals[src->def->name];
1583 physreg_t physreg = ra_reg_get_physreg(src);
1584
1585 if (ra_interval_get_num(interval) == src->num)
1586 return;
1587
1588 /* Try evicting stuff in our way if it isn't free. This won't move
1589 * anything unless it overlaps with our precolored physreg, so we don't
1590 * have to worry about evicting other precolored sources.
1591 */
1592 if (!get_reg_specified(file, src, physreg, true)) {
1593 unsigned eviction_count;
1594 if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true,
1595 false)) {
1596 unreachable("failed to evict for precolored source!");
1597 return;
1598 }
1599 }
1600
1601 ra_move_interval(ctx, file, interval, physreg);
1602 }
1603
1604 static void
1605 handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr)
1606 {
1607 /* Note: we purposely don't mark sources as killed, so that we can reuse
1608 * some of the get_reg() machinery as if the source were a destination.
1609 * Marking them as killed would mean that e.g. get_reg_specified() wouldn't
1610 * work correctly.
1611 */
1612 ra_foreach_src (src, instr) {
1613 assert(src->num != INVALID_REG);
1614 handle_precolored_source(ctx, src);
1615 }
1616
1617 ra_foreach_src (src, instr) {
1618 struct ra_file *file = ra_get_file(ctx, src);
1619 struct ra_interval *interval = &ctx->intervals[src->def->name];
1620 if (src->flags & IR3_REG_FIRST_KILL)
1621 ra_file_remove(file, interval);
1622 }
1623
1624 insert_parallel_copy_instr(ctx, instr);
1625 }
1626
1627 static physreg_t
1628 read_register(struct ra_ctx *ctx, struct ir3_block *block,
1629 struct ir3_register *def)
1630 {
1631 struct ra_block_state *state = &ctx->blocks[block->index];
1632 if (state->renames) {
1633 struct hash_entry *entry = _mesa_hash_table_search(state->renames, def);
1634 if (entry) {
1635 return (physreg_t)(uintptr_t)entry->data;
1636 }
1637 }
1638
1639 return ra_reg_get_physreg(def);
1640 }
1641
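/* A live-in gets an interval at whatever register a previously-visited
 * predecessor left it in; no instructions are emitted here. If the other
 * predecessors put it somewhere else, the copies are added later by
 * insert_live_in_moves().
 */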
1642 static void
1643 handle_live_in(struct ra_ctx *ctx, struct ir3_register *def)
1644 {
1645 physreg_t physreg = ~0;
1646 for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
1647 struct ir3_block *pred = ctx->block->predecessors[i];
1648 struct ra_block_state *pred_state = &ctx->blocks[pred->index];
1649
1650 if (!pred_state->visited)
1651 continue;
1652
1653 physreg = read_register(ctx, pred, def);
1654 break;
1655 }
1656
1657 assert(physreg != (physreg_t)~0);
1658
1659 struct ra_interval *interval = &ctx->intervals[def->name];
1660 struct ra_file *file = ra_get_file(ctx, def);
1661 ra_interval_init(interval, def);
1662 interval->physreg_start = physreg;
1663 interval->physreg_end = physreg + reg_size(def);
1664 ra_file_insert(file, interval);
1665 }
1666
1667 static void
1668 handle_live_out(struct ra_ctx *ctx, struct ir3_register *def)
1669 {
1670 /* Skip parallel copies which in the original program are only used as phi
1671 * arguments. Even though phi arguments are live out, they are only
1672 * assigned when the phi is.
1673 */
1674 if (def->instr->opc == OPC_META_PARALLEL_COPY)
1675 return;
1676
1677 struct ra_block_state *state = &ctx->blocks[ctx->block->index];
1678 struct ra_interval *interval = &ctx->intervals[def->name];
1679 physreg_t physreg = ra_interval_get_physreg(interval);
1680 if (physreg != ra_reg_get_physreg(def)) {
1681 if (!state->renames)
1682 state->renames = _mesa_pointer_hash_table_create(ctx);
1683 _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg);
1684 }
1685 }
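/* How the renames map is used (a sketch; the register numbers are
 * hypothetical): if a value was originally assigned r4.x but ended up in
 * r6.z by the end of its block, handle_live_out() records def -> r6.z in
 * state->renames, and read_register() in a successor block then returns
 * r6.z instead of the original assignment from ra_reg_get_physreg().
 */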
1686
1687 static void
1688 handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
1689 {
1690 struct ra_file *file = ra_get_file(ctx, def);
1691 struct ra_interval *interval = &ctx->intervals[def->name];
1692
1693 /* phis are always scalar, so they should already be the smallest possible
1694 * size. However they may be coalesced with other live-in values/phi
1695 * nodes, so check for that here.
1696 */
1697 struct ir3_reg_interval *parent_ir3 =
1698 ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start);
1699 physreg_t physreg;
1700 if (parent_ir3) {
1701 struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3);
1702 physreg = ra_interval_get_physreg(parent) +
1703 (def->interval_start - parent_ir3->reg->interval_start);
1704 } else {
1705 physreg = get_reg(ctx, file, def, false);
1706 }
1707
1708 allocate_dst_fixed(ctx, def, physreg);
1709
1710 ra_file_insert(file, interval);
1711 }
1712
1713 static void
1714 assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
1715 {
1716 struct ra_file *file = ra_get_file(ctx, phi->dsts[0]);
1717 struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name];
1718 assert(!interval->interval.parent);
1719 unsigned num = ra_interval_get_num(interval);
1720 assign_reg(phi, phi->dsts[0], num);
1721
1722 /* Assign the parallelcopy sources of this phi */
1723 for (unsigned i = 0; i < phi->srcs_count; i++) {
1724 if (phi->srcs[i]->def) {
1725 assign_reg(phi, phi->srcs[i], num);
1726 assign_reg(phi, phi->srcs[i]->def, num);
1727 }
1728 }
1729
1730 if (phi->dsts[0]->flags & IR3_REG_UNUSED)
1731 ra_file_remove(file, interval);
1732 }
1733
1734 /* When we split a live range, we sometimes need to emit fixup code at the end
1735 * of a block. For example, something like:
1736 *
1737 * a = ...
1738 * if (...) {
1739 * ...
1740 * a' = a
1741 * b = ... // a evicted to make room for b
1742 * ...
1743 * }
1744 * ... = a
1745 *
1746 * When we insert the copy to a' in insert_parallel_copy_instr(), this forces
1747 * us to insert another copy "a = a'" at the end of the if. Normally this would
1748 * also entail adding a phi node, but since we're about to go out of SSA
1749 * anyway we just insert an extra move. Note, however, that "b" might be used
1750 * in a phi node at the end of the if and share registers with "a", so we
1751 * have to be careful to extend any preexisting parallelcopy instruction
1752 * instead of creating our own in order to guarantee that they properly get
1753 * swapped.
1754 */
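/* Continuing the example above (register numbers are hypothetical): if "a"
 * was assigned r0.x and "a'" ended up in r2.y, insert_liveout_copy() appends
 *
 *   parallel_copy r0.x, r2.y        ; "a = a'"
 *
 * to the end of the if, folding it into any parallel copy that is already
 * there so the copies get swapped/sequenced correctly.
 */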
1755
1756 static void
1757 insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
1758 struct ir3_register *reg)
1759 {
1760 struct ir3_instruction *old_pcopy = NULL;
1761 if (!list_is_empty(&block->instr_list)) {
1762 struct ir3_instruction *last =
1763 LIST_ENTRY(struct ir3_instruction, block->instr_list.prev, node);
1764 if (last->opc == OPC_META_PARALLEL_COPY)
1765 old_pcopy = last;
1766 }
1767
1768 unsigned old_pcopy_srcs = old_pcopy ? old_pcopy->srcs_count : 0;
1769 struct ir3_instruction *pcopy = ir3_instr_create(
1770 block, OPC_META_PARALLEL_COPY, old_pcopy_srcs + 1, old_pcopy_srcs + 1);
1771
1772 for (unsigned i = 0; i < old_pcopy_srcs; i++) {
1773 old_pcopy->dsts[i]->instr = pcopy;
1774 pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i];
1775 }
1776
1777 struct ir3_register *dst_reg =
1778 ir3_dst_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);
1779 dst_reg->wrmask = reg->wrmask;
1780 dst_reg->size = reg->size;
1781 assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags));
1782
1783 for (unsigned i = 0; i < old_pcopy_srcs; i++) {
1784 pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i];
1785 }
1786
1787 struct ir3_register *src_reg =
1788 ir3_src_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);
1789 src_reg->wrmask = reg->wrmask;
1790 src_reg->size = reg->size;
1791 assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags));
1792
1793 if (old_pcopy)
1794 list_del(&old_pcopy->node);
1795 }
1796
1797 static void
1798 insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
1799 {
1800 physreg_t physreg = ra_interval_get_physreg(interval);
1801
1802 bool shared = interval->interval.reg->flags & IR3_REG_SHARED;
1803 struct ir3_block **predecessors =
1804 shared ? ctx->block->physical_predecessors : ctx->block->predecessors;
1805 unsigned predecessors_count = shared
1806 ? ctx->block->physical_predecessors_count
1807 : ctx->block->predecessors_count;
1808
1809 for (unsigned i = 0; i < predecessors_count; i++) {
1810 struct ir3_block *pred = predecessors[i];
1811 struct ra_block_state *pred_state = &ctx->blocks[pred->index];
1812
1813 if (!pred_state->visited)
1814 continue;
1815
1816 physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);
1817 if (pred_reg != physreg) {
1818 insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);
1819
1820 /* This is a bit tricky: when visiting the destination of a
1821 * physical-only edge, we have two predecessors (the if and the
1822 * header block), and both have multiple successors. We pick the
1823 * register for all live-ins from the normal edge, which should
1824 * guarantee that there's no need for shuffling things around in
1825 * the normal predecessor as long as there are no phi nodes. We
1826 * may still need to insert fixup code in the physical
1827 * predecessor (i.e. the last block of the if), and that block has
1828 * another successor (the block after the if), so we need to
1829 * update the renames state for when we process that other
1830 * successor. This crucially depends on the other successor
1831 * getting processed after this one.
1832 *
1833 * For normal (non-physical) edges we disallow critical edges so
1834 * that hacks like this aren't necessary.
1835 */
1836 if (!pred_state->renames)
1837 pred_state->renames = _mesa_pointer_hash_table_create(ctx);
1838 _mesa_hash_table_insert(pred_state->renames, interval->interval.reg,
1839 (void *)(uintptr_t)physreg);
1840 }
1841 }
1842 }
1843
1844 static void
1845 insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file)
1846 {
1847 BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index];
1848 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
1849 physreg_node) {
1850 /* Skip phi nodes: they aren't in the live-in set, so the test below skips
1851 * them. This has to run after phis are allocated, since live-ins may have
1852 * been moved around to make space for them, but phis aren't handled here.
1853 */
1854 if (BITSET_TEST(live_in, interval->interval.reg->name))
1855 insert_live_in_move(ctx, interval);
1856 }
1857 }
1858
1859 static void
1860 insert_entry_regs(struct ra_block_state *state, struct ra_file *file)
1861 {
1862 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
1863 physreg_node) {
1864 _mesa_hash_table_insert(state->entry_regs, interval->interval.reg,
1865 (void *)(uintptr_t)interval->physreg_start);
1866 }
1867 }
1868
1869 static void
1870 insert_live_in_moves(struct ra_ctx *ctx)
1871 {
1872 insert_file_live_in_moves(ctx, &ctx->full);
1873 insert_file_live_in_moves(ctx, &ctx->half);
1874 insert_file_live_in_moves(ctx, &ctx->shared);
1875
1876 /* If not all predecessors have been visited, record where the live-ins
1877 * ended up (entry_regs) so that insert_live_out_moves() works later.
1878 */
1879 bool all_preds_visited = true;
1880 for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
1881 if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) {
1882 all_preds_visited = false;
1883 break;
1884 }
1885 }
1886
1887 if (!all_preds_visited) {
1888 struct ra_block_state *state = &ctx->blocks[ctx->block->index];
1889 state->entry_regs = _mesa_pointer_hash_table_create(ctx);
1890
1891 insert_entry_regs(state, &ctx->full);
1892 insert_entry_regs(state, &ctx->half);
1893 insert_entry_regs(state, &ctx->shared);
1894 }
1895 }
1896
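/* Mirror of insert_live_in_moves() for successors that were visited before
 * all of their predecessors (typically loop headers): copy each value into
 * the register recorded in the successor's entry_regs map.
 */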
1897 static void
1898 insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval)
1899 {
1900 for (unsigned i = 0; i < 2; i++) {
1901 if (!ctx->block->successors[i])
1902 continue;
1903
1904 struct ir3_block *succ = ctx->block->successors[i];
1905 struct ra_block_state *succ_state = &ctx->blocks[succ->index];
1906
1907 if (!succ_state->visited)
1908 continue;
1909
1910 struct hash_entry *entry = _mesa_hash_table_search(
1911 succ_state->entry_regs, interval->interval.reg);
1912 if (!entry)
1913 continue;
1914
1915 physreg_t new_reg = (physreg_t)(uintptr_t)entry->data;
1916 if (new_reg != interval->physreg_start) {
1917 insert_liveout_copy(ctx->block, new_reg, interval->physreg_start,
1918 interval->interval.reg);
1919 }
1920 }
1921 }
1922
1923 static void
1924 insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file)
1925 {
1926 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
1927 physreg_node) {
1928 insert_live_out_move(ctx, interval);
1929 }
1930 }
1931
1932 static void
1933 insert_live_out_moves(struct ra_ctx *ctx)
1934 {
1935 insert_file_live_out_moves(ctx, &ctx->full);
1936 insert_file_live_out_moves(ctx, &ctx->half);
1937 insert_file_live_out_moves(ctx, &ctx->shared);
1938 }
1939
1940 static void
1941 handle_block(struct ra_ctx *ctx, struct ir3_block *block)
1942 {
1943 ctx->block = block;
1944
1945 /* Reset the register files from the last block */
1946 ra_file_init(&ctx->full);
1947 ra_file_init(&ctx->half);
1948 ra_file_init(&ctx->shared);
1949
1950 /* Handle live-ins, phis, and input meta-instructions. These all appear
1951 * live at the beginning of the block and interfere with each other, and
1952 * therefore need to be allocated "in parallel". This means that we
1953 * have to allocate all of them, inserting them into the file, and then
1954 * delay updating the IR until all of them are allocated.
1955 *
1956 * Handle precolored inputs first, because we need to make sure that other
1957 * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes
1958 * and inputs at the same time, because the first block doesn't have
1959 * predecessors. Therefore handle_live_in doesn't have to worry about
1960 * them.
1961 */
1962
1963 foreach_instr (instr, &block->instr_list) {
1964 if (instr->opc == OPC_META_INPUT)
1965 handle_precolored_input(ctx, instr);
1966 else
1967 break;
1968 }
1969
1970 unsigned name;
1971 BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
1972 ctx->live->definitions_count) {
1973 struct ir3_register *reg = ctx->live->definitions[name];
1974 handle_live_in(ctx, reg);
1975 }
1976
1977 foreach_instr (instr, &block->instr_list) {
1978 if (instr->opc == OPC_META_PHI)
1979 handle_phi(ctx, instr->dsts[0]);
1980 else if (instr->opc == OPC_META_INPUT ||
1981 instr->opc == OPC_META_TEX_PREFETCH)
1982 handle_input(ctx, instr);
1983 else
1984 break;
1985 }
1986
1987 /* After this point, every live-in/phi/input has an interval assigned to
1988 * it. We delay actually assigning values until everything has been
1989 * allocated, so we can simply ignore any parallel copy entries created
1990 * when shuffling them around.
1991 */
1992 ctx->parallel_copies_count = 0;
1993
1994 insert_live_in_moves(ctx);
1995
1996 if (RA_DEBUG) {
1997 d("after live-in block %u:\n", block->index);
1998 ra_ctx_dump(ctx);
1999 }
2000
2001 /* Now we're done with processing live-ins, and can handle the body of the
2002 * block.
2003 */
2004 foreach_instr (instr, &block->instr_list) {
2005 di(instr, "processing");
2006
2007 if (instr->opc == OPC_META_PHI)
2008 assign_phi(ctx, instr);
2009 else if (instr->opc == OPC_META_INPUT ||
2010 instr->opc == OPC_META_TEX_PREFETCH)
2011 assign_input(ctx, instr);
2012 else if (instr->opc == OPC_META_SPLIT)
2013 handle_split(ctx, instr);
2014 else if (instr->opc == OPC_META_COLLECT)
2015 handle_collect(ctx, instr);
2016 else if (instr->opc == OPC_META_PARALLEL_COPY)
2017 handle_pcopy(ctx, instr);
2018 else if (instr->opc == OPC_CHMASK)
2019 handle_chmask(ctx, instr);
2020 else
2021 handle_normal_instr(ctx, instr);
2022
2023 if (RA_DEBUG)
2024 ra_ctx_dump(ctx);
2025 }
2026
2027 insert_live_out_moves(ctx);
2028
2029 BITSET_FOREACH_SET (name, ctx->live->live_out[block->index],
2030 ctx->live->definitions_count) {
2031 struct ir3_register *reg = ctx->live->definitions[name];
2032 handle_live_out(ctx, reg);
2033 }
2034
2035 ctx->blocks[block->index].visited = true;
2036 }
2037
2038 static unsigned
2039 calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure)
2040 {
2041 /* Registers are allocated in units of vec4, so switch from units of
2042 * half-regs to vec4.
2043 */
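/* Worked example (made-up numbers): pressure = 20 half-regs rounds up to
 * DIV_ROUND_UP(20, 2 * 4) = 3 vec4 regs. If budgets of 3, 4 and 5 vec4 all
 * sustain the same wave count and threadsize decision but 6 does not, the
 * loop below stops with target == 6 and we return (6 - 1) * 2 * 4 = 40
 * half-regs, i.e. a 5-vec4 register file.
 */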
2044 unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4);
2045
2046 bool double_threadsize = ir3_should_double_threadsize(v, reg_count);
2047
2048 unsigned target = reg_count;
2049 unsigned reg_independent_max_waves =
2050 ir3_get_reg_independent_max_waves(v, double_threadsize);
2051 unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
2052 v->shader->compiler, reg_count, double_threadsize);
2053 unsigned target_waves =
2054 MIN2(reg_independent_max_waves, reg_dependent_max_waves);
2055
2056 while (target <= RA_FULL_SIZE / (2 * 4) &&
2057 ir3_should_double_threadsize(v, target) == double_threadsize &&
2058 ir3_get_reg_dependent_max_waves(v->shader->compiler, target,
2059 double_threadsize) >= target_waves)
2060 target++;
2061
2062 return (target - 1) * 2 * 4;
2063 }
2064
2065 static void
2066 add_pressure(struct ir3_pressure *pressure, struct ir3_register *reg,
2067 bool merged_regs)
2068 {
2069 unsigned size = reg_size(reg);
2070 if (reg->flags & IR3_REG_HALF)
2071 pressure->half += size;
2072 if (!(reg->flags & IR3_REG_HALF) || merged_regs)
2073 pressure->full += size;
2074 }
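/* Worked example, assuming reg_size() reports sizes in half-reg units (as
 * the half-regs -> vec4 conversion above implies): a full vec2 adds 4 to
 * pressure->full; a half scalar adds 1 to pressure->half, and with merged
 * register files it also adds 1 to pressure->full, since halves and fulls
 * then share the same file.
 */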
2075
2076 static void
2077 dummy_interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
2078 {
2079 }
2080
2081 static void
2082 dummy_interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
2083 {
2084 }
2085
2086 static void
2087 dummy_interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *parent,
2088 struct ir3_reg_interval *child)
2089 {
2090 }
2091
2092 /* Calculate the minimum possible limit on register pressure so that spilling
2093 * still succeeds. Used to implement IR3_SHADER_DEBUG=spillall.
2094 */
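/* Usage sketch (the debug-flag spelling comes from the comment above; the
 * command line itself is illustrative):
 *
 *   IR3_SHADER_DEBUG=spillall ./my_gl_app
 *
 * ir3_ra() then calls this function to shrink limit_pressure to the bare
 * minimum, forcing the spiller to spill nearly every value.
 */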
2095
2096 static void
2097 calc_min_limit_pressure(struct ir3_shader_variant *v,
2098 struct ir3_liveness *live,
2099 struct ir3_pressure *limit)
2100 {
2101 struct ir3_block *start = ir3_start_block(v->ir);
2102 struct ir3_reg_ctx *ctx = ralloc(NULL, struct ir3_reg_ctx);
2103 struct ir3_reg_interval *intervals =
2104 rzalloc_array(ctx, struct ir3_reg_interval, live->definitions_count);
2105
2106 ctx->interval_add = dummy_interval_add;
2107 ctx->interval_delete = dummy_interval_delete;
2108 ctx->interval_readd = dummy_interval_readd;
2109
2110 limit->full = limit->half = 0;
2111
2112 struct ir3_pressure cur_pressure = {0};
2113 foreach_instr (input, &start->instr_list) {
2114 if (input->opc != OPC_META_INPUT &&
2115 input->opc != OPC_META_TEX_PREFETCH)
2116 break;
2117
2118 add_pressure(&cur_pressure, input->dsts[0], v->mergedregs);
2119 }
2120
2121 limit->full = MAX2(limit->full, cur_pressure.full);
2122 limit->half = MAX2(limit->half, cur_pressure.half);
2123
2124 foreach_instr (input, &start->instr_list) {
2125 if (input->opc != OPC_META_INPUT &&
2126 input->opc != OPC_META_TEX_PREFETCH)
2127 break;
2128
2129 /* pre-colored inputs may have holes, which increases the pressure. */
2130 struct ir3_register *dst = input->dsts[0];
2131 if (dst->num != INVALID_REG) {
2132 unsigned physreg = ra_reg_get_physreg(dst) + reg_size(dst);
2133 if (dst->flags & IR3_REG_HALF)
2134 limit->half = MAX2(limit->half, physreg);
2135 if (!(dst->flags & IR3_REG_HALF) || v->mergedregs)
2136 limit->full = MAX2(limit->full, physreg);
2137 }
2138 }
2139
2140 foreach_block (block, &v->ir->block_list) {
2141 rb_tree_init(&ctx->intervals);
2142
2143 unsigned name;
2144 BITSET_FOREACH_SET (name, live->live_in[block->index],
2145 live->definitions_count) {
2146 struct ir3_register *reg = live->definitions[name];
2147 ir3_reg_interval_init(&intervals[reg->name], reg);
2148 ir3_reg_interval_insert(ctx, &intervals[reg->name]);
2149 }
2150
2151 foreach_instr (instr, &block->instr_list) {
2152 ra_foreach_dst (dst, instr) {
2153 ir3_reg_interval_init(&intervals[dst->name], dst);
2154 }
2155 /* phis and parallel copies can be deleted via spilling */
2156
2157 if (instr->opc == OPC_META_PHI) {
2158 ir3_reg_interval_insert(ctx, &intervals[instr->dsts[0]->name]);
2159 continue;
2160 }
2161
2162 if (instr->opc == OPC_META_PARALLEL_COPY)
2163 continue;
2164
2165 cur_pressure = (struct ir3_pressure) {0};
2166
2167 ra_foreach_dst (dst, instr) {
2168 if (dst->tied && !(dst->tied->flags & IR3_REG_KILL))
2169 add_pressure(&cur_pressure, dst, v->mergedregs);
2170 }
2171
2172 ra_foreach_src_rev (src, instr) {
2173 /* We currently don't support spilling the parent of a source when
2174 * making space for sources, so we have to keep track of the
2175 * intervals and walk up to the root of the tree to determine how
2176 * much space we need.
2177 *
2178 * TODO: We should probably support this in the spiller.
2179 */
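/* Illustrative case: if this source is a split of a 4-component collect
 * that is still live, the split's interval is a child of the collect's
 * interval, so we charge the full size of the collect (the root) rather
 * than just the split's single component.
 */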
2180 struct ir3_reg_interval *interval = &intervals[src->def->name];
2181 while (interval->parent)
2182 interval = interval->parent;
2183 add_pressure(&cur_pressure, interval->reg, v->mergedregs);
2184
2185 if (src->flags & IR3_REG_FIRST_KILL)
2186 ir3_reg_interval_remove(ctx, &intervals[src->def->name]);
2187 }
2188
2189 limit->full = MAX2(limit->full, cur_pressure.full);
2190 limit->half = MAX2(limit->half, cur_pressure.half);
2191
2192 cur_pressure = (struct ir3_pressure) {0};
2193
2194 ra_foreach_dst (dst, instr) {
2195 ir3_reg_interval_init(&intervals[dst->name], dst);
2196 ir3_reg_interval_insert(ctx, &intervals[dst->name]);
2197 add_pressure(&cur_pressure, dst, v->mergedregs);
2198 }
2199
2200 limit->full = MAX2(limit->full, cur_pressure.full);
2201 limit->half = MAX2(limit->half, cur_pressure.half);
2202 }
2203 }
2204
2205 /* Account for the base register, which needs to be available everywhere. */
2206 limit->full += 2;
2207
2208 ralloc_free(ctx);
2209 }
2210
2211 int
2212 ir3_ra(struct ir3_shader_variant *v)
2213 {
2214 ir3_calc_dominance(v->ir);
2215
2216 ir3_create_parallel_copies(v->ir);
2217
2218 struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);
2219
2220 ctx->merged_regs = v->mergedregs;
2221 ctx->compiler = v->shader->compiler;
2222 ctx->stage = v->type;
2223
2224 struct ir3_liveness *live = ir3_calc_liveness(ctx, v->ir);
2225
2226 ir3_debug_print(v->ir, "AFTER: create_parallel_copies");
2227
2228 ir3_merge_regs(live, v->ir);
2229
2230 struct ir3_pressure max_pressure;
2231 ir3_calc_pressure(v, live, &max_pressure);
2232 d("max pressure:");
2233 d("\tfull: %u", max_pressure.full);
2234 d("\thalf: %u", max_pressure.half);
2235 d("\tshared: %u", max_pressure.shared);
2236
2237 /* TODO: calculate half/full limit correctly for CS with barrier */
2238 struct ir3_pressure limit_pressure;
2239 limit_pressure.full = RA_FULL_SIZE;
2240 limit_pressure.half = RA_HALF_SIZE;
2241 limit_pressure.shared = RA_SHARED_SIZE;
2242
2243 /* If requested, lower the limit so that spilling happens more often. */
2244 if (ir3_shader_debug & IR3_DBG_SPILLALL)
2245 calc_min_limit_pressure(v, live, &limit_pressure);
2246
2247 if (max_pressure.shared > limit_pressure.shared) {
2248 /* TODO shared reg -> normal reg spilling */
2249 d("shared max pressure exceeded!");
2250 goto fail;
2251 }
2252
2253 bool spilled = false;
2254 if (max_pressure.full > limit_pressure.full ||
2255 max_pressure.half > limit_pressure.half) {
2256 if (!v->shader->compiler->has_pvtmem) {
2257 d("max pressure exceeded!");
2258 goto fail;
2259 }
2260 d("max pressure exceeded, spilling!");
2261 IR3_PASS(v->ir, ir3_spill, v, &live, &limit_pressure);
2262 ir3_calc_pressure(v, live, &max_pressure);
2263 assert(max_pressure.full <= limit_pressure.full &&
2264 max_pressure.half <= limit_pressure.half);
2265 spilled = true;
2266 }
2267
2268 ctx->live = live;
2269 ctx->intervals =
2270 rzalloc_array(ctx, struct ra_interval, live->definitions_count);
2271 ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count);
2272
2273 ctx->full.size = calc_target_full_pressure(v, max_pressure.full);
2274 d("full size: %u", ctx->full.size);
2275
2276 if (!v->mergedregs)
2277 ctx->half.size = RA_HALF_SIZE;
2278
2279 ctx->shared.size = RA_SHARED_SIZE;
2280
2281 ctx->full.start = ctx->half.start = ctx->shared.start = 0;
2282
2283 foreach_block (block, &v->ir->block_list)
2284 handle_block(ctx, block);
2285
2286 ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count);
2287
2288 /* Strip array-ness and SSA-ness at the end, because various helpers still
2289 * need to work even on definitions that have already been assigned. For
2290 * example, we need to preserve array-ness so that array live-ins have the
2291 * right size.
2292 */
2293 foreach_block (block, &v->ir->block_list) {
2294 foreach_instr (instr, &block->instr_list) {
2295 for (unsigned i = 0; i < instr->dsts_count; i++) {
2296 instr->dsts[i]->flags &= ~IR3_REG_SSA;
2297
2298 /* Parallel copies of array registers copy the whole register, and
2299 * we need some way to let the parallel copy code know that this was
2300 * an array whose size is determined by reg->size. So keep the array
2301 * flag on those. spill/reload also need to work on the entire
2302 * array.
2303 */
2304 if (!is_meta(instr) && instr->opc != OPC_RELOAD_MACRO)
2305 instr->dsts[i]->flags &= ~IR3_REG_ARRAY;
2306 }
2307
2308 for (unsigned i = 0; i < instr->srcs_count; i++) {
2309 instr->srcs[i]->flags &= ~IR3_REG_SSA;
2310
2311 if (!is_meta(instr) && instr->opc != OPC_SPILL_MACRO)
2312 instr->srcs[i]->flags &= ~IR3_REG_ARRAY;
2313 }
2314 }
2315 }
2316
2317 ir3_debug_print(v->ir, "AFTER: register allocation");
2318
2319 if (spilled) {
2320 IR3_PASS(v->ir, ir3_lower_spill);
2321 }
2322
2323 ir3_lower_copies(v);
2324
2325 ir3_debug_print(v->ir, "AFTER: ir3_lower_copies");
2326
2327 ralloc_free(ctx);
2328
2329 return 0;
2330 fail:
2331 ralloc_free(ctx);
2332 return -1;
2333 }
2334