1 /*
2 * Copyright (C) 2021 Valve Corporation
3 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "ir3_ra.h"
26 #include "util/rb_tree.h"
27 #include "util/u_math.h"
28 #include "ir3_shader.h"
29
30 /* This file implements an SSA-based register allocator. Unlike other
31 * SSA-based allocators, it handles vector split/collect "smartly," meaning
32 * that multiple values may share the same register interval. From the
33 * perspective of the allocator itself, only the top-level intervals matter,
34 * and the allocator is only concerned with allocating top-level intervals,
35 * which may mean moving other top-level intervals around. Other intervals,
36 * like the destination of a split instruction or the source of a collect
37 * instruction, are "locked" to their parent interval. The details of this are
38 * mostly handled by ir3_merge_regs and ir3_reg_ctx.
39 *
40 * We currently don't do any backtracking, but we do use the merge sets as a
41 * form of affinity to try to avoid moves from phis/splits/collects. Each
42 * merge set is what a more "classic" graph-coloring or live-range based
43 * allocator would consider a single register, but here we use it as merely a
44 * hint, except when multiple overlapping values are live at the same time.
45 * Each merge set has a "preferred" register, and we try to honor that when
46 * allocating values in the merge set.
47 */
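/* A small illustration of the interval model described above (the SSA names
 * and register numbers here are made up, not taken from a real shader):
 *
 *    ssa_1 = ...
 *    ssa_2 = ...
 *    ssa_3 = collect ssa_1, ssa_2
 *
 * ssa_1 and ssa_2 are locked to offsets 0 and 1 of ssa_3's interval, so the
 * allocator only ever places the top-level interval for ssa_3. If the merge
 * set's preferred register is r2.x, ssa_1 lands in r2.x and ssa_2 in r2.y
 * with no extra moves; if that spot is unavailable, the preference is simply
 * ignored and the set is allocated elsewhere.
 */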
48
49 /* ir3_reg_ctx implementation. */
50
51 static int
52 ir3_reg_interval_cmp(const struct rb_node *node, const void *data)
53 {
54 unsigned reg = *(const unsigned *)data;
55 const struct ir3_reg_interval *interval =
56 ir3_rb_node_to_interval_const(node);
57 if (interval->reg->interval_start > reg)
58 return -1;
59 else if (interval->reg->interval_end <= reg)
60 return 1;
61 else
62 return 0;
63 }
64
65 static struct ir3_reg_interval *
66 ir3_reg_interval_search(struct rb_tree *tree, unsigned offset)
67 {
68 struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp);
69 return node ? ir3_rb_node_to_interval(node) : NULL;
70 }
71
72 static struct ir3_reg_interval *
73 ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset)
74 {
75 struct rb_node *node =
76 rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp);
77 return node ? ir3_rb_node_to_interval(node) : NULL;
78 }
79
80 /* Get the interval covering the reg, or the closest to the right if it
81 * doesn't exist.
82 */
83 static struct ir3_reg_interval *
84 ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset)
85 {
86 struct ir3_reg_interval *interval =
87 ir3_reg_interval_search_sloppy(tree, offset);
88 if (!interval) {
89 return NULL;
90 } else if (interval->reg->interval_end > offset) {
91 return interval;
92 } else {
93 /* There is no interval covering reg, and ir3_reg_interval_search_sloppy()
94 * returned the closest range to the left, so the next interval to the
95 * right should be the closest to the right.
96 */
97 return ir3_reg_interval_next_or_null(interval);
98 }
99 }
100
101 static int
102 ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
103 {
104 const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a);
105 const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b);
106 return b->reg->interval_start - a->reg->interval_start;
107 }
108
109 static void
110 interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree,
111 struct ir3_reg_interval *interval)
112 {
113 struct ir3_reg_interval *right =
114 ir3_reg_interval_search_right(tree, interval->reg->interval_start);
115 if (right && right->reg->interval_start < interval->reg->interval_end) {
116 /* We disallow trees where different members have different half-ness.
117 * This means that we can't treat bitcasts as copies like normal
118 * split/collect, so something like this would require an extra copy
119 * in mergedregs mode, and count as 4 half-units of register pressure
120 * instead of 2:
121 *
122 * f16vec2 foo = unpackFloat2x16(bar)
123 * ... = foo.x
124 * ... = bar
125 *
126 * However, relaxing this rule would open a huge can of worms. What
127 * happens when there's a vector of 16 things, and the fifth element
128 * has been bitcasted as a half-reg? Would that element alone have to
129 * be small enough to be used as a half-reg source? Let's keep that
130 * can of worms firmly shut for now.
131 */
132 assert((interval->reg->flags & IR3_REG_HALF) ==
133 (right->reg->flags & IR3_REG_HALF));
134
135 if (right->reg->interval_end <= interval->reg->interval_end &&
136 right->reg->interval_start >= interval->reg->interval_start) {
137 /* Check if we're inserting something that's already inserted */
138 assert(interval != right);
139
140 /* "right" is contained in "interval" and must become a child of
141 * it. There may be further children too.
142 */
143 for (struct ir3_reg_interval *next = ir3_reg_interval_next(right);
144 right && right->reg->interval_start < interval->reg->interval_end;
145 right = next, next = ir3_reg_interval_next_or_null(next)) {
146 /* "right" must be contained in "interval." */
147 assert(right->reg->interval_end <= interval->reg->interval_end);
148 assert((interval->reg->flags & IR3_REG_HALF) ==
149 (right->reg->flags & IR3_REG_HALF));
150 if (!right->parent)
151 ctx->interval_delete(ctx, right);
152 right->parent = interval;
153 rb_tree_remove(tree, &right->node);
154 rb_tree_insert(&interval->children, &right->node,
155 ir3_reg_interval_insert_cmp);
156 }
157 } else {
158 /* "right" must contain "interval," since intervals must form a
159 * tree.
160 */
161 assert(right->reg->interval_start <= interval->reg->interval_start);
162 interval->parent = right;
163 interval_insert(ctx, &right->children, interval);
164 return;
165 }
166 }
167
168 if (!interval->parent)
169 ctx->interval_add(ctx, interval);
170 rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp);
171 interval->inserted = true;
172 }
173
174 void
175 ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,
176 struct ir3_reg_interval *interval)
177 {
178 rb_tree_init(&interval->children);
179 interval->parent = NULL;
180 interval_insert(ctx, &ctx->intervals, interval);
181 }
182
183 /* Call after ir3_reg_interval_remove_temp() to reinsert the interval */
184 static void
185 ir3_reg_interval_reinsert(struct ir3_reg_ctx *ctx,
186 struct ir3_reg_interval *interval)
187 {
188 interval->parent = NULL;
189 interval_insert(ctx, &ctx->intervals, interval);
190 }
191
192 void
193 ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
194 struct ir3_reg_interval *interval)
195 {
196 if (interval->parent) {
197 rb_tree_remove(&interval->parent->children, &interval->node);
198 } else {
199 ctx->interval_delete(ctx, interval);
200 rb_tree_remove(&ctx->intervals, &interval->node);
201 }
202
203 rb_tree_foreach_safe (struct ir3_reg_interval, child, &interval->children,
204 node) {
205 rb_tree_remove(&interval->children, &child->node);
206 child->parent = interval->parent;
207
208 if (interval->parent) {
209 rb_tree_insert(&child->parent->children, &child->node,
210 ir3_reg_interval_insert_cmp);
211 } else {
212 ctx->interval_readd(ctx, interval, child);
213 rb_tree_insert(&ctx->intervals, &child->node,
214 ir3_reg_interval_insert_cmp);
215 }
216 }
217
218 interval->inserted = false;
219 }
220
221 static void
222 _mark_free(struct ir3_reg_interval *interval)
223 {
224 interval->inserted = false;
225 rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
226 _mark_free(child);
227 }
228 }
229
230 /* Remove an interval and all its children from the tree. */
231 void
232 ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,
233 struct ir3_reg_interval *interval)
234 {
235 assert(!interval->parent);
236
237 ctx->interval_delete(ctx, interval);
238 rb_tree_remove(&ctx->intervals, &interval->node);
239 _mark_free(interval);
240 }
241
242 /* Used when popping an interval to be shuffled around. Don't disturb children
243 * so that it can be later reinserted.
244 */
245 static void
246 ir3_reg_interval_remove_temp(struct ir3_reg_ctx *ctx,
247 struct ir3_reg_interval *interval)
248 {
249 assert(!interval->parent);
250
251 ctx->interval_delete(ctx, interval);
252 rb_tree_remove(&ctx->intervals, &interval->node);
253 }
254
255 static void
256 interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval,
257 unsigned indent)
258 {
259 for (unsigned i = 0; i < indent; i++)
260 mesa_log_stream_printf(stream, "\t");
261 mesa_log_stream_printf(stream, "reg %u start %u\n", interval->reg->name,
262 interval->reg->interval_start);
263
264 rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
265 interval_dump(stream, child, indent + 1);
266 }
267
268 for (unsigned i = 0; i < indent; i++)
269 mesa_log_stream_printf(stream, "\t");
270 mesa_log_stream_printf(stream, "reg %u end %u\n", interval->reg->name,
271 interval->reg->interval_end);
272 }
273
274 void
275 ir3_reg_interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval)
276 {
277 interval_dump(stream, interval, 0);
278 }
279
280 /* These are the core datastructures used by the register allocator. First
281 * ra_interval and ra_file, which are used for intra-block tracking and use
282 * the ir3_reg_ctx infrastructure:
283 */
284
285 struct ra_interval {
286 struct ir3_reg_interval interval;
287
288 struct rb_node physreg_node;
289 physreg_t physreg_start, physreg_end;
290
291 /* True if this is a source of the current instruction which is entirely
292 * killed. This means we can allocate the dest over it, but we can't break
293 * it up.
294 */
295 bool is_killed;
296
297 /* True if this interval cannot be moved from its position. This is only
298 * used for precolored inputs to ensure that other inputs don't get
299 * allocated on top of them.
300 */
301 bool frozen;
302 };
303
304 struct ra_file {
305 struct ir3_reg_ctx reg_ctx;
306
307 BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
308 BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
309
310 struct rb_tree physreg_intervals;
311
312 unsigned size;
313 unsigned start;
314 };
315
316 /* State for inter-block tracking. When we split a live range to make space
317 * for a vector, we may need to insert fixup code when a block has multiple
318 * predecessors that have moved the same live value to different registers.
319 * This keeps track of state required to do that.
320 */
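/* Sketch of the case this handles (block names and registers are
 * hypothetical): if value v ends block A in r0.x, but live-range shuffling in
 * block B moved it to r1.y, and both A and B branch to C, then C's
 * predecessors disagree about where v lives. "renames" records each block's
 * end-of-block physreg for v, and the inter-block fixup inserts the moves (as
 * parallel copies) needed so that C sees v in one consistent register.
 */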
321
322 struct ra_block_state {
323 /* Map of defining ir3_register -> physreg it was allocated to at the end
324 * of the block.
325 */
326 struct hash_table *renames;
327
328 /* For loops, we need to process a block before all its predecessors have
329 * been processed. In particular, we need to pick registers for values
330 * without knowing if all the predecessors have been renamed. This keeps
331 * track of the registers we chose so that when we visit the back-edge we
332 * can move them appropriately. If all predecessors have been visited
333 * before this block is visited then we don't need to fill this out. This
334 * is a map from ir3_register -> physreg.
335 */
336 struct hash_table *entry_regs;
337
338 /* True if the block has been visited and "renames" is complete.
339 */
340 bool visited;
341 };
342
343 struct ra_parallel_copy {
344 struct ra_interval *interval;
345 physreg_t src;
346 };
347
348 /* The main context: */
349
350 struct ra_ctx {
351 /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom
352 * half of this file too.
353 */
354 struct ra_file full;
355
356 /* hr0.x - hr63.w, only used without merged-regs. */
357 struct ra_file half;
358
359 /* Shared regs. */
360 struct ra_file shared;
361
362 struct ir3_liveness *live;
363
364 struct ir3_block *block;
365
366 const struct ir3_compiler *compiler;
367 gl_shader_stage stage;
368
369 /* Pending moves of top-level intervals that will be emitted once we're
370 * finished:
371 */
372 DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies);
373
374 struct ra_interval *intervals;
375 struct ra_block_state *blocks;
376
377 bool merged_regs;
378 };
379
380 #define foreach_interval(interval, file) \
381 rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals, \
382 physreg_node)
383 #define foreach_interval_rev(interval, file) \
384 rb_tree_foreach_rev (struct ra_interval, interval, &(file)->physreg_intervals, \
385 physreg_node)
386 #define foreach_interval_safe(interval, file) \
387 rb_tree_foreach_safe (struct ra_interval, interval, \
388 &(file)->physreg_intervals, physreg_node)
389 #define foreach_interval_rev_safe(interval, file) \
390 rb_tree_foreach_rev_safe(struct ra_interval, interval, \
391 &(file)->physreg_intervals, physreg_node)
392
393 static struct ra_interval *
394 rb_node_to_interval(struct rb_node *node)
395 {
396 return rb_node_data(struct ra_interval, node, physreg_node);
397 }
398
399 static const struct ra_interval *
400 rb_node_to_interval_const(const struct rb_node *node)
401 {
402 return rb_node_data(struct ra_interval, node, physreg_node);
403 }
404
405 static struct ra_interval *
406 ra_interval_next(struct ra_interval *interval)
407 {
408 struct rb_node *next = rb_node_next(&interval->physreg_node);
409 return next ? rb_node_to_interval(next) : NULL;
410 }
411
412 static struct ra_interval *
413 ra_interval_next_or_null(struct ra_interval *interval)
414 {
415 return interval ? ra_interval_next(interval) : NULL;
416 }
417
418 static int
419 ra_interval_cmp(const struct rb_node *node, const void *data)
420 {
421 physreg_t reg = *(const physreg_t *)data;
422 const struct ra_interval *interval = rb_node_to_interval_const(node);
423 if (interval->physreg_start > reg)
424 return -1;
425 else if (interval->physreg_end <= reg)
426 return 1;
427 else
428 return 0;
429 }
430
431 static struct ra_interval *
432 ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
433 {
434 struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
435 return node ? rb_node_to_interval(node) : NULL;
436 }
437
438 /* Get the interval covering the reg, or the closest to the right if it
439 * doesn't exist.
440 */
441 static struct ra_interval *
442 ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
443 {
444 struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
445 if (!interval) {
446 return NULL;
447 } else if (interval->physreg_end > reg) {
448 return interval;
449 } else {
450 /* There is no interval covering reg, and ra_interval_search_sloppy()
451 * returned the closest range to the left, so the next interval to the
452 * right should be the closest to the right.
453 */
454 return ra_interval_next_or_null(interval);
455 }
456 }
457
458 static struct ra_interval *
459 ra_file_search_right(struct ra_file *file, physreg_t reg)
460 {
461 return ra_interval_search_right(&file->physreg_intervals, reg);
462 }
463
464 static int
465 ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
466 {
467 const struct ra_interval *a = rb_node_to_interval_const(_a);
468 const struct ra_interval *b = rb_node_to_interval_const(_b);
469 return b->physreg_start - a->physreg_start;
470 }
471
472 static struct ra_interval *
473 ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
474 {
475 return rb_node_data(struct ra_interval, interval, interval);
476 }
477
478 static struct ra_file *
479 ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx)
480 {
481 return rb_node_data(struct ra_file, ctx, reg_ctx);
482 }
483
484 static void
485 interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
486 {
487 struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
488 struct ra_file *file = ir3_reg_ctx_to_file(ctx);
489
490 /* We can assume in this case that physreg_start/physreg_end is already
491 * initialized.
492 */
493 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
494 BITSET_CLEAR(file->available, i);
495 BITSET_CLEAR(file->available_to_evict, i);
496 }
497
498 rb_tree_insert(&file->physreg_intervals, &interval->physreg_node,
499 ra_interval_insert_cmp);
500 }
501
502 static void
503 interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
504 {
505 struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
506 struct ra_file *file = ir3_reg_ctx_to_file(ctx);
507
508 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
509 BITSET_SET(file->available, i);
510 BITSET_SET(file->available_to_evict, i);
511 }
512
513 rb_tree_remove(&file->physreg_intervals, &interval->physreg_node);
514 }
515
516 static void
517 interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
518 struct ir3_reg_interval *_child)
519 {
520 struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
521 struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);
522
523 child->physreg_start =
524 parent->physreg_start + (child->interval.reg->interval_start -
525 parent->interval.reg->interval_start);
526 child->physreg_end =
527 child->physreg_start +
528 (child->interval.reg->interval_end - child->interval.reg->interval_start);
529
530 interval_add(ctx, _child);
531 }
532
533 static void
534 ra_file_init(struct ra_file *file)
535 {
536 for (unsigned i = 0; i < file->size; i++) {
537 BITSET_SET(file->available, i);
538 BITSET_SET(file->available_to_evict, i);
539 }
540
541 rb_tree_init(&file->reg_ctx.intervals);
542 rb_tree_init(&file->physreg_intervals);
543
544 file->reg_ctx.interval_add = interval_add;
545 file->reg_ctx.interval_delete = interval_delete;
546 file->reg_ctx.interval_readd = interval_readd;
547 }
548
549 static void
550 ra_file_insert(struct ra_file *file, struct ra_interval *interval)
551 {
552 assert(interval->physreg_start < interval->physreg_end);
553 assert(interval->physreg_end <= file->size);
554 if (interval->interval.reg->flags & IR3_REG_HALF)
555 assert(interval->physreg_end <= RA_HALF_SIZE);
556
557 ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
558 }
559
560 static void
561 ra_file_remove(struct ra_file *file, struct ra_interval *interval)
562 {
563 ir3_reg_interval_remove(&file->reg_ctx, &interval->interval);
564 }
565
566 static void
567 ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval)
568 {
569 assert(!interval->interval.parent);
570
571 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
572 BITSET_SET(file->available, i);
573 }
574
575 interval->is_killed = true;
576 }
577
578 static void
579 ra_file_unmark_killed(struct ra_file *file, struct ra_interval *interval)
580 {
581 assert(!interval->interval.parent);
582
583 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
584 BITSET_CLEAR(file->available, i);
585 }
586
587 interval->is_killed = false;
588 }
589
590 static physreg_t
591 ra_interval_get_physreg(const struct ra_interval *interval)
592 {
593 unsigned child_start = interval->interval.reg->interval_start;
594
595 while (interval->interval.parent) {
596 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
597 }
598
599 return interval->physreg_start +
600 (child_start - interval->interval.reg->interval_start);
601 }
602
603 static unsigned
604 ra_interval_get_num(const struct ra_interval *interval)
605 {
606 return ra_physreg_to_num(ra_interval_get_physreg(interval),
607 interval->interval.reg->flags);
608 }
609
610 static void
611 ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
612 {
613 ir3_reg_interval_init(&interval->interval, reg);
614 interval->is_killed = false;
615 interval->frozen = false;
616 }
617
618 static void
619 ra_interval_dump(struct log_stream *stream, struct ra_interval *interval)
620 {
621 mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start);
622
623 ir3_reg_interval_dump(stream, &interval->interval);
624 }
625
626 static void
627 ra_file_dump(struct log_stream *stream, struct ra_file *file)
628 {
629 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
630 physreg_node) {
631 ra_interval_dump(stream, interval);
632 }
633
634 unsigned start, end;
635 mesa_log_stream_printf(stream, "available:\n");
636 BITSET_FOREACH_RANGE (start, end, file->available, file->size) {
637 mesa_log_stream_printf(stream, "%u-%u ", start, end);
638 }
639 mesa_log_stream_printf(stream, "\n");
640
641 mesa_log_stream_printf(stream, "available to evict:\n");
642 BITSET_FOREACH_RANGE (start, end, file->available_to_evict, file->size) {
643 mesa_log_stream_printf(stream, "%u-%u ", start, end);
644 }
645 mesa_log_stream_printf(stream, "\n");
646 mesa_log_stream_printf(stream, "start: %u\n", file->start);
647 }
648
649 static void
650 ra_ctx_dump(struct ra_ctx *ctx)
651 {
652 struct log_stream *stream = mesa_log_streami();
653 mesa_log_stream_printf(stream, "full:\n");
654 ra_file_dump(stream, &ctx->full);
655 mesa_log_stream_printf(stream, "half:\n");
656 ra_file_dump(stream, &ctx->half);
657 mesa_log_stream_printf(stream, "shared:\n");
658 ra_file_dump(stream, &ctx->shared);
659 mesa_log_stream_destroy(stream);
660 }
661
662 static unsigned
663 reg_file_size(struct ra_file *file, struct ir3_register *reg)
664 {
665 /* Half-regs can only take up the first half of the combined regfile */
666 if (reg->flags & IR3_REG_HALF)
667 return MIN2(file->size, RA_HALF_SIZE);
668 else
669 return file->size;
670 }
671
672 /* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple
673 * top-level intervals at once. Pop multiple intervals, then push them back in
674 * any order.
675 */
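/* For example, swapping two same-sized top-level intervals a and b looks like
 * this (the same pattern try_evict_regs() uses further down; "file", "a" and
 * "b" stand for any ra_file and two unrelated top-level ra_intervals):
 *
 *    physreg_t a_start = a->physreg_start, b_start = b->physreg_start;
 *    struct ra_removed_interval removed_a = ra_pop_interval(ctx, file, a);
 *    struct ra_removed_interval removed_b = ra_pop_interval(ctx, file, b);
 *    ra_push_interval(ctx, file, &removed_a, b_start);
 *    ra_push_interval(ctx, file, &removed_b, a_start);
 *
 * ra_pop_interval() also records the original source register in
 * ctx->parallel_copies, so the actual data movement is emitted later as a
 * single parallel copy.
 */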
676
677 struct ra_removed_interval {
678 struct ra_interval *interval;
679 unsigned size;
680 };
681
682 static struct ra_removed_interval
683 ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,
684 struct ra_interval *interval)
685 {
686 assert(!interval->interval.parent);
687
688 /* Check if we've already moved this reg before */
689 unsigned pcopy_index;
690 for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count;
691 pcopy_index++) {
692 if (ctx->parallel_copies[pcopy_index].interval == interval)
693 break;
694 }
695
696 if (pcopy_index == ctx->parallel_copies_count) {
697 array_insert(ctx, ctx->parallel_copies,
698 (struct ra_parallel_copy){
699 .interval = interval,
700 .src = interval->physreg_start,
701 });
702 }
703
704 ir3_reg_interval_remove_temp(&file->reg_ctx, &interval->interval);
705
706 return (struct ra_removed_interval){
707 .interval = interval,
708 .size = interval->physreg_end - interval->physreg_start,
709 };
710 }
711
712 static void
713 ra_push_interval(struct ra_ctx *ctx, struct ra_file *file,
714 const struct ra_removed_interval *removed, physreg_t dst)
715 {
716 struct ra_interval *interval = removed->interval;
717
718 interval->physreg_start = dst;
719 interval->physreg_end = dst + removed->size;
720
721 assert(interval->physreg_end <= file->size);
722 if (interval->interval.reg->flags & IR3_REG_HALF)
723 assert(interval->physreg_end <= RA_HALF_SIZE);
724
725 ir3_reg_interval_reinsert(&file->reg_ctx, &interval->interval);
726 }
727
728 /* Pick up the interval and place it at "dst". */
729 static void
730 ra_move_interval(struct ra_ctx *ctx, struct ra_file *file,
731 struct ra_interval *interval, physreg_t dst)
732 {
733 struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval);
734 ra_push_interval(ctx, file, &temp, dst);
735 }
736
737 static struct ra_file *
738 ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
739 {
740 if (reg->flags & IR3_REG_SHARED)
741 return &ctx->shared;
742 else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
743 return &ctx->full;
744 else
745 return &ctx->half;
746 }
747
748
749 /* Returns true if the proposed spot for "dst" or a killed source overlaps a
750 * destination that's been allocated.
751 */
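/* For example, with an instruction that writes two destinations, if dst0 has
 * already been placed at physregs [4, 6) then proposing [5, 7) for dst1 makes
 * this return true and the caller has to look for a different spot. (The
 * numbers are for illustration only.)
 */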
752 static bool
753 check_dst_overlap(struct ra_ctx *ctx, struct ra_file *file,
754 struct ir3_register *dst, physreg_t start,
755 physreg_t end)
756 {
757 struct ir3_instruction *instr = dst->instr;
758
759 ra_foreach_dst (other_dst, instr) {
760 /* We assume only destinations before the current one have been allocated.
761 */
762 if (other_dst == dst)
763 break;
764
765 if (ra_get_file(ctx, other_dst) != file)
766 continue;
767
768 struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
769 assert(!other_interval->interval.parent);
770 physreg_t other_start = other_interval->physreg_start;
771 physreg_t other_end = other_interval->physreg_end;
772
773 if (other_end > start && end > other_start)
774 return true;
775 }
776
777 return false;
778 }
779
780 /* True if the destination is "early-clobber," meaning that it cannot be
781 * allocated over killed sources. Some destinations always require it, but it
782 * also is implicitly true for tied destinations whose source is live-through.
783 * If the source is killed, then we skip allocating a register for the
784 * destination altogether so we don't need to worry about that case here.
785 */
786 static bool
787 is_early_clobber(struct ir3_register *reg)
788 {
789 return (reg->flags & IR3_REG_EARLY_CLOBBER) || reg->tied;
790 }
791
792 static bool
793 get_reg_specified(struct ra_ctx *ctx, struct ra_file *file,
794 struct ir3_register *reg, physreg_t physreg, bool is_source)
795 {
796 for (unsigned i = 0; i < reg_size(reg); i++) {
797 if (!BITSET_TEST(is_early_clobber(reg) || is_source ?
798 file->available_to_evict : file->available,
799 physreg + i))
800 return false;
801 }
802
803 if (!is_source &&
804 check_dst_overlap(ctx, file, reg, physreg, physreg + reg_size(reg)))
805 return false;
806
807 return true;
808 }
809
810 /* Try to evict any registers conflicting with the proposed spot "physreg" for
811 * "reg". That is, move them to other places so that we can allocate "physreg"
812 * here.
813 */
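/* The usual calling pattern (see get_reg() below) is to first call this with
 * speculative=true on each candidate physreg to get an eviction cost without
 * touching any state, then call it again with speculative=false on the
 * cheapest candidate to actually perform the moves/swaps:
 *
 *    unsigned cost;
 *    if (try_evict_regs(ctx, file, reg, candidate, &cost, false, true))
 *       ... remember candidate if cost beats the best so far ...
 *    ...
 *    ASSERTED bool ok =
 *       try_evict_regs(ctx, file, reg, best, &cost, false, false);
 *    assert(ok);
 */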
814
815 static bool
816 try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
817 struct ir3_register *reg, physreg_t physreg,
818 unsigned *_eviction_count, bool is_source, bool speculative)
819 {
820 BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
821 memcpy(available_to_evict, file->available_to_evict,
822 sizeof(available_to_evict));
823
824 BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
825 memcpy(available, file->available, sizeof(available));
826
827 for (unsigned i = 0; i < reg_size(reg); i++) {
828 BITSET_CLEAR(available_to_evict, physreg + i);
829 BITSET_CLEAR(available, physreg + i);
830 }
831
832 unsigned eviction_count = 0;
833 /* Iterate over each range conflicting with physreg */
834 for (struct ra_interval *conflicting = ra_file_search_right(file, physreg),
835 *next = ra_interval_next_or_null(conflicting);
836 conflicting != NULL &&
837 conflicting->physreg_start < physreg + reg_size(reg);
838 conflicting = next, next = ra_interval_next_or_null(next)) {
839 if (!is_early_clobber(reg) && !is_source && conflicting->is_killed)
840 continue;
841
842 if (conflicting->frozen) {
843 assert(speculative);
844 return false;
845 }
846
847 unsigned conflicting_file_size =
848 reg_file_size(file, conflicting->interval.reg);
849 unsigned avail_start, avail_end;
850 bool evicted = false;
851 BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict,
852 conflicting_file_size) {
853 unsigned size = avail_end - avail_start;
854
855 /* non-half registers must be aligned */
856 if (!(conflicting->interval.reg->flags & IR3_REG_HALF) &&
857 avail_start % 2 == 1) {
858 avail_start++;
859 size--;
860 }
861
862 unsigned conflicting_size =
863 conflicting->physreg_end - conflicting->physreg_start;
864 if (size >= conflicting_size &&
865 !check_dst_overlap(ctx, file, reg, avail_start, avail_start +
866 conflicting_size)) {
867 for (unsigned i = 0;
868 i < conflicting->physreg_end - conflicting->physreg_start; i++)
869 BITSET_CLEAR(available_to_evict, avail_start + i);
870 eviction_count +=
871 conflicting->physreg_end - conflicting->physreg_start;
872 if (!speculative)
873 ra_move_interval(ctx, file, conflicting, avail_start);
874 evicted = true;
875 break;
876 }
877 }
878
879 if (evicted)
880 continue;
881
882 /* If we couldn't evict this range, we may be able to swap it with a
883 * killed range to achieve the same effect.
884 */
885 foreach_interval (killed, file) {
886 if (!killed->is_killed)
887 continue;
888
889 if (killed->physreg_end - killed->physreg_start !=
890 conflicting->physreg_end - conflicting->physreg_start)
891 continue;
892
893 if (killed->physreg_end > conflicting_file_size ||
894 conflicting->physreg_end > reg_file_size(file, killed->interval.reg))
895 continue;
896
897 /* We can't swap the killed range if it partially/fully overlaps the
898 * space we're trying to allocate or (in speculative mode) if it's
899 * already been swapped and will overlap when we actually evict.
900 */
901 bool killed_available = true;
902 for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
903 if (!BITSET_TEST(available, i)) {
904 killed_available = false;
905 break;
906 }
907 }
908
909 if (!killed_available)
910 continue;
911
912 if (check_dst_overlap(ctx, file, reg, killed->physreg_start,
913 killed->physreg_end))
914 continue;
915
916 /* Check for alignment if one is a full reg */
917 if ((!(killed->interval.reg->flags & IR3_REG_HALF) ||
918 !(conflicting->interval.reg->flags & IR3_REG_HALF)) &&
919 (killed->physreg_start % 2 != 0 ||
920 conflicting->physreg_start % 2 != 0))
921 continue;
922
923 for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
924 BITSET_CLEAR(available, i);
925 }
926 /* Because this will generate swaps instead of moves, multiply the
927 * cost by 2.
928 */
929 eviction_count += (killed->physreg_end - killed->physreg_start) * 2;
930 if (!speculative) {
931 physreg_t killed_start = killed->physreg_start,
932 conflicting_start = conflicting->physreg_start;
933 struct ra_removed_interval killed_removed =
934 ra_pop_interval(ctx, file, killed);
935 struct ra_removed_interval conflicting_removed =
936 ra_pop_interval(ctx, file, conflicting);
937 ra_push_interval(ctx, file, &killed_removed, conflicting_start);
938 ra_push_interval(ctx, file, &conflicting_removed, killed_start);
939 }
940
941 evicted = true;
942 break;
943 }
944
945 if (!evicted)
946 return false;
947 }
948
949 *_eviction_count = eviction_count;
950 return true;
951 }
952
953 static int
954 removed_interval_cmp(const void *_i1, const void *_i2)
955 {
956 const struct ra_removed_interval *i1 = _i1;
957 const struct ra_removed_interval *i2 = _i2;
958
959 /* We sort the registers as follows:
960 *
961 * |------------------------------------------------------------------------------------------|
962 * | | | | | | |
963 * | Half | Half early-clobber | Half | Full | Full early-clobber | Full |
964 * | live-through | destination | killed | killed | destination | live-through |
965 * | | | | | | |
966 * |------------------------------------------------------------------------------------------|
967 * | |
968 * | Destination |
969 * | |
970 * |-----------------|
971 *
972 * Half-registers have to be first so that they stay in the low half of
973 * the register file. Then half and full killed must stay together so that
974 * there's a contiguous range where we can put the register. With this
975 * structure we should be able to accommodate any collection of intervals
976 * such that the total number of half components is within the half limit
977 * and the combined components are within the full limit.
978 */
979
980 unsigned i1_align = reg_elem_size(i1->interval->interval.reg);
981 unsigned i2_align = reg_elem_size(i2->interval->interval.reg);
982 if (i1_align > i2_align)
983 return 1;
984 if (i1_align < i2_align)
985 return -1;
986
987 if (i1_align == 1) {
988 if (i2->interval->is_killed)
989 return -1;
990 if (i1->interval->is_killed)
991 return 1;
992 } else {
993 if (i2->interval->is_killed)
994 return 1;
995 if (i1->interval->is_killed)
996 return -1;
997 }
998
999 return 0;
1000 }
1001
1002 static int
1003 dsts_cmp(const void *_i1, const void *_i2)
1004 {
1005 struct ir3_register *i1 = *(struct ir3_register *const *) _i1;
1006 struct ir3_register *i2 = *(struct ir3_register *const *) _i2;
1007
1008 /* Treat tied destinations as if they are live-through sources, and normal
1009 * destinations as killed sources.
1010 */
1011 unsigned i1_align = reg_elem_size(i1);
1012 unsigned i2_align = reg_elem_size(i2);
1013 if (i1_align > i2_align)
1014 return 1;
1015 if (i1_align < i2_align)
1016 return -1;
1017
1018 if (i1_align == 1) {
1019 if (!is_early_clobber(i2))
1020 return -1;
1021 if (!is_early_clobber(i1))
1022 return 1;
1023 } else {
1024 if (!is_early_clobber(i2))
1025 return 1;
1026 if (!is_early_clobber(i1))
1027 return -1;
1028 }
1029
1030 return 0;
1031 }
1032
1033 /* "Compress" all the live intervals so that there is enough space for the
1034 * destination register. As there can be gaps when a more-aligned interval
1035 * follows a less-aligned interval, this also sorts them to remove such
1036 * "padding", which may be required when space is very tight. This isn't
1037 * amazing, but should be used only as a last resort in case the register file
1038 * is almost full and badly fragmented.
1039 *
1040 * Return the physreg to use.
1041 */
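/* Worked example (all sizes/offsets made up): say the file holds live-through
 * intervals at [0, 2) and [3, 5) plus a killed source at [6, 8), and we need a
 * 4-slot destination. No 4-slot gap exists, so we pop intervals from the right
 * until some candidate_start leaves enough room, sort the popped intervals and
 * pending destinations per removed_interval_cmp()/dsts_cmp(), and repack them
 * tightly from candidate_start, letting killed sources share space with the
 * non-early-clobber destinations since those sources die at this instruction.
 */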
1042 static physreg_t
1043 compress_regs_left(struct ra_ctx *ctx, struct ra_file *file,
1044 struct ir3_register *reg)
1045 {
1046 unsigned align = reg_elem_size(reg);
1047 DECLARE_ARRAY(struct ra_removed_interval, intervals);
1048 intervals_count = intervals_sz = 0;
1049 intervals = NULL;
1050
1051 DECLARE_ARRAY(struct ir3_register *, dsts);
1052 dsts_count = dsts_sz = 0;
1053 dsts = NULL;
1054 array_insert(ctx, dsts, reg);
1055 bool dst_inserted[reg->instr->dsts_count];
memset(dst_inserted, 0, sizeof(dst_inserted));
1056
1057 unsigned dst_size = reg->tied ? 0 : reg_size(reg);
1058 unsigned ec_dst_size = is_early_clobber(reg) ? reg_size(reg) : 0;
1059 unsigned half_dst_size = 0, ec_half_dst_size = 0;
1060 if (align == 1) {
1061 half_dst_size = dst_size;
1062 ec_half_dst_size = ec_dst_size;
1063 }
1064
1065 unsigned removed_size = 0, removed_half_size = 0;
1066 unsigned removed_killed_size = 0, removed_killed_half_size = 0;
1067 unsigned file_size =
1068 align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;
1069 physreg_t start_reg = 0;
1070
1071 foreach_interval_rev_safe (interval, file) {
1072 /* We'll check if we can compact the intervals starting here. */
1073 physreg_t candidate_start = interval->physreg_end;
1074
1075 /* Check if there are any other destinations we need to compact. */
1076 ra_foreach_dst_n (other_dst, n, reg->instr) {
1077 if (other_dst == reg)
1078 break;
1079 if (ra_get_file(ctx, other_dst) != file)
1080 continue;
1081 if (dst_inserted[n])
1082 continue;
1083
1084 struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
1085 /* if the destination partially overlaps this interval, we need to
1086 * extend candidate_start to the end.
1087 */
1088 if (other_interval->physreg_start < candidate_start) {
1089 candidate_start = MAX2(candidate_start,
1090 other_interval->physreg_end);
1091 continue;
1092 }
1093
1094 dst_inserted[n] = true;
1095
1096 /* dst intervals with a tied killed source are considered attached to
1097 * that source. Don't actually insert them. This means we have to
1098 * update them below if their tied source moves.
1099 */
1100 if (other_dst->tied) {
1101 struct ra_interval *tied_interval =
1102 &ctx->intervals[other_dst->tied->def->name];
1103 if (tied_interval->is_killed)
1104 continue;
1105 }
1106
1107 d("popping destination %u physreg %u\n",
1108 other_interval->interval.reg->name,
1109 other_interval->physreg_start);
1110
1111 array_insert(ctx, dsts, other_dst);
1112 unsigned interval_size = reg_size(other_dst);
1113 if (is_early_clobber(other_dst)) {
1114 ec_dst_size += interval_size;
1115 if (other_interval->interval.reg->flags & IR3_REG_HALF)
1116 ec_half_dst_size += interval_size;
1117 } else {
1118 dst_size += interval_size;
1119 if (other_interval->interval.reg->flags & IR3_REG_HALF)
1120 half_dst_size += interval_size;
1121 }
1122 }
1123
1124 /* Check if we can sort the intervals *after* this one and have enough
1125 * space leftover to accommodate all intervals, keeping in mind that killed
1126 * sources overlap non-tied destinations. Also check that we have enough
1127 * space leftover for half-registers, if we're inserting a half-register
1128 * (otherwise we only shift any half-registers down so they should be
1129 * safe).
1130 */
1131 if (candidate_start + removed_size + ec_dst_size +
1132 MAX2(removed_killed_size, dst_size) <= file->size &&
1133 (align != 1 ||
1134 candidate_start + removed_half_size + ec_half_dst_size +
1135 MAX2(removed_killed_half_size, half_dst_size) <= file_size)) {
1136 start_reg = candidate_start;
1137 break;
1138 }
1139
1140 /* We assume that all frozen intervals are at the start and that we
1141 * can avoid popping them.
1142 */
1143 assert(!interval->frozen);
1144
1145 /* Killed sources are different because they go at the end and can
1146 * overlap the register we're trying to add.
1147 */
1148 unsigned interval_size = interval->physreg_end - interval->physreg_start;
1149 if (interval->is_killed) {
1150 removed_killed_size += interval_size;
1151 if (interval->interval.reg->flags & IR3_REG_HALF)
1152 removed_killed_half_size += interval_size;
1153 } else {
1154 removed_size += interval_size;
1155 if (interval->interval.reg->flags & IR3_REG_HALF)
1156 removed_half_size += interval_size;
1157 }
1158
1159 /* Now that we've done the accounting, pop this off */
1160 d("popping interval %u physreg %u%s\n", interval->interval.reg->name,
1161 interval->physreg_start, interval->is_killed ? ", killed" : "");
1162 array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));
1163 }
1164
1165 /* TODO: In addition to skipping registers at the beginning that are
1166 * well-packed, we should try to skip registers at the end.
1167 */
1168
1169 qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);
1170 qsort(dsts, dsts_count, sizeof(*dsts), dsts_cmp);
1171
1172 physreg_t live_reg = start_reg;
1173 physreg_t dst_reg = (physreg_t)~0;
1174 physreg_t ret_reg = (physreg_t)~0;
1175 unsigned dst_index = 0;
1176 unsigned live_index = 0;
1177
1178 /* We have two lists of intervals to process, live intervals and destination
1179 * intervals. Process them in the order of the diagram in removed_interval_cmp().
1180 */
1181 while (live_index < intervals_count || dst_index < dsts_count) {
1182 bool process_dst;
1183 if (live_index == intervals_count) {
1184 process_dst = true;
1185 } else if (dst_index == dsts_count) {
1186 process_dst = false;
1187 } else {
1188 struct ir3_register *dst = dsts[dst_index];
1189 struct ra_interval *live_interval = intervals[live_index].interval;
1190
1191 bool live_half = live_interval->interval.reg->flags & IR3_REG_HALF;
1192 bool live_killed = live_interval->is_killed;
1193 bool dst_half = dst->flags & IR3_REG_HALF;
1194 bool dst_early_clobber = is_early_clobber(dst);
1195
1196 if (live_half && !live_killed) {
1197 /* far-left of diagram. */
1198 process_dst = false;
1199 } else if (dst_half && dst_early_clobber) {
1200 /* mid-left of diagram. */
1201 process_dst = true;
1202 } else if (!dst_early_clobber) {
1203 /* bottom of diagram. */
1204 process_dst = true;
1205 } else if (live_killed) {
1206 /* middle of diagram. */
1207 process_dst = false;
1208 } else if (!dst_half && dst_early_clobber) {
1209 /* mid-right of diagram. */
1210 process_dst = true;
1211 } else {
1212 /* far right of diagram. */
1213 assert(!live_killed && !live_half);
1214 process_dst = false;
1215 }
1216 }
1217
1218 struct ir3_register *cur_reg =
1219 process_dst ? dsts[dst_index] :
1220 intervals[live_index].interval->interval.reg;
1221
1222 physreg_t physreg;
1223 if (process_dst && !is_early_clobber(cur_reg)) {
1224 if (dst_reg == (physreg_t)~0)
1225 dst_reg = live_reg;
1226 physreg = dst_reg;
1227 } else {
1228 physreg = live_reg;
1229 struct ra_interval *live_interval = intervals[live_index].interval;
1230 bool live_killed = live_interval->is_killed;
1231 /* If this is live-through and we've processed the destinations, we
1232 * need to make sure we take into account any overlapping destinations.
1233 */
1234 if (!live_killed && dst_reg != (physreg_t)~0)
1235 physreg = MAX2(physreg, dst_reg);
1236 }
1237
1238 if (!(cur_reg->flags & IR3_REG_HALF))
1239 physreg = ALIGN(physreg, 2);
1240
1241 d("pushing reg %u physreg %u\n", cur_reg->name, physreg);
1242
1243 unsigned interval_size = reg_size(cur_reg);
1244 if (physreg + interval_size >
1245 reg_file_size(file, cur_reg)) {
1246 d("ran out of room for interval %u!\n",
1247 cur_reg->name);
1248 unreachable("reg pressure calculation was wrong!");
1249 return 0;
1250 }
1251
1252 if (process_dst) {
1253 if (cur_reg == reg) {
1254 ret_reg = physreg;
1255 } else {
1256 struct ra_interval *interval = &ctx->intervals[cur_reg->name];
1257 interval->physreg_start = physreg;
1258 interval->physreg_end = physreg + interval_size;
1259 }
1260 dst_index++;
1261 } else {
1262 ra_push_interval(ctx, file, &intervals[live_index], physreg);
1263 live_index++;
1264 }
1265
1266 physreg += interval_size;
1267
1268 if (process_dst && !is_early_clobber(cur_reg)) {
1269 dst_reg = physreg;
1270 } else {
1271 live_reg = physreg;
1272 }
1273 }
1274
1275 /* If we shuffled around a tied source that is killed, we may have to update
1276 * its corresponding destination since we didn't insert it above.
1277 */
1278 ra_foreach_dst (dst, reg->instr) {
1279 if (dst == reg)
1280 break;
1281
1282 struct ir3_register *tied = dst->tied;
1283 if (!tied)
1284 continue;
1285
1286 struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1287 if (!tied_interval->is_killed)
1288 continue;
1289
1290 struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1291 unsigned dst_size = reg_size(dst);
1292 dst_interval->physreg_start = ra_interval_get_physreg(tied_interval);
1293 dst_interval->physreg_end = dst_interval->physreg_start + dst_size;
1294 }
1295
1296 return ret_reg;
1297 }
1298
1299 static void
1300 update_affinity(struct ra_file *file, struct ir3_register *reg,
1301 physreg_t physreg)
1302 {
1303 if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0)
1304 return;
1305
1306 if (physreg < reg->merge_set_offset)
1307 return;
1308
1309 if ((physreg - reg->merge_set_offset + reg->merge_set->size) > file->size)
1310 return;
1311
1312 reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;
1313 }
1314
1315 /* Try to find free space for a register without shuffling anything. This uses
1316 * a round-robin algorithm to reduce false dependencies.
1317 */
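/* For example, with an 8-slot file, size=2, align=2 and file->start == 4, the
 * candidates are tried in the order 4, 6, 0, 2, and on success file->start is
 * bumped past the chosen gap so the next allocation starts somewhere else.
 * (The numbers are illustrative only.)
 */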
1318 static physreg_t
1319 find_best_gap(struct ra_ctx *ctx, struct ra_file *file,
1320 struct ir3_register *dst, unsigned file_size, unsigned size,
1321 unsigned align)
1322 {
1323 /* This can happen if we create a very large merge set. Just bail out in that
1324 * case.
1325 */
1326 if (size > file_size)
1327 return (physreg_t) ~0;
1328
1329 BITSET_WORD *available =
1330 is_early_clobber(dst) ? file->available_to_evict : file->available;
1331
1332 unsigned start = ALIGN(file->start, align) % (file_size - size + align);
1333 unsigned candidate = start;
1334 do {
1335 bool is_available = true;
1336 for (unsigned i = 0; i < size; i++) {
1337 if (!BITSET_TEST(available, candidate + i)) {
1338 is_available = false;
1339 break;
1340 }
1341 }
1342
1343 if (is_available) {
1344 is_available =
1345 !check_dst_overlap(ctx, file, dst, candidate, candidate + size);
1346 }
1347
1348 if (is_available) {
1349 file->start = (candidate + size) % file_size;
1350 return candidate;
1351 }
1352
1353 candidate += align;
1354 if (candidate + size > file_size)
1355 candidate = 0;
1356 } while (candidate != start);
1357
1358 return (physreg_t)~0;
1359 }
1360
1361 /* This is the main entrypoint for picking a register. Pick a free register
1362 * for "reg", shuffling around sources if necessary. In the normal case where
1363 * "is_source" is false, this register can overlap with killed sources
1364 * (intervals with "is_killed == true"). If "is_source" is true, then
1365 * is_killed is ignored and the register returned must not overlap with killed
1366 * sources. This must be used for tied registers, because we're actually
1367 * allocating the destination and the tied source at the same time.
1368 */
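/* In rough order of preference, the strategies tried below are: the merge
 * set's preferred register, a fresh gap large enough for the whole merge set,
 * reusing a source register for ALU/SFU instructions, any free gap, evicting
 * conflicting intervals (cheapest candidate found speculatively first), and
 * finally compress_regs_left() as the last resort.
 */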
1369
1370 static physreg_t
1371 get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg)
1372 {
1373 unsigned file_size = reg_file_size(file, reg);
1374 if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
1375 physreg_t preferred_reg =
1376 reg->merge_set->preferred_reg + reg->merge_set_offset;
1377 if (preferred_reg + reg_size(reg) <= file_size &&
1378 preferred_reg % reg_elem_size(reg) == 0 &&
1379 get_reg_specified(ctx, file, reg, preferred_reg, false))
1380 return preferred_reg;
1381 }
1382
1383 /* If this register is a subset of a merge set which we have not picked a
1384 * register for, first try to allocate enough space for the entire merge
1385 * set.
1386 */
1387 unsigned size = reg_size(reg);
1388 if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
1389 size < reg->merge_set->size) {
1390 physreg_t best_reg = find_best_gap(ctx, file, reg, file_size,
1391 reg->merge_set->size,
1392 reg->merge_set->alignment);
1393 if (best_reg != (physreg_t)~0u) {
1394 best_reg += reg->merge_set_offset;
1395 return best_reg;
1396 }
1397 }
1398
1399 /* For ALU and SFU instructions, if the src reg is available to pick, use it.
1400 * This doesn't introduce unnecessary dependencies, and it potentially
1401 * avoids needing (ss) syncs for write-after-read hazards for
1402 * SFU instructions:
1403 */
1404 if (is_sfu(reg->instr) || is_alu(reg->instr)) {
1405 for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
1406 struct ir3_register *src = reg->instr->srcs[i];
1407 if (!ra_reg_is_src(src))
1408 continue;
1409 if (ra_get_file(ctx, src) == file && reg_size(src) >= size) {
1410 struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1411 physreg_t src_physreg = ra_interval_get_physreg(src_interval);
1412 if (src_physreg % reg_elem_size(reg) == 0 &&
1413 src_physreg + size <= file_size &&
1414 get_reg_specified(ctx, file, reg, src_physreg, false))
1415 return src_physreg;
1416 }
1417 }
1418 }
1419
1420 physreg_t best_reg =
1421 find_best_gap(ctx, file, reg, file_size, size, reg_elem_size(reg));
1422 if (best_reg != (physreg_t)~0u) {
1423 return best_reg;
1424 }
1425
1426 /* Ok, we couldn't find anything that fits. Here is where we have to start
1427 * moving things around to make stuff fit. First try solely evicting
1428 * registers in the way.
1429 */
1430 unsigned best_eviction_count = ~0;
1431 for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {
1432 unsigned eviction_count;
1433 if (try_evict_regs(ctx, file, reg, i, &eviction_count, false, true)) {
1434 if (eviction_count < best_eviction_count) {
1435 best_eviction_count = eviction_count;
1436 best_reg = i;
1437 }
1438 }
1439 }
1440
1441 if (best_eviction_count != ~0) {
1442 ASSERTED bool result = try_evict_regs(
1443 ctx, file, reg, best_reg, &best_eviction_count, false, false);
1444 assert(result);
1445 return best_reg;
1446 }
1447
1448 /* Use the dumb fallback only if try_evict_regs() fails. */
1449 return compress_regs_left(ctx, file, reg);
1450 }
1451
1452 static void
1453 assign_reg(struct ir3_instruction *instr, struct ir3_register *reg,
1454 unsigned num)
1455 {
1456 if (reg->flags & IR3_REG_ARRAY) {
1457 reg->array.base = num;
1458 if (reg->flags & IR3_REG_RELATIV)
1459 reg->array.offset += num;
1460 else
1461 reg->num = num + reg->array.offset;
1462 } else {
1463 reg->num = num;
1464 }
1465 }
1466
1467 static void
1468 mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src)
1469 {
1470 struct ra_interval *interval = &ctx->intervals[src->def->name];
1471
1472 if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed ||
1473 interval->interval.parent ||
1474 !rb_tree_is_empty(&interval->interval.children))
1475 return;
1476
1477 ra_file_mark_killed(ra_get_file(ctx, src), interval);
1478 }
1479
1480 static void
1481 insert_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1482 {
1483 struct ra_file *file = ra_get_file(ctx, dst);
1484 struct ra_interval *interval = &ctx->intervals[dst->name];
1485
1486 d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval));
1487
1488 if (!(dst->flags & IR3_REG_UNUSED))
1489 ra_file_insert(file, interval);
1490
1491 assign_reg(dst->instr, dst, ra_interval_get_num(interval));
1492 }
1493
1494 static void
1495 allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst,
1496 physreg_t physreg)
1497 {
1498 struct ra_file *file = ra_get_file(ctx, dst);
1499 struct ra_interval *interval = &ctx->intervals[dst->name];
1500 update_affinity(file, dst, physreg);
1501
1502 ra_interval_init(interval, dst);
1503 interval->physreg_start = physreg;
1504 interval->physreg_end = physreg + reg_size(dst);
1505 }
1506
1507 /* If a tied destination interferes with its source register, we have to insert
1508 * a copy beforehand to copy the source to the destination. Because we are using
1509 * the parallel_copies array and not creating a separate copy, this copy will
1510 * happen in parallel with any shuffling around of the tied source, so we have
1511 * to copy the source *as it exists before it is shuffled around*. We do this by
1512 * inserting the copy early, before any other copies are inserted. We don't
1513 * actually know the destination of the copy, but that's ok because the
1514 * dst_interval will be filled out later.
1515 */
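/* For example (registers are hypothetical): if dst is tied to a source
 * currently in r0.x and that source is still live after this instruction, we
 * record a parallel-copy entry with .src = r0.x now; once allocate_dst() later
 * picks, say, r2.x for dst, the emitted parallel copy becomes "r2.x = r0.x"
 * alongside any other shuffling, leaving the live value in r0.x untouched.
 */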
1516 static void
1517 insert_tied_dst_copy(struct ra_ctx *ctx, struct ir3_register *dst)
1518 {
1519 struct ir3_register *tied = dst->tied;
1520
1521 if (!tied)
1522 return;
1523
1524 struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1525 struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1526
1527 if (tied_interval->is_killed)
1528 return;
1529
1530 physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
1531
1532 array_insert(ctx, ctx->parallel_copies,
1533 (struct ra_parallel_copy){
1534 .interval = dst_interval,
1535 .src = tied_physreg,
1536 });
1537 }
1538
1539 static void
1540 allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1541 {
1542 struct ra_file *file = ra_get_file(ctx, dst);
1543
1544 struct ir3_register *tied = dst->tied;
1545 if (tied) {
1546 struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1547 if (tied_interval->is_killed) {
1548 /* The easy case: the source is killed, so we can just reuse it
1549 * for the destination.
1550 */
1551 allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));
1552 return;
1553 }
1554 }
1555
1556 /* All the hard work is done by get_reg here. */
1557 physreg_t physreg = get_reg(ctx, file, dst);
1558
1559 allocate_dst_fixed(ctx, dst, physreg);
1560 }
1561
1562 static void
1563 assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
1564 struct ir3_register *src)
1565 {
1566 struct ra_interval *interval = &ctx->intervals[src->def->name];
1567 struct ra_file *file = ra_get_file(ctx, src);
1568
1569 struct ir3_register *tied = src->tied;
1570 physreg_t physreg;
1571 if (tied) {
1572 struct ra_interval *tied_interval = &ctx->intervals[tied->name];
1573 physreg = ra_interval_get_physreg(tied_interval);
1574 } else {
1575 physreg = ra_interval_get_physreg(interval);
1576 }
1577
1578 assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags));
1579
1580 if (src->flags & IR3_REG_FIRST_KILL)
1581 ra_file_remove(file, interval);
1582 }
1583
1584 /* Insert a parallel copy instruction before the instruction with the parallel
1585 * copy entries we've built up.
1586 */
1587 static void
1588 insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1589 {
1590 if (ctx->parallel_copies_count == 0)
1591 return;
1592
1593 struct ir3_instruction *pcopy =
1594 ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY,
1595 ctx->parallel_copies_count, ctx->parallel_copies_count);
1596
1597 for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1598 struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1599 struct ir3_register *reg =
1600 ir3_dst_create(pcopy, INVALID_REG,
1601 entry->interval->interval.reg->flags &
1602 (IR3_REG_HALF | IR3_REG_ARRAY));
1603 reg->size = entry->interval->interval.reg->size;
1604 reg->wrmask = entry->interval->interval.reg->wrmask;
1605 assign_reg(pcopy, reg, ra_interval_get_num(entry->interval));
1606 }
1607
1608 for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1609 struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1610 struct ir3_register *reg =
1611 ir3_src_create(pcopy, INVALID_REG,
1612 entry->interval->interval.reg->flags &
1613 (IR3_REG_HALF | IR3_REG_ARRAY));
1614 reg->size = entry->interval->interval.reg->size;
1615 reg->wrmask = entry->interval->interval.reg->wrmask;
1616 assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags));
1617 }
1618
1619 list_del(&pcopy->node);
1620 list_addtail(&pcopy->node, &instr->node);
1621 ctx->parallel_copies_count = 0;
1622 }
1623
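/* Generic path for instructions without special register semantics: mark
 * sources as killed, pre-insert tied-destination copies, allocate the
 * destinations (which may shuffle other intervals around via parallel
 * copies), then assign the sources and finally insert the destinations into
 * the file.
 */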
1624 static void
1625 handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1626 {
1627 /* First, mark sources as going-to-be-killed while allocating the dest. */
1628 ra_foreach_src (src, instr) {
1629 mark_src_killed(ctx, src);
1630 }
1631
1632 /* Pre-insert tied dst copies. */
1633 ra_foreach_dst (dst, instr) {
1634 insert_tied_dst_copy(ctx, dst);
1635 }
1636
1637 /* Allocate the destination. */
1638 ra_foreach_dst (dst, instr) {
1639 allocate_dst(ctx, dst);
1640 }
1641
1642 /* Now handle sources. Go backward so that in case there are multiple
1643 * sources with the same def and that def is killed we only remove it at
1644 * the end.
1645 */
1646 ra_foreach_src_rev (src, instr) {
1647 assign_src(ctx, instr, src);
1648 }
1649
1650 /* Now finally insert the destination into the map. */
1651 ra_foreach_dst (dst, instr) {
1652 insert_dst(ctx, dst);
1653 }
1654
1655 insert_parallel_copy_instr(ctx, instr);
1656 }
1657
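/* A split that stays within its source's merge set doesn't need its own
 * register: the destination is the source's physreg adjusted by the
 * difference of the two merge-set offsets. Splits into a different (or no)
 * merge set fall back to the normal path.
 */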
1658 static void
1659 handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
1660 {
1661 struct ir3_register *dst = instr->dsts[0];
1662 struct ir3_register *src = instr->srcs[0];
1663
1664 if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
1665 handle_normal_instr(ctx, instr);
1666 return;
1667 }
1668
1669 struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1670
1671 physreg_t physreg = ra_interval_get_physreg(src_interval);
1672 assign_src(ctx, instr, src);
1673
1674 allocate_dst_fixed(
1675 ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset);
1676 insert_dst(ctx, dst);
1677 }
1678
1679 static void
1680 handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
1681 {
1682 struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;
1683 unsigned dst_offset = instr->dsts[0]->merge_set_offset;
1684
1685 if (!dst_set || dst_set->regs_count == 1) {
1686 handle_normal_instr(ctx, instr);
1687 return;
1688 }
1689
1690 /* We need to check if any of the sources are contained in an interval
1691 * that is at least as large as the vector. In this case, we should put
1692 * the vector inside that larger interval. (There should be one
1693 * unambiguous place to put it, because values sharing the same merge set
1694 * should be allocated together.) This can happen in a case like:
1695 *
1696 * ssa_1 (wrmask=0xf) = ...
1697 * ssa_2 = split ssa_1 off:0
1698 * ssa_3 = split ssa_1 off:1
1699 * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3
1700 * ... = (kill)ssa_1
1701 * ... = (kill)ssa_4
1702 *
1703 * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it.
1704 */
1705 physreg_t dst_fixed = (physreg_t)~0u;
1706
1707 ra_foreach_src (src, instr) {
1708 if (src->flags & IR3_REG_FIRST_KILL) {
1709 mark_src_killed(ctx, src);
1710 }
1711
1712 struct ra_interval *interval = &ctx->intervals[src->def->name];
1713
1714 if (src->def->merge_set != dst_set || interval->is_killed)
1715 continue;
1716 while (interval->interval.parent != NULL) {
1717 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1718 }
1719 if (reg_size(interval->interval.reg) >= reg_size(instr->dsts[0])) {
1720 dst_fixed = interval->physreg_start -
1721 interval->interval.reg->merge_set_offset + dst_offset;
1722 } else {
1723 /* For sources whose root interval is smaller than the
1724 * destination (i.e. the normal case), we will shuffle them
1725 * around after allocating the destination. Mark them killed so
1726 * that the destination can be allocated over them, even if they
1727 * aren't actually killed.
1728 */
1729 ra_file_mark_killed(ra_get_file(ctx, src), interval);
1730 }
1731 }
1732
1733 if (dst_fixed != (physreg_t)~0u)
1734 allocate_dst_fixed(ctx, instr->dsts[0], dst_fixed);
1735 else
1736 allocate_dst(ctx, instr->dsts[0]);
1737
1738 /* Remove the temporary is_killed we added */
1739 ra_foreach_src (src, instr) {
1740 struct ra_interval *interval = &ctx->intervals[src->def->name];
1741 while (interval->interval.parent != NULL) {
1742 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1743 }
1744
1745 /* Filter out cases where it actually should be killed */
1746 if (interval != &ctx->intervals[src->def->name] ||
1747 !(src->flags & IR3_REG_KILL)) {
1748 ra_file_unmark_killed(ra_get_file(ctx, src), interval);
1749 }
1750 }
1751
1752 ra_foreach_src_rev (src, instr) {
1753 assign_src(ctx, instr, src);
1754 }
1755
1756 /* We need to do this before insert_dst(), so that children of the
1757 * destination which got marked as killed and then shuffled around to make
1758 * space for the destination have the correct pcopy destination that
1759 * matches what we assign the source of the collect to in assign_src().
1760 *
1761 * TODO: In this case we'll wind up copying the value in the pcopy and
1762 * then again in the collect. We could avoid one of those by updating the
1763 * pcopy destination to match up with the final location of the source
1764 * after the collect and making the collect a no-op. However this doesn't
1765 * seem to happen often.
1766 */
1767 insert_parallel_copy_instr(ctx, instr);
1768
1769 /* Note: insert_dst will automatically shuffle around any intervals that
1770 * overlap the collect's destination by making them children of the collect.
1771 */
1772
1773 insert_dst(ctx, instr->dsts[0]);
1774 }
1775
1776 /* Parallel copies before RA should only be at the end of the block, for
1777 * phi's. For these we only need to fill in the sources, and then we fill in
1778 * the destinations in the successor block.
1779 */
1780 static void
1781 handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)
1782 {
1783 ra_foreach_src_rev (src, instr) {
1784 assign_src(ctx, instr, src);
1785 }
1786 }
1787
1788 /* Some inputs may need to be precolored. We need to handle those first, so
1789 * that other non-precolored inputs don't accidentally get allocated over
1790 * them. Inputs are the very first thing in the shader, so it shouldn't be a
1791 * problem to allocate them to a specific physreg.
1792 */
1793
1794 static void
1795 handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1796 {
1797 if (instr->dsts[0]->num == INVALID_REG)
1798 return;
1799
1800 struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1801 struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1802 physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]);
1803 allocate_dst_fixed(ctx, instr->dsts[0], physreg);
1804
1805 d("insert precolored dst %u physreg %u", instr->dsts[0]->name,
1806 ra_interval_get_physreg(interval));
1807
1808 ra_file_insert(file, interval);
1809 interval->frozen = true;
1810 }
1811
1812 static void
1813 handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1814 {
1815 if (instr->dsts[0]->num != INVALID_REG)
1816 return;
1817
1818 allocate_dst(ctx, instr->dsts[0]);
1819
1820 struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1821 struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1822 ra_file_insert(file, interval);
1823 }
1824
1825 static void
1826 assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1827 {
1828 struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1829 struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1830
1831 if (instr->dsts[0]->num == INVALID_REG) {
1832 assign_reg(instr, instr->dsts[0], ra_interval_get_num(interval));
1833 } else {
1834 interval->frozen = false;
1835 }
1836
1837 if (instr->dsts[0]->flags & IR3_REG_UNUSED)
1838 ra_file_remove(file, interval);
1839
1840 ra_foreach_src_rev (src, instr)
1841 assign_src(ctx, instr, src);
1842 }
1843
1844 /* chmask is a bit weird, because it has pre-colored sources due to the need
1845 * to pass some registers to the next stage. Fortunately there are only at
1846 * most two, and there should be no other live values by the time we get to
1847 * this instruction, so we only have to do the minimum and don't need any
1848 * fancy fallbacks.
1849 *
1850 * TODO: Add more complete handling of precolored sources, e.g. for function
1851 * argument handling. We'd need a way to mark sources as fixed so that they
1852 * don't get moved around when placing other sources in the fallback case, and
1853 * a duplication of much of the logic in get_reg(). This also opens another
1854 * can of worms, e.g. what if the precolored source is a split of a vector
1855 * which is still live -- this breaks our assumption that splits don't incur
1856 * any "extra" register requirements and we'd have to break it out of the
1857 * parent ra_interval.
1858 */
1859
1860 static void
1861 handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src)
1862 {
1863 struct ra_file *file = ra_get_file(ctx, src);
1864 struct ra_interval *interval = &ctx->intervals[src->def->name];
1865 physreg_t physreg = ra_reg_get_physreg(src);
1866
1867 if (ra_interval_get_num(interval) == src->num)
1868 return;
1869
1870 /* Try evicting stuff in our way if it isn't free. This won't move
1871 * anything unless it overlaps with our precolored physreg, so we don't
1872 * have to worry about evicting other precolored sources.
1873 */
1874 if (!get_reg_specified(ctx, file, src, physreg, true)) {
1875 unsigned eviction_count;
1876 if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true,
1877 false)) {
1878 unreachable("failed to evict for precolored source!");
1879 return;
1880 }
1881 }
1882
1883 ra_move_interval(ctx, file, interval, physreg);
1884 }
1885
1886 static void
1887 handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr)
1888 {
1889 /* Note: we purposely don't mark sources as killed, so that we can reuse
1890 * some of the get_reg() machinery as if the source were a destination.
1891 * Marking it as killed would mean that e.g. get_reg_specified() wouldn't
1892 * work correctly.
1893 */
1894 ra_foreach_src (src, instr) {
1895 assert(src->num != INVALID_REG);
1896 handle_precolored_source(ctx, src);
1897 }
1898
1899 ra_foreach_src (src, instr) {
1900 struct ra_file *file = ra_get_file(ctx, src);
1901 struct ra_interval *interval = &ctx->intervals[src->def->name];
1902 if (src->flags & IR3_REG_FIRST_KILL)
1903 ra_file_remove(file, interval);
1904 }
1905
1906 insert_parallel_copy_instr(ctx, instr);
1907 }
1908
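/* Return the physreg that "def" occupies at the end of "block", taking into
 * account any rename recorded there when a live-range split moved the value.
 */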
1909 static physreg_t
1910 read_register(struct ra_ctx *ctx, struct ir3_block *block,
1911 struct ir3_register *def)
1912 {
1913 struct ra_block_state *state = &ctx->blocks[block->index];
1914 if (state->renames) {
1915 struct hash_entry *entry = _mesa_hash_table_search(state->renames, def);
1916 if (entry) {
1917 return (physreg_t)(uintptr_t)entry->data;
1918 }
1919 }
1920
1921 return ra_reg_get_physreg(def);
1922 }
1923
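/* Allocate a live-in value at the register it has at the end of the first
 * already-visited predecessor. Mismatches with the other predecessors are
 * fixed up later by insert_live_in_moves().
 */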
1924 static void
1925 handle_live_in(struct ra_ctx *ctx, struct ir3_register *def)
1926 {
1927 physreg_t physreg = ~0;
1928 for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
1929 struct ir3_block *pred = ctx->block->predecessors[i];
1930 struct ra_block_state *pred_state = &ctx->blocks[pred->index];
1931
1932 if (!pred_state->visited)
1933 continue;
1934
1935 physreg = read_register(ctx, pred, def);
1936 break;
1937 }
1938
1939 assert(physreg != (physreg_t)~0);
1940
1941 struct ra_interval *interval = &ctx->intervals[def->name];
1942 struct ra_file *file = ra_get_file(ctx, def);
1943 ra_interval_init(interval, def);
1944 interval->physreg_start = physreg;
1945 interval->physreg_end = physreg + reg_size(def);
1946 ra_file_insert(file, interval);
1947 }
1948
1949 static void
1950 handle_live_out(struct ra_ctx *ctx, struct ir3_register *def)
1951 {
1952 /* Skip parallel copies, which in the original program are only used as phi
1953 * arguments. Even though phi arguments are live out, they are only
1954 * assigned when the phi is.
1955 */
1956 if (def->instr->opc == OPC_META_PARALLEL_COPY)
1957 return;
1958
1959 struct ra_block_state *state = &ctx->blocks[ctx->block->index];
1960 struct ra_interval *interval = &ctx->intervals[def->name];
1961 physreg_t physreg = ra_interval_get_physreg(interval);
1962 if (physreg != ra_reg_get_physreg(def)) {
1963 if (!state->renames)
1964 state->renames = _mesa_pointer_hash_table_create(ctx);
1965 _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg);
1966 }
1967 }
1968
1969 static void
1970 handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
1971 {
1972 struct ra_file *file = ra_get_file(ctx, def);
1973 struct ra_interval *interval = &ctx->intervals[def->name];
1974
1975 /* phis are always scalar, so they should already be the smallest possible
1976 * size. However they may be coalesced with other live-in values/phi
1977 * nodes, so check for that here.
1978 */
1979 struct ir3_reg_interval *parent_ir3 =
1980 ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start);
1981 physreg_t physreg;
1982 if (parent_ir3) {
1983 struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3);
1984 physreg = ra_interval_get_physreg(parent) +
1985 (def->interval_start - parent_ir3->reg->interval_start);
1986 } else {
1987 physreg = get_reg(ctx, file, def);
1988 }
1989
1990 allocate_dst_fixed(ctx, def, physreg);
1991
1992 ra_file_insert(file, interval);
1993 }
1994
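/* The phi, its parallelcopy sources, and their defs were coalesced into a
 * single interval, so they all receive the same register number here.
 */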
1995 static void
1996 assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
1997 {
1998 struct ra_file *file = ra_get_file(ctx, phi->dsts[0]);
1999 struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name];
2000 assert(!interval->interval.parent);
2001 unsigned num = ra_interval_get_num(interval);
2002 assign_reg(phi, phi->dsts[0], num);
2003
2004 /* Assign the parallelcopy sources of this phi */
2005 for (unsigned i = 0; i < phi->srcs_count; i++) {
2006 if (phi->srcs[i]->def) {
2007 assign_reg(phi, phi->srcs[i], num);
2008 assign_reg(phi, phi->srcs[i]->def, num);
2009 }
2010 }
2011
2012 if (phi->dsts[0]->flags & IR3_REG_UNUSED)
2013 ra_file_remove(file, interval);
2014 }
2015
2016 /* When we split a live range, we sometimes need to emit fixup code at the end
2017 * of a block. For example, something like:
2018 *
2019 * a = ...
2020 * if (...) {
2021 * ...
2022 * a' = a
2023 * b = ... // a evicted to make room for b
2024 * ...
2025 * }
2026 * ... = a
2027 *
2028 * When we insert the copy to a' in insert_parallel_copy_instr(), this forces
2029 * us to insert another copy "a = a'" at the end of the if. Normally this would
2030 * also entail adding a phi node, but since we're about to go out of SSA
2031 * anyway we just insert an extra move. Note, however, that "b" might be used
2032 * in a phi node at the end of the if and share registers with "a", so we
2033 * have to be careful to extend any preexisting parallelcopy instruction
2034 * instead of creating our own in order to guarantee that they properly get
2035 * swapped.
2036 */
2037
2038 static void
2039 insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
2040 struct ir3_register *reg)
2041 {
2042 struct ir3_instruction *old_pcopy = NULL;
2043 if (!list_is_empty(&block->instr_list)) {
2044 struct ir3_instruction *last =
2045 list_entry(block->instr_list.prev, struct ir3_instruction, node);
2046 if (last->opc == OPC_META_PARALLEL_COPY)
2047 old_pcopy = last;
2048 }
2049
2050 unsigned old_pcopy_srcs = old_pcopy ? old_pcopy->srcs_count : 0;
2051 struct ir3_instruction *pcopy = ir3_instr_create(
2052 block, OPC_META_PARALLEL_COPY, old_pcopy_srcs + 1, old_pcopy_srcs + 1);
2053
2054 for (unsigned i = 0; i < old_pcopy_srcs; i++) {
2055 old_pcopy->dsts[i]->instr = pcopy;
2056 pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i];
2057 }
2058
2059 unsigned flags = reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
2060
2061 struct ir3_register *dst_reg = ir3_dst_create(pcopy, INVALID_REG, flags);
2062 dst_reg->wrmask = reg->wrmask;
2063 dst_reg->size = reg->size;
2064 assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags));
2065
2066 for (unsigned i = 0; i < old_pcopy_srcs; i++) {
2067 pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i];
2068 }
2069
2070 struct ir3_register *src_reg = ir3_src_create(pcopy, INVALID_REG, flags);
2071 src_reg->wrmask = reg->wrmask;
2072 src_reg->size = reg->size;
2073 assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags));
2074
2075 if (old_pcopy)
2076 list_del(&old_pcopy->node);
2077 }
2078
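/* Make sure every visited predecessor leaves this live-in value in the
 * register we chose for it, inserting a copy at the end of the predecessor
 * (and recording a rename) when it doesn't.
 */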
2079 static void
2080 insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
2081 {
2082 physreg_t physreg = ra_interval_get_physreg(interval);
2083
2084 bool shared = interval->interval.reg->flags & IR3_REG_SHARED;
2085 struct ir3_block **predecessors =
2086 shared ? ctx->block->physical_predecessors : ctx->block->predecessors;
2087 unsigned predecessors_count = shared
2088 ? ctx->block->physical_predecessors_count
2089 : ctx->block->predecessors_count;
2090
2091 for (unsigned i = 0; i < predecessors_count; i++) {
2092 struct ir3_block *pred = predecessors[i];
2093 struct ra_block_state *pred_state = &ctx->blocks[pred->index];
2094
2095 if (!pred_state->visited)
2096 continue;
2097
2098 physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);
2099 if (pred_reg != physreg) {
2100 insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);
2101
2102 /* This is a bit tricky, but when visiting the destination of a
2103 * physical-only edge, we have two predecessors (the if and the
2104 * header block) and both have multiple successors. We pick the
2105 * register for all live-ins from the normal edge, which should
2106 * guarantee that there's no need for shuffling things around in
2107 * the normal predecessor as long as there are no phi nodes, but
2108 * we still may need to insert fixup code in the physical
2109 * predecessor (i.e. the last block of the if) and that has
2110 * another successor (the block after the if) so we need to update
2111 * the renames state for when we process the other successor. This
2112 * crucially depends on the other successor getting processed
2113 * after this.
2114 *
2115 * For normal (non-physical) edges we disallow critical edges so
2116 * that hacks like this aren't necessary.
2117 */
2118 if (!pred_state->renames)
2119 pred_state->renames = _mesa_pointer_hash_table_create(ctx);
2120 _mesa_hash_table_insert(pred_state->renames, interval->interval.reg,
2121 (void *)(uintptr_t)physreg);
2122 }
2123 }
2124 }
2125
2126 static void
2127 insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file)
2128 {
2129 BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index];
2130 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2131 physreg_node) {
2132 /* Skip phi nodes. This needs to happen after phi nodes are allocated,
2133 * because we may have to move live-ins around to make space for phi
2134 * nodes, but we shouldn't be handling phi nodes here.
2135 */
2136 if (BITSET_TEST(live_in, interval->interval.reg->name))
2137 insert_live_in_move(ctx, interval);
2138 }
2139 }
2140
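/* Record the register each value occupies at the entry of this block, so
 * that predecessors visited later can copy their live-outs into place (see
 * insert_live_out_moves()).
 */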
2141 static void
2142 insert_entry_regs(struct ra_block_state *state, struct ra_file *file)
2143 {
2144 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2145 physreg_node) {
2146 _mesa_hash_table_insert(state->entry_regs, interval->interval.reg,
2147 (void *)(uintptr_t)interval->physreg_start);
2148 }
2149 }
2150
2151 static void
2152 insert_live_in_moves(struct ra_ctx *ctx)
2153 {
2154 insert_file_live_in_moves(ctx, &ctx->full);
2155 insert_file_live_in_moves(ctx, &ctx->half);
2156 insert_file_live_in_moves(ctx, &ctx->shared);
2157
2158 /* If not all predecessors are visited, record where values live at block
2159 * entry so that insert_live_out_moves() in the unvisited predecessors will work.
2160 */
2161 bool all_preds_visited = true;
2162 for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
2163 if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) {
2164 all_preds_visited = false;
2165 break;
2166 }
2167 }
2168
2169 if (!all_preds_visited) {
2170 struct ra_block_state *state = &ctx->blocks[ctx->block->index];
2171 state->entry_regs = _mesa_pointer_hash_table_create(ctx);
2172
2173 insert_entry_regs(state, &ctx->full);
2174 insert_entry_regs(state, &ctx->half);
2175 insert_entry_regs(state, &ctx->shared);
2176 }
2177 }
2178
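/* If an already-visited successor expects this value in a different register
 * at its entry, copy it there at the end of the current block.
 */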
2179 static void
2180 insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval)
2181 {
2182 for (unsigned i = 0; i < 2; i++) {
2183 if (!ctx->block->successors[i])
2184 continue;
2185
2186 struct ir3_block *succ = ctx->block->successors[i];
2187 struct ra_block_state *succ_state = &ctx->blocks[succ->index];
2188
2189 if (!succ_state->visited)
2190 continue;
2191
2192 struct hash_entry *entry = _mesa_hash_table_search(
2193 succ_state->entry_regs, interval->interval.reg);
2194 if (!entry)
2195 continue;
2196
2197 physreg_t new_reg = (physreg_t)(uintptr_t)entry->data;
2198 if (new_reg != interval->physreg_start) {
2199 insert_liveout_copy(ctx->block, new_reg, interval->physreg_start,
2200 interval->interval.reg);
2201 }
2202 }
2203 }
2204
2205 static void
2206 insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file)
2207 {
2208 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2209 physreg_node) {
2210 insert_live_out_move(ctx, interval);
2211 }
2212 }
2213
2214 static void
2215 insert_live_out_moves(struct ra_ctx *ctx)
2216 {
2217 insert_file_live_out_moves(ctx, &ctx->full);
2218 insert_file_live_out_moves(ctx, &ctx->half);
2219 insert_file_live_out_moves(ctx, &ctx->shared);
2220 }
2221
2222 static void
2223 handle_block(struct ra_ctx *ctx, struct ir3_block *block)
2224 {
2225 ctx->block = block;
2226
2227 /* Reset the register files from the last block */
2228 ra_file_init(&ctx->full);
2229 ra_file_init(&ctx->half);
2230 ra_file_init(&ctx->shared);
2231
2232 /* Handle live-ins, phis, and input meta-instructions. These all appear
2233 * live at the beginning of the block and interfere with each other, and
2234 * therefore need to be allocated "in parallel". This means that we
2235 * have to allocate all of them, inserting them into the file, and then
2236 * delay updating the IR until all of them are allocated.
2237 *
2238 * Handle precolored inputs first, because we need to make sure that other
2239 * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes
2240 * and inputs at the same time, because the first block doesn't have
2241 * predecessors. Therefore handle_live_in doesn't have to worry about
2242 * them.
2243 */
2244
2245 foreach_instr (instr, &block->instr_list) {
2246 if (instr->opc == OPC_META_INPUT)
2247 handle_precolored_input(ctx, instr);
2248 else
2249 break;
2250 }
2251
2252 unsigned name;
2253 BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
2254 ctx->live->definitions_count) {
2255 struct ir3_register *reg = ctx->live->definitions[name];
2256 handle_live_in(ctx, reg);
2257 }
2258
2259 foreach_instr (instr, &block->instr_list) {
2260 if (instr->opc == OPC_META_PHI)
2261 handle_phi(ctx, instr->dsts[0]);
2262 else if (instr->opc == OPC_META_INPUT ||
2263 instr->opc == OPC_META_TEX_PREFETCH)
2264 handle_input(ctx, instr);
2265 else
2266 break;
2267 }
2268
2269 /* After this point, every live-in/phi/input has an interval assigned to
2270 * it. We delay actually assigning values until everything has been
2271 * allocated, so we can simply ignore any parallel copy entries created
2272 * when shuffling them around.
2273 */
2274 ctx->parallel_copies_count = 0;
2275
2276 insert_live_in_moves(ctx);
2277
2278 if (RA_DEBUG) {
2279 d("after live-in block %u:\n", block->index);
2280 ra_ctx_dump(ctx);
2281 }
2282
2283 /* Now we're done with processing live-ins, and can handle the body of the
2284 * block.
2285 */
2286 foreach_instr (instr, &block->instr_list) {
2287 di(instr, "processing");
2288
2289 if (instr->opc == OPC_META_PHI)
2290 assign_phi(ctx, instr);
2291 else if (instr->opc == OPC_META_INPUT ||
2292 instr->opc == OPC_META_TEX_PREFETCH)
2293 assign_input(ctx, instr);
2294 else if (instr->opc == OPC_META_SPLIT)
2295 handle_split(ctx, instr);
2296 else if (instr->opc == OPC_META_COLLECT)
2297 handle_collect(ctx, instr);
2298 else if (instr->opc == OPC_META_PARALLEL_COPY)
2299 handle_pcopy(ctx, instr);
2300 else if (instr->opc == OPC_CHMASK)
2301 handle_chmask(ctx, instr);
2302 else
2303 handle_normal_instr(ctx, instr);
2304
2305 if (RA_DEBUG)
2306 ra_ctx_dump(ctx);
2307 }
2308
2309 insert_live_out_moves(ctx);
2310
2311 BITSET_FOREACH_SET (name, ctx->live->live_out[block->index],
2312 ctx->live->definitions_count) {
2313 struct ir3_register *reg = ctx->live->definitions[name];
2314 handle_live_out(ctx, reg);
2315 }
2316
2317 ctx->blocks[block->index].visited = true;
2318 }
2319
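/* Choose the size of the full register file to allocate within: start from
 * the vec4 count implied by the worst-case pressure and grow it as long as
 * doing so doesn't change the threadsize decision or cost any waves, then
 * convert back to units of half-regs.
 */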
2320 static unsigned
2321 calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure)
2322 {
2323 /* Registers are allocated in units of vec4, so switch from units of
2324 * half-regs to vec4.
2325 */
2326 unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4);
2327
2328 bool double_threadsize = ir3_should_double_threadsize(v, reg_count);
2329
2330 unsigned target = reg_count;
2331 unsigned reg_independent_max_waves =
2332 ir3_get_reg_independent_max_waves(v, double_threadsize);
2333 unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
2334 v->compiler, reg_count, double_threadsize);
2335 unsigned target_waves =
2336 MIN2(reg_independent_max_waves, reg_dependent_max_waves);
2337
2338 while (target <= RA_FULL_SIZE / (2 * 4) &&
2339 ir3_should_double_threadsize(v, target) == double_threadsize &&
2340 ir3_get_reg_dependent_max_waves(v->compiler, target,
2341 double_threadsize) >= target_waves)
2342 target++;
2343
2344 return (target - 1) * 2 * 4;
2345 }
2346
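/* Add a register's size (in half-reg units) to the pressure counters. Half
 * registers always count against the half file, and also against the full
 * file when half and full registers are merged into a single file.
 */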
2347 static void
2348 add_pressure(struct ir3_pressure *pressure, struct ir3_register *reg,
2349 bool merged_regs)
2350 {
2351 unsigned size = reg_size(reg);
2352 if (reg->flags & IR3_REG_HALF)
2353 pressure->half += size;
2354 if (!(reg->flags & IR3_REG_HALF) || merged_regs)
2355 pressure->full += size;
2356 }
2357
2358 static void
2359 dummy_interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
2360 {
2361 }
2362
2363 static void
2364 dummy_interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
2365 {
2366 }
2367
2368 static void
2369 dummy_interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *parent,
2370 struct ir3_reg_interval *child)
2371 {
2372 }
2373
2374 /* Calculate the minimum possible limit on register pressure so that spilling
2375 * still succeeds. Used to implement IR3_SHADER_DEBUG=spillall.
2376 */
2377
2378 static void
2379 calc_min_limit_pressure(struct ir3_shader_variant *v,
2380 struct ir3_liveness *live,
2381 struct ir3_pressure *limit)
2382 {
2383 struct ir3_block *start = ir3_start_block(v->ir);
2384 struct ir3_reg_ctx *ctx = ralloc(NULL, struct ir3_reg_ctx);
2385 struct ir3_reg_interval *intervals =
2386 rzalloc_array(ctx, struct ir3_reg_interval, live->definitions_count);
2387
2388 ctx->interval_add = dummy_interval_add;
2389 ctx->interval_delete = dummy_interval_delete;
2390 ctx->interval_readd = dummy_interval_readd;
2391
2392 limit->full = limit->half = 0;
2393
2394 struct ir3_pressure cur_pressure = {0};
2395 foreach_instr (input, &start->instr_list) {
2396 if (input->opc != OPC_META_INPUT &&
2397 input->opc != OPC_META_TEX_PREFETCH)
2398 break;
2399
2400 add_pressure(&cur_pressure, input->dsts[0], v->mergedregs);
2401 }
2402
2403 limit->full = MAX2(limit->full, cur_pressure.full);
2404 limit->half = MAX2(limit->half, cur_pressure.half);
2405
2406 foreach_instr (input, &start->instr_list) {
2407 if (input->opc != OPC_META_INPUT &&
2408 input->opc != OPC_META_TEX_PREFETCH)
2409 break;
2410
2411 /* pre-colored inputs may have holes, which increases the pressure. */
2412 struct ir3_register *dst = input->dsts[0];
2413 if (dst->num != INVALID_REG) {
2414 unsigned physreg = ra_reg_get_physreg(dst) + reg_size(dst);
2415 if (dst->flags & IR3_REG_HALF)
2416 limit->half = MAX2(limit->half, physreg);
2417 if (!(dst->flags & IR3_REG_HALF) || v->mergedregs)
2418 limit->full = MAX2(limit->full, physreg);
2419 }
2420 }
2421
2422 foreach_block (block, &v->ir->block_list) {
2423 rb_tree_init(&ctx->intervals);
2424
2425 unsigned name;
2426 BITSET_FOREACH_SET (name, live->live_in[block->index],
2427 live->definitions_count) {
2428 struct ir3_register *reg = live->definitions[name];
2429 ir3_reg_interval_init(&intervals[reg->name], reg);
2430 ir3_reg_interval_insert(ctx, &intervals[reg->name]);
2431 }
2432
2433 foreach_instr (instr, &block->instr_list) {
2434 ra_foreach_dst (dst, instr) {
2435 ir3_reg_interval_init(&intervals[dst->name], dst);
2436 }
2437 /* phis and parallel copies can be deleted via spilling */
2438
2439 if (instr->opc == OPC_META_PHI) {
2440 ir3_reg_interval_insert(ctx, &intervals[instr->dsts[0]->name]);
2441 continue;
2442 }
2443
2444 if (instr->opc == OPC_META_PARALLEL_COPY)
2445 continue;
2446
2447 cur_pressure = (struct ir3_pressure) {0};
2448
2449 ra_foreach_dst (dst, instr) {
2450 if (dst->tied && !(dst->tied->flags & IR3_REG_KILL))
2451 add_pressure(&cur_pressure, dst, v->mergedregs);
2452 }
2453
2454 ra_foreach_src_rev (src, instr) {
2455 /* We currently don't support spilling the parent of a source when
2456 * making space for sources, so we have to keep track of the
2457 * intervals and figure out the root of the tree to figure out how
2458 * much space we need.
2459 *
2460 * TODO: We should probably support this in the spiller.
2461 */
2462 struct ir3_reg_interval *interval = &intervals[src->def->name];
2463 while (interval->parent)
2464 interval = interval->parent;
2465 add_pressure(&cur_pressure, interval->reg, v->mergedregs);
2466
2467 if (src->flags & IR3_REG_FIRST_KILL)
2468 ir3_reg_interval_remove(ctx, &intervals[src->def->name]);
2469 }
2470
2471 limit->full = MAX2(limit->full, cur_pressure.full);
2472 limit->half = MAX2(limit->half, cur_pressure.half);
2473
2474 cur_pressure = (struct ir3_pressure) {0};
2475
2476 ra_foreach_dst (dst, instr) {
2477 ir3_reg_interval_init(&intervals[dst->name], dst);
2478 ir3_reg_interval_insert(ctx, &intervals[dst->name]);
2479 add_pressure(&cur_pressure, dst, v->mergedregs);
2480 }
2481
2482 limit->full = MAX2(limit->full, cur_pressure.full);
2483 limit->half = MAX2(limit->half, cur_pressure.half);
2484 }
2485 }
2486
2487 /* Account for the base register, which needs to be available everywhere. */
2488 limit->full += 2;
2489
2490 ralloc_free(ctx);
2491 }
2492
2493 /*
2494 * If barriers are used, it must be possible for all waves in the workgroup
2495 * to execute concurrently. Thus we may have to reduce the registers limit.
2496 */
2497 static void
2498 calc_limit_pressure_for_cs_with_barrier(struct ir3_shader_variant *v,
2499 struct ir3_pressure *limit_pressure)
2500 {
2501 const struct ir3_compiler *compiler = v->compiler;
2502
2503 unsigned threads_per_wg;
2504 if (v->local_size_variable) {
2505 /* We have to expect the worst case. */
2506 threads_per_wg = compiler->max_variable_workgroup_size;
2507 } else {
2508 threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
2509 }
2510
2511 /* The register file is divided into reg_size_vec4 parts. Each part
2512 * has enough registers to give a single vec4 register to each thread
2513 * of a single-sized wave-pair. With double threadsize each wave-pair
2514 * consumes two parts of the register file to get a single vec4 per
2515 * thread. The more active wave-pairs there are, the fewer parts each
2516 * can get.
2517 */
2518
2519 bool double_threadsize = ir3_should_double_threadsize(v, 0);
2520 unsigned waves_per_wg = DIV_ROUND_UP(
2521 threads_per_wg, compiler->threadsize_base * (double_threadsize ? 2 : 1) *
2522 compiler->wave_granularity);
2523
2524 uint32_t vec4_regs_per_thread =
2525 compiler->reg_size_vec4 / (waves_per_wg * (double_threadsize ? 2 : 1));
2526 assert(vec4_regs_per_thread > 0);
2527
2528 uint32_t half_regs_per_thread = vec4_regs_per_thread * 4 * 2;
2529
2530 if (limit_pressure->full > half_regs_per_thread) {
2531 if (v->mergedregs) {
2532 limit_pressure->full = half_regs_per_thread;
2533 } else {
2534 /* TODO: Handle !mergedregs case, probably we would have to do this
2535 * after the first register pressure pass.
2536 */
2537 }
2538 }
2539 }
2540
2541 int
2542 ir3_ra(struct ir3_shader_variant *v)
2543 {
2544 ir3_calc_dominance(v->ir);
2545
2546 ir3_create_parallel_copies(v->ir);
2547
2548 struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);
2549
2550 ctx->merged_regs = v->mergedregs;
2551 ctx->compiler = v->compiler;
2552 ctx->stage = v->type;
2553
2554 struct ir3_liveness *live = ir3_calc_liveness(ctx, v->ir);
2555
2556 ir3_debug_print(v->ir, "AFTER: create_parallel_copies");
2557
2558 ir3_merge_regs(live, v->ir);
2559
2560 struct ir3_pressure max_pressure;
2561 ir3_calc_pressure(v, live, &max_pressure);
2562 d("max pressure:");
2563 d("\tfull: %u", max_pressure.full);
2564 d("\thalf: %u", max_pressure.half);
2565 d("\tshared: %u", max_pressure.shared);
2566
2567 struct ir3_pressure limit_pressure;
2568 limit_pressure.full = RA_FULL_SIZE;
2569 limit_pressure.half = RA_HALF_SIZE;
2570 limit_pressure.shared = RA_SHARED_SIZE;
2571
2572 if (gl_shader_stage_is_compute(v->type) && v->has_barrier) {
2573 calc_limit_pressure_for_cs_with_barrier(v, &limit_pressure);
2574 }
2575
2576 /* If the user forces a doubled threadsize, we may have to lower the limit
2577 * because on some gens the register file is not big enough to hold a
2578 * double-size wave with all 48 registers in use.
2579 */
2580 if (v->real_wavesize == IR3_DOUBLE_ONLY) {
2581 limit_pressure.full =
2582 MIN2(limit_pressure.full, ctx->compiler->reg_size_vec4 / 2 * 16);
2583 }
2584
2585 /* If requested, lower the limit so that spilling happens more often. */
2586 if (ir3_shader_debug & IR3_DBG_SPILLALL)
2587 calc_min_limit_pressure(v, live, &limit_pressure);
2588
2589 if (max_pressure.shared > limit_pressure.shared) {
2590 /* TODO shared reg -> normal reg spilling */
2591 d("shared max pressure exceeded!");
2592 goto fail;
2593 }
2594
2595 bool spilled = false;
2596 if (max_pressure.full > limit_pressure.full ||
2597 max_pressure.half > limit_pressure.half) {
2598 if (!v->compiler->has_pvtmem) {
2599 d("max pressure exceeded!");
2600 goto fail;
2601 }
2602 d("max pressure exceeded, spilling!");
2603 IR3_PASS(v->ir, ir3_spill, v, &live, &limit_pressure);
2604 ir3_calc_pressure(v, live, &max_pressure);
2605 assert(max_pressure.full <= limit_pressure.full &&
2606 max_pressure.half <= limit_pressure.half);
2607 spilled = true;
2608 }
2609
2610 ctx->live = live;
2611 ctx->intervals =
2612 rzalloc_array(ctx, struct ra_interval, live->definitions_count);
2613 ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count);
2614
2615 ctx->full.size = calc_target_full_pressure(v, max_pressure.full);
2616 d("full size: %u", ctx->full.size);
2617
2618 if (!v->mergedregs)
2619 ctx->half.size = RA_HALF_SIZE;
2620
2621 ctx->shared.size = RA_SHARED_SIZE;
2622
2623 ctx->full.start = ctx->half.start = ctx->shared.start = 0;
2624
2625 foreach_block (block, &v->ir->block_list)
2626 handle_block(ctx, block);
2627
2628 ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count);
2629
2630 /* Strip array-ness and SSA-ness at the end, because various helpers still
2631 * need to work even on definitions that have already been assigned. For
2632 * example, we need to preserve array-ness so that array live-ins have the
2633 * right size.
2634 */
2635 foreach_block (block, &v->ir->block_list) {
2636 foreach_instr (instr, &block->instr_list) {
2637 for (unsigned i = 0; i < instr->dsts_count; i++) {
2638 instr->dsts[i]->flags &= ~IR3_REG_SSA;
2639
2640 /* Parallel copies of array registers copy the whole register, and
2641 * we need some way to let the parallel copy code know that this was
2642 * an array whose size is determined by reg->size. So keep the array
2643 * flag on those. spill/reload also need to work on the entire
2644 * array.
2645 */
2646 if (!is_meta(instr) && instr->opc != OPC_RELOAD_MACRO)
2647 instr->dsts[i]->flags &= ~IR3_REG_ARRAY;
2648 }
2649
2650 for (unsigned i = 0; i < instr->srcs_count; i++) {
2651 instr->srcs[i]->flags &= ~IR3_REG_SSA;
2652
2653 if (!is_meta(instr) && instr->opc != OPC_SPILL_MACRO)
2654 instr->srcs[i]->flags &= ~IR3_REG_ARRAY;
2655 }
2656 }
2657 }
2658
2659 ir3_debug_print(v->ir, "AFTER: register allocation");
2660
2661 if (spilled) {
2662 IR3_PASS(v->ir, ir3_lower_spill);
2663 }
2664
2665 ir3_lower_copies(v);
2666
2667 ir3_debug_print(v->ir, "AFTER: ir3_lower_copies");
2668
2669 ralloc_free(ctx);
2670
2671 return 0;
2672 fail:
2673 ralloc_free(ctx);
2674 return -1;
2675 }
2676