1 /*
2  * Copyright (C) 2021 Valve Corporation
3  * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "ir3_ra.h"
26 #include "util/rb_tree.h"
27 #include "util/u_math.h"
28 #include "ir3_shader.h"
29 
30 /* This file implements an SSA-based register allocator. Unlike other
31  * SSA-based allocators, it handles vector split/collect "smartly," meaning
32  * that multiple values may share the same register interval. From the
33  * perspective of the allocator itself, only the top-level intervals matter,
34  * and the allocator is only concerned with allocating top-level intervals,
35  * which may mean moving other top-level intervals around. Other intervals,
36  * like the destination of a split instruction or the source of a collect
37  * instruction, are "locked" to their parent interval. The details of this are
38  * mostly handled by ir3_merge_regs and ir3_reg_ctx.
39  *
40  * We currently don't do any backtracking, but we do use the merge sets as a
41  * form of affinity to try to avoid moves from phis/splits/collects. Each
42  * merge set is what a more "classic" graph-coloring or live-range based
43  * allocator would consider a single register, but here we use it as merely a
44  * hint, except when multiple overlapping values are live at the same time.
45  * Each merge set has a "preferred" register, and we try to honor that when
46  * allocating values in the merge set.
47  */
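/* A rough illustration (the ssa names below are made up, not from this file):
 * for a collect such as
 *
 *    ssa_3 = collect ssa_1, ssa_2
 *
 * only ssa_3 gets a top-level interval; ssa_1 and ssa_2 become child
 * intervals locked to fixed offsets inside it, so allocating or moving ssa_3
 * implicitly places them too.
 */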
48 
49 /* ir3_reg_ctx implementation. */
50 
51 static int
52 ir3_reg_interval_cmp(const struct rb_node *node, const void *data)
53 {
54    unsigned reg = *(const unsigned *)data;
55    const struct ir3_reg_interval *interval =
56       ir3_rb_node_to_interval_const(node);
57    if (interval->reg->interval_start > reg)
58       return -1;
59    else if (interval->reg->interval_end <= reg)
60       return 1;
61    else
62       return 0;
63 }
64 
65 static struct ir3_reg_interval *
66 ir3_reg_interval_search(struct rb_tree *tree, unsigned offset)
67 {
68    struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp);
69    return node ? ir3_rb_node_to_interval(node) : NULL;
70 }
71 
72 static struct ir3_reg_interval *
73 ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset)
74 {
75    struct rb_node *node =
76       rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp);
77    return node ? ir3_rb_node_to_interval(node) : NULL;
78 }
79 
80 /* Get the interval covering the reg, or the closest to the right if it
81  * doesn't exist.
82  */
83 static struct ir3_reg_interval *
84 ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset)
85 {
86    struct ir3_reg_interval *interval =
87       ir3_reg_interval_search_sloppy(tree, offset);
88    if (!interval) {
89       return NULL;
90    } else if (interval->reg->interval_end > offset) {
91       return interval;
92    } else {
93       /* There is no interval covering reg, and ir3_reg_interval_search_sloppy()
94        * returned the closest range to the left, so the next interval to the
95        * right should be the closest to the right.
96        */
97       return ir3_reg_interval_next_or_null(interval);
98    }
99 }
100 
101 static int
102 ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
103 {
104    const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a);
105    const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b);
106    return b->reg->interval_start - a->reg->interval_start;
107 }
108 
109 static void
110 interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree,
111                 struct ir3_reg_interval *interval)
112 {
113    struct ir3_reg_interval *right =
114       ir3_reg_interval_search_right(tree, interval->reg->interval_start);
115    if (right && right->reg->interval_start < interval->reg->interval_end) {
116       /* We disallow trees where different members have different half-ness.
117        * This means that we can't treat bitcasts as copies like normal
118        * split/collect, so something like this would require an extra copy
119        * in mergedregs mode, and count as 4 half-units of register pressure
120        * instead of 2:
121        *
122        * f16vec2 foo = unpackFloat2x16(bar)
123        * ... = foo.x
124        * ... = bar
125        *
126        * However, relaxing this rule would open a huge can of worms. What
127        * happens when there's a vector of 16 things, and the fifth element
128        * has been bitcasted as a half-reg? Would that element alone have to
129        * be small enough to be used as a half-reg source? Let's keep that
130        * can of worms firmly shut for now.
131        */
132       assert((interval->reg->flags & IR3_REG_HALF) ==
133              (right->reg->flags & IR3_REG_HALF));
134 
135       if (right->reg->interval_end <= interval->reg->interval_end &&
136           right->reg->interval_start >= interval->reg->interval_start) {
137          /* Check if we're inserting something that's already inserted */
138          assert(interval != right);
139 
140          /* "right" is contained in "interval" and must become a child of
141           * it. There may be further children too.
142           */
143          for (struct ir3_reg_interval *next = ir3_reg_interval_next(right);
144               right && right->reg->interval_start < interval->reg->interval_end;
145               right = next, next = ir3_reg_interval_next_or_null(next)) {
146             /* "right" must be contained in "interval." */
147             assert(right->reg->interval_end <= interval->reg->interval_end);
148             assert((interval->reg->flags & IR3_REG_HALF) ==
149                    (right->reg->flags & IR3_REG_HALF));
150             if (!right->parent)
151                ctx->interval_delete(ctx, right);
152             right->parent = interval;
153             rb_tree_remove(tree, &right->node);
154             rb_tree_insert(&interval->children, &right->node,
155                            ir3_reg_interval_insert_cmp);
156          }
157       } else {
158          /* "right" must contain "interval," since intervals must form a
159           * tree.
160           */
161          assert(right->reg->interval_start <= interval->reg->interval_start);
162          interval->parent = right;
163          interval_insert(ctx, &right->children, interval);
164          return;
165       }
166    }
167 
168    if (!interval->parent)
169       ctx->interval_add(ctx, interval);
170    rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp);
171    interval->inserted = true;
172 }
173 
174 void
175 ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,
176                         struct ir3_reg_interval *interval)
177 {
178    rb_tree_init(&interval->children);
179    interval->parent = NULL;
180    interval_insert(ctx, &ctx->intervals, interval);
181 }
182 
183 /* Call after ir3_reg_interval_remove_temp() to reinsert the interval */
184 static void
185 ir3_reg_interval_reinsert(struct ir3_reg_ctx *ctx,
186                           struct ir3_reg_interval *interval)
187 {
188    interval->parent = NULL;
189    interval_insert(ctx, &ctx->intervals, interval);
190 }
191 
192 void
193 ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
194                         struct ir3_reg_interval *interval)
195 {
196    assert(interval->inserted);
197 
198    if (interval->parent) {
199       rb_tree_remove(&interval->parent->children, &interval->node);
200    } else {
201       ctx->interval_delete(ctx, interval);
202       rb_tree_remove(&ctx->intervals, &interval->node);
203    }
204 
205    rb_tree_foreach_safe (struct ir3_reg_interval, child, &interval->children,
206                          node) {
207       rb_tree_remove(&interval->children, &child->node);
208       child->parent = interval->parent;
209 
210       if (interval->parent) {
211          rb_tree_insert(&child->parent->children, &child->node,
212                         ir3_reg_interval_insert_cmp);
213       } else {
214          ctx->interval_readd(ctx, interval, child);
215          rb_tree_insert(&ctx->intervals, &child->node,
216                         ir3_reg_interval_insert_cmp);
217       }
218    }
219 
220    interval->inserted = false;
221 }
222 
223 static void
224 _mark_free(struct ir3_reg_interval *interval)
225 {
226    interval->inserted = false;
227    rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
228       _mark_free(child);
229    }
230 }
231 
232 /* Remove an interval and all its children from the tree. */
233 void
234 ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,
235                             struct ir3_reg_interval *interval)
236 {
237    assert(!interval->parent);
238 
239    ctx->interval_delete(ctx, interval);
240    rb_tree_remove(&ctx->intervals, &interval->node);
241    _mark_free(interval);
242 }
243 
244 /* Used when popping an interval to be shuffled around. Don't disturb children
245  * so that it can be later reinserted.
246  */
247 static void
248 ir3_reg_interval_remove_temp(struct ir3_reg_ctx *ctx,
249                              struct ir3_reg_interval *interval)
250 {
251    assert(!interval->parent);
252 
253    ctx->interval_delete(ctx, interval);
254    rb_tree_remove(&ctx->intervals, &interval->node);
255 }
256 
257 static void
258 interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval,
259               unsigned indent)
260 {
261    for (unsigned i = 0; i < indent; i++)
262       mesa_log_stream_printf(stream, "\t");
263    mesa_log_stream_printf(stream, "reg %u start %u\n", interval->reg->name,
264                           interval->reg->interval_start);
265 
266    rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
267       interval_dump(stream, child, indent + 1);
268    }
269 
270    for (unsigned i = 0; i < indent; i++)
271       mesa_log_stream_printf(stream, "\t");
272    mesa_log_stream_printf(stream, "reg %u end %u\n", interval->reg->name,
273                           interval->reg->interval_end);
274 }
275 
276 void
277 ir3_reg_interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval)
278 {
279    interval_dump(stream, interval, 0);
280 }
281 
282 /* These are the core data structures used by the register allocator. First
283  * ra_interval and ra_file, which are used for intra-block tracking and use
284  * the ir3_reg_ctx infrastructure:
285  */
286 
287 struct ra_interval {
288    struct ir3_reg_interval interval;
289 
290    struct rb_node physreg_node;
291    physreg_t physreg_start, physreg_end;
292 
293    /* True if this is a source of the current instruction which is entirely
294     * killed. This means we can allocate the dest over it, but we can't break
295     * it up.
296     */
297    bool is_killed;
298 
299    /* True if this interval cannot be moved from its position. This is only
300     * used for precolored inputs to ensure that other inputs don't get
301     * allocated on top of them.
302     */
303    bool frozen;
304 };
305 
306 struct ra_file {
307    struct ir3_reg_ctx reg_ctx;
308 
309    BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
310    BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
311 
312    struct rb_tree physreg_intervals;
313 
314    unsigned size;
315    unsigned start;
316 };
317 
318 /* State for inter-block tracking. When we split a live range to make space
319  * for a vector, we may need to insert fixup code when a block has multiple
320  * predecessors that have moved the same live value to different registers.
321  * This keeps track of state required to do that.
322  */
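/* For example (register numbers here are hypothetical): if one predecessor
 * block leaves a value in r0.x while another predecessor has shuffled the
 * same value into r2.y, the join point needs fixup moves so that both edges
 * agree on which register the value lives in on entry.
 */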
323 
324 struct ra_block_state {
325    /* Map of defining ir3_register -> physreg it was allocated to at the end
326     * of the block.
327     */
328    struct hash_table *renames;
329 
330    /* For loops, we need to process a block before all its predecessors have
331     * been processed. In particular, we need to pick registers for values
332     * without knowing if all the predecessors have been renamed. This keeps
333     * track of the registers we chose so that when we visit the back-edge we
334     * can move them appropriately. If all predecessors have been visited
335     * before this block is visited then we don't need to fill this out. This
336     * is a map from ir3_register -> physreg.
337     */
338    struct hash_table *entry_regs;
339 
340    /* True if the block has been visited and "renames" is complete.
341     */
342    bool visited;
343 };
344 
345 struct ra_parallel_copy {
346    struct ra_interval *interval;
347    physreg_t src;
348 };
349 
350 /* The main context: */
351 
352 struct ra_ctx {
353    /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom
354     * half of this file too.
355     */
356    struct ra_file full;
357 
358    /* hr0.x - hr63.w, only used without merged-regs. */
359    struct ra_file half;
360 
361    /* Shared regs. */
362    struct ra_file shared;
363 
364    struct ir3_liveness *live;
365 
366    struct ir3_block *block;
367 
368    const struct ir3_compiler *compiler;
369    gl_shader_stage stage;
370 
371    /* Pending moves of top-level intervals that will be emitted once we're
372     * finished:
373     */
374    DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies);
375 
376    struct ra_interval *intervals;
377    struct ra_block_state *blocks;
378 
379    bool merged_regs;
380 };
381 
382 #define foreach_interval(interval, file)                                       \
383    rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals,  \
384                     physreg_node)
385 #define foreach_interval_rev(interval, file)                                   \
386    rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals,  \
387                     physreg_node)
388 #define foreach_interval_safe(interval, file)                                  \
389    rb_tree_foreach_safe (struct ra_interval, interval,                         \
390                          &(file)->physreg_intervals, physreg_node)
391 #define foreach_interval_rev_safe(interval, file)                              \
392    rb_tree_foreach_rev_safe(struct ra_interval, interval,                      \
393                             &(file)->physreg_intervals, physreg_node)
394 
395 static struct ra_interval *
396 rb_node_to_interval(struct rb_node *node)
397 {
398    return rb_node_data(struct ra_interval, node, physreg_node);
399 }
400 
401 static const struct ra_interval *
402 rb_node_to_interval_const(const struct rb_node *node)
403 {
404    return rb_node_data(struct ra_interval, node, physreg_node);
405 }
406 
407 static struct ra_interval *
408 ra_interval_next(struct ra_interval *interval)
409 {
410    struct rb_node *next = rb_node_next(&interval->physreg_node);
411    return next ? rb_node_to_interval(next) : NULL;
412 }
413 
414 static struct ra_interval *
415 ra_interval_next_or_null(struct ra_interval *interval)
416 {
417    return interval ? ra_interval_next(interval) : NULL;
418 }
419 
420 static int
421 ra_interval_cmp(const struct rb_node *node, const void *data)
422 {
423    physreg_t reg = *(const physreg_t *)data;
424    const struct ra_interval *interval = rb_node_to_interval_const(node);
425    if (interval->physreg_start > reg)
426       return -1;
427    else if (interval->physreg_end <= reg)
428       return 1;
429    else
430       return 0;
431 }
432 
433 static struct ra_interval *
434 ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
435 {
436    struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
437    return node ? rb_node_to_interval(node) : NULL;
438 }
439 
440 /* Get the interval covering the reg, or the closest to the right if it
441  * doesn't exist.
442  */
443 static struct ra_interval *
444 ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
445 {
446    struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
447    if (!interval) {
448       return NULL;
449    } else if (interval->physreg_end > reg) {
450       return interval;
451    } else {
452       /* There is no interval covering reg, and ra_interval_search_sloppy()
453        * returned the closest range to the left, so the next interval to the
454        * right should be the closest to the right.
455        */
456       return ra_interval_next_or_null(interval);
457    }
458 }
459 
460 static struct ra_interval *
461 ra_file_search_right(struct ra_file *file, physreg_t reg)
462 {
463    return ra_interval_search_right(&file->physreg_intervals, reg);
464 }
465 
466 static int
467 ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
468 {
469    const struct ra_interval *a = rb_node_to_interval_const(_a);
470    const struct ra_interval *b = rb_node_to_interval_const(_b);
471    return b->physreg_start - a->physreg_start;
472 }
473 
474 static struct ra_interval *
475 ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
476 {
477    return rb_node_data(struct ra_interval, interval, interval);
478 }
479 
480 static struct ra_file *
481 ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx)
482 {
483    return rb_node_data(struct ra_file, ctx, reg_ctx);
484 }
485 
486 static void
487 interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
488 {
489    struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
490    struct ra_file *file = ir3_reg_ctx_to_file(ctx);
491 
492    /* We can assume in this case that physreg_start/physreg_end is already
493     * initialized.
494     */
495    for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
496       BITSET_CLEAR(file->available, i);
497       BITSET_CLEAR(file->available_to_evict, i);
498    }
499 
500    rb_tree_insert(&file->physreg_intervals, &interval->physreg_node,
501                   ra_interval_insert_cmp);
502 }
503 
504 static void
505 interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
506 {
507    struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
508    struct ra_file *file = ir3_reg_ctx_to_file(ctx);
509 
510    for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
511       BITSET_SET(file->available, i);
512       BITSET_SET(file->available_to_evict, i);
513    }
514 
515    rb_tree_remove(&file->physreg_intervals, &interval->physreg_node);
516 }
517 
518 static void
519 interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
520                struct ir3_reg_interval *_child)
521 {
522    struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
523    struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);
524 
525    child->physreg_start =
526       parent->physreg_start + (child->interval.reg->interval_start -
527                                parent->interval.reg->interval_start);
528    child->physreg_end =
529       child->physreg_start +
530       (child->interval.reg->interval_end - child->interval.reg->interval_start);
531 
532    interval_add(ctx, _child);
533 }
534 
535 static void
536 ra_file_init(struct ra_file *file)
537 {
538    for (unsigned i = 0; i < file->size; i++) {
539       BITSET_SET(file->available, i);
540       BITSET_SET(file->available_to_evict, i);
541    }
542 
543    rb_tree_init(&file->reg_ctx.intervals);
544    rb_tree_init(&file->physreg_intervals);
545 
546    file->reg_ctx.interval_add = interval_add;
547    file->reg_ctx.interval_delete = interval_delete;
548    file->reg_ctx.interval_readd = interval_readd;
549 }
550 
551 static void
552 ra_file_insert(struct ra_file *file, struct ra_interval *interval)
553 {
554    assert(interval->physreg_start < interval->physreg_end);
555    assert(interval->physreg_end <= file->size);
556    if (interval->interval.reg->flags & IR3_REG_HALF)
557       assert(interval->physreg_end <= RA_HALF_SIZE);
558 
559    ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
560 }
561 
562 static void
563 ra_file_remove(struct ra_file *file, struct ra_interval *interval)
564 {
565    ir3_reg_interval_remove(&file->reg_ctx, &interval->interval);
566 }
567 
568 static void
569 ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval)
570 {
571    assert(!interval->interval.parent);
572 
573    for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
574       BITSET_SET(file->available, i);
575    }
576 
577    interval->is_killed = true;
578 }
579 
580 static void
581 ra_file_unmark_killed(struct ra_file *file, struct ra_interval *interval)
582 {
583    assert(!interval->interval.parent);
584 
585    for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
586       BITSET_CLEAR(file->available, i);
587    }
588 
589    interval->is_killed = false;
590 }
591 
592 static physreg_t
593 ra_interval_get_physreg(const struct ra_interval *interval)
594 {
595    unsigned child_start = interval->interval.reg->interval_start;
596 
597    while (interval->interval.parent) {
598       interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
599    }
600 
601    return interval->physreg_start +
602           (child_start - interval->interval.reg->interval_start);
603 }
604 
605 static unsigned
606 ra_interval_get_num(const struct ra_interval *interval)
607 {
608    return ra_physreg_to_num(ra_interval_get_physreg(interval),
609                             interval->interval.reg->flags);
610 }
611 
612 static void
613 ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
614 {
615    ir3_reg_interval_init(&interval->interval, reg);
616    interval->is_killed = false;
617    interval->frozen = false;
618 }
619 
620 static void
621 ra_interval_dump(struct log_stream *stream, struct ra_interval *interval)
622 {
623    mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start);
624 
625    ir3_reg_interval_dump(stream, &interval->interval);
626 }
627 
628 static void
629 ra_file_dump(struct log_stream *stream, struct ra_file *file)
630 {
631    rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
632                     physreg_node) {
633       ra_interval_dump(stream, interval);
634    }
635 
636    unsigned start, end;
637    mesa_log_stream_printf(stream, "available:\n");
638    BITSET_FOREACH_RANGE (start, end, file->available, file->size) {
639       mesa_log_stream_printf(stream, "%u-%u ", start, end);
640    }
641    mesa_log_stream_printf(stream, "\n");
642 
643    mesa_log_stream_printf(stream, "available to evict:\n");
644    BITSET_FOREACH_RANGE (start, end, file->available_to_evict, file->size) {
645       mesa_log_stream_printf(stream, "%u-%u ", start, end);
646    }
647    mesa_log_stream_printf(stream, "\n");
648    mesa_log_stream_printf(stream, "start: %u\n", file->start);
649 }
650 
651 static void
652 ra_ctx_dump(struct ra_ctx *ctx)
653 {
654    struct log_stream *stream = mesa_log_streami();
655    mesa_log_stream_printf(stream, "full:\n");
656    ra_file_dump(stream, &ctx->full);
657    mesa_log_stream_printf(stream, "half:\n");
658    ra_file_dump(stream, &ctx->half);
659    mesa_log_stream_printf(stream, "shared:");
660    ra_file_dump(stream, &ctx->shared);
661    mesa_log_stream_destroy(stream);
662 }
663 
664 static unsigned
665 reg_file_size(struct ra_file *file, struct ir3_register *reg)
666 {
667    /* Half-regs can only take up the first half of the combined regfile */
668    if (reg->flags & IR3_REG_HALF)
669       return MIN2(file->size, RA_HALF_SIZE);
670    else
671       return file->size;
672 }
673 
674 /* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple
675  * top-level intervals at once. Pop multiple intervals, then push them back in
676  * any order.
677  */
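/* A sketch of the intended usage (variable names are placeholders), mirroring
 * the swap performed later in try_evict_regs():
 *
 *    struct ra_removed_interval a = ra_pop_interval(ctx, file, interval_a);
 *    struct ra_removed_interval b = ra_pop_interval(ctx, file, interval_b);
 *    ra_push_interval(ctx, file, &a, b_start);
 *    ra_push_interval(ctx, file, &b, a_start);
 */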
678 
679 struct ra_removed_interval {
680    struct ra_interval *interval;
681    unsigned size;
682 };
683 
684 static struct ra_removed_interval
685 ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,
686                 struct ra_interval *interval)
687 {
688    assert(!interval->interval.parent);
689    /* shared live splitting is not allowed! */
690    assert(!(interval->interval.reg->flags & IR3_REG_SHARED));
691 
692    /* Check if we've already moved this reg before */
693    unsigned pcopy_index;
694    for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count;
695         pcopy_index++) {
696       if (ctx->parallel_copies[pcopy_index].interval == interval)
697          break;
698    }
699 
700    if (pcopy_index == ctx->parallel_copies_count) {
701       array_insert(ctx, ctx->parallel_copies,
702                    (struct ra_parallel_copy){
703                       .interval = interval,
704                       .src = interval->physreg_start,
705                    });
706    }
707 
708    ir3_reg_interval_remove_temp(&file->reg_ctx, &interval->interval);
709 
710    return (struct ra_removed_interval){
711       .interval = interval,
712       .size = interval->physreg_end - interval->physreg_start,
713    };
714 }
715 
716 static void
717 ra_push_interval(struct ra_ctx *ctx, struct ra_file *file,
718                  const struct ra_removed_interval *removed, physreg_t dst)
719 {
720    struct ra_interval *interval = removed->interval;
721 
722    interval->physreg_start = dst;
723    interval->physreg_end = dst + removed->size;
724 
725    assert(interval->physreg_end <= file->size);
726    if (interval->interval.reg->flags & IR3_REG_HALF)
727       assert(interval->physreg_end <= RA_HALF_SIZE);
728 
729    ir3_reg_interval_reinsert(&file->reg_ctx, &interval->interval);
730 }
731 
732 /* Pick up the interval and place it at "dst". */
733 static void
734 ra_move_interval(struct ra_ctx *ctx, struct ra_file *file,
735                  struct ra_interval *interval, physreg_t dst)
736 {
737    struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval);
738    ra_push_interval(ctx, file, &temp, dst);
739 }
740 
741 static struct ra_file *
742 ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
743 {
744    if (reg->flags & IR3_REG_SHARED)
745       return &ctx->shared;
746    else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
747       return &ctx->full;
748    else
749       return &ctx->half;
750 }
751 
752 
753 /* Returns true if the proposed spot for "dst" or a killed source overlaps a
754  * destination that's been allocated.
755  */
756 static bool
757 check_dst_overlap(struct ra_ctx *ctx, struct ra_file *file,
758                   struct ir3_register *dst, physreg_t start,
759                   physreg_t end)
760 {
761    struct ir3_instruction *instr = dst->instr;
762 
763    ra_foreach_dst (other_dst, instr) {
764       /* We assume only destinations before the current one have been allocated.
765        */
766       if (other_dst == dst)
767          break;
768 
769       if (ra_get_file(ctx, other_dst) != file)
770          continue;
771 
772       struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
773       assert(!other_interval->interval.parent);
774       physreg_t other_start = other_interval->physreg_start;
775       physreg_t other_end = other_interval->physreg_end;
776 
777       if (other_end > start && end > other_start)
778          return true;
779    }
780 
781    return false;
782 }
783 
784 /* True if the destination is "early-clobber," meaning that it cannot be
785  * allocated over killed sources. Some destinations always require it, but it
786  * also is implicitly true for tied destinations whose source is live-through.
787  * If the source is killed, then we skip allocating a register for the
788  * destination altogether so we don't need to worry about that case here.
789  */
790 static bool
791 is_early_clobber(struct ir3_register *reg)
792 {
793    return (reg->flags & IR3_REG_EARLY_CLOBBER) || reg->tied;
794 }
795 
796 static bool
797 get_reg_specified(struct ra_ctx *ctx, struct ra_file *file,
798                   struct ir3_register *reg, physreg_t physreg, bool is_source)
799 {
800    for (unsigned i = 0; i < reg_size(reg); i++) {
801       if (!BITSET_TEST(is_early_clobber(reg) || is_source ?
802                            file->available_to_evict : file->available,
803                        physreg + i))
804          return false;
805    }
806 
807    if (!is_source &&
808        check_dst_overlap(ctx, file, reg, physreg, physreg + reg_size(reg)))
809       return false;
810 
811    return true;
812 }
813 
814 /* Try to evict any registers conflicting with the proposed spot "physreg" for
815  * "reg". That is, move them to other places so that we can allocate "physreg"
816  * here.
817  */
818 
819 static bool
820 try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
821                struct ir3_register *reg, physreg_t physreg,
822                unsigned *_eviction_count, bool is_source, bool speculative)
823 {
824    BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
825    memcpy(available_to_evict, file->available_to_evict,
826           sizeof(available_to_evict));
827 
828    BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
829    memcpy(available, file->available, sizeof(available));
830 
831    for (unsigned i = 0; i < reg_size(reg); i++) {
832       BITSET_CLEAR(available_to_evict, physreg + i);
833       BITSET_CLEAR(available, physreg + i);
834    }
835 
836    unsigned eviction_count = 0;
837    /* Iterate over each range conflicting with physreg */
838    for (struct ra_interval *conflicting = ra_file_search_right(file, physreg),
839                            *next = ra_interval_next_or_null(conflicting);
840         conflicting != NULL &&
841         conflicting->physreg_start < physreg + reg_size(reg);
842         conflicting = next, next = ra_interval_next_or_null(next)) {
843       if (!is_early_clobber(reg) && !is_source && conflicting->is_killed)
844          continue;
845 
846       if (conflicting->frozen) {
847          assert(speculative);
848          return false;
849       }
850 
851       unsigned conflicting_file_size =
852          reg_file_size(file, conflicting->interval.reg);
853       unsigned avail_start, avail_end;
854       bool evicted = false;
855       BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict,
856                             conflicting_file_size) {
857          unsigned size = avail_end - avail_start;
858 
859          /* non-half registers must be aligned */
860          if (!(conflicting->interval.reg->flags & IR3_REG_HALF) &&
861              avail_start % 2 == 1) {
862             avail_start++;
863             size--;
864          }
865 
866          unsigned conflicting_size =
867             conflicting->physreg_end - conflicting->physreg_start;
868          if (size >= conflicting_size &&
869              !check_dst_overlap(ctx, file, reg, avail_start, avail_start +
870                                 conflicting_size)) {
871             for (unsigned i = 0;
872                  i < conflicting->physreg_end - conflicting->physreg_start; i++)
873                BITSET_CLEAR(available_to_evict, avail_start + i);
874             eviction_count +=
875                conflicting->physreg_end - conflicting->physreg_start;
876             if (!speculative)
877                ra_move_interval(ctx, file, conflicting, avail_start);
878             evicted = true;
879             break;
880          }
881       }
882 
883       if (evicted)
884          continue;
885 
886       /* If we couldn't evict this range, but the register we're allocating is
887        * allowed to overlap with a killed range, then we may be able to swap it
888        * with a killed range to achieve the same effect.
889        */
890       if (is_early_clobber(reg) || is_source)
891          return false;
892 
893       foreach_interval (killed, file) {
894          if (!killed->is_killed)
895             continue;
896 
897          if (killed->physreg_end - killed->physreg_start !=
898              conflicting->physreg_end - conflicting->physreg_start)
899             continue;
900 
901          if (killed->physreg_end > conflicting_file_size ||
902              conflicting->physreg_end > reg_file_size(file, killed->interval.reg))
903             continue;
904 
905          /* We can't swap the killed range if it partially/fully overlaps the
906           * space we're trying to allocate or (in speculative mode) if it's
907           * already been swapped and will overlap when we actually evict.
908           */
909          bool killed_available = true;
910          for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
911             if (!BITSET_TEST(available, i)) {
912                killed_available = false;
913                break;
914             }
915          }
916 
917          if (!killed_available)
918             continue;
919 
920          if (check_dst_overlap(ctx, file, reg, killed->physreg_start,
921                                killed->physreg_end))
922             continue;
923 
924          /* Check for alignment if one is a full reg */
925          if ((!(killed->interval.reg->flags & IR3_REG_HALF) ||
926               !(conflicting->interval.reg->flags & IR3_REG_HALF)) &&
927              (killed->physreg_start % 2 != 0 ||
928               conflicting->physreg_start % 2 != 0))
929             continue;
930 
931          for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
932             BITSET_CLEAR(available, i);
933          }
934          /* Because this will generate swaps instead of moves, multiply the
935           * cost by 2.
936           */
937          eviction_count += (killed->physreg_end - killed->physreg_start) * 2;
938          if (!speculative) {
939             physreg_t killed_start = killed->physreg_start,
940                       conflicting_start = conflicting->physreg_start;
941             struct ra_removed_interval killed_removed =
942                ra_pop_interval(ctx, file, killed);
943             struct ra_removed_interval conflicting_removed =
944                ra_pop_interval(ctx, file, conflicting);
945             ra_push_interval(ctx, file, &killed_removed, conflicting_start);
946             ra_push_interval(ctx, file, &conflicting_removed, killed_start);
947          }
948 
949          evicted = true;
950          break;
951       }
952 
953       if (!evicted)
954          return false;
955    }
956 
957    *_eviction_count = eviction_count;
958    return true;
959 }
960 
961 static int
962 removed_interval_cmp(const void *_i1, const void *_i2)
963 {
964    const struct ra_removed_interval *i1 = _i1;
965    const struct ra_removed_interval *i2 = _i2;
966 
967    /* We sort the registers as follows:
968     *
969     * |------------------------------------------------------------------------------------------|
970     * |               |                    |        |        |                    |              |
971     * |  Half         | Half early-clobber | Half   | Full   | Full early-clobber | Full         |
972     * |  live-through | destination        | killed | killed | destination        | live-through |
973     * |               |                    |        |        |                    |              |
974     * |------------------------------------------------------------------------------------------|
975     *                                      |                 |
976     *                                      |   Destination   |
977     *                                      |                 |
978     *                                      |-----------------|
979     *
980     * Half-registers have to be first so that they stay in the low half of
981     * the register file. Then half and full killed must stay together so that
982     * there's a contiguous range where we can put the register. With this
983     * structure we should be able to accommodate any collection of intervals
984     * such that the total number of half components is within the half limit
985     * and the combined components are within the full limit.
986     */
987 
988    unsigned i1_align = reg_elem_size(i1->interval->interval.reg);
989    unsigned i2_align = reg_elem_size(i2->interval->interval.reg);
990    if (i1_align > i2_align)
991       return 1;
992    if (i1_align < i2_align)
993       return -1;
994 
995    if (i1_align == 1) {
996       if (i2->interval->is_killed)
997          return -1;
998       if (i1->interval->is_killed)
999          return 1;
1000    } else {
1001       if (i2->interval->is_killed)
1002          return 1;
1003       if (i1->interval->is_killed)
1004          return -1;
1005    }
1006 
1007    return 0;
1008 }
1009 
1010 static int
1011 dsts_cmp(const void *_i1, const void *_i2)
1012 {
1013    struct ir3_register *i1 = *(struct ir3_register *const *) _i1;
1014    struct ir3_register *i2 = *(struct ir3_register *const *) _i2;
1015 
1016    /* Treat tied destinations as if they are live-through sources, and normal
1017     * destinations as killed sources.
1018     */
1019    unsigned i1_align = reg_elem_size(i1);
1020    unsigned i2_align = reg_elem_size(i2);
1021    if (i1_align > i2_align)
1022       return 1;
1023    if (i1_align < i2_align)
1024       return -1;
1025 
1026    if (i1_align == 1) {
1027       if (!is_early_clobber(i2))
1028          return -1;
1029       if (!is_early_clobber(i1))
1030          return 1;
1031    } else {
1032       if (!is_early_clobber(i2))
1033          return 1;
1034       if (!is_early_clobber(i1))
1035          return -1;
1036    }
1037 
1038    return 0;
1039 }
1040 
1041 /* "Compress" all the live intervals so that there is enough space for the
1042  * destination register. As there can be gaps when a more-aligned interval
1043  * follows a less-aligned interval, this also sorts them to remove such
1044  * "padding", which may be required when space is very tight.  This isn't
1045  * amazing, but should be used only as a last resort in case the register file
1046  * is almost full and badly fragmented.
1047  *
1048  * Return the physreg to use.
1049  */
1050 static physreg_t
1051 compress_regs_left(struct ra_ctx *ctx, struct ra_file *file,
1052                    struct ir3_register *reg)
1053 {
1054    unsigned reg_align = reg_elem_size(reg);
1055    DECLARE_ARRAY(struct ra_removed_interval, intervals);
1056    intervals_count = intervals_sz = 0;
1057    intervals = NULL;
1058 
1059    DECLARE_ARRAY(struct ir3_register *, dsts);
1060    dsts_count = dsts_sz = 0;
1061    dsts = NULL;
1062    array_insert(ctx, dsts, reg);
1063    bool dst_inserted[reg->instr->dsts_count];
1064 
1065    unsigned dst_size = reg->tied ? 0 : reg_size(reg);
1066    unsigned ec_dst_size = is_early_clobber(reg) ? reg_size(reg) : 0;
1067    unsigned half_dst_size = 0, ec_half_dst_size = 0;
1068    if (reg_align == 1) {
1069       half_dst_size = dst_size;
1070       ec_half_dst_size = ec_dst_size;
1071    }
1072 
1073    unsigned removed_size = 0, removed_half_size = 0;
1074    unsigned removed_killed_size = 0, removed_killed_half_size = 0;
1075    unsigned file_size =
1076       reg_align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;
1077    physreg_t start_reg = 0;
1078 
1079    foreach_interval_rev_safe (interval, file) {
1080       /* We'll check if we can compact the intervals starting here. */
1081       physreg_t candidate_start = interval->physreg_end;
1082 
1083       /* Check if there are any other destinations we need to compact. */
1084       ra_foreach_dst_n (other_dst, n, reg->instr) {
1085          if (other_dst == reg)
1086             break;
1087          if (ra_get_file(ctx, other_dst) != file)
1088             continue;
1089          if (dst_inserted[n])
1090             continue;
1091 
1092          struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
1093          /* if the destination partially overlaps this interval, we need to
1094           * extend candidate_start to the end.
1095           */
1096          if (other_interval->physreg_start < candidate_start) {
1097             candidate_start = MAX2(candidate_start,
1098                                    other_interval->physreg_end);
1099             continue;
1100          }
1101 
1102          dst_inserted[n] = true;
1103 
1104          /* dst intervals with a tied killed source are considered attached to
1105           * that source. Don't actually insert them. This means we have to
1106           * update them below if their tied source moves.
1107           */
1108          if (other_dst->tied) {
1109             struct ra_interval *tied_interval =
1110                &ctx->intervals[other_dst->tied->def->name];
1111             if (tied_interval->is_killed)
1112                continue;
1113          }
1114 
1115          d("popping destination %u physreg %u\n",
1116            other_interval->interval.reg->name,
1117            other_interval->physreg_start);
1118 
1119          array_insert(ctx, dsts, other_dst);
1120          unsigned interval_size = reg_size(other_dst);
1121          if (is_early_clobber(other_dst)) {
1122             ec_dst_size += interval_size;
1123             if (other_interval->interval.reg->flags & IR3_REG_HALF)
1124                ec_half_dst_size += interval_size;
1125          } else {
1126             dst_size += interval_size;
1127             if (other_interval->interval.reg->flags & IR3_REG_HALF)
1128                half_dst_size += interval_size;
1129          }
1130       }
1131 
1132       /* Check if we can sort the intervals *after* this one and have enough
1133        * space leftover to accommodate all intervals, keeping in mind that killed
1134        * sources overlap non-tied destinations. Also check that we have enough
1135        * space leftover for half-registers, if we're inserting a half-register
1136        * (otherwise we only shift any half-registers down so they should be
1137        * safe).
1138        */
1139       if (candidate_start + removed_size + ec_dst_size +
1140           MAX2(removed_killed_size, dst_size) <= file->size &&
1141           (reg_align != 1 ||
1142            candidate_start + removed_half_size + ec_half_dst_size +
1143            MAX2(removed_killed_half_size, half_dst_size) <= file_size)) {
1144          start_reg = candidate_start;
1145          break;
1146       }
1147 
1148       /* We assume that all frozen intervals are at the start and that we
1149        * can avoid popping them.
1150        */
1151       assert(!interval->frozen);
1152 
1153       /* Killed sources are different because they go at the end and can
1154        * overlap the register we're trying to add.
1155        */
1156       unsigned interval_size = interval->physreg_end - interval->physreg_start;
1157       if (interval->is_killed) {
1158          removed_killed_size += interval_size;
1159          if (interval->interval.reg->flags & IR3_REG_HALF)
1160             removed_killed_half_size += interval_size;
1161       } else {
1162          removed_size += interval_size;
1163          if (interval->interval.reg->flags & IR3_REG_HALF)
1164             removed_half_size += interval_size;
1165       }
1166 
1167       /* Now that we've done the accounting, pop this off */
1168       d("popping interval %u physreg %u%s\n", interval->interval.reg->name,
1169         interval->physreg_start, interval->is_killed ? ", killed" : "");
1170       array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));
1171    }
1172 
1173    /* TODO: In addition to skipping registers at the beginning that are
1174     * well-packed, we should try to skip registers at the end.
1175     */
1176 
1177    qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);
1178    qsort(dsts, dsts_count, sizeof(*dsts), dsts_cmp);
1179 
1180    physreg_t live_reg = start_reg;
1181    physreg_t dst_reg = (physreg_t)~0;
1182    physreg_t ret_reg = (physreg_t)~0;
1183    unsigned dst_index = 0;
1184    unsigned live_index = 0;
1185 
1186    /* We have two lists of intervals to process, live intervals and destination
1187     * intervals. Process them in the order of the diagram in removed_interval_cmp().
1188     */
1189    while (live_index < intervals_count || dst_index < dsts_count) {
1190       bool process_dst;
1191       if (live_index == intervals_count) {
1192          process_dst = true;
1193       } else if (dst_index == dsts_count) {
1194          process_dst = false;
1195       } else {
1196          struct ir3_register *dst = dsts[dst_index];
1197          struct ra_interval *live_interval = intervals[live_index].interval;
1198 
1199          bool live_half = live_interval->interval.reg->flags & IR3_REG_HALF;
1200          bool live_killed = live_interval->is_killed;
1201          bool dst_half = dst->flags & IR3_REG_HALF;
1202          bool dst_early_clobber = is_early_clobber(dst);
1203 
1204          if (live_half && !live_killed) {
1205             /* far-left of diagram. */
1206             process_dst = false;
1207          } else if (dst_half && dst_early_clobber) {
1208             /* mid-left of diagram. */
1209             process_dst = true;
1210          } else if (!dst_early_clobber) {
1211             /* bottom of diagram. */
1212             process_dst = true;
1213          } else if (live_killed) {
1214             /* middle of diagram. */
1215             process_dst = false;
1216          } else if (!dst_half && dst_early_clobber) {
1217             /* mid-right of diagram. */
1218             process_dst = true;
1219          } else {
1220             /* far right of diagram. */
1221             assert(!live_killed && !live_half);
1222             process_dst = false;
1223          }
1224       }
1225 
1226       struct ir3_register *cur_reg =
1227          process_dst ? dsts[dst_index] :
1228          intervals[live_index].interval->interval.reg;
1229 
1230       physreg_t physreg;
1231       if (process_dst && !is_early_clobber(cur_reg)) {
1232          if (dst_reg == (physreg_t)~0)
1233             dst_reg = live_reg;
1234          physreg = dst_reg;
1235       } else {
1236          physreg = live_reg;
1237          struct ra_interval *live_interval = intervals[live_index].interval;
1238          bool live_killed = live_interval->is_killed;
1239          /* If this is live-through and we've processed the destinations, we
1240           * need to make sure we take into account any overlapping destinations.
1241           */
1242          if (!live_killed && dst_reg != (physreg_t)~0)
1243             physreg = MAX2(physreg, dst_reg);
1244       }
1245 
1246       if (!(cur_reg->flags & IR3_REG_HALF))
1247          physreg = ALIGN(physreg, 2);
1248 
1249       d("pushing reg %u physreg %u\n", cur_reg->name, physreg);
1250 
1251       unsigned interval_size = reg_size(cur_reg);
1252       if (physreg + interval_size >
1253           reg_file_size(file, cur_reg)) {
1254          d("ran out of room for interval %u!\n",
1255            cur_reg->name);
1256          unreachable("reg pressure calculation was wrong!");
1257          return 0;
1258       }
1259 
1260       if (process_dst) {
1261          if (cur_reg == reg) {
1262             ret_reg = physreg;
1263          } else {
1264             struct ra_interval *interval = &ctx->intervals[cur_reg->name];
1265             interval->physreg_start = physreg;
1266             interval->physreg_end = physreg + interval_size;
1267          }
1268          dst_index++;
1269       } else {
1270          ra_push_interval(ctx, file, &intervals[live_index], physreg);
1271          live_index++;
1272       }
1273 
1274       physreg += interval_size;
1275 
1276       if (process_dst && !is_early_clobber(cur_reg)) {
1277          dst_reg = physreg;
1278       } else {
1279          live_reg = physreg;
1280       }
1281    }
1282 
1283    /* If we shuffled around a tied source that is killed, we may have to update
1284     * its corresponding destination since we didn't insert it above.
1285     */
1286    ra_foreach_dst (dst, reg->instr) {
1287       if (dst == reg)
1288          break;
1289 
1290       struct ir3_register *tied = dst->tied;
1291       if (!tied)
1292          continue;
1293 
1294       struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1295       if (!tied_interval->is_killed)
1296          continue;
1297 
1298       struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1299       unsigned dst_size = reg_size(dst);
1300       dst_interval->physreg_start = ra_interval_get_physreg(tied_interval);
1301       dst_interval->physreg_end = dst_interval->physreg_start + dst_size;
1302    }
1303 
1304    return ret_reg;
1305 }
1306 
1307 static void
1308 update_affinity(struct ra_file *file, struct ir3_register *reg,
1309                 physreg_t physreg)
1310 {
1311    if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0)
1312       return;
1313 
1314    if (physreg < reg->merge_set_offset)
1315       return;
1316 
1317    if ((physreg - reg->merge_set_offset + reg->merge_set->size) > file->size)
1318       return;
1319 
1320    reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;
1321 }
1322 
1323 /* Try to find free space for a register without shuffling anything. This uses
1324  * a round-robin algorithm to reduce false dependencies.
1325  */
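/* As a rough example (numbers are illustrative): if the previous allocation
 * left file->start at 6, a search for a 4-component, 2-aligned value starts
 * probing at physreg 6 and wraps around past the end of the file back to 0,
 * instead of always scanning from the bottom of the file.
 */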
1326 static physreg_t
1327 find_best_gap(struct ra_ctx *ctx, struct ra_file *file,
1328               struct ir3_register *dst, unsigned file_size, unsigned size,
1329               unsigned alignment)
1330 {
1331    /* This can happen if we create a very large merge set. Just bail out in that
1332     * case.
1333     */
1334    if (size > file_size)
1335       return (physreg_t) ~0;
1336 
1337    BITSET_WORD *available =
1338       is_early_clobber(dst) ? file->available_to_evict : file->available;
1339 
1340    unsigned start = ALIGN(file->start, alignment) % (file_size - size + alignment);
1341    unsigned candidate = start;
1342    do {
1343       bool is_available = true;
1344       for (unsigned i = 0; i < size; i++) {
1345          if (!BITSET_TEST(available, candidate + i)) {
1346             is_available = false;
1347             break;
1348          }
1349       }
1350 
1351       if (is_available) {
1352          is_available =
1353             !check_dst_overlap(ctx, file, dst, candidate, candidate + size);
1354       }
1355 
1356       if (is_available) {
1357          file->start = (candidate + size) % file_size;
1358          return candidate;
1359       }
1360 
1361       candidate += alignment;
1362       if (candidate + size > file_size)
1363          candidate = 0;
1364    } while (candidate != start);
1365 
1366    return (physreg_t)~0;
1367 }
1368 
1369 /* This is the main entrypoint for picking a register. Pick a free register
1370  * for "reg", shuffling around sources if necessary. In the normal case where
1371  * "is_source" is false, this register can overlap with killed sources
1372  * (intervals with "is_killed == true"). If "is_source" is true, then
1373  * is_killed is ignored and the register returned must not overlap with killed
1374  * sources. This must be used for tied registers, because we're actually
1375  * allocating the destination and the tied source at the same time.
1376  */
1377 
1378 static physreg_t
1379 get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg)
1380 {
1381    unsigned file_size = reg_file_size(file, reg);
1382    if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
1383       physreg_t preferred_reg =
1384          reg->merge_set->preferred_reg + reg->merge_set_offset;
1385       if (preferred_reg + reg_size(reg) <= file_size &&
1386           preferred_reg % reg_elem_size(reg) == 0 &&
1387           get_reg_specified(ctx, file, reg, preferred_reg, false))
1388          return preferred_reg;
1389    }
1390 
1391    /* If this register is a subset of a merge set which we have not picked a
1392     * register for, first try to allocate enough space for the entire merge
1393     * set.
1394     */
1395    unsigned size = reg_size(reg);
1396    if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
1397        size < reg->merge_set->size) {
1398       physreg_t best_reg = find_best_gap(ctx, file, reg, file_size,
1399                                          reg->merge_set->size,
1400                                          reg->merge_set->alignment);
1401       if (best_reg != (physreg_t)~0u) {
1402          best_reg += reg->merge_set_offset;
1403          return best_reg;
1404       }
1405    }
1406 
1407    /* For ALU and SFU instructions, if the src reg is available to pick, use
1408     * it, because this doesn't introduce unnecessary dependencies and it
1409     * potentially avoids needing (ss) syncs for write-after-read hazards on
1410     * SFU instructions.
1411     */
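   /* For example (illustrative): for "rcp ssa_2, (kill)ssa_1" with ssa_1
    * currently in r2.x, reusing r2.x for ssa_2 means no unrelated value gets
    * allocated into r2.x while the rcp is still in flight, so the
    * write-after-read hazard that would otherwise need an (ss) never arises.
    */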
1412    if (is_sfu(reg->instr) || is_alu(reg->instr)) {
1413       for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
1414          struct ir3_register *src = reg->instr->srcs[i];
1415          if (!ra_reg_is_src(src))
1416             continue;
1417          if (ra_get_file(ctx, src) == file && reg_size(src) >= size) {
1418             struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1419             physreg_t src_physreg = ra_interval_get_physreg(src_interval);
1420             if (src_physreg % reg_elem_size(reg) == 0 &&
1421                 src_physreg + size <= file_size &&
1422                 get_reg_specified(ctx, file, reg, src_physreg, false))
1423                return src_physreg;
1424          }
1425       }
1426    }
1427 
1428    physreg_t best_reg =
1429       find_best_gap(ctx, file, reg, file_size, size, reg_elem_size(reg));
1430    if (best_reg != (physreg_t)~0u) {
1431       return best_reg;
1432    }
1433 
1434    /* Ok, we couldn't find anything that fits. Here is where we have to start
1435     * moving things around to make stuff fit. First try solely evicting
1436     * registers in the way.
1437     */
1438    unsigned best_eviction_count = ~0;
1439    for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {
1440       unsigned eviction_count;
1441       if (try_evict_regs(ctx, file, reg, i, &eviction_count, false, true)) {
1442          if (eviction_count < best_eviction_count) {
1443             best_eviction_count = eviction_count;
1444             best_reg = i;
1445          }
1446       }
1447    }
1448 
1449    if (best_eviction_count != ~0) {
1450       ASSERTED bool result = try_evict_regs(
1451          ctx, file, reg, best_reg, &best_eviction_count, false, false);
1452       assert(result);
1453       return best_reg;
1454    }
1455 
1456    /* Use the dumb fallback only if try_evict_regs() fails. */
1457    return compress_regs_left(ctx, file, reg);
1458 }
1459 
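/* Write the allocated register number back into the ir3_register. A sketch of
 * the array case with made-up numbers: given base num 8 and array.offset == 2,
 * a direct access ends up with reg->num == 10, while a relative
 * (IR3_REG_RELATIV) access instead rebases array.offset from 2 to 10.
 */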
1460 static void
1461 assign_reg(struct ir3_instruction *instr, struct ir3_register *reg,
1462            unsigned num)
1463 {
1464    if (reg->flags & IR3_REG_ARRAY) {
1465       reg->array.base = num;
1466       if (reg->flags & IR3_REG_RELATIV)
1467          reg->array.offset += num;
1468       else
1469          reg->num = num + reg->array.offset;
1470    } else {
1471       reg->num = num;
1472    }
1473 }
1474 
1475 static void
1476 mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src)
1477 {
1478    struct ra_interval *interval = &ctx->intervals[src->def->name];
1479 
1480    if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed ||
1481        interval->interval.parent ||
1482        !rb_tree_is_empty(&interval->interval.children))
1483       return;
1484 
1485    ra_file_mark_killed(ra_get_file(ctx, src), interval);
1486 }
1487 
1488 static void
1489 insert_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1490 {
1491    struct ra_file *file = ra_get_file(ctx, dst);
1492    struct ra_interval *interval = &ctx->intervals[dst->name];
1493 
1494    d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval));
1495 
1496    if (!(dst->flags & IR3_REG_UNUSED))
1497       ra_file_insert(file, interval);
1498 
1499    assign_reg(dst->instr, dst, ra_interval_get_num(interval));
1500 }
1501 
1502 static void
1503 allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst,
1504                    physreg_t physreg)
1505 {
1506    struct ra_file *file = ra_get_file(ctx, dst);
1507    struct ra_interval *interval = &ctx->intervals[dst->name];
1508    update_affinity(file, dst, physreg);
1509 
1510    ra_interval_init(interval, dst);
1511    interval->physreg_start = physreg;
1512    interval->physreg_end = physreg + reg_size(dst);
1513 }
1514 
1515 /* If a tied destination interferes with its source register, we have to insert
1516  * a copy beforehand to copy the source to the destination. Because we are using
1517  * the parallel_copies array and not creating a separate copy, this copy will
1518  * happen in parallel with any shuffling around of the tied source, so we have
1519  * to copy the source *as it exists before it is shuffled around*. We do this by
1520  * inserting the copy early, before any other copies are inserted. We don't
1521  * actually know the destination of the copy, but that's ok because the
1522  * dst_interval will be filled out later.
1523  */
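/* Illustrative example (register numbers made up): if dst is tied to a source
 * that is not killed and currently sits in r4.x, we record
 * { .interval = dst_interval, .src = r4.x } here; once dst is allocated, say to
 * r6.x, the parallel copy materializes as "r6.x = r4.x", executed alongside any
 * other shuffling at this instruction.
 */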
1524 static void
1525 insert_tied_dst_copy(struct ra_ctx *ctx, struct ir3_register *dst)
1526 {
1527    struct ir3_register *tied = dst->tied;
1528 
1529    if (!tied)
1530       return;
1531 
1532    struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1533    struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1534 
1535    if (tied_interval->is_killed)
1536       return;
1537 
1538    physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
1539 
1540    array_insert(ctx, ctx->parallel_copies,
1541                 (struct ra_parallel_copy){
1542                    .interval = dst_interval,
1543                    .src = tied_physreg,
1544                 });
1545 }
1546 
1547 static void
1548 allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1549 {
1550    struct ra_file *file = ra_get_file(ctx, dst);
1551 
1552    struct ir3_register *tied = dst->tied;
1553    if (tied) {
1554       struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1555       if (tied_interval->is_killed) {
1556          /* The easy case: the source is killed, so we can just reuse it
1557           * for the destination.
1558           */
1559          allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));
1560          return;
1561       }
1562    }
1563 
1564    /* All the hard work is done by get_reg here. */
1565    physreg_t physreg = get_reg(ctx, file, dst);
1566 
1567    allocate_dst_fixed(ctx, dst, physreg);
1568 }
1569 
1570 static void
1571 assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
1572            struct ir3_register *src)
1573 {
1574    struct ra_interval *interval = &ctx->intervals[src->def->name];
1575    struct ra_file *file = ra_get_file(ctx, src);
1576 
1577    struct ir3_register *tied = src->tied;
1578    physreg_t physreg;
1579    if (tied) {
1580       struct ra_interval *tied_interval = &ctx->intervals[tied->name];
1581       physreg = ra_interval_get_physreg(tied_interval);
1582    } else {
1583       physreg = ra_interval_get_physreg(interval);
1584    }
1585 
1586    assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags));
1587 
1588    if (src->flags & IR3_REG_FIRST_KILL)
1589       ra_file_remove(file, interval);
1590 }
1591 
1592 /* Insert a parallel copy instruction, built from the parallel copy entries
1593  * we've accumulated, right before the given instruction.
1594  */
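/* Rough shape of the result (illustrative): with two queued entries the emitted
 * meta instruction looks like
 *
 *    parallel_copy rA, rB = rC, rD
 *
 * where, by parallel-copy semantics, all sources are read before any
 * destination is written, so overlapping moves and swaps resolve correctly
 * when the copy is lowered later (see ir3_lower_copies()).
 */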
1595 static void
1596 insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1597 {
1598    if (ctx->parallel_copies_count == 0)
1599       return;
1600 
1601    struct ir3_instruction *pcopy =
1602       ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY,
1603                        ctx->parallel_copies_count, ctx->parallel_copies_count);
1604 
1605    for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1606       struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1607       struct ir3_register *reg =
1608          ir3_dst_create(pcopy, INVALID_REG,
1609                         entry->interval->interval.reg->flags &
1610                         (IR3_REG_HALF | IR3_REG_ARRAY | IR3_REG_SHARED));
1611       reg->size = entry->interval->interval.reg->size;
1612       reg->wrmask = entry->interval->interval.reg->wrmask;
1613       assign_reg(pcopy, reg, ra_interval_get_num(entry->interval));
1614    }
1615 
1616    for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1617       struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1618       struct ir3_register *reg =
1619          ir3_src_create(pcopy, INVALID_REG,
1620                         entry->interval->interval.reg->flags &
1621                         (IR3_REG_HALF | IR3_REG_ARRAY | IR3_REG_SHARED));
1622       reg->size = entry->interval->interval.reg->size;
1623       reg->wrmask = entry->interval->interval.reg->wrmask;
1624       assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags));
1625    }
1626 
1627    list_del(&pcopy->node);
1628    list_addtail(&pcopy->node, &instr->node);
1629    ctx->parallel_copies_count = 0;
1630 }
1631 
1632 static void
1633 handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1634 {
1635    /* First, mark sources as going-to-be-killed while allocating the dest. */
1636    ra_foreach_src (src, instr) {
1637       mark_src_killed(ctx, src);
1638    }
1639 
1640    /* Pre-insert tied dst copies. */
1641    ra_foreach_dst (dst, instr) {
1642       insert_tied_dst_copy(ctx, dst);
1643    }
1644 
1645    /* Allocate the destination. */
1646    ra_foreach_dst (dst, instr) {
1647       allocate_dst(ctx, dst);
1648    }
1649 
1650    /* Now handle sources. Go backward so that, if there are multiple sources
1651     * with the same def and that def is killed, we only remove it at the
1652     * end.
1653     */
1654    ra_foreach_src_rev (src, instr) {
1655       assign_src(ctx, instr, src);
1656    }
1657 
1658    /* Now finally insert the destination into the map. */
1659    ra_foreach_dst (dst, instr) {
1660       insert_dst(ctx, dst);
1661    }
1662 
1663    insert_parallel_copy_instr(ctx, instr);
1664 }
1665 
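/* Splits that stay within their source's merge set are free: if the source
 * vector lives at physreg P, the split destination simply lands at
 * P - src->merge_set_offset + dst->merge_set_offset and no copy is emitted
 * (an illustrative restatement of the code below). Splits whose destination
 * is in a different merge set fall back to handle_normal_instr().
 */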
1666 static void
1667 handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
1668 {
1669    struct ir3_register *dst = instr->dsts[0];
1670    struct ir3_register *src = instr->srcs[0];
1671 
1672    if (!(dst->flags & IR3_REG_SSA))
1673       return;
1674 
1675    if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
1676       handle_normal_instr(ctx, instr);
1677       return;
1678    }
1679 
1680    struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1681 
1682    physreg_t physreg = ra_interval_get_physreg(src_interval);
1683    assign_src(ctx, instr, src);
1684 
1685    allocate_dst_fixed(
1686       ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset);
1687    insert_dst(ctx, dst);
1688 }
1689 
1690 static void
1691 handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
1692 {
1693    if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1694       return;
1695 
1696    struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;
1697    unsigned dst_offset = instr->dsts[0]->merge_set_offset;
1698 
1699    if (!dst_set || dst_set->regs_count == 1) {
1700       handle_normal_instr(ctx, instr);
1701       return;
1702    }
1703 
1704    /* We need to check if any of the sources are contained in an interval
1705     * that is at least as large as the vector. In this case, we should put
1706     * the vector inside that larger interval. (There should be one
1707     * unambiguous place to put it, because values sharing the same merge set
1708     * should be allocated together.) This can happen in a case like:
1709     *
1710     * ssa_1 (wrmask=0xf) = ...
1711     * ssa_2 = split ssa_1 off:0
1712     * ssa_3 = split ssa_1 off:1
1713     * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3
1714     * ... = (kill)ssa_1
1715     * ... = (kill)ssa_4
1716     *
1717     * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it.
1718     */
1719    physreg_t dst_fixed = (physreg_t)~0u;
1720 
1721    ra_foreach_src (src, instr) {
1722       if (src->flags & IR3_REG_FIRST_KILL) {
1723          mark_src_killed(ctx, src);
1724       }
1725 
1726       struct ra_interval *interval = &ctx->intervals[src->def->name];
1727 
1728       /* We only need special handling if the source's interval overlaps with
1729        * the destination's interval.
1730        */
1731       if (src->def->interval_start >= instr->dsts[0]->interval_end ||
1732           instr->dsts[0]->interval_start >= src->def->interval_end ||
1733           interval->is_killed)
1734          continue;
1735 
1736       while (interval->interval.parent != NULL) {
1737          interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1738       }
1739       if (reg_size(interval->interval.reg) >= reg_size(instr->dsts[0])) {
1740          dst_fixed = interval->physreg_start -
1741                      interval->interval.reg->merge_set_offset + dst_offset;
1742       } else {
1743          /* For sources whose root interval is smaller than the
1744           * destination (i.e. the normal case), we will shuffle them
1745           * around after allocating the destination. Mark them killed so
1746           * that the destination can be allocated over them, even if they
1747           * aren't actually killed.
1748           */
1749          ra_file_mark_killed(ra_get_file(ctx, src), interval);
1750       }
1751    }
1752 
1753    if (dst_fixed != (physreg_t)~0u)
1754       allocate_dst_fixed(ctx, instr->dsts[0], dst_fixed);
1755    else
1756       allocate_dst(ctx, instr->dsts[0]);
1757 
1758    /* Remove the temporary is_killed we added */
1759    ra_foreach_src (src, instr) {
1760       struct ra_interval *interval = &ctx->intervals[src->def->name];
1761       while (interval->interval.parent != NULL) {
1762          interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1763       }
1764 
1765       /* Filter out cases where it actually should be killed */
1766       if (interval != &ctx->intervals[src->def->name] ||
1767           !(src->flags & IR3_REG_KILL)) {
1768          ra_file_unmark_killed(ra_get_file(ctx, src), interval);
1769       }
1770    }
1771 
1772    ra_foreach_src_rev (src, instr) {
1773       assign_src(ctx, instr, src);
1774    }
1775 
1776    /* We need to do this before insert_dst(), so that children of the
1777     * destination which got marked as killed and then shuffled around to make
1778     * space for the destination have the correct pcopy destination that
1779     * matches what we assign the source of the collect to in assign_src().
1780     *
1781     * TODO: In this case we'll wind up copying the value in the pcopy and
1782     * then again in the collect. We could avoid one of those by updating the
1783     * pcopy destination to match up with the final location of the source
1784     * after the collect and making the collect a no-op. However this doesn't
1785     * seem to happen often.
1786     */
1787    insert_parallel_copy_instr(ctx, instr);
1788 
1789    /* Note: insert_dst will automatically shuffle around any intervals that
1790     * overlap the collect's destination by making them children of the collect.
1791     */
1792 
1793    insert_dst(ctx, instr->dsts[0]);
1794 }
1795 
1796 /* Parallel copies before RA should only be at the end of the block, for
1797  * phi's. For these we only need to fill in the sources, and then we fill in
1798  * the destinations in the successor block.
1799  */
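/* E.g. (illustrative):
 *
 *    pred block: ssa_5 = parallel_copy ssa_2   // only the source is assigned here
 *    succ block: ssa_7 = phi ssa_5, ...        // ssa_7 and the pcopy dst ssa_5 get
 *                                              // their register in assign_phi()
 */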
1800 static void
1801 handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)
1802 {
1803    ra_foreach_src_rev (src, instr) {
1804       assign_src(ctx, instr, src);
1805    }
1806 }
1807 
1808 /* Some inputs may need to be precolored. We need to handle those first, so
1809  * that other non-precolored inputs don't accidentally get allocated over
1810  * them. Inputs are the very first thing in the shader, so it shouldn't be a
1811  * problem to allocate them to a specific physreg.
1812  */
1813 
1814 static void
1815 handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1816 {
1817    if (instr->dsts[0]->num == INVALID_REG ||
1818        !(instr->dsts[0]->flags & IR3_REG_SSA))
1819       return;
1820 
1821    struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1822    struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1823    physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]);
1824    allocate_dst_fixed(ctx, instr->dsts[0], physreg);
1825 
1826    d("insert precolored dst %u physreg %u", instr->dsts[0]->name,
1827      ra_interval_get_physreg(interval));
1828 
1829    ra_file_insert(file, interval);
1830    interval->frozen = true;
1831 }
1832 
1833 static void
1834 handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1835 {
1836    if (instr->dsts[0]->num != INVALID_REG)
1837       return;
1838 
1839    allocate_dst(ctx, instr->dsts[0]);
1840 
1841    struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1842    struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1843    ra_file_insert(file, interval);
1844 }
1845 
1846 static void
1847 assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1848 {
1849    if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1850       return;
1851 
1852    struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1853    struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1854 
1855    if (instr->dsts[0]->num == INVALID_REG) {
1856       assign_reg(instr, instr->dsts[0], ra_interval_get_num(interval));
1857    } else {
1858       interval->frozen = false;
1859    }
1860 
1861    if (instr->dsts[0]->flags & IR3_REG_UNUSED)
1862       ra_file_remove(file, interval);
1863 
1864    ra_foreach_src_rev (src, instr)
1865       assign_src(ctx, instr, src);
1866 }
1867 
1868 /* chmask is a bit weird, because it has pre-colored sources due to the need
1869  * to pass some registers to the next stage. Fortunately there are at most
1870  * two, and there should be no other live values by the time we get to
1871  * this instruction, so we only have to do the minimum and don't need any
1872  * fancy fallbacks.
1873  *
1874  * TODO: Add more complete handling of precolored sources, e.g. for function
1875  * argument handling. We'd need a way to mark sources as fixed so that they
1876  * don't get moved around when placing other sources in the fallback case, and
1877  * a duplication of much of the logic in get_reg(). This also opens another
1878  * can of worms, e.g. what if the precolored source is a split of a vector
1879  * which is still live -- this breaks our assumption that splits don't incur
1880  * any "extra" register requirements and we'd have to break it out of the
1881  * parent ra_interval.
1882  */
1883 
1884 static void
1885 handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src)
1886 {
1887    struct ra_file *file = ra_get_file(ctx, src);
1888    struct ra_interval *interval = &ctx->intervals[src->def->name];
1889    physreg_t physreg = ra_reg_get_physreg(src);
1890 
1891    if (ra_interval_get_num(interval) == src->num)
1892       return;
1893 
1894    /* Try evicting stuff in our way if it isn't free. This won't move
1895     * anything unless it overlaps with our precolored physreg, so we don't
1896     * have to worry about evicting other precolored sources.
1897     */
1898    if (!get_reg_specified(ctx, file, src, physreg, true)) {
1899       unsigned eviction_count;
1900       if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true,
1901                           false)) {
1902          unreachable("failed to evict for precolored source!");
1903          return;
1904       }
1905    }
1906 
1907    ra_move_interval(ctx, file, interval, physreg);
1908 }
1909 
1910 static void
1911 handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr)
1912 {
1913    /* Note: we purposely don't mark sources as killed, so that we can reuse
1914     * some of the get_reg() machinery as if the source were a destination.
1915     * Marking it as killed would mean that e.g. get_reg_specified() wouldn't
1916     * work correctly.
1917     */
1918    ra_foreach_src (src, instr) {
1919       assert(src->num != INVALID_REG);
1920       handle_precolored_source(ctx, src);
1921    }
1922 
1923    ra_foreach_src (src, instr) {
1924       struct ra_file *file = ra_get_file(ctx, src);
1925       struct ra_interval *interval = &ctx->intervals[src->def->name];
1926       if (src->flags & IR3_REG_FIRST_KILL)
1927          ra_file_remove(file, interval);
1928    }
1929 
1930    insert_parallel_copy_instr(ctx, instr);
1931 }
1932 
1933 static physreg_t
1934 read_register(struct ra_ctx *ctx, struct ir3_block *block,
1935               struct ir3_register *def)
1936 {
1937    struct ra_block_state *state = &ctx->blocks[block->index];
1938    if (state->renames) {
1939       struct hash_entry *entry = _mesa_hash_table_search(state->renames, def);
1940       if (entry) {
1941          return (physreg_t)(uintptr_t)entry->data;
1942       }
1943    }
1944 
1945    return ra_reg_get_physreg(def);
1946 }
1947 
1948 static void
1949 handle_live_in(struct ra_ctx *ctx, struct ir3_register *def)
1950 {
1951    physreg_t physreg = ~0;
1952    for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
1953       struct ir3_block *pred = ctx->block->predecessors[i];
1954       struct ra_block_state *pred_state = &ctx->blocks[pred->index];
1955 
1956       if (!pred_state->visited)
1957          continue;
1958 
1959       physreg = read_register(ctx, pred, def);
1960       break;
1961    }
1962 
1963    assert(physreg != (physreg_t)~0);
1964 
1965    struct ra_interval *interval = &ctx->intervals[def->name];
1966    struct ra_file *file = ra_get_file(ctx, def);
1967    ra_interval_init(interval, def);
1968    interval->physreg_start = physreg;
1969    interval->physreg_end = physreg + reg_size(def);
1970    ra_file_insert(file, interval);
1971 }
1972 
1973 static void
1974 handle_live_out(struct ra_ctx *ctx, struct ir3_register *def)
1975 {
1976    /* Skip parallelcopy's which in the original program are only used as phi
1977     * arguments. Even though phi arguments are live out, they are only
1978     * assigned when the phi is.
1979     */
1980    if (def->instr->opc == OPC_META_PARALLEL_COPY)
1981       return;
1982 
1983    struct ra_block_state *state = &ctx->blocks[ctx->block->index];
1984    struct ra_interval *interval = &ctx->intervals[def->name];
1985    physreg_t physreg = ra_interval_get_physreg(interval);
1986    if (physreg != ra_reg_get_physreg(def)) {
1987       if (!state->renames)
1988          state->renames = _mesa_pointer_hash_table_create(ctx);
1989       _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg);
1990    }
1991 }
1992 
1993 static void
1994 handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
1995 {
1996    if (!(def->flags & IR3_REG_SSA))
1997       return;
1998 
1999    struct ra_file *file = ra_get_file(ctx, def);
2000    struct ra_interval *interval = &ctx->intervals[def->name];
2001 
2002    /* phis are always scalar, so they should already be the smallest possible
2003     * size. However they may be coalesced with other live-in values/phi
2004     * nodes, so check for that here.
2005     */
2006    struct ir3_reg_interval *parent_ir3 =
2007       ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start);
2008    physreg_t physreg;
2009    if (parent_ir3) {
2010       struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3);
2011       physreg = ra_interval_get_physreg(parent) +
2012                 (def->interval_start - parent_ir3->reg->interval_start);
2013    } else {
2014       physreg = get_reg(ctx, file, def);
2015    }
2016 
2017    allocate_dst_fixed(ctx, def, physreg);
2018 
2019    ra_file_insert(file, interval);
2020 }
2021 
2022 static void
2023 assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
2024 {
2025    if (!(phi->dsts[0]->flags & IR3_REG_SSA))
2026       return;
2027 
2028    struct ra_file *file = ra_get_file(ctx, phi->dsts[0]);
2029    struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name];
2030    assert(!interval->interval.parent);
2031    unsigned num = ra_interval_get_num(interval);
2032    assign_reg(phi, phi->dsts[0], num);
2033 
2034    /* Assign the parallelcopy sources of this phi */
2035    for (unsigned i = 0; i < phi->srcs_count; i++) {
2036       if (phi->srcs[i]->def) {
2037          assign_reg(phi, phi->srcs[i], num);
2038          assign_reg(phi, phi->srcs[i]->def, num);
2039       }
2040    }
2041 
2042    if (phi->dsts[0]->flags & IR3_REG_UNUSED)
2043       ra_file_remove(file, interval);
2044 }
2045 
2046 /* When we split a live range, we sometimes need to emit fixup code at the end
2047  * of a block. For example, something like:
2048  *
2049  * a = ...
2050  * if (...) {
2051  *    ...
2052  *    a' = a
2053  *    b = ... // a evicted to make room for b
2054  *    ...
2055  * }
2056  * ... = a
2057  *
2058  * When we insert the copy to a' in insert_parallel_copy_instr(), this forces
2059  * us to insert another copy "a = a'" at the end of the if. Normally this would
2060  * also entail adding a phi node, but since we're about to go out of SSA
2061  * anyway we just insert an extra move. Note, however, that "b" might be used
2062  * in a phi node at the end of the if and share registers with "a", so we
2063  * have to be careful to extend any preexisting parallelcopy instruction
2064  * instead of creating our own in order to guarantee that they properly get
2065  * swapped.
2066  */
2067 
2068 static void
2069 insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
2070                     struct ir3_register *reg)
2071 {
2072    struct ir3_instruction *old_pcopy = NULL;
2073    if (!list_is_empty(&block->instr_list)) {
2074       struct ir3_instruction *last =
2075          list_entry(block->instr_list.prev, struct ir3_instruction, node);
2076       if (last->opc == OPC_META_PARALLEL_COPY)
2077          old_pcopy = last;
2078    }
2079 
2080    unsigned old_pcopy_srcs = old_pcopy ? old_pcopy->srcs_count : 0;
2081    struct ir3_instruction *pcopy = ir3_instr_create(
2082       block, OPC_META_PARALLEL_COPY, old_pcopy_srcs + 1, old_pcopy_srcs + 1);
2083 
2084    for (unsigned i = 0; i < old_pcopy_srcs; i++) {
2085       old_pcopy->dsts[i]->instr = pcopy;
2086       pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i];
2087    }
2088 
2089    unsigned flags = reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
2090 
2091    struct ir3_register *dst_reg = ir3_dst_create(pcopy, INVALID_REG, flags);
2092    dst_reg->wrmask = reg->wrmask;
2093    dst_reg->size = reg->size;
2094    assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags));
2095 
2096    for (unsigned i = 0; i < old_pcopy_srcs; i++) {
2097       pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i];
2098    }
2099 
2100    struct ir3_register *src_reg = ir3_src_create(pcopy, INVALID_REG, flags);
2101    src_reg->wrmask = reg->wrmask;
2102    src_reg->size = reg->size;
2103    assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags));
2104 
2105    if (old_pcopy)
2106       list_del(&old_pcopy->node);
2107 }
2108 
2109 static void
2110 insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
2111 {
2112    physreg_t physreg = ra_interval_get_physreg(interval);
2113 
2114    for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
2115       struct ir3_block *pred = ctx->block->predecessors[i];
2116       struct ra_block_state *pred_state = &ctx->blocks[pred->index];
2117 
2118       if (!pred_state->visited)
2119          continue;
2120 
2121       physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);
2122       if (pred_reg != physreg) {
2123          assert(!(interval->interval.reg->flags & IR3_REG_SHARED));
2124          insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);
2125       }
2126    }
2127 }
2128 
2129 static void
2130 insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file)
2131 {
2132    BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index];
2133    rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2134                     physreg_node) {
2135       /* Skip phi nodes. This needs to happen after phi nodes are allocated,
2136        * because we may have to move live-ins around to make space for phi
2137        * nodes, but we shouldn't be handling phi nodes here.
2138        */
2139       if (BITSET_TEST(live_in, interval->interval.reg->name))
2140          insert_live_in_move(ctx, interval);
2141    }
2142 }
2143 
2144 static void
2145 insert_entry_regs(struct ra_block_state *state, struct ra_file *file)
2146 {
2147    rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2148                     physreg_node) {
2149       _mesa_hash_table_insert(state->entry_regs, interval->interval.reg,
2150                               (void *)(uintptr_t)interval->physreg_start);
2151    }
2152 }
2153 
2154 static void
2155 insert_live_in_moves(struct ra_ctx *ctx)
2156 {
2157    insert_file_live_in_moves(ctx, &ctx->full);
2158    insert_file_live_in_moves(ctx, &ctx->half);
2159    insert_file_live_in_moves(ctx, &ctx->shared);
2160 
2161    /* If not all predecessors have been visited, record the entry registers so
2162     * that insert_live_out_moves() can fix things up in those predecessors later.
2163     */
2164    bool all_preds_visited = true;
2165    for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
2166       if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) {
2167          all_preds_visited = false;
2168          break;
2169       }
2170    }
2171 
2172    if (!all_preds_visited) {
2173       struct ra_block_state *state = &ctx->blocks[ctx->block->index];
2174       state->entry_regs = _mesa_pointer_hash_table_create(ctx);
2175 
2176       insert_entry_regs(state, &ctx->full);
2177       insert_entry_regs(state, &ctx->half);
2178       insert_entry_regs(state, &ctx->shared);
2179    }
2180 }
2181 
2182 static void
2183 insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval)
2184 {
2185    for (unsigned i = 0; i < 2; i++) {
2186       if (!ctx->block->successors[i])
2187          continue;
2188 
2189       struct ir3_block *succ = ctx->block->successors[i];
2190       struct ra_block_state *succ_state = &ctx->blocks[succ->index];
2191 
2192       if (!succ_state->visited)
2193          continue;
2194 
2195       struct hash_entry *entry = _mesa_hash_table_search(
2196          succ_state->entry_regs, interval->interval.reg);
2197       if (!entry)
2198          continue;
2199 
2200       physreg_t new_reg = (physreg_t)(uintptr_t)entry->data;
2201       if (new_reg != interval->physreg_start) {
2202          insert_liveout_copy(ctx->block, new_reg, interval->physreg_start,
2203                              interval->interval.reg);
2204       }
2205    }
2206 }
2207 
2208 static void
2209 insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file)
2210 {
2211    rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2212                     physreg_node) {
2213       insert_live_out_move(ctx, interval);
2214    }
2215 }
2216 
2217 static void
2218 insert_live_out_moves(struct ra_ctx *ctx)
2219 {
2220    insert_file_live_out_moves(ctx, &ctx->full);
2221    insert_file_live_out_moves(ctx, &ctx->half);
2222    insert_file_live_out_moves(ctx, &ctx->shared);
2223 }
2224 
2225 static void
2226 handle_block(struct ra_ctx *ctx, struct ir3_block *block)
2227 {
2228    ctx->block = block;
2229 
2230    /* Reset the register files from the last block */
2231    ra_file_init(&ctx->full);
2232    ra_file_init(&ctx->half);
2233    ra_file_init(&ctx->shared);
2234 
2235    /* Handle live-ins, phis, and input meta-instructions. These all appear
2236     * live at the beginning of the block and interfere with each other, and
2237     * therefore need to be allocated "in parallel". This means that we
2238     * have to allocate all of them, inserting them into the file, and then
2239     * delay updating the IR until all of them are allocated.
2240     *
2241     * Handle precolored inputs first, because we need to make sure that other
2242     * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes
2243     * and inputs at the same time, because the first block doesn't have
2244     * predecessors. Therefore handle_live_in doesn't have to worry about
2245     * them.
2246     */
2247 
2248    foreach_instr (instr, &block->instr_list) {
2249       if (instr->opc == OPC_META_INPUT)
2250          handle_precolored_input(ctx, instr);
2251       else
2252          break;
2253    }
2254 
2255    unsigned name;
2256    BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
2257                        ctx->live->definitions_count) {
2258       struct ir3_register *reg = ctx->live->definitions[name];
2259       handle_live_in(ctx, reg);
2260    }
2261 
2262    foreach_instr (instr, &block->instr_list) {
2263       if (instr->opc == OPC_META_PHI)
2264          handle_phi(ctx, instr->dsts[0]);
2265       else if (instr->opc == OPC_META_INPUT ||
2266                instr->opc == OPC_META_TEX_PREFETCH)
2267          handle_input(ctx, instr);
2268       else
2269          break;
2270    }
2271 
2272    /* After this point, every live-in/phi/input has an interval assigned to
2273     * it. We delay actually assigning values until everything has been
2274     * allocated, so we can simply ignore any parallel copy entries created
2275     * when shuffling them around.
2276     */
2277    ctx->parallel_copies_count = 0;
2278 
2279    insert_live_in_moves(ctx);
2280 
2281    if (RA_DEBUG) {
2282       d("after live-in block %u:\n", block->index);
2283       ra_ctx_dump(ctx);
2284    }
2285 
2286    /* Now we're done with processing live-ins, and can handle the body of the
2287     * block.
2288     */
2289    foreach_instr (instr, &block->instr_list) {
2290       di(instr, "processing");
2291 
2292       if (instr->opc == OPC_META_PHI)
2293          assign_phi(ctx, instr);
2294       else if (instr->opc == OPC_META_INPUT ||
2295                instr->opc == OPC_META_TEX_PREFETCH)
2296          assign_input(ctx, instr);
2297       else if (instr->opc == OPC_META_SPLIT)
2298          handle_split(ctx, instr);
2299       else if (instr->opc == OPC_META_COLLECT)
2300          handle_collect(ctx, instr);
2301       else if (instr->opc == OPC_META_PARALLEL_COPY)
2302          handle_pcopy(ctx, instr);
2303       else if (instr->opc == OPC_CHMASK)
2304          handle_chmask(ctx, instr);
2305       else
2306          handle_normal_instr(ctx, instr);
2307 
2308       if (RA_DEBUG)
2309          ra_ctx_dump(ctx);
2310    }
2311 
2312    insert_live_out_moves(ctx);
2313 
2314    BITSET_FOREACH_SET (name, ctx->live->live_out[block->index],
2315                        ctx->live->definitions_count) {
2316       struct ir3_register *reg = ctx->live->definitions[name];
2317       handle_live_out(ctx, reg);
2318    }
2319 
2320    ctx->blocks[block->index].visited = true;
2321 }
2322 
2323 static unsigned
2324 calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure)
2325 {
2326    /* Registers are allocated in units of vec4, so switch from units of
2327     * half-regs to vec4.
2328     */
2329    unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4);
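   /* E.g. (illustrative): pressure == 20 half-regs -> DIV_ROUND_UP(20, 8) == 3
    * vec4 registers.
    */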
2330 
2331    bool double_threadsize = ir3_should_double_threadsize(v, reg_count);
2332 
2333    unsigned target = reg_count;
2334    unsigned reg_independent_max_waves =
2335       ir3_get_reg_independent_max_waves(v, double_threadsize);
2336    unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
2337       v->compiler, reg_count, double_threadsize);
2338    unsigned target_waves =
2339       MIN2(reg_independent_max_waves, reg_dependent_max_waves);
2340 
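   /* Descriptive note: grow the target for as long as doing so neither changes
    * the threadsize decision nor drops the register-dependent wave count below
    * target_waves, i.e. take any extra registers that don't cost us waves.
    */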
2341    while (target <= RA_FULL_SIZE / (2 * 4) &&
2342           ir3_should_double_threadsize(v, target) == double_threadsize &&
2343           ir3_get_reg_dependent_max_waves(v->compiler, target,
2344                                           double_threadsize) >= target_waves)
2345       target++;
2346 
2347    return (target - 1) * 2 * 4;
2348 }
2349 
2350 static void
2351 add_pressure(struct ir3_pressure *pressure, struct ir3_register *reg,
2352              bool merged_regs)
2353 {
2354    unsigned size = reg_size(reg);
2355    if (reg->flags & IR3_REG_HALF)
2356       pressure->half += size;
2357    if (!(reg->flags & IR3_REG_HALF) || merged_regs)
2358       pressure->full += size;
2359 }
2360 
2361 static void
2362 dummy_interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
2363 {
2364 }
2365 
2366 static void
2367 dummy_interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
2368 {
2369 }
2370 
2371 static void
2372 dummy_interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *parent,
2373                      struct ir3_reg_interval *child)
2374 {
2375 }
2376 
2377 /* Calculate the minimum possible limit on register pressure so that spilling
2378  * still succeeds. Used to implement IR3_SHADER_DEBUG=spillall.
2379  */
2380 
2381 static void
2382 calc_min_limit_pressure(struct ir3_shader_variant *v,
2383                         struct ir3_liveness *live,
2384                         struct ir3_pressure *limit)
2385 {
2386    struct ir3_block *start = ir3_start_block(v->ir);
2387    struct ir3_reg_ctx *ctx = ralloc(NULL, struct ir3_reg_ctx);
2388    struct ir3_reg_interval *intervals =
2389       rzalloc_array(ctx, struct ir3_reg_interval, live->definitions_count);
2390 
2391    ctx->interval_add = dummy_interval_add;
2392    ctx->interval_delete = dummy_interval_delete;
2393    ctx->interval_readd = dummy_interval_readd;
2394 
2395    limit->full = limit->half = 0;
2396 
2397    struct ir3_pressure cur_pressure = {0};
2398    foreach_instr (input, &start->instr_list) {
2399       if (input->opc != OPC_META_INPUT &&
2400           input->opc != OPC_META_TEX_PREFETCH)
2401          break;
2402 
2403       add_pressure(&cur_pressure, input->dsts[0], v->mergedregs);
2404    }
2405 
2406    limit->full = MAX2(limit->full, cur_pressure.full);
2407    limit->half = MAX2(limit->half, cur_pressure.half);
2408 
2409    foreach_instr (input, &start->instr_list) {
2410       if (input->opc != OPC_META_INPUT &&
2411           input->opc != OPC_META_TEX_PREFETCH)
2412          break;
2413 
2414       /* pre-colored inputs may have holes, which increases the pressure. */
2415       struct ir3_register *dst = input->dsts[0];
2416       if (dst->num != INVALID_REG) {
2417          unsigned physreg = ra_reg_get_physreg(dst) + reg_size(dst);
2418          if (dst->flags & IR3_REG_HALF)
2419             limit->half = MAX2(limit->half, physreg);
2420          if (!(dst->flags & IR3_REG_HALF) || v->mergedregs)
2421             limit->full = MAX2(limit->full, physreg);
2422       }
2423    }
2424 
2425    foreach_block (block, &v->ir->block_list) {
2426       rb_tree_init(&ctx->intervals);
2427 
2428       unsigned name;
2429       BITSET_FOREACH_SET (name, live->live_in[block->index],
2430                           live->definitions_count) {
2431          struct ir3_register *reg = live->definitions[name];
2432          ir3_reg_interval_init(&intervals[reg->name], reg);
2433          ir3_reg_interval_insert(ctx, &intervals[reg->name]);
2434       }
2435 
2436       foreach_instr (instr, &block->instr_list) {
2437          ra_foreach_dst (dst, instr) {
2438             ir3_reg_interval_init(&intervals[dst->name], dst);
2439          }
2440          /* phis and parallel copies can be deleted via spilling */
2441 
2442          if (instr->opc == OPC_META_PHI) {
2443             ir3_reg_interval_insert(ctx, &intervals[instr->dsts[0]->name]);
2444             continue;
2445          }
2446 
2447          if (instr->opc == OPC_META_PARALLEL_COPY)
2448             continue;
2449 
2450          cur_pressure = (struct ir3_pressure) {0};
2451 
2452          ra_foreach_dst (dst, instr) {
2453             if (dst->tied && !(dst->tied->flags & IR3_REG_KILL))
2454                add_pressure(&cur_pressure, dst, v->mergedregs);
2455          }
2456 
2457          ra_foreach_src_rev (src, instr) {
2458             /* We currently don't support spilling the parent of a source when
2459              * making space for sources, so we have to keep track of the
2460              * intervals and figure out the root of the tree to figure out how
2461              * much space we need.
2462              *
2463              * TODO: We should probably support this in the spiller.
2464              */
2465             struct ir3_reg_interval *interval = &intervals[src->def->name];
2466             while (interval->parent)
2467                interval = interval->parent;
2468             add_pressure(&cur_pressure, interval->reg, v->mergedregs);
2469 
2470             if (src->flags & IR3_REG_FIRST_KILL)
2471                ir3_reg_interval_remove(ctx, &intervals[src->def->name]);
2472          }
2473 
2474          limit->full = MAX2(limit->full, cur_pressure.full);
2475          limit->half = MAX2(limit->half, cur_pressure.half);
2476 
2477          cur_pressure = (struct ir3_pressure) {0};
2478 
2479          ra_foreach_dst (dst, instr) {
2480             ir3_reg_interval_init(&intervals[dst->name], dst);
2481             ir3_reg_interval_insert(ctx, &intervals[dst->name]);
2482             add_pressure(&cur_pressure, dst, v->mergedregs);
2483          }
2484 
2485          limit->full = MAX2(limit->full, cur_pressure.full);
2486          limit->half = MAX2(limit->half, cur_pressure.half);
2487       }
2488    }
2489 
2490    /* Account for the base register, which needs to be available everywhere. */
2491    limit->full += 2;
2492 
2493    ralloc_free(ctx);
2494 }
2495 
2496 /*
2497  * If barriers are used, it must be possible for all waves in the workgroup
2498  * to execute concurrently. Thus we may have to reduce the registers limit.
2499  */
2500 static void
2501 calc_limit_pressure_for_cs_with_barrier(struct ir3_shader_variant *v,
2502                                         struct ir3_pressure *limit_pressure)
2503 {
2504    const struct ir3_compiler *compiler = v->compiler;
2505 
2506    unsigned threads_per_wg;
2507    if (v->local_size_variable) {
2508       /* We have to expect the worst case. */
2509       threads_per_wg = compiler->max_variable_workgroup_size;
2510    } else {
2511       threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
2512    }
2513 
2514    /* The register file is grouped into reg_size_vec4 number of parts.
2515     * Each part has enough registers to add a single vec4 register to
2516     * each thread of a single-sized wave-pair. With double threadsize
2517     * each wave-pair would consume two parts of the register file to get
2518     * a single vec4 for a thread. The more active wave-pairs the less
2519     * parts each could get.
2520     */
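   /* Worked example with made-up numbers: threads_per_wg = 512,
    * threadsize_base = 64, wave_granularity = 1, single threadsize and
    * reg_size_vec4 = 96 give waves_per_wg = DIV_ROUND_UP(512, 64) = 8, so each
    * thread can get 96 / 8 = 12 vec4 = 96 half-regs before the workgroup no
    * longer fits concurrently.
    */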
2521 
2522    bool double_threadsize = ir3_should_double_threadsize(v, 0);
2523    unsigned waves_per_wg = DIV_ROUND_UP(
2524       threads_per_wg, compiler->threadsize_base * (double_threadsize ? 2 : 1) *
2525                          compiler->wave_granularity);
2526 
2527    uint32_t vec4_regs_per_thread =
2528       compiler->reg_size_vec4 / (waves_per_wg * (double_threadsize ? 2 : 1));
2529    assert(vec4_regs_per_thread > 0);
2530 
2531    uint32_t half_regs_per_thread = vec4_regs_per_thread * 4 * 2;
2532 
2533    if (limit_pressure->full > half_regs_per_thread) {
2534       if (v->mergedregs) {
2535          limit_pressure->full = half_regs_per_thread;
2536       } else {
2537          /* TODO: Handle !mergedregs case, probably we would have to do this
2538           * after the first register pressure pass.
2539           */
2540       }
2541    }
2542 }
2543 
2544 int
2545 ir3_ra(struct ir3_shader_variant *v)
2546 {
2547    ir3_calc_dominance(v->ir);
2548 
2549    ir3_create_parallel_copies(v->ir);
2550 
2551    struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);
2552 
2553    ctx->merged_regs = v->mergedregs;
2554    ctx->compiler = v->compiler;
2555    ctx->stage = v->type;
2556 
2557    struct ir3_liveness *live = ir3_calc_liveness(ctx, v->ir);
2558 
2559    ir3_debug_print(v->ir, "AFTER: create_parallel_copies");
2560 
2561    ir3_merge_regs(live, v->ir);
2562 
2563    bool has_shared_vectors = false;
2564    foreach_block (block, &v->ir->block_list) {
2565       foreach_instr (instr, &block->instr_list) {
2566          ra_foreach_dst (dst, instr) {
2567             if ((dst->flags & IR3_REG_SHARED) && reg_elems(dst) > 1) {
2568                has_shared_vectors = true;
2569                break;
2570             }
2571          }
2572       }
2573    }
2574 
2575    struct ir3_pressure max_pressure;
2576    ir3_calc_pressure(v, live, &max_pressure);
2577    d("max pressure:");
2578    d("\tfull: %u", max_pressure.full);
2579    d("\thalf: %u", max_pressure.half);
2580    d("\tshared: %u", max_pressure.shared);
2581 
2582    struct ir3_pressure limit_pressure;
2583    limit_pressure.full = RA_FULL_SIZE;
2584    limit_pressure.half = RA_HALF_SIZE;
2585    limit_pressure.shared = RA_SHARED_SIZE;
2586 
2587    if (gl_shader_stage_is_compute(v->type) && v->has_barrier) {
2588       calc_limit_pressure_for_cs_with_barrier(v, &limit_pressure);
2589    }
2590 
2591    /* If the user forces a doubled threadsize, we may have to lower the limit
2592     * because on some gens the register file is not big enough to hold a
2593     * double-size wave with all 48 registers in use.
2594     */
2595    if (v->shader_options.real_wavesize == IR3_DOUBLE_ONLY) {
2596       limit_pressure.full =
2597          MAX2(limit_pressure.full, ctx->compiler->reg_size_vec4 / 2 * 16);
2598    }
2599 
2600    /* If requested, lower the limit so that spilling happens more often. */
2601    if (ir3_shader_debug & IR3_DBG_SPILLALL)
2602       calc_min_limit_pressure(v, live, &limit_pressure);
2603 
2604    if (max_pressure.shared > limit_pressure.shared || has_shared_vectors) {
2605       ir3_ra_shared(v, live);
2606 
2607       /* Recalculate liveness and register pressure now that additional values
2608        * have been added.
2609        */
2610       ralloc_free(live);
2611       live = ir3_calc_liveness(ctx, v->ir);
2612       ir3_calc_pressure(v, live, &max_pressure);
2613 
2614       ir3_debug_print(v->ir, "AFTER: shared register allocation");
2615    }
2616 
2617    bool spilled = false;
2618    if (max_pressure.full > limit_pressure.full ||
2619        max_pressure.half > limit_pressure.half) {
2620       if (!v->compiler->has_pvtmem) {
2621          d("max pressure exceeded!");
2622          goto fail;
2623       }
2624       d("max pressure exceeded, spilling!");
2625       IR3_PASS(v->ir, ir3_spill, v, &live, &limit_pressure);
2626       ir3_calc_pressure(v, live, &max_pressure);
2627       assert(max_pressure.full <= limit_pressure.full &&
2628              max_pressure.half <= limit_pressure.half);
2629       spilled = true;
2630    }
2631 
2632    ctx->live = live;
2633    ctx->intervals =
2634       rzalloc_array(ctx, struct ra_interval, live->definitions_count);
2635    ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count);
2636 
2637    ctx->full.size = calc_target_full_pressure(v, max_pressure.full);
2638    d("full size: %u", ctx->full.size);
2639 
2640    if (!v->mergedregs)
2641       ctx->half.size = RA_HALF_SIZE;
2642 
2643    ctx->shared.size = RA_SHARED_SIZE;
2644 
2645    ctx->full.start = ctx->half.start = ctx->shared.start = 0;
2646 
2647    foreach_block (block, &v->ir->block_list)
2648       handle_block(ctx, block);
2649 
2650    ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count, false);
2651 
2652    /* Strip array-ness and SSA-ness at the end, because various helpers still
2653     * need to work even on definitions that have already been assigned. For
2654     * example, we need to preserve array-ness so that array live-ins have the
2655     * right size.
2656     */
2657    foreach_block (block, &v->ir->block_list) {
2658       foreach_instr (instr, &block->instr_list) {
2659          for (unsigned i = 0; i < instr->dsts_count; i++) {
2660             instr->dsts[i]->flags &= ~IR3_REG_SSA;
2661 
2662             /* Parallel copies of array registers copy the whole register, and
2663              * we need some way to let the parallel copy code know that this was
2664              * an array whose size is determined by reg->size. So keep the array
2665              * flag on those. spill/reload also need to work on the entire
2666              * array.
2667              */
2668             if (!is_meta(instr) && instr->opc != OPC_RELOAD_MACRO)
2669                instr->dsts[i]->flags &= ~IR3_REG_ARRAY;
2670          }
2671 
2672          for (unsigned i = 0; i < instr->srcs_count; i++) {
2673             instr->srcs[i]->flags &= ~IR3_REG_SSA;
2674 
2675             if (!is_meta(instr) && instr->opc != OPC_SPILL_MACRO)
2676                instr->srcs[i]->flags &= ~IR3_REG_ARRAY;
2677          }
2678       }
2679    }
2680 
2681    ir3_debug_print(v->ir, "AFTER: register allocation");
2682 
2683    if (spilled) {
2684       IR3_PASS(v->ir, ir3_lower_spill);
2685    }
2686 
2687    ir3_lower_copies(v);
2688 
2689    ir3_debug_print(v->ir, "AFTER: ir3_lower_copies");
2690 
2691    ralloc_free(ctx);
2692 
2693    return 0;
2694 fail:
2695    ralloc_free(ctx);
2696    return -1;
2697 }
2698