1 /*
2  * Copyright (C) 2021 Valve Corporation
3  * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "ir3_ra.h"
26 #include "util/rb_tree.h"
27 #include "util/u_math.h"
28 #include "ir3_shader.h"
29 
30 /* This file implements an SSA-based register allocator. Unlike other
31  * SSA-based allocators, it handles vector split/collect "smartly," meaning
32  * that multiple values may share the same register interval. From the
33  * perspective of the allocator itself, only the top-level intervals matter,
34  * and the allocator is only concerned with allocating top-level intervals,
35  * which may mean moving other top-level intervals around. Other intervals,
36  * like the destination of a split instruction or the source of a collect
37  * instruction, are "locked" to their parent interval. The details of this are
38  * mostly handled by ir3_merge_regs and ir3_reg_ctx.
39  *
40  * We currently don't do any backtracking, but we do use the merge sets as a
41  * form of affinity to try to avoid moves from phis/splits/collects. Each
42  * merge set is what a more "classic" graph-coloring or live-range based
43  * allocator would consider a single register, but here we use it as merely a
44  * hint, except when multiple overlapping values are live at the same time.
45  * Each merge set has a "preferred" register, and we try to honor that when
46  * allocating values in the merge set.
47  */
48 
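/* A minimal sketch of how a client is expected to drive the ir3_reg_ctx
 * machinery implemented below. It is illustrative only (hence #if 0); the
 * example_* names are hypothetical and not part of the driver. The real
 * client is ra_file further down in this file.
 */
#if 0
static void
example_track_value(struct ir3_reg_ctx *example_ctx,
                    struct ir3_reg_interval *example_interval,
                    struct ir3_register *reg)
{
   /* Bind the interval to reg's [interval_start, interval_end) range, which
    * was computed by ir3_merge_regs.
    */
   ir3_reg_interval_init(example_interval, reg);

   /* Insert it. If it lands inside an existing top-level interval it becomes
    * a child of that interval; otherwise the context's interval_add()
    * callback fires and it becomes a new top-level interval.
    */
   ir3_reg_interval_insert(example_ctx, example_interval);

   /* ... allocation work ... */

   /* Removing a top-level interval re-parents its children to the top level,
    * notifying the context through interval_readd().
    */
   ir3_reg_interval_remove(example_ctx, example_interval);
}
#endif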
49 /* ir3_reg_ctx implementation. */
50 
51 static int
52 ir3_reg_interval_cmp(const struct rb_node *node, const void *data)
53 {
54    unsigned reg = *(const unsigned *)data;
55    const struct ir3_reg_interval *interval =
56       ir3_rb_node_to_interval_const(node);
57    if (interval->reg->interval_start > reg)
58       return -1;
59    else if (interval->reg->interval_end <= reg)
60       return 1;
61    else
62       return 0;
63 }
64 
65 static struct ir3_reg_interval *
66 ir3_reg_interval_search(struct rb_tree *tree, unsigned offset)
67 {
68    struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp);
69    return node ? ir3_rb_node_to_interval(node) : NULL;
70 }
71 
72 static struct ir3_reg_interval *
73 ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset)
74 {
75    struct rb_node *node =
76       rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp);
77    return node ? ir3_rb_node_to_interval(node) : NULL;
78 }
79 
80 /* Get the interval covering the reg, or the closest to the right if it
81  * doesn't exist.
82  */
83 static struct ir3_reg_interval *
84 ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset)
85 {
86    struct ir3_reg_interval *interval =
87       ir3_reg_interval_search_sloppy(tree, offset);
88    if (!interval) {
89       return NULL;
90    } else if (interval->reg->interval_end > offset) {
91       return interval;
92    } else {
93       /* There is no interval covering reg, and ir3_reg_interval_search_sloppy()
94        * returned the closest range to the left, so the next interval to the
95        * right should be the closest to the right.
96        */
97       return ir3_reg_interval_next_or_null(interval);
98    }
99 }
100 
101 static int
102 ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
103 {
104    const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a);
105    const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b);
106    return b->reg->interval_start - a->reg->interval_start;
107 }
108 
109 static void
110 interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree,
111                 struct ir3_reg_interval *interval)
112 {
113    struct ir3_reg_interval *right =
114       ir3_reg_interval_search_right(tree, interval->reg->interval_start);
115    if (right && right->reg->interval_start < interval->reg->interval_end) {
116       /* We disallow trees where different members have different half-ness.
117        * This means that we can't treat bitcasts as copies like normal
118        * split/collect, so something like this would require an extra copy
119        * in mergedregs mode, and count as 4 half-units of register pressure
120        * instead of 2:
121        *
122        * f16vec2 foo = unpackFloat2x16(bar)
123        * ... = foo.x
124        * ... = bar
125        *
126        * However, relaxing this rule would open a huge can of worms. What
127        * happens when there's a vector of 16 things, and the fifth element
128        * has been bitcasted as a half-reg? Would that element alone have to
129        * be small enough to be used as a half-reg source? Let's keep that
130        * can of worms firmly shut for now.
131        */
132       assert((interval->reg->flags & IR3_REG_HALF) ==
133              (right->reg->flags & IR3_REG_HALF));
134 
135       if (right->reg->interval_end <= interval->reg->interval_end &&
136           right->reg->interval_start >= interval->reg->interval_start) {
137          /* Check if we're inserting something that's already inserted */
138          assert(interval != right);
139 
140          /* "right" is contained in "interval" and must become a child of
141           * it. There may be further children too.
142           */
143          for (struct ir3_reg_interval *next = ir3_reg_interval_next(right);
144               right && right->reg->interval_start < interval->reg->interval_end;
145               right = next, next = ir3_reg_interval_next_or_null(next)) {
146             /* "right" must be contained in "interval." */
147             assert(right->reg->interval_end <= interval->reg->interval_end);
148             assert((interval->reg->flags & IR3_REG_HALF) ==
149                    (right->reg->flags & IR3_REG_HALF));
150             if (!right->parent)
151                ctx->interval_delete(ctx, right);
152             right->parent = interval;
153             rb_tree_remove(tree, &right->node);
154             rb_tree_insert(&interval->children, &right->node,
155                            ir3_reg_interval_insert_cmp);
156          }
157       } else {
158          /* "right" must contain "interval," since intervals must form a
159           * tree.
160           */
161          assert(right->reg->interval_start <= interval->reg->interval_start);
162          interval->parent = right;
163          interval_insert(ctx, &right->children, interval);
164          return;
165       }
166    }
167 
168    if (!interval->parent)
169       ctx->interval_add(ctx, interval);
170    rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp);
171    interval->inserted = true;
172 }
173 
174 void
175 ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,
176                         struct ir3_reg_interval *interval)
177 {
178    rb_tree_init(&interval->children);
179    interval->parent = NULL;
180    interval_insert(ctx, &ctx->intervals, interval);
181 }
182 
183 /* Call after ir3_reg_interval_remove_temp() to reinsert the interval */
184 static void
185 ir3_reg_interval_reinsert(struct ir3_reg_ctx *ctx,
186                           struct ir3_reg_interval *interval)
187 {
188    interval->parent = NULL;
189    interval_insert(ctx, &ctx->intervals, interval);
190 }
191 
192 void
193 ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
194                         struct ir3_reg_interval *interval)
195 {
196    if (interval->parent) {
197       rb_tree_remove(&interval->parent->children, &interval->node);
198    } else {
199       ctx->interval_delete(ctx, interval);
200       rb_tree_remove(&ctx->intervals, &interval->node);
201    }
202 
203    rb_tree_foreach_safe (struct ir3_reg_interval, child, &interval->children,
204                          node) {
205       rb_tree_remove(&interval->children, &child->node);
206       child->parent = interval->parent;
207 
208       if (interval->parent) {
209          rb_tree_insert(&child->parent->children, &child->node,
210                         ir3_reg_interval_insert_cmp);
211       } else {
212          ctx->interval_readd(ctx, interval, child);
213          rb_tree_insert(&ctx->intervals, &child->node,
214                         ir3_reg_interval_insert_cmp);
215       }
216    }
217 
218    interval->inserted = false;
219 }
220 
221 static void
222 _mark_free(struct ir3_reg_interval *interval)
223 {
224    interval->inserted = false;
225    rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
226       _mark_free(child);
227    }
228 }
229 
230 /* Remove an interval and all its children from the tree. */
231 void
232 ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,
233                             struct ir3_reg_interval *interval)
234 {
235    assert(!interval->parent);
236 
237    ctx->interval_delete(ctx, interval);
238    rb_tree_remove(&ctx->intervals, &interval->node);
239    _mark_free(interval);
240 }
241 
242 /* Used when popping an interval that is about to be shuffled around. The
243  * children are left in place so that the interval can be reinserted later.
244  */
245 static void
246 ir3_reg_interval_remove_temp(struct ir3_reg_ctx *ctx,
247                              struct ir3_reg_interval *interval)
248 {
249    assert(!interval->parent);
250 
251    ctx->interval_delete(ctx, interval);
252    rb_tree_remove(&ctx->intervals, &interval->node);
253 }
254 
255 static void
256 interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval,
257               unsigned indent)
258 {
259    for (unsigned i = 0; i < indent; i++)
260       mesa_log_stream_printf(stream, "\t");
261    mesa_log_stream_printf(stream, "reg %u start %u\n", interval->reg->name,
262                           interval->reg->interval_start);
263 
264    rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
265       interval_dump(stream, child, indent + 1);
266    }
267 
268    for (unsigned i = 0; i < indent; i++)
269       mesa_log_stream_printf(stream, "\t");
270    mesa_log_stream_printf(stream, "reg %u end %u\n", interval->reg->name,
271                           interval->reg->interval_end);
272 }
273 
274 void
275 ir3_reg_interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval)
276 {
277    interval_dump(stream, interval, 0);
278 }
279 
280 /* These are the core data structures used by the register allocator. First are
281  * ra_interval and ra_file, which are used for intra-block tracking and build on
282  * the ir3_reg_ctx infrastructure:
283  */
284 
285 struct ra_interval {
286    struct ir3_reg_interval interval;
287 
288    struct rb_node physreg_node;
289    physreg_t physreg_start, physreg_end;
290 
291    /* True if this is a source of the current instruction which is entirely
292     * killed. This means we can allocate the dest over it, but we can't break
293     * it up.
294     */
295    bool is_killed;
296 
297    /* True if this interval cannot be moved from its position. This is only
298     * used for precolored inputs to ensure that other inputs don't get
299     * allocated on top of them.
300     */
301    bool frozen;
302 };
303 
304 struct ra_file {
305    struct ir3_reg_ctx reg_ctx;
306 
307    BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
308    BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
309 
310    struct rb_tree physreg_intervals;
311 
312    unsigned size;
313    unsigned start;
314 };
315 
316 /* State for inter-block tracking. When we split a live range to make space
317  * for a vector, we may need to insert fixup code when a block has multiple
318  * predecessors that have moved the same live value to different registers.
319  * This keeps track of state required to do that.
320  */
321 
322 struct ra_block_state {
323    /* Map of defining ir3_register -> physreg it was allocated to at the end
324     * of the block.
325     */
326    struct hash_table *renames;
327 
328    /* For loops, we need to process a block before all its predecessors have
329     * been processed. In particular, we need to pick registers for values
330     * without knowing if all the predecessors have been renamed. This keeps
331     * track of the registers we chose so that when we visit the back-edge we
332     * can move them appropriately. If all predecessors have been visited
333     * before this block is visited then we don't need to fill this out. This
334     * is a map from ir3_register -> physreg.
335     */
336    struct hash_table *entry_regs;
337 
338    /* True if the block has been visited and "renames" is complete.
339     */
340    bool visited;
341 };
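/* A sketch of how the "renames" map above might be filled in and consulted,
 * using the pointer-keyed hash table helpers from util/hash_table.h. The
 * example_* helpers are hypothetical and this exact usage is an assumption
 * for illustration, not the driver's code.
 */
#if 0
static void
example_record_rename(struct ra_block_state *state, struct ir3_register *def,
                      physreg_t physreg)
{
   /* Keys are defining registers; the physreg is stuffed into the data
    * pointer as a small integer.
    */
   _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg);
}

static physreg_t
example_lookup_rename(struct ra_block_state *state, struct ir3_register *def,
                      physreg_t fallback)
{
   struct hash_entry *entry = _mesa_hash_table_search(state->renames, def);
   return entry ? (physreg_t)(uintptr_t)entry->data : fallback;
}
#endif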
342 
343 struct ra_parallel_copy {
344    struct ra_interval *interval;
345    physreg_t src;
346 };
347 
348 /* The main context: */
349 
350 struct ra_ctx {
351    /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom
352     * half of this file too.
353     */
354    struct ra_file full;
355 
356    /* hr0.x - hr63.w, only used without merged-regs. */
357    struct ra_file half;
358 
359    /* Shared regs. */
360    struct ra_file shared;
361 
362    struct ir3_liveness *live;
363 
364    struct ir3_block *block;
365 
366    const struct ir3_compiler *compiler;
367    gl_shader_stage stage;
368 
369    /* Pending moves of top-level intervals that will be emitted once we're
370     * finished:
371     */
372    DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies);
373 
374    struct ra_interval *intervals;
375    struct ra_block_state *blocks;
376 
377    bool merged_regs;
378 };
379 
380 #define foreach_interval(interval, file)                                       \
381    rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals,  \
382                     physreg_node)
383 #define foreach_interval_rev(interval, file)                                   \
384    rb_tree_foreach_rev (struct ra_interval, interval, &(file)->physreg_intervals, \
385                     physreg_node)
386 #define foreach_interval_safe(interval, file)                                  \
387    rb_tree_foreach_safe (struct ra_interval, interval,                         \
388                          &(file)->physreg_intervals, physreg_node)
389 #define foreach_interval_rev_safe(interval, file)                              \
390    rb_tree_foreach_rev_safe(struct ra_interval, interval,                      \
391                             &(file)->physreg_intervals, physreg_node)
392 
393 static struct ra_interval *
394 rb_node_to_interval(struct rb_node *node)
395 {
396    return rb_node_data(struct ra_interval, node, physreg_node);
397 }
398 
399 static const struct ra_interval *
400 rb_node_to_interval_const(const struct rb_node *node)
401 {
402    return rb_node_data(struct ra_interval, node, physreg_node);
403 }
404 
405 static struct ra_interval *
406 ra_interval_next(struct ra_interval *interval)
407 {
408    struct rb_node *next = rb_node_next(&interval->physreg_node);
409    return next ? rb_node_to_interval(next) : NULL;
410 }
411 
412 static struct ra_interval *
413 ra_interval_next_or_null(struct ra_interval *interval)
414 {
415    return interval ? ra_interval_next(interval) : NULL;
416 }
417 
418 static int
419 ra_interval_cmp(const struct rb_node *node, const void *data)
420 {
421    physreg_t reg = *(const physreg_t *)data;
422    const struct ra_interval *interval = rb_node_to_interval_const(node);
423    if (interval->physreg_start > reg)
424       return -1;
425    else if (interval->physreg_end <= reg)
426       return 1;
427    else
428       return 0;
429 }
430 
431 static struct ra_interval *
432 ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
433 {
434    struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
435    return node ? rb_node_to_interval(node) : NULL;
436 }
437 
438 /* Get the interval covering the reg, or the closest to the right if it
439  * doesn't exist.
440  */
441 static struct ra_interval *
442 ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
443 {
444    struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
445    if (!interval) {
446       return NULL;
447    } else if (interval->physreg_end > reg) {
448       return interval;
449    } else {
450       /* There is no interval covering reg, and ra_interval_search_sloppy()
451        * returned the closest range to the left, so the next interval to the
452        * right should be the closest to the right.
453        */
454       return ra_interval_next_or_null(interval);
455    }
456 }
457 
458 static struct ra_interval *
459 ra_file_search_right(struct ra_file *file, physreg_t reg)
460 {
461    return ra_interval_search_right(&file->physreg_intervals, reg);
462 }
463 
464 static int
465 ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
466 {
467    const struct ra_interval *a = rb_node_to_interval_const(_a);
468    const struct ra_interval *b = rb_node_to_interval_const(_b);
469    return b->physreg_start - a->physreg_start;
470 }
471 
472 static struct ra_interval *
473 ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
474 {
475    return rb_node_data(struct ra_interval, interval, interval);
476 }
477 
478 static struct ra_file *
479 ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx)
480 {
481    return rb_node_data(struct ra_file, ctx, reg_ctx);
482 }
483 
484 static void
485 interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
486 {
487    struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
488    struct ra_file *file = ir3_reg_ctx_to_file(ctx);
489 
490    /* We can assume in this case that physreg_start/physreg_end is already
491     * initialized.
492     */
493    for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
494       BITSET_CLEAR(file->available, i);
495       BITSET_CLEAR(file->available_to_evict, i);
496    }
497 
498    rb_tree_insert(&file->physreg_intervals, &interval->physreg_node,
499                   ra_interval_insert_cmp);
500 }
501 
502 static void
503 interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
504 {
505    struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
506    struct ra_file *file = ir3_reg_ctx_to_file(ctx);
507 
508    for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
509       BITSET_SET(file->available, i);
510       BITSET_SET(file->available_to_evict, i);
511    }
512 
513    rb_tree_remove(&file->physreg_intervals, &interval->physreg_node);
514 }
515 
516 static void
517 interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
518                struct ir3_reg_interval *_child)
519 {
520    struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
521    struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);
522 
523    child->physreg_start =
524       parent->physreg_start + (child->interval.reg->interval_start -
525                                parent->interval.reg->interval_start);
526    child->physreg_end =
527       child->physreg_start +
528       (child->interval.reg->interval_end - child->interval.reg->interval_start);
529 
530    interval_add(ctx, _child);
531 }
532 
533 static void
534 ra_file_init(struct ra_file *file)
535 {
536    for (unsigned i = 0; i < file->size; i++) {
537       BITSET_SET(file->available, i);
538       BITSET_SET(file->available_to_evict, i);
539    }
540 
541    rb_tree_init(&file->reg_ctx.intervals);
542    rb_tree_init(&file->physreg_intervals);
543 
544    file->reg_ctx.interval_add = interval_add;
545    file->reg_ctx.interval_delete = interval_delete;
546    file->reg_ctx.interval_readd = interval_readd;
547 }
548 
549 static void
550 ra_file_insert(struct ra_file *file, struct ra_interval *interval)
551 {
552    assert(interval->physreg_start < interval->physreg_end);
553    assert(interval->physreg_end <= file->size);
554    if (interval->interval.reg->flags & IR3_REG_HALF)
555       assert(interval->physreg_end <= RA_HALF_SIZE);
556 
557    ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
558 }
559 
560 static void
561 ra_file_remove(struct ra_file *file, struct ra_interval *interval)
562 {
563    ir3_reg_interval_remove(&file->reg_ctx, &interval->interval);
564 }
565 
566 static void
567 ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval)
568 {
569    assert(!interval->interval.parent);
570 
571    for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
572       BITSET_SET(file->available, i);
573    }
574 
575    interval->is_killed = true;
576 }
577 
578 static void
579 ra_file_unmark_killed(struct ra_file *file, struct ra_interval *interval)
580 {
581    assert(!interval->interval.parent);
582 
583    for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
584       BITSET_CLEAR(file->available, i);
585    }
586 
587    interval->is_killed = false;
588 }
589 
590 static physreg_t
591 ra_interval_get_physreg(const struct ra_interval *interval)
592 {
593    unsigned child_start = interval->interval.reg->interval_start;
594 
595    while (interval->interval.parent) {
596       interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
597    }
598 
599    return interval->physreg_start +
600           (child_start - interval->interval.reg->interval_start);
601 }
602 
603 static unsigned
604 ra_interval_get_num(const struct ra_interval *interval)
605 {
606    return ra_physreg_to_num(ra_interval_get_physreg(interval),
607                             interval->interval.reg->flags);
608 }
609 
610 static void
611 ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
612 {
613    ir3_reg_interval_init(&interval->interval, reg);
614    interval->is_killed = false;
615    interval->frozen = false;
616 }
617 
618 static void
619 ra_interval_dump(struct log_stream *stream, struct ra_interval *interval)
620 {
621    mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start);
622 
623    ir3_reg_interval_dump(stream, &interval->interval);
624 }
625 
626 static void
627 ra_file_dump(struct log_stream *stream, struct ra_file *file)
628 {
629    rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
630                     physreg_node) {
631       ra_interval_dump(stream, interval);
632    }
633 
634    unsigned start, end;
635    mesa_log_stream_printf(stream, "available:\n");
636    BITSET_FOREACH_RANGE (start, end, file->available, file->size) {
637       mesa_log_stream_printf(stream, "%u-%u ", start, end);
638    }
639    mesa_log_stream_printf(stream, "\n");
640 
641    mesa_log_stream_printf(stream, "available to evict:\n");
642    BITSET_FOREACH_RANGE (start, end, file->available_to_evict, file->size) {
643       mesa_log_stream_printf(stream, "%u-%u ", start, end);
644    }
645    mesa_log_stream_printf(stream, "\n");
646    mesa_log_stream_printf(stream, "start: %u\n", file->start);
647 }
648 
649 static void
650 ra_ctx_dump(struct ra_ctx *ctx)
651 {
652    struct log_stream *stream = mesa_log_streami();
653    mesa_log_stream_printf(stream, "full:\n");
654    ra_file_dump(stream, &ctx->full);
655    mesa_log_stream_printf(stream, "half:\n");
656    ra_file_dump(stream, &ctx->half);
657    mesa_log_stream_printf(stream, "shared:\n");
658    ra_file_dump(stream, &ctx->shared);
659    mesa_log_stream_destroy(stream);
660 }
661 
662 static unsigned
663 reg_file_size(struct ra_file *file, struct ir3_register *reg)
664 {
665    /* Half-regs can only take up the first half of the combined regfile */
666    if (reg->flags & IR3_REG_HALF)
667       return MIN2(file->size, RA_HALF_SIZE);
668    else
669       return file->size;
670 }
671 
672 /* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple
673  * top-level intervals at once. Pop multiple intervals, then push them back in
674  * any order.
675  */
676 
677 struct ra_removed_interval {
678    struct ra_interval *interval;
679    unsigned size;
680 };
681 
682 static struct ra_removed_interval
683 ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,
684                 struct ra_interval *interval)
685 {
686    assert(!interval->interval.parent);
687 
688    /* Check if we've already moved this reg before */
689    unsigned pcopy_index;
690    for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count;
691         pcopy_index++) {
692       if (ctx->parallel_copies[pcopy_index].interval == interval)
693          break;
694    }
695 
696    if (pcopy_index == ctx->parallel_copies_count) {
697       array_insert(ctx, ctx->parallel_copies,
698                    (struct ra_parallel_copy){
699                       .interval = interval,
700                       .src = interval->physreg_start,
701                    });
702    }
703 
704    ir3_reg_interval_remove_temp(&file->reg_ctx, &interval->interval);
705 
706    return (struct ra_removed_interval){
707       .interval = interval,
708       .size = interval->physreg_end - interval->physreg_start,
709    };
710 }
711 
712 static void
713 ra_push_interval(struct ra_ctx *ctx, struct ra_file *file,
714                  const struct ra_removed_interval *removed, physreg_t dst)
715 {
716    struct ra_interval *interval = removed->interval;
717 
718    interval->physreg_start = dst;
719    interval->physreg_end = dst + removed->size;
720 
721    ir3_reg_interval_reinsert(&file->reg_ctx, &interval->interval);
722 }
723 
724 /* Pick up the interval and place it at "dst". */
725 static void
726 ra_move_interval(struct ra_ctx *ctx, struct ra_file *file,
727                  struct ra_interval *interval, physreg_t dst)
728 {
729    struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval);
730    ra_push_interval(ctx, file, &temp, dst);
731 }
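/* An illustrative sketch of the pop/push pattern (the example_* helper is
 * hypothetical): because ra_pop_interval() records the original source in the
 * parallel-copy list and ra_push_interval() only sets the destination, two
 * intervals can be swapped simply by popping both before pushing either,
 * exactly as try_evict_regs() does below.
 */
#if 0
static void
example_swap_intervals(struct ra_ctx *ctx, struct ra_file *file,
                       struct ra_interval *a, struct ra_interval *b)
{
   physreg_t a_start = a->physreg_start, b_start = b->physreg_start;
   struct ra_removed_interval removed_a = ra_pop_interval(ctx, file, a);
   struct ra_removed_interval removed_b = ra_pop_interval(ctx, file, b);
   ra_push_interval(ctx, file, &removed_a, b_start);
   ra_push_interval(ctx, file, &removed_b, a_start);
}
#endif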
732 
733 static bool
734 get_reg_specified(struct ra_file *file, struct ir3_register *reg,
735                   physreg_t physreg, bool is_source)
736 {
737    for (unsigned i = 0; i < reg_size(reg); i++) {
738       if (!BITSET_TEST(is_source ? file->available_to_evict : file->available,
739                        physreg + i))
740          return false;
741    }
742 
743    return true;
744 }
745 
746 /* Try to evict any registers conflicting with the proposed spot "physreg" for
747  * "reg". That is, move them to other places so that we can allocate "physreg"
748  * here.
749  */
750 
751 static bool
752 try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
753                struct ir3_register *reg, physreg_t physreg,
754                unsigned *_eviction_count, bool is_source, bool speculative)
755 {
756    BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
757    memcpy(available_to_evict, file->available_to_evict,
758           sizeof(available_to_evict));
759 
760    BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
761    memcpy(available, file->available, sizeof(available));
762 
763    for (unsigned i = 0; i < reg_size(reg); i++) {
764       BITSET_CLEAR(available_to_evict, physreg + i);
765       BITSET_CLEAR(available, physreg + i);
766    }
767 
768    unsigned eviction_count = 0;
769    /* Iterate over each range conflicting with physreg */
770    for (struct ra_interval *conflicting = ra_file_search_right(file, physreg),
771                            *next = ra_interval_next_or_null(conflicting);
772         conflicting != NULL &&
773         conflicting->physreg_start < physreg + reg_size(reg);
774         conflicting = next, next = ra_interval_next_or_null(next)) {
775       if (!is_source && conflicting->is_killed)
776          continue;
777 
778       if (conflicting->frozen) {
779          assert(speculative);
780          return false;
781       }
782 
783       unsigned conflicting_file_size =
784          reg_file_size(file, conflicting->interval.reg);
785       unsigned avail_start, avail_end;
786       bool evicted = false;
787       BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict,
788                             conflicting_file_size) {
789          unsigned size = avail_end - avail_start;
790 
791          /* non-half registers must be aligned */
792          if (!(conflicting->interval.reg->flags & IR3_REG_HALF) &&
793              avail_start % 2 == 1) {
794             avail_start++;
795             size--;
796          }
797 
798          if (size >= conflicting->physreg_end - conflicting->physreg_start) {
799             for (unsigned i = 0;
800                  i < conflicting->physreg_end - conflicting->physreg_start; i++)
801                BITSET_CLEAR(available_to_evict, avail_start + i);
802             eviction_count +=
803                conflicting->physreg_end - conflicting->physreg_start;
804             if (!speculative)
805                ra_move_interval(ctx, file, conflicting, avail_start);
806             evicted = true;
807             break;
808          }
809       }
810 
811       if (evicted)
812          continue;
813 
814       /* If we couldn't evict this range, we may be able to swap it with a
815        * killed range to achieve the same effect.
816        */
817       foreach_interval (killed, file) {
818          if (!killed->is_killed)
819             continue;
820 
821          if (killed->physreg_end - killed->physreg_start !=
822              conflicting->physreg_end - conflicting->physreg_start)
823             continue;
824 
825          if (killed->physreg_end > conflicting_file_size ||
826              conflicting->physreg_end > reg_file_size(file, killed->interval.reg))
827             continue;
828 
829          /* We can't swap the killed range if it partially/fully overlaps the
830           * space we're trying to allocate or (in speculative mode) if it's
831           * already been swapped and will overlap when we actually evict.
832           */
833          bool killed_available = true;
834          for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
835             if (!BITSET_TEST(available, i)) {
836                killed_available = false;
837                break;
838             }
839          }
840 
841          if (!killed_available)
842             continue;
843 
844          /* Check for alignment if one is a full reg */
845          if ((!(killed->interval.reg->flags & IR3_REG_HALF) ||
846               !(conflicting->interval.reg->flags & IR3_REG_HALF)) &&
847              (killed->physreg_start % 2 != 0 ||
848               conflicting->physreg_start % 2 != 0))
849             continue;
850 
851          for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
852             BITSET_CLEAR(available, i);
853          }
854          /* Because this will generate swaps instead of moves, multiply the
855           * cost by 2.
856           */
857          eviction_count += (killed->physreg_end - killed->physreg_start) * 2;
858          if (!speculative) {
859             physreg_t killed_start = killed->physreg_start,
860                       conflicting_start = conflicting->physreg_start;
861             struct ra_removed_interval killed_removed =
862                ra_pop_interval(ctx, file, killed);
863             struct ra_removed_interval conflicting_removed =
864                ra_pop_interval(ctx, file, conflicting);
865             ra_push_interval(ctx, file, &killed_removed, conflicting_start);
866             ra_push_interval(ctx, file, &conflicting_removed, killed_start);
867          }
868 
869          evicted = true;
870          break;
871       }
872 
873       if (!evicted)
874          return false;
875    }
876 
877    *_eviction_count = eviction_count;
878    return true;
879 }
880 
881 static int
882 removed_interval_cmp(const void *_i1, const void *_i2)
883 {
884    const struct ra_removed_interval *i1 = _i1;
885    const struct ra_removed_interval *i2 = _i2;
886 
887    /* We sort the registers as follows:
888     *
889     * |--------------------------------------------------------------------|
890     * |                    |             |             |                   |
891     * |  Half live-through | Half killed | Full killed | Full live-through |
892     * |                    |             |             |                   |
893     * |--------------------------------------------------------------------|
894     *                        |                 |
895     *                        |   Destination   |
896     *                        |                 |
897     *                        |-----------------|
898     *
899     * Half-registers have to be first so that they stay in the low half of
900     * the register file. Then half and full killed must stay together so that
901     * there's a contiguous range where we can put the register. With this
902     * structure we should be able to accommodate any collection of intervals
903     * such that the total number of half components is within the half limit
904     * and the combined components are within the full limit.
905     */
906 
907    unsigned i1_align = reg_elem_size(i1->interval->interval.reg);
908    unsigned i2_align = reg_elem_size(i2->interval->interval.reg);
909    if (i1_align > i2_align)
910       return 1;
911    if (i1_align < i2_align)
912       return -1;
913 
914    if (i1_align == 1) {
915       if (i2->interval->is_killed)
916          return -1;
917       if (i1->interval->is_killed)
918          return 1;
919    } else {
920       if (i2->interval->is_killed)
921          return 1;
922       if (i1->interval->is_killed)
923          return -1;
924    }
925 
926    return 0;
927 }
928 
929 /* "Compress" all the live intervals so that there is enough space for the
930  * destination register. As there can be gaps when a more-aligned interval
931  * follows a less-aligned interval, this also sorts them to remove such
932  * "padding", which may be required when space is very tight. This isn't a
933  * great algorithm, so it should only be used as a last resort when the
934  * register file is almost full and badly fragmented.
935  *
936  * Return the physreg to use.
937  */
938 static physreg_t
939 compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size,
940                    unsigned align, bool is_source)
941 {
942    DECLARE_ARRAY(struct ra_removed_interval, intervals);
943    intervals_count = intervals_sz = 0;
944    intervals = NULL;
945 
946    unsigned removed_size = 0, removed_half_size = 0;
947    unsigned file_size =
948       align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;
949    physreg_t start_reg = 0;
950 
951    foreach_interval_rev_safe (interval, file) {
952       /* Check if we can sort the intervals *after* this one and have enough
953        * space left over to accommodate "size" units. Also check that we have
954        * enough space left over for half-registers, if we're inserting a
955        * half-register (otherwise we only shift any half-registers down so they
956        * should be safe).
957        */
958       if (interval->physreg_end + size + removed_size <= file->size &&
959           (align != 1 ||
960            interval->physreg_end + size + removed_half_size <= file_size)) {
961          start_reg = interval->physreg_end;
962          break;
963       }
964 
965       /* We assume that all frozen intervals are at the start and that we
966        * can avoid popping them.
967        */
968       assert(!interval->frozen);
969 
970       /* Killed sources don't count because they go at the end and can
971        * overlap the register we're trying to add.
972        */
973       if (!interval->is_killed && !is_source) {
974          removed_size += interval->physreg_end - interval->physreg_start;
975          if (interval->interval.reg->flags & IR3_REG_HALF) {
976             removed_half_size += interval->physreg_end -
977                interval->physreg_start;
978          }
979       }
980 
981       /* Now that we've done the accounting, pop this off */
982       d("popping interval %u physreg %u\n", interval->interval.reg->name,
983         interval->physreg_start);
984       array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));
985    }
986 
987    /* TODO: In addition to skipping registers at the beginning that are
988     * well-packed, we should try to skip registers at the end.
989     */
990 
991    qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);
992 
993    physreg_t physreg = start_reg;
994    physreg_t ret_reg = (physreg_t)~0;
995    for (unsigned i = 0; i < intervals_count; i++) {
996       if (ret_reg == (physreg_t)~0 &&
997           ((intervals[i].interval->is_killed && !is_source) ||
998            !(intervals[i].interval->interval.reg->flags & IR3_REG_HALF))) {
999          ret_reg = ALIGN(physreg, align);
1000       }
1001 
1002       if (ret_reg != (physreg_t)~0 &&
1003           (is_source || !intervals[i].interval->is_killed)) {
1004          physreg = MAX2(physreg, ret_reg + size);
1005       }
1006 
1007       if (!(intervals[i].interval->interval.reg->flags & IR3_REG_HALF)) {
1008          physreg = ALIGN(physreg, 2);
1009       }
1010 
1011       if (physreg + intervals[i].size >
1012           reg_file_size(file, intervals[i].interval->interval.reg)) {
1013          d("ran out of room for interval %u!\n",
1014            intervals[i].interval->interval.reg->name);
1015          unreachable("reg pressure calculation was wrong!");
1016          return 0;
1017       }
1018 
1019       d("pushing interval %u physreg %u\n",
1020         intervals[i].interval->interval.reg->name, physreg);
1021       ra_push_interval(ctx, file, &intervals[i], physreg);
1022 
1023       physreg += intervals[i].size;
1024    }
1025 
1026    if (ret_reg == (physreg_t)~0)
1027       ret_reg = physreg;
1028 
1029    ret_reg = ALIGN(ret_reg, align);
1030    if (ret_reg + size > file_size) {
1031       d("ran out of room for the new interval!\n");
1032       unreachable("reg pressure calculation was wrong!");
1033       return 0;
1034    }
1035 
1036    return ret_reg;
1037 }
1038 
1039 static void
1040 update_affinity(struct ra_file *file, struct ir3_register *reg,
1041                 physreg_t physreg)
1042 {
1043    if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0)
1044       return;
1045 
1046    if (physreg < reg->merge_set_offset)
1047       return;
1048 
1049    if ((physreg - reg->merge_set_offset + reg->merge_set->size) > file->size)
1050       return;
1051 
1052    reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;
1053 }
1054 
1055 /* Try to find free space for a register without shuffling anything. This uses
1056  * a round-robin algorithm to reduce false dependencies.
1057  */
1058 static physreg_t
1059 find_best_gap(struct ra_file *file, unsigned file_size, unsigned size,
1060               unsigned align, bool is_source)
1061 {
1062    /* This can happen if we create a very large merge set. Just bail out in that
1063     * case.
1064     */
1065    if (size > file_size)
1066       return (physreg_t) ~0;
1067 
1068    BITSET_WORD *available =
1069       is_source ? file->available_to_evict : file->available;
1070 
1071    unsigned start = ALIGN(file->start, align) % (file_size - size + align);
1072    unsigned candidate = start;
1073    do {
1074       bool is_available = true;
1075       for (unsigned i = 0; i < size; i++) {
1076          if (!BITSET_TEST(available, candidate + i)) {
1077             is_available = false;
1078             break;
1079          }
1080       }
1081 
1082       if (is_available) {
1083          file->start = (candidate + size) % file_size;
1084          return candidate;
1085       }
1086 
1087       candidate += align;
1088       if (candidate + size > file_size)
1089          candidate = 0;
1090    } while (candidate != start);
1091 
1092    return (physreg_t)~0;
1093 }
1094 
1095 static struct ra_file *
1096 ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
1097 {
1098    if (reg->flags & IR3_REG_SHARED)
1099       return &ctx->shared;
1100    else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
1101       return &ctx->full;
1102    else
1103       return &ctx->half;
1104 }
1105 
1106 /* This is the main entrypoint for picking a register. Pick a free register
1107  * for "reg", shuffling around sources if necessary. In the normal case where
1108  * "is_source" is false, this register can overlap with killed sources
1109  * (intervals with "is_killed == true"). If "is_source" is true, then
1110  * is_killed is ignored and the register returned must not overlap with killed
1111  * sources. This must be used for tied registers, because we're actually
1112  * allocating the destination and the tied source at the same time.
1113  */
1114 
1115 static physreg_t
1116 get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg,
1117         bool is_source)
1118 {
1119    unsigned file_size = reg_file_size(file, reg);
1120    if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
1121       physreg_t preferred_reg =
1122          reg->merge_set->preferred_reg + reg->merge_set_offset;
1123       if (preferred_reg < file_size &&
1124           preferred_reg % reg_elem_size(reg) == 0 &&
1125           get_reg_specified(file, reg, preferred_reg, is_source))
1126          return preferred_reg;
1127    }
1128 
1129    /* If this register is a subset of a merge set which we have not picked a
1130     * register for, first try to allocate enough space for the entire merge
1131     * set.
1132     */
1133    unsigned size = reg_size(reg);
1134    if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
1135        size < reg->merge_set->size) {
1136       physreg_t best_reg = find_best_gap(file, file_size, reg->merge_set->size,
1137                                          reg->merge_set->alignment, is_source);
1138       if (best_reg != (physreg_t)~0u) {
1139          best_reg += reg->merge_set_offset;
1140          return best_reg;
1141       }
1142    }
1143 
1144    /* For ALU and SFU instructions, if a source register is available, use it,
1145     * because this doesn't introduce unnecessary dependencies and it
1146     * potentially avoids needing (ss) syncs for write-after-read hazards on
1147     * SFU instructions:
1148     */
1149    if (is_sfu(reg->instr) || is_alu(reg->instr)) {
1150       for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
1151          struct ir3_register *src = reg->instr->srcs[i];
1152          if (!ra_reg_is_src(src))
1153             continue;
1154          if (ra_get_file(ctx, src) == file && reg_size(src) >= size) {
1155             struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1156             physreg_t src_physreg = ra_interval_get_physreg(src_interval);
1157             if (src_physreg % reg_elem_size(reg) == 0 &&
1158                 src_physreg + size <= file_size &&
1159                 get_reg_specified(file, reg, src_physreg, is_source))
1160                return src_physreg;
1161          }
1162       }
1163    }
1164 
1165    physreg_t best_reg =
1166       find_best_gap(file, file_size, size, reg_elem_size(reg), is_source);
1167    if (best_reg != (physreg_t)~0u) {
1168       return best_reg;
1169    }
1170 
1171    /* Ok, we couldn't find anything that fits. Now we have to start moving
1172     * things around to make room. First, try just evicting the registers that
1173     * are in the way.
1174     */
1175    unsigned best_eviction_count = ~0;
1176    for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {
1177       unsigned eviction_count;
1178       if (try_evict_regs(ctx, file, reg, i, &eviction_count, is_source, true)) {
1179          if (eviction_count < best_eviction_count) {
1180             best_eviction_count = eviction_count;
1181             best_reg = i;
1182          }
1183       }
1184    }
1185 
1186    if (best_eviction_count != ~0) {
1187       ASSERTED bool result = try_evict_regs(
1188          ctx, file, reg, best_reg, &best_eviction_count, is_source, false);
1189       assert(result);
1190       return best_reg;
1191    }
1192 
1193    /* Use the dumb fallback only if try_evict_regs() fails. */
1194    return compress_regs_left(ctx, file, reg_size(reg), reg_elem_size(reg),
1195                              is_source);
1196 }
1197 
1198 static void
1199 assign_reg(struct ir3_instruction *instr, struct ir3_register *reg,
1200            unsigned num)
1201 {
1202    if (reg->flags & IR3_REG_ARRAY) {
1203       reg->array.base = num;
1204       if (reg->flags & IR3_REG_RELATIV)
1205          reg->array.offset += num;
1206       else
1207          reg->num = num + reg->array.offset;
1208    } else {
1209       reg->num = num;
1210    }
1211 }
1212 
1213 static void
1214 mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src)
1215 {
1216    struct ra_interval *interval = &ctx->intervals[src->def->name];
1217 
1218    if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed ||
1219        interval->interval.parent ||
1220        !rb_tree_is_empty(&interval->interval.children))
1221       return;
1222 
1223    ra_file_mark_killed(ra_get_file(ctx, src), interval);
1224 }
1225 
1226 static void
1227 insert_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1228 {
1229    struct ra_file *file = ra_get_file(ctx, dst);
1230    struct ra_interval *interval = &ctx->intervals[dst->name];
1231 
1232    d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval));
1233 
1234    if (!(dst->flags & IR3_REG_UNUSED))
1235       ra_file_insert(file, interval);
1236 
1237    assign_reg(dst->instr, dst, ra_interval_get_num(interval));
1238 }
1239 
1240 static void
1241 allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst,
1242                    physreg_t physreg)
1243 {
1244    struct ra_file *file = ra_get_file(ctx, dst);
1245    struct ra_interval *interval = &ctx->intervals[dst->name];
1246    update_affinity(file, dst, physreg);
1247 
1248    ra_interval_init(interval, dst);
1249    interval->physreg_start = physreg;
1250    interval->physreg_end = physreg + reg_size(dst);
1251 }
1252 
1253 static void
1254 allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1255 {
1256    struct ra_file *file = ra_get_file(ctx, dst);
1257 
1258    struct ir3_register *tied = dst->tied;
1259    if (tied) {
1260       struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1261       struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1262       physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
1263       if (tied_interval->is_killed) {
1264          /* The easy case: the source is killed, so we can just reuse it
1265           * for the destination.
1266           */
1267          allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));
1268       } else {
1269          /* The source is live-through, so we need to get a free register
1270           * (which is free for both the source and destination!), copy the
1271           * original source to it, then use that for the source and
1272           * destination.
1273           */
1274          physreg_t physreg = get_reg(ctx, file, dst, true);
1275          allocate_dst_fixed(ctx, dst, physreg);
1276          array_insert(ctx, ctx->parallel_copies,
1277                       (struct ra_parallel_copy){
1278                          .interval = dst_interval,
1279                          .src = tied_physreg,
1280                       });
1281       }
1282 
1283       return;
1284    }
1285 
1286    /* All the hard work is done by get_reg here. */
1287    physreg_t physreg = get_reg(ctx, file, dst, false);
1288 
1289    allocate_dst_fixed(ctx, dst, physreg);
1290 }
1291 
1292 static void
1293 assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
1294            struct ir3_register *src)
1295 {
1296    struct ra_interval *interval = &ctx->intervals[src->def->name];
1297    struct ra_file *file = ra_get_file(ctx, src);
1298 
1299    struct ir3_register *tied = src->tied;
1300    physreg_t physreg;
1301    if (tied) {
1302       struct ra_interval *tied_interval = &ctx->intervals[tied->name];
1303       physreg = ra_interval_get_physreg(tied_interval);
1304    } else {
1305       physreg = ra_interval_get_physreg(interval);
1306    }
1307 
1308    assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags));
1309 
1310    if (src->flags & IR3_REG_FIRST_KILL)
1311       ra_file_remove(file, interval);
1312 }
1313 
1314 /* Insert a parallel copy instruction before the instruction with the parallel
1315  * copy entries we've built up.
1316  */
1317 static void
1318 insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1319 {
1320    if (ctx->parallel_copies_count == 0)
1321       return;
1322 
1323    struct ir3_instruction *pcopy =
1324       ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY,
1325                        ctx->parallel_copies_count, ctx->parallel_copies_count);
1326 
1327    for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1328       struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1329       struct ir3_register *reg =
1330          ir3_dst_create(pcopy, INVALID_REG,
1331                         entry->interval->interval.reg->flags & ~IR3_REG_SSA);
1332       reg->size = entry->interval->interval.reg->size;
1333       reg->wrmask = entry->interval->interval.reg->wrmask;
1334       assign_reg(pcopy, reg, ra_interval_get_num(entry->interval));
1335    }
1336 
1337    for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1338       struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1339       struct ir3_register *reg =
1340          ir3_src_create(pcopy, INVALID_REG,
1341                         entry->interval->interval.reg->flags & ~IR3_REG_SSA);
1342       reg->size = entry->interval->interval.reg->size;
1343       reg->wrmask = entry->interval->interval.reg->wrmask;
1344       assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags));
1345    }
1346 
1347    list_del(&pcopy->node);
1348    list_addtail(&pcopy->node, &instr->node);
1349    ctx->parallel_copies_count = 0;
1350 }
1351 
1352 static void
1353 handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1354 {
1355    /* First, mark sources as going-to-be-killed while allocating the dest. */
1356    ra_foreach_src (src, instr) {
1357       mark_src_killed(ctx, src);
1358    }
1359 
1360    /* Allocate the destination. */
1361    ra_foreach_dst (dst, instr) {
1362       allocate_dst(ctx, dst);
1363    }
1364 
1365    /* Now handle sources. Go backward so that in case there are multiple
1366     * sources with the same def and that def is killed we only remove it at
1367     * the end.
1368     */
1369    ra_foreach_src_rev (src, instr) {
1370       assign_src(ctx, instr, src);
1371    }
1372 
1373    /* Now finally insert the destination into the map. */
1374    ra_foreach_dst (dst, instr) {
1375       insert_dst(ctx, dst);
1376    }
1377 
1378    insert_parallel_copy_instr(ctx, instr);
1379 }
1380 
1381 static void
1382 handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
1383 {
1384    struct ir3_register *dst = instr->dsts[0];
1385    struct ir3_register *src = instr->srcs[0];
1386 
1387    if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
1388       handle_normal_instr(ctx, instr);
1389       return;
1390    }
1391 
1392    struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1393 
1394    physreg_t physreg = ra_interval_get_physreg(src_interval);
1395    assign_src(ctx, instr, src);
1396 
1397    allocate_dst_fixed(
1398       ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset);
1399    insert_dst(ctx, dst);
1400 }
1401 
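/* Collects try to arrange, via the merge set, for their sources to already
 * sit at the right offsets within the destination, so that ideally the
 * collect itself doesn't have to move anything.
 */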
1402 static void
1403 handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
1404 {
1405    struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;
1406    unsigned dst_offset = instr->dsts[0]->merge_set_offset;
1407 
1408    if (!dst_set || dst_set->regs_count == 1) {
1409       handle_normal_instr(ctx, instr);
1410       return;
1411    }
1412 
1413    /* We need to check if any of the sources are contained in an interval
1414     * that is at least as large as the vector. In this case, we should put
1415     * the vector inside that larger interval. (There should be one
1416     * unambiguous place to put it, because values sharing the same merge set
1417     * should be allocated together.) This can happen in a case like:
1418     *
1419     * ssa_1 (wrmask=0xf) = ...
1420     * ssa_2 = split ssa_1 off:0
1421     * ssa_3 = split ssa_1 off:1
1422     * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3
1423     * ... = (kill)ssa_1
1424     * ... = (kill)ssa_4
1425     *
1426     * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it.
1427     */
1428    physreg_t dst_fixed = (physreg_t)~0u;
1429 
1430    ra_foreach_src (src, instr) {
1431       if (src->flags & IR3_REG_FIRST_KILL) {
1432          mark_src_killed(ctx, src);
1433       }
1434 
1435       struct ra_interval *interval = &ctx->intervals[src->def->name];
1436 
1437       if (src->def->merge_set != dst_set || interval->is_killed)
1438          continue;
1439       while (interval->interval.parent != NULL) {
1440          interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1441       }
1442       if (reg_size(interval->interval.reg) >= reg_size(instr->dsts[0])) {
1443          dst_fixed = interval->physreg_start -
1444                      interval->interval.reg->merge_set_offset + dst_offset;
1445       } else {
1446          /* For sources whose root interval is smaller than the
1447           * destination (i.e. the normal case), we will shuffle them
1448           * around after allocating the destination. Mark them killed so
1449           * that the destination can be allocated over them, even if they
1450           * aren't actually killed.
1451           */
1452          ra_file_mark_killed(ra_get_file(ctx, src), interval);
1453       }
1454    }
1455 
1456    if (dst_fixed != (physreg_t)~0u)
1457       allocate_dst_fixed(ctx, instr->dsts[0], dst_fixed);
1458    else
1459       allocate_dst(ctx, instr->dsts[0]);
1460 
1461    /* Remove the temporary is_killed we added */
1462    ra_foreach_src (src, instr) {
1463       struct ra_interval *interval = &ctx->intervals[src->def->name];
1464       while (interval->interval.parent != NULL) {
1465          interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1466       }
1467 
1468       /* Filter out cases where it actually should be killed */
1469       if (interval != &ctx->intervals[src->def->name] ||
1470           !(src->flags & IR3_REG_KILL)) {
1471          ra_file_unmark_killed(ra_get_file(ctx, src), interval);
1472       }
1473    }
1474 
1475    ra_foreach_src_rev (src, instr) {
1476       assign_src(ctx, instr, src);
1477    }
1478 
1479    /* We need to do this before insert_dst(), so that children of the
1480     * destination which got marked as killed and then shuffled around to make
1481     * space for the destination have the correct pcopy destination that
1482     * matches what we assign the source of the collect to in assign_src().
1483     *
1484     * TODO: In this case we'll wind up copying the value in the pcopy and
1485     * then again in the collect. We could avoid one of those by updating the
1486     * pcopy destination to match up with the final location of the source
1487     * after the collect and making the collect a no-op. However this doesn't
1488     * seem to happen often.
1489     */
1490    insert_parallel_copy_instr(ctx, instr);
1491 
1492    /* Note: insert_dst will automatically take care of any intervals contained
1493     * inside the collect's destination by making them children of the collect.
1494     */
1495 
1496    insert_dst(ctx, instr->dsts[0]);
1497 }
1498 
1499 /* Parallel copies before RA should only be at the end of the block, for
1500  * phis. For these we only need to fill in the sources, and then we fill in
1501  * the destinations in the successor block.
1502  */
1503 static void
1504 handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)
1505 {
1506    ra_foreach_src_rev (src, instr) {
1507       assign_src(ctx, instr, src);
1508    }
1509 }
1510 
1511 /* Some inputs may need to be precolored. We need to handle those first, so
1512  * that other non-precolored inputs don't accidentally get allocated over
1513  * them. Inputs are the very first thing in the shader, so it shouldn't be a
1514  * problem to allocate them to a specific physreg.
1515  */
1516 
1517 static void
1518 handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1519 {
1520    if (instr->dsts[0]->num == INVALID_REG)
1521       return;
1522 
1523    struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1524    physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]);
1525    allocate_dst_fixed(ctx, instr->dsts[0], physreg);
1526    insert_dst(ctx, instr->dsts[0]);
1527    interval->frozen = true;
1528 }
1529 
1530 static void
1531 handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1532 {
1533    if (instr->dsts[0]->num != INVALID_REG)
1534       return;
1535 
1536    allocate_dst(ctx, instr->dsts[0]);
1537 
1538    struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1539    struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1540    ra_file_insert(file, interval);
1541 }
1542 
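/* The actual register write for inputs is delayed until the main
 * instruction walk, after every live-in/phi/input has been allocated.
 * Precolored inputs already have their register and are just unfrozen here.
 */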
1543 static void
1544 assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1545 {
1546    struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1547    struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1548 
1549    if (instr->dsts[0]->num == INVALID_REG) {
1550       assign_reg(instr, instr->dsts[0], ra_interval_get_num(interval));
1551    } else {
1552       interval->frozen = false;
1553    }
1554 
1555    if (instr->dsts[0]->flags & IR3_REG_UNUSED)
1556       ra_file_remove(file, interval);
1557 
1558    ra_foreach_src_rev (src, instr)
1559       assign_src(ctx, instr, src);
1560 }
1561 
1562 /* chmask is a bit weird, because it has pre-colored sources due to the need
1563  * to pass some registers to the next stage. Fortunately there are at most
1564  * two, and there should be no other live values by the time we get to
1565  * this instruction, so we only have to do the minimum and don't need any
1566  * fancy fallbacks.
1567  *
1568  * TODO: Add more complete handling of precolored sources, e.g. for function
1569  * argument handling. We'd need a way to mark sources as fixed so that they
1570  * don't get moved around when placing other sources in the fallback case, and
1571  * a duplication of much of the logic in get_reg(). This also opens another
1572  * can of worms, e.g. what if the precolored source is a split of a vector
1573  * which is still live -- this breaks our assumption that splits don't incur
1574  * any "extra" register requirements and we'd have to break it out of the
1575  * parent ra_interval.
1576  */
1577 
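/* Move an already-allocated value into the register that a precolored
 * source demands, evicting anything that overlaps it.
 */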
1578 static void
1579 handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src)
1580 {
1581    struct ra_file *file = ra_get_file(ctx, src);
1582    struct ra_interval *interval = &ctx->intervals[src->def->name];
1583    physreg_t physreg = ra_reg_get_physreg(src);
1584 
1585    if (ra_interval_get_num(interval) == src->num)
1586       return;
1587 
1588    /* Try evicting stuff in our way if it isn't free. This won't move
1589     * anything unless it overlaps with our precolored physreg, so we don't
1590     * have to worry about evicting other precolored sources.
1591     */
1592    if (!get_reg_specified(file, src, physreg, true)) {
1593       unsigned eviction_count;
1594       if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true,
1595                           false)) {
1596          unreachable("failed to evict for precolored source!");
1597          return;
1598       }
1599    }
1600 
1601    ra_move_interval(ctx, file, interval, physreg);
1602 }
1603 
1604 static void
1605 handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr)
1606 {
1607    /* Note: we purposely don't mark sources as killed, so that we can reuse
1608     * some of the get_reg() machinery as if the source were a destination.
1609     * If they were marked as killed, e.g. get_reg_specified() wouldn't work
1610     * correctly.
1611     */
1612    ra_foreach_src (src, instr) {
1613       assert(src->num != INVALID_REG);
1614       handle_precolored_source(ctx, src);
1615    }
1616 
1617    ra_foreach_src (src, instr) {
1618       struct ra_file *file = ra_get_file(ctx, src);
1619       struct ra_interval *interval = &ctx->intervals[src->def->name];
1620       if (src->flags & IR3_REG_FIRST_KILL)
1621          ra_file_remove(file, interval);
1622    }
1623 
1624    insert_parallel_copy_instr(ctx, instr);
1625 }
1626 
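/* Where does "def" live at the end of "block"? Normally its assigned
 * register, unless a live-range split in that block recorded a rename.
 */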
1627 static physreg_t
1628 read_register(struct ra_ctx *ctx, struct ir3_block *block,
1629               struct ir3_register *def)
1630 {
1631    struct ra_block_state *state = &ctx->blocks[block->index];
1632    if (state->renames) {
1633       struct hash_entry *entry = _mesa_hash_table_search(state->renames, def);
1634       if (entry) {
1635          return (physreg_t)(uintptr_t)entry->data;
1636       }
1637    }
1638 
1639    return ra_reg_get_physreg(def);
1640 }
1641 
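/* Live-in values inherit whatever register they ended up with in the first
 * already-visited predecessor; insert_live_in_moves() later fixes up any
 * predecessors that disagree.
 */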
1642 static void
1643 handle_live_in(struct ra_ctx *ctx, struct ir3_register *def)
1644 {
1645    physreg_t physreg = ~0;
1646    for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
1647       struct ir3_block *pred = ctx->block->predecessors[i];
1648       struct ra_block_state *pred_state = &ctx->blocks[pred->index];
1649 
1650       if (!pred_state->visited)
1651          continue;
1652 
1653       physreg = read_register(ctx, pred, def);
1654       break;
1655    }
1656 
1657    assert(physreg != (physreg_t)~0);
1658 
1659    struct ra_interval *interval = &ctx->intervals[def->name];
1660    struct ra_file *file = ra_get_file(ctx, def);
1661    ra_interval_init(interval, def);
1662    interval->physreg_start = physreg;
1663    interval->physreg_end = physreg + reg_size(def);
1664    ra_file_insert(file, interval);
1665 }
1666 
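/* If a live-out value was moved during this block, record the rename so
 * that successors (via read_register()) see its final location.
 */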
1667 static void
1668 handle_live_out(struct ra_ctx *ctx, struct ir3_register *def)
1669 {
1670    /* Skip parallelcopy's which in the original program are only used as phi
1671     * arguments. Even though phi arguments are live out, they are only
1672     * assigned when the phi is.
1673     */
1674    if (def->instr->opc == OPC_META_PARALLEL_COPY)
1675       return;
1676 
1677    struct ra_block_state *state = &ctx->blocks[ctx->block->index];
1678    struct ra_interval *interval = &ctx->intervals[def->name];
1679    physreg_t physreg = ra_interval_get_physreg(interval);
1680    if (physreg != ra_reg_get_physreg(def)) {
1681       if (!state->renames)
1682          state->renames = _mesa_pointer_hash_table_create(ctx);
1683       _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg);
1684    }
1685 }
1686 
1687 static void
1688 handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
1689 {
1690    struct ra_file *file = ra_get_file(ctx, def);
1691    struct ra_interval *interval = &ctx->intervals[def->name];
1692 
1693    /* phis are always scalar, so they should already be the smallest possible
1694     * size. However they may be coalesced with other live-in values/phi
1695     * nodes, so check for that here.
1696     */
1697    struct ir3_reg_interval *parent_ir3 =
1698       ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start);
1699    physreg_t physreg;
1700    if (parent_ir3) {
1701       struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3);
1702       physreg = ra_interval_get_physreg(parent) +
1703                 (def->interval_start - parent_ir3->reg->interval_start);
1704    } else {
1705       physreg = get_reg(ctx, file, def, false);
1706    }
1707 
1708    allocate_dst_fixed(ctx, def, physreg);
1709 
1710    ra_file_insert(file, interval);
1711 }
1712 
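/* Write out the phi's register and give its parallelcopy sources (and
 * their defs) the same register; the parallel copies at the end of each
 * predecessor then move the actual values into place.
 */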
1713 static void
1714 assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
1715 {
1716    struct ra_file *file = ra_get_file(ctx, phi->dsts[0]);
1717    struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name];
1718    assert(!interval->interval.parent);
1719    unsigned num = ra_interval_get_num(interval);
1720    assign_reg(phi, phi->dsts[0], num);
1721 
1722    /* Assign the parallelcopy sources of this phi */
1723    for (unsigned i = 0; i < phi->srcs_count; i++) {
1724       if (phi->srcs[i]->def) {
1725          assign_reg(phi, phi->srcs[i], num);
1726          assign_reg(phi, phi->srcs[i]->def, num);
1727       }
1728    }
1729 
1730    if (phi->dsts[0]->flags & IR3_REG_UNUSED)
1731       ra_file_remove(file, interval);
1732 }
1733 
1734 /* When we split a live range, we sometimes need to emit fixup code at the end
1735  * of a block. For example, something like:
1736  *
1737  * a = ...
1738  * if (...) {
1739  *    ...
1740  *    a' = a
1741  *    b = ... // a evicted to make room for b
1742  *    ...
1743  * }
1744  * ... = a
1745  *
1746  * When we insert the copy to a' in insert_parallel_copy_instr(), this forces
1747  * us to insert another copy "a = a'" at the end of the if. Normally this would
1748  * also entail adding a phi node, but since we're about to go out of SSA
1749  * anyway we just insert an extra move. Note, however, that "b" might be used
1750  * in a phi node at the end of the if and share registers with "a", so we
1751  * have to be careful to extend any preexisting parallelcopy instruction
1752  * instead of creating our own in order to guarantee that they properly get
1753  * swapped.
1754  */
1755 
1756 static void
1757 insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
1758                     struct ir3_register *reg)
1759 {
1760    struct ir3_instruction *old_pcopy = NULL;
1761    if (!list_is_empty(&block->instr_list)) {
1762       struct ir3_instruction *last =
1763          LIST_ENTRY(struct ir3_instruction, block->instr_list.prev, node);
1764       if (last->opc == OPC_META_PARALLEL_COPY)
1765          old_pcopy = last;
1766    }
1767 
1768    unsigned old_pcopy_srcs = old_pcopy ? old_pcopy->srcs_count : 0;
1769    struct ir3_instruction *pcopy = ir3_instr_create(
1770       block, OPC_META_PARALLEL_COPY, old_pcopy_srcs + 1, old_pcopy_srcs + 1);
1771 
1772    for (unsigned i = 0; i < old_pcopy_srcs; i++) {
1773       old_pcopy->dsts[i]->instr = pcopy;
1774       pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i];
1775    }
1776 
1777    struct ir3_register *dst_reg =
1778       ir3_dst_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);
1779    dst_reg->wrmask = reg->wrmask;
1780    dst_reg->size = reg->size;
1781    assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags));
1782 
1783    for (unsigned i = 0; i < old_pcopy_srcs; i++) {
1784       pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i];
1785    }
1786 
1787    struct ir3_register *src_reg =
1788       ir3_src_create(pcopy, INVALID_REG, reg->flags & ~IR3_REG_SSA);
1789    src_reg->wrmask = reg->wrmask;
1790    src_reg->size = reg->size;
1791    assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags));
1792 
1793    if (old_pcopy)
1794       list_del(&old_pcopy->node);
1795 }
1796 
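/* Make a visited predecessor agree with the register we picked for a
 * live-in: if the value ended that predecessor somewhere else, append a
 * copy to the predecessor's parallelcopy and record the rename.
 */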
1797 static void
1798 insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
1799 {
1800    physreg_t physreg = ra_interval_get_physreg(interval);
1801 
1802    bool shared = interval->interval.reg->flags & IR3_REG_SHARED;
1803    struct ir3_block **predecessors =
1804       shared ? ctx->block->physical_predecessors : ctx->block->predecessors;
1805    unsigned predecessors_count = shared
1806                                     ? ctx->block->physical_predecessors_count
1807                                     : ctx->block->predecessors_count;
1808 
1809    for (unsigned i = 0; i < predecessors_count; i++) {
1810       struct ir3_block *pred = predecessors[i];
1811       struct ra_block_state *pred_state = &ctx->blocks[pred->index];
1812 
1813       if (!pred_state->visited)
1814          continue;
1815 
1816       physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);
1817       if (pred_reg != physreg) {
1818          insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);
1819 
1820          /* This is a bit tricky, but when visiting the destination of a
1821           * physical-only edge, we have two predecessors (the if and the
1822           * header block) and both have multiple successors. We pick the
1823           * register for all live-ins from the normal edge, which should
1824           * guarantee that there's no need for shuffling things around in
1825           * the normal predecessor as long as there are no phi nodes, but
1826           * we still may need to insert fixup code in the physical
1827           * predecessor (i.e. the last block of the if) and that has
1828           * another successor (the block after the if) so we need to update
1829           * the renames state for when we process the other successor. This
1830           * crucially depends on the other successor getting processed
1831           * after this.
1832           *
1833           * For normal (non-physical) edges we disallow critical edges so
1834           * that hacks like this aren't necessary.
1835           */
1836          if (!pred_state->renames)
1837             pred_state->renames = _mesa_pointer_hash_table_create(ctx);
1838          _mesa_hash_table_insert(pred_state->renames, interval->interval.reg,
1839                                  (void *)(uintptr_t)physreg);
1840       }
1841    }
1842 }
1843 
1844 static void
1845 insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file)
1846 {
1847    BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index];
1848    rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
1849                     physreg_node) {
1850       /* Skip phi nodes. This needs to happen after phi nodes are allocated,
1851        * because we may have to move live-ins around to make space for phi
1852        * nodes, but we shouldn't be handling phi nodes here.
1853        */
1854       if (BITSET_TEST(live_in, interval->interval.reg->name))
1855          insert_live_in_move(ctx, interval);
1856    }
1857 }
1858 
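/* Record where every value lives at the start of this block, so that
 * not-yet-visited predecessors (e.g. the bottom of a loop) can later insert
 * whatever copies are needed to match (see insert_live_out_moves()).
 */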
1859 static void
1860 insert_entry_regs(struct ra_block_state *state, struct ra_file *file)
1861 {
1862    rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
1863                     physreg_node) {
1864       _mesa_hash_table_insert(state->entry_regs, interval->interval.reg,
1865                               (void *)(uintptr_t)interval->physreg_start);
1866    }
1867 }
1868 
1869 static void
1870 insert_live_in_moves(struct ra_ctx *ctx)
1871 {
1872    insert_file_live_in_moves(ctx, &ctx->full);
1873    insert_file_live_in_moves(ctx, &ctx->half);
1874    insert_file_live_in_moves(ctx, &ctx->shared);
1875 
1876    /* If not all predecessors have been visited, record the registers live at
1877     * block entry so that insert_live_out_moves() in those predecessors works.
1878     */
1879    bool all_preds_visited = true;
1880    for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
1881       if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) {
1882          all_preds_visited = false;
1883          break;
1884       }
1885    }
1886 
1887    if (!all_preds_visited) {
1888       struct ra_block_state *state = &ctx->blocks[ctx->block->index];
1889       state->entry_regs = _mesa_pointer_hash_table_create(ctx);
1890 
1891       insert_entry_regs(state, &ctx->full);
1892       insert_entry_regs(state, &ctx->half);
1893       insert_entry_regs(state, &ctx->shared);
1894    }
1895 }
1896 
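/* If an already-visited successor (e.g. a loop header we branch back to)
 * expects this value in a different register than where it ended up here,
 * insert a copy at the end of the block.
 */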
1897 static void
1898 insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval)
1899 {
1900    for (unsigned i = 0; i < 2; i++) {
1901       if (!ctx->block->successors[i])
1902          continue;
1903 
1904       struct ir3_block *succ = ctx->block->successors[i];
1905       struct ra_block_state *succ_state = &ctx->blocks[succ->index];
1906 
1907       if (!succ_state->visited)
1908          continue;
1909 
1910       struct hash_entry *entry = _mesa_hash_table_search(
1911          succ_state->entry_regs, interval->interval.reg);
1912       if (!entry)
1913          continue;
1914 
1915       physreg_t new_reg = (physreg_t)(uintptr_t)entry->data;
1916       if (new_reg != interval->physreg_start) {
1917          insert_liveout_copy(ctx->block, new_reg, interval->physreg_start,
1918                              interval->interval.reg);
1919       }
1920    }
1921 }
1922 
1923 static void
1924 insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file)
1925 {
1926    rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
1927                     physreg_node) {
1928       insert_live_out_move(ctx, interval);
1929    }
1930 }
1931 
1932 static void
1933 insert_live_out_moves(struct ra_ctx *ctx)
1934 {
1935    insert_file_live_out_moves(ctx, &ctx->full);
1936    insert_file_live_out_moves(ctx, &ctx->half);
1937    insert_file_live_out_moves(ctx, &ctx->shared);
1938 }
1939 
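/* Allocate a single block: set up live-ins, phis, and inputs, walk the
 * instructions, and finally record where each live-out value ended up for
 * the benefit of the successors.
 */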
1940 static void
1941 handle_block(struct ra_ctx *ctx, struct ir3_block *block)
1942 {
1943    ctx->block = block;
1944 
1945    /* Reset the register files from the last block */
1946    ra_file_init(&ctx->full);
1947    ra_file_init(&ctx->half);
1948    ra_file_init(&ctx->shared);
1949 
1950    /* Handle live-ins, phis, and input meta-instructions. These all appear
1951     * live at the beginning of the block and interfere with each other, and
1952     * therefore need to be allocated "in parallel". This means that we
1953     * have to allocate all of them, inserting them into the file, and then
1954     * delay updating the IR until all of them are allocated.
1955     *
1956     * Handle precolored inputs first, because we need to make sure that other
1957     * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes
1958     * and inputs at the same time, because the first block doesn't have
1959     * predecessors. Therefore handle_live_in doesn't have to worry about
1960     * them.
1961     */
1962 
1963    foreach_instr (instr, &block->instr_list) {
1964       if (instr->opc == OPC_META_INPUT)
1965          handle_precolored_input(ctx, instr);
1966       else
1967          break;
1968    }
1969 
1970    unsigned name;
1971    BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
1972                        ctx->live->definitions_count) {
1973       struct ir3_register *reg = ctx->live->definitions[name];
1974       handle_live_in(ctx, reg);
1975    }
1976 
1977    foreach_instr (instr, &block->instr_list) {
1978       if (instr->opc == OPC_META_PHI)
1979          handle_phi(ctx, instr->dsts[0]);
1980       else if (instr->opc == OPC_META_INPUT ||
1981                instr->opc == OPC_META_TEX_PREFETCH)
1982          handle_input(ctx, instr);
1983       else
1984          break;
1985    }
1986 
1987    /* After this point, every live-in/phi/input has an interval assigned to
1988     * it. We delay actually assigning values until everything has been
1989     * allocated, so we can simply ignore any parallel copy entries created
1990     * when shuffling them around.
1991     */
1992    ctx->parallel_copies_count = 0;
1993 
1994    insert_live_in_moves(ctx);
1995 
1996    if (RA_DEBUG) {
1997       d("after live-in block %u:\n", block->index);
1998       ra_ctx_dump(ctx);
1999    }
2000 
2001    /* Now we're done with processing live-ins, and can handle the body of the
2002     * block.
2003     */
2004    foreach_instr (instr, &block->instr_list) {
2005       di(instr, "processing");
2006 
2007       if (instr->opc == OPC_META_PHI)
2008          assign_phi(ctx, instr);
2009       else if (instr->opc == OPC_META_INPUT ||
2010                instr->opc == OPC_META_TEX_PREFETCH)
2011          assign_input(ctx, instr);
2012       else if (instr->opc == OPC_META_SPLIT)
2013          handle_split(ctx, instr);
2014       else if (instr->opc == OPC_META_COLLECT)
2015          handle_collect(ctx, instr);
2016       else if (instr->opc == OPC_META_PARALLEL_COPY)
2017          handle_pcopy(ctx, instr);
2018       else if (instr->opc == OPC_CHMASK)
2019          handle_chmask(ctx, instr);
2020       else
2021          handle_normal_instr(ctx, instr);
2022 
2023       if (RA_DEBUG)
2024          ra_ctx_dump(ctx);
2025    }
2026 
2027    insert_live_out_moves(ctx);
2028 
2029    BITSET_FOREACH_SET (name, ctx->live->live_out[block->index],
2030                        ctx->live->definitions_count) {
2031       struct ir3_register *reg = ctx->live->definitions[name];
2032       handle_live_out(ctx, reg);
2033    }
2034 
2035    ctx->blocks[block->index].visited = true;
2036 }
2037 
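/* Pick the size of the full register file: start at the measured pressure
 * and keep growing as long as doing so doesn't change the threadsize
 * decision or cost any waves, so that RA gets more room for free. For
 * example, a pressure of 60 half-regs starts at DIV_ROUND_UP(60, 8) = 8
 * vec4s; the result is converted back to half-reg units at the end.
 */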
2038 static unsigned
2039 calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure)
2040 {
2041    /* Registers are allocated in units of vec4, so switch from units of
2042     * half-regs to vec4.
2043     */
2044    unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4);
2045 
2046    bool double_threadsize = ir3_should_double_threadsize(v, reg_count);
2047 
2048    unsigned target = reg_count;
2049    unsigned reg_independent_max_waves =
2050       ir3_get_reg_independent_max_waves(v, double_threadsize);
2051    unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
2052       v->shader->compiler, reg_count, double_threadsize);
2053    unsigned target_waves =
2054       MIN2(reg_independent_max_waves, reg_dependent_max_waves);
2055 
2056    while (target <= RA_FULL_SIZE / (2 * 4) &&
2057           ir3_should_double_threadsize(v, target) == double_threadsize &&
2058           ir3_get_reg_dependent_max_waves(v->shader->compiler, target,
2059                                           double_threadsize) >= target_waves)
2060       target++;
2061 
2062    return (target - 1) * 2 * 4;
2063 }
2064 
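/* Pressure is tracked in units of half-registers. Half registers always
 * count against the half-reg pressure, and with merged register files they
 * occupy space in the full file as well, so count them there too.
 */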
2065 static void
2066 add_pressure(struct ir3_pressure *pressure, struct ir3_register *reg,
2067              bool merged_regs)
2068 {
2069    unsigned size = reg_size(reg);
2070    if (reg->flags & IR3_REG_HALF)
2071       pressure->half += size;
2072    if (!(reg->flags & IR3_REG_HALF) || merged_regs)
2073       pressure->full += size;
2074 }
2075 
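/* calc_min_limit_pressure() below only uses the interval tree to find each
 * source's root interval, so the add/delete/readd callbacks can be no-ops.
 */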
2076 static void
2077 dummy_interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
2078 {
2079 }
2080 
2081 static void
2082 dummy_interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
2083 {
2084 }
2085 
2086 static void
2087 dummy_interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *parent,
2088                      struct ir3_reg_interval *child)
2089 {
2090 }
2091 
2092 /* Calculate the minimum possible limit on register pressure so that spilling
2093  * still succeeds. Used to implement IR3_SHADER_DEBUG=spillall.
2094  */
2095 
2096 static void
2097 calc_min_limit_pressure(struct ir3_shader_variant *v,
2098                         struct ir3_liveness *live,
2099                         struct ir3_pressure *limit)
2100 {
2101    struct ir3_block *start = ir3_start_block(v->ir);
2102    struct ir3_reg_ctx *ctx = ralloc(NULL, struct ir3_reg_ctx);
2103    struct ir3_reg_interval *intervals =
2104       rzalloc_array(ctx, struct ir3_reg_interval, live->definitions_count);
2105 
2106    ctx->interval_add = dummy_interval_add;
2107    ctx->interval_delete = dummy_interval_delete;
2108    ctx->interval_readd = dummy_interval_readd;
2109 
2110    limit->full = limit->half = 0;
2111 
2112    struct ir3_pressure cur_pressure = {0};
2113    foreach_instr (input, &start->instr_list) {
2114       if (input->opc != OPC_META_INPUT &&
2115           input->opc != OPC_META_TEX_PREFETCH)
2116          break;
2117 
2118       add_pressure(&cur_pressure, input->dsts[0], v->mergedregs);
2119    }
2120 
2121    limit->full = MAX2(limit->full, cur_pressure.full);
2122    limit->half = MAX2(limit->half, cur_pressure.half);
2123 
2124    foreach_instr (input, &start->instr_list) {
2125       if (input->opc != OPC_META_INPUT &&
2126           input->opc != OPC_META_TEX_PREFETCH)
2127          break;
2128 
2129       /* pre-colored inputs may have holes, which increases the pressure. */
2130       struct ir3_register *dst = input->dsts[0];
2131       if (dst->num != INVALID_REG) {
2132          unsigned physreg = ra_reg_get_physreg(dst) + reg_size(dst);
2133          if (dst->flags & IR3_REG_HALF)
2134             limit->half = MAX2(limit->half, physreg);
2135          if (!(dst->flags & IR3_REG_HALF) || v->mergedregs)
2136             limit->full = MAX2(limit->full, physreg);
2137       }
2138    }
2139 
2140    foreach_block (block, &v->ir->block_list) {
2141       rb_tree_init(&ctx->intervals);
2142 
2143       unsigned name;
2144       BITSET_FOREACH_SET (name, live->live_in[block->index],
2145                           live->definitions_count) {
2146          struct ir3_register *reg = live->definitions[name];
2147          ir3_reg_interval_init(&intervals[reg->name], reg);
2148          ir3_reg_interval_insert(ctx, &intervals[reg->name]);
2149       }
2150 
2151       foreach_instr (instr, &block->instr_list) {
2152          ra_foreach_dst (dst, instr) {
2153             ir3_reg_interval_init(&intervals[dst->name], dst);
2154          }
2155          /* phis and parallel copies can be deleted via spilling */
2156 
2157          if (instr->opc == OPC_META_PHI) {
2158             ir3_reg_interval_insert(ctx, &intervals[instr->dsts[0]->name]);
2159             continue;
2160          }
2161 
2162          if (instr->opc == OPC_META_PARALLEL_COPY)
2163             continue;
2164 
2165          cur_pressure = (struct ir3_pressure) {0};
2166 
2167          ra_foreach_dst (dst, instr) {
2168             if (dst->tied && !(dst->tied->flags & IR3_REG_KILL))
2169                add_pressure(&cur_pressure, dst, v->mergedregs);
2170          }
2171 
2172          ra_foreach_src_rev (src, instr) {
2173             /* We currently don't support spilling the parent of a source when
2174              * making space for sources, so we have to keep track of the
2175              * intervals and walk up to the root of the tree to figure out how
2176              * much space we need.
2177              *
2178              * TODO: We should probably support this in the spiller.
2179              */
2180             struct ir3_reg_interval *interval = &intervals[src->def->name];
2181             while (interval->parent)
2182                interval = interval->parent;
2183             add_pressure(&cur_pressure, interval->reg, v->mergedregs);
2184 
2185             if (src->flags & IR3_REG_FIRST_KILL)
2186                ir3_reg_interval_remove(ctx, &intervals[src->def->name]);
2187          }
2188 
2189          limit->full = MAX2(limit->full, cur_pressure.full);
2190          limit->half = MAX2(limit->half, cur_pressure.half);
2191 
2192          cur_pressure = (struct ir3_pressure) {0};
2193 
2194          ra_foreach_dst (dst, instr) {
2195             ir3_reg_interval_init(&intervals[dst->name], dst);
2196             ir3_reg_interval_insert(ctx, &intervals[dst->name]);
2197             add_pressure(&cur_pressure, dst, v->mergedregs);
2198          }
2199 
2200          limit->full = MAX2(limit->full, cur_pressure.full);
2201          limit->half = MAX2(limit->half, cur_pressure.half);
2202       }
2203    }
2204 
2205    /* Account for the base register, which needs to be available everywhere. */
2206    limit->full += 2;
2207 
2208    ralloc_free(ctx);
2209 }
2210 
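/* Top-level entry point: build parallel copies and merge sets, spill if the
 * measured pressure can't fit in the register files, allocate each block,
 * validate the result, then strip SSA-ness and lower the remaining copies.
 */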
2211 int
2212 ir3_ra(struct ir3_shader_variant *v)
2213 {
2214    ir3_calc_dominance(v->ir);
2215 
2216    ir3_create_parallel_copies(v->ir);
2217 
2218    struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);
2219 
2220    ctx->merged_regs = v->mergedregs;
2221    ctx->compiler = v->shader->compiler;
2222    ctx->stage = v->type;
2223 
2224    struct ir3_liveness *live = ir3_calc_liveness(ctx, v->ir);
2225 
2226    ir3_debug_print(v->ir, "AFTER: create_parallel_copies");
2227 
2228    ir3_merge_regs(live, v->ir);
2229 
2230    struct ir3_pressure max_pressure;
2231    ir3_calc_pressure(v, live, &max_pressure);
2232    d("max pressure:");
2233    d("\tfull: %u", max_pressure.full);
2234    d("\thalf: %u", max_pressure.half);
2235    d("\tshared: %u", max_pressure.shared);
2236 
2237    /* TODO: calculate half/full limit correctly for CS with barrier */
2238    struct ir3_pressure limit_pressure;
2239    limit_pressure.full = RA_FULL_SIZE;
2240    limit_pressure.half = RA_HALF_SIZE;
2241    limit_pressure.shared = RA_SHARED_SIZE;
2242 
2243    /* If requested, lower the limit so that spilling happens more often. */
2244    if (ir3_shader_debug & IR3_DBG_SPILLALL)
2245       calc_min_limit_pressure(v, live, &limit_pressure);
2246 
2247    if (max_pressure.shared > limit_pressure.shared) {
2248       /* TODO shared reg -> normal reg spilling */
2249       d("shared max pressure exceeded!");
2250       goto fail;
2251    }
2252 
2253    bool spilled = false;
2254    if (max_pressure.full > limit_pressure.full ||
2255        max_pressure.half > limit_pressure.half) {
2256       if (!v->shader->compiler->has_pvtmem) {
2257          d("max pressure exceeded!");
2258          goto fail;
2259       }
2260       d("max pressure exceeded, spilling!");
2261       IR3_PASS(v->ir, ir3_spill, v, &live, &limit_pressure);
2262       ir3_calc_pressure(v, live, &max_pressure);
2263       assert(max_pressure.full <= limit_pressure.full &&
2264              max_pressure.half <= limit_pressure.half);
2265       spilled = true;
2266    }
2267 
2268    ctx->live = live;
2269    ctx->intervals =
2270       rzalloc_array(ctx, struct ra_interval, live->definitions_count);
2271    ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count);
2272 
2273    ctx->full.size = calc_target_full_pressure(v, max_pressure.full);
2274    d("full size: %u", ctx->full.size);
2275 
2276    if (!v->mergedregs)
2277       ctx->half.size = RA_HALF_SIZE;
2278 
2279    ctx->shared.size = RA_SHARED_SIZE;
2280 
2281    ctx->full.start = ctx->half.start = ctx->shared.start = 0;
2282 
2283    foreach_block (block, &v->ir->block_list)
2284       handle_block(ctx, block);
2285 
2286    ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count);
2287 
2288    /* Strip array-ness and SSA-ness at the end, because various helpers still
2289     * need to work even on definitions that have already been assigned. For
2290     * example, we need to preserve array-ness so that array live-ins have the
2291     * right size.
2292     */
2293    foreach_block (block, &v->ir->block_list) {
2294       foreach_instr (instr, &block->instr_list) {
2295          for (unsigned i = 0; i < instr->dsts_count; i++) {
2296             instr->dsts[i]->flags &= ~IR3_REG_SSA;
2297 
2298             /* Parallel copies of array registers copy the whole register, and
2299              * we need some way to let the parallel copy code know that this was
2300              * an array whose size is determined by reg->size. So keep the array
2301              * flag on those. Spill/reload also need to work on the entire
2302              * array.
2303              */
2304             if (!is_meta(instr) && instr->opc != OPC_RELOAD_MACRO)
2305                instr->dsts[i]->flags &= ~IR3_REG_ARRAY;
2306          }
2307 
2308          for (unsigned i = 0; i < instr->srcs_count; i++) {
2309             instr->srcs[i]->flags &= ~IR3_REG_SSA;
2310 
2311             if (!is_meta(instr) && instr->opc != OPC_SPILL_MACRO)
2312                instr->srcs[i]->flags &= ~IR3_REG_ARRAY;
2313          }
2314       }
2315    }
2316 
2317    ir3_debug_print(v->ir, "AFTER: register allocation");
2318 
2319    if (spilled) {
2320       IR3_PASS(v->ir, ir3_lower_spill);
2321    }
2322 
2323    ir3_lower_copies(v);
2324 
2325    ir3_debug_print(v->ir, "AFTER: ir3_lower_copies");
2326 
2327    ralloc_free(ctx);
2328 
2329    return 0;
2330 fail:
2331    ralloc_free(ctx);
2332    return -1;
2333 }
2334