1 /*
2  * Copyright (C) 2021 Valve Corporation
3  * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "ir3_ra.h"
26 #include "util/rb_tree.h"
27 #include "util/u_math.h"
28 #include "ir3_shader.h"
29 
30 /* This file implements an SSA-based register allocator. Unlike other
31  * SSA-based allocators, it handles vector split/collect "smartly," meaning
32  * that multiple values may share the same register interval. From the
33  * perspective of the allocator itself, only the top-level intervals matter,
34  * and the allocator is only concerned with allocating top-level intervals,
35  * which may mean moving other top-level intervals around. Other intervals,
36  * like the destination of a split instruction or the source of a collect
37  * instruction, are "locked" to their parent interval. The details of this are
38  * mostly handled by ir3_merge_regs and ir3_reg_ctx.
39  *
40  * We currently don't do any backtracking, but we do use the merge sets as a
41  * form of affinity to try to avoid moves from phis/splits/collects. Each
42  * merge set is what a more "classic" graph-coloring or live-range based
43  * allocator would consider a single register, but here we use it as merely a
44  * hint, except when multiple overlapping values are live at the same time.
45  * Each merge set has a "preferred" register, and we try to honor that when
46  * allocating values in the merge set.
47  */
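/* As a hedged illustration (the IR below is invented for this comment rather
 * than taken from a real shader), consider a collect:
 *
 *    ssa_3 = collect ssa_1, ssa_2
 *
 * Here ssa_1 and ssa_2 become child intervals locked inside ssa_3's
 * top-level interval, so the allocator only ever places ssa_3 and the
 * sources inherit fixed offsets within it. The merge set covering all three
 * values provides the affinity hint described above.
 */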
48 
49 /* ir3_reg_ctx implementation. */
50 
51 static int
52 ir3_reg_interval_cmp(const struct rb_node *node, const void *data)
53 {
54    unsigned reg = *(const unsigned *)data;
55    const struct ir3_reg_interval *interval =
56       ir3_rb_node_to_interval_const(node);
57    if (interval->reg->interval_start > reg)
58       return -1;
59    else if (interval->reg->interval_end <= reg)
60       return 1;
61    else
62       return 0;
63 }
64 
65 static struct ir3_reg_interval *
66 ir3_reg_interval_search(struct rb_tree *tree, unsigned offset)
67 {
68    struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp);
69    return node ? ir3_rb_node_to_interval(node) : NULL;
70 }
71 
72 static struct ir3_reg_interval *
73 ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset)
74 {
75    struct rb_node *node =
76       rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp);
77    return node ? ir3_rb_node_to_interval(node) : NULL;
78 }
79 
80 /* Get the interval covering the reg, or the closest to the right if it
81  * doesn't exist.
82  */
83 static struct ir3_reg_interval *
84 ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset)
85 {
86    struct ir3_reg_interval *interval =
87       ir3_reg_interval_search_sloppy(tree, offset);
88    if (!interval) {
89       return NULL;
90    } else if (interval->reg->interval_end > offset) {
91       return interval;
92    } else {
93       /* There is no interval covering reg, and ra_file_search_sloppy()
94        * returned the closest range to the left, so the next interval to the
95        * right should be the closest to the right.
96        */
97       return ir3_reg_interval_next_or_null(interval);
98    }
99 }
100 
101 static int
102 ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
103 {
104    const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a);
105    const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b);
106    return b->reg->interval_start - a->reg->interval_start;
107 }
108 
109 static void
110 interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree,
111                 struct ir3_reg_interval *interval)
112 {
113    struct ir3_reg_interval *right =
114       ir3_reg_interval_search_right(tree, interval->reg->interval_start);
115    if (right && right->reg->interval_start < interval->reg->interval_end) {
116       /* We disallow trees where different members have different half-ness.
117        * This means that we can't treat bitcasts as copies like normal
118        * split/collect, so something like this would require an extra copy
119        * in mergedregs mode, and count as 4 half-units of register pressure
120        * instead of 2:
121        *
122        * f16vec2 foo = unpackFloat2x16(bar)
123        * ... = foo.x
124        * ... = bar
125        *
126        * However, relaxing this rule would open a huge can of worms. What
127        * happens when there's a vector of 16 things, and the fifth element
128        * has been bitcasted as a half-reg? Would that element alone have to
129        * be small enough to be used as a half-reg source? Let's keep that
130        * can of worms firmly shut for now.
131        */
132       assert((interval->reg->flags & IR3_REG_HALF) ==
133              (right->reg->flags & IR3_REG_HALF));
134 
135       if (right->reg->interval_end <= interval->reg->interval_end &&
136           right->reg->interval_start >= interval->reg->interval_start) {
137          /* Check if we're inserting something that's already inserted */
138          assert(interval != right);
139 
140          /* "right" is contained in "interval" and must become a child of
141           * it. There may be further children too.
142           */
143          for (struct ir3_reg_interval *next = ir3_reg_interval_next(right);
144               right && right->reg->interval_start < interval->reg->interval_end;
145               right = next, next = ir3_reg_interval_next_or_null(next)) {
146             /* "right" must be contained in "interval." */
147             assert(right->reg->interval_end <= interval->reg->interval_end);
148             assert((interval->reg->flags & IR3_REG_HALF) ==
149                    (right->reg->flags & IR3_REG_HALF));
150             if (!right->parent)
151                ctx->interval_delete(ctx, right);
152             right->parent = interval;
153             rb_tree_remove(tree, &right->node);
154             rb_tree_insert(&interval->children, &right->node,
155                            ir3_reg_interval_insert_cmp);
156          }
157       } else {
158          /* "right" must contain "interval," since intervals must form a
159           * tree.
160           */
161          assert(right->reg->interval_start <= interval->reg->interval_start);
162          interval->parent = right;
163          interval_insert(ctx, &right->children, interval);
164          return;
165       }
166    }
167 
168    if (!interval->parent)
169       ctx->interval_add(ctx, interval);
170    rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp);
171    interval->inserted = true;
172 }
173 
174 void
175 ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,
176                         struct ir3_reg_interval *interval)
177 {
178    rb_tree_init(&interval->children);
179    interval->parent = NULL;
180    interval_insert(ctx, &ctx->intervals, interval);
181 }
182 
183 /* Call after ir3_reg_interval_remove_temp() to reinsert the interval */
184 static void
185 ir3_reg_interval_reinsert(struct ir3_reg_ctx *ctx,
186                           struct ir3_reg_interval *interval)
187 {
188    interval->parent = NULL;
189    interval_insert(ctx, &ctx->intervals, interval);
190 }
191 
192 void
193 ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
194                         struct ir3_reg_interval *interval)
195 {
196    if (interval->parent) {
197       rb_tree_remove(&interval->parent->children, &interval->node);
198    } else {
199       ctx->interval_delete(ctx, interval);
200       rb_tree_remove(&ctx->intervals, &interval->node);
201    }
202 
203    rb_tree_foreach_safe (struct ir3_reg_interval, child, &interval->children,
204                          node) {
205       rb_tree_remove(&interval->children, &child->node);
206       child->parent = interval->parent;
207 
208       if (interval->parent) {
209          rb_tree_insert(&child->parent->children, &child->node,
210                         ir3_reg_interval_insert_cmp);
211       } else {
212          ctx->interval_readd(ctx, interval, child);
213          rb_tree_insert(&ctx->intervals, &child->node,
214                         ir3_reg_interval_insert_cmp);
215       }
216    }
217 
218    interval->inserted = false;
219 }
220 
221 static void
222 _mark_free(struct ir3_reg_interval *interval)
223 {
224    interval->inserted = false;
225    rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
226       _mark_free(child);
227    }
228 }
229 
230 /* Remove an interval and all its children from the tree. */
231 void
232 ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,
233                             struct ir3_reg_interval *interval)
234 {
235    assert(!interval->parent);
236 
237    ctx->interval_delete(ctx, interval);
238    rb_tree_remove(&ctx->intervals, &interval->node);
239    _mark_free(interval);
240 }
241 
242 /* Used when popping an interval to be shuffled around. Don't disturb children
243  * so that it can be later reinserted.
244  */
245 static void
246 ir3_reg_interval_remove_temp(struct ir3_reg_ctx *ctx,
247                              struct ir3_reg_interval *interval)
248 {
249    assert(!interval->parent);
250 
251    ctx->interval_delete(ctx, interval);
252    rb_tree_remove(&ctx->intervals, &interval->node);
253 }
254 
255 static void
256 interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval,
257               unsigned indent)
258 {
259    for (unsigned i = 0; i < indent; i++)
260       mesa_log_stream_printf(stream, "\t");
261    mesa_log_stream_printf(stream, "reg %u start %u\n", interval->reg->name,
262                           interval->reg->interval_start);
263 
264    rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
265       interval_dump(stream, child, indent + 1);
266    }
267 
268    for (unsigned i = 0; i < indent; i++)
269       mesa_log_stream_printf(stream, "\t");
270    mesa_log_stream_printf(stream, "reg %u end %u\n", interval->reg->name,
271                           interval->reg->interval_end);
272 }
273 
274 void
275 ir3_reg_interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval)
276 {
277    interval_dump(stream, interval, 0);
278 }
279 
280 /* These are the core data structures used by the register allocator. First
281  * ra_interval and ra_file, which are used for intra-block tracking and use
282  * the ir3_reg_ctx infrastructure:
283  */
284 
285 struct ra_interval {
286    struct ir3_reg_interval interval;
287 
288    struct rb_node physreg_node;
289    physreg_t physreg_start, physreg_end;
290 
291    /* True if this is a source of the current instruction which is entirely
292     * killed. This means we can allocate the dest over it, but we can't break
293     * it up.
294     */
295    bool is_killed;
296 
297    /* True if this interval cannot be moved from its position. This is only
298     * used for precolored inputs to ensure that other inputs don't get
299     * allocated on top of them.
300     */
301    bool frozen;
302 };
303 
304 struct ra_file {
305    struct ir3_reg_ctx reg_ctx;
306 
307    BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
308    BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
309 
310    struct rb_tree physreg_intervals;
311 
312    unsigned size;
313    unsigned start;
314 };
315 
316 /* State for inter-block tracking. When we split a live range to make space
317  * for a vector, we may need to insert fixup code when a block has multiple
318  * predecessors that have moved the same live value to different registers.
319  * This keeps track of state required to do that.
320  */
321 
322 struct ra_block_state {
323    /* Map of defining ir3_register -> physreg it was allocated to at the end
324     * of the block.
325     */
326    struct hash_table *renames;
327 
328    /* For loops, we need to process a block before all its predecessors have
329     * been processed. In particular, we need to pick registers for values
330     * without knowing if all the predecessors have been renamed. This keeps
331     * track of the registers we chose so that when we visit the back-edge we
332     * can move them appropriately. If all predecessors have been visited
333     * before this block is visited then we don't need to fill this out. This
334     * is a map from ir3_register -> physreg.
335     */
336    struct hash_table *entry_regs;
337 
338    /* True if the block has been visited and "renames" is complete.
339     */
340    bool visited;
341 };
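/* Hedged example of the fixup described above (block and register names are
 * invented): if predecessor A ends with a live value in r0.x while
 * predecessor B, after a live-range split, ends with the same value in r2.y,
 * the predecessors' "renames" maps record the two locations and fixup code
 * must be emitted so the value sits in a single agreed-upon register when
 * the successor begins. For a loop back-edge, "entry_regs" remembers the
 * register picked at the header before the back-edge predecessor is visited.
 */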
342 
343 struct ra_parallel_copy {
344    struct ra_interval *interval;
345    physreg_t src;
346 };
347 
348 /* The main context: */
349 
350 struct ra_ctx {
351    /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom
352     * half of this file too.
353     */
354    struct ra_file full;
355 
356    /* hr0.x - hr63.w, only used without merged-regs. */
357    struct ra_file half;
358 
359    /* Shared regs. */
360    struct ra_file shared;
361 
362    struct ir3_liveness *live;
363 
364    struct ir3_block *block;
365 
366    const struct ir3_compiler *compiler;
367    gl_shader_stage stage;
368 
369    /* Pending moves of top-level intervals that will be emitted once we're
370     * finished:
371     */
372    DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies);
373 
374    struct ra_interval *intervals;
375    struct ra_block_state *blocks;
376 
377    bool merged_regs;
378 };
379 
380 #define foreach_interval(interval, file)                                       \
381    rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals,  \
382                     physreg_node)
383 #define foreach_interval_rev(interval, file)                                   \
384    rb_tree_foreach_rev (struct ra_interval, interval,                       \
385                         &(file)->physreg_intervals, physreg_node)
386 #define foreach_interval_safe(interval, file)                                  \
387    rb_tree_foreach_safe (struct ra_interval, interval,                         \
388                          &(file)->physreg_intervals, physreg_node)
389 #define foreach_interval_rev_safe(interval, file)                              \
390    rb_tree_foreach_rev_safe(struct ra_interval, interval,                      \
391                             &(file)->physreg_intervals, physreg_node)
392 
393 static struct ra_interval *
394 rb_node_to_interval(struct rb_node *node)
395 {
396    return rb_node_data(struct ra_interval, node, physreg_node);
397 }
398 
399 static const struct ra_interval *
400 rb_node_to_interval_const(const struct rb_node *node)
401 {
402    return rb_node_data(struct ra_interval, node, physreg_node);
403 }
404 
405 static struct ra_interval *
406 ra_interval_next(struct ra_interval *interval)
407 {
408    struct rb_node *next = rb_node_next(&interval->physreg_node);
409    return next ? rb_node_to_interval(next) : NULL;
410 }
411 
412 static struct ra_interval *
413 ra_interval_next_or_null(struct ra_interval *interval)
414 {
415    return interval ? ra_interval_next(interval) : NULL;
416 }
417 
418 static int
419 ra_interval_cmp(const struct rb_node *node, const void *data)
420 {
421    physreg_t reg = *(const physreg_t *)data;
422    const struct ra_interval *interval = rb_node_to_interval_const(node);
423    if (interval->physreg_start > reg)
424       return -1;
425    else if (interval->physreg_end <= reg)
426       return 1;
427    else
428       return 0;
429 }
430 
431 static struct ra_interval *
432 ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
433 {
434    struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
435    return node ? rb_node_to_interval(node) : NULL;
436 }
437 
438 /* Get the interval covering the reg, or the closest to the right if it
439  * doesn't exist.
440  */
441 static struct ra_interval *
442 ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
443 {
444    struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
445    if (!interval) {
446       return NULL;
447    } else if (interval->physreg_end > reg) {
448       return interval;
449    } else {
450       /* There is no interval covering reg, and ra_file_search_sloppy()
451        * returned the closest range to the left, so the next interval to the
452        * right should be the closest to the right.
453        */
454       return ra_interval_next_or_null(interval);
455    }
456 }
457 
458 static struct ra_interval *
459 ra_file_search_right(struct ra_file *file, physreg_t reg)
460 {
461    return ra_interval_search_right(&file->physreg_intervals, reg);
462 }
463 
464 static int
465 ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
466 {
467    const struct ra_interval *a = rb_node_to_interval_const(_a);
468    const struct ra_interval *b = rb_node_to_interval_const(_b);
469    return b->physreg_start - a->physreg_start;
470 }
471 
472 static struct ra_interval *
473 ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
474 {
475    return rb_node_data(struct ra_interval, interval, interval);
476 }
477 
478 static struct ra_file *
479 ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx)
480 {
481    return rb_node_data(struct ra_file, ctx, reg_ctx);
482 }
483 
484 static void
485 interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
486 {
487    struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
488    struct ra_file *file = ir3_reg_ctx_to_file(ctx);
489 
490    /* We can assume in this case that physreg_start/physreg_end is already
491     * initialized.
492     */
493    for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
494       BITSET_CLEAR(file->available, i);
495       BITSET_CLEAR(file->available_to_evict, i);
496    }
497 
498    rb_tree_insert(&file->physreg_intervals, &interval->physreg_node,
499                   ra_interval_insert_cmp);
500 }
501 
502 static void
503 interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
504 {
505    struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
506    struct ra_file *file = ir3_reg_ctx_to_file(ctx);
507 
508    for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
509       BITSET_SET(file->available, i);
510       BITSET_SET(file->available_to_evict, i);
511    }
512 
513    rb_tree_remove(&file->physreg_intervals, &interval->physreg_node);
514 }
515 
516 static void
517 interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
518                struct ir3_reg_interval *_child)
519 {
520    struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
521    struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);
522 
523    child->physreg_start =
524       parent->physreg_start + (child->interval.reg->interval_start -
525                                parent->interval.reg->interval_start);
526    child->physreg_end =
527       child->physreg_start +
528       (child->interval.reg->interval_end - child->interval.reg->interval_start);
529 
530    interval_add(ctx, _child);
531 }
532 
533 static void
534 ra_file_init(struct ra_file *file)
535 {
536    for (unsigned i = 0; i < file->size; i++) {
537       BITSET_SET(file->available, i);
538       BITSET_SET(file->available_to_evict, i);
539    }
540 
541    rb_tree_init(&file->reg_ctx.intervals);
542    rb_tree_init(&file->physreg_intervals);
543 
544    file->reg_ctx.interval_add = interval_add;
545    file->reg_ctx.interval_delete = interval_delete;
546    file->reg_ctx.interval_readd = interval_readd;
547 }
548 
549 static void
550 ra_file_insert(struct ra_file *file, struct ra_interval *interval)
551 {
552    assert(interval->physreg_start < interval->physreg_end);
553    assert(interval->physreg_end <= file->size);
554    if (interval->interval.reg->flags & IR3_REG_HALF)
555       assert(interval->physreg_end <= RA_HALF_SIZE);
556 
557    ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
558 }
559 
560 static void
561 ra_file_remove(struct ra_file *file, struct ra_interval *interval)
562 {
563    ir3_reg_interval_remove(&file->reg_ctx, &interval->interval);
564 }
565 
566 static void
567 ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval)
568 {
569    assert(!interval->interval.parent);
570 
571    for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
572       BITSET_SET(file->available, i);
573    }
574 
575    interval->is_killed = true;
576 }
577 
578 static void
579 ra_file_unmark_killed(struct ra_file *file, struct ra_interval *interval)
580 {
581    assert(!interval->interval.parent);
582 
583    for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
584       BITSET_CLEAR(file->available, i);
585    }
586 
587    interval->is_killed = false;
588 }
589 
590 static physreg_t
591 ra_interval_get_physreg(const struct ra_interval *interval)
592 {
593    unsigned child_start = interval->interval.reg->interval_start;
594 
595    while (interval->interval.parent) {
596       interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
597    }
598 
599    return interval->physreg_start +
600           (child_start - interval->interval.reg->interval_start);
601 }
602 
603 static unsigned
604 ra_interval_get_num(const struct ra_interval *interval)
605 {
606    return ra_physreg_to_num(ra_interval_get_physreg(interval),
607                             interval->interval.reg->flags);
608 }
609 
610 static void
611 ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
612 {
613    ir3_reg_interval_init(&interval->interval, reg);
614    interval->is_killed = false;
615    interval->frozen = false;
616 }
617 
618 static void
619 ra_interval_dump(struct log_stream *stream, struct ra_interval *interval)
620 {
621    mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start);
622 
623    ir3_reg_interval_dump(stream, &interval->interval);
624 }
625 
626 static void
627 ra_file_dump(struct log_stream *stream, struct ra_file *file)
628 {
629    rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
630                     physreg_node) {
631       ra_interval_dump(stream, interval);
632    }
633 
634    unsigned start, end;
635    mesa_log_stream_printf(stream, "available:\n");
636    BITSET_FOREACH_RANGE (start, end, file->available, file->size) {
637       mesa_log_stream_printf(stream, "%u-%u ", start, end);
638    }
639    mesa_log_stream_printf(stream, "\n");
640 
641    mesa_log_stream_printf(stream, "available to evict:\n");
642    BITSET_FOREACH_RANGE (start, end, file->available_to_evict, file->size) {
643       mesa_log_stream_printf(stream, "%u-%u ", start, end);
644    }
645    mesa_log_stream_printf(stream, "\n");
646    mesa_log_stream_printf(stream, "start: %u\n", file->start);
647 }
648 
649 static void
650 ra_ctx_dump(struct ra_ctx *ctx)
651 {
652    struct log_stream *stream = mesa_log_streami();
653    mesa_log_stream_printf(stream, "full:\n");
654    ra_file_dump(stream, &ctx->full);
655    mesa_log_stream_printf(stream, "half:\n");
656    ra_file_dump(stream, &ctx->half);
657    mesa_log_stream_printf(stream, "shared:");
658    ra_file_dump(stream, &ctx->shared);
659    mesa_log_stream_destroy(stream);
660 }
661 
662 static unsigned
663 reg_file_size(struct ra_file *file, struct ir3_register *reg)
664 {
665    /* Half-regs can only take up the first half of the combined regfile */
666    if (reg->flags & IR3_REG_HALF)
667       return MIN2(file->size, RA_HALF_SIZE);
668    else
669       return file->size;
670 }
671 
672 /* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple
673  * top-level intervals at once. Pop multiple intervals, then push them back in
674  * any order.
675  */
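/* For instance, two top-level intervals can be swapped with this API, which
 * is a sketch of what try_evict_regs() does further down (variable names
 * here are illustrative only):
 *
 *    physreg_t a_start = ia->physreg_start, b_start = ib->physreg_start;
 *    struct ra_removed_interval a = ra_pop_interval(ctx, file, ia);
 *    struct ra_removed_interval b = ra_pop_interval(ctx, file, ib);
 *    ra_push_interval(ctx, file, &a, b_start);
 *    ra_push_interval(ctx, file, &b, a_start);
 */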
676 
677 struct ra_removed_interval {
678    struct ra_interval *interval;
679    unsigned size;
680 };
681 
682 static struct ra_removed_interval
683 ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,
684                 struct ra_interval *interval)
685 {
686    assert(!interval->interval.parent);
687 
688    /* Check if we've already moved this reg before */
689    unsigned pcopy_index;
690    for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count;
691         pcopy_index++) {
692       if (ctx->parallel_copies[pcopy_index].interval == interval)
693          break;
694    }
695 
696    if (pcopy_index == ctx->parallel_copies_count) {
697       array_insert(ctx, ctx->parallel_copies,
698                    (struct ra_parallel_copy){
699                       .interval = interval,
700                       .src = interval->physreg_start,
701                    });
702    }
703 
704    ir3_reg_interval_remove_temp(&file->reg_ctx, &interval->interval);
705 
706    return (struct ra_removed_interval){
707       .interval = interval,
708       .size = interval->physreg_end - interval->physreg_start,
709    };
710 }
711 
712 static void
713 ra_push_interval(struct ra_ctx *ctx, struct ra_file *file,
714                  const struct ra_removed_interval *removed, physreg_t dst)
715 {
716    struct ra_interval *interval = removed->interval;
717 
718    interval->physreg_start = dst;
719    interval->physreg_end = dst + removed->size;
720 
721    assert(interval->physreg_end <= file->size);
722    if (interval->interval.reg->flags & IR3_REG_HALF)
723       assert(interval->physreg_end <= RA_HALF_SIZE);
724 
725    ir3_reg_interval_reinsert(&file->reg_ctx, &interval->interval);
726 }
727 
728 /* Pick up the interval and place it at "dst". */
729 static void
730 ra_move_interval(struct ra_ctx *ctx, struct ra_file *file,
731                  struct ra_interval *interval, physreg_t dst)
732 {
733    struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval);
734    ra_push_interval(ctx, file, &temp, dst);
735 }
736 
737 static struct ra_file *
738 ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
739 {
740    if (reg->flags & IR3_REG_SHARED)
741       return &ctx->shared;
742    else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
743       return &ctx->full;
744    else
745       return &ctx->half;
746 }
747 
748 
749 /* Returns true if the proposed spot for "dst" or a killed source overlaps a
750  * destination that's been allocated.
751  */
752 static bool
753 check_dst_overlap(struct ra_ctx *ctx, struct ra_file *file,
754                   struct ir3_register *dst, physreg_t start,
755                   physreg_t end)
756 {
757    struct ir3_instruction *instr = dst->instr;
758 
759    ra_foreach_dst (other_dst, instr) {
760       /* We assume only destinations before the current one have been allocated.
761        */
762       if (other_dst == dst)
763          break;
764 
765       if (ra_get_file(ctx, other_dst) != file)
766          continue;
767 
768       struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
769       assert(!other_interval->interval.parent);
770       physreg_t other_start = other_interval->physreg_start;
771       physreg_t other_end = other_interval->physreg_end;
772 
773       if (other_end > start && end > other_start)
774          return true;
775    }
776 
777    return false;
778 }
779 
780 /* True if the destination is "early-clobber," meaning that it cannot be
781  * allocated over killed sources. Some destinations always require it, but it
782  * also is implicitly true for tied destinations whose source is live-through.
783  * If the source is killed, then we skip allocating a register for the
784  * destination altogether so we don't need to worry about that case here.
785  */
786 static bool
787 is_early_clobber(struct ir3_register *reg)
788 {
789    return (reg->flags & IR3_REG_EARLY_CLOBBER) || reg->tied;
790 }
791 
792 static bool
793 get_reg_specified(struct ra_ctx *ctx, struct ra_file *file,
794                   struct ir3_register *reg, physreg_t physreg, bool is_source)
795 {
796    for (unsigned i = 0; i < reg_size(reg); i++) {
797       if (!BITSET_TEST(is_early_clobber(reg) || is_source ?
798                            file->available_to_evict : file->available,
799                        physreg + i))
800          return false;
801    }
802 
803    if (!is_source &&
804        check_dst_overlap(ctx, file, reg, physreg, physreg + reg_size(reg)))
805       return false;
806 
807    return true;
808 }
809 
810 /* Try to evict any registers conflicting with the proposed spot "physreg" for
811  * "reg". That is, move them to other places so that we can allocate "physreg"
812  * here.
813  */
814 
815 static bool
816 try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
817                struct ir3_register *reg, physreg_t physreg,
818                unsigned *_eviction_count, bool is_source, bool speculative)
819 {
820    BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
821    memcpy(available_to_evict, file->available_to_evict,
822           sizeof(available_to_evict));
823 
824    BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
825    memcpy(available, file->available, sizeof(available));
826 
827    for (unsigned i = 0; i < reg_size(reg); i++) {
828       BITSET_CLEAR(available_to_evict, physreg + i);
829       BITSET_CLEAR(available, physreg + i);
830    }
831 
832    unsigned eviction_count = 0;
833    /* Iterate over each range conflicting with physreg */
834    for (struct ra_interval *conflicting = ra_file_search_right(file, physreg),
835                            *next = ra_interval_next_or_null(conflicting);
836         conflicting != NULL &&
837         conflicting->physreg_start < physreg + reg_size(reg);
838         conflicting = next, next = ra_interval_next_or_null(next)) {
839       if (!is_early_clobber(reg) && !is_source && conflicting->is_killed)
840          continue;
841 
842       if (conflicting->frozen) {
843          assert(speculative);
844          return false;
845       }
846 
847       unsigned conflicting_file_size =
848          reg_file_size(file, conflicting->interval.reg);
849       unsigned avail_start, avail_end;
850       bool evicted = false;
851       BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict,
852                             conflicting_file_size) {
853          unsigned size = avail_end - avail_start;
854 
855          /* non-half registers must be aligned */
856          if (!(conflicting->interval.reg->flags & IR3_REG_HALF) &&
857              avail_start % 2 == 1) {
858             avail_start++;
859             size--;
860          }
861 
862          unsigned conflicting_size =
863             conflicting->physreg_end - conflicting->physreg_start;
864          if (size >= conflicting_size &&
865              !check_dst_overlap(ctx, file, reg, avail_start, avail_start +
866                                 conflicting_size)) {
867             for (unsigned i = 0;
868                  i < conflicting->physreg_end - conflicting->physreg_start; i++)
869                BITSET_CLEAR(available_to_evict, avail_start + i);
870             eviction_count +=
871                conflicting->physreg_end - conflicting->physreg_start;
872             if (!speculative)
873                ra_move_interval(ctx, file, conflicting, avail_start);
874             evicted = true;
875             break;
876          }
877       }
878 
879       if (evicted)
880          continue;
881 
882       /* If we couldn't evict this range, we may be able to swap it with a
883        * killed range to achieve the same effect.
884        */
885       foreach_interval (killed, file) {
886          if (!killed->is_killed)
887             continue;
888 
889          if (killed->physreg_end - killed->physreg_start !=
890              conflicting->physreg_end - conflicting->physreg_start)
891             continue;
892 
893          if (killed->physreg_end > conflicting_file_size ||
894              conflicting->physreg_end > reg_file_size(file, killed->interval.reg))
895             continue;
896 
897          /* We can't swap the killed range if it partially/fully overlaps the
898           * space we're trying to allocate or (in speculative mode) if it's
899           * already been swapped and will overlap when we actually evict.
900           */
901          bool killed_available = true;
902          for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
903             if (!BITSET_TEST(available, i)) {
904                killed_available = false;
905                break;
906             }
907          }
908 
909          if (!killed_available)
910             continue;
911 
912          if (check_dst_overlap(ctx, file, reg, killed->physreg_start,
913                                killed->physreg_end))
914             continue;
915 
916          /* Check for alignment if one is a full reg */
917          if ((!(killed->interval.reg->flags & IR3_REG_HALF) ||
918               !(conflicting->interval.reg->flags & IR3_REG_HALF)) &&
919              (killed->physreg_start % 2 != 0 ||
920               conflicting->physreg_start % 2 != 0))
921             continue;
922 
923          for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
924             BITSET_CLEAR(available, i);
925          }
926          /* Because this will generate swaps instead of moves, multiply the
927           * cost by 2.
928           */
929          eviction_count += (killed->physreg_end - killed->physreg_start) * 2;
930          if (!speculative) {
931             physreg_t killed_start = killed->physreg_start,
932                       conflicting_start = conflicting->physreg_start;
933             struct ra_removed_interval killed_removed =
934                ra_pop_interval(ctx, file, killed);
935             struct ra_removed_interval conflicting_removed =
936                ra_pop_interval(ctx, file, conflicting);
937             ra_push_interval(ctx, file, &killed_removed, conflicting_start);
938             ra_push_interval(ctx, file, &conflicting_removed, killed_start);
939          }
940 
941          evicted = true;
942          break;
943       }
944 
945       if (!evicted)
946          return false;
947    }
948 
949    *_eviction_count = eviction_count;
950    return true;
951 }
952 
953 static int
954 removed_interval_cmp(const void *_i1, const void *_i2)
955 {
956    const struct ra_removed_interval *i1 = _i1;
957    const struct ra_removed_interval *i2 = _i2;
958 
959    /* We sort the registers as follows:
960     *
961     * |------------------------------------------------------------------------------------------|
962     * |               |                    |        |        |                    |              |
963     * |  Half         | Half early-clobber | Half   | Full   | Full early-clobber | Full         |
964     * |  live-through | destination        | killed | killed | destination        | live-through |
965     * |               |                    |        |        |                    |              |
966     * |------------------------------------------------------------------------------------------|
967     *                                      |                 |
968     *                                      |   Destination   |
969     *                                      |                 |
970     *                                      |-----------------|
971     *
972     * Half-registers have to be first so that they stay in the low half of
973     * the register file. Then half and full killed must stay together so that
974     * there's a contiguous range where we can put the register. With this
975     * structure we should be able to accommodate any collection of intervals
976     * such that the total number of half components is within the half limit
977     * and the combined components are within the full limit.
978     */
979 
980    unsigned i1_align = reg_elem_size(i1->interval->interval.reg);
981    unsigned i2_align = reg_elem_size(i2->interval->interval.reg);
982    if (i1_align > i2_align)
983       return 1;
984    if (i1_align < i2_align)
985       return -1;
986 
987    if (i1_align == 1) {
988       if (i2->interval->is_killed)
989          return -1;
990       if (i1->interval->is_killed)
991          return 1;
992    } else {
993       if (i2->interval->is_killed)
994          return 1;
995       if (i1->interval->is_killed)
996          return -1;
997    }
998 
999    return 0;
1000 }
1001 
1002 static int
1003 dsts_cmp(const void *_i1, const void *_i2)
1004 {
1005    struct ir3_register *i1 = *(struct ir3_register *const *) _i1;
1006    struct ir3_register *i2 = *(struct ir3_register *const *) _i2;
1007 
1008    /* Treat tied destinations as-if they are live-through sources, and normal
1009     * destinations as killed sources.
1010     */
1011    unsigned i1_align = reg_elem_size(i1);
1012    unsigned i2_align = reg_elem_size(i2);
1013    if (i1_align > i2_align)
1014       return 1;
1015    if (i1_align < i2_align)
1016       return -1;
1017 
1018    if (i1_align == 1) {
1019       if (!is_early_clobber(i2))
1020          return -1;
1021       if (!is_early_clobber(i1))
1022          return 1;
1023    } else {
1024       if (!is_early_clobber(i2))
1025          return 1;
1026       if (!is_early_clobber(i1))
1027          return -1;
1028    }
1029 
1030    return 0;
1031 }
1032 
1033 /* "Compress" all the live intervals so that there is enough space for the
1034  * destination register. As there can be gaps when a more-aligned interval
1035  * follows a less-aligned interval, this also sorts them to remove such
1036  * "padding", which may be required when space is very tight.  This isn't
1037  * amazing, but should be used only as a last resort in case the register file
1038  * is almost full and badly fragmented.
1039  *
1040  * Return the physreg to use.
1041  */
1042 static physreg_t
1043 compress_regs_left(struct ra_ctx *ctx, struct ra_file *file,
1044                    struct ir3_register *reg)
1045 {
1046    unsigned align = reg_elem_size(reg);
1047    DECLARE_ARRAY(struct ra_removed_interval, intervals);
1048    intervals_count = intervals_sz = 0;
1049    intervals = NULL;
1050 
1051    DECLARE_ARRAY(struct ir3_register *, dsts);
1052    dsts_count = dsts_sz = 0;
1053    dsts = NULL;
1054    array_insert(ctx, dsts, reg);
1055    bool dst_inserted[reg->instr->dsts_count];
1056 
1057    unsigned dst_size = reg->tied ? 0 : reg_size(reg);
1058    unsigned ec_dst_size = is_early_clobber(reg) ? reg_size(reg) : 0;
1059    unsigned half_dst_size = 0, ec_half_dst_size = 0;
1060    if (align == 1) {
1061       half_dst_size = dst_size;
1062       ec_half_dst_size = ec_dst_size;
1063    }
1064 
1065    unsigned removed_size = 0, removed_half_size = 0;
1066    unsigned removed_killed_size = 0, removed_killed_half_size = 0;
1067    unsigned file_size =
1068       align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size;
1069    physreg_t start_reg = 0;
1070 
1071    foreach_interval_rev_safe (interval, file) {
1072       /* We'll check if we can compact the intervals starting here. */
1073       physreg_t candidate_start = interval->physreg_end;
1074 
1075       /* Check if there are any other destinations we need to compact. */
1076       ra_foreach_dst_n (other_dst, n, reg->instr) {
1077          if (other_dst == reg)
1078             break;
1079          if (ra_get_file(ctx, other_dst) != file)
1080             continue;
1081          if (dst_inserted[n])
1082             continue;
1083 
1084          struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
1085          /* if the destination partially overlaps this interval, we need to
1086           * extend candidate_start to the end.
1087           */
1088          if (other_interval->physreg_start < candidate_start) {
1089             candidate_start = MAX2(candidate_start,
1090                                    other_interval->physreg_end);
1091             continue;
1092          }
1093 
1094          dst_inserted[n] = true;
1095 
1096          /* dst intervals with a tied killed source are considered attached to
1097           * that source. Don't actually insert them. This means we have to
1098           * update them below if their tied source moves.
1099           */
1100          if (other_dst->tied) {
1101             struct ra_interval *tied_interval =
1102                &ctx->intervals[other_dst->tied->def->name];
1103             if (tied_interval->is_killed)
1104                continue;
1105          }
1106 
1107          d("popping destination %u physreg %u\n",
1108            other_interval->interval.reg->name,
1109            other_interval->physreg_start);
1110 
1111          array_insert(ctx, dsts, other_dst);
1112          unsigned interval_size = reg_size(other_dst);
1113          if (is_early_clobber(other_dst)) {
1114             ec_dst_size += interval_size;
1115             if (other_interval->interval.reg->flags & IR3_REG_HALF)
1116                ec_half_dst_size += interval_size;
1117          } else {
1118             dst_size += interval_size;
1119             if (other_interval->interval.reg->flags & IR3_REG_HALF)
1120                half_dst_size += interval_size;
1121          }
1122       }
1123 
1124       /* Check if we can sort the intervals *after* this one and have enough
1125       * space leftover to accommodate all intervals, keeping in mind that killed
1126        * sources overlap non-tied destinations. Also check that we have enough
1127        * space leftover for half-registers, if we're inserting a half-register
1128        * (otherwise we only shift any half-registers down so they should be
1129        * safe).
1130        */
1131       if (candidate_start + removed_size + ec_dst_size +
1132           MAX2(removed_killed_size, dst_size) <= file->size &&
1133           (align != 1 ||
1134            candidate_start + removed_half_size + ec_half_dst_size +
1135            MAX2(removed_killed_half_size, half_dst_size) <= file_size)) {
1136          start_reg = candidate_start;
1137          break;
1138       }
1139 
1140       /* We assume that all frozen intervals are at the start and that we
1141        * can avoid popping them.
1142        */
1143       assert(!interval->frozen);
1144 
1145       /* Killed sources are different because they go at the end and can
1146        * overlap the register we're trying to add.
1147        */
1148       unsigned interval_size = interval->physreg_end - interval->physreg_start;
1149       if (interval->is_killed) {
1150          removed_killed_size += interval_size;
1151          if (interval->interval.reg->flags & IR3_REG_HALF)
1152             removed_killed_half_size += interval_size;
1153       } else {
1154          removed_size += interval_size;
1155          if (interval->interval.reg->flags & IR3_REG_HALF)
1156             removed_half_size += interval_size;
1157       }
1158 
1159       /* Now that we've done the accounting, pop this off */
1160       d("popping interval %u physreg %u%s\n", interval->interval.reg->name,
1161         interval->physreg_start, interval->is_killed ? ", killed" : "");
1162       array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));
1163    }
1164 
1165    /* TODO: In addition to skipping registers at the beginning that are
1166     * well-packed, we should try to skip registers at the end.
1167     */
1168 
1169    qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);
1170    qsort(dsts, dsts_count, sizeof(*dsts), dsts_cmp);
1171 
1172    physreg_t live_reg = start_reg;
1173    physreg_t dst_reg = (physreg_t)~0;
1174    physreg_t ret_reg = (physreg_t)~0;
1175    unsigned dst_index = 0;
1176    unsigned live_index = 0;
1177 
1178    /* We have two lists of intervals to process, live intervals and destination
1179     * intervals. Process them in the order of the diagram in removed_interval_cmp().
1180     */
1181    while (live_index < intervals_count || dst_index < dsts_count) {
1182       bool process_dst;
1183       if (live_index == intervals_count) {
1184          process_dst = true;
1185       } else if (dst_index == dsts_count) {
1186          process_dst = false;
1187       } else {
1188          struct ir3_register *dst = dsts[dst_index];
1189          struct ra_interval *live_interval = intervals[live_index].interval;
1190 
1191          bool live_half = live_interval->interval.reg->flags & IR3_REG_HALF;
1192          bool live_killed = live_interval->is_killed;
1193          bool dst_half = dst->flags & IR3_REG_HALF;
1194          bool dst_early_clobber = is_early_clobber(dst);
1195 
1196          if (live_half && !live_killed) {
1197             /* far-left of diagram. */
1198             process_dst = false;
1199          } else if (dst_half && dst_early_clobber) {
1200             /* mid-left of diagram. */
1201             process_dst = true;
1202          } else if (!dst_early_clobber) {
1203             /* bottom of diagram. */
1204             process_dst = true;
1205          } else if (live_killed) {
1206             /* middle of diagram. */
1207             process_dst = false;
1208          } else if (!dst_half && dst_early_clobber) {
1209             /* mid-right of diagram. */
1210             process_dst = true;
1211          } else {
1212             /* far right of diagram. */
1213             assert(!live_killed && !live_half);
1214             process_dst = false;
1215          }
1216       }
1217 
1218       struct ir3_register *cur_reg =
1219          process_dst ? dsts[dst_index] :
1220          intervals[live_index].interval->interval.reg;
1221 
1222       physreg_t physreg;
1223       if (process_dst && !is_early_clobber(cur_reg)) {
1224          if (dst_reg == (physreg_t)~0)
1225             dst_reg = live_reg;
1226          physreg = dst_reg;
1227       } else {
1228          physreg = live_reg;
1229          struct ra_interval *live_interval = intervals[live_index].interval;
1230          bool live_killed = live_interval->is_killed;
1231          /* If this is live-through and we've processed the destinations, we
1232           * need to make sure we take into account any overlapping destinations.
1233           */
1234          if (!live_killed && dst_reg != (physreg_t)~0)
1235             physreg = MAX2(physreg, dst_reg);
1236       }
1237 
1238       if (!(cur_reg->flags & IR3_REG_HALF))
1239          physreg = ALIGN(physreg, 2);
1240 
1241       d("pushing reg %u physreg %u\n", cur_reg->name, physreg);
1242 
1243       unsigned interval_size = reg_size(cur_reg);
1244       if (physreg + interval_size >
1245           reg_file_size(file, cur_reg)) {
1246          d("ran out of room for interval %u!\n",
1247            cur_reg->name);
1248          unreachable("reg pressure calculation was wrong!");
1249          return 0;
1250       }
1251 
1252       if (process_dst) {
1253          if (cur_reg == reg) {
1254             ret_reg = physreg;
1255          } else {
1256             struct ra_interval *interval = &ctx->intervals[cur_reg->name];
1257             interval->physreg_start = physreg;
1258             interval->physreg_end = physreg + interval_size;
1259          }
1260          dst_index++;
1261       } else {
1262          ra_push_interval(ctx, file, &intervals[live_index], physreg);
1263          live_index++;
1264       }
1265 
1266       physreg += interval_size;
1267 
1268       if (process_dst && !is_early_clobber(cur_reg)) {
1269          dst_reg = physreg;
1270       } else {
1271          live_reg = physreg;
1272       }
1273    }
1274 
1275    /* If we shuffled around a tied source that is killed, we may have to update
1276     * its corresponding destination since we didn't insert it above.
1277     */
1278    ra_foreach_dst (dst, reg->instr) {
1279       if (dst == reg)
1280          break;
1281 
1282       struct ir3_register *tied = dst->tied;
1283       if (!tied)
1284          continue;
1285 
1286       struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1287       if (!tied_interval->is_killed)
1288          continue;
1289 
1290       struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1291       unsigned dst_size = reg_size(dst);
1292       dst_interval->physreg_start = ra_interval_get_physreg(tied_interval);
1293       dst_interval->physreg_end = dst_interval->physreg_start + dst_size;
1294    }
1295 
1296    return ret_reg;
1297 }
1298 
1299 static void
1300 update_affinity(struct ra_file *file, struct ir3_register *reg,
1301                 physreg_t physreg)
1302 {
1303    if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0)
1304       return;
1305 
1306    if (physreg < reg->merge_set_offset)
1307       return;
1308 
1309    if ((physreg - reg->merge_set_offset + reg->merge_set->size) > file->size)
1310       return;
1311 
1312    reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;
1313 }
1314 
1315 /* Try to find free space for a register without shuffling anything. This uses
1316  * a round-robin algorithm to reduce false dependencies.
1317  */
1318 static physreg_t
1319 find_best_gap(struct ra_ctx *ctx, struct ra_file *file,
1320               struct ir3_register *dst, unsigned file_size, unsigned size,
1321               unsigned align)
1322 {
1323    /* This can happen if we create a very large merge set. Just bail out in that
1324     * case.
1325     */
1326    if (size > file_size)
1327       return (physreg_t) ~0;
1328 
1329    BITSET_WORD *available =
1330       is_early_clobber(dst) ? file->available_to_evict : file->available;
1331 
1332    unsigned start = ALIGN(file->start, align) % (file_size - size + align);
1333    unsigned candidate = start;
1334    do {
1335       bool is_available = true;
1336       for (unsigned i = 0; i < size; i++) {
1337          if (!BITSET_TEST(available, candidate + i)) {
1338             is_available = false;
1339             break;
1340          }
1341       }
1342 
1343       if (is_available) {
1344          is_available =
1345             !check_dst_overlap(ctx, file, dst, candidate, candidate + size);
1346       }
1347 
1348       if (is_available) {
1349          file->start = (candidate + size) % file_size;
1350          return candidate;
1351       }
1352 
1353       candidate += align;
1354       if (candidate + size > file_size)
1355          candidate = 0;
1356    } while (candidate != start);
1357 
1358    return (physreg_t)~0;
1359 }
1360 
1361 /* This is the main entrypoint for picking a register. Pick a free register
1362  * for "reg", shuffling around sources if necessary. In the normal case where
1363  * "is_source" is false, this register can overlap with killed sources
1364  * (intervals with "is_killed == true"). If "is_source" is true, then
1365  * is_killed is ignored and the register returned must not overlap with killed
1366  * sources. This must be used for tied registers, because we're actually
1367  * allocating the destination and the tied source at the same time.
1368  */
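/* A hedged usage sketch, not a call site from this file (the helpers exist
 * above but the surrounding code is invented): a caller would pick the file,
 * get a register, and then encode it into the instruction:
 *
 *    struct ra_file *file = ra_get_file(ctx, dst);
 *    physreg_t physreg = get_reg(ctx, file, dst);
 *    assign_reg(dst->instr, dst, ra_physreg_to_num(physreg, dst->flags));
 */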
1369 
1370 static physreg_t
1371 get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg)
1372 {
1373    unsigned file_size = reg_file_size(file, reg);
1374    if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
1375       physreg_t preferred_reg =
1376          reg->merge_set->preferred_reg + reg->merge_set_offset;
1377       if (preferred_reg + reg_size(reg) <= file_size &&
1378           preferred_reg % reg_elem_size(reg) == 0 &&
1379           get_reg_specified(ctx, file, reg, preferred_reg, false))
1380          return preferred_reg;
1381    }
1382 
1383    /* If this register is a subset of a merge set which we have not picked a
1384     * register for, first try to allocate enough space for the entire merge
1385     * set.
1386     */
1387    unsigned size = reg_size(reg);
1388    if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
1389        size < reg->merge_set->size) {
1390       physreg_t best_reg = find_best_gap(ctx, file, reg, file_size,
1391                                          reg->merge_set->size,
1392                                          reg->merge_set->alignment);
1393       if (best_reg != (physreg_t)~0u) {
1394          best_reg += reg->merge_set_offset;
1395          return best_reg;
1396       }
1397    }
1398 
1399    /* For ALU and SFU instructions, if a source register is available to
1400     * pick, reuse it: this doesn't introduce unnecessary dependencies, and
1401     * it potentially avoids needing (ss) syncs for write-after-read hazards
1402     * on SFU instructions:
1403     */
1404    if (is_sfu(reg->instr) || is_alu(reg->instr)) {
1405       for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
1406          struct ir3_register *src = reg->instr->srcs[i];
1407          if (!ra_reg_is_src(src))
1408             continue;
1409          if (ra_get_file(ctx, src) == file && reg_size(src) >= size) {
1410             struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1411             physreg_t src_physreg = ra_interval_get_physreg(src_interval);
1412             if (src_physreg % reg_elem_size(reg) == 0 &&
1413                 src_physreg + size <= file_size &&
1414                 get_reg_specified(ctx, file, reg, src_physreg, false))
1415                return src_physreg;
1416          }
1417       }
1418    }
1419 
1420    physreg_t best_reg =
1421       find_best_gap(ctx, file, reg, file_size, size, reg_elem_size(reg));
1422    if (best_reg != (physreg_t)~0u) {
1423       return best_reg;
1424    }
1425 
1426    /* Ok, we couldn't find anything that fits. Here is where we have to start
1427     * moving things around to make stuff fit. First try solely evicting
1428     * registers in the way.
1429     */
1430    unsigned best_eviction_count = ~0;
1431    for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {
1432       unsigned eviction_count;
1433       if (try_evict_regs(ctx, file, reg, i, &eviction_count, false, true)) {
1434          if (eviction_count < best_eviction_count) {
1435             best_eviction_count = eviction_count;
1436             best_reg = i;
1437          }
1438       }
1439    }
1440 
1441    if (best_eviction_count != ~0) {
1442       ASSERTED bool result = try_evict_regs(
1443          ctx, file, reg, best_reg, &best_eviction_count, false, false);
1444       assert(result);
1445       return best_reg;
1446    }
1447 
1448    /* Use the dumb fallback only if try_evict_regs() fails. */
1449    return compress_regs_left(ctx, file, reg);
1450 }
1451 
1452 static void
1453 assign_reg(struct ir3_instruction *instr, struct ir3_register *reg,
1454            unsigned num)
1455 {
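   /* Illustrative example: an array access with array.offset == 2 that is
    * assigned num == 10 gets array.base = 10 and, in the non-relative case,
    * reg->num = 12; relative (a0-indexed) accesses instead fold num into
    * array.offset.
    */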
1456    if (reg->flags & IR3_REG_ARRAY) {
1457       reg->array.base = num;
1458       if (reg->flags & IR3_REG_RELATIV)
1459          reg->array.offset += num;
1460       else
1461          reg->num = num + reg->array.offset;
1462    } else {
1463       reg->num = num;
1464    }
1465 }
1466 
1467 static void
1468 mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src)
1469 {
1470    struct ra_interval *interval = &ctx->intervals[src->def->name];
1471 
1472    if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed ||
1473        interval->interval.parent ||
1474        !rb_tree_is_empty(&interval->interval.children))
1475       return;
1476 
1477    ra_file_mark_killed(ra_get_file(ctx, src), interval);
1478 }
1479 
1480 static void
1481 insert_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1482 {
1483    struct ra_file *file = ra_get_file(ctx, dst);
1484    struct ra_interval *interval = &ctx->intervals[dst->name];
1485 
1486    d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval));
1487 
1488    if (!(dst->flags & IR3_REG_UNUSED))
1489       ra_file_insert(file, interval);
1490 
1491    assign_reg(dst->instr, dst, ra_interval_get_num(interval));
1492 }
1493 
1494 static void
1495 allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst,
1496                    physreg_t physreg)
1497 {
1498    struct ra_file *file = ra_get_file(ctx, dst);
1499    struct ra_interval *interval = &ctx->intervals[dst->name];
1500    update_affinity(file, dst, physreg);
1501 
1502    ra_interval_init(interval, dst);
1503    interval->physreg_start = physreg;
1504    interval->physreg_end = physreg + reg_size(dst);
1505 }
1506 
1507 /* If a tied destination interferes with its source register, we have to insert
1508  * a copy beforehand to copy the source to the destination. Because we are using
1509  * the parallel_copies array and not creating a separate copy, this copy will
1510  * happen in parallel with any shuffling around of the tied source, so we have
1511  * to copy the source *as it exists before it is shuffled around*. We do this by
1512  * inserting the copy early, before any other copies are inserted. We don't
1513  * actually know the destination of the copy, but that's ok because the
1514  * dst_interval will be filled out later.
1515  */
1516 static void
1517 insert_tied_dst_copy(struct ra_ctx *ctx, struct ir3_register *dst)
1518 {
1519    struct ir3_register *tied = dst->tied;
1520 
1521    if (!tied)
1522       return;
1523 
1524    struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1525    struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1526 
1527    if (tied_interval->is_killed)
1528       return;
1529 
1530    physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
1531 
1532    array_insert(ctx, ctx->parallel_copies,
1533                 (struct ra_parallel_copy){
1534                    .interval = dst_interval,
1535                    .src = tied_physreg,
1536                 });
1537 }
1538 
1539 static void
1540 allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1541 {
1542    struct ra_file *file = ra_get_file(ctx, dst);
1543 
1544    struct ir3_register *tied = dst->tied;
1545    if (tied) {
1546       struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1547       if (tied_interval->is_killed) {
1548          /* The easy case: the source is killed, so we can just reuse it
1549           * for the destination.
1550           */
1551          allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));
1552          return;
1553       }
1554    }
1555 
1556    /* All the hard work is done by get_reg here. */
1557    physreg_t physreg = get_reg(ctx, file, dst);
1558 
1559    allocate_dst_fixed(ctx, dst, physreg);
1560 }
1561 
1562 static void
1563 assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
1564            struct ir3_register *src)
1565 {
1566    struct ra_interval *interval = &ctx->intervals[src->def->name];
1567    struct ra_file *file = ra_get_file(ctx, src);
1568 
1569    struct ir3_register *tied = src->tied;
1570    physreg_t physreg;
1571    if (tied) {
1572       struct ra_interval *tied_interval = &ctx->intervals[tied->name];
1573       physreg = ra_interval_get_physreg(tied_interval);
1574    } else {
1575       physreg = ra_interval_get_physreg(interval);
1576    }
1577 
1578    assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags));
1579 
1580    if (src->flags & IR3_REG_FIRST_KILL)
1581       ra_file_remove(file, interval);
1582 }
1583 
1584 /* Insert a parallel copy instruction, containing the parallel copy entries
1585  * we've built up, right before the given instruction.
1586  */
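/* Destinations are all created before sources so that dst[i] pairs up with
 * src[i] for the i'th recorded copy, and the finished pcopy is then moved to
 * sit immediately before "instr".
 */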
1587 static void
1588 insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1589 {
1590    if (ctx->parallel_copies_count == 0)
1591       return;
1592 
1593    struct ir3_instruction *pcopy =
1594       ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY,
1595                        ctx->parallel_copies_count, ctx->parallel_copies_count);
1596 
1597    for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1598       struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1599       struct ir3_register *reg =
1600          ir3_dst_create(pcopy, INVALID_REG,
1601                         entry->interval->interval.reg->flags &
1602                         (IR3_REG_HALF | IR3_REG_ARRAY));
1603       reg->size = entry->interval->interval.reg->size;
1604       reg->wrmask = entry->interval->interval.reg->wrmask;
1605       assign_reg(pcopy, reg, ra_interval_get_num(entry->interval));
1606    }
1607 
1608    for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1609       struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1610       struct ir3_register *reg =
1611          ir3_src_create(pcopy, INVALID_REG,
1612                         entry->interval->interval.reg->flags &
1613                         (IR3_REG_HALF | IR3_REG_ARRAY));
1614       reg->size = entry->interval->interval.reg->size;
1615       reg->wrmask = entry->interval->interval.reg->wrmask;
1616       assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags));
1617    }
1618 
1619    list_del(&pcopy->node);
1620    list_addtail(&pcopy->node, &instr->node);
1621    ctx->parallel_copies_count = 0;
1622 }
1623 
1624 static void
1625 handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1626 {
1627    /* First, mark sources as going-to-be-killed while allocating the dest. */
1628    ra_foreach_src (src, instr) {
1629       mark_src_killed(ctx, src);
1630    }
1631 
1632    /* Pre-insert tied dst copies. */
1633    ra_foreach_dst (dst, instr) {
1634       insert_tied_dst_copy(ctx, dst);
1635    }
1636 
1637    /* Allocate the destination. */
1638    ra_foreach_dst (dst, instr) {
1639       allocate_dst(ctx, dst);
1640    }
1641 
1642    /* Now handle sources. Go backward so that, if there are multiple
1643     * sources with the same def and that def is killed, we only remove it
1644     * at the end.
1645     */
1646    ra_foreach_src_rev (src, instr) {
1647       assign_src(ctx, instr, src);
1648    }
1649 
1650    /* Now finally insert the destination into the map. */
1651    ra_foreach_dst (dst, instr) {
1652       insert_dst(ctx, dst);
1653    }
1654 
1655    insert_parallel_copy_instr(ctx, instr);
1656 }
1657 
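/* A split that stays inside its source's merge set doesn't need a register of
 * its own: the destination is pinned at the source's physreg adjusted by the
 * difference in merge_set_offset. Splits whose destination landed in a
 * different merge set (or none) fall back to normal handling.
 */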
1658 static void
1659 handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
1660 {
1661    struct ir3_register *dst = instr->dsts[0];
1662    struct ir3_register *src = instr->srcs[0];
1663 
1664    if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
1665       handle_normal_instr(ctx, instr);
1666       return;
1667    }
1668 
1669    struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1670 
1671    physreg_t physreg = ra_interval_get_physreg(src_interval);
1672    assign_src(ctx, instr, src);
1673 
1674    allocate_dst_fixed(
1675       ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset);
1676    insert_dst(ctx, dst);
1677 }
1678 
1679 static void
1680 handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
1681 {
1682    struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;
1683    unsigned dst_offset = instr->dsts[0]->merge_set_offset;
1684 
1685    if (!dst_set || dst_set->regs_count == 1) {
1686       handle_normal_instr(ctx, instr);
1687       return;
1688    }
1689 
1690    /* We need to check if any of the sources are contained in an interval
1691     * that is at least as large as the vector. In this case, we should put
1692     * the vector inside that larger interval. (There should be one
1693     * unambiguous place to put it, because values sharing the same merge set
1694     * should be allocated together.) This can happen in a case like:
1695     *
1696     * ssa_1 (wrmask=0xf) = ...
1697     * ssa_2 = split ssa_1 off:0
1698     * ssa_3 = split ssa_1 off:1
1699     * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3
1700     * ... = (kill)ssa_1
1701     * ... = (kill)ssa_4
1702     *
1703     * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it.
1704     */
1705    physreg_t dst_fixed = (physreg_t)~0u;
1706 
1707    ra_foreach_src (src, instr) {
1708       if (src->flags & IR3_REG_FIRST_KILL) {
1709          mark_src_killed(ctx, src);
1710       }
1711 
1712       struct ra_interval *interval = &ctx->intervals[src->def->name];
1713 
1714       if (src->def->merge_set != dst_set || interval->is_killed)
1715          continue;
1716       while (interval->interval.parent != NULL) {
1717          interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1718       }
1719       if (reg_size(interval->interval.reg) >= reg_size(instr->dsts[0])) {
1720          dst_fixed = interval->physreg_start -
1721                      interval->interval.reg->merge_set_offset + dst_offset;
1722       } else {
1723          /* For sources whose root interval is smaller than the
1724           * destination (i.e. the normal case), we will shuffle them
1725           * around after allocating the destination. Mark them killed so
1726           * that the destination can be allocated over them, even if they
1727           * aren't actually killed.
1728           */
1729          ra_file_mark_killed(ra_get_file(ctx, src), interval);
1730       }
1731    }
1732 
1733    if (dst_fixed != (physreg_t)~0u)
1734       allocate_dst_fixed(ctx, instr->dsts[0], dst_fixed);
1735    else
1736       allocate_dst(ctx, instr->dsts[0]);
1737 
1738    /* Remove the temporary is_killed we added */
1739    ra_foreach_src (src, instr) {
1740       struct ra_interval *interval = &ctx->intervals[src->def->name];
1741       while (interval->interval.parent != NULL) {
1742          interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1743       }
1744 
1745       /* Filter out cases where it actually should be killed */
1746       if (interval != &ctx->intervals[src->def->name] ||
1747           !(src->flags & IR3_REG_KILL)) {
1748          ra_file_unmark_killed(ra_get_file(ctx, src), interval);
1749       }
1750    }
1751 
1752    ra_foreach_src_rev (src, instr) {
1753       assign_src(ctx, instr, src);
1754    }
1755 
1756    /* We need to do this before insert_dst(), so that children of the
1757     * destination which got marked as killed and then shuffled around to make
1758     * space for the destination have the correct pcopy destination that
1759     * matches what we assign the source of the collect to in assign_src().
1760     *
1761     * TODO: In this case we'll wind up copying the value in the pcopy and
1762     * then again in the collect. We could avoid one of those by updating the
1763     * pcopy destination to match up with the final location of the source
1764     * after the collect and making the collect a no-op. However this doesn't
1765     * seem to happen often.
1766     */
1767    insert_parallel_copy_instr(ctx, instr);
1768 
1769    /* Note: insert_dst will automatically shuffle around any intervals that
1770     * overlap the collect's interval by making them children of the collect.
1771     */
1772 
1773    insert_dst(ctx, instr->dsts[0]);
1774 }
1775 
1776 /* Parallel copies before RA should only be at the end of the block, for
1777  * phi's. For these we only need to fill in the sources, and then we fill in
1778  * the destinations in the successor block.
1779  */
1780 static void
1781 handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)
1782 {
1783    ra_foreach_src_rev (src, instr) {
1784       assign_src(ctx, instr, src);
1785    }
1786 }
1787 
1788 /* Some inputs may need to be precolored. We need to handle those first, so
1789  * that other non-precolored inputs don't accidentally get allocated over
1790  * them. Inputs are the very first thing in the shader, so it shouldn't be a
1791  * problem to allocate them to a specific physreg.
1792  */
1793 
1794 static void
1795 handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1796 {
1797    if (instr->dsts[0]->num == INVALID_REG)
1798       return;
1799 
1800    struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1801    struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1802    physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]);
1803    allocate_dst_fixed(ctx, instr->dsts[0], physreg);
1804 
1805    d("insert precolored dst %u physreg %u", instr->dsts[0]->name,
1806      ra_interval_get_physreg(interval));
1807 
1808    ra_file_insert(file, interval);
1809    interval->frozen = true;
1810 }
1811 
1812 static void
1813 handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1814 {
1815    if (instr->dsts[0]->num != INVALID_REG)
1816       return;
1817 
1818    allocate_dst(ctx, instr->dsts[0]);
1819 
1820    struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1821    struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1822    ra_file_insert(file, interval);
1823 }
1824 
1825 static void
1826 assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1827 {
1828    struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1829    struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1830 
1831    if (instr->dsts[0]->num == INVALID_REG) {
1832       assign_reg(instr, instr->dsts[0], ra_interval_get_num(interval));
1833    } else {
1834       interval->frozen = false;
1835    }
1836 
1837    if (instr->dsts[0]->flags & IR3_REG_UNUSED)
1838       ra_file_remove(file, interval);
1839 
1840    ra_foreach_src_rev (src, instr)
1841       assign_src(ctx, instr, src);
1842 }
1843 
1844 /* chmask is a bit weird, because it has pre-colored sources due to the need
1845  * to pass some registers to the next stage. Fortunately there are only at
1846  * most two, and there should be no other live values by the time we get to
1847  * this instruction, so we only have to do the minimum and don't need any
1848  * fancy fallbacks.
1849  *
1850  * TODO: Add more complete handling of precolored sources, e.g. for function
1851  * argument handling. We'd need a way to mark sources as fixed so that they
1852  * don't get moved around when placing other sources in the fallback case, and
1853  * a duplication of much of the logic in get_reg(). This also opens another
1854  * can of worms, e.g. what if the precolored source is a split of a vector
1855  * which is still live -- this breaks our assumption that splits don't incur
1856  * any "extra" register requirements and we'd have to break it out of the
1857  * parent ra_interval.
1858  */
1859 
1860 static void
1861 handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src)
1862 {
1863    struct ra_file *file = ra_get_file(ctx, src);
1864    struct ra_interval *interval = &ctx->intervals[src->def->name];
1865    physreg_t physreg = ra_reg_get_physreg(src);
1866 
1867    if (ra_interval_get_num(interval) == src->num)
1868       return;
1869 
1870    /* Try evicting stuff in our way if it isn't free. This won't move
1871     * anything unless it overlaps with our precolored physreg, so we don't
1872     * have to worry about evicting other precolored sources.
1873     */
1874    if (!get_reg_specified(ctx, file, src, physreg, true)) {
1875       unsigned eviction_count;
1876       if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true,
1877                           false)) {
1878          unreachable("failed to evict for precolored source!");
1879          return;
1880       }
1881    }
1882 
1883    ra_move_interval(ctx, file, interval, physreg);
1884 }
1885 
1886 static void
1887 handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr)
1888 {
1889    /* Note: we purposely don't mark sources as killed, so that we can reuse
1890     * some of the get_reg() machinery as-if the source is a destination.
1891     * Marking it as killed would mean that e.g. get_reg_specified() wouldn't
1892     * work correctly.
1893     */
1894    ra_foreach_src (src, instr) {
1895       assert(src->num != INVALID_REG);
1896       handle_precolored_source(ctx, src);
1897    }
1898 
1899    ra_foreach_src (src, instr) {
1900       struct ra_file *file = ra_get_file(ctx, src);
1901       struct ra_interval *interval = &ctx->intervals[src->def->name];
1902       if (src->flags & IR3_REG_FIRST_KILL)
1903          ra_file_remove(file, interval);
1904    }
1905 
1906    insert_parallel_copy_instr(ctx, instr);
1907 }
1908 
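/* If a live-range split left this value in a different register at the end of
 * the given block than its original assignment, the block's renames table
 * records the actual location; otherwise fall back to the physreg baked into
 * the definition.
 */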
1909 static physreg_t
1910 read_register(struct ra_ctx *ctx, struct ir3_block *block,
1911               struct ir3_register *def)
1912 {
1913    struct ra_block_state *state = &ctx->blocks[block->index];
1914    if (state->renames) {
1915       struct hash_entry *entry = _mesa_hash_table_search(state->renames, def);
1916       if (entry) {
1917          return (physreg_t)(uintptr_t)entry->data;
1918       }
1919    }
1920 
1921    return ra_reg_get_physreg(def);
1922 }
1923 
1924 static void
1925 handle_live_in(struct ra_ctx *ctx, struct ir3_register *def)
1926 {
1927    physreg_t physreg = ~0;
1928    for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
1929       struct ir3_block *pred = ctx->block->predecessors[i];
1930       struct ra_block_state *pred_state = &ctx->blocks[pred->index];
1931 
1932       if (!pred_state->visited)
1933          continue;
1934 
1935       physreg = read_register(ctx, pred, def);
1936       break;
1937    }
1938 
1939    assert(physreg != (physreg_t)~0);
1940 
1941    struct ra_interval *interval = &ctx->intervals[def->name];
1942    struct ra_file *file = ra_get_file(ctx, def);
1943    ra_interval_init(interval, def);
1944    interval->physreg_start = physreg;
1945    interval->physreg_end = physreg + reg_size(def);
1946    ra_file_insert(file, interval);
1947 }
1948 
1949 static void
1950 handle_live_out(struct ra_ctx *ctx, struct ir3_register *def)
1951 {
1952    /* Skip parallelcopy's which in the original program are only used as phi
1953     * arguments. Even though phi arguments are live out, they are only
1954     * assigned when the phi is.
1955     */
1956    if (def->instr->opc == OPC_META_PARALLEL_COPY)
1957       return;
1958 
1959    struct ra_block_state *state = &ctx->blocks[ctx->block->index];
1960    struct ra_interval *interval = &ctx->intervals[def->name];
1961    physreg_t physreg = ra_interval_get_physreg(interval);
1962    if (physreg != ra_reg_get_physreg(def)) {
1963       if (!state->renames)
1964          state->renames = _mesa_pointer_hash_table_create(ctx);
1965       _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg);
1966    }
1967 }
1968 
1969 static void
1970 handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
1971 {
1972    struct ra_file *file = ra_get_file(ctx, def);
1973    struct ra_interval *interval = &ctx->intervals[def->name];
1974 
1975    /* phis are always scalar, so they should already be the smallest possible
1976     * size. However they may be coalesced with other live-in values/phi
1977     * nodes, so check for that here.
1978     */
1979    struct ir3_reg_interval *parent_ir3 =
1980       ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start);
1981    physreg_t physreg;
1982    if (parent_ir3) {
1983       struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3);
1984       physreg = ra_interval_get_physreg(parent) +
1985                 (def->interval_start - parent_ir3->reg->interval_start);
1986    } else {
1987       physreg = get_reg(ctx, file, def);
1988    }
1989 
1990    allocate_dst_fixed(ctx, def, physreg);
1991 
1992    ra_file_insert(file, interval);
1993 }
1994 
1995 static void
1996 assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
1997 {
1998    struct ra_file *file = ra_get_file(ctx, phi->dsts[0]);
1999    struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name];
2000    assert(!interval->interval.parent);
2001    unsigned num = ra_interval_get_num(interval);
2002    assign_reg(phi, phi->dsts[0], num);
2003 
2004    /* Assign the parallelcopy sources of this phi */
2005    for (unsigned i = 0; i < phi->srcs_count; i++) {
2006       if (phi->srcs[i]->def) {
2007          assign_reg(phi, phi->srcs[i], num);
2008          assign_reg(phi, phi->srcs[i]->def, num);
2009       }
2010    }
2011 
2012    if (phi->dsts[0]->flags & IR3_REG_UNUSED)
2013       ra_file_remove(file, interval);
2014 }
2015 
2016 /* When we split a live range, we sometimes need to emit fixup code at the end
2017  * of a block. For example, something like:
2018  *
2019  * a = ...
2020  * if (...) {
2021  *    ...
2022  *    a' = a
2023  *    b = ... // a evicted to make room for b
2024  *    ...
2025  * }
2026  * ... = a
2027  *
2028  * When we insert the copy to a' in insert_parallel_copy_instr(), this forces
2029  * us to insert another copy "a = a'" at the end of the if. Normally this would
2030  * also entail adding a phi node, but since we're about to go out of SSA
2031  * anyway we just insert an extra move. Note, however, that "b" might be used
2032  * in a phi node at the end of the if and share registers with "a", so we
2033  * have to be careful to extend any preexisting parallelcopy instruction
2034  * instead of creating our own in order to guarantee that they properly get
2035  * swapped.
2036  */
2037 
2038 static void
2039 insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
2040                     struct ir3_register *reg)
2041 {
2042    struct ir3_instruction *old_pcopy = NULL;
2043    if (!list_is_empty(&block->instr_list)) {
2044       struct ir3_instruction *last =
2045          list_entry(block->instr_list.prev, struct ir3_instruction, node);
2046       if (last->opc == OPC_META_PARALLEL_COPY)
2047          old_pcopy = last;
2048    }
2049 
2050    unsigned old_pcopy_srcs = old_pcopy ? old_pcopy->srcs_count : 0;
2051    struct ir3_instruction *pcopy = ir3_instr_create(
2052       block, OPC_META_PARALLEL_COPY, old_pcopy_srcs + 1, old_pcopy_srcs + 1);
2053 
2054    for (unsigned i = 0; i < old_pcopy_srcs; i++) {
2055       old_pcopy->dsts[i]->instr = pcopy;
2056       pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i];
2057    }
2058 
2059    unsigned flags = reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
2060 
2061    struct ir3_register *dst_reg = ir3_dst_create(pcopy, INVALID_REG, flags);
2062    dst_reg->wrmask = reg->wrmask;
2063    dst_reg->size = reg->size;
2064    assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags));
2065 
2066    for (unsigned i = 0; i < old_pcopy_srcs; i++) {
2067       pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i];
2068    }
2069 
2070    struct ir3_register *src_reg = ir3_src_create(pcopy, INVALID_REG, flags);
2071    src_reg->wrmask = reg->wrmask;
2072    src_reg->size = reg->size;
2073    assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags));
2074 
2075    if (old_pcopy)
2076       list_del(&old_pcopy->node);
2077 }
2078 
2079 static void
2080 insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
2081 {
2082    physreg_t physreg = ra_interval_get_physreg(interval);
2083 
2084    bool shared = interval->interval.reg->flags & IR3_REG_SHARED;
2085    struct ir3_block **predecessors =
2086       shared ? ctx->block->physical_predecessors : ctx->block->predecessors;
2087    unsigned predecessors_count = shared
2088                                     ? ctx->block->physical_predecessors_count
2089                                     : ctx->block->predecessors_count;
2090 
2091    for (unsigned i = 0; i < predecessors_count; i++) {
2092       struct ir3_block *pred = predecessors[i];
2093       struct ra_block_state *pred_state = &ctx->blocks[pred->index];
2094 
2095       if (!pred_state->visited)
2096          continue;
2097 
2098       physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);
2099       if (pred_reg != physreg) {
2100          insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);
2101 
2102          /* This is a bit tricky, but when visiting the destination of a
2103           * physical-only edge, we have two predecessors (the if and the
2104           * header block) and both have multiple successors. We pick the
2105           * register for all live-ins from the normal edge, which should
2106           * guarantee that there's no need for shuffling things around in
2107           * the normal predecessor as long as there are no phi nodes, but
2108           * we still may need to insert fixup code in the physical
2109           * predecessor (i.e. the last block of the if) and that has
2110           * another successor (the block after the if) so we need to update
2111           * the renames state for when we process the other successor. This
2112           * crucially depends on the other successor getting processed
2113           * after this.
2114           *
2115           * For normal (non-physical) edges we disallow critical edges so
2116           * that hacks like this aren't necessary.
2117           */
2118          if (!pred_state->renames)
2119             pred_state->renames = _mesa_pointer_hash_table_create(ctx);
2120          _mesa_hash_table_insert(pred_state->renames, interval->interval.reg,
2121                                  (void *)(uintptr_t)physreg);
2122       }
2123    }
2124 }
2125 
2126 static void
2127 insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file)
2128 {
2129    BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index];
2130    rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2131                     physreg_node) {
2132       /* Skip phi nodes. This needs to happen after phi nodes are allocated,
2133        * because we may have to move live-ins around to make space for phi
2134        * nodes, but we shouldn't be handling phi nodes here.
2135        */
2136       if (BITSET_TEST(live_in, interval->interval.reg->name))
2137          insert_live_in_move(ctx, interval);
2138    }
2139 }
2140 
2141 static void
2142 insert_entry_regs(struct ra_block_state *state, struct ra_file *file)
2143 {
2144    rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2145                     physreg_node) {
2146       _mesa_hash_table_insert(state->entry_regs, interval->interval.reg,
2147                               (void *)(uintptr_t)interval->physreg_start);
2148    }
2149 }
2150 
2151 static void
2152 insert_live_in_moves(struct ra_ctx *ctx)
2153 {
2154    insert_file_live_in_moves(ctx, &ctx->full);
2155    insert_file_live_in_moves(ctx, &ctx->half);
2156    insert_file_live_in_moves(ctx, &ctx->shared);
2157 
2158    /* If not all predecessors have been visited, record the registers that
2159     * are live at block entry so that insert_live_out_moves() will work.
2160     */
2161    bool all_preds_visited = true;
2162    for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
2163       if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) {
2164          all_preds_visited = false;
2165          break;
2166       }
2167    }
2168 
2169    if (!all_preds_visited) {
2170       struct ra_block_state *state = &ctx->blocks[ctx->block->index];
2171       state->entry_regs = _mesa_pointer_hash_table_create(ctx);
2172 
2173       insert_entry_regs(state, &ctx->full);
2174       insert_entry_regs(state, &ctx->half);
2175       insert_entry_regs(state, &ctx->shared);
2176    }
2177 }
2178 
2179 static void
2180 insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval)
2181 {
2182    for (unsigned i = 0; i < 2; i++) {
2183       if (!ctx->block->successors[i])
2184          continue;
2185 
2186       struct ir3_block *succ = ctx->block->successors[i];
2187       struct ra_block_state *succ_state = &ctx->blocks[succ->index];
2188 
2189       if (!succ_state->visited)
2190          continue;
2191 
2192       struct hash_entry *entry = _mesa_hash_table_search(
2193          succ_state->entry_regs, interval->interval.reg);
2194       if (!entry)
2195          continue;
2196 
2197       physreg_t new_reg = (physreg_t)(uintptr_t)entry->data;
2198       if (new_reg != interval->physreg_start) {
2199          insert_liveout_copy(ctx->block, new_reg, interval->physreg_start,
2200                              interval->interval.reg);
2201       }
2202    }
2203 }
2204 
2205 static void
2206 insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file)
2207 {
2208    rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2209                     physreg_node) {
2210       insert_live_out_move(ctx, interval);
2211    }
2212 }
2213 
2214 static void
2215 insert_live_out_moves(struct ra_ctx *ctx)
2216 {
2217    insert_file_live_out_moves(ctx, &ctx->full);
2218    insert_file_live_out_moves(ctx, &ctx->half);
2219    insert_file_live_out_moves(ctx, &ctx->shared);
2220 }
2221 
2222 static void
2223 handle_block(struct ra_ctx *ctx, struct ir3_block *block)
2224 {
2225    ctx->block = block;
2226 
2227    /* Reset the register files from the last block */
2228    ra_file_init(&ctx->full);
2229    ra_file_init(&ctx->half);
2230    ra_file_init(&ctx->shared);
2231 
2232    /* Handle live-ins, phis, and input meta-instructions. These all appear
2233     * live at the beginning of the block and interfere with each other, and
2234     * therefore need to be allocated "in parallel". This means that we
2235     * have to allocate all of them, inserting them into the file, and then
2236     * delay updating the IR until all of them are allocated.
2237     *
2238     * Handle precolored inputs first, because we need to make sure that other
2239     * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes
2240     * and inputs at the same time, because the first block doesn't have
2241     * predecessors. Therefore handle_live_in doesn't have to worry about
2242     * them.
2243     */
2244 
2245    foreach_instr (instr, &block->instr_list) {
2246       if (instr->opc == OPC_META_INPUT)
2247          handle_precolored_input(ctx, instr);
2248       else
2249          break;
2250    }
2251 
2252    unsigned name;
2253    BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
2254                        ctx->live->definitions_count) {
2255       struct ir3_register *reg = ctx->live->definitions[name];
2256       handle_live_in(ctx, reg);
2257    }
2258 
2259    foreach_instr (instr, &block->instr_list) {
2260       if (instr->opc == OPC_META_PHI)
2261          handle_phi(ctx, instr->dsts[0]);
2262       else if (instr->opc == OPC_META_INPUT ||
2263                instr->opc == OPC_META_TEX_PREFETCH)
2264          handle_input(ctx, instr);
2265       else
2266          break;
2267    }
2268 
2269    /* After this point, every live-in/phi/input has an interval assigned to
2270     * it. We delay actually assigning values until everything has been
2271     * allocated, so we can simply ignore any parallel copy entries created
2272     * when shuffling them around.
2273     */
2274    ctx->parallel_copies_count = 0;
2275 
2276    insert_live_in_moves(ctx);
2277 
2278    if (RA_DEBUG) {
2279       d("after live-in block %u:\n", block->index);
2280       ra_ctx_dump(ctx);
2281    }
2282 
2283    /* Now we're done with processing live-ins, and can handle the body of the
2284     * block.
2285     */
2286    foreach_instr (instr, &block->instr_list) {
2287       di(instr, "processing");
2288 
2289       if (instr->opc == OPC_META_PHI)
2290          assign_phi(ctx, instr);
2291       else if (instr->opc == OPC_META_INPUT ||
2292                instr->opc == OPC_META_TEX_PREFETCH)
2293          assign_input(ctx, instr);
2294       else if (instr->opc == OPC_META_SPLIT)
2295          handle_split(ctx, instr);
2296       else if (instr->opc == OPC_META_COLLECT)
2297          handle_collect(ctx, instr);
2298       else if (instr->opc == OPC_META_PARALLEL_COPY)
2299          handle_pcopy(ctx, instr);
2300       else if (instr->opc == OPC_CHMASK)
2301          handle_chmask(ctx, instr);
2302       else
2303          handle_normal_instr(ctx, instr);
2304 
2305       if (RA_DEBUG)
2306          ra_ctx_dump(ctx);
2307    }
2308 
2309    insert_live_out_moves(ctx);
2310 
2311    BITSET_FOREACH_SET (name, ctx->live->live_out[block->index],
2312                        ctx->live->definitions_count) {
2313       struct ir3_register *reg = ctx->live->definitions[name];
2314       handle_live_out(ctx, reg);
2315    }
2316 
2317    ctx->blocks[block->index].visited = true;
2318 }
2319 
2320 static unsigned
2321 calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure)
2322 {
2323    /* Registers are allocated in units of vec4, so switch from units of
2324     * half-regs to vec4.
2325     */
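   /* e.g. (illustrative) a pressure of 20 half-regs rounds up to 3 vec4. */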
2326    unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4);
2327 
2328    bool double_threadsize = ir3_should_double_threadsize(v, reg_count);
2329 
2330    unsigned target = reg_count;
2331    unsigned reg_independent_max_waves =
2332       ir3_get_reg_independent_max_waves(v, double_threadsize);
2333    unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
2334       v->compiler, reg_count, double_threadsize);
2335    unsigned target_waves =
2336       MIN2(reg_independent_max_waves, reg_dependent_max_waves);
2337 
2338    while (target <= RA_FULL_SIZE / (2 * 4) &&
2339           ir3_should_double_threadsize(v, target) == double_threadsize &&
2340           ir3_get_reg_dependent_max_waves(v->compiler, target,
2341                                           double_threadsize) >= target_waves)
2342       target++;
2343 
2344    return (target - 1) * 2 * 4;
2345 }
2346 
2347 static void
2348 add_pressure(struct ir3_pressure *pressure, struct ir3_register *reg,
2349              bool merged_regs)
2350 {
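   /* Pressure is accumulated in half-register units (a full register counts
    * as two). With merged registers, half regs live in the same file as full
    * regs, so they count toward both totals.
    */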
2351    unsigned size = reg_size(reg);
2352    if (reg->flags & IR3_REG_HALF)
2353       pressure->half += size;
2354    if (!(reg->flags & IR3_REG_HALF) || merged_regs)
2355       pressure->full += size;
2356 }
2357 
2358 static void
2359 dummy_interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
2360 {
2361 }
2362 
2363 static void
2364 dummy_interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
2365 {
2366 }
2367 
2368 static void
2369 dummy_interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *parent,
2370                      struct ir3_reg_interval *child)
2371 {
2372 }
2373 
2374 /* Calculate the minimum possible limit on register pressure so that spilling
2375  * still succeeds. Used to implement IR3_SHADER_DEBUG=spillall.
2376  */
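/* Roughly: rebuild the allocator's interval tree with no-op callbacks and,
 * around each instruction, add up the sizes of the root intervals that must
 * be live at once, taking the maximum over the whole shader.
 */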
2377 
2378 static void
2379 calc_min_limit_pressure(struct ir3_shader_variant *v,
2380                         struct ir3_liveness *live,
2381                         struct ir3_pressure *limit)
2382 {
2383    struct ir3_block *start = ir3_start_block(v->ir);
2384    struct ir3_reg_ctx *ctx = ralloc(NULL, struct ir3_reg_ctx);
2385    struct ir3_reg_interval *intervals =
2386       rzalloc_array(ctx, struct ir3_reg_interval, live->definitions_count);
2387 
2388    ctx->interval_add = dummy_interval_add;
2389    ctx->interval_delete = dummy_interval_delete;
2390    ctx->interval_readd = dummy_interval_readd;
2391 
2392    limit->full = limit->half = 0;
2393 
2394    struct ir3_pressure cur_pressure = {0};
2395    foreach_instr (input, &start->instr_list) {
2396       if (input->opc != OPC_META_INPUT &&
2397           input->opc != OPC_META_TEX_PREFETCH)
2398          break;
2399 
2400       add_pressure(&cur_pressure, input->dsts[0], v->mergedregs);
2401    }
2402 
2403    limit->full = MAX2(limit->full, cur_pressure.full);
2404    limit->half = MAX2(limit->half, cur_pressure.half);
2405 
2406    foreach_instr (input, &start->instr_list) {
2407       if (input->opc != OPC_META_INPUT &&
2408           input->opc != OPC_META_TEX_PREFETCH)
2409          break;
2410 
2411          /* Pre-colored inputs may leave holes, which increases the pressure. */
2412       struct ir3_register *dst = input->dsts[0];
2413       if (dst->num != INVALID_REG) {
2414          unsigned physreg = ra_reg_get_physreg(dst) + reg_size(dst);
2415          if (dst->flags & IR3_REG_HALF)
2416             limit->half = MAX2(limit->half, physreg);
2417          if (!(dst->flags & IR3_REG_HALF) || v->mergedregs)
2418             limit->full = MAX2(limit->full, physreg);
2419       }
2420    }
2421 
2422    foreach_block (block, &v->ir->block_list) {
2423       rb_tree_init(&ctx->intervals);
2424 
2425       unsigned name;
2426       BITSET_FOREACH_SET (name, live->live_in[block->index],
2427                           live->definitions_count) {
2428          struct ir3_register *reg = live->definitions[name];
2429          ir3_reg_interval_init(&intervals[reg->name], reg);
2430          ir3_reg_interval_insert(ctx, &intervals[reg->name]);
2431       }
2432 
2433       foreach_instr (instr, &block->instr_list) {
2434          ra_foreach_dst (dst, instr) {
2435             ir3_reg_interval_init(&intervals[dst->name], dst);
2436          }
2437          /* phis and parallel copies can be deleted via spilling */
2438 
2439          if (instr->opc == OPC_META_PHI) {
2440             ir3_reg_interval_insert(ctx, &intervals[instr->dsts[0]->name]);
2441             continue;
2442          }
2443 
2444          if (instr->opc == OPC_META_PARALLEL_COPY)
2445             continue;
2446 
2447          cur_pressure = (struct ir3_pressure) {0};
2448 
2449          ra_foreach_dst (dst, instr) {
2450             if (dst->tied && !(dst->tied->flags & IR3_REG_KILL))
2451                add_pressure(&cur_pressure, dst, v->mergedregs);
2452          }
2453 
2454          ra_foreach_src_rev (src, instr) {
2455             /* We currently don't support spilling the parent of a source when
2456              * making space for sources, so we have to keep track of the
2457              * intervals and walk up to the root of the tree to figure out how
2458              * much space we need.
2459              *
2460              * TODO: We should probably support this in the spiller.
2461              */
2462             struct ir3_reg_interval *interval = &intervals[src->def->name];
2463             while (interval->parent)
2464                interval = interval->parent;
2465             add_pressure(&cur_pressure, interval->reg, v->mergedregs);
2466 
2467             if (src->flags & IR3_REG_FIRST_KILL)
2468                ir3_reg_interval_remove(ctx, &intervals[src->def->name]);
2469          }
2470 
2471          limit->full = MAX2(limit->full, cur_pressure.full);
2472          limit->half = MAX2(limit->half, cur_pressure.half);
2473 
2474          cur_pressure = (struct ir3_pressure) {0};
2475 
2476          ra_foreach_dst (dst, instr) {
2477             ir3_reg_interval_init(&intervals[dst->name], dst);
2478             ir3_reg_interval_insert(ctx, &intervals[dst->name]);
2479             add_pressure(&cur_pressure, dst, v->mergedregs);
2480          }
2481 
2482          limit->full = MAX2(limit->full, cur_pressure.full);
2483          limit->half = MAX2(limit->half, cur_pressure.half);
2484       }
2485    }
2486 
2487    /* Account for the base register, which needs to be available everywhere. */
2488    limit->full += 2;
2489 
2490    ralloc_free(ctx);
2491 }
2492 
2493 /*
2494  * If barriers are used, it must be possible for all waves in the workgroup
2495  * to execute concurrently. Thus we may have to reduce the register limit.
2496  */
2497 static void
2498 calc_limit_pressure_for_cs_with_barrier(struct ir3_shader_variant *v,
2499                                         struct ir3_pressure *limit_pressure)
2500 {
2501    const struct ir3_compiler *compiler = v->compiler;
2502 
2503    unsigned threads_per_wg;
2504    if (v->local_size_variable) {
2505       /* We have to expect the worst case. */
2506       threads_per_wg = compiler->max_variable_workgroup_size;
2507    } else {
2508       threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
2509    }
2510 
2511    /* The register file is divided into reg_size_vec4 parts. Each part has
2512     * enough registers to give a single vec4 register to each thread of a
2513     * single-sized wave-pair. With double threadsize, each wave-pair
2514     * consumes two parts of the register file to get a single vec4 per
2515     * thread. The more active wave-pairs there are, the fewer parts each
2516     * one can get.
2517     */
2518 
2519    bool double_threadsize = ir3_should_double_threadsize(v, 0);
2520    unsigned waves_per_wg = DIV_ROUND_UP(
2521       threads_per_wg, compiler->threadsize_base * (double_threadsize ? 2 : 1) *
2522                          compiler->wave_granularity);
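   /* Illustrative numbers only: with threads_per_wg == 1024, threadsize_base
    * == 64, double threadsize and wave_granularity == 2, waves_per_wg ==
    * DIV_ROUND_UP(1024, 64 * 2 * 2) == 4; with reg_size_vec4 == 96 that
    * leaves 96 / (4 * 2) == 12 vec4, i.e. 96 half-regs, per thread.
    */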
2523 
2524    uint32_t vec4_regs_per_thread =
2525       compiler->reg_size_vec4 / (waves_per_wg * (double_threadsize ? 2 : 1));
2526    assert(vec4_regs_per_thread > 0);
2527 
2528    uint32_t half_regs_per_thread = vec4_regs_per_thread * 4 * 2;
2529 
2530    if (limit_pressure->full > half_regs_per_thread) {
2531       if (v->mergedregs) {
2532          limit_pressure->full = half_regs_per_thread;
2533       } else {
2534          /* TODO: Handle !mergedregs case, probably we would have to do this
2535           * after the first register pressure pass.
2536           */
2537       }
2538    }
2539 }
2540 
2541 int
2542 ir3_ra(struct ir3_shader_variant *v)
2543 {
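   /* High-level flow: compute dominance and parallel copies, then liveness
    * and merge sets, estimate register pressure (spilling first if it exceeds
    * the limit and pvtmem is available), allocate block by block, validate,
    * strip SSA/array flags, and finally lower spill macros and copies.
    */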
2544    ir3_calc_dominance(v->ir);
2545 
2546    ir3_create_parallel_copies(v->ir);
2547 
2548    struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);
2549 
2550    ctx->merged_regs = v->mergedregs;
2551    ctx->compiler = v->compiler;
2552    ctx->stage = v->type;
2553 
2554    struct ir3_liveness *live = ir3_calc_liveness(ctx, v->ir);
2555 
2556    ir3_debug_print(v->ir, "AFTER: create_parallel_copies");
2557 
2558    ir3_merge_regs(live, v->ir);
2559 
2560    struct ir3_pressure max_pressure;
2561    ir3_calc_pressure(v, live, &max_pressure);
2562    d("max pressure:");
2563    d("\tfull: %u", max_pressure.full);
2564    d("\thalf: %u", max_pressure.half);
2565    d("\tshared: %u", max_pressure.shared);
2566 
2567    struct ir3_pressure limit_pressure;
2568    limit_pressure.full = RA_FULL_SIZE;
2569    limit_pressure.half = RA_HALF_SIZE;
2570    limit_pressure.shared = RA_SHARED_SIZE;
2571 
2572    if (gl_shader_stage_is_compute(v->type) && v->has_barrier) {
2573       calc_limit_pressure_for_cs_with_barrier(v, &limit_pressure);
2574    }
2575 
2576    /* If the user forces a doubled threadsize, we may have to lower the limit
2577     * because on some gens the register file is not big enough to hold a
2578     * double-size wave with all 48 registers in use.
2579     */
2580    if (v->real_wavesize == IR3_DOUBLE_ONLY) {
2581       limit_pressure.full =
2582          MAX2(limit_pressure.full, ctx->compiler->reg_size_vec4 / 2 * 16);
2583    }
2584 
2585    /* If requested, lower the limit so that spilling happens more often. */
2586    if (ir3_shader_debug & IR3_DBG_SPILLALL)
2587       calc_min_limit_pressure(v, live, &limit_pressure);
2588 
2589    if (max_pressure.shared > limit_pressure.shared) {
2590       /* TODO shared reg -> normal reg spilling */
2591       d("shared max pressure exceeded!");
2592       goto fail;
2593    }
2594 
2595    bool spilled = false;
2596    if (max_pressure.full > limit_pressure.full ||
2597        max_pressure.half > limit_pressure.half) {
2598       if (!v->compiler->has_pvtmem) {
2599          d("max pressure exceeded!");
2600          goto fail;
2601       }
2602       d("max pressure exceeded, spilling!");
2603       IR3_PASS(v->ir, ir3_spill, v, &live, &limit_pressure);
2604       ir3_calc_pressure(v, live, &max_pressure);
2605       assert(max_pressure.full <= limit_pressure.full &&
2606              max_pressure.half <= limit_pressure.half);
2607       spilled = true;
2608    }
2609 
2610    ctx->live = live;
2611    ctx->intervals =
2612       rzalloc_array(ctx, struct ra_interval, live->definitions_count);
2613    ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count);
2614 
2615    ctx->full.size = calc_target_full_pressure(v, max_pressure.full);
2616    d("full size: %u", ctx->full.size);
2617 
2618    if (!v->mergedregs)
2619       ctx->half.size = RA_HALF_SIZE;
2620 
2621    ctx->shared.size = RA_SHARED_SIZE;
2622 
2623    ctx->full.start = ctx->half.start = ctx->shared.start = 0;
2624 
2625    foreach_block (block, &v->ir->block_list)
2626       handle_block(ctx, block);
2627 
2628    ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count);
2629 
2630    /* Strip array-ness and SSA-ness at the end, because various helpers still
2631     * need to work even on definitions that have already been assigned. For
2632     * example, we need to preserve array-ness so that array live-ins have the
2633     * right size.
2634     */
2635    foreach_block (block, &v->ir->block_list) {
2636       foreach_instr (instr, &block->instr_list) {
2637          for (unsigned i = 0; i < instr->dsts_count; i++) {
2638             instr->dsts[i]->flags &= ~IR3_REG_SSA;
2639 
2640             /* Parallel copies of array registers copy the whole register, and
2641              * we need some way to let the parallel copy code know that this was
2642              * an array whose size is determined by reg->size. So keep the array
2643              * flag on those. spill/reload also need to work on the entire
2644              * array.
2645              */
2646             if (!is_meta(instr) && instr->opc != OPC_RELOAD_MACRO)
2647                instr->dsts[i]->flags &= ~IR3_REG_ARRAY;
2648          }
2649 
2650          for (unsigned i = 0; i < instr->srcs_count; i++) {
2651             instr->srcs[i]->flags &= ~IR3_REG_SSA;
2652 
2653             if (!is_meta(instr) && instr->opc != OPC_SPILL_MACRO)
2654                instr->srcs[i]->flags &= ~IR3_REG_ARRAY;
2655          }
2656       }
2657    }
2658 
2659    ir3_debug_print(v->ir, "AFTER: register allocation");
2660 
2661    if (spilled) {
2662       IR3_PASS(v->ir, ir3_lower_spill);
2663    }
2664 
2665    ir3_lower_copies(v);
2666 
2667    ir3_debug_print(v->ir, "AFTER: ir3_lower_copies");
2668 
2669    ralloc_free(ctx);
2670 
2671    return 0;
2672 fail:
2673    ralloc_free(ctx);
2674    return -1;
2675 }
2676