/*
 * Copyright © 2021 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "ir3_compiler.h"
#include "ir3_ra.h"
#include "util/ralloc.h"

/* This pass "merges" compatible phi-web SSA values. First, we insert a bunch
 * of parallelcopies to trivially turn the program into CSSA form. Then we
 * try to "merge" SSA defs into "merge sets" which could be allocated to a
 * single register in order to eliminate copies. First we merge phi nodes,
 * which should always succeed because of the parallelcopies we inserted, and
 * then we try to coalesce the copies we introduced.
 *
 * The merged registers are used for three purposes:
 *
 * 1. We always use the same pvtmem slot for spilling all SSA defs in each
 * merge set. This prevents us from having to insert memory-to-memory copies
 * in the spiller and makes sure we don't insert unnecessary copies.
 * 2. When two values are live at the same time, part of the same merge
 * set, and they overlap each other in the merge set, they always occupy
 * overlapping physical registers in RA. This reduces register pressure and
 * copies in several important scenarios:
 *    - When sources of a collect are used later by something else, we don't
 *    have to introduce copies.
 *    - We can handle sequences of extracts that "explode" a vector into its
 *    components without any additional copying.
 * 3. We use the merge sets for affinities in register allocation: that is, we
 * try to allocate all the definitions in the same merge set to the
 * same/compatible registers. This helps us e.g. allocate sources of a collect
 * to contiguous registers without too much special code in RA.
 *
 * In a "normal" register allocator, or when spilling, we'd just merge
 * registers in the same merge set to the same register, but with SSA-based
 * register allocation we may have to split the live interval.
 *
 * The implementation is based on "Revisiting Out-of-SSA Translation for
 * Correctness, Code Quality, and Efficiency," and is broadly similar to the
 * implementation in nir_from_ssa, with the twist that we also try to coalesce
 * META_SPLIT and META_COLLECT. This makes this pass more complicated but
 * prevents us from needing to handle these specially in RA and the spiller,
 * which are already complicated enough. This also forces us to implement the
 * value-comparison optimization they explain, as without it we wouldn't be
 * able to coalesce META_SPLIT even in the simplest of cases.
 */
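
/* As a sketch of the CSSA conversion (the SSA names here are hypothetical,
 * purely for illustration), a phi whose block has two predecessors:
 *
 *    block A: ...                           ; succ: C
 *    block B: ...                           ; succ: C
 *    block C: ssa_3 = phi(A: ssa_1, B: ssa_2)
 *
 * gets a parallelcopy inserted before the terminator of each predecessor:
 *
 *    block A: ssa_4 = parallelcopy ssa_1    ; succ: C
 *    block B: ssa_5 = parallelcopy ssa_2    ; succ: C
 *    block C: ssa_3 = phi(A: ssa_4, B: ssa_5)
 *
 * Now ssa_3, ssa_4, and ssa_5 can always be placed in one merge set, because
 * each copy's def is only live out of its block on the edge into C. Getting
 * rid of the copies themselves (merging each copy's source with its dest) is
 * then attempted by the aggressive coalescing below, which only succeeds when
 * it doesn't create interference within the set.
 */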

/* In order to dynamically reconstruct the dominance forest, we need the
 * instructions ordered by a preorder traversal of the dominance tree:
 */

static unsigned
index_instrs(struct ir3_block *block, unsigned index)
{
   foreach_instr (instr, &block->instr_list)
      instr->ip = index++;

   for (unsigned i = 0; i < block->dom_children_count; i++)
      index = index_instrs(block->dom_children[i], index);

   return index;
}

void
ir3_index_instrs_for_merge_sets(struct ir3 *ir)
{
   index_instrs(ir3_start_block(ir), 0);
}

/* Definitions within a merge set are ordered by instr->ip as set above: */

static bool
def_after(struct ir3_register *a, struct ir3_register *b)
{
   return a->instr->ip > b->instr->ip;
}

static bool
def_dominates(struct ir3_register *a, struct ir3_register *b)
{
   if (def_after(a, b)) {
      return false;
   } else if (a->instr->block == b->instr->block) {
      return def_after(b, a);
   } else {
      return ir3_block_dominates(a->instr->block, b->instr->block);
   }
}

/* This represents a region inside a register. The offset is relative to the
 * start of the register, and offset + size <= size(reg).
 */
struct def_value {
   struct ir3_register *reg;
   unsigned offset, size;
};

/* Chase any copies to get the source of a region inside a register. This is
 * Value(a) in the paper.
 */
static struct def_value
chase_copies(struct def_value value)
{
   while (true) {
      struct ir3_instruction *instr = value.reg->instr;
      if (instr->opc == OPC_META_SPLIT) {
         value.offset += instr->split.off * reg_elem_size(value.reg);
         value.reg = instr->srcs[0]->def;
      } else if (instr->opc == OPC_META_COLLECT) {
         if (value.offset % reg_elem_size(value.reg) != 0 ||
             value.size > reg_elem_size(value.reg) ||
             value.offset + value.size > reg_size(value.reg))
            break;
         struct ir3_register *src =
            instr->srcs[value.offset / reg_elem_size(value.reg)];
         if (!src->def)
            break;
         value.offset = 0;
         value.reg = src->def;
      } else {
         /* TODO: parallelcopy */
         break;
      }
   }

   return value;
}
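
/* For example (hypothetical SSA names), given:
 *
 *    ssa_3 = collect ssa_1, ssa_2
 *    ssa_4 = split ssa_3, off=1
 *
 * chasing the value of ssa_4 steps through the split, adding its offset of
 * one element to the region, and then through the collect, which selects the
 * source covering that region, ending at {ssa_2, offset 0}. This is what lets
 * the value comparison in can_skip_interference() below prove that ssa_4 and
 * ssa_2 always hold the same value.
 */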

/* This represents an entry in the merge set, and consists of a register +
 * offset from the merge set base.
 */
struct merge_def {
   struct ir3_register *reg;
   unsigned offset;
};

static bool
can_skip_interference(const struct merge_def *a, const struct merge_def *b)
{
   unsigned a_start = a->offset;
   unsigned b_start = b->offset;
   unsigned a_end = a_start + reg_size(a->reg);
   unsigned b_end = b_start + reg_size(b->reg);

   /* Registers that don't overlap never interfere */
   if (a_end <= b_start || b_end <= a_start)
      return true;

   /* Disallow skipping interference unless one definition contains the
    * other. This restriction is important for register allocation, because
    * it means that at any given point in the program, the live values in a
    * given merge set will form a tree. If they didn't, then one live value
    * would partially overlap another, and they would have overlapping live
    * ranges because they're live at the same point. This simplifies register
    * allocation and spilling.
    */
   if (!((a_start <= b_start && a_end >= b_end) ||
         (b_start <= a_start && b_end >= a_end)))
      return false;

   /* For each register, chase the intersection of a and b to find the
    * ultimate source.
    */
   unsigned start = MAX2(a_start, b_start);
   unsigned end = MIN2(a_end, b_end);
   struct def_value a_value = chase_copies((struct def_value){
      .reg = a->reg,
      .offset = start - a_start,
      .size = end - start,
   });
   struct def_value b_value = chase_copies((struct def_value){
      .reg = b->reg,
      .offset = start - b_start,
      .size = end - start,
   });
   return a_value.reg == b_value.reg && a_value.offset == b_value.offset;
}
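
/* For instance (hypothetical SSA names), in:
 *
 *    ssa_3 = collect ssa_1, ssa_2
 *    ... = use(ssa_3)
 *    ... = use(ssa_1)
 *
 * ssa_1 and ssa_3 are live at the same time, but within the collect's merge
 * set ssa_1 occupies the first element of ssa_3, so one region contains the
 * other, and chase_copies() maps the overlapping region of both back to
 * ssa_1. The interference test can therefore be skipped: ssa_1 simply lives
 * in the low element of the collect's register, and the later use of ssa_1
 * needs no extra copy.
 */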

static struct ir3_merge_set *
get_merge_set(struct ir3_register *def)
{
   if (def->merge_set)
      return def->merge_set;

   struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set);
   set->preferred_reg = ~0;
   set->interval_start = ~0;
   set->spill_slot = ~0;
   set->size = reg_size(def);
   set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2;
   set->regs_count = 1;
   set->regs = ralloc(set, struct ir3_register *);
   set->regs[0] = def;

   return set;
}

/* Merges b into a */
static struct ir3_merge_set *
merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_merge_sets(b, a, -b_offset);

   struct ir3_register **new_regs =
      rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count);

   unsigned a_index = 0, b_index = 0, new_index = 0;
   for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) {
      if (b_index < b->regs_count &&
          (a_index == a->regs_count ||
           def_after(a->regs[a_index], b->regs[b_index]))) {
         new_regs[new_index] = b->regs[b_index++];
         new_regs[new_index]->merge_set_offset += b_offset;
      } else {
         new_regs[new_index] = a->regs[a_index++];
      }
      new_regs[new_index]->merge_set = a;
   }

   assert(new_index == a->regs_count + b->regs_count);

   /* Technically this should be the lcm, but because alignment is only ever
    * 1 or 2 so far, this is ok.
    */
   a->alignment = MAX2(a->alignment, b->alignment);
   a->regs_count += b->regs_count;
   ralloc_free(a->regs);
   a->regs = new_regs;
   a->size = MAX2(a->size, b->size + b_offset);

   return a;
}
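
/* For example, merging set b into set a at b_offset == 2 shifts every
 * merge_set_offset in b up by 2, keeps the combined defs sorted by
 * instr->ip, and grows a->size just enough to cover b's defs at their new
 * offsets. A negative b_offset is handled by swapping the roles of a and b,
 * so the surviving set is always the one whose own defs keep their offsets.
 */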

static bool
merge_sets_interfere(struct ir3_liveness *live, struct ir3_merge_set *a,
                     struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_sets_interfere(live, b, a, -b_offset);

   struct merge_def dom[a->regs_count + b->regs_count];
   unsigned a_index = 0, b_index = 0;
   int dom_index = -1;

   /* Reject trying to merge the sets if the alignment doesn't work out */
   if (b_offset % a->alignment != 0)
      return true;

   while (a_index < a->regs_count || b_index < b->regs_count) {
      struct merge_def current;
      if (a_index == a->regs_count) {
         current.reg = b->regs[b_index];
         current.offset = current.reg->merge_set_offset + b_offset;
         b_index++;
      } else if (b_index == b->regs_count) {
         current.reg = a->regs[a_index];
         current.offset = current.reg->merge_set_offset;
         a_index++;
      } else {
         if (def_after(b->regs[b_index], a->regs[a_index])) {
            current.reg = a->regs[a_index];
            current.offset = current.reg->merge_set_offset;
            a_index++;
         } else {
            current.reg = b->regs[b_index];
            current.offset = current.reg->merge_set_offset + b_offset;
            b_index++;
         }
      }

      while (dom_index >= 0 &&
             !def_dominates(dom[dom_index].reg, current.reg)) {
         dom_index--;
      }

      /* TODO: in the original paper, only dom[dom_index] needs to be
       * checked for interference. We implement the value-chasing extension
       * as well as support for sub-registers, which complicates this
       * significantly: it's no longer the case that if a dominates b, b
       * dominates c, and a and b don't interfere, then we only need to check
       * interference between b and c to be sure a and c don't interfere --
       * this means we may have to check for interference against values
       * higher in the stack than dom[dom_index]. In the paper there's a
       * description of a way to do fewer interference tests with the
       * value-chasing extension, but we'd have to come up with something
       * ourselves for handling the similar problems that come up with
       * allowing values to contain subregisters. For now we just test
       * everything in the stack.
       */
      for (int i = 0; i <= dom_index; i++) {
         if (can_skip_interference(&current, &dom[i]))
            continue;

         /* Ok, now we actually have to check interference. Since we know
          * that dom[i] dominates current, this boils down to checking
          * whether dom[i] is live after current.
          */
         if (ir3_def_live_after(live, dom[i].reg, current.reg->instr))
            return true;
      }

      dom[++dom_index] = current;
   }

   return false;
}
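
/* Note that the preorder (instr->ip) ordering is what makes the dom[] stack
 * above work: visiting the combined defs in that order and popping entries
 * that don't dominate the current def leaves a stack of defs that all
 * dominate it, and for a def that dominates another, interference reduces to
 * the dominating def still being live after the dominated def's instruction,
 * which is exactly the ir3_def_live_after() check.
 */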

static void
try_merge_defs(struct ir3_liveness *live, struct ir3_register *a,
               struct ir3_register *b, unsigned b_offset)
{
   struct ir3_merge_set *a_set = get_merge_set(a);
   struct ir3_merge_set *b_set = get_merge_set(b);

   if (a_set == b_set) {
      /* Note: even when a and b are already in the same set, we may not be
       * able to coalesce this copy if the offsets don't line up. But in any
       * case, there's nothing more to do here.
       */
      return;
   }

   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;

   if (!merge_sets_interfere(live, a_set, b_set, b_set_offset))
      merge_merge_sets(a_set, b_set, b_set_offset);
}

void
ir3_force_merge(struct ir3_register *a, struct ir3_register *b, int b_offset)
{
   struct ir3_merge_set *a_set = get_merge_set(a);
   struct ir3_merge_set *b_set = get_merge_set(b);

   if (a_set == b_set)
      return;

   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;
   merge_merge_sets(a_set, b_set, b_set_offset);
}

static void
coalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi)
{
   for (unsigned i = 0; i < phi->srcs_count; i++) {
      if (phi->srcs[i]->def)
         try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0);
   }
}

static void
aggressive_coalesce_parallel_copy(struct ir3_liveness *live,
                                  struct ir3_instruction *pcopy)
{
   for (unsigned i = 0; i < pcopy->dsts_count; i++) {
      if (!(pcopy->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0);
   }
}

static void
aggressive_coalesce_split(struct ir3_liveness *live,
                          struct ir3_instruction *split)
{
   if (!(split->dsts[0]->flags & IR3_REG_SSA))
      return;
   try_merge_defs(live, split->srcs[0]->def, split->dsts[0],
                  split->split.off * reg_elem_size(split->dsts[0]));
}

static void
aggressive_coalesce_collect(struct ir3_liveness *live,
                            struct ir3_instruction *collect)
{
   for (unsigned i = 0, offset = 0; i < collect->srcs_count;
        offset += reg_elem_size(collect->srcs[i]), i++) {
      if (!(collect->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset);
   }
}
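
/* For example (hypothetical names), for
 *
 *    ssa_3 = collect ssa_1, ssa_2
 *
 * this tries to place ssa_1 at offset 0 and ssa_2 at offset
 * reg_elem_size(ssa_1) within ssa_3's merge set, so that RA can later
 * allocate the sources directly into the right slots of the destination and
 * the collect's copies become no-ops.
 */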

static void
aggressive_coalesce_rpt(struct ir3_liveness *live,
                        struct ir3_instruction *instr)
{
   if (!ir3_instr_is_first_rpt(instr))
      return;

   struct ir3_register *def = instr->dsts[0];
   unsigned def_offset = 0;
   unsigned src_offsets[instr->srcs_count];
   memset(src_offsets, 0, sizeof(unsigned) * instr->srcs_count);

   foreach_instr_rpt_excl (rpt, instr) {
      if (!(rpt->dsts[0]->flags & IR3_REG_SSA))
         continue;

      def_offset += reg_elem_size(def);
      try_merge_defs(live, def, rpt->dsts[0], def_offset);

      foreach_src_n (src, src_n, instr) {
         struct ir3_register *rpt_src = rpt->srcs[src_n];

         if (!(src->flags & IR3_REG_SSA) || !(rpt_src->flags & IR3_REG_SSA))
            continue;
         if (src->def == rpt_src->def)
            continue;

         src_offsets[src_n] += reg_elem_size(src->def);
         try_merge_defs(live, src->def, rpt_src->def, src_offsets[src_n]);
      }
   }
}
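
/* The intent here (a best-effort affinity, like the other coalescing above)
 * is to give a repeat group contiguous defs and contiguous sources: each
 * repetition's dst is merged into the first repetition's dst at consecutive
 * element offsets, and likewise for each distinct source, so that RA can
 * assign the whole group the consecutive registers that repeats operate on.
 */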

static void
create_parallel_copy(struct ir3_block *block)
{
   for (unsigned i = 0; i < 2; i++) {
      if (!block->successors[i])
         continue;

      struct ir3_block *succ = block->successors[i];

      unsigned pred_idx = ir3_block_get_pred_index(succ, block);

      unsigned phi_count = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;

         /* Avoid phis we've already colored */
         if (!(phi->dsts[0]->flags & IR3_REG_SSA))
            continue;

         /* Avoid undef */
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;

         /* We don't support critical edges. If we were to support them,
          * we'd need to insert parallel copies after the phi node to solve
          * the lost-copy problem.
          */
         assert(i == 0 && !block->successors[1]);
         phi_count++;
      }

      if (phi_count == 0)
         continue;

      struct ir3_register *src[phi_count];
      unsigned j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if (!(phi->dsts[0]->flags & IR3_REG_SSA))
            continue;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         src[j++] = phi->srcs[pred_idx];
      }
      assert(j == phi_count);

      struct ir3_instruction *pcopy =
         ir3_instr_create_at(ir3_before_terminator(block),
                             OPC_META_PARALLEL_COPY, phi_count, phi_count);

      for (j = 0; j < phi_count; j++) {
         struct ir3_register *reg = __ssa_dst(pcopy);
         reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
         reg->size = src[j]->size;
         reg->wrmask = src[j]->wrmask;
      }

      for (j = 0; j < phi_count; j++) {
         pcopy->srcs[pcopy->srcs_count++] =
            ir3_reg_clone(block->shader, src[j]);
      }

      j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if (!(phi->dsts[0]->flags & IR3_REG_SSA))
            continue;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         phi->srcs[pred_idx]->def = pcopy->dsts[j];
         pcopy->dsts[j]->flags |= phi->dsts[0]->flags & IR3_REG_SHARED;
         phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags;
         phi->srcs[pred_idx]->num = INVALID_REG;
         j++;
      }
      assert(j == phi_count);
   }
}

void
ir3_create_parallel_copies(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      create_parallel_copy(block);
   }
}

static void
index_merge_sets(struct ir3_liveness *live, struct ir3 *ir)
{
   unsigned offset = 0;
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            unsigned dst_offset;
            struct ir3_merge_set *merge_set = dst->merge_set;
            unsigned size = reg_size(dst);
            if (merge_set) {
               if (merge_set->interval_start == ~0) {
                  merge_set->interval_start = offset;
                  offset += merge_set->size;
               }
               dst_offset = merge_set->interval_start + dst->merge_set_offset;
            } else {
               dst_offset = offset;
               offset += size;
            }

            dst->interval_start = dst_offset;
            dst->interval_end = dst_offset + size;
         }
      }
   }

   live->interval_offset = offset;
}
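
/* For example, if the first merge set visited has size 4, its defs get
 * intervals inside [0, 4) (the set's base plus each def's merge_set_offset),
 * and the next def without a merge set, if it has size 2, gets [4, 6).
 * These file-wide interval positions are what RA later uses to tell when one
 * def's interval is contained in (or disjoint from) another's.
 */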

#define RESET      "\x1b[0m"
#define BLUE       "\x1b[0;34m"
#define SYN_SSA(x) BLUE x RESET

static void
dump_merge_sets(struct ir3 *ir)
{
   d("merge sets:");
   struct set *merge_sets = _mesa_pointer_set_create(NULL);
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            struct ir3_merge_set *merge_set = dst->merge_set;
            if (!merge_set || _mesa_set_search(merge_sets, merge_set))
               continue;

            d("merge set, size %u, align %u, interval start %u:",
              merge_set->size, merge_set->alignment, merge_set->interval_start);
            for (unsigned j = 0; j < merge_set->regs_count; j++) {
               struct ir3_register *reg = merge_set->regs[j];
               const char *s = (reg->flags & IR3_REG_SHARED) ? "s" : "";
               const char *h = (reg->flags & IR3_REG_HALF) ? "h" : "";
               d("\t%s%s" SYN_SSA("ssa_%u") ":%u, offset %u, interval: %u-%u",
                 s, h, reg->instr->serialno, reg->name, reg->merge_set_offset,
                 reg->interval_start, reg->interval_end);
            }

            _mesa_set_add(merge_sets, merge_set);
         }
      }
   }

   ralloc_free(merge_sets);
}

void
ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir)
{
   /* First pass: coalesce phis, whose destination and sources must end up
    * merged together.
    */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (instr->opc != OPC_META_PHI)
            break;

         coalesce_phi(live, instr);
      }
   }

   /* Second pass: aggressively coalesce parallelcopy, split, collect */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         switch (instr->opc) {
         case OPC_META_SPLIT:
            aggressive_coalesce_split(live, instr);
            break;
         case OPC_META_COLLECT:
            aggressive_coalesce_collect(live, instr);
            break;
         case OPC_META_PARALLEL_COPY:
            aggressive_coalesce_parallel_copy(live, instr);
            break;
         default:
            break;
         }
      }
   }

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         aggressive_coalesce_rpt(live, instr);
      }
   }

   index_merge_sets(live, ir);

   if (ir3_shader_debug & IR3_DBG_RAMSGS)
      dump_merge_sets(ir);
}
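
/* A sketch of the intended calling sequence, as implied by this file:
 * ir3_create_parallel_copies() runs first (it creates new defs, so it has to
 * precede liveness analysis), then liveness is computed and
 * ir3_index_instrs_for_merge_sets() gives every instruction a
 * dominance-preorder ip, and finally ir3_merge_regs() builds the merge sets
 * and assigns each def the interval_start/interval_end that RA consumes.
 */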