/*
 * Copyright © 2021 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "ir3_compiler.h"
#include "ir3_ra.h"
#include "util/ralloc.h"

/* This pass "merges" compatible phi-web SSA values. First, we insert a bunch
 * of parallelcopy's to trivially turn the program into CSSA form. Then we
 * try to "merge" SSA def's into "merge sets" which could be allocated to a
 * single register in order to eliminate copies. First we merge phi nodes,
 * which should always succeed because of the parallelcopy's we inserted, and
 * then we try to coalesce the copies we introduced.
 *
 * The merged registers are used for three purposes:
 *
 * 1. We always use the same pvtmem slot for spilling all SSA defs in each
 * merge set. This prevents us from having to insert memory-to-memory copies
 * in the spiller and makes sure we don't insert unnecessary copies.
 * 2. When two values are live at the same time, part of the same merge
 * set, and they overlap each other in the merge set, they always occupy
 * overlapping physical registers in RA. This reduces register pressure and
 * copies in several important scenarios:
 *   - When sources of a collect are used later by something else, we don't
 *   have to introduce copies.
 *   - We can handle sequences of extracts that "explode" a vector into its
 *   components without any additional copying.
 * 3. We use the merge sets for affinities in register allocation: That is, we
 * try to allocate all the definitions in the same merge set to the
 * same/compatible registers. This helps us e.g. allocate sources of a collect
 * to contiguous registers without too much special code in RA.
 *
 * In a "normal" register allocator, or when spilling, we'd just merge
 * registers in the same merge set to the same register, but with SSA-based
 * register allocation we may have to split the live interval.
 *
 * The implementation is based on "Revisiting Out-of-SSA Translation for
 * Correctness, Code Quality, and Efficiency," and is broadly similar to the
 * implementation in nir_from_ssa, with the twist that we also try to coalesce
 * META_SPLIT and META_COLLECT. This makes this pass more complicated but
 * prevents us from needing to handle these specially in RA and the spiller,
 * which are already complicated enough. This also forces us to implement that
 * value-comparison optimization they explain, as without it we wouldn't be
 * able to coalesce META_SPLIT even in the simplest of cases.
 */
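/* As a schematic example of (2) and (3) above (a sketch, not exact ir3
 * syntax):
 *
 *    ssa_1 = ...
 *    ssa_2 = ...
 *    ssa_3 = collect ssa_1, ssa_2
 *    ...   = foo ssa_1
 *
 * If ssa_1, ssa_2, and ssa_3 all end up in one merge set, with ssa_1 at
 * offset 0 and ssa_2 at the next element, RA can give the sources
 * consecutive registers overlapping ssa_3's registers, so the collect needs
 * no copies even though ssa_1 is still used afterwards.
 */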

/* In order to dynamically reconstruct the dominance forest, we need the
 * instructions ordered by a preorder traversal of the dominance tree:
 */

static unsigned
index_instrs(struct ir3_block *block, unsigned index)
{
   foreach_instr (instr, &block->instr_list)
      instr->ip = index++;

   for (unsigned i = 0; i < block->dom_children_count; i++)
      index = index_instrs(block->dom_children[i], index);

   return index;
}

void
ir3_index_instrs_for_merge_sets(struct ir3 *ir)
{
   index_instrs(ir3_start_block(ir), 0);
}

/* Definitions within a merge set are ordered by instr->ip as set above: */

static bool
def_after(struct ir3_register *a, struct ir3_register *b)
{
   return a->instr->ip > b->instr->ip;
}

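/* Returns whether the definition of a dominates the definition of b. Within
 * a block this is a strict "comes before" test on the ip's assigned above;
 * across blocks it falls back to block dominance.
 */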
static bool
def_dominates(struct ir3_register *a, struct ir3_register *b)
{
   if (def_after(a, b)) {
      return false;
   } else if (a->instr->block == b->instr->block) {
      return def_after(b, a);
   } else {
      return ir3_block_dominates(a->instr->block, b->instr->block);
   }
}

/* This represents a region inside a register. The offset is relative to the
 * start of the register, and offset + size <= size(reg).
 */
struct def_value {
   struct ir3_register *reg;
   unsigned offset, size;
};

/* Chase any copies to get the source of a region inside a register. This is
 * Value(a) in the paper.
 */
static struct def_value
chase_copies(struct def_value value)
{
   while (true) {
      struct ir3_instruction *instr = value.reg->instr;
      if (instr->opc == OPC_META_SPLIT) {
         value.offset += instr->split.off * reg_elem_size(value.reg);
         value.reg = instr->srcs[0]->def;
      } else if (instr->opc == OPC_META_COLLECT) {
         if (value.offset % reg_elem_size(value.reg) != 0 ||
             value.size > reg_elem_size(value.reg) ||
             value.offset + value.size > reg_size(value.reg))
            break;
         struct ir3_register *src =
            instr->srcs[value.offset / reg_elem_size(value.reg)];
         if (!src->def)
            break;
         value.offset = 0;
         value.reg = src->def;
      } else {
         /* TODO: parallelcopy */
         break;
      }
   }

   return value;
}

/* This represents an entry in the merge set, and consists of a register +
 * offset from the merge set base.
 */
struct merge_def {
   struct ir3_register *reg;
   unsigned offset;
};

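/* Check whether the interference test between two defs of a (prospective)
 * merge set can be skipped: either they don't overlap within the set, or the
 * overlapping region chases back to the same value (the value-comparison
 * optimization from the paper).
 */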
static bool
can_skip_interference(const struct merge_def *a, const struct merge_def *b)
{
   unsigned a_start = a->offset;
   unsigned b_start = b->offset;
   unsigned a_end = a_start + reg_size(a->reg);
   unsigned b_end = b_start + reg_size(b->reg);

   /* Registers that don't overlap never interfere */
   if (a_end <= b_start || b_end <= a_start)
      return true;

   /* Disallow skipping interference unless one definition contains the
    * other. This restriction is important for register allocation, because
    * it means that at any given point in the program, the live values in a
    * given merge set will form a tree. If they didn't, then one live value
    * would partially overlap another, and they would have overlapping live
    * ranges because they're live at the same point. This simplifies register
    * allocation and spilling.
    */
   if (!((a_start <= b_start && a_end >= b_end) ||
         (b_start <= a_start && b_end >= a_end)))
      return false;

   /* For each register, chase the intersection of a and b to find the
    * ultimate source.
    */
   unsigned start = MAX2(a_start, b_start);
   unsigned end = MIN2(a_end, b_end);
   struct def_value a_value = chase_copies((struct def_value){
      .reg = a->reg,
      .offset = start - a_start,
      .size = end - start,
   });
   struct def_value b_value = chase_copies((struct def_value){
      .reg = b->reg,
      .offset = start - b_start,
      .size = end - start,
   });
   return a_value.reg == b_value.reg && a_value.offset == b_value.offset;
}

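/* Get the merge set a def belongs to, creating a fresh singleton set if it
 * doesn't have one yet.
 */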
static struct ir3_merge_set *
get_merge_set(struct ir3_register *def)
{
   if (def->merge_set)
      return def->merge_set;

   struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set);
   set->preferred_reg = ~0;
   set->interval_start = ~0;
   set->spill_slot = ~0;
   set->size = reg_size(def);
   set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2;
   set->regs_count = 1;
   set->regs = ralloc(set, struct ir3_register *);
   set->regs[0] = def;

   return set;
}

/* Merges b into a */
static struct ir3_merge_set *
merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_merge_sets(b, a, -b_offset);

   struct ir3_register **new_regs =
      rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count);

   unsigned a_index = 0, b_index = 0, new_index = 0;
   for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) {
      if (b_index < b->regs_count &&
          (a_index == a->regs_count ||
           def_after(a->regs[a_index], b->regs[b_index]))) {
         new_regs[new_index] = b->regs[b_index++];
         new_regs[new_index]->merge_set_offset += b_offset;
      } else {
         new_regs[new_index] = a->regs[a_index++];
      }
      new_regs[new_index]->merge_set = a;
   }

   assert(new_index == a->regs_count + b->regs_count);

   /* Technically this should be the lcm, but since alignment is only ever 1
    * or 2 so far, taking the max is equivalent.
    */
   a->alignment = MAX2(a->alignment, b->alignment);
   a->regs_count += b->regs_count;
   ralloc_free(a->regs);
   a->regs = new_regs;
   a->size = MAX2(a->size, b->size + b_offset);

   return a;
}

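/* Check whether merging b into a at b_offset would cause two defs with
 * different values to be live at the same time in overlapping parts of the
 * set. Defs are walked in dominance preorder while maintaining a stack of
 * dominating defs, as in the paper, so only defs still on the stack need to
 * be tested against the current def.
 */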
static bool
merge_sets_interfere(struct ir3_liveness *live, struct ir3_merge_set *a,
                     struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_sets_interfere(live, b, a, -b_offset);

   struct merge_def dom[a->regs_count + b->regs_count];
   unsigned a_index = 0, b_index = 0;
   int dom_index = -1;

   /* Reject trying to merge the sets if the alignment doesn't work out */
   if (b_offset % a->alignment != 0)
      return true;

   while (a_index < a->regs_count || b_index < b->regs_count) {
      struct merge_def current;
      if (a_index == a->regs_count) {
         current.reg = b->regs[b_index];
         current.offset = current.reg->merge_set_offset + b_offset;
         b_index++;
      } else if (b_index == b->regs_count) {
         current.reg = a->regs[a_index];
         current.offset = current.reg->merge_set_offset;
         a_index++;
      } else {
         if (def_after(b->regs[b_index], a->regs[a_index])) {
            current.reg = a->regs[a_index];
            current.offset = current.reg->merge_set_offset;
            a_index++;
         } else {
            current.reg = b->regs[b_index];
            current.offset = current.reg->merge_set_offset + b_offset;
            b_index++;
         }
      }

      while (dom_index >= 0 &&
             !def_dominates(dom[dom_index].reg, current.reg)) {
         dom_index--;
      }

      /* TODO: in the original paper, just dom[dom_index] needs to be
       * checked for interference. We implement the value-chasing extension
       * as well as support for sub-registers, which complicates this
       * significantly because it's no longer the case that if a dominates b
       * dominates c and a and b don't interfere then we only need to check
       * interference between b and c to be sure a and c don't interfere --
       * this means we may have to check for interference against values
       * higher in the stack than dom[dom_index]. In the paper there's a
       * description of a way to do fewer interference tests with the
       * value-chasing extension, but we'd have to come up with something
       * ourselves for handling the similar problems that come up with
       * allowing values to contain subregisters. For now we just test
       * everything in the stack.
       */
      for (int i = 0; i <= dom_index; i++) {
         if (can_skip_interference(&current, &dom[i]))
            continue;

         /* Ok, now we actually have to check interference. Since we know
          * that dom[i] dominates current, this boils down to checking
          * whether dom[i] is live after current.
          */
         if (ir3_def_live_after(live, dom[i].reg, current.reg->instr))
            return true;
      }

      dom[++dom_index] = current;
   }

   return false;
}

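/* Try to put a and b in the same merge set, with b at b_offset from a,
 * unless doing so would cause an interference.
 */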
static void
try_merge_defs(struct ir3_liveness *live, struct ir3_register *a,
               struct ir3_register *b, unsigned b_offset)
{
   struct ir3_merge_set *a_set = get_merge_set(a);
   struct ir3_merge_set *b_set = get_merge_set(b);

   if (a_set == b_set) {
      /* Note: Even in this case we may not be able to coalesce this copy if
       * the offsets don't line up, but there's nothing more we can do here.
       */
      return;
   }

   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;

   if (!merge_sets_interfere(live, a_set, b_set, b_set_offset))
      merge_merge_sets(a_set, b_set, b_set_offset);
}

void
ir3_force_merge(struct ir3_register *a, struct ir3_register *b, int b_offset)
{
   struct ir3_merge_set *a_set = get_merge_set(a);
   struct ir3_merge_set *b_set = get_merge_set(b);

   if (a_set == b_set)
      return;

   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;
   merge_merge_sets(a_set, b_set, b_set_offset);
}

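/* Merge a phi destination with each of its (defined) sources. Because of the
 * parallel copies inserted beforehand, this should normally succeed.
 */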
static void
coalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi)
{
   for (unsigned i = 0; i < phi->srcs_count; i++) {
      if (phi->srcs[i]->def)
         try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0);
   }
}

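/* Try to coalesce each destination of a parallel copy with its SSA source,
 * so that the copy we introduced can later be eliminated.
 */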
static void
aggressive_coalesce_parallel_copy(struct ir3_liveness *live,
                                  struct ir3_instruction *pcopy)
{
   for (unsigned i = 0; i < pcopy->dsts_count; i++) {
      if (!(pcopy->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0);
   }
}

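/* A split reads element split.off of its source, so try to coalesce its
 * destination with the source at that element's offset.
 */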
static void
aggressive_coalesce_split(struct ir3_liveness *live,
                          struct ir3_instruction *split)
{
   if (!(split->dsts[0]->flags & IR3_REG_SSA))
      return;
   try_merge_defs(live, split->srcs[0]->def, split->dsts[0],
                  split->split.off * reg_elem_size(split->dsts[0]));
}

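/* Try to coalesce each SSA source of a collect with the destination at the
 * source's element offset, so that the sources can be allocated to
 * contiguous registers and the collect needs no copies.
 */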
static void
aggressive_coalesce_collect(struct ir3_liveness *live,
                            struct ir3_instruction *collect)
{
   for (unsigned i = 0, offset = 0; i < collect->srcs_count;
        offset += reg_elem_size(collect->srcs[i]), i++) {
      if (!(collect->srcs[i]->flags & IR3_REG_SSA) ||
          !collect->srcs[i]->def)
         continue;
      try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset);
   }
}

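/* For a group of repeated (rptN) instructions, try to coalesce the
 * destinations (and the distinct defs feeding each source slot) at
 * consecutive element offsets, so the whole group can end up in consecutive
 * registers.
 */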
static void
aggressive_coalesce_rpt(struct ir3_liveness *live,
                        struct ir3_instruction *instr)
{
   if (!ir3_instr_is_first_rpt(instr))
      return;

   struct ir3_register *def = instr->dsts[0];
   unsigned def_offset = 0;
   unsigned src_offsets[instr->srcs_count];
   memset(src_offsets, 0, sizeof(unsigned) * instr->srcs_count);

   foreach_instr_rpt_excl (rpt, instr) {
      if (!(rpt->dsts[0]->flags & IR3_REG_SSA))
         continue;

      def_offset += reg_elem_size(def);
      try_merge_defs(live, def, rpt->dsts[0], def_offset);

      foreach_src_n (src, src_n, instr) {
         struct ir3_register *rpt_src = rpt->srcs[src_n];

         if (!(src->flags & IR3_REG_SSA) || !(rpt_src->flags & IR3_REG_SSA))
            continue;
         if (src->def == rpt_src->def)
            continue;

         src_offsets[src_n] += reg_elem_size(src->def);
         try_merge_defs(live, src->def, rpt_src->def, src_offsets[src_n]);
      }
   }
}

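/* Insert a parallel copy before the terminator of a block for the phis of
 * its successor, rewriting each phi source to read the copy's destination.
 * This is what turns the program into CSSA form.
 */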
static void
create_parallel_copy(struct ir3_block *block)
{
   for (unsigned i = 0; i < 2; i++) {
      if (!block->successors[i])
         continue;

      struct ir3_block *succ = block->successors[i];

      unsigned pred_idx = ir3_block_get_pred_index(succ, block);

      unsigned phi_count = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;

         /* Avoid phis we've already colored */
         if (!(phi->dsts[0]->flags & IR3_REG_SSA))
            continue;

         /* Avoid undef */
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;

         /* We don't support critical edges. If we were to support them,
          * we'd need to insert parallel copies after the phi node to solve
          * the lost-copy problem.
          */
         assert(i == 0 && !block->successors[1]);
         phi_count++;
      }

      if (phi_count == 0)
         continue;

      struct ir3_register *src[phi_count];
      unsigned j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if (!(phi->dsts[0]->flags & IR3_REG_SSA))
            continue;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         src[j++] = phi->srcs[pred_idx];
      }
      assert(j == phi_count);

      struct ir3_instruction *pcopy =
         ir3_instr_create_at(ir3_before_terminator(block),
                             OPC_META_PARALLEL_COPY, phi_count, phi_count);

      for (j = 0; j < phi_count; j++) {
         struct ir3_register *reg = __ssa_dst(pcopy);
         reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
         reg->size = src[j]->size;
         reg->wrmask = src[j]->wrmask;
      }

      for (j = 0; j < phi_count; j++) {
         pcopy->srcs[pcopy->srcs_count++] =
            ir3_reg_clone(block->shader, src[j]);
      }

      j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if (!(phi->dsts[0]->flags & IR3_REG_SSA))
            continue;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         phi->srcs[pred_idx]->def = pcopy->dsts[j];
         pcopy->dsts[j]->flags |= phi->dsts[0]->flags & IR3_REG_SHARED;
         phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags;
         phi->srcs[pred_idx]->num = INVALID_REG;
         j++;
      }
      assert(j == phi_count);
   }
}

void
ir3_create_parallel_copies(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      create_parallel_copy(block);
   }
}

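/* Assign each def a position in a virtual interval space: defs in the same
 * merge set share their set's interval, offset by their merge_set_offset,
 * while unmerged defs each get their own slot. live->interval_offset ends up
 * as the total size of the space.
 */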
static void
index_merge_sets(struct ir3_liveness *live, struct ir3 *ir)
{
   unsigned offset = 0;
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            unsigned dst_offset;
            struct ir3_merge_set *merge_set = dst->merge_set;
            unsigned size = reg_size(dst);
            if (merge_set) {
               if (merge_set->interval_start == ~0) {
                  merge_set->interval_start = offset;
                  offset += merge_set->size;
               }
               dst_offset = merge_set->interval_start + dst->merge_set_offset;
            } else {
               dst_offset = offset;
               offset += size;
            }

            dst->interval_start = dst_offset;
            dst->interval_end = dst_offset + size;
         }
      }
   }

   live->interval_offset = offset;
}

#define RESET      "\x1b[0m"
#define BLUE       "\x1b[0;34m"
#define SYN_SSA(x) BLUE x RESET

static void
dump_merge_sets(struct ir3 *ir)
{
   d("merge sets:");
   struct set *merge_sets = _mesa_pointer_set_create(NULL);
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            struct ir3_merge_set *merge_set = dst->merge_set;
            if (!merge_set || _mesa_set_search(merge_sets, merge_set))
               continue;

            d("merge set, size %u, align %u, interval start %u:",
              merge_set->size, merge_set->alignment, merge_set->interval_start);
            for (unsigned j = 0; j < merge_set->regs_count; j++) {
               struct ir3_register *reg = merge_set->regs[j];
               const char *s = (reg->flags & IR3_REG_SHARED) ? "s" : "";
               const char *h = (reg->flags & IR3_REG_HALF) ? "h" : "";
               d("\t%s%s" SYN_SSA("ssa_%u") ":%u, offset %u, interval: %u-%u",
                 s, h, reg->instr->serialno, reg->name, reg->merge_set_offset,
                 reg->interval_start, reg->interval_end);
            }

            _mesa_set_add(merge_sets, merge_set);
         }
      }
   }

   ralloc_free(merge_sets);
}

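/* Entry point: coalesce phi webs first (their defs must end up together),
 * then aggressively coalesce splits, collects, parallel copies, and repeat
 * groups, and finally assign interval positions to the resulting merge sets.
 */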
void
ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir)
{
   /* First pass: coalesce phis, which must be together. */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (instr->opc != OPC_META_PHI)
            break;

         coalesce_phi(live, instr);
      }
   }

   /* Second pass: aggressively coalesce parallelcopy, split, collect */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         switch (instr->opc) {
         case OPC_META_SPLIT:
            aggressive_coalesce_split(live, instr);
            break;
         case OPC_META_COLLECT:
            aggressive_coalesce_collect(live, instr);
            break;
         case OPC_META_PARALLEL_COPY:
            aggressive_coalesce_parallel_copy(live, instr);
            break;
         default:
            break;
         }
      }
   }

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         aggressive_coalesce_rpt(live, instr);
      }
   }

   index_merge_sets(live, ir);

   if (ir3_shader_debug & IR3_DBG_RAMSGS)
      dump_merge_sets(ir);
}