/*
 * Copyright (C) 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3_compiler.h"
#include "ir3_ra.h"
#include "util/ralloc.h"

/* This pass "merges" compatible phi-web SSA values. First, we insert a bunch
 * of parallelcopy's to trivially turn the program into CSSA form. Then we
 * try to "merge" SSA def's into "merge sets" which could be allocated to a
 * single register in order to eliminate copies. First we merge phi nodes,
 * which should always succeed because of the parallelcopy's we inserted, and
 * then we try to coalesce the copies we introduced.
 *
 * The merged registers are used for three purposes:
 *
 * 1. We always use the same pvtmem slot for spilling all SSA defs in each
 *    merge set. This prevents us from having to insert memory-to-memory
 *    copies in the spiller and makes sure we don't insert unnecessary copies.
 * 2. When two values are live at the same time, part of the same merge set,
 *    and they overlap each other in the merge set, they always occupy
 *    overlapping physical registers in RA. This reduces register pressure
 *    and copies in several important scenarios:
 *    - When sources of a collect are used later by something else, we don't
 *      have to introduce copies.
 *    - We can handle sequences of extracts that "explode" a vector into its
 *      components without any additional copying.
 * 3. We use the merge sets for affinities in register allocation: that is,
 *    we try to allocate all the definitions in the same merge set to the
 *    same/compatible registers. This helps us e.g. allocate sources of a
 *    collect to contiguous registers without too much special code in RA.
 *
 * In a "normal" register allocator, or when spilling, we'd just merge
 * registers in the same merge set to the same register, but with SSA-based
 * register allocation we may have to split the live interval.
 *
 * The implementation is based on "Revisiting Out-of-SSA Translation for
 * Correctness, Code Quality, and Efficiency," and is broadly similar to the
 * implementation in nir_from_ssa, with the twist that we also try to coalesce
 * META_SPLIT and META_COLLECT. This makes this pass more complicated but
 * prevents us from needing to handle these specially in RA and the spiller,
 * which are already complicated enough. This also forces us to implement the
 * value-comparison optimization they explain, as without it we wouldn't be
 * able to coalesce META_SPLIT even in the simplest of cases.
 */
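
/* As a rough illustration (made-up SSA names, not actual compiler output),
 * coalescing a collect means that for
 *
 *   ssa_3 = collect ssa_1, ssa_2
 *
 * ssa_1, ssa_2, and ssa_3 ideally end up in a single merge set, with ssa_1
 * and ssa_2 placed at the offsets of the components of ssa_3 they feed. RA
 * can then give ssa_3 a contiguous register range overlapping the registers
 * of ssa_1 and ssa_2, and the collect needs no movs at all.
 */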

/* In order to dynamically reconstruct the dominance forest, we need the
 * instructions ordered by a preorder traversal of the dominance tree:
 */

static unsigned
index_instrs(struct ir3_block *block, unsigned index)
{
   foreach_instr (instr, &block->instr_list)
      instr->ip = index++;

   for (unsigned i = 0; i < block->dom_children_count; i++)
      index = index_instrs(block->dom_children[i], index);

   return index;
}

/* Definitions within a merge set are ordered by instr->ip as set above: */

static bool
def_after(struct ir3_register *a, struct ir3_register *b)
{
   return a->instr->ip > b->instr->ip;
}

static bool
def_dominates(struct ir3_register *a, struct ir3_register *b)
{
   if (def_after(a, b)) {
      return false;
   } else if (a->instr->block == b->instr->block) {
      return def_after(b, a);
   } else {
      return ir3_block_dominates(a->instr->block, b->instr->block);
   }
}

/* This represents a region inside a register. The offset is relative to the
 * start of the register, and offset + size <= size(reg).
 */
struct def_value {
   struct ir3_register *reg;
   unsigned offset, size;
};

/* Chase any copies to get the source of a region inside a register. This is
 * Value(a) in the paper.
 */
static struct def_value
chase_copies(struct def_value value)
{
   while (true) {
      struct ir3_instruction *instr = value.reg->instr;
      if (instr->opc == OPC_META_SPLIT) {
         value.offset += instr->split.off * reg_elem_size(value.reg);
         value.reg = instr->srcs[0]->def;
      } else if (instr->opc == OPC_META_COLLECT) {
         if (value.offset % reg_elem_size(value.reg) != 0 ||
             value.size > reg_elem_size(value.reg) ||
             value.offset + value.size > reg_size(value.reg))
            break;
         struct ir3_register *src =
            instr->srcs[value.offset / reg_elem_size(value.reg)];
         if (!src->def)
            break;
         value.offset = 0;
         value.reg = src->def;
      } else {
         /* TODO: parallelcopy */
         break;
      }
   }

   return value;
}
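
/* For instance (illustrative, made-up SSA names): if ssa_4 is defined by
 * "ssa_4 = split ssa_3, off=1" and ssa_3 by "ssa_3 = collect ssa_1, ssa_2",
 * then chasing a region covering all of ssa_4 first steps through the split
 * (adding the split offset), then through the collect, and lands on ssa_2 at
 * offset 0, i.e. Value(ssa_4) == Value(ssa_2).
 */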

/* This represents an entry in the merge set, and consists of a register +
 * offset from the merge set base.
 */
struct merge_def {
   struct ir3_register *reg;
   unsigned offset;
};

static bool
can_skip_interference(const struct merge_def *a, const struct merge_def *b)
{
   unsigned a_start = a->offset;
   unsigned b_start = b->offset;
   unsigned a_end = a_start + reg_size(a->reg);
   unsigned b_end = b_start + reg_size(b->reg);

   /* Registers that don't overlap never interfere */
   if (a_end <= b_start || b_end <= a_start)
      return true;

   /* Disallow skipping interference unless one definition contains the
    * other. This restriction is important for register allocation, because
    * it means that at any given point in the program, the live values in a
    * given merge set will form a tree. If they didn't, then one live value
    * would partially overlap another, and they would have overlapping live
    * ranges because they're live at the same point. This simplifies register
    * allocation and spilling.
    */
   if (!((a_start <= b_start && a_end >= b_end) ||
         (b_start <= a_start && b_end >= a_end)))
      return false;

   /* For each register, chase the intersection of a and b to find the
    * ultimate source.
    */
   unsigned start = MAX2(a_start, b_start);
   unsigned end = MIN2(a_end, b_end);
   struct def_value a_value = chase_copies((struct def_value){
      .reg = a->reg,
      .offset = start - a_start,
      .size = end - start,
   });
   struct def_value b_value = chase_copies((struct def_value){
      .reg = b->reg,
      .offset = start - b_start,
      .size = end - start,
   });
   return a_value.reg == b_value.reg && a_value.offset == b_value.offset;
}
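
/* Sketch with made-up sizes: a 4-unit vector at merge-set offset 0 and a
 * 1-unit split of it at offset 2 overlap, and the vector contains the split,
 * so we chase the overlapping region [2, 3) from both sides. If both chases
 * end at the same (reg, offset), the two defs are just different views of the
 * same value and the interference test can be skipped.
 */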

static struct ir3_merge_set *
get_merge_set(struct ir3_register *def)
{
   if (def->merge_set)
      return def->merge_set;

   struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set);
   set->preferred_reg = ~0;
   set->interval_start = ~0;
   set->spill_slot = ~0;
   set->size = reg_size(def);
   set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2;
   set->regs_count = 1;
   set->regs = ralloc(set, struct ir3_register *);
   set->regs[0] = def;

   return set;
}

/* Merges b into a */
static struct ir3_merge_set *
merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_merge_sets(b, a, -b_offset);

   struct ir3_register **new_regs =
      rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count);

   unsigned a_index = 0, b_index = 0, new_index = 0;
   for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) {
      if (b_index < b->regs_count &&
          (a_index == a->regs_count ||
           def_after(a->regs[a_index], b->regs[b_index]))) {
         new_regs[new_index] = b->regs[b_index++];
         new_regs[new_index]->merge_set_offset += b_offset;
      } else {
         new_regs[new_index] = a->regs[a_index++];
      }
      new_regs[new_index]->merge_set = a;
   }

   assert(new_index == a->regs_count + b->regs_count);

   /* Technically this should be the lcm, but because alignment is only ever
    * 1 or 2 so far, taking the max is equivalent.
    */
   a->alignment = MAX2(a->alignment, b->alignment);
   a->regs_count += b->regs_count;
   ralloc_free(a->regs);
   a->regs = new_regs;
   a->size = MAX2(a->size, b->size + b_offset);

   return a;
}
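
/* Illustrative sketch (made-up sizes): merging b = {x} of size 2 into
 * a = {v} of size 4 with b_offset == 2 bumps x's merge_set_offset by 2,
 * merges the def lists in ip order, and leaves the combined set with size
 * MAX2(4, 2 + 2) = 4, so x occupies the second half of v's interval.
 */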

static bool
merge_sets_interfere(struct ir3_liveness *live, struct ir3_merge_set *a,
                     struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_sets_interfere(live, b, a, -b_offset);

   struct merge_def dom[a->regs_count + b->regs_count];
   unsigned a_index = 0, b_index = 0;
   int dom_index = -1;

   /* Reject trying to merge the sets if the alignment doesn't work out */
   if (b_offset % a->alignment != 0)
      return true;

   while (a_index < a->regs_count || b_index < b->regs_count) {
      struct merge_def current;
      if (a_index == a->regs_count) {
         current.reg = b->regs[b_index];
         current.offset = current.reg->merge_set_offset + b_offset;
         b_index++;
      } else if (b_index == b->regs_count) {
         current.reg = a->regs[a_index];
         current.offset = current.reg->merge_set_offset;
         a_index++;
      } else {
         if (def_after(b->regs[b_index], a->regs[a_index])) {
            current.reg = a->regs[a_index];
            current.offset = current.reg->merge_set_offset;
            a_index++;
         } else {
            current.reg = b->regs[b_index];
            current.offset = current.reg->merge_set_offset + b_offset;
            b_index++;
         }
      }

      while (dom_index >= 0 &&
             !def_dominates(dom[dom_index].reg, current.reg)) {
         dom_index--;
      }

      /* TODO: in the original paper, just dom[dom_index] needs to be
       * checked for interference. We implement the value-chasing extension
       * as well as support for sub-registers, which complicates this
       * significantly because it's no longer the case that if a dominates b
       * dominates c and a and b don't interfere then we only need to check
       * interference between b and c to be sure a and c don't interfere --
       * this means we may have to check for interference against values
       * higher in the stack than dom[dom_index]. In the paper there's a
       * description of a way to do fewer interference tests with the
       * value-chasing extension, but we'd have to come up with something
       * ourselves for handling the similar problems that come up with
       * allowing values to contain subregisters. For now we just test
       * everything in the stack.
       */
      for (int i = 0; i <= dom_index; i++) {
         if (can_skip_interference(&current, &dom[i]))
            continue;

         /* Ok, now we actually have to check interference. Since we know
          * that dom[i] dominates current, this boils down to checking
          * whether dom[i] is live after current.
          */
         if (ir3_def_live_after(live, dom[i].reg, current.reg->instr))
            return true;
      }

      dom[++dom_index] = current;
   }

   return false;
}

static void
try_merge_defs(struct ir3_liveness *live, struct ir3_register *a,
               struct ir3_register *b, unsigned b_offset)
{
   struct ir3_merge_set *a_set = get_merge_set(a);
   struct ir3_merge_set *b_set = get_merge_set(b);

   if (a_set == b_set) {
      /* Note: even in this case we may not be able to coalesce the copy if
       * the offsets don't line up, but either way there's nothing more we
       * can do here.
       */
      return;
   }

   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;

   if (!merge_sets_interfere(live, a_set, b_set, b_set_offset))
      merge_merge_sets(a_set, b_set, b_set_offset);
}

void
ir3_force_merge(struct ir3_register *a, struct ir3_register *b, int b_offset)
{
   struct ir3_merge_set *a_set = get_merge_set(a);
   struct ir3_merge_set *b_set = get_merge_set(b);

   if (a_set == b_set)
      return;

   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;
   merge_merge_sets(a_set, b_set, b_set_offset);
}

static void
coalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi)
{
   for (unsigned i = 0; i < phi->srcs_count; i++) {
      if (phi->srcs[i]->def)
         try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0);
   }
}

static void
aggressive_coalesce_parallel_copy(struct ir3_liveness *live,
                                  struct ir3_instruction *pcopy)
{
   for (unsigned i = 0; i < pcopy->dsts_count; i++) {
      if (!(pcopy->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0);
   }
}

static void
aggressive_coalesce_split(struct ir3_liveness *live,
                          struct ir3_instruction *split)
{
   if (!(split->dsts[0]->flags & IR3_REG_SSA))
      return;
   try_merge_defs(live, split->srcs[0]->def, split->dsts[0],
                  split->split.off * reg_elem_size(split->dsts[0]));
}

static void
aggressive_coalesce_collect(struct ir3_liveness *live,
                            struct ir3_instruction *collect)
{
   for (unsigned i = 0, offset = 0; i < collect->srcs_count;
        offset += reg_elem_size(collect->srcs[i]), i++) {
      if (!(collect->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset);
   }
}
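
/* E.g. (sketch, made-up SSA names, assuming same-sized sources): for
 * "ssa_4 = collect ssa_1, ssa_2, ssa_3" we attempt
 * try_merge_defs(live, ssa_4, ssa_1, 0), then with offset reg_elem_size for
 * ssa_2 and 2 * reg_elem_size for ssa_3, so each source lines up with the
 * component of the destination it feeds.
 */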

static void
create_parallel_copy(struct ir3_block *block)
{
   for (unsigned i = 0; i < 2; i++) {
      if (!block->successors[i])
         continue;

      struct ir3_block *succ = block->successors[i];

      unsigned pred_idx = ir3_block_get_pred_index(succ, block);

      unsigned phi_count = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;

         /* Avoid phis we've already colored */
         if (!(phi->dsts[0]->flags & IR3_REG_SSA))
            continue;

         /* Avoid undef */
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;

         /* We don't support critical edges. If we were to support them,
          * we'd need to insert parallel copies after the phi node to solve
          * the lost-copy problem.
          */
         assert(i == 0 && !block->successors[1]);
         phi_count++;
      }

      if (phi_count == 0)
         continue;

      struct ir3_register *src[phi_count];
      unsigned j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if (!(phi->dsts[0]->flags & IR3_REG_SSA))
            continue;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         src[j++] = phi->srcs[pred_idx];
      }
      assert(j == phi_count);

      struct ir3_instruction *pcopy =
         ir3_instr_create(block, OPC_META_PARALLEL_COPY, phi_count, phi_count);

      for (j = 0; j < phi_count; j++) {
         struct ir3_register *reg = __ssa_dst(pcopy);
         reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
         reg->size = src[j]->size;
         reg->wrmask = src[j]->wrmask;
      }

      for (j = 0; j < phi_count; j++) {
         pcopy->srcs[pcopy->srcs_count++] =
            ir3_reg_clone(block->shader, src[j]);
      }

      j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if (!(phi->dsts[0]->flags & IR3_REG_SSA))
            continue;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         phi->srcs[pred_idx]->def = pcopy->dsts[j];
         phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags;
         j++;
      }
      assert(j == phi_count);
   }
}
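
/* Sketch of the effect (made-up IR): if a successor block starts with
 * "ssa_5 = phi ssa_1 (from this block), ssa_2", we append
 * "ssa_6 = parallelcopy ssa_1" to this block and rewrite the phi source to
 * point at ssa_6. Each phi source then becomes a fresh value defined at the
 * end of the corresponding predecessor, which is what the top-of-file
 * comment means by trivially putting the program into CSSA form.
 */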

void
ir3_create_parallel_copies(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      create_parallel_copy(block);
   }
}

static void
index_merge_sets(struct ir3_liveness *live, struct ir3 *ir)
{
   unsigned offset = 0;
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            unsigned dst_offset;
            struct ir3_merge_set *merge_set = dst->merge_set;
            unsigned size = reg_size(dst);
            if (merge_set) {
               if (merge_set->interval_start == ~0) {
                  merge_set->interval_start = offset;
                  offset += merge_set->size;
               }
               dst_offset = merge_set->interval_start + dst->merge_set_offset;
            } else {
               dst_offset = offset;
               offset += size;
            }

            dst->interval_start = dst_offset;
            dst->interval_end = dst_offset + size;
         }
      }
   }

   live->interval_offset = offset;
}
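
/* For example (numbers invented): the first merge set encountered, of size 4,
 * gets interval_start 0 and its defs get intervals inside [0, 4) according to
 * their merge_set_offset; a following unmerged def of size 1 then gets the
 * standalone interval [4, 5), and live->interval_offset ends up as the total.
 */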

#define RESET      "\x1b[0m"
#define BLUE       "\x1b[0;34m"
#define SYN_SSA(x) BLUE x RESET

static void
dump_merge_sets(struct ir3 *ir)
{
   d("merge sets:");
   struct set *merge_sets = _mesa_pointer_set_create(NULL);
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            struct ir3_merge_set *merge_set = dst->merge_set;
            if (!merge_set || _mesa_set_search(merge_sets, merge_set))
               continue;

            d("merge set, size %u, align %u:", merge_set->size,
              merge_set->alignment);
            for (unsigned j = 0; j < merge_set->regs_count; j++) {
               struct ir3_register *reg = merge_set->regs[j];
               d("\t" SYN_SSA("ssa_%u") ":%u, offset %u",
                 reg->instr->serialno, reg->name, reg->merge_set_offset);
            }

            _mesa_set_add(merge_sets, merge_set);
         }
      }
   }

   ralloc_free(merge_sets);
}

void
ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir)
{
   index_instrs(ir3_start_block(ir), 0);

   /* First pass: coalesce phis, which must be together. */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (instr->opc != OPC_META_PHI)
            break;

         coalesce_phi(live, instr);
      }
   }

   /* Second pass: aggressively coalesce parallelcopy, split, collect */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         switch (instr->opc) {
         case OPC_META_SPLIT:
            aggressive_coalesce_split(live, instr);
            break;
         case OPC_META_COLLECT:
            aggressive_coalesce_collect(live, instr);
            break;
         case OPC_META_PARALLEL_COPY:
            aggressive_coalesce_parallel_copy(live, instr);
            break;
         default:
            break;
         }
      }
   }

   index_merge_sets(live, ir);

   if (ir3_shader_debug & IR3_DBG_RAMSGS)
      dump_merge_sets(ir);
}