1 /*
2 * Copyright (C) 2021 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include "ir3_ra.h"
25 #include "ir3_shader.h"
26
27 struct copy_src {
28 unsigned flags;
29 union {
30 uint32_t imm;
31 physreg_t reg;
32 unsigned const_num;
33 };
34 };
35
36 struct copy_entry {
37 physreg_t dst;
38 unsigned flags;
39 bool done;
40
41 struct copy_src src;
42 };
43
44 static unsigned
copy_entry_size(const struct copy_entry * entry)45 copy_entry_size(const struct copy_entry *entry)
46 {
47 return (entry->flags & IR3_REG_HALF) ? 1 : 2;
48 }
49
50 static struct copy_src
get_copy_src(const struct ir3_register * reg,unsigned offset)51 get_copy_src(const struct ir3_register *reg, unsigned offset)
52 {
53 if (reg->flags & IR3_REG_IMMED) {
54 return (struct copy_src){
55 .flags = IR3_REG_IMMED,
56 .imm = reg->uim_val,
57 };
58 } else if (reg->flags & IR3_REG_CONST) {
59 return (struct copy_src){
60 .flags = IR3_REG_CONST,
61 .const_num = reg->num,
62 };
63 } else {
64 return (struct copy_src){
65 .flags = 0,
66 .reg = ra_reg_get_physreg(reg) + offset,
67 };
68 }
69 }
70
71 static void
do_xor(struct ir3_instruction * instr,unsigned dst_num,unsigned src1_num,unsigned src2_num,unsigned flags)72 do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num,
73 unsigned src2_num, unsigned flags)
74 {
75 struct ir3_instruction * xor
76 = ir3_instr_create(instr->block, OPC_XOR_B, 1, 2);
77 ir3_dst_create(xor, dst_num, flags);
78 ir3_src_create(xor, src1_num, flags);
79 ir3_src_create(xor, src2_num, flags);
80
81 ir3_instr_move_before(xor, instr);
82 }
83
84 static void
do_swap(struct ir3_compiler * compiler,struct ir3_instruction * instr,const struct copy_entry * entry)85 do_swap(struct ir3_compiler *compiler, struct ir3_instruction *instr,
86 const struct copy_entry *entry)
87 {
88 assert(!entry->src.flags);
89
90 if (entry->flags & IR3_REG_HALF) {
91 /* We currently make sure to never emit parallel copies where the
92 * source/destination is a half-reg above the range accessable to half
93 * registers. However, when a full-reg source overlaps a half-reg
94 * destination or vice versa, it can be very, very complicated to come
95 * up with a series of "legal" swaps and copies to resolve the
96 * parallel copy. So here we provide a fallback to implement the
97 * "illegal" swap instead. This may also be useful for implementing
98 * "spilling" half-regs to the inaccessable space.
99 */
100 if (entry->src.reg >= RA_HALF_SIZE) {
101 /* Choose a temporary that doesn't overlap src or dst */
102 physreg_t tmp = entry->dst < 2 ? 2 : 0;
103
104 /* Swap src and the temporary */
105 do_swap(compiler, instr,
106 &(struct copy_entry){
107 .src = {.reg = entry->src.reg & ~1u},
108 .dst = tmp,
109 .flags = entry->flags & ~IR3_REG_HALF,
110 });
111
112 /* If src and dst are within the same full register, then swapping src
113 * with tmp above will also move dst to tmp. Account for that here.
114 */
115 unsigned dst =
116 (entry->src.reg & ~1u) == (entry->dst & ~1u) ?
117 tmp + (entry->dst & 1u) : entry->dst;
118
119 /* Do the original swap with src replaced with tmp */
120 do_swap(compiler, instr,
121 &(struct copy_entry){
122 .src = {.reg = tmp + (entry->src.reg & 1)},
123 .dst = dst,
124 .flags = entry->flags,
125 });
126
127 /* Swap src and the temporary back */
128 do_swap(compiler, instr,
129 &(struct copy_entry){
130 .src = {.reg = entry->src.reg & ~1u},
131 .dst = tmp,
132 .flags = entry->flags & ~IR3_REG_HALF,
133 });
134 return;
135 }
136
137 /* If dst is not addressable, we only need to swap the arguments and
138 * let the case above handle it.
139 */
140 if (entry->dst >= RA_HALF_SIZE) {
141 do_swap(compiler, instr,
142 &(struct copy_entry){
143 .src = {.reg = entry->dst},
144 .dst = entry->src.reg,
145 .flags = entry->flags,
146 });
147 return;
148 }
149 }
150
151 unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
152 unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
153
154 /* a5xx+ is known to support swz, which enables us to swap two registers
155 * in-place. If unsupported we emulate it using the xor trick.
156 */
157 if (compiler->gen < 5) {
158 /* Shared regs only exist since a5xx, so we don't have to provide a
159 * fallback path for them.
160 */
161 assert(!(entry->flags & IR3_REG_SHARED));
162 do_xor(instr, dst_num, dst_num, src_num, entry->flags);
163 do_xor(instr, src_num, src_num, dst_num, entry->flags);
164 do_xor(instr, dst_num, dst_num, src_num, entry->flags);
165 } else {
166 /* Use a macro for shared regs because any shared reg writes need to
167 * be wrapped in a getone block to work correctly. Writing shared regs
168 * with multiple threads active does not work, even if they all return
169 * the same value.
170 */
171 unsigned opc =
172 (entry->flags & IR3_REG_SHARED) ? OPC_SWZ_SHARED_MACRO : OPC_SWZ;
173 struct ir3_instruction *swz = ir3_instr_create(instr->block, opc, 2, 2);
174 ir3_dst_create(swz, dst_num, entry->flags);
175 ir3_dst_create(swz, src_num, entry->flags);
176 ir3_src_create(swz, src_num, entry->flags);
177 ir3_src_create(swz, dst_num, entry->flags);
178 swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
179 swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
180 swz->repeat = 1;
181 ir3_instr_move_before(swz, instr);
182 }
183 }
184
185 static void
do_copy(struct ir3_compiler * compiler,struct ir3_instruction * instr,const struct copy_entry * entry)186 do_copy(struct ir3_compiler *compiler, struct ir3_instruction *instr,
187 const struct copy_entry *entry)
188 {
189 if (entry->flags & IR3_REG_HALF) {
190 /* See do_swap() for why this is here. */
191 if (entry->dst >= RA_HALF_SIZE) {
192 /* TODO: is there a hw instruction we can use for this case? */
193 physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;
194
195 do_swap(compiler, instr,
196 &(struct copy_entry){
197 .src = {.reg = entry->dst & ~1u},
198 .dst = tmp,
199 .flags = entry->flags & ~IR3_REG_HALF,
200 });
201
202 /* Similar to in do_swap(), account for src being swapped with tmp if
203 * src and dst are in the same register.
204 */
205 struct copy_src src = entry->src;
206 if (!src.flags && (src.reg & ~1u) == (entry->dst & ~1u))
207 src.reg = tmp + (src.reg & 1u);
208
209 do_copy(compiler, instr,
210 &(struct copy_entry){
211 .src = src,
212 .dst = tmp + (entry->dst & 1),
213 .flags = entry->flags,
214 });
215
216 do_swap(compiler, instr,
217 &(struct copy_entry){
218 .src = {.reg = entry->dst & ~1u},
219 .dst = tmp,
220 .flags = entry->flags & ~IR3_REG_HALF,
221 });
222 return;
223 }
224
225 if (!entry->src.flags && entry->src.reg >= RA_HALF_SIZE) {
226 unsigned src_num = ra_physreg_to_num(entry->src.reg & ~1u,
227 entry->flags & ~IR3_REG_HALF);
228 unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
229
230 if (entry->src.reg % 2 == 0) {
231 /* cov.u32u16 dst, src */
232 struct ir3_instruction *cov =
233 ir3_instr_create(instr->block, OPC_MOV, 1, 1);
234 ir3_dst_create(cov, dst_num, entry->flags);
235 ir3_src_create(cov, src_num, entry->flags & ~IR3_REG_HALF);
236 cov->cat1.dst_type = TYPE_U16;
237 cov->cat1.src_type = TYPE_U32;
238 ir3_instr_move_before(cov, instr);
239 } else {
240 /* shr.b dst, src, (16) */
241 struct ir3_instruction *shr =
242 ir3_instr_create(instr->block, OPC_SHR_B, 1, 2);
243 ir3_dst_create(shr, dst_num, entry->flags);
244 ir3_src_create(shr, src_num, entry->flags & ~IR3_REG_HALF);
245 ir3_src_create(shr, 0, IR3_REG_IMMED)->uim_val = 16;
246 ir3_instr_move_before(shr, instr);
247 }
248 return;
249 }
250 }
251
252 unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
253 unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);
254
255 /* Similar to the swap case, we have to use a macro for shared regs. */
256 unsigned opc =
257 (entry->flags & IR3_REG_SHARED) ? OPC_READ_FIRST_MACRO : OPC_MOV;
258 struct ir3_instruction *mov = ir3_instr_create(instr->block, opc, 1, 1);
259 ir3_dst_create(mov, dst_num, entry->flags);
260 ir3_src_create(mov, src_num, entry->flags | entry->src.flags);
261 mov->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
262 mov->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
263 if (entry->src.flags & IR3_REG_IMMED)
264 mov->srcs[0]->uim_val = entry->src.imm;
265 else if (entry->src.flags & IR3_REG_CONST)
266 mov->srcs[0]->num = entry->src.const_num;
267 ir3_instr_move_before(mov, instr);
268 }
269
270 struct copy_ctx {
271 /* For each physreg, the number of pending copy entries that use it as a
272 * source. Once this drops to zero, then the physreg is unblocked and can
273 * be moved to.
274 */
275 unsigned physreg_use_count[RA_MAX_FILE_SIZE];
276
277 /* For each physreg, the pending copy_entry that uses it as a dest. */
278 struct copy_entry *physreg_dst[RA_MAX_FILE_SIZE];
279
280 struct copy_entry entries[RA_MAX_FILE_SIZE];
281 unsigned entry_count;
282 };
283
284 static bool
entry_blocked(struct copy_entry * entry,struct copy_ctx * ctx)285 entry_blocked(struct copy_entry *entry, struct copy_ctx *ctx)
286 {
287 for (unsigned i = 0; i < copy_entry_size(entry); i++) {
288 if (ctx->physreg_use_count[entry->dst + i] != 0)
289 return true;
290 }
291
292 return false;
293 }
294
295 static void
split_32bit_copy(struct copy_ctx * ctx,struct copy_entry * entry)296 split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry)
297 {
298 assert(!entry->done);
299 assert(!(entry->src.flags & (IR3_REG_IMMED | IR3_REG_CONST)));
300 assert(copy_entry_size(entry) == 2);
301 struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++];
302
303 new_entry->dst = entry->dst + 1;
304 new_entry->src.flags = entry->src.flags;
305 new_entry->src.reg = entry->src.reg + 1;
306 new_entry->done = false;
307 entry->flags |= IR3_REG_HALF;
308 new_entry->flags = entry->flags;
309 ctx->physreg_dst[entry->dst + 1] = new_entry;
310 }
311
312 static void
_handle_copies(struct ir3_compiler * compiler,struct ir3_instruction * instr,struct copy_ctx * ctx)313 _handle_copies(struct ir3_compiler *compiler, struct ir3_instruction *instr,
314 struct copy_ctx *ctx)
315 {
316 /* Set up the bookkeeping */
317 memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
318 memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));
319
320 for (unsigned i = 0; i < ctx->entry_count; i++) {
321 struct copy_entry *entry = &ctx->entries[i];
322 for (unsigned j = 0; j < copy_entry_size(entry); j++) {
323 if (!entry->src.flags)
324 ctx->physreg_use_count[entry->src.reg + j]++;
325
326 /* Copies should not have overlapping destinations. */
327 assert(!ctx->physreg_dst[entry->dst + j]);
328 ctx->physreg_dst[entry->dst + j] = entry;
329 }
330 }
331
332 bool progress = true;
333 while (progress) {
334 progress = false;
335
336 /* Step 1: resolve paths in the transfer graph. This means finding
337 * copies whose destination aren't blocked by something else and then
338 * emitting them, continuing this process until every copy is blocked
339 * and there are only cycles left.
340 *
341 * TODO: We should note that src is also available in dst to unblock
342 * cycles that src is involved in.
343 */
344
345 for (unsigned i = 0; i < ctx->entry_count; i++) {
346 struct copy_entry *entry = &ctx->entries[i];
347 if (!entry->done && !entry_blocked(entry, ctx)) {
348 entry->done = true;
349 progress = true;
350 do_copy(compiler, instr, entry);
351 for (unsigned j = 0; j < copy_entry_size(entry); j++) {
352 if (!entry->src.flags)
353 ctx->physreg_use_count[entry->src.reg + j]--;
354 ctx->physreg_dst[entry->dst + j] = NULL;
355 }
356 }
357 }
358
359 if (progress)
360 continue;
361
362 /* Step 2: Find partially blocked copies and split them. In the
363 * mergedregs case, we can 32-bit copies which are only blocked on one
364 * 16-bit half, and splitting them helps get things moving.
365 *
366 * We can skip splitting copies if the source isn't a register,
367 * however, because it does not unblock anything and therefore doesn't
368 * contribute to making forward progress with step 1. These copies
369 * should still be resolved eventually in step 1 because they can't be
370 * part of a cycle.
371 */
372 for (unsigned i = 0; i < ctx->entry_count; i++) {
373 struct copy_entry *entry = &ctx->entries[i];
374 if (entry->done || entry->flags & IR3_REG_HALF)
375 continue;
376
377 if (((ctx->physreg_use_count[entry->dst] == 0 ||
378 ctx->physreg_use_count[entry->dst + 1] == 0)) &&
379 !(entry->src.flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
380 split_32bit_copy(ctx, entry);
381 progress = true;
382 }
383 }
384 }
385
386 /* Step 3: resolve cycles through swapping.
387 *
388 * At this point, the transfer graph should consist of only cycles.
389 * The reason is that, given any physreg n_1 that's the source of a
390 * remaining entry, it has a destination n_2, which (because every
391 * copy is blocked) is the source of some other copy whose destination
392 * is n_3, and so we can follow the chain until we get a cycle. If we
393 * reached some other node than n_1:
394 *
395 * n_1 -> n_2 -> ... -> n_i
396 * ^ |
397 * |-------------|
398 *
399 * then n_2 would be the destination of 2 copies, which is illegal
400 * (checked above in an assert). So n_1 must be part of a cycle:
401 *
402 * n_1 -> n_2 -> ... -> n_i
403 * ^ |
404 * |---------------------|
405 *
406 * and this must be only cycle n_1 is involved in, because any other
407 * path starting from n_1 would also have to end in n_1, resulting in
408 * a node somewhere along the way being the destination of 2 copies
409 * when the 2 paths merge.
410 *
411 * The way we resolve the cycle is through picking a copy (n_1, n_2)
412 * and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
413 * out of the cycle:
414 *
415 * n_1 -> ... -> n_i
416 * ^ |
417 * |--------------|
418 *
419 * and we can keep repeating this until the cycle is empty.
420 */
421
422 for (unsigned i = 0; i < ctx->entry_count; i++) {
423 struct copy_entry *entry = &ctx->entries[i];
424 if (entry->done)
425 continue;
426
427 assert(!entry->src.flags);
428
429 /* catch trivial copies */
430 if (entry->dst == entry->src.reg) {
431 entry->done = true;
432 continue;
433 }
434
435 do_swap(compiler, instr, entry);
436
437 /* Split any blocking copies whose sources are only partially
438 * contained within our destination.
439 */
440 if (entry->flags & IR3_REG_HALF) {
441 for (unsigned j = 0; j < ctx->entry_count; j++) {
442 struct copy_entry *blocking = &ctx->entries[j];
443
444 if (blocking->done)
445 continue;
446
447 if (blocking->src.reg <= entry->dst &&
448 blocking->src.reg + 1 >= entry->dst &&
449 !(blocking->flags & IR3_REG_HALF)) {
450 split_32bit_copy(ctx, blocking);
451 }
452 }
453 }
454
455 /* Update sources of blocking copies.
456 *
457 * Note: at this point, every blocking copy's source should be
458 * contained within our destination.
459 */
460 for (unsigned j = 0; j < ctx->entry_count; j++) {
461 struct copy_entry *blocking = &ctx->entries[j];
462 if (blocking->src.reg >= entry->dst &&
463 blocking->src.reg < entry->dst + copy_entry_size(entry)) {
464 blocking->src.reg =
465 entry->src.reg + (blocking->src.reg - entry->dst);
466 }
467 }
468
469 entry->done = true;
470 }
471 }
472
473 static void
handle_copies(struct ir3_shader_variant * v,struct ir3_instruction * instr,struct copy_entry * entries,unsigned entry_count)474 handle_copies(struct ir3_shader_variant *v, struct ir3_instruction *instr,
475 struct copy_entry *entries, unsigned entry_count)
476 {
477 struct copy_ctx ctx;
478
479 /* handle shared copies first */
480 ctx.entry_count = 0;
481 for (unsigned i = 0; i < entry_count; i++) {
482 if (entries[i].flags & IR3_REG_SHARED)
483 ctx.entries[ctx.entry_count++] = entries[i];
484 }
485 _handle_copies(v->shader->compiler, instr, &ctx);
486
487 if (v->mergedregs) {
488 /* Half regs and full regs are in the same file, so handle everything
489 * at once.
490 */
491 ctx.entry_count = 0;
492 for (unsigned i = 0; i < entry_count; i++) {
493 if (!(entries[i].flags & IR3_REG_SHARED))
494 ctx.entries[ctx.entry_count++] = entries[i];
495 }
496 _handle_copies(v->shader->compiler, instr, &ctx);
497 } else {
498 /* There may be both half copies and full copies, so we have to split
499 * them up since they don't interfere.
500 */
501 ctx.entry_count = 0;
502 for (unsigned i = 0; i < entry_count; i++) {
503 if (entries[i].flags & IR3_REG_HALF)
504 ctx.entries[ctx.entry_count++] = entries[i];
505 }
506 _handle_copies(v->shader->compiler, instr, &ctx);
507
508 ctx.entry_count = 0;
509 for (unsigned i = 0; i < entry_count; i++) {
510 if (!(entries[i].flags & (IR3_REG_HALF | IR3_REG_SHARED)))
511 ctx.entries[ctx.entry_count++] = entries[i];
512 }
513 _handle_copies(v->shader->compiler, instr, &ctx);
514 }
515 }
516
517 void
ir3_lower_copies(struct ir3_shader_variant * v)518 ir3_lower_copies(struct ir3_shader_variant *v)
519 {
520 DECLARE_ARRAY(struct copy_entry, copies);
521 copies_count = copies_sz = 0;
522 copies = NULL;
523
524 foreach_block (block, &v->ir->block_list) {
525 foreach_instr_safe (instr, &block->instr_list) {
526 if (instr->opc == OPC_META_PARALLEL_COPY) {
527 copies_count = 0;
528 for (unsigned i = 0; i < instr->dsts_count; i++) {
529 struct ir3_register *dst = instr->dsts[i];
530 struct ir3_register *src = instr->srcs[i];
531 unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
532 unsigned dst_physreg = ra_reg_get_physreg(dst);
533 for (unsigned j = 0; j < reg_elems(dst); j++) {
534 array_insert(
535 NULL, copies,
536 (struct copy_entry){
537 .dst = dst_physreg + j * reg_elem_size(dst),
538 .src = get_copy_src(src, j * reg_elem_size(dst)),
539 .flags = flags,
540 });
541 }
542 }
543 handle_copies(v, instr, copies, copies_count);
544 list_del(&instr->node);
545 } else if (instr->opc == OPC_META_COLLECT) {
546 copies_count = 0;
547 struct ir3_register *dst = instr->dsts[0];
548 unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
549 for (unsigned i = 0; i < instr->srcs_count; i++) {
550 struct ir3_register *src = instr->srcs[i];
551 array_insert(NULL, copies,
552 (struct copy_entry){
553 .dst = ra_num_to_physreg(dst->num + i, flags),
554 .src = get_copy_src(src, 0),
555 .flags = flags,
556 });
557 }
558 handle_copies(v, instr, copies, copies_count);
559 list_del(&instr->node);
560 } else if (instr->opc == OPC_META_SPLIT) {
561 copies_count = 0;
562 struct ir3_register *dst = instr->dsts[0];
563 struct ir3_register *src = instr->srcs[0];
564 unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
565 array_insert(NULL, copies,
566 (struct copy_entry){
567 .dst = ra_reg_get_physreg(dst),
568 .src = get_copy_src(
569 src, instr->split.off * reg_elem_size(dst)),
570 .flags = flags,
571 });
572 handle_copies(v, instr, copies, copies_count);
573 list_del(&instr->node);
574 } else if (instr->opc == OPC_META_PHI) {
575 list_del(&instr->node);
576 }
577 }
578 }
579
580 if (copies)
581 ralloc_free(copies);
582 }
583