/*
 * Copyright (C) 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3_ra.h"
#include "ir3_shader.h"

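/* The source of a single parallel-copy entry: a physical register, an
 * immediate, or a constant-file register, as indicated by flags.
 */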
struct copy_src {
   unsigned flags;
   union {
      uint32_t imm;
      physreg_t reg;
      unsigned const_num;
   };
};

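/* A single pending copy: move "src" into the physical register "dst".
 * "done" is set once instructions for it have been emitted.
 */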
struct copy_entry {
   physreg_t dst;
   unsigned flags;
   bool done;

   struct copy_src src;
};

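/* Size of a copy in physreg (half-register) units: 1 for a half reg,
 * 2 for a full reg.
 */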
static unsigned
copy_entry_size(const struct copy_entry *entry)
{
   return (entry->flags & IR3_REG_HALF) ? 1 : 2;
}

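/* Translate an ir3 source register into a copy_src, offset by "offset"
 * half-regs when the source is a physical register.
 */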
static struct copy_src
get_copy_src(const struct ir3_register *reg, unsigned offset)
{
   if (reg->flags & IR3_REG_IMMED) {
      return (struct copy_src){
         .flags = IR3_REG_IMMED,
         .imm = reg->uim_val,
      };
   } else if (reg->flags & IR3_REG_CONST) {
      return (struct copy_src){
         .flags = IR3_REG_CONST,
         .const_num = reg->num,
      };
   } else {
      return (struct copy_src){
         .flags = 0,
         .reg = ra_reg_get_physreg(reg) + offset,
      };
   }
}

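/* Emit dst = src1 ^ src2 (OPC_XOR_B) immediately before "instr". Three of
 * these back to back implement an in-place register swap on generations
 * without swz.
 */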
static void
do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num,
       unsigned src2_num, unsigned flags)
{
   struct ir3_instruction * xor
      = ir3_instr_create(instr->block, OPC_XOR_B, 1, 2);
   ir3_dst_create(xor, dst_num, flags);
   ir3_src_create(xor, src1_num, flags);
   ir3_src_create(xor, src2_num, flags);

   ir3_instr_move_before(xor, instr);
}

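/* Emit instructions that swap the contents of entry->src and entry->dst,
 * inserting them before "instr".
 */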
static void
do_swap(struct ir3_compiler *compiler, struct ir3_instruction *instr,
        const struct copy_entry *entry)
{
   assert(!entry->src.flags);

   if (entry->flags & IR3_REG_HALF) {
      /* We currently make sure to never emit parallel copies where the
       * source/destination is a half-reg above the range accessible to half
       * registers. However, when a full-reg source overlaps a half-reg
       * destination or vice versa, it can be very, very complicated to come
       * up with a series of "legal" swaps and copies to resolve the
       * parallel copy. So here we provide a fallback to implement the
       * "illegal" swap instead. This may also be useful for implementing
       * "spilling" half-regs to the inaccessible space.
       */
      if (entry->src.reg >= RA_HALF_SIZE) {
         /* Choose a temporary that doesn't overlap src or dst */
         physreg_t tmp = entry->dst < 2 ? 2 : 0;

         /* Swap src and the temporary */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->src.reg & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });

         /* If src and dst are within the same full register, then swapping src
          * with tmp above will also move dst to tmp. Account for that here.
          */
         unsigned dst =
            (entry->src.reg & ~1u) == (entry->dst & ~1u) ?
            tmp + (entry->dst & 1u) : entry->dst;

         /* Do the original swap with src replaced with tmp */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = tmp + (entry->src.reg & 1)},
                    .dst = dst,
                    .flags = entry->flags,
                 });

         /* Swap src and the temporary back */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->src.reg & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });
         return;
      }

      /* If dst is not addressable, we only need to swap the arguments and
       * let the case above handle it.
       */
      if (entry->dst >= RA_HALF_SIZE) {
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst},
                    .dst = entry->src.reg,
                    .flags = entry->flags,
                 });
         return;
      }
   }

   unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
   unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

   /* a5xx+ is known to support swz, which lets us swap two registers
    * in-place. If it is unsupported, we emulate it using the xor trick.
    */
   if (compiler->gen < 5) {
      /* Shared regs only exist since a5xx, so we don't have to provide a
       * fallback path for them.
       */
      assert(!(entry->flags & IR3_REG_SHARED));
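      /* Classic three-xor in-place swap: after
       *    dst ^= src; src ^= dst; dst ^= src;
       * the two registers hold each other's original values.
       */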
      do_xor(instr, dst_num, dst_num, src_num, entry->flags);
      do_xor(instr, src_num, src_num, dst_num, entry->flags);
      do_xor(instr, dst_num, dst_num, src_num, entry->flags);
   } else {
      /* Use a macro for shared regs because any shared reg writes need to
       * be wrapped in a getone block to work correctly. Writing shared regs
       * with multiple threads active does not work, even if they all return
       * the same value.
       */
      unsigned opc =
         (entry->flags & IR3_REG_SHARED) ? OPC_SWZ_SHARED_MACRO : OPC_SWZ;
      struct ir3_instruction *swz = ir3_instr_create(instr->block, opc, 2, 2);
      ir3_dst_create(swz, dst_num, entry->flags);
      ir3_dst_create(swz, src_num, entry->flags);
      ir3_src_create(swz, src_num, entry->flags);
      ir3_src_create(swz, dst_num, entry->flags);
      swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      swz->repeat = 1;
      ir3_instr_move_before(swz, instr);
   }
}

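/* Emit instructions that copy entry->src (a register, immediate, or const)
 * into entry->dst, inserting them before "instr".
 */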
static void
do_copy(struct ir3_compiler *compiler, struct ir3_instruction *instr,
        const struct copy_entry *entry)
{
   if (entry->flags & IR3_REG_HALF) {
      /* See do_swap() for why this is here. */
      if (entry->dst >= RA_HALF_SIZE) {
         /* TODO: is there a hw instruction we can use for this case? */
         physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;

         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });

         /* As in do_swap(), account for src being swapped with tmp if
          * src and dst are in the same register.
          */
         struct copy_src src = entry->src;
         if (!src.flags && (src.reg & ~1u) == (entry->dst & ~1u))
            src.reg = tmp + (src.reg & 1u);

         do_copy(compiler, instr,
                 &(struct copy_entry){
                    .src = src,
                    .dst = tmp + (entry->dst & 1),
                    .flags = entry->flags,
                 });

         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });
         return;
      }

      if (!entry->src.flags && entry->src.reg >= RA_HALF_SIZE) {
         unsigned src_num = ra_physreg_to_num(entry->src.reg & ~1u,
                                              entry->flags & ~IR3_REG_HALF);
         unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

         if (entry->src.reg % 2 == 0) {
            /* cov.u32u16 dst, src */
            struct ir3_instruction *cov =
               ir3_instr_create(instr->block, OPC_MOV, 1, 1);
            ir3_dst_create(cov, dst_num, entry->flags);
            ir3_src_create(cov, src_num, entry->flags & ~IR3_REG_HALF);
            cov->cat1.dst_type = TYPE_U16;
            cov->cat1.src_type = TYPE_U32;
            ir3_instr_move_before(cov, instr);
         } else {
            /* shr.b dst, src, (16) */
            struct ir3_instruction *shr =
               ir3_instr_create(instr->block, OPC_SHR_B, 1, 2);
            ir3_dst_create(shr, dst_num, entry->flags);
            ir3_src_create(shr, src_num, entry->flags & ~IR3_REG_HALF);
            ir3_src_create(shr, 0, IR3_REG_IMMED)->uim_val = 16;
            ir3_instr_move_before(shr, instr);
         }
         return;
      }
   }

   unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
   unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

   /* Similar to the swap case, we have to use a macro for shared regs. */
   unsigned opc =
      (entry->flags & IR3_REG_SHARED) ? OPC_READ_FIRST_MACRO : OPC_MOV;
   struct ir3_instruction *mov = ir3_instr_create(instr->block, opc, 1, 1);
   ir3_dst_create(mov, dst_num, entry->flags);
   ir3_src_create(mov, src_num, entry->flags | entry->src.flags);
   mov->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
   mov->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
   if (entry->src.flags & IR3_REG_IMMED)
      mov->srcs[0]->uim_val = entry->src.imm;
   else if (entry->src.flags & IR3_REG_CONST)
      mov->srcs[0]->num = entry->src.const_num;
   ir3_instr_move_before(mov, instr);
}

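/* Bookkeeping for resolving one set of parallel copies within a single
 * register file.
 */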
struct copy_ctx {
   /* For each physreg, the number of pending copy entries that use it as a
    * source. Once this drops to zero, the physreg is unblocked and can be
    * written to.
    */
   unsigned physreg_use_count[RA_MAX_FILE_SIZE];

   /* For each physreg, the pending copy_entry that uses it as a dest. */
   struct copy_entry *physreg_dst[RA_MAX_FILE_SIZE];

   struct copy_entry entries[RA_MAX_FILE_SIZE];
   unsigned entry_count;
};

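/* A copy is blocked if another pending copy still reads from any physreg
 * covered by its destination.
 */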
static bool
entry_blocked(struct copy_entry *entry, struct copy_ctx *ctx)
{
   for (unsigned i = 0; i < copy_entry_size(entry); i++) {
      if (ctx->physreg_use_count[entry->dst + i] != 0)
         return true;
   }

   return false;
}

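/* Split a full-register copy into two half-register copies: the existing
 * entry is narrowed to the low half and a new entry is appended for the
 * high half.
 */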
static void
split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry)
{
   assert(!entry->done);
   assert(!(entry->src.flags & (IR3_REG_IMMED | IR3_REG_CONST)));
   assert(copy_entry_size(entry) == 2);
   struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++];

   new_entry->dst = entry->dst + 1;
   new_entry->src.flags = entry->src.flags;
   new_entry->src.reg = entry->src.reg + 1;
   new_entry->done = false;
   entry->flags |= IR3_REG_HALF;
   new_entry->flags = entry->flags;
   ctx->physreg_dst[entry->dst + 1] = new_entry;
}

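/* Resolve the parallel copies in ctx->entries, all of which must belong to
 * a single register file, by emitting moves and swaps before "instr".
 */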
static void
_handle_copies(struct ir3_compiler *compiler, struct ir3_instruction *instr,
               struct copy_ctx *ctx)
{
   /* Set up the bookkeeping */
   memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
   memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct copy_entry *entry = &ctx->entries[i];
      for (unsigned j = 0; j < copy_entry_size(entry); j++) {
         if (!entry->src.flags)
            ctx->physreg_use_count[entry->src.reg + j]++;

         /* Copies should not have overlapping destinations. */
         assert(!ctx->physreg_dst[entry->dst + j]);
         ctx->physreg_dst[entry->dst + j] = entry;
      }
   }

   bool progress = true;
   while (progress) {
      progress = false;

      /* Step 1: resolve paths in the transfer graph. This means finding
       * copies whose destinations aren't blocked by something else and then
       * emitting them, continuing this process until every copy is blocked
       * and there are only cycles left.
       *
       * TODO: We should note that src is also available in dst to unblock
       * cycles that src is involved in.
       */

      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct copy_entry *entry = &ctx->entries[i];
         if (!entry->done && !entry_blocked(entry, ctx)) {
            entry->done = true;
            progress = true;
            do_copy(compiler, instr, entry);
            for (unsigned j = 0; j < copy_entry_size(entry); j++) {
               if (!entry->src.flags)
                  ctx->physreg_use_count[entry->src.reg + j]--;
               ctx->physreg_dst[entry->dst + j] = NULL;
            }
         }
      }

      if (progress)
         continue;

      /* Step 2: Find partially blocked copies and split them. In the
       * mergedregs case, we can have 32-bit copies which are only blocked on
       * one 16-bit half, and splitting them helps get things moving.
       *
       * We can skip splitting copies if the source isn't a register,
       * however, because it does not unblock anything and therefore doesn't
       * contribute to making forward progress with step 1. These copies
       * should still be resolved eventually in step 1 because they can't be
       * part of a cycle.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct copy_entry *entry = &ctx->entries[i];
         if (entry->done || entry->flags & IR3_REG_HALF)
            continue;

         if (((ctx->physreg_use_count[entry->dst] == 0 ||
               ctx->physreg_use_count[entry->dst + 1] == 0)) &&
             !(entry->src.flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
            split_32bit_copy(ctx, entry);
            progress = true;
         }
      }
   }

   /* Step 3: resolve cycles through swapping.
    *
    * At this point, the transfer graph should consist of only cycles.
    * The reason is that, given any physreg n_1 that's the source of a
    * remaining entry, it has a destination n_2, which (because every
    * copy is blocked) is the source of some other copy whose destination
    * is n_3, and so we can follow the chain until we get a cycle. If we
    * reached some node other than n_1:
    *
    *  n_1 -> n_2 -> ... -> n_i
    *          ^             |
    *          |-------------|
    *
    *  then n_2 would be the destination of 2 copies, which is illegal
    *  (checked above in an assert). So n_1 must be part of a cycle:
    *
    *  n_1 -> n_2 -> ... -> n_i
    *  ^                     |
    *  |---------------------|
    *
    *  and this must be the only cycle n_1 is involved in, because any other
    *  path starting from n_1 would also have to end in n_1, resulting in
    *  a node somewhere along the way being the destination of 2 copies
    *  when the 2 paths merge.
    *
    *  The way we resolve the cycle is through picking a copy (n_1, n_2)
    *  and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
    *  out of the cycle:
    *
    *  n_1 -> ... -> n_i
    *  ^              |
    *  |--------------|
    *
    *  and we can keep repeating this until the cycle is empty.
    */

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct copy_entry *entry = &ctx->entries[i];
      if (entry->done)
         continue;

      assert(!entry->src.flags);

      /* catch trivial copies */
      if (entry->dst == entry->src.reg) {
         entry->done = true;
         continue;
      }

      do_swap(compiler, instr, entry);

      /* Split any blocking copies whose sources are only partially
       * contained within our destination.
       */
      if (entry->flags & IR3_REG_HALF) {
         for (unsigned j = 0; j < ctx->entry_count; j++) {
            struct copy_entry *blocking = &ctx->entries[j];

            if (blocking->done)
               continue;

            if (blocking->src.reg <= entry->dst &&
                blocking->src.reg + 1 >= entry->dst &&
                !(blocking->flags & IR3_REG_HALF)) {
               split_32bit_copy(ctx, blocking);
            }
         }
      }

      /* Update sources of blocking copies.
       *
       * Note: at this point, every blocking copy's source should be
       * contained within our destination.
       */
      for (unsigned j = 0; j < ctx->entry_count; j++) {
         struct copy_entry *blocking = &ctx->entries[j];
         if (blocking->src.reg >= entry->dst &&
             blocking->src.reg < entry->dst + copy_entry_size(entry)) {
            blocking->src.reg =
               entry->src.reg + (blocking->src.reg - entry->dst);
         }
      }

      entry->done = true;
   }
}

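/* Split the copy entries per register file and resolve each file
 * independently: shared regs first, then half/full regs (or the merged
 * file on mergedregs variants).
 */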
static void
handle_copies(struct ir3_shader_variant *v, struct ir3_instruction *instr,
              struct copy_entry *entries, unsigned entry_count)
{
   struct copy_ctx ctx;

   /* handle shared copies first */
   ctx.entry_count = 0;
   for (unsigned i = 0; i < entry_count; i++) {
      if (entries[i].flags & IR3_REG_SHARED)
         ctx.entries[ctx.entry_count++] = entries[i];
   }
   _handle_copies(v->compiler, instr, &ctx);

   if (v->mergedregs) {
      /* Half regs and full regs are in the same file, so handle everything
       * at once.
       */
      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (!(entries[i].flags & IR3_REG_SHARED))
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->compiler, instr, &ctx);
   } else {
      /* There may be both half copies and full copies, so we have to split
       * them up and handle them separately, since they don't interfere.
       */
      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (entries[i].flags & IR3_REG_HALF)
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->compiler, instr, &ctx);

      ctx.entry_count = 0;
      for (unsigned i = 0; i < entry_count; i++) {
         if (!(entries[i].flags & (IR3_REG_HALF | IR3_REG_SHARED)))
            ctx.entries[ctx.entry_count++] = entries[i];
      }
      _handle_copies(v->compiler, instr, &ctx);
   }
}

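/* Lower parallelcopy, collect, and split meta instructions left after
 * register allocation into actual moves and swaps, and delete phi meta
 * instructions.
 */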
void
ir3_lower_copies(struct ir3_shader_variant *v)
{
   DECLARE_ARRAY(struct copy_entry, copies);
   copies_count = copies_sz = 0;
   copies = NULL;

   foreach_block (block, &v->ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->opc == OPC_META_PARALLEL_COPY) {
            copies_count = 0;
            for (unsigned i = 0; i < instr->dsts_count; i++) {
               struct ir3_register *dst = instr->dsts[i];
               struct ir3_register *src = instr->srcs[i];
               unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
               unsigned dst_physreg = ra_reg_get_physreg(dst);
               for (unsigned j = 0; j < reg_elems(dst); j++) {
                  array_insert(
                     NULL, copies,
                     (struct copy_entry){
                        .dst = dst_physreg + j * reg_elem_size(dst),
                        .src = get_copy_src(src, j * reg_elem_size(dst)),
                        .flags = flags,
                     });
               }
            }
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_COLLECT) {
            copies_count = 0;
            struct ir3_register *dst = instr->dsts[0];
            unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
            for (unsigned i = 0; i < instr->srcs_count; i++) {
               struct ir3_register *src = instr->srcs[i];
               array_insert(NULL, copies,
                            (struct copy_entry){
                               .dst = ra_num_to_physreg(dst->num + i, flags),
                               .src = get_copy_src(src, 0),
                               .flags = flags,
                            });
            }
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_SPLIT) {
            copies_count = 0;
            struct ir3_register *dst = instr->dsts[0];
            struct ir3_register *src = instr->srcs[0];
            unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
            array_insert(NULL, copies,
                         (struct copy_entry){
                            .dst = ra_reg_get_physreg(dst),
                            .src = get_copy_src(
                               src, instr->split.off * reg_elem_size(dst)),
                            .flags = flags,
                         });
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_PHI) {
            list_del(&instr->node);
         }
      }
   }

   if (copies)
      ralloc_free(copies);
}