/*
 * Copyright 2022 Alyssa Rosenzweig
 * Copyright 2021 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "agx_builder.h"
#include "agx_compiler.h"

UNUSED static void
print_copy(const struct agx_copy *cp)
{
   printf("%sr%u = ", cp->dest_mem ? "m" : "", cp->dest);
   agx_print_index(cp->src, false, stdout);
   printf("\n");
}

UNUSED static void
print_copies(const struct agx_copy *copies, unsigned nr)
{
   printf("[\n");

   for (unsigned i = 0; i < nr; ++i) {
      printf("  ");
      print_copy(&copies[i]);
   }

   printf("]\n");
}

/*
 * Emits code for
 *
 *    for (int i = 0; i < n; ++i)
 *       registers[dests[i]] = registers[srcs[i]];
 *
 * ...with all copies happening in parallel.
 *
 * That is, emit machine instructions equivalent to a parallel copy. This is
 * used to lower not only parallel copies but also collects and splits, which
 * also have parallel copy semantics.
 *
 * We only handle register-register copies, not general agx_index sources. This
 * suffices for its internal use in register allocation.
 */
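/* For example, the parallel copy { r0 = r1, r1 = r0 } forms a cycle: neither
 * move can be emitted first without clobbering the other copy's source, so it
 * has to be resolved with a swap (see step 3 below).
 */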

static void
do_copy(agx_builder *b, const struct agx_copy *copy)
{
   agx_index dst = copy->dest_mem
                      ? agx_memory_register(copy->dest, copy->src.size)
                      : agx_register(copy->dest, copy->src.size);

   if (copy->dest_mem && copy->src.memory) {
      /* Memory-memory copies need to be lowered to memory-register and
       * register-memory, using a reserved scratch register.
       */
      agx_index scratch_reg = agx_register(2, copy->src.size);
      agx_mov_to(b, scratch_reg, copy->src);
      agx_mov_to(b, dst, scratch_reg);
   } else if (copy->src.type == AGX_INDEX_IMMEDIATE) {
      agx_mov_imm_to(b, dst, copy->src.value);
   } else {
      agx_mov_to(b, dst, copy->src);
   }
}

static void
do_swap(agx_builder *b, const struct agx_copy *copy)
{
   assert(copy->src.type == AGX_INDEX_REGISTER && "only GPRs are swapped");

   if (copy->dest == copy->src.value)
      return;

   agx_index x = copy->dest_mem
                    ? agx_memory_register(copy->dest, copy->src.size)
                    : agx_register(copy->dest, copy->src.size);
   agx_index y = copy->src;
   assert(x.memory == y.memory);

   /* Memory-memory swaps lowered here, GPR swaps lowered later */
   if (x.memory) {
      agx_index temp1 = agx_register(4, copy->src.size);
      agx_index temp2 = agx_register(6, copy->src.size);

      agx_mov_to(b, temp1, x);
      agx_mov_to(b, temp2, y);
      agx_mov_to(b, y, temp1);
      agx_mov_to(b, x, temp2);
   } else {
      agx_swap(b, x, y);
   }
}

struct copy_ctx {
   /* Number of copies being processed */
   unsigned entry_count;

   /* For each physreg, the number of pending copy entries that use it as a
    * source. Once this drops to zero, then the physreg is unblocked and can
    * be moved to.
    */
   unsigned physreg_use_count[AGX_NUM_MODELED_REGS];

   /* For each physreg, the pending copy_entry that uses it as a dest. */
   struct agx_copy *physreg_dest[AGX_NUM_MODELED_REGS];

   struct agx_copy entries[AGX_NUM_MODELED_REGS];
};
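/* Physreg indices here are in 16-bit units: agx_size_align_16() gives the
 * number of consecutive slots a copy touches, so a 32-bit copy occupies two
 * adjacent entries in the arrays above.
 */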

static bool
entry_blocked(struct agx_copy *entry, struct copy_ctx *ctx)
{
   for (unsigned i = 0; i < agx_size_align_16(entry->src.size); i++) {
      if (ctx->physreg_use_count[entry->dest + i] != 0)
         return true;
   }

   return false;
}

static bool
is_real(struct agx_copy *entry)
{
   return entry->src.type == AGX_INDEX_REGISTER &&
          entry->dest_mem == entry->src.memory;
}

/* TODO: Generalize to other bit sizes */
static void
split_32bit_copy(struct copy_ctx *ctx, struct agx_copy *entry)
{
   assert(!entry->done);
   assert(is_real(entry));
   assert(agx_size_align_16(entry->src.size) == 2);
   struct agx_copy *new_entry = &ctx->entries[ctx->entry_count++];

   new_entry->dest = entry->dest + 1;
   new_entry->dest_mem = entry->dest_mem;
   new_entry->src = entry->src;
   new_entry->src.value += 1;
   new_entry->done = false;
   entry->src.size = AGX_SIZE_16;
   new_entry->src.size = AGX_SIZE_16;
   ctx->physreg_dest[entry->dest + 1] = new_entry;
}

static void
agx_emit_parallel_copies_for_class(agx_builder *b, struct agx_copy *copies,
                                   unsigned num_copies, bool cls)
{
   /* First, lower away 64-bit copies to smaller chunks, since there is no
    * 64-bit ALU and we therefore always want to split.
    */
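   /* For instance, a 64-bit copy with destination d and register source s
    * becomes two 32-bit copies, (d, s) and (d + 2, s + 2); for a 64-bit
    * immediate, the high half is written as zero since the immediate field
    * only holds 32 bits (see the static_assert below).
    */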
   struct agx_copy *copies2 = calloc(sizeof(copies[0]), num_copies * 2);
   unsigned num_copies2 = 0;

   for (unsigned i = 0; i < num_copies; ++i) {
      struct agx_copy copy = copies[i];

      /* Filter by class */
      if (copy.dest_mem != cls)
         continue;

      assert(copy.dest < AGX_NUM_MODELED_REGS);

      if (copy.src.size == AGX_SIZE_64) {
         copy.src.size = AGX_SIZE_32;
         copies2[num_copies2++] = copy;

         if (copy.src.type == AGX_INDEX_IMMEDIATE) {
            static_assert(sizeof(copy.src.value) * 8 == 32, "known size");
            copy.src.value = 0;
         } else {
            assert(copy.src.type == AGX_INDEX_REGISTER ||
                   copy.src.type == AGX_INDEX_UNIFORM);

            copy.src.value += 2;
         }

         copy.dest += 2;
         copies2[num_copies2++] = copy;
      } else {
         copies2[num_copies2++] = copy;
      }
   }

   copies = copies2;
   num_copies = num_copies2;

   /* Set up the bookkeeping */
   struct copy_ctx _ctx = {.entry_count = num_copies};
   struct copy_ctx *ctx = &_ctx;

   memset(ctx->physreg_dest, 0, sizeof(ctx->physreg_dest));
   memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &copies[i];

      ctx->entries[i] = *entry;

      for (unsigned j = 0; j < agx_size_align_16(entry->src.size); j++) {
         if (is_real(entry))
            ctx->physreg_use_count[entry->src.value + j]++;

         /* Copies should not have overlapping destinations. */
         assert(!ctx->physreg_dest[entry->dest + j]);
         ctx->physreg_dest[entry->dest + j] = &ctx->entries[i];
      }
   }

   /* Try to vectorize aligned 16-bit copies to use 32-bit operations instead */
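   /* For example, 16-bit copies r0 = r2 and r1 = r3, with even destination
    * and source, are fused into a single 32-bit copy r0 = r2, and the second
    * entry is marked done.
    */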
   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &ctx->entries[i];
      if (entry->src.size != AGX_SIZE_16)
         continue;

      if ((entry->dest & 1) || (entry->src.value & 1))
         continue;

      if (entry->src.type != AGX_INDEX_UNIFORM &&
          entry->src.type != AGX_INDEX_REGISTER)
         continue;

      unsigned next_dest = entry->dest + 1;
      assert(next_dest < ARRAY_SIZE(ctx->physreg_dest) && "aligned reg");

      struct agx_copy *next_copy = ctx->physreg_dest[next_dest];
      if (!next_copy)
         continue;

      assert(next_copy->dest == next_dest && "data structure invariant");
      assert(next_copy->src.size == AGX_SIZE_16 && "unaligned copy");

      if (next_copy->src.type != entry->src.type)
         continue;

      if (next_copy->src.value != (entry->src.value + 1))
         continue;

      /* Vectorize the copies */
      ctx->physreg_dest[next_dest] = entry;
      entry->src.size = AGX_SIZE_32;
      next_copy->done = true;
   }

   bool progress = true;
   while (progress) {
      progress = false;

      /* Step 1: resolve paths in the transfer graph. This means finding
       * copies whose destinations aren't blocked by something else and then
       * emitting them, continuing this process until every copy is blocked
       * and there are only cycles left.
       *
       * TODO: We should note that src is also available in dest to unblock
       * cycles that src is involved in.
       */
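      /* For example, with pending copies r0 = r1 and r1 = r2, the copy into
       * r1 is blocked (r1 is still a pending source) but r0 = r1 is not, so
       * it is emitted first; that drops r1's use count and unblocks r1 = r2.
       */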

      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct agx_copy *entry = &ctx->entries[i];
         if (!entry->done && !entry_blocked(entry, ctx)) {
            entry->done = true;
            progress = true;
            do_copy(b, entry);
            for (unsigned j = 0; j < agx_size_align_16(entry->src.size); j++) {
               if (is_real(entry))
                  ctx->physreg_use_count[entry->src.value + j]--;
               ctx->physreg_dest[entry->dest + j] = NULL;
            }
         }
      }

      if (progress)
         continue;

      /* Step 2: Find partially blocked copies and split them. In the
       * mergedregs case, we can have 32-bit copies which are only blocked on
       * one 16-bit half, and splitting them helps get things moving.
       *
       * We can skip splitting copies if the source isn't a register,
       * however, because it does not unblock anything and therefore doesn't
       * contribute to making forward progress with step 1. These copies
       * should still be resolved eventually in step 1 because they can't be
       * part of a cycle.
       */
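      /* For example, a 32-bit copy whose low destination half is free but
       * whose high half is still a pending source is split into two 16-bit
       * copies; the free half is emitted on the next pass of step 1, which
       * may in turn unblock other copies.
       */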
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct agx_copy *entry = &ctx->entries[i];
         if (entry->done || (agx_size_align_16(entry->src.size) != 2))
            continue;

         if (((ctx->physreg_use_count[entry->dest] == 0 ||
               ctx->physreg_use_count[entry->dest + 1] == 0)) &&
             is_real(entry)) {
            split_32bit_copy(ctx, entry);
            progress = true;
         }
      }
   }

   /* Step 3: resolve cycles through swapping.
    *
    * At this point, the transfer graph should consist of only cycles.
    * The reason is that, given any physreg n_1 that's the source of a
    * remaining entry, it has a destination n_2, which (because every
    * copy is blocked) is the source of some other copy whose destination
    * is n_3, and so we can follow the chain until we get a cycle. If we
    * reached some other node than n_1:
    *
    *  n_1 -> n_2 -> ... -> n_i
    *          ^             |
    *          |-------------|
    *
    *  then n_2 would be the destination of 2 copies, which is illegal
    *  (checked above in an assert). So n_1 must be part of a cycle:
    *
    *  n_1 -> n_2 -> ... -> n_i
    *  ^                     |
    *  |---------------------|
    *
    *  and this must be the only cycle n_1 is involved in, because any other
    *  path starting from n_1 would also have to end in n_1, resulting in
    *  a node somewhere along the way being the destination of 2 copies
    *  when the 2 paths merge.
    *
    *  The way we resolve the cycle is through picking a copy (n_1, n_2)
    *  and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
    *  out of the cycle:
    *
    *  n_1 -> ... -> n_i
    *  ^              |
    *  |--------------|
    *
    *  and we can keep repeating this until the cycle is empty.
    */
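   /* For example, the 2-cycle { r0 = r1, r1 = r0 } is resolved with a single
    * swap. A 3-cycle { r0 = r1, r1 = r2, r2 = r0 } takes two swaps: after
    * swapping r0 and r1, the remaining copy reading r0 is redirected to read
    * r1 (see the source-update loop below), leaving a 2-cycle.
    */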

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &ctx->entries[i];
      if (entry->done)
         continue;

      assert(is_real(entry));

      /* catch trivial copies */
      if (entry->dest == entry->src.value) {
         entry->done = true;
         continue;
      }

      do_swap(b, entry);

      /* Split any blocking copies whose sources are only partially
       * contained within our destination.
       */
      if (agx_size_align_16(entry->src.size) == 1) {
         for (unsigned j = 0; j < ctx->entry_count; j++) {
            struct agx_copy *blocking = &ctx->entries[j];

            if (blocking->done)
               continue;

            if (blocking->src.value <= entry->dest &&
                blocking->src.value + 1 >= entry->dest &&
                agx_size_align_16(blocking->src.size) == 2) {
               split_32bit_copy(ctx, blocking);
            }
         }
      }

      /* Update sources of blocking copies.
       *
       * Note: at this point, every blocking copy's source should be
       * contained within our destination.
       */
      for (unsigned j = 0; j < ctx->entry_count; j++) {
         struct agx_copy *blocking = &ctx->entries[j];
         if (blocking->src.value >= entry->dest &&
             blocking->src.value <
                entry->dest + agx_size_align_16(entry->src.size)) {
            blocking->src.value =
               entry->src.value + (blocking->src.value - entry->dest);
         }
      }

      entry->done = true;
   }

   free(copies2);
}

void
agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
                         unsigned num_copies)
{
   /* Emit copies for each register class separately because we don't have
    * register class awareness in the parallel copy lowering data structure.
    */
   agx_emit_parallel_copies_for_class(b, copies, num_copies, false);
   agx_emit_parallel_copies_for_class(b, copies, num_copies, true);
}