/*
 * Copyright 2022 Alyssa Rosenzweig
 * Copyright 2021 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "agx_builder.h"
#include "agx_compiler.h"

UNUSED static void
print_copy(const struct agx_copy *cp)
{
   printf("%sr%u = ", cp->dest_mem ? "m" : "", cp->dest);
   agx_print_index(cp->src, false, stdout);
   printf("\n");
}

UNUSED static void
print_copies(const struct agx_copy *copies, unsigned nr)
{
   printf("[\n");

   for (unsigned i = 0; i < nr; ++i) {
      printf("  ");
      print_copy(&copies[i]);
   }

   printf("]\n");
}

/*
 * Emits code for
 *
 *    for (int i = 0; i < n; ++i)
 *       registers[dests[i]] = registers[srcs[i]];
 *
 * ...with all copies happening in parallel.
 *
 * That is, emit machine instructions equivalent to a parallel copy. This is
 * used to lower not only parallel copies but also collects and splits, which
 * also have parallel copy semantics.
 *
 * We only handle register-register copies, not general agx_index sources. This
 * suffices for its use within register allocation.
 */

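/*
 * For example, the parallel copy { r0 := r1, r1 := r0 } is a swap: emitting
 * it as two sequential moves would clobber one of the sources, so such
 * cycles are instead resolved with physical swaps (see do_swap and step 3
 * below).
 */
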
static agx_index
scratch_slot(agx_context *ctx, enum agx_size size)
{
   /* Reserve scratch slots. scratch_size is in bytes, spill_pcopy_base and
    * agx_memory_register are in memory registers (16-bit elements).
    */
   if (!ctx->has_spill_pcopy_reserved) {
      ctx->scratch_size = align(ctx->scratch_size, 16);
      ctx->spill_pcopy_base = ctx->scratch_size / 2;
      ctx->scratch_size += 8;
      ctx->has_spill_pcopy_reserved = true;
   }

   return agx_memory_register(ctx->spill_pcopy_base, size);
}

static void
do_copy(agx_builder *b, const struct agx_copy *copy)
{
   agx_index dst = copy->dest_mem
                      ? agx_memory_register(copy->dest, copy->src.size)
                      : agx_register(copy->dest, copy->src.size);

   if (copy->dest_mem && copy->src.memory) {
      /* Memory-memory copies need to be lowered to memory-register and
       * register-memory, spilling a GPR to an auxiliary memory slot. This
       * avoids needing to reserve a scratch register for this edge case.
       */
      agx_index scratch_reg = agx_register(0, copy->src.size);
      agx_index scratch_mem = scratch_slot(b->shader, copy->src.size);

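      /* Save the GPR, bounce the value through it, and then restore the GPR */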
      agx_mov_to(b, scratch_mem, scratch_reg);
      agx_mov_to(b, scratch_reg, copy->src);
      agx_mov_to(b, dst, scratch_reg);
      agx_mov_to(b, scratch_reg, scratch_mem);
   } else if (copy->src.type == AGX_INDEX_IMMEDIATE) {
      agx_mov_imm_to(b, dst, copy->src.value);
   } else {
      agx_mov_to(b, dst, copy->src);
   }
}

static void
do_swap(agx_builder *b, const struct agx_copy *copy)
{
   assert(copy->src.type == AGX_INDEX_REGISTER && "only GPRs are swapped");

   if (copy->dest == copy->src.value)
      return;

   /* We can swap lo/hi halves of a 32-bit register with a 32-bit extr */
   if (copy->src.size == AGX_SIZE_16 &&
       (copy->dest >> 1) == (copy->src.value >> 1) && !copy->dest_mem) {

      assert(((copy->dest & 1) == (1 - (copy->src.value & 1))) &&
             "no trivial swaps, and only 2 halves of a register");

      /* r0 = extr r0, r0, #16
       *    = (((r0 << 32) | r0) >> 16) & 0xFFFFFFFF
       *    = (((r0 << 32) >> 16) & 0xFFFFFFFF) | (r0 >> 16)
       *    = (r0l << 16) | r0h
       */
      agx_index reg32 = agx_register(copy->dest & ~1, AGX_SIZE_32);
      agx_extr_to(b, reg32, reg32, reg32, agx_immediate(16), 0);
      return;
   }

   agx_index x = copy->dest_mem
                    ? agx_memory_register(copy->dest, copy->src.size)
                    : agx_register(copy->dest, copy->src.size);
   agx_index y = copy->src;

   /* Memory-memory swaps need to be lowered */
   assert(x.memory == y.memory);
   if (x.memory) {
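      /* Preserve the GPRs used as temporaries by spilling them to a widened
       * scratch slot, swap the two memory values through them, and restore
       * the GPRs afterwards.
       */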
      agx_index temp1 = agx_register(0, copy->src.size);
      agx_index temp2 = agx_register(2, copy->src.size);

      agx_index scratch_reg2 = agx_register(0, copy->src.size);
      agx_index scratch_mem2 = scratch_slot(b->shader, copy->src.size);
      scratch_reg2.channels_m1++;
      scratch_mem2.channels_m1++;

      agx_mov_to(b, scratch_mem2, scratch_reg2);
      agx_mov_to(b, temp1, x);
      agx_mov_to(b, temp2, y);
      agx_mov_to(b, y, temp1);
      agx_mov_to(b, x, temp2);
      agx_mov_to(b, scratch_reg2, scratch_mem2);
      return;
   }

   /* Otherwise, we're swapping GPRs and fall back on an XOR swap. */
   agx_xor_to(b, x, x, y);
   agx_xor_to(b, y, x, y);
   agx_xor_to(b, x, x, y);
}

struct copy_ctx {
   /* Number of copies being processed */
   unsigned entry_count;

   /* For each physreg, the number of pending copy entries that use it as a
    * source. Once this drops to zero, then the physreg is unblocked and can
    * be moved to.
    */
   unsigned physreg_use_count[AGX_NUM_MODELED_REGS];

   /* For each physreg, the pending copy_entry that uses it as a dest. */
   struct agx_copy *physreg_dest[AGX_NUM_MODELED_REGS];

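   /* Copies being lowered; new entries are appended when copies are split. */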
   struct agx_copy entries[AGX_NUM_MODELED_REGS];
};

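/* A copy is blocked if any register it writes is still needed as the source
 * of another pending copy.
 */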
static bool
entry_blocked(struct agx_copy *entry, struct copy_ctx *ctx)
{
   for (unsigned i = 0; i < agx_size_align_16(entry->src.size); i++) {
      if (ctx->physreg_use_count[entry->dest + i] != 0)
         return true;
   }

   return false;
}

static bool
is_real(struct agx_copy *entry)
{
   return entry->src.type == AGX_INDEX_REGISTER &&
          entry->dest_mem == entry->src.memory;
}

/* TODO: Generalize to other bit sizes */
static void
split_32bit_copy(struct copy_ctx *ctx, struct agx_copy *entry)
{
   assert(!entry->done);
   assert(is_real(entry));
   assert(agx_size_align_16(entry->src.size) == 2);
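
   /* Shrink the existing entry to the low 16-bit half and append a new entry
    * for the high half.
    */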
   struct agx_copy *new_entry = &ctx->entries[ctx->entry_count++];

   new_entry->dest = entry->dest + 1;
   new_entry->dest_mem = entry->dest_mem;
   new_entry->src = entry->src;
   new_entry->src.value += 1;
   new_entry->done = false;
   entry->src.size = AGX_SIZE_16;
   new_entry->src.size = AGX_SIZE_16;
   ctx->physreg_dest[entry->dest + 1] = new_entry;
}

static void
agx_emit_parallel_copies_for_class(agx_builder *b, struct agx_copy *copies,
                                   unsigned num_copies, bool cls)
{
   /* First, lower away 64-bit copies to smaller chunks, since we don't have a
    * 64-bit ALU and so always want to split.
    */
   struct agx_copy *copies2 = calloc(sizeof(copies[0]), num_copies * 2);
   unsigned num_copies2 = 0;

   for (unsigned i = 0; i < num_copies; ++i) {
      struct agx_copy copy = copies[i];

      /* Filter by class */
      if (copy.dest_mem != cls)
         continue;

      assert(copy.dest < AGX_NUM_MODELED_REGS);

      if (copy.src.size == AGX_SIZE_64) {
         copy.src.size = AGX_SIZE_32;
         copies2[num_copies2++] = copy;

         if (copy.src.type == AGX_INDEX_IMMEDIATE) {
            static_assert(sizeof(copy.src.value) * 8 == 32, "known size");
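            /* Immediates are at most 32 bits, so the high half is zero */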
            copy.src.value = 0;
         } else {
            assert(copy.src.type == AGX_INDEX_REGISTER ||
                   copy.src.type == AGX_INDEX_UNIFORM);

            copy.src.value += 2;
         }

         copy.dest += 2;
         copies2[num_copies2++] = copy;
      } else {
         copies2[num_copies2++] = copy;
      }
   }

   copies = copies2;
   num_copies = num_copies2;

   /* Set up the bookkeeping */
   struct copy_ctx _ctx = {.entry_count = num_copies};
   struct copy_ctx *ctx = &_ctx;

   memset(ctx->physreg_dest, 0, sizeof(ctx->physreg_dest));
   memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &copies[i];

      ctx->entries[i] = *entry;

      for (unsigned j = 0; j < agx_size_align_16(entry->src.size); j++) {
         if (is_real(entry))
            ctx->physreg_use_count[entry->src.value + j]++;

         /* Copies should not have overlapping destinations. */
         assert(!ctx->physreg_dest[entry->dest + j]);
         ctx->physreg_dest[entry->dest + j] = &ctx->entries[i];
      }
   }

   /* Try to vectorize aligned 16-bit copies to use 32-bit operations instead */
   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &ctx->entries[i];
      if (entry->src.size != AGX_SIZE_16)
         continue;

      if ((entry->dest & 1) || (entry->src.value & 1))
         continue;

      if (entry->src.type != AGX_INDEX_UNIFORM &&
          entry->src.type != AGX_INDEX_REGISTER)
         continue;

      unsigned next_dest = entry->dest + 1;
      assert(next_dest < ARRAY_SIZE(ctx->physreg_dest) && "aligned reg");

      struct agx_copy *next_copy = ctx->physreg_dest[next_dest];
      if (!next_copy)
         continue;

      assert(next_copy->dest == next_dest && "data structure invariant");
      assert(next_copy->src.size == AGX_SIZE_16 && "unaligned copy");

      if (next_copy->src.type != entry->src.type)
         continue;

      if (next_copy->src.value != (entry->src.value + 1))
         continue;

      /* Vectorize the copies */
      ctx->physreg_dest[next_dest] = entry;
      entry->src.size = AGX_SIZE_32;
      next_copy->done = true;
   }

   bool progress = true;
   while (progress) {
      progress = false;

      /* Step 1: resolve paths in the transfer graph. This means finding
       * copies whose destinations aren't blocked by something else and then
       * emitting them, continuing this process until every copy is blocked
       * and there are only cycles left.
       *
       * TODO: We should note that src is also available in dest to unblock
       * cycles that src is involved in.
       */

      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct agx_copy *entry = &ctx->entries[i];
         if (!entry->done && !entry_blocked(entry, ctx)) {
            entry->done = true;
            progress = true;
            do_copy(b, entry);
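            /* The copy has been emitted: release its source registers so
             * copies targeting them become unblocked, and clear its
             * destination bookkeeping.
             */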
            for (unsigned j = 0; j < agx_size_align_16(entry->src.size); j++) {
               if (is_real(entry))
                  ctx->physreg_use_count[entry->src.value + j]--;
               ctx->physreg_dest[entry->dest + j] = NULL;
            }
         }
      }

      if (progress)
         continue;

      /* Step 2: Find partially blocked copies and split them. In the
       * mergedregs case, we can have 32-bit copies that are only blocked on
       * one 16-bit half, and splitting them helps get things moving.
       *
       * We can skip splitting copies if the source isn't a register,
       * however, because it does not unblock anything and therefore doesn't
       * contribute to making forward progress with step 1. These copies
       * should still be resolved eventually in step 1 because they can't be
       * part of a cycle.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct agx_copy *entry = &ctx->entries[i];
         if (entry->done || (agx_size_align_16(entry->src.size) != 2))
            continue;

         if (((ctx->physreg_use_count[entry->dest] == 0 ||
               ctx->physreg_use_count[entry->dest + 1] == 0)) &&
             is_real(entry)) {
            split_32bit_copy(ctx, entry);
            progress = true;
         }
      }
   }

   /* Step 3: resolve cycles through swapping.
    *
    * At this point, the transfer graph should consist of only cycles.
    * The reason is that, given any physreg n_1 that's the source of a
    * remaining entry, it has a destination n_2, which (because every
    * copy is blocked) is the source of some other copy whose destination
    * is n_3, and so we can follow the chain until we get a cycle. If we
    * reached some other node than n_1:
    *
    *  n_1 -> n_2 -> ... -> n_i
    *          ^             |
    *          |-------------|
    *
    *  then n_2 would be the destination of 2 copies, which is illegal
    *  (checked above in an assert). So n_1 must be part of a cycle:
    *
    *  n_1 -> n_2 -> ... -> n_i
    *  ^                     |
    *  |---------------------|
    *
    *  and this must be the only cycle n_1 is involved in, because any other
    *  path starting from n_1 would also have to end in n_1, resulting in
    *  a node somewhere along the way being the destination of 2 copies
    *  when the 2 paths merge.
    *
    *  The way we resolve the cycle is through picking a copy (n_1, n_2)
    *  and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
    *  out of the cycle:
    *
    *  n_1 -> ... -> n_i
    *  ^              |
    *  |--------------|
    *
    *  and we can keep repeating this until the cycle is empty.
    */

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &ctx->entries[i];
      if (entry->done)
         continue;

      assert(is_real(entry));

      /* catch trivial copies */
      if (entry->dest == entry->src.value) {
         entry->done = true;
         continue;
      }

      do_swap(b, entry);

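      /* After the swap, the value that previously lived in our destination
       * now lives in our source, so pending copies that read the destination
       * must be redirected to read the source instead.
       */
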
      /* Split any blocking copies whose sources are only partially
       * contained within our destination.
       */
      if (agx_size_align_16(entry->src.size) == 1) {
         for (unsigned j = 0; j < ctx->entry_count; j++) {
            struct agx_copy *blocking = &ctx->entries[j];

            if (blocking->done)
               continue;

            if (blocking->src.value <= entry->dest &&
                blocking->src.value + 1 >= entry->dest &&
                agx_size_align_16(blocking->src.size) == 2) {
               split_32bit_copy(ctx, blocking);
            }
         }
      }

      /* Update sources of blocking copies.
       *
       * Note: at this point, every blocking copy's source should be
       * contained within our destination.
       */
      for (unsigned j = 0; j < ctx->entry_count; j++) {
         struct agx_copy *blocking = &ctx->entries[j];
         if (blocking->src.value >= entry->dest &&
             blocking->src.value <
                entry->dest + agx_size_align_16(entry->src.size)) {
            blocking->src.value =
               entry->src.value + (blocking->src.value - entry->dest);
         }
      }

      entry->done = true;
   }

   free(copies2);
}

void
agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
                         unsigned num_copies)
{
   /* Emit copies for each register class separately because we don't have
    * register class awareness in the parallel copy lowering data structure.
    */
   agx_emit_parallel_copies_for_class(b, copies, num_copies, false);
   agx_emit_parallel_copies_for_class(b, copies, num_copies, true);
}
452