/*
 * Copyright 2022 Alyssa Rosenzweig
 * Copyright 2021 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "agx_builder.h"
#include "agx_compiler.h"

UNUSED static void
print_copy(const struct agx_copy *cp)
{
   printf("%sr%u = ", cp->dest_mem ? "m" : "", cp->dest);
   agx_print_index(cp->src, false, stdout);
   printf("\n");
}

UNUSED static void
print_copies(const struct agx_copy *copies, unsigned nr)
{
   printf("[\n");

   for (unsigned i = 0; i < nr; ++i) {
      printf(" ");
      print_copy(&copies[i]);
   }

   printf("]\n");
}

/*
 * Emits code for
 *
 *    for (int i = 0; i < n; ++i)
 *       registers[dests[i]] = registers[srcs[i]];
 *
 * ...with all copies happening in parallel.
 *
 * That is, emit machine instructions equivalent to a parallel copy. This is
 * used to lower not only parallel copies but also collects and splits, which
 * also have parallel copy semantics.
 *
 * We only handle register-register copies, not general agx_index sources. This
 * suffices for its internal use in register allocation.
 */
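
/* For example, the parallel copy pair
 *
 *    registers[0] = registers[1];
 *    registers[1] = registers[0];
 *
 * is a swap: both sources are read before either destination is written, so
 * lowering it as two sequential moves would clobber one of the values.
 */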

static void
do_copy(agx_builder *b, const struct agx_copy *copy)
{
   agx_index dst = copy->dest_mem
                      ? agx_memory_register(copy->dest, copy->src.size)
                      : agx_register(copy->dest, copy->src.size);

   if (copy->dest_mem && copy->src.memory) {
      /* Memory-memory copies need to be lowered to memory-register and
       * register-memory, using a reserved scratch register.
       */
      agx_index scratch_reg = agx_register(2, copy->src.size);
      agx_mov_to(b, scratch_reg, copy->src);
      agx_mov_to(b, dst, scratch_reg);
   } else if (copy->src.type == AGX_INDEX_IMMEDIATE) {
      agx_mov_imm_to(b, dst, copy->src.value);
   } else {
      agx_mov_to(b, dst, copy->src);
   }
}

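/* Swap the destination and source of a copy. This is used to break cycles in
 * the transfer graph. Memory "registers" cannot be swapped directly, so
 * memory-memory swaps are staged through two scratch GPRs instead.
 */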
static void
do_swap(agx_builder *b, const struct agx_copy *copy)
{
   assert(copy->src.type == AGX_INDEX_REGISTER && "only GPRs are swapped");

   if (copy->dest == copy->src.value)
      return;

   agx_index x = copy->dest_mem
                    ? agx_memory_register(copy->dest, copy->src.size)
                    : agx_register(copy->dest, copy->src.size);
   agx_index y = copy->src;
   assert(x.memory == y.memory);

   /* Memory-memory swaps lowered here, GPR swaps lowered later */
   if (x.memory) {
      agx_index temp1 = agx_register(4, copy->src.size);
      agx_index temp2 = agx_register(6, copy->src.size);

      agx_mov_to(b, temp1, x);
      agx_mov_to(b, temp2, y);
      agx_mov_to(b, y, temp1);
      agx_mov_to(b, x, temp2);
   } else {
      agx_swap(b, x, y);
   }
}

struct copy_ctx {
   /* Number of copies being processed */
   unsigned entry_count;

   /* For each physreg, the number of pending copy entries that use it as a
    * source. Once this drops to zero, then the physreg is unblocked and can
    * be moved to.
    */
   unsigned physreg_use_count[AGX_NUM_MODELED_REGS];

   /* For each physreg, the pending copy_entry that uses it as a dest. */
   struct agx_copy *physreg_dest[AGX_NUM_MODELED_REGS];

   struct agx_copy entries[AGX_NUM_MODELED_REGS];
};

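/* A copy is blocked if any 16-bit half of its destination is still pending as
 * the source of some other copy, so writing it now would clobber a value that
 * has not been moved yet.
 */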
static bool
entry_blocked(struct agx_copy *entry, struct copy_ctx *ctx)
{
   for (unsigned i = 0; i < agx_size_align_16(entry->src.size); i++) {
      if (ctx->physreg_use_count[entry->dest + i] != 0)
         return true;
   }

   return false;
}

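/* A copy is "real" if its source is a physical register in the same class as
 * its destination. Only real copies participate in the blocking and cycle
 * bookkeeping below; immediate and uniform sources never block anything.
 */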
static bool
is_real(struct agx_copy *entry)
{
   return entry->src.type == AGX_INDEX_REGISTER &&
          entry->dest_mem == entry->src.memory;
}

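/* Split a 32-bit register-register copy into two independent 16-bit copies,
 * one per half. The low half reuses the existing entry and the high half is
 * appended as a new entry, so each half can be unblocked separately.
 */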
/* TODO: Generalize to other bit sizes */
static void
split_32bit_copy(struct copy_ctx *ctx, struct agx_copy *entry)
{
   assert(!entry->done);
   assert(is_real(entry));
   assert(agx_size_align_16(entry->src.size) == 2);
   struct agx_copy *new_entry = &ctx->entries[ctx->entry_count++];

   new_entry->dest = entry->dest + 1;
   new_entry->dest_mem = entry->dest_mem;
   new_entry->src = entry->src;
   new_entry->src.value += 1;
   new_entry->done = false;
   entry->src.size = AGX_SIZE_16;
   new_entry->src.size = AGX_SIZE_16;
   ctx->physreg_dest[entry->dest + 1] = new_entry;
}

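/* Lower the parallel copies that target one register class (GPRs or memory).
 * The algorithm below first emits every copy whose destination is unblocked,
 * then splits partially blocked 32-bit copies to expose more unblocked
 * copies, and finally resolves the remaining cycles with swaps.
 */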
static void
agx_emit_parallel_copies_for_class(agx_builder *b, struct agx_copy *copies,
                                   unsigned num_copies, bool cls)
{
   /* First, lower away 64-bit copies to smaller chunks, since we don't have
    * a 64-bit ALU, we always want to split.
    */
   struct agx_copy *copies2 = calloc(sizeof(copies[0]), num_copies * 2);
   unsigned num_copies2 = 0;

   for (unsigned i = 0; i < num_copies; ++i) {
      struct agx_copy copy = copies[i];

      /* Filter by class */
      if (copy.dest_mem != cls)
         continue;

      assert(copy.dest < AGX_NUM_MODELED_REGS);

      if (copy.src.size == AGX_SIZE_64) {
         copy.src.size = AGX_SIZE_32;
         copies2[num_copies2++] = copy;

         if (copy.src.type == AGX_INDEX_IMMEDIATE) {
            static_assert(sizeof(copy.src.value) * 8 == 32, "known size");
            copy.src.value = 0;
         } else {
            assert(copy.src.type == AGX_INDEX_REGISTER ||
                   copy.src.type == AGX_INDEX_UNIFORM);

            copy.src.value += 2;
         }

         copy.dest += 2;
         copies2[num_copies2++] = copy;
      } else {
         copies2[num_copies2++] = copy;
      }
   }

   copies = copies2;
   num_copies = num_copies2;

   /* Set up the bookkeeping */
   struct copy_ctx _ctx = {.entry_count = num_copies};
   struct copy_ctx *ctx = &_ctx;

   memset(ctx->physreg_dest, 0, sizeof(ctx->physreg_dest));
   memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &copies[i];

      ctx->entries[i] = *entry;

      for (unsigned j = 0; j < agx_size_align_16(entry->src.size); j++) {
         if (is_real(entry))
            ctx->physreg_use_count[entry->src.value + j]++;

         /* Copies should not have overlapping destinations. */
         assert(!ctx->physreg_dest[entry->dest + j]);
         ctx->physreg_dest[entry->dest + j] = &ctx->entries[i];
      }
   }

   /* Try to vectorize aligned 16-bit copies to use 32-bit operations instead */
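   /* For example, 16-bit copies to destinations 2n and 2n+1 reading from the
    * consecutive sources 2m and 2m+1 (both uniforms or both registers) fuse
    * into one 32-bit copy from 2m to 2n.
    */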
   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &ctx->entries[i];
      if (entry->src.size != AGX_SIZE_16)
         continue;

      if ((entry->dest & 1) || (entry->src.value & 1))
         continue;

      if (entry->src.type != AGX_INDEX_UNIFORM &&
          entry->src.type != AGX_INDEX_REGISTER)
         continue;

      unsigned next_dest = entry->dest + 1;
      assert(next_dest < ARRAY_SIZE(ctx->physreg_dest) && "aligned reg");

      struct agx_copy *next_copy = ctx->physreg_dest[next_dest];
      if (!next_copy)
         continue;

      assert(next_copy->dest == next_dest && "data structure invariant");
      assert(next_copy->src.size == AGX_SIZE_16 && "unaligned copy");

      if (next_copy->src.type != entry->src.type)
         continue;

      if (next_copy->src.value != (entry->src.value + 1))
         continue;

      /* Vectorize the copies */
      ctx->physreg_dest[next_dest] = entry;
      entry->src.size = AGX_SIZE_32;
      next_copy->done = true;
   }

   bool progress = true;
   while (progress) {
      progress = false;

      /* Step 1: resolve paths in the transfer graph. This means finding
       * copies whose destinations aren't blocked by something else and then
       * emitting them, continuing this process until every copy is blocked
       * and there are only cycles left.
       *
       * TODO: We should note that src is also available in dest to unblock
       * cycles that src is involved in.
       */

      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct agx_copy *entry = &ctx->entries[i];
         if (!entry->done && !entry_blocked(entry, ctx)) {
            entry->done = true;
            progress = true;
            do_copy(b, entry);
            for (unsigned j = 0; j < agx_size_align_16(entry->src.size); j++) {
               if (is_real(entry))
                  ctx->physreg_use_count[entry->src.value + j]--;
               ctx->physreg_dest[entry->dest + j] = NULL;
            }
         }
      }

      if (progress)
         continue;

      /* Step 2: Find partially blocked copies and split them. In the
       * mergedregs case, we can have 32-bit copies that are only blocked on
       * one 16-bit half, and splitting them helps get things moving.
       *
       * We can skip splitting copies if the source isn't a register,
       * however, because it does not unblock anything and therefore doesn't
       * contribute to making forward progress with step 1. These copies
       * should still be resolved eventually in step 1 because they can't be
       * part of a cycle.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct agx_copy *entry = &ctx->entries[i];
         if (entry->done || (agx_size_align_16(entry->src.size) != 2))
            continue;

         if (((ctx->physreg_use_count[entry->dest] == 0 ||
               ctx->physreg_use_count[entry->dest + 1] == 0)) &&
             is_real(entry)) {
            split_32bit_copy(ctx, entry);
            progress = true;
         }
      }
   }

   /* Step 3: resolve cycles through swapping.
    *
    * At this point, the transfer graph should consist of only cycles.
    * The reason is that, given any physreg n_1 that's the source of a
    * remaining entry, it has a destination n_2, which (because every
    * copy is blocked) is the source of some other copy whose destination
    * is n_3, and so we can follow the chain until we get a cycle. If we
    * reached some other node than n_1:
    *
    *    n_1 -> n_2 -> ... -> n_i
    *            ^             |
    *            |-------------|
    *
    * then n_2 would be the destination of 2 copies, which is illegal
    * (checked above in an assert). So n_1 must be part of a cycle:
    *
    *    n_1 -> n_2 -> ... -> n_i
    *     ^                    |
    *     |--------------------|
    *
    * and this must be the only cycle n_1 is involved in, because any other
    * path starting from n_1 would also have to end in n_1, resulting in
    * a node somewhere along the way being the destination of 2 copies
    * when the 2 paths merge.
    *
    * The way we resolve the cycle is through picking a copy (n_1, n_2)
    * and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
    * out of the cycle:
    *
    *    n_1 -> ... -> n_i
    *     ^             |
    *     |-------------|
    *
    * and we can keep repeating this until the cycle is empty.
    */

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &ctx->entries[i];
      if (entry->done)
         continue;

      assert(is_real(entry));

      /* catch trivial copies */
      if (entry->dest == entry->src.value) {
         entry->done = true;
         continue;
      }

      do_swap(b, entry);

      /* Split any blocking copies whose sources are only partially
       * contained within our destination.
       */
      if (agx_size_align_16(entry->src.size) == 1) {
         for (unsigned j = 0; j < ctx->entry_count; j++) {
            struct agx_copy *blocking = &ctx->entries[j];

            if (blocking->done)
               continue;

            if (blocking->src.value <= entry->dest &&
                blocking->src.value + 1 >= entry->dest &&
                agx_size_align_16(blocking->src.size) == 2) {
               split_32bit_copy(ctx, blocking);
            }
         }
      }

      /* Update sources of blocking copies.
       *
       * Note: at this point, every blocking copy's source should be
       * contained within our destination.
       */
      for (unsigned j = 0; j < ctx->entry_count; j++) {
         struct agx_copy *blocking = &ctx->entries[j];
         if (blocking->src.value >= entry->dest &&
             blocking->src.value <
                entry->dest + agx_size_align_16(entry->src.size)) {
            blocking->src.value =
               entry->src.value + (blocking->src.value - entry->dest);
         }
      }

      entry->done = true;
   }

   free(copies2);
}

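/*
 * For illustration only: a caller (register allocation, in practice) builds an
 * array of agx_copy entries and emits them in one shot. For example, swapping
 * the 32-bit values held in 16-bit register halves 0-1 and 2-3 might look like:
 *
 *    struct agx_copy copies[] = {
 *       {.dest = 0, .src = agx_register(2, AGX_SIZE_32)},
 *       {.dest = 2, .src = agx_register(0, AGX_SIZE_32)},
 *    };
 *    agx_emit_parallel_copies(b, copies, ARRAY_SIZE(copies));
 *
 * The cycle between the two entries ends up resolved by a swap in step 3.
 */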
void
agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
                         unsigned num_copies)
{
   /* Emit copies for each register class separately because we don't have
    * register class awareness in the parallel copy lowering data structure.
    */
   agx_emit_parallel_copies_for_class(b, copies, num_copies, false);
   agx_emit_parallel_copies_for_class(b, copies, num_copies, true);
}