/*
 * Copyright 2022 Alyssa Rosenzweig
 * Copyright 2021 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "agx_builder.h"
#include "agx_compiler.h"

UNUSED static void
print_copy(const struct agx_copy *cp)
{
   printf("%sr%u = ", cp->dest_mem ? "m" : "", cp->dest);
   agx_print_index(cp->src, false, stdout);
   printf("\n");
}

UNUSED static void
print_copies(const struct agx_copy *copies, unsigned nr)
{
   printf("[\n");

   for (unsigned i = 0; i < nr; ++i) {
      printf(" ");
      print_copy(&copies[i]);
   }

   printf("]\n");
}

/*
 * Emits code for
 *
 *    for (int i = 0; i < n; ++i)
 *       registers[dests[i]] = registers[srcs[i]];
 *
 * ...with all copies happening in parallel.
 *
 * That is, emit machine instructions equivalent to a parallel copy. This is
 * used to lower not only parallel copies but also collects and splits, which
 * also have parallel copy semantics.
 *
 * We only handle register-register copies, not general agx_index sources. This
 * suffices for its internal use by register allocation.
 */
static void
do_copy(agx_builder *b, const struct agx_copy *copy)
{
   agx_index dst = copy->dest_mem
                      ? agx_memory_register(copy->dest, copy->src.size)
                      : agx_register(copy->dest, copy->src.size);

   if (copy->dest_mem && copy->src.memory) {
      /* Memory-memory copies need to be lowered to memory-register and
       * register-memory, using a reserved scratch register.
       */
      agx_index scratch_reg = agx_register(2, copy->src.size);

      agx_mov_to(b, scratch_reg, copy->src);
      agx_mov_to(b, dst, scratch_reg);
   } else if (copy->src.type == AGX_INDEX_IMMEDIATE) {
      agx_mov_imm_to(b, dst, copy->src.value);
   } else {
      agx_mov_to(b, dst, copy->src);
   }
}

static void
do_swap(agx_builder *b, const struct agx_copy *copy)
{
   assert(copy->src.type == AGX_INDEX_REGISTER && "only GPRs are swapped");

   if (copy->dest == copy->src.value)
      return;

   agx_index x = copy->dest_mem
                    ? agx_memory_register(copy->dest, copy->src.size)
                    : agx_register(copy->dest, copy->src.size);

   agx_index y = copy->src;
   assert(x.memory == y.memory);

   /* Memory-memory swaps are lowered here, GPR swaps are lowered later */
   if (x.memory) {
      agx_index temp1 = agx_register(4, copy->src.size);
      agx_index temp2 = agx_register(6, copy->src.size);

      agx_mov_to(b, temp1, x);
      agx_mov_to(b, temp2, y);
      agx_mov_to(b, y, temp1);
      agx_mov_to(b, x, temp2);
   } else {
      agx_swap(b, x, y);
   }
}

struct copy_ctx {
   /* Number of copies being processed */
   unsigned entry_count;

   /* For each physreg, the number of pending copy entries that use it as a
    * source. Once this drops to zero, then the physreg is unblocked and can
    * be moved to.
    */
   unsigned physreg_use_count[AGX_NUM_MODELED_REGS];

   /* For each physreg, the pending copy_entry that uses it as a dest. */
   struct agx_copy *physreg_dest[AGX_NUM_MODELED_REGS];

   struct agx_copy entries[AGX_NUM_MODELED_REGS];
};
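
/*
 * Worked example of the bookkeeping above (hypothetical 16-bit copies
 * {reg 0 <- reg 1, reg 1 <- reg 2}): physreg_use_count starts at 1 for
 * registers 1 and 2, and physreg_dest points registers 0 and 1 at their
 * entries. Register 0 is not a pending source, so the first copy is
 * unblocked and emitted immediately; that drops register 1's use count to
 * zero, which in turn unblocks the second copy.
 */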

static bool
entry_blocked(struct agx_copy *entry, struct copy_ctx *ctx)
{
   for (unsigned i = 0; i < agx_size_align_16(entry->src.size); i++) {
      if (ctx->physreg_use_count[entry->dest + i] != 0)
         return true;
   }

   return false;
}

static bool
is_real(struct agx_copy *entry)
{
   return entry->src.type == AGX_INDEX_REGISTER &&
          entry->dest_mem == entry->src.memory;
}

/* TODO: Generalize to other bit sizes */
static void
split_32bit_copy(struct copy_ctx *ctx, struct agx_copy *entry)
{
   assert(!entry->done);
   assert(is_real(entry));
   assert(agx_size_align_16(entry->src.size) == 2);

   struct agx_copy *new_entry = &ctx->entries[ctx->entry_count++];
   new_entry->dest = entry->dest + 1;
   new_entry->dest_mem = entry->dest_mem;
   new_entry->src = entry->src;
   new_entry->src.value += 1;
   new_entry->done = false;
   entry->src.size = AGX_SIZE_16;
   new_entry->src.size = AGX_SIZE_16;
   ctx->physreg_dest[entry->dest + 1] = new_entry;
}

static void
agx_emit_parallel_copies_for_class(agx_builder *b, struct agx_copy *copies,
                                   unsigned num_copies, bool cls)
{
   /* First, lower away 64-bit copies to smaller chunks, since we don't have a
    * 64-bit ALU and always want to split.
    */
   struct agx_copy *copies2 = calloc(sizeof(copies[0]), num_copies * 2);
   unsigned num_copies2 = 0;

   for (unsigned i = 0; i < num_copies; ++i) {
      struct agx_copy copy = copies[i];

      /* Filter by class */
      if (copy.dest_mem != cls)
         continue;

      assert(copy.dest < AGX_NUM_MODELED_REGS);

      if (copy.src.size == AGX_SIZE_64) {
         copy.src.size = AGX_SIZE_32;
         copies2[num_copies2++] = copy;

         if (copy.src.type == AGX_INDEX_IMMEDIATE) {
            static_assert(sizeof(copy.src.value) * 8 == 32, "known size");
            copy.src.value = 0;
         } else {
            assert(copy.src.type == AGX_INDEX_REGISTER ||
                   copy.src.type == AGX_INDEX_UNIFORM);

            copy.src.value += 2;
         }

         copy.dest += 2;
         copies2[num_copies2++] = copy;
      } else {
         copies2[num_copies2++] = copy;
      }
   }

   copies = copies2;
   num_copies = num_copies2;

   /* Set up the bookkeeping */
   struct copy_ctx _ctx = {.entry_count = num_copies};
   struct copy_ctx *ctx = &_ctx;

   memset(ctx->physreg_dest, 0, sizeof(ctx->physreg_dest));
   memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &copies[i];
      ctx->entries[i] = *entry;

      for (unsigned j = 0; j < agx_size_align_16(entry->src.size); j++) {
         if (is_real(entry))
            ctx->physreg_use_count[entry->src.value + j]++;

         /* Copies should not have overlapping destinations. */
         assert(!ctx->physreg_dest[entry->dest + j]);
         ctx->physreg_dest[entry->dest + j] = &ctx->entries[i];
      }
   }
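
   /*
    * Example for the vectorization pass below (hypothetical values, in 16-bit
    * register units): the aligned pair of 16-bit copies {dest 0 <- reg 4,
    * dest 1 <- reg 5} is merged into a single 32-bit copy {dest 0 <- reg 4};
    * the second entry is marked done and physreg_dest[1] is repointed at the
    * merged entry.
    */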

   /* Try to vectorize aligned 16-bit copies to use 32-bit operations instead */
   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &ctx->entries[i];

      if (entry->src.size != AGX_SIZE_16)
         continue;

      if ((entry->dest & 1) || (entry->src.value & 1))
         continue;

      if (entry->src.type != AGX_INDEX_UNIFORM &&
          entry->src.type != AGX_INDEX_REGISTER)
         continue;

      unsigned next_dest = entry->dest + 1;
      assert(next_dest < ARRAY_SIZE(ctx->physreg_dest) && "aligned reg");

      struct agx_copy *next_copy = ctx->physreg_dest[next_dest];
      if (!next_copy)
         continue;

      assert(next_copy->dest == next_dest && "data structure invariant");
      assert(next_copy->src.size == AGX_SIZE_16 && "unaligned copy");

      if (next_copy->src.type != entry->src.type)
         continue;

      if (next_copy->src.value != (entry->src.value + 1))
         continue;

      /* Vectorize the copies */
      ctx->physreg_dest[next_dest] = entry;
      entry->src.size = AGX_SIZE_32;
      next_copy->done = true;
   }

   bool progress = true;
   while (progress) {
      progress = false;

      /* Step 1: resolve paths in the transfer graph. This means finding
       * copies whose destination aren't blocked by something else and then
       * emitting them, continuing this process until every copy is blocked
       * and there are only cycles left.
       *
       * TODO: We should note that src is also available in dest to unblock
       * cycles that src is involved in.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct agx_copy *entry = &ctx->entries[i];
         if (!entry->done && !entry_blocked(entry, ctx)) {
            entry->done = true;
            progress = true;
            do_copy(b, entry);

            for (unsigned j = 0; j < agx_size_align_16(entry->src.size); j++) {
               if (is_real(entry))
                  ctx->physreg_use_count[entry->src.value + j]--;
               ctx->physreg_dest[entry->dest + j] = NULL;
            }
         }
      }

      if (progress)
         continue;

      /* Step 2: Find partially blocked copies and split them. In the
       * mergedregs case, we can have 32-bit copies which are only blocked on
       * one 16-bit half, and splitting them helps get things moving.
       *
       * We can skip splitting copies if the source isn't a register,
       * however, because it does not unblock anything and therefore doesn't
       * contribute to making forward progress with step 1. These copies
       * should still be resolved eventually in step 1 because they can't be
       * part of a cycle.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct agx_copy *entry = &ctx->entries[i];

         if (entry->done || (agx_size_align_16(entry->src.size) != 2))
            continue;

         if (((ctx->physreg_use_count[entry->dest] == 0 ||
               ctx->physreg_use_count[entry->dest + 1] == 0)) &&
             is_real(entry)) {
            split_32bit_copy(ctx, entry);
            progress = true;
         }
      }
   }

   /* Step 3: resolve cycles through swapping.
    *
    * At this point, the transfer graph should consist of only cycles.
    * The reason is that, given any physreg n_1 that's the source of a
    * remaining entry, it has a destination n_2, which (because every
    * copy is blocked) is the source of some other copy whose destination
    * is n_3, and so we can follow the chain until we get a cycle. If we
    * reached some other node than n_1:
    *
    *    n_1 -> n_2 -> ... -> n_i
    *           ^             |
    *           |-------------|
    *
    * then n_2 would be the destination of 2 copies, which is illegal
    * (checked above in an assert). So n_1 must be part of a cycle:
    *
    *    n_1 -> n_2 -> ... -> n_i
    *    ^                    |
    *    |--------------------|
    *
    * and this must be the only cycle n_1 is involved in, because any other
    * path starting from n_1 would also have to end in n_1, resulting in
    * a node somewhere along the way being the destination of 2 copies
    * when the 2 paths merge.
    *
    * The way we resolve the cycle is through picking a copy (n_1, n_2)
    * and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
    * out of the cycle:
    *
    *    n_1 -> ... -> n_i
    *    ^             |
    *    |-------------|
    *
    * and we can keep repeating this until the cycle is empty.
    */
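   /*
    * Concrete illustration (hypothetical values, in 16-bit register units):
    * the cycle {0 <- 2, 2 <- 4, 4 <- 0} is resolved by swapping registers 0
    * and 2, which satisfies the first entry and rewrites the remaining
    * sources so the leftover work is {2 <- 4, 4 <- 2}; swapping 2 and 4 then
    * finishes the cycle, leaving only a trivial self-copy.
    */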
   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &ctx->entries[i];
      if (entry->done)
         continue;

      assert(is_real(entry));

      /* catch trivial copies */
      if (entry->dest == entry->src.value) {
         entry->done = true;
         continue;
      }

      do_swap(b, entry);

      /* Split any blocking copies whose sources are only partially
       * contained within our destination.
       */
      if (agx_size_align_16(entry->src.size) == 1) {
         for (unsigned j = 0; j < ctx->entry_count; j++) {
            struct agx_copy *blocking = &ctx->entries[j];

            if (blocking->done)
               continue;

            if (blocking->src.value <= entry->dest &&
                blocking->src.value + 1 >= entry->dest &&
                agx_size_align_16(blocking->src.size) == 2) {
               split_32bit_copy(ctx, blocking);
            }
         }
      }

      /* Update sources of blocking copies.
       *
       * Note: at this point, every blocking copy's source should be
       * contained within our destination.
       */
      for (unsigned j = 0; j < ctx->entry_count; j++) {
         struct agx_copy *blocking = &ctx->entries[j];

         if (blocking->src.value >= entry->dest &&
             blocking->src.value <
                entry->dest + agx_size_align_16(entry->src.size)) {
            blocking->src.value =
               entry->src.value + (blocking->src.value - entry->dest);
         }
      }

      entry->done = true;
   }

   free(copies2);
}

void
agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
                         unsigned num_copies)
{
   /* Emit copies for each register class separately, because we don't have
    * register class awareness in the parallel copy lowering data structure.
    */
   agx_emit_parallel_copies_for_class(b, copies, num_copies, false);
   agx_emit_parallel_copies_for_class(b, copies, num_copies, true);
}
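
/*
 * Usage sketch (hypothetical values): given an agx_builder *b, register
 * allocation builds an array of struct agx_copy describing the parallel move
 * and hands it to agx_emit_parallel_copies, e.g.
 *
 *    struct agx_copy copies[2] = {
 *       {.dest = 0, .src = agx_register(2, AGX_SIZE_32)},
 *       {.dest = 2, .src = agx_register(0, AGX_SIZE_32)},
 *    };
 *    agx_emit_parallel_copies(b, copies, 2);
 *
 * This pair forms a cycle, so it is lowered to a single 32-bit swap rather
 * than a pair of moves.
 */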