/*
 * Copyright (C) 2021 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "agx_compiler.h"
#include "agx_builder.h"

/* SSA-based register allocator */
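/*
 * The allocator works in units of 16-bit register halves: a 16-bit value
 * occupies one slot in the bitset and a 32-bit value occupies two
 * consecutive, even-aligned slots. Liveness is computed up front; each block
 * is then walked in source order (which coincides with dominance order),
 * freeing a value's registers at its last use and assigning destinations
 * with a greedy first-fit scan. There is no spilling: allocation failure is
 * fatal.
 */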

/** Returns the number of 16-bit register halves written to destination d of
 * an instruction */
unsigned
agx_write_registers(agx_instr *I, unsigned d)
{
   unsigned size = I->dest[d].size == AGX_SIZE_32 ? 2 : 1;

   switch (I->op) {
   case AGX_OPCODE_LD_VARY:
      assert(1 <= I->channels && I->channels <= 4);
      return I->channels * size;

   case AGX_OPCODE_DEVICE_LOAD:
   case AGX_OPCODE_TEXTURE_SAMPLE:
   case AGX_OPCODE_LD_TILE:
      /* TODO: mask */
      return 4 * size;

   case AGX_OPCODE_LD_VARY_FLAT:
      return 6;
   case AGX_OPCODE_P_COMBINE:
   {
      unsigned components = 0;

      for (unsigned i = 0; i < 4; ++i) {
         if (!agx_is_null(I->src[i]))
            components = i + 1;
      }

      return components * size;
   }
   default:
      return size;
   }
}

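/*
 * Find the lowest-numbered run of `count` free 16-bit halves starting at a
 * multiple of `align`, mark the run as used, and return its index. There is
 * no fallback if the register file is exhausted, since this allocator does
 * not spill.
 */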
static unsigned
agx_assign_regs(BITSET_WORD *used_regs, unsigned count, unsigned align, unsigned max)
{
   for (unsigned reg = 0; reg + count <= max; reg += align) {
      bool conflict = false;

      for (unsigned j = 0; j < count; ++j)
         conflict |= BITSET_TEST(used_regs, reg + j);

      if (!conflict) {
         for (unsigned j = 0; j < count; ++j)
            BITSET_SET(used_regs, reg + j);

         return reg;
      }
   }

   /* Couldn't find a free register, dump the state of the register file */
   fprintf(stderr, "Failed to find register of size %u aligned %u max %u.\n",
           count, align, max);

   fprintf(stderr, "Register file:\n");
   for (unsigned i = 0; i < BITSET_WORDS(max); ++i)
      fprintf(stderr, " %08X\n", used_regs[i]);

   unreachable("Could not find a free register");
}

/** Assign registers to SSA values in a block. */
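/* A block's starting occupancy is the union of its predecessors' ending
 * occupancy (regs_out), so a value that is live across the edge keeps the
 * register it was assigned in the predecessor.
 */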

static void
agx_ra_assign_local(agx_block *block, uint8_t *ssa_to_reg, uint8_t *ncomps)
{
   BITSET_DECLARE(used_regs, AGX_NUM_REGS) = { 0 };

   agx_foreach_predecessor(block, pred) {
      for (unsigned i = 0; i < BITSET_WORDS(AGX_NUM_REGS); ++i)
         used_regs[i] |= (*pred)->regs_out[i];
   }

   BITSET_SET(used_regs, 0); /* control flow writes r0l */

   /* TODO: precolouring. Reserve r5 and r6 (16-bit halves 10-13) so the
    * preloaded vertex ID is not overwritten. */
   BITSET_SET(used_regs, (5*2) + 0);
   BITSET_SET(used_regs, (5*2) + 1);
   BITSET_SET(used_regs, (6*2) + 0);
   BITSET_SET(used_regs, (6*2) + 1);

   agx_foreach_instr_in_block(block, I) {
      /* Optimization: if a split contains the last use of a vector, the split
       * can be removed by assigning the destinations overlapping the source.
       */
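      /* For example (illustrative numbers): a killed vec2 of 32-bit values
       * occupying halves 4-7 assigns dest[0] to half 4 and dest[1] to half
       * 6, so no move is required.
       */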
      if (I->op == AGX_OPCODE_P_SPLIT && I->src[0].kill) {
         unsigned reg = ssa_to_reg[I->src[0].value];
         unsigned length = ncomps[I->src[0].value];
         unsigned width = agx_size_align_16(I->src[0].size);
         unsigned count = length / width;

         agx_foreach_dest(I, d) {
            /* Skip excess components */
            if (d >= count) {
               assert(agx_is_null(I->dest[d]));
               continue;
            }

            /* The source of the split is killed. If a destination of the split
             * is null, that channel is killed. Free it.
             */
            if (agx_is_null(I->dest[d])) {
               for (unsigned i = 0; i < width; ++i)
                  BITSET_CLEAR(used_regs, reg + (width * d) + i);

               continue;
            }

            /* Otherwise, transfer the liveness */
            unsigned offset = d * width;

            assert(I->dest[d].type == AGX_INDEX_NORMAL);
            assert(offset < length);

            ssa_to_reg[I->dest[d].value] = reg + offset;
         }

         continue;
      }

      /* First, free killed sources */
      agx_foreach_src(I, s) {
         if (I->src[s].type == AGX_INDEX_NORMAL && I->src[s].kill) {
            unsigned reg = ssa_to_reg[I->src[s].value];
            unsigned count = ncomps[I->src[s].value];

            for (unsigned i = 0; i < count; ++i)
               BITSET_CLEAR(used_regs, reg + i);
         }
      }

      /* Next, assign destinations one at a time. This is always legal
       * because of the SSA form.
       */
      agx_foreach_dest(I, d) {
         if (I->dest[d].type == AGX_INDEX_NORMAL) {
            unsigned count = agx_write_registers(I, d);
            unsigned align = (I->dest[d].size == AGX_SIZE_16) ? 1 : 2;
            unsigned reg = agx_assign_regs(used_regs, count, align, AGX_NUM_REGS);

            ssa_to_reg[I->dest[d].value] = reg;
         }
      }
   }

   STATIC_ASSERT(sizeof(block->regs_out) == sizeof(used_regs));
   memcpy(block->regs_out, used_regs, sizeof(used_regs));
}

/*
 * Resolve an agx_index of type NORMAL or REGISTER to a physical register, once
 * registers have been allocated for all SSA values.
 */
static unsigned
agx_index_to_reg(uint8_t *ssa_to_reg, agx_index idx)
{
   if (idx.type == AGX_INDEX_NORMAL) {
      return ssa_to_reg[idx.value];
   } else {
      assert(idx.type == AGX_INDEX_REGISTER);
      return idx.value;
   }
}

/*
 * Lower phis to parallel copies at the logical end of a given block. If a block
 * needs parallel copies inserted, a successor of the block has a phi node. To
 * have a (nontrivial) phi node, a block must have multiple predecessors. So the
 * edge from the block to the successor (with phi) is not the only edge entering
 * the successor. Because the control flow graph has no critical edges, this
 * edge must therefore be the only edge leaving the block, so the block must
 * have only a single successor.
 */
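/* Concretely: if blocks A and B both branch into a join block C that begins
 * with a phi, then A and B must each have C as their only successor, or the
 * edge into C would be critical. The copies for C's phis are therefore
 * placed at the logical ends of A and B.
 */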
static void
agx_insert_parallel_copies(agx_context *ctx, agx_block *block)
{
   bool any_succ = false;
   unsigned nr_phi = 0;

   /* Phi nodes logically happen on the control flow edge, so parallel copies
    * are added at the end of the predecessor */
   agx_builder b = agx_init_builder(ctx, agx_after_block_logical(block));

   agx_foreach_successor(block, succ) {
      assert(nr_phi == 0 && "control flow graph has a critical edge");

      /* Phi nodes can only come at the start of the block */
      agx_foreach_instr_in_block(succ, phi) {
         if (phi->op != AGX_OPCODE_PHI) break;

         assert(!any_succ && "control flow graph has a critical edge");
         nr_phi++;
      }

      any_succ = true;

      /* Nothing to do if there are no phi nodes */
      if (nr_phi == 0)
         continue;

      unsigned pred_index = agx_predecessor_index(succ, block);

      /* Create a parallel copy lowering all the phi nodes */
      struct agx_copy *copies = calloc(nr_phi, sizeof(*copies));

      unsigned i = 0;

      agx_foreach_instr_in_block(succ, phi) {
         if (phi->op != AGX_OPCODE_PHI) break;

         agx_index dest = phi->dest[0];
         agx_index src = phi->src[pred_index];

         assert(dest.type == AGX_INDEX_REGISTER);
         assert(src.type == AGX_INDEX_REGISTER);
         assert(dest.size == src.size);

         copies[i++] = (struct agx_copy) {
            .dest = dest.value,
            .src = src.value,
            .size = src.size
         };
      }

      agx_emit_parallel_copies(&b, copies, nr_phi);

      free(copies);
   }
}

void
agx_ra(agx_context *ctx)
{
   unsigned *alloc = calloc(ctx->alloc, sizeof(unsigned));

   agx_compute_liveness(ctx);
   uint8_t *ssa_to_reg = calloc(ctx->alloc, sizeof(uint8_t));
   uint8_t *ncomps = calloc(ctx->alloc, sizeof(uint8_t));

   agx_foreach_instr_global(ctx, I) {
      agx_foreach_dest(I, d) {
         if (I->dest[d].type != AGX_INDEX_NORMAL) continue;

         unsigned v = I->dest[d].value;
         assert(ncomps[v] == 0 && "broken SSA");
         ncomps[v] = agx_write_registers(I, d);
      }
   }

   /* Assign registers in dominance-order. This coincides with source-order due
    * to a NIR invariant, so we do not need special handling for this.
    */
   agx_foreach_block(ctx, block) {
      agx_ra_assign_local(block, ssa_to_reg, ncomps);
   }

   agx_foreach_instr_global(ctx, ins) {
      agx_foreach_src(ins, s) {
         if (ins->src[s].type == AGX_INDEX_NORMAL) {
            unsigned v = ssa_to_reg[ins->src[s].value];
            ins->src[s] = agx_replace_index(ins->src[s], agx_register(v, ins->src[s].size));
         }
      }

      agx_foreach_dest(ins, d) {
         if (ins->dest[d].type == AGX_INDEX_NORMAL) {
            unsigned v = ssa_to_reg[ins->dest[d].value];
            ins->dest[d] = agx_replace_index(ins->dest[d], agx_register(v, ins->dest[d].size));
         }
      }
   }

   agx_foreach_instr_global_safe(ctx, ins) {
      /* Lower away RA pseudo-instructions */
      agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));

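      /* p_combine gathers up to four scalar sources into one contiguous
       * vector, p_extract reads a single component of a vector, and p_split
       * breaks a vector into its components. With registers assigned, each
       * lowers to plain register copies (or nothing, when the source and
       * destination ranges already coincide).
       */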
      if (ins->op == AGX_OPCODE_P_COMBINE) {
         unsigned base = agx_index_to_reg(ssa_to_reg, ins->dest[0]);
         unsigned width = agx_size_align_16(ins->dest[0].size);

         struct agx_copy copies[4];
         unsigned n = 0;

         /* Move the sources into place */
         for (unsigned i = 0; i < 4; ++i) {
            if (agx_is_null(ins->src[i])) continue;
            assert(ins->src[i].size == ins->dest[0].size);

            copies[n++] = (struct agx_copy) {
               .dest = base + (i * width),
               .src = agx_index_to_reg(ssa_to_reg, ins->src[i]),
               .size = ins->src[i].size
            };
         }

         agx_emit_parallel_copies(&b, copies, n);
         agx_remove_instruction(ins);
         continue;
      } else if (ins->op == AGX_OPCODE_P_EXTRACT) {
         /* Uses the destination size */
         unsigned size = agx_size_align_16(ins->dest[0].size);
         unsigned left = agx_index_to_reg(ssa_to_reg, ins->dest[0]);
         unsigned right = agx_index_to_reg(ssa_to_reg, ins->src[0]) + (size * ins->imm);

         if (left != right) {
            agx_mov_to(&b, agx_register(left, ins->dest[0].size),
                       agx_register(right, ins->src[0].size));
         }

         agx_remove_instruction(ins);
         continue;
      } else if (ins->op == AGX_OPCODE_P_SPLIT) {
         unsigned base = agx_index_to_reg(ssa_to_reg, ins->src[0]);
         unsigned width = agx_size_align_16(ins->src[0].size);

         struct agx_copy copies[4];
         unsigned n = 0;

         /* Move into the destinations */
         for (unsigned i = 0; i < 4; ++i) {
            if (agx_is_null(ins->dest[i])) continue;
            assert(ins->dest[i].size == ins->src[0].size);

            copies[n++] = (struct agx_copy) {
               .dest = agx_index_to_reg(ssa_to_reg, ins->dest[i]),
               .src = base + (i * width),
               .size = ins->dest[i].size
            };
         }

         /* Lower away */
         agx_emit_parallel_copies(&b, copies, n);
         agx_remove_instruction(ins);
         continue;
      }
   }

   /* Insert parallel copies lowering phi nodes */
   agx_foreach_block(ctx, block) {
      agx_insert_parallel_copies(ctx, block);
   }

   /* Phi nodes can be removed now */
   agx_foreach_instr_global_safe(ctx, I) {
      if (I->op == AGX_OPCODE_PHI || I->op == AGX_OPCODE_P_LOGICAL_END) {
         agx_remove_instruction(I);
         continue;
      }

      /* Remove identity moves */
      if (I->op == AGX_OPCODE_MOV && I->src[0].type == AGX_INDEX_REGISTER &&
          I->dest[0].size == I->src[0].size && I->src[0].value == I->dest[0].value) {

         assert(I->dest[0].type == AGX_INDEX_REGISTER);
         agx_remove_instruction(I);
      }
   }

   free(ssa_to_reg);
   free(ncomps);
   free(alloc);
}