• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2021 Alyssa Rosenzweig <alyssa@rosenzweig.io>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "agx_compiler.h"
25 #include "agx_builder.h"
26 
27 /* SSA-based register allocator */
28 
29 /** Returns number of registers written by an instruction */
30 unsigned
agx_write_registers(agx_instr * I,unsigned d)31 agx_write_registers(agx_instr *I, unsigned d)
32 {
33    unsigned size = I->dest[d].size == AGX_SIZE_32 ? 2 : 1;
34 
35    switch (I->op) {
36    case AGX_OPCODE_LD_VARY:
37       assert(1 <= I->channels && I->channels <= 4);
38       return I->channels * size;
39 
40    case AGX_OPCODE_DEVICE_LOAD:
41    case AGX_OPCODE_TEXTURE_SAMPLE:
42    case AGX_OPCODE_LD_TILE:
43       /* TODO: mask */
44       return 4 * size;
45 
46    case AGX_OPCODE_LD_VARY_FLAT:
47       return 6;
48    case AGX_OPCODE_P_COMBINE:
49    {
50       unsigned components = 0;
51 
52       for (unsigned i = 0; i < 4; ++i) {
53          if (!agx_is_null(I->src[i]))
54             components = i + 1;
55       }
56 
57       return components * size;
58    }
59    default:
60       return size;
61    }
62 }
63 
64 static unsigned
agx_assign_regs(BITSET_WORD * used_regs,unsigned count,unsigned align,unsigned max)65 agx_assign_regs(BITSET_WORD *used_regs, unsigned count, unsigned align, unsigned max)
66 {
67    for (unsigned reg = 0; reg < max; reg += align) {
68       bool conflict = false;
69 
70       for (unsigned j = 0; j < count; ++j)
71          conflict |= BITSET_TEST(used_regs, reg + j);
72 
73       if (!conflict) {
74          for (unsigned j = 0; j < count; ++j)
75             BITSET_SET(used_regs, reg + j);
76 
77          return reg;
78       }
79    }
80 
81    /* Couldn't find a free register, dump the state of the register file */
82    fprintf(stderr, "Failed to find register of size %u aligned %u max %u.\n",
83            count, align, max);
84 
85    fprintf(stderr, "Register file:\n");
86    for (unsigned i = 0; i < BITSET_WORDS(max); ++i)
87       fprintf(stderr, "    %08X\n", used_regs[i]);
88 
89    unreachable("Could not find a free register");
90 }
91 
/** Assign registers to SSA values in a block.
 *
 * Greedy linear scan over the block's instructions: killed sources free their
 * registers, then each destination is assigned the first fitting free run.
 * `ssa_to_reg` maps SSA value index -> assigned base register; `ncomps` maps
 * SSA value index -> number of 16-bit registers the value occupies.
 */

static void
agx_ra_assign_local(agx_block *block, uint8_t *ssa_to_reg, uint8_t *ncomps)
{
   /* Registers live on entry: the union of every predecessor's live-out set,
    * so values flowing in across control flow edges are not clobbered. */
   BITSET_DECLARE(used_regs, AGX_NUM_REGS) = { 0 };

   agx_foreach_predecessor(block, pred) {
      for (unsigned i = 0; i < BITSET_WORDS(AGX_NUM_REGS); ++i)
         used_regs[i] |= (*pred)->regs_out[i];
   }

   /* Reserve hardware-special registers so the allocator never hands them
    * out. NOTE(review): 5*2/6*2 appear to be the 32-bit registers holding
    * vertex ID and adjacent system values — confirm against the ISA docs. */
   BITSET_SET(used_regs, 0); // control flow writes r0l
   BITSET_SET(used_regs, 5*2); // TODO: precolouring, don't overwrite vertex ID
   BITSET_SET(used_regs, (5*2 + 1));
   BITSET_SET(used_regs, (6*2 + 0));
   BITSET_SET(used_regs, (6*2 + 1));

   agx_foreach_instr_in_block(block, I) {
      /* Optimization: if a split contains the last use of a vector, the split
       * can be removed by assigning the destinations overlapping the source.
       */
      if (I->op == AGX_OPCODE_P_SPLIT && I->src[0].kill) {
         unsigned reg = ssa_to_reg[I->src[0].value];    /* base of the vector */
         unsigned length = ncomps[I->src[0].value];     /* total 16-bit regs */
         unsigned width = agx_size_align_16(I->src[0].size); /* regs/channel */
         unsigned count = length / width;               /* channel count */

         agx_foreach_dest(I, d) {
            /* Skip excess components */
            if (d >= count) {
               assert(agx_is_null(I->dest[d]));
               continue;
            }

            /* The source of the split is killed. If a destination of the split
             * is null, that channel is killed. Free it.
             */
            if (agx_is_null(I->dest[d])) {
               for (unsigned i = 0; i < width; ++i)
                  BITSET_CLEAR(used_regs, reg + (width * d) + i);

               continue;
            }

            /* Otherwise, transfer the liveness */
            unsigned offset = d * width;

            assert(I->dest[d].type == AGX_INDEX_NORMAL);
            assert(offset < length);

            ssa_to_reg[I->dest[d].value] = reg + offset;
         }

         /* The split consumed no new registers; skip the generic path. */
         continue;
      }

      /* First, free killed sources */
      agx_foreach_src(I, s) {
         if (I->src[s].type == AGX_INDEX_NORMAL && I->src[s].kill) {
            unsigned reg = ssa_to_reg[I->src[s].value];
            unsigned count = ncomps[I->src[s].value];

            for (unsigned i = 0; i < count; ++i)
               BITSET_CLEAR(used_regs, reg + i);
         }
      }

      /* Next, assign destinations one at a time. This is always legal
       * because of the SSA form.
       */
      agx_foreach_dest(I, d) {
         if (I->dest[d].type == AGX_INDEX_NORMAL) {
            unsigned count = agx_write_registers(I, d);
            /* 32-bit values must start on an even (32-bit-aligned) half-reg */
            unsigned align = (I->dest[d].size == AGX_SIZE_16) ? 1 : 2;
            unsigned reg = agx_assign_regs(used_regs, count, align, AGX_NUM_REGS);

            ssa_to_reg[I->dest[d].value] = reg;
         }
      }
   }

   /* Publish this block's live-out register set for its successors. */
   STATIC_ASSERT(sizeof(block->regs_out) == sizeof(used_regs));
   memcpy(block->regs_out, used_regs, sizeof(used_regs));
}
177 
178 /*
179  * Resolve an agx_index of type NORMAL or REGISTER to a physical register, once
180  * registers have been allocated for all SSA values.
181  */
182 static unsigned
agx_index_to_reg(uint8_t * ssa_to_reg,agx_index idx)183 agx_index_to_reg(uint8_t *ssa_to_reg, agx_index idx)
184 {
185    if (idx.type == AGX_INDEX_NORMAL) {
186       return ssa_to_reg[idx.value];
187    } else {
188       assert(idx.type == AGX_INDEX_REGISTER);
189       return idx.value;
190    }
191 }
192 
193 /*
194  * Lower phis to parallel copies at the logical end of a given block. If a block
195  * needs parallel copies inserted, a successor of the block has a phi node. To
196  * have a (nontrivial) phi node, a block must have multiple predecessors. So the
197  * edge from the block to the successor (with phi) is not the only edge entering
198  * the successor. Because the control flow graph has no critical edges, this
199  * edge must therefore be the only edge leaving the block, so the block must
200  * have only a single successor.
201  */
202 static void
agx_insert_parallel_copies(agx_context * ctx,agx_block * block)203 agx_insert_parallel_copies(agx_context *ctx, agx_block *block)
204 {
205    bool any_succ = false;
206    unsigned nr_phi = 0;
207 
208    /* Phi nodes logically happen on the control flow edge, so parallel copies
209     * are added at the end of the predecessor */
210    agx_builder b = agx_init_builder(ctx, agx_after_block_logical(block));
211 
212    agx_foreach_successor(block, succ) {
213       assert(nr_phi == 0 && "control flow graph has a critical edge");
214 
215       /* Phi nodes can only come at the start of the block */
216       agx_foreach_instr_in_block(succ, phi) {
217          if (phi->op != AGX_OPCODE_PHI) break;
218 
219          assert(!any_succ && "control flow graph has a critical edge");
220          nr_phi++;
221       }
222 
223       any_succ = true;
224 
225       /* Nothing to do if there are no phi nodes */
226       if (nr_phi == 0)
227          continue;
228 
229       unsigned pred_index = agx_predecessor_index(succ, block);
230 
231       /* Create a parallel copy lowering all the phi nodes */
232       struct agx_copy *copies = calloc(sizeof(*copies), nr_phi);
233 
234       unsigned i = 0;
235 
236       agx_foreach_instr_in_block(succ, phi) {
237          if (phi->op != AGX_OPCODE_PHI) break;
238 
239          agx_index dest = phi->dest[0];
240          agx_index src = phi->src[pred_index];
241 
242          assert(dest.type == AGX_INDEX_REGISTER);
243          assert(src.type == AGX_INDEX_REGISTER);
244          assert(dest.size == src.size);
245 
246          copies[i++] = (struct agx_copy) {
247             .dest = dest.value,
248             .src = src.value,
249             .size = src.size
250          };
251       }
252 
253       agx_emit_parallel_copies(&b, copies, nr_phi);
254 
255       free(copies);
256    }
257 }
258 
259 void
agx_ra(agx_context * ctx)260 agx_ra(agx_context *ctx)
261 {
262    unsigned *alloc = calloc(ctx->alloc, sizeof(unsigned));
263 
264    agx_compute_liveness(ctx);
265    uint8_t *ssa_to_reg = calloc(ctx->alloc, sizeof(uint8_t));
266    uint8_t *ncomps = calloc(ctx->alloc, sizeof(uint8_t));
267 
268    agx_foreach_instr_global(ctx, I) {
269       agx_foreach_dest(I, d) {
270          if (I->dest[d].type != AGX_INDEX_NORMAL) continue;
271 
272          unsigned v = I->dest[d].value;
273          assert(ncomps[v] == 0 && "broken SSA");
274          ncomps[v] = agx_write_registers(I, d);
275       }
276    }
277 
278    /* Assign registers in dominance-order. This coincides with source-order due
279     * to a NIR invariant, so we do not need special handling for this.
280     */
281    agx_foreach_block(ctx, block) {
282       agx_ra_assign_local(block, ssa_to_reg, ncomps);
283    }
284 
285    agx_foreach_instr_global(ctx, ins) {
286       agx_foreach_src(ins, s) {
287          if (ins->src[s].type == AGX_INDEX_NORMAL) {
288             unsigned v = ssa_to_reg[ins->src[s].value];
289             ins->src[s] = agx_replace_index(ins->src[s], agx_register(v, ins->src[s].size));
290          }
291       }
292 
293       agx_foreach_dest(ins, d) {
294          if (ins->dest[d].type == AGX_INDEX_NORMAL) {
295             unsigned v = ssa_to_reg[ins->dest[d].value];
296             ins->dest[d] = agx_replace_index(ins->dest[d], agx_register(v, ins->dest[d].size));
297          }
298       }
299    }
300 
301    agx_foreach_instr_global_safe(ctx, ins) {
302       /* Lower away RA pseudo-instructions */
303       agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));
304 
305       if (ins->op == AGX_OPCODE_P_COMBINE) {
306          unsigned base = agx_index_to_reg(ssa_to_reg, ins->dest[0]);
307          unsigned width = agx_size_align_16(ins->dest[0].size);
308 
309          struct agx_copy copies[4];
310          unsigned n = 0;
311 
312          /* Move the sources */
313          for (unsigned i = 0; i < 4; ++i) {
314             if (agx_is_null(ins->src[i])) continue;
315             assert(ins->src[i].size == ins->dest[0].size);
316 
317             copies[n++] = (struct agx_copy) {
318                .dest = base + (i * width),
319                .src = agx_index_to_reg(ssa_to_reg, ins->src[i]) ,
320                .size = ins->src[i].size
321             };
322          }
323 
324          agx_emit_parallel_copies(&b, copies, n);
325          agx_remove_instruction(ins);
326          continue;
327       } else if (ins->op == AGX_OPCODE_P_EXTRACT) {
328          /* Uses the destination size */
329          unsigned size = agx_size_align_16(ins->dest[0].size);
330          unsigned left = agx_index_to_reg(ssa_to_reg, ins->dest[0]);
331          unsigned right = agx_index_to_reg(ssa_to_reg, ins->src[0]) + (size * ins->imm);
332 
333          if (left != right) {
334             agx_mov_to(&b, agx_register(left, ins->dest[0].size),
335                   agx_register(right, ins->src[0].size));
336          }
337 
338          agx_remove_instruction(ins);
339          continue;
340       } else if (ins->op == AGX_OPCODE_P_SPLIT) {
341          unsigned base = agx_index_to_reg(ssa_to_reg, ins->src[0]);
342          unsigned width = agx_size_align_16(ins->src[0].size);
343 
344          struct agx_copy copies[4];
345          unsigned n = 0;
346 
347          /* Move the sources */
348          for (unsigned i = 0; i < 4; ++i) {
349             if (agx_is_null(ins->dest[i])) continue;
350             assert(ins->dest[i].size == ins->src[0].size);
351 
352             copies[n++] = (struct agx_copy) {
353                .dest = agx_index_to_reg(ssa_to_reg, ins->dest[i]),
354                .src = base + (i * width),
355                .size = ins->dest[i].size
356             };
357          }
358 
359          /* Lower away */
360          agx_builder b = agx_init_builder(ctx, agx_after_instr(ins));
361          agx_emit_parallel_copies(&b, copies, n);
362          agx_remove_instruction(ins);
363          continue;
364       }
365 
366 
367    }
368 
369    /* Insert parallel copies lowering phi nodes */
370    agx_foreach_block(ctx, block) {
371       agx_insert_parallel_copies(ctx, block);
372    }
373 
374    /* Phi nodes can be removed now */
375    agx_foreach_instr_global_safe(ctx, I) {
376       if (I->op == AGX_OPCODE_PHI || I->op == AGX_OPCODE_P_LOGICAL_END)
377          agx_remove_instruction(I);
378 
379       /* Remove identity moves */
380       if (I->op == AGX_OPCODE_MOV && I->src[0].type == AGX_INDEX_REGISTER &&
381           I->dest[0].size == I->src[0].size && I->src[0].value == I->dest[0].value) {
382 
383          assert(I->dest[0].type == AGX_INDEX_REGISTER);
384          agx_remove_instruction(I);
385       }
386    }
387 
388    free(ssa_to_reg);
389    free(ncomps);
390    free(alloc);
391 }
392