/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/u_math.h"
#include "util/register_allocate.h"
#include "util/ralloc.h"
#include "util/bitset.h"

#include "freedreno_util.h"

#include "ir3.h"
#include "ir3_compiler.h"

/*
 * Register Assignment:
 *
 * Uses the register_allocate util, which implements a graph coloring
 * algo with interference classes.  To handle the cases where we need
 * consecutive registers (for example, texture sample instructions),
 * we model these as larger (double/quad/etc) registers which conflict
 * with the corresponding registers in other classes.
 *
 * We additionally create separate classes for half-regs, which do
 * not conflict with the full-reg classes.  We do need at least
 * sizes 1-4 (to deal w/ texture sample instructions that output to
 * half-reg).  At the moment we don't create the higher order half-reg
 * classes, as half-reg frequently does not have enough precision
 * for texture coords at higher resolutions.
 *
 * There are some additional cases that we need to handle specially,
 * as the graph coloring algo doesn't understand "partial writes".
 * For example, a sequence like:
 *
 *   add r0.z, ...
 *   sam (f32)(xy)r0.x, ...
 *   ...
 *   sam (f32)(xyzw)r0.w, r0.x, ...  ; 3d texture, so r0.xyz are coord
 *
 * In this scenario, we treat r0.xyz as class size 3, which is written
 * (from a use/def perspective) at the 'add' instruction and ignore the
 * subsequent partial writes to r0.xy.  So the 'add r0.z, ...' is the
 * defining instruction, as it is the first to partially write r0.xyz.
 *
 * Note i965 has a similar scenario, which they solve with a virtual
 * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
 * register assignment.  But for us that is horrible from a scheduling
 * standpoint.  Instead, what we do is use the idea of a 'definer'
 * instruction.  Ie. the first instruction (lowest ip) to write to the
 * variable is the one we consider from a use/def perspective when
 * building the interference graph.  (Other instructions which write
 * other variable components just define the variable some more.)
 *
 * Arrays of arbitrary size are handled via pre-coloring a consecutive
 * sequence of registers.  Additional scalar (single component) reg
 * names are allocated starting at ctx->class_base[total_class_count]
 * (see arr->base), which are pre-colored.  In the use/def graph direct
 * access is treated as a single element use/def, and indirect access
 * is treated as use or def of all array elements.  (Only the first
 * def is tracked, in case of multiple indirect writes, etc.)
 */

static const unsigned class_sizes[] = {
	1, 2, 3, 4,
	4 + 4, /* txd + 1d/2d */
	4 + 6, /* txd + 3d */
};
#define class_count ARRAY_SIZE(class_sizes)

static const unsigned half_class_sizes[] = {
	1, 2, 3, 4,
};
#define half_class_count ARRAY_SIZE(half_class_sizes)

/* seems to just be used for compute shaders?  Seems like vec1 and vec3
 * are sufficient (for now?)
 */
static const unsigned high_class_sizes[] = {
	1, 3,
};
#define high_class_count ARRAY_SIZE(high_class_sizes)

#define total_class_count (class_count + half_class_count + high_class_count)

/* Below a0.x are normal regs.  RA doesn't need to assign a0.x/p0.x. */
#define NUM_REGS           (4 * 48)  /* r0 to r47 */
#define NUM_HIGH_REGS      (4 * 8)   /* r48 to r55 */
#define FIRST_HIGH_REG     (4 * 48)
/* Number of virtual regs in a given class: */
#define CLASS_REGS(i)      (NUM_REGS - (class_sizes[i] - 1))
#define HALF_CLASS_REGS(i) (NUM_REGS - (half_class_sizes[i] - 1))
#define HIGH_CLASS_REGS(i) (NUM_HIGH_REGS - (high_class_sizes[i] - 1))
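
/* Worked example (illustrative only): with NUM_REGS = 192 scalar regs, the
 * size-2 class has CLASS_REGS(i) = 192 - (2 - 1) = 191 virtual registers,
 * since a 2-wide register can start at any scalar position except the last.
 * Likewise the size-10 class (txd + 3d) has 183 possible starting positions.
 */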

#define HALF_OFFSET (class_count)
#define HIGH_OFFSET (class_count + half_class_count)

/* register-set, created one time, used for all shaders: */
struct ir3_ra_reg_set {
	struct ra_regs *regs;
	unsigned int classes[class_count];
	unsigned int half_classes[half_class_count];
	unsigned int high_classes[high_class_count];
	/* maps flat virtual register space to base gpr: */
	uint16_t *ra_reg_to_gpr;
	/* maps cls,gpr to flat virtual register space: */
	uint16_t **gpr_to_ra_reg;
};

static void
build_q_values(unsigned int **q_values, unsigned off,
		const unsigned *sizes, unsigned count)
{
	for (unsigned i = 0; i < count; i++) {
		q_values[i + off] = rzalloc_array(q_values, unsigned, total_class_count);

		/* From register_allocate.c:
		 *
		 * q(B,C) (indexed by C, B is this register class) in
		 * Runeson/Nyström paper.  This is "how many registers of B could
		 * the worst choice register from C conflict with".
		 *
		 * If we just let the register allocation algorithm compute these
		 * values, it is extremely expensive.  However, since all of our
		 * registers are laid out, we can very easily compute them
		 * ourselves.  View the register from C as fixed starting at GRF n
		 * somewhere in the middle, and the register from B as sliding back
		 * and forth.  Then the first register to conflict from B is the
		 * one starting at n - class_size[B] + 1 and the last register to
		 * conflict will start at n + class_size[B] - 1.  Therefore, the
		 * number of conflicts from B is class_size[B] + class_size[C] - 1.
		 *
		 *    +-+-+-+-+-+-+       +-+-+-+-+-+-+
		 *  B | | | | | |n|  -->  | | | | | | |
		 *    +-+-+-+-+-+-+       +-+-+-+-+-+-+
		 *              +-+-+-+-+-+
		 *  C           |n| | | | |
		 *              +-+-+-+-+-+
		 *
		 * (Idea copied from brw_fs_reg_allocate.cpp)
		 */
		for (unsigned j = 0; j < count; j++)
			q_values[i + off][j + off] = sizes[i] + sizes[j] - 1;
	}
}
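
/* Just to illustrate the formula above with concrete numbers: for the
 * full-reg classes of size 2 and 3, q(B=2, C=3) = 2 + 3 - 1 = 4, ie. the
 * worst-placed 3-wide register overlaps four different 2-wide registers.
 */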

/* One-time setup of RA register-set, which describes all the possible
 * "virtual" registers and their interferences.  Ie. double register
 * occupies (and conflicts with) two single registers, and so forth.
 * Since registers do not need to be aligned to their class size, they
 * can conflict with other registers in the same class too.  Ie:
 *
 *    Single (base) |  Double
 *    --------------+---------------
 *       R0         |  D0
 *       R1         |  D0 D1
 *       R2         |  D1 D2
 *       R3         |  D2
 *           .. and so on..
 *
 * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
 * really just four scalar registers.  Don't let that confuse you.)
 */
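
/* Illustrative note (a sketch of the numbering, not authoritative): the flat
 * "ra_reg" space below is handed out class by class, so the singles R0..
 * come first, then the doubles D0.., and so on.  ra_reg_to_gpr[] recovers
 * the base scalar gpr of a virtual reg (eg. D1 -> gpr 1), and
 * gpr_to_ra_reg[class][gpr] goes the other way, which is what the
 * pre-coloring code further down relies on.
 */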
struct ir3_ra_reg_set *
ir3_ra_alloc_reg_set(void *memctx)
{
	struct ir3_ra_reg_set *set = rzalloc(memctx, struct ir3_ra_reg_set);
	unsigned ra_reg_count, reg, first_half_reg, first_high_reg, base;
	unsigned int **q_values;

	/* calculate # of regs across all classes: */
	ra_reg_count = 0;
	for (unsigned i = 0; i < class_count; i++)
		ra_reg_count += CLASS_REGS(i);
	for (unsigned i = 0; i < half_class_count; i++)
		ra_reg_count += HALF_CLASS_REGS(i);
	for (unsigned i = 0; i < high_class_count; i++)
		ra_reg_count += HIGH_CLASS_REGS(i);

	/* allocate and populate q_values: */
	q_values = ralloc_array(set, unsigned *, total_class_count);

	build_q_values(q_values, 0, class_sizes, class_count);
	build_q_values(q_values, HALF_OFFSET, half_class_sizes, half_class_count);
	build_q_values(q_values, HIGH_OFFSET, high_class_sizes, high_class_count);

	/* allocate the reg-set.. */
	set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
	set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
	set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);

	/* .. and classes */
	reg = 0;
	for (unsigned i = 0; i < class_count; i++) {
		set->classes[i] = ra_alloc_reg_class(set->regs);

		set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i));

		for (unsigned j = 0; j < CLASS_REGS(i); j++) {
			ra_class_add_reg(set->regs, set->classes[i], reg);

			set->ra_reg_to_gpr[reg] = j;
			set->gpr_to_ra_reg[i][j] = reg;

			for (unsigned br = j; br < j + class_sizes[i]; br++)
				ra_add_transitive_reg_conflict(set->regs, br, reg);

			reg++;
		}
	}

	first_half_reg = reg;
	base = HALF_OFFSET;

	for (unsigned i = 0; i < half_class_count; i++) {
		set->half_classes[i] = ra_alloc_reg_class(set->regs);

		set->gpr_to_ra_reg[base + i] =
				ralloc_array(set, uint16_t, HALF_CLASS_REGS(i));

		for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
			ra_class_add_reg(set->regs, set->half_classes[i], reg);

			set->ra_reg_to_gpr[reg] = j;
			set->gpr_to_ra_reg[base + i][j] = reg;

			for (unsigned br = j; br < j + half_class_sizes[i]; br++)
				ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg);

			reg++;
		}
	}

	first_high_reg = reg;
	base = HIGH_OFFSET;

	for (unsigned i = 0; i < high_class_count; i++) {
		set->high_classes[i] = ra_alloc_reg_class(set->regs);

		set->gpr_to_ra_reg[base + i] =
				ralloc_array(set, uint16_t, HIGH_CLASS_REGS(i));

		for (unsigned j = 0; j < HIGH_CLASS_REGS(i); j++) {
			ra_class_add_reg(set->regs, set->high_classes[i], reg);

			set->ra_reg_to_gpr[reg] = j;
			set->gpr_to_ra_reg[base + i][j] = reg;

			for (unsigned br = j; br < j + high_class_sizes[i]; br++)
				ra_add_transitive_reg_conflict(set->regs, br + first_high_reg, reg);

			reg++;
		}
	}

	ra_set_finalize(set->regs, q_values);

	ralloc_free(q_values);

	return set;
}

/* additional block-data (per-block) */
struct ir3_ra_block_data {
	BITSET_WORD *def;        /* variables defined before used in block */
	BITSET_WORD *use;        /* variables used before defined in block */
	BITSET_WORD *livein;     /* which defs reach entry point of block */
	BITSET_WORD *liveout;    /* which defs reach exit point of block */
};

/* additional instruction-data (per-instruction) */
struct ir3_ra_instr_data {
	/* cached instruction 'definer' info: */
	struct ir3_instruction *defn;
	int off, sz, cls;
};

/* register-assign context, per-shader */
struct ir3_ra_ctx {
	struct ir3 *ir;
	enum shader_t type;
	bool frag_face;

	struct ir3_ra_reg_set *set;
	struct ra_graph *g;
	unsigned alloc_count;
	/* one per class, plus one slot for arrays: */
	unsigned class_alloc_count[total_class_count + 1];
	unsigned class_base[total_class_count + 1];
	unsigned instr_cnt;
	unsigned *def, *use;     /* def/use table */
	struct ir3_ra_instr_data *instrd;
};

/* does it conflict? */
static inline bool
intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end)
{
	return !((a_start >= b_end) || (b_start >= a_end));
}
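
/* Note the range end is effectively exclusive here: with the def/use ips
 * tracked below, a value whose last use is at ip N does not interfere with
 * a value first defined at ip N.  Eg. intersects(5, 10, 10, 20) is false.
 */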

static bool
is_half(struct ir3_instruction *instr)
{
	return !!(instr->regs[0]->flags & IR3_REG_HALF);
}

static bool
is_high(struct ir3_instruction *instr)
{
	return !!(instr->regs[0]->flags & IR3_REG_HIGH);
}

static int
size_to_class(unsigned sz, bool half, bool high)
{
	if (high) {
		for (unsigned i = 0; i < high_class_count; i++)
			if (high_class_sizes[i] >= sz)
				return i + HIGH_OFFSET;
	} else if (half) {
		for (unsigned i = 0; i < half_class_count; i++)
			if (half_class_sizes[i] >= sz)
				return i + HALF_OFFSET;
	} else {
		for (unsigned i = 0; i < class_count; i++)
			if (class_sizes[i] >= sz)
				return i;
	}
	debug_assert(0);
	return -1;
}
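
/* Eg. (assuming the class tables above) size_to_class(3, false, false)
 * returns 2, the vec3 full-reg class, while a 5-component full-reg value
 * lands in the "4 + 4" class since that is the smallest size that fits.
 */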

static bool
is_temp(struct ir3_register *reg)
{
	if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
		return false;
	if ((reg->num == regid(REG_A0, 0)) ||
			(reg->num == regid(REG_P0, 0)))
		return false;
	return true;
}

static bool
writes_gpr(struct ir3_instruction *instr)
{
	if (is_store(instr))
		return false;
	/* is dest a normal temp register: */
	return is_temp(instr->regs[0]);
}

static bool
instr_before(struct ir3_instruction *a, struct ir3_instruction *b)
{
	if (a->flags & IR3_INSTR_UNUSED)
		return false;
	return (a->ip < b->ip);
}

static struct ir3_instruction *
get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr,
		int *sz, int *off)
{
	struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
	struct ir3_instruction *d = NULL;

	if (id->defn) {
		*sz = id->sz;
		*off = id->off;
		return id->defn;
	}

	if (instr->opc == OPC_META_FI) {
		/* What about the case where the collect is a subset of an array?
		 * We'd need to find the distance between where the actual array
		 * starts and the fanin.. that probably doesn't happen currently.
		 */
		struct ir3_register *src;
		int dsz, doff;

		/* note: don't use foreach_ssa_src as this gets called once
		 * while assigning regs (which clears SSA flag)
		 */
		foreach_src_n(src, n, instr) {
			struct ir3_instruction *dd;
			if (!src->instr)
				continue;

			dd = get_definer(ctx, src->instr, &dsz, &doff);

			if ((!d) || instr_before(dd, d)) {
				d = dd;
				*sz = dsz;
				*off = doff - n;
			}
		}

	} else if (instr->cp.right || instr->cp.left) {
		/* this also covers the meta:fo case, which ends up w/ single
		 * scalar instructions for each component:
		 */
		struct ir3_instruction *f = ir3_neighbor_first(instr);

		/* by definition, the entire sequence forms one linked list
		 * of single scalar register nodes (even if some of them may
		 * be fanouts from a texture sample (for example) instr).  We
		 * just need to walk the list finding the first element of
		 * the group defined (lowest ip):
		 */
		int cnt = 0;

		/* need to skip over unused in the group: */
		while (f && (f->flags & IR3_INSTR_UNUSED)) {
			f = f->cp.right;
			cnt++;
		}

		while (f) {
			if ((!d) || instr_before(f, d))
				d = f;
			if (f == instr)
				*off = cnt;
			f = f->cp.right;
			cnt++;
		}

		*sz = cnt;

	} else {
		/* The second case is looking directly at the instruction which
		 * produces multiple values (eg, texture sample), rather
		 * than the fanout nodes that point back to that instruction.
		 * This isn't quite right, because it may be part of a larger
		 * group, such as:
		 *
		 *    sam (f32)(xyzw)r0.x, ...
		 *    add r1.x, ...
		 *    add r1.y, ...
		 *    sam (f32)(xyzw)r2.x, r0.w  <-- (r0.w, r1.x, r1.y)
		 *
		 * need to come up with a better way to handle that case.
		 */
		if (instr->address) {
			*sz = instr->regs[0]->size;
		} else {
			*sz = util_last_bit(instr->regs[0]->wrmask);
		}
		*off = 0;
		d = instr;
	}

	if (d->regs[0]->flags & IR3_REG_PHI_SRC) {
		struct ir3_instruction *phi = d->regs[0]->instr;
		struct ir3_instruction *dd;
		int dsz, doff;

		dd = get_definer(ctx, phi, &dsz, &doff);

		*sz = MAX2(*sz, dsz);
		*off = doff;

		if (instr_before(dd, d)) {
			d = dd;
		}
	}

	if (d->opc == OPC_META_PHI) {
		/* we have already inserted parallel-copies into
		 * the phi, so we don't need to chase definers
		 */
		struct ir3_register *src;
		struct ir3_instruction *dd = d;

		/* note: don't use foreach_ssa_src as this gets called once
		 * while assigning regs (which clears SSA flag)
		 */
		foreach_src(src, d) {
			if (!src->instr)
				continue;
			if (instr_before(src->instr, dd))
				dd = src->instr;
		}

		d = dd;
	}

	if (d->opc == OPC_META_FO) {
		struct ir3_instruction *dd;
		int dsz, doff;

		dd = get_definer(ctx, d->regs[1]->instr, &dsz, &doff);

		/* by definition, should come before: */
		debug_assert(instr_before(dd, d));

		*sz = MAX2(*sz, dsz);

		debug_assert(instr->opc == OPC_META_FO);
		*off = MAX2(*off, instr->fo.off);

		d = dd;
	}

	id->defn = d;
	id->sz = *sz;
	id->off = *off;

	return d;
}

static void
ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
		if (instr->regs_count == 0)
			continue;
		/* couple special cases: */
		if (writes_addr(instr) || writes_pred(instr)) {
			id->cls = -1;
		} else if (instr->regs[0]->flags & IR3_REG_ARRAY) {
			id->cls = total_class_count;
			id->defn = instr;
		} else {
			id->defn = get_definer(ctx, instr, &id->sz, &id->off);
			id->cls = size_to_class(id->sz, is_half(id->defn), is_high(id->defn));
		}
	}
}

/* give each instruction a name (and ip), and count up the # of names
 * of each class
 */
static void
ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];

#ifdef DEBUG
		instr->name = ~0;
#endif

		ctx->instr_cnt++;

		if (instr->regs_count == 0)
			continue;

		if (!writes_gpr(instr))
			continue;

		if (id->defn != instr)
			continue;

		/* arrays which don't fit in one of the pre-defined class
		 * sizes are pre-colored:
		 */
		if (id->cls >= 0) {
			instr->name = ctx->class_alloc_count[id->cls]++;
			ctx->alloc_count++;
		}
	}
}

static void
ra_init(struct ir3_ra_ctx *ctx)
{
	unsigned n, base;

	ir3_clear_mark(ctx->ir);
	n = ir3_count_instructions(ctx->ir);

	ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n);

	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
		ra_block_find_definers(ctx, block);
	}

	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
		ra_block_name_instructions(ctx, block);
	}

	/* figure out the base register name for each class.  The
	 * actual ra name is class_base[cls] + instr->name;
	 */
	ctx->class_base[0] = 0;
	for (unsigned i = 1; i <= total_class_count; i++) {
		ctx->class_base[i] = ctx->class_base[i-1] +
				ctx->class_alloc_count[i-1];
	}

	/* and vreg names for array elements: */
	base = ctx->class_base[total_class_count];
	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
		arr->base = base;
		ctx->class_alloc_count[total_class_count] += arr->length;
		base += arr->length;
	}
	ctx->alloc_count += ctx->class_alloc_count[total_class_count];

	ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
	ralloc_steal(ctx->g, ctx->instrd);
	ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
	ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
}

static unsigned
__ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
{
	unsigned name;
	debug_assert(cls >= 0);
	debug_assert(cls < total_class_count);  /* we shouldn't get arrays here.. */
	name = ctx->class_base[cls] + defn->name;
	debug_assert(name < ctx->alloc_count);
	return name;
}

static int
ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id)
{
	/* TODO handle name mapping for arrays */
	return __ra_name(ctx, id->cls, id->defn);
}

static void
ra_destroy(struct ir3_ra_ctx *ctx)
{
	ralloc_free(ctx->g);
}

static void
ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
	struct ir3_ra_block_data *bd;
	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);

#define def(name, instr) \
		do { \
			/* defined on first write: */ \
			if (!ctx->def[name]) \
				ctx->def[name] = instr->ip; \
			ctx->use[name] = instr->ip; \
			BITSET_SET(bd->def, name); \
		} while(0);

#define use(name, instr) \
		do { \
			ctx->use[name] = MAX2(ctx->use[name], instr->ip); \
			if (!BITSET_TEST(bd->def, name)) \
				BITSET_SET(bd->use, name); \
		} while(0);

	bd = rzalloc(ctx->g, struct ir3_ra_block_data);

	bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words);
	bd->use = rzalloc_array(bd, BITSET_WORD, bitset_words);
	bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words);
	bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);

	block->data = bd;

	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		struct ir3_instruction *src;
		struct ir3_register *reg;

		if (instr->regs_count == 0)
			continue;

		/* There are a couple special cases to deal with here:
		 *
		 * fanout: used to split values from a higher class to a lower
		 *     class, for example split the results of a texture fetch
		 *     into individual scalar values;  We skip over these from
		 *     a 'def' perspective, and for a 'use' we walk the chain
		 *     up to the defining instruction.
		 *
		 * fanin: used to collect values from a lower class and assemble
		 *     them together into a higher class, for example arguments
		 *     to texture sample instructions;  We consider these to be
		 *     defined at the earliest fanin source.
		 *
		 * phi: used to merge values from different flow control paths
		 *     to the same reg.  Consider defined at earliest phi src,
		 *     and update all the other phi src's (which may come later
		 *     in the program) as users to extend the var's live range.
		 *
		 * Most of this, other than phi, is completely handled in the
		 * get_definer() helper.
		 *
		 * In all cases, we trace the instruction back to the original
		 * definer and consider that as the def/use ip.
		 */

		if (writes_gpr(instr)) {
			struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];
			struct ir3_register *dst = instr->regs[0];

			if (dst->flags & IR3_REG_ARRAY) {
				struct ir3_array *arr =
					ir3_lookup_array(ctx->ir, dst->array.id);
				unsigned i;

				debug_assert(!(dst->flags & IR3_REG_PHI_SRC));

				arr->start_ip = MIN2(arr->start_ip, instr->ip);
				arr->end_ip = MAX2(arr->end_ip, instr->ip);

				/* set the node class now.. in case we don't encounter
				 * this array dst again.  From register_alloc algo's
				 * perspective, these are all single/scalar regs:
				 */
				for (i = 0; i < arr->length; i++) {
					unsigned name = arr->base + i;
					ra_set_node_class(ctx->g, name, ctx->set->classes[0]);
				}

				/* indirect write is treated like a write to all array
				 * elements, since we don't know which one is actually
				 * written:
				 */
				if (dst->flags & IR3_REG_RELATIV) {
					for (i = 0; i < arr->length; i++) {
						unsigned name = arr->base + i;
						def(name, instr);
					}
				} else {
					unsigned name = arr->base + dst->array.offset;
					def(name, instr);
				}

			} else if (id->defn == instr) {
				unsigned name = ra_name(ctx, id);

				/* since we are in SSA at this point: */
				debug_assert(!BITSET_TEST(bd->use, name));

				def(name, id->defn);

				if (is_high(id->defn)) {
					ra_set_node_class(ctx->g, name,
							ctx->set->high_classes[id->cls - HIGH_OFFSET]);
				} else if (is_half(id->defn)) {
					ra_set_node_class(ctx->g, name,
							ctx->set->half_classes[id->cls - HALF_OFFSET]);
				} else {
					ra_set_node_class(ctx->g, name,
							ctx->set->classes[id->cls]);
				}

				/* extend the live range for phi srcs, which may come
				 * from the bottom of the loop
				 */
				if (id->defn->regs[0]->flags & IR3_REG_PHI_SRC) {
					struct ir3_instruction *phi = id->defn->regs[0]->instr;
					foreach_ssa_src(src, phi) {
						/* if src is after phi, then we need to extend
						 * the liverange to the end of src's block:
						 */
						if (src->ip > phi->ip) {
							struct ir3_instruction *last =
								list_last_entry(&src->block->instr_list,
										struct ir3_instruction, node);
							ctx->use[name] = MAX2(ctx->use[name], last->ip);
						}
					}
				}
			}
		}

		foreach_src(reg, instr) {
			if (reg->flags & IR3_REG_ARRAY) {
				struct ir3_array *arr =
					ir3_lookup_array(ctx->ir, reg->array.id);
				arr->start_ip = MIN2(arr->start_ip, instr->ip);
				arr->end_ip = MAX2(arr->end_ip, instr->ip);
				/* indirect read is treated like a read from all array
				 * elements, since we don't know which one is actually
				 * read:
				 */
				if (reg->flags & IR3_REG_RELATIV) {
					unsigned i;
					for (i = 0; i < arr->length; i++) {
						unsigned name = arr->base + i;
						use(name, instr);
					}
				} else {
					unsigned name = arr->base + reg->array.offset;
					use(name, instr);
					debug_assert(reg->array.offset < arr->length);
				}
			} else if ((src = ssa(reg)) && writes_gpr(src)) {
				unsigned name = ra_name(ctx, &ctx->instrd[src->ip]);
				use(name, instr);
			}
		}
	}
}

static bool
ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
{
	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
	bool progress = false;

	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
		struct ir3_ra_block_data *bd = block->data;

		/* update livein: */
		for (unsigned i = 0; i < bitset_words; i++) {
			BITSET_WORD new_livein =
				(bd->use[i] | (bd->liveout[i] & ~bd->def[i]));

			if (new_livein & ~bd->livein[i]) {
				bd->livein[i] |= new_livein;
				progress = true;
			}
		}

		/* update liveout: */
		for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) {
			struct ir3_block *succ = block->successors[j];
			struct ir3_ra_block_data *succ_bd;

			if (!succ)
				continue;

			succ_bd = succ->data;

			for (unsigned i = 0; i < bitset_words; i++) {
				BITSET_WORD new_liveout =
					(succ_bd->livein[i] & ~bd->liveout[i]);

				if (new_liveout) {
					bd->liveout[i] |= new_liveout;
					progress = true;
				}
			}
		}
	}

	return progress;
}
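
/* In equation form, the fixpoint computed above (standard backwards
 * liveness, restated here just for reference) is:
 *
 *    livein(b)  = use(b) | (liveout(b) & ~def(b))
 *    liveout(b) = union over successors s of livein(s)
 *
 * iterated by the caller until no block changes.
 */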

static void
print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt)
{
	bool first = true;
	debug_printf(" %s:", name);
	for (unsigned i = 0; i < cnt; i++) {
		if (BITSET_TEST(bs, i)) {
			if (!first)
				debug_printf(",");
			debug_printf(" %04u", i);
			first = false;
		}
	}
	debug_printf("\n");
}

static void
ra_add_interference(struct ir3_ra_ctx *ctx)
{
	struct ir3 *ir = ctx->ir;

	/* initialize array live ranges: */
	list_for_each_entry (struct ir3_array, arr, &ir->array_list, node) {
		arr->start_ip = ~0;
		arr->end_ip = 0;
	}

	/* compute live ranges (use/def) on a block level, also updating
	 * block's def/use bitmasks (used below to calculate per-block
	 * livein/liveout):
	 */
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		ra_block_compute_live_ranges(ctx, block);
	}

	/* update per-block livein/liveout: */
	while (ra_compute_livein_liveout(ctx)) {}

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		debug_printf("AFTER LIVEIN/OUT:\n");
		ir3_print(ir);
		list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
			struct ir3_ra_block_data *bd = block->data;
			debug_printf("block%u:\n", block_id(block));
			print_bitset("def", bd->def, ctx->alloc_count);
			print_bitset("use", bd->use, ctx->alloc_count);
			print_bitset("l/i", bd->livein, ctx->alloc_count);
			print_bitset("l/o", bd->liveout, ctx->alloc_count);
		}
	}

	/* extend start/end ranges based on livein/liveout info from cfg: */
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		struct ir3_ra_block_data *bd = block->data;

		for (unsigned i = 0; i < ctx->alloc_count; i++) {
			if (BITSET_TEST(bd->livein, i)) {
				ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
				ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
			}

			if (BITSET_TEST(bd->liveout, i)) {
				ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
				ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
			}
		}
	}

	/* need to fix things up to keep outputs live: */
	for (unsigned i = 0; i < ir->noutputs; i++) {
		struct ir3_instruction *instr = ir->outputs[i];
		unsigned name = ra_name(ctx, &ctx->instrd[instr->ip]);
		ctx->use[name] = ctx->instr_cnt;
	}

	for (unsigned i = 0; i < ctx->alloc_count; i++) {
		for (unsigned j = 0; j < ctx->alloc_count; j++) {
			if (intersects(ctx->def[i], ctx->use[i],
					ctx->def[j], ctx->use[j])) {
				ra_add_node_interference(ctx->g, i, j);
			}
		}
	}
}

/* some instructions need fix-up if dst register is half precision: */
static void fixup_half_instr_dst(struct ir3_instruction *instr)
{
	switch (opc_cat(instr->opc)) {
	case 1: /* move instructions */
		instr->cat1.dst_type = half_type(instr->cat1.dst_type);
		break;
	case 3:
		switch (instr->opc) {
		case OPC_MAD_F32:
			instr->opc = OPC_MAD_F16;
			break;
		case OPC_SEL_B32:
			instr->opc = OPC_SEL_B16;
			break;
		case OPC_SEL_S32:
			instr->opc = OPC_SEL_S16;
			break;
		case OPC_SEL_F32:
			instr->opc = OPC_SEL_F16;
			break;
		case OPC_SAD_S32:
			instr->opc = OPC_SAD_S16;
			break;
		/* instructions may already be fixed up: */
		case OPC_MAD_F16:
		case OPC_SEL_B16:
		case OPC_SEL_S16:
		case OPC_SEL_F16:
		case OPC_SAD_S16:
			break;
		default:
			assert(0);
			break;
		}
		break;
	case 5:
		instr->cat5.type = half_type(instr->cat5.type);
		break;
	}
}

/* some instructions need fix-up if src register is half precision: */
static void fixup_half_instr_src(struct ir3_instruction *instr)
{
	switch (instr->opc) {
	case OPC_MOV:
		instr->cat1.src_type = half_type(instr->cat1.src_type);
		break;
	default:
		break;
	}
}

/* NOTE: instr could be NULL for the IR3_REG_ARRAY case, for the first
 * array access(es) which do not have any previous access to depend
 * on, from a scheduling point of view.
 */
static void
reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
		struct ir3_instruction *instr)
{
	struct ir3_ra_instr_data *id;

	if (reg->flags & IR3_REG_ARRAY) {
		struct ir3_array *arr =
			ir3_lookup_array(ctx->ir, reg->array.id);
		unsigned name = arr->base + reg->array.offset;
		unsigned r = ra_get_node_reg(ctx->g, name);
		unsigned num = ctx->set->ra_reg_to_gpr[r];

		if (reg->flags & IR3_REG_RELATIV) {
			reg->array.offset = num;
		} else {
			reg->num = num;
		}

		reg->flags &= ~IR3_REG_ARRAY;
	} else if ((id = &ctx->instrd[instr->ip]) && id->defn) {
		unsigned name = ra_name(ctx, id);
		unsigned r = ra_get_node_reg(ctx->g, name);
		unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off;

		debug_assert(!(reg->flags & IR3_REG_RELATIV));

		if (is_high(id->defn))
			num += FIRST_HIGH_REG;

		reg->num = num;
		reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);

		if (is_half(id->defn))
			reg->flags |= IR3_REG_HALF;
	}
}

static void
ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		struct ir3_register *reg;

		if (instr->regs_count == 0)
			continue;

		if (writes_gpr(instr)) {
			reg_assign(ctx, instr->regs[0], instr);
			if (instr->regs[0]->flags & IR3_REG_HALF)
				fixup_half_instr_dst(instr);
		}

		foreach_src_n(reg, n, instr) {
			struct ir3_instruction *src = reg->instr;
			/* Note: reg->instr could be null for IR3_REG_ARRAY */
			if (!(src || (reg->flags & IR3_REG_ARRAY)))
				continue;
			reg_assign(ctx, instr->regs[n+1], src);
			if (instr->regs[n+1]->flags & IR3_REG_HALF)
				fixup_half_instr_src(instr);
		}
	}
}

static int
ra_alloc(struct ir3_ra_ctx *ctx)
{
	unsigned n = 0;

	/* frag shader inputs get pre-assigned, since we have some
	 * constraints/unknowns about setup for some of these regs:
	 */
	if (ctx->type == SHADER_FRAGMENT) {
		struct ir3 *ir = ctx->ir;
		unsigned i = 0, j;
		if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) {
			struct ir3_instruction *instr = ir->inputs[i];
			int cls = size_to_class(1, true, false);
			unsigned name = __ra_name(ctx, cls, instr);
			unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];

			/* if we have frag_face, it gets hr0.x */
			ra_set_node_reg(ctx->g, name, reg);
			i += 4;
		}

		j = 0;
		for (; i < ir->ninputs; i++) {
			struct ir3_instruction *instr = ir->inputs[i];
			if (instr) {
				struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip];

				if (id->defn == instr) {
					unsigned name, reg;

					name = ra_name(ctx, id);
					reg = ctx->set->gpr_to_ra_reg[id->cls][j];

					ra_set_node_reg(ctx->g, name, reg);
					j += id->sz;
				}
			}
		}
		n = j;
	}

	/* pre-assign array elements: */
	list_for_each_entry (struct ir3_array, arr, &ctx->ir->array_list, node) {
		unsigned base = n;

		if (arr->end_ip == 0)
			continue;

		/* figure out what else we conflict with which has already
		 * been assigned:
		 */
retry:
		list_for_each_entry (struct ir3_array, arr2, &ctx->ir->array_list, node) {
			if (arr2 == arr)
				break;
			if (arr2->end_ip == 0)
				continue;
			/* if it intersects with liverange AND register range.. */
			if (intersects(arr->start_ip, arr->end_ip,
					arr2->start_ip, arr2->end_ip) &&
				intersects(base, base + arr->length,
					arr2->reg, arr2->reg + arr2->length)) {
				base = MAX2(base, arr2->reg + arr2->length);
				goto retry;
			}
		}

		arr->reg = base;

		for (unsigned i = 0; i < arr->length; i++) {
			unsigned name, reg;

			name = arr->base + i;
			reg = ctx->set->gpr_to_ra_reg[0][base++];

			ra_set_node_reg(ctx->g, name, reg);
		}
	}

	if (!ra_allocate(ctx->g))
		return -1;

	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
		ra_block_alloc(ctx, block);
	}

	return 0;
}

int ir3_ra(struct ir3 *ir, enum shader_t type,
		bool frag_coord, bool frag_face)
{
	struct ir3_ra_ctx ctx = {
			.ir = ir,
			.type = type,
			.frag_face = frag_face,
			.set = ir->compiler->set,
	};
	int ret;

	ra_init(&ctx);
	ra_add_interference(&ctx);
	ret = ra_alloc(&ctx);
	ra_destroy(&ctx);

	return ret;
}
