/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

/** @file register_allocate.c
 *
 * Graph-coloring register allocator.
 *
 * The basic idea of graph coloring is to make a node in a graph for
 * everything that needs a register (color) number assigned, and make
 * edges in the graph between nodes that interfere (can't be allocated
 * to the same register at the same time).
 *
 * During the "simplify" process, any node with fewer edges than there
 * are registers can get assigned a register regardless of what its
 * neighbors choose, so that node is pushed on a stack and removed
 * (with its edges) from the graph. That likely causes other nodes to
 * become trivially colorable as well.
 *
 * Then during the "select" process, nodes are popped off of that
 * stack, their edges restored, and assigned a color different from
 * their neighbors. Because they were pushed on the stack only when
 * they were trivially colorable, any color chosen won't interfere
 * with the registers to be popped later.
 *
 * The downside to most graph coloring is that real hardware often has
 * limitations, like registers that need to be allocated to a node in
 * pairs, or aligned on some boundary. This implementation follows
 * the paper "Retargetable Graph-Coloring Register Allocation for
 * Irregular Architectures" by Johan Runeson and Sven-Olof Nyström.
 *
 * In this system, there are register classes each containing various
 * registers, and registers may interfere with other registers. For
 * example, one might have a class of base registers, and a class of
 * aligned register pairs that would each interfere with their pair of
 * the base registers. Each node has a register class it needs to be
 * assigned to. Define p(B) to be the size of register class B, and
 * q(B,C) to be the number of registers in B that the worst choice
 * register in C could conflict with. Then, this system replaces the
 * basic graph coloring test of "fewer edges from this node than there
 * are registers" with "For this node of class B, the sum of q(B,C)
 * for each neighbor node of class C is less than p(B)".
 *
 * A nice feature of the pq test is that q(B,C) can be computed once
 * up front and stored in a 2-dimensional array, so that the cost of
 * coloring a node is constant with the number of registers. We do
 * this during ra_set_finalize().
 */
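
/*
 * As a small worked example of the pq test (the register layout here is
 * hypothetical, purely for illustration): suppose there are four base
 * registers r0-r3 and an aligned-pair class containing p0 = {r0,r1} and
 * p1 = {r2,r3}. Then p(base) = 4 and p(pair) = 2. A pair overlaps two base
 * registers and no other pair, while a base register overlaps exactly one
 * pair, so q(base,pair) = 2, q(pair,base) = 1, q(pair,pair) = 1 and
 * q(base,base) = 1. A base-class node with one pair neighbor and one base
 * neighbor is therefore trivially colorable, since 2 + 1 < 4 = p(base).
 */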

#include <stdbool.h>
#include <stdlib.h>

#include "blob.h"
#include "ralloc.h"
#include "util/bitset.h"
#include "u_math.h"
#include "register_allocate.h"
#include "register_allocate_internal.h"

/**
 * Creates a set of registers for the allocator.
 *
 * mem_ctx is a ralloc context for the allocator. The reg set may be freed
 * using ralloc_free().
 */
struct ra_regs *
ra_alloc_reg_set(void *mem_ctx, unsigned int count, bool need_conflict_lists)
{
   unsigned int i;
   struct ra_regs *regs;

   regs = rzalloc(mem_ctx, struct ra_regs);
   regs->count = count;
   regs->regs = rzalloc_array(regs, struct ra_reg, count);
   regs->uses_conflict_lists = need_conflict_lists;

   for (i = 0; i < count; i++) {
      regs->regs[i].conflicts = rzalloc_array(regs->regs, BITSET_WORD,
                                              BITSET_WORDS(count));
      BITSET_SET(regs->regs[i].conflicts, i);

      if (need_conflict_lists) {
         struct ra_list *list = &regs->regs[i].conflict_list;
         list->cap = 16;
         list->elems = ralloc_array(regs->regs, unsigned int, list->cap);
         list->elems[list->size++] = i;
      }
   }

   return regs;
}
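
/*
 * Example of typical set construction (an illustrative sketch; the register
 * count and the single "scalar" class are hypothetical, not something this
 * allocator requires):
 *
 *    struct ra_regs *regs = ra_alloc_reg_set(mem_ctx, 4, true);
 *    struct ra_class *scalar = ra_alloc_reg_class(regs);
 *    for (unsigned i = 0; i < 4; i++)
 *       ra_class_add_reg(scalar, i);
 *    ra_set_finalize(regs, NULL);
 *
 * Once finalized, the set can back any number of interference graphs and is
 * freed with ralloc_free(regs).
 */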

/**
 * The register allocator by default prefers to allocate low register numbers,
 * since it was written for hardware (gen4/5 Intel) that is limited in its
 * multithreadedness by the number of registers used in a given shader.
 *
 * However, for hardware without that restriction, densely packed register
 * allocation can put serious constraints on instruction scheduling. This
 * function tells the allocator to rotate around the registers if possible as
 * it allocates the nodes.
 */
void
ra_set_allocate_round_robin(struct ra_regs *regs)
{
   regs->round_robin = true;
}

static void
ra_add_conflict_list(struct ra_regs *regs, unsigned int r1, unsigned int r2)
{
   struct ra_reg *reg1 = &regs->regs[r1];

   if (regs->uses_conflict_lists) {
      struct ra_list *list = &reg1->conflict_list;
      if (list->size == list->cap) {
         assert(list->cap);
         list->cap *= 2;
         list->elems = reralloc(regs, list->elems, unsigned int, list->cap);
      }
      list->elems[list->size++] = r2;
   }

   BITSET_SET(reg1->conflicts, r2);
}

void
ra_add_reg_conflict(struct ra_regs *regs, unsigned int r1, unsigned int r2)
{
   if (!BITSET_TEST(regs->regs[r1].conflicts, r2)) {
      ra_add_conflict_list(regs, r1, r2);
      ra_add_conflict_list(regs, r2, r1);
   }
}

/**
 * Adds a conflict between base_reg and reg, and also between reg and
 * anything that base_reg conflicts with.
 *
 * This can simplify code for setting up multiple register classes
 * which are aggregates of some base hardware registers, compared to
 * explicitly using ra_add_reg_conflict.
 */
void
ra_add_transitive_reg_conflict(struct ra_regs *regs,
                               unsigned int base_reg, unsigned int reg)
{
   ra_add_reg_conflict(regs, reg, base_reg);

   struct ra_list *list = &regs->regs[base_reg].conflict_list;
   for (unsigned int i = 0; i < list->size; i++)
      ra_add_reg_conflict(regs, reg, list->elems[i]);
}
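
/*
 * For example (with hypothetical register numbering): if registers 0-3 are
 * base registers and register 4 is an aggregate covering bases 0 and 1, the
 * aggregate can be wired up with two calls instead of listing every pairwise
 * conflict by hand:
 *
 *    ra_add_transitive_reg_conflict(regs, 0, 4);
 *    ra_add_transitive_reg_conflict(regs, 1, 4);
 *
 * Each call also copies onto register 4 any conflicts already recorded
 * against the base register.
 */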

/**
 * Sets up conflicts between base_reg and its two half registers reg0 and
 * reg1, but takes care not to add conflicts between reg0 and reg1.
 *
 * This is useful for architectures where full size registers are aliased by
 * two half size registers (e.g. 32-bit float and 16-bit float registers).
 */
void
ra_add_transitive_reg_pair_conflict(struct ra_regs *regs,
                                    unsigned int base_reg, unsigned int reg0, unsigned int reg1)
{
   ra_add_reg_conflict(regs, reg0, base_reg);
   ra_add_reg_conflict(regs, reg1, base_reg);

   struct ra_list *list = &regs->regs[base_reg].conflict_list;
   for (unsigned int i = 0; i < list->size; i++) {
      unsigned int conflict = list->elems[i];
      if (conflict != reg1)
         ra_add_reg_conflict(regs, reg0, conflict);
      if (conflict != reg0)
         ra_add_reg_conflict(regs, reg1, conflict);
   }
}
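
/*
 * For example (hypothetical numbering): if register 8 is a 32-bit register
 * aliased by 16-bit halves 16 and 17, then
 *
 *    ra_add_transitive_reg_pair_conflict(regs, 8, 16, 17);
 *
 * makes 16 and 17 each conflict with 8 (and with whatever 8 already
 * conflicts with), while still allowing 16 and 17 to be allocated at the
 * same time.
 */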

/**
 * Makes every conflict on the given register transitive. In other words,
 * every register that conflicts with r will now conflict with every other
 * register conflicting with r.
 *
 * This can simplify code for setting up multiple register classes
 * which are aggregates of some base hardware registers, compared to
 * explicitly using ra_add_reg_conflict.
 */
void
ra_make_reg_conflicts_transitive(struct ra_regs *regs, unsigned int r)
{
   struct ra_reg *reg = &regs->regs[r];
   int c;

   BITSET_FOREACH_SET(c, reg->conflicts, regs->count) {
      struct ra_reg *other = &regs->regs[c];
      unsigned i;
      for (i = 0; i < BITSET_WORDS(regs->count); i++)
         other->conflicts[i] |= reg->conflicts[i];
   }
}

struct ra_class *
ra_alloc_reg_class(struct ra_regs *regs)
{
   struct ra_class *class;

   regs->classes = reralloc(regs->regs, regs->classes, struct ra_class *,
                            regs->class_count + 1);

   class = rzalloc(regs, struct ra_class);
   class->regset = regs;

   /* Users may rely on the class index being allocated in order starting from 0. */
   class->index = regs->class_count++;
   regs->classes[class->index] = class;

   class->regs = rzalloc_array(class, BITSET_WORD, BITSET_WORDS(regs->count));

   return class;
}

/**
 * Creates a register class for contiguous register groups of a base register
 * set.
 *
 * A reg set using this type of register class must use only this type of
 * register class.
 */
struct ra_class *
ra_alloc_contig_reg_class(struct ra_regs *regs, int contig_len)
{
   struct ra_class *c = ra_alloc_reg_class(regs);

   assert(contig_len != 0);
   c->contig_len = contig_len;

   return c;
}
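
/*
 * Example (a sketch; the sizes are hypothetical): for a file of 16 scalar
 * registers where some values need two contiguous registers, one might set
 * up
 *
 *    struct ra_class *scalar = ra_alloc_contig_reg_class(regs, 1);
 *    struct ra_class *vec2 = ra_alloc_contig_reg_class(regs, 2);
 *    for (unsigned i = 0; i < 16; i++)
 *       ra_class_add_reg(scalar, i);
 *    for (unsigned i = 0; i < 15; i++)
 *       ra_class_add_reg(vec2, i);
 *
 * where a register number in the vec2 class stands for the contiguous pair
 * starting at that register.
 */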

struct ra_class *
ra_get_class_from_index(struct ra_regs *regs, unsigned int class)
{
   return regs->classes[class];
}

unsigned int
ra_class_index(struct ra_class *c)
{
   return c->index;
}

void
ra_class_add_reg(struct ra_class *class, unsigned int r)
{
   assert(r < class->regset->count);
   assert(r + class->contig_len <= class->regset->count);

   BITSET_SET(class->regs, r);
   class->p++;
}

/**
 * Returns true if the register belongs to the given class.
 */
static bool
reg_belongs_to_class(unsigned int r, struct ra_class *c)
{
   return BITSET_TEST(c->regs, r);
}

/**
 * Must be called after all conflicts and register classes have been
 * set up and before the register set is used for allocation.
 * To avoid costly q value computation, use the q_values parameter
 * to pass precomputed q values to this function.
 */
void
ra_set_finalize(struct ra_regs *regs, unsigned int **q_values)
{
   unsigned int b, c;

   for (b = 0; b < regs->class_count; b++) {
      regs->classes[b]->q = ralloc_array(regs, unsigned int, regs->class_count);
   }

   if (q_values) {
      for (b = 0; b < regs->class_count; b++) {
         for (c = 0; c < regs->class_count; c++) {
            regs->classes[b]->q[c] = q_values[b][c];
         }
      }
   } else {
      /* Compute, for each class B and C, how many regs of B an
       * allocation to C could conflict with.
       */
      for (b = 0; b < regs->class_count; b++) {
         for (c = 0; c < regs->class_count; c++) {
            struct ra_class *class_b = regs->classes[b];
            struct ra_class *class_c = regs->classes[c];

            if (class_b->contig_len && class_c->contig_len) {
               if (class_b->contig_len == 1 && class_c->contig_len == 1) {
                  /* If both classes are single registers, then they only
                   * conflict if there are any regs shared between them. This
                   * is a cheap test for a common case.
                   */
                  class_b->q[c] = 0;
                  for (int i = 0; i < BITSET_WORDS(regs->count); i++) {
                     if (class_b->regs[i] & class_c->regs[i]) {
                        class_b->q[c] = 1;
                        break;
                     }
                  }
               } else {
                  int max_possible_conflicts = class_b->contig_len + class_c->contig_len - 1;

                  unsigned int max_conflicts = 0;
                  unsigned int rc;
                  BITSET_FOREACH_SET(rc, regs->classes[c]->regs, regs->count) {
                     int start = MAX2(0, (int)rc - class_b->contig_len + 1);
                     int end = MIN2(regs->count, rc + class_c->contig_len);
                     unsigned int conflicts = 0;
                     for (int i = start; i < end; i++) {
                        if (BITSET_TEST(class_b->regs, i))
                           conflicts++;
                     }
                     max_conflicts = MAX2(max_conflicts, conflicts);
                     /* Unless a class has some restriction like the register
                      * bases are all aligned, then we should quickly find this
                      * limit and exit the loop.
                      */
                     if (max_conflicts == max_possible_conflicts)
                        break;
                  }
                  class_b->q[c] = max_conflicts;
               }
            } else {
               /* If you're doing contiguous classes, you have to be all in
                * because I don't want to deal with it.
                */
               assert(!class_b->contig_len && !class_c->contig_len);

               unsigned int rc;
               int max_conflicts = 0;

               BITSET_FOREACH_SET(rc, regs->classes[c]->regs, regs->count) {
                  int conflicts = 0;

                  struct ra_list *list = &regs->regs[rc].conflict_list;
                  for (unsigned int i = 0; i < list->size; i++) {
                     unsigned int rb = list->elems[i];
                     if (reg_belongs_to_class(rb, regs->classes[b]))
                        conflicts++;
                  }
                  max_conflicts = MAX2(max_conflicts, conflicts);
               }
               regs->classes[b]->q[c] = max_conflicts;
            }
         }
      }
   }

   if (regs->uses_conflict_lists) {
      for (b = 0; b < regs->count; b++) {
         struct ra_list *list = &regs->regs[b].conflict_list;
         ralloc_free(list->elems);
         list->size = 0;
         list->cap = 0;
      }
   }

   bool all_contig = true;
   for (int c = 0; c < regs->class_count; c++)
      all_contig &= regs->classes[c]->contig_len != 0;
   if (all_contig) {
      /* In this case, we never need the conflicts lists (and it would probably
       * be a mistake to look at conflicts when doing contiguous classes!), so
       * free them. TODO: Avoid the allocation in the first place.
       */
      for (int i = 0; i < regs->count; i++) {
         ralloc_free(regs->regs[i].conflicts);
         regs->regs[i].conflicts = NULL;
      }
   }
}
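
/*
 * Example of supplying precomputed q values instead of letting
 * ra_set_finalize() derive them (a sketch; class_count and the table
 * contents are whatever the driver computed offline for its class layout):
 *
 *    unsigned int **q_values = ralloc_array(mem_ctx, unsigned int *, class_count);
 *    for (unsigned b = 0; b < class_count; b++)
 *       q_values[b] = ralloc_array(q_values, unsigned int, class_count);
 *    ... fill each q_values[b][c] with q(B,C) ...
 *    ra_set_finalize(regs, q_values);
 *
 * Passing NULL instead makes ra_set_finalize() compute the values from the
 * recorded conflicts, as above.
 */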

void
ra_set_serialize(const struct ra_regs *regs, struct blob *blob)
{
   blob_write_uint32(blob, regs->count);
   blob_write_uint32(blob, regs->class_count);

   bool is_contig = regs->classes[0]->contig_len != 0;
   blob_write_uint8(blob, is_contig);

   if (!is_contig) {
      for (unsigned int r = 0; r < regs->count; r++) {
         struct ra_reg *reg = &regs->regs[r];
         blob_write_bytes(blob, reg->conflicts, BITSET_WORDS(regs->count) *
                                                sizeof(BITSET_WORD));
         assert(reg->conflict_list.size == 0);
      }
   }

   for (unsigned int c = 0; c < regs->class_count; c++) {
      struct ra_class *class = regs->classes[c];
      blob_write_bytes(blob, class->regs, BITSET_WORDS(regs->count) *
                                          sizeof(BITSET_WORD));
      blob_write_uint32(blob, class->contig_len);
      blob_write_uint32(blob, class->p);
      blob_write_bytes(blob, class->q, regs->class_count * sizeof(*class->q));
   }

   blob_write_uint32(blob, regs->round_robin);
}

struct ra_regs *
ra_set_deserialize(void *mem_ctx, struct blob_reader *blob)
{
   unsigned int reg_count = blob_read_uint32(blob);
   unsigned int class_count = blob_read_uint32(blob);
   bool is_contig = blob_read_uint8(blob);

   struct ra_regs *regs = ra_alloc_reg_set(mem_ctx, reg_count, false);
   assert(regs->count == reg_count);

   if (is_contig) {
      for (int i = 0; i < regs->count; i++) {
         ralloc_free(regs->regs[i].conflicts);
         regs->regs[i].conflicts = NULL;
      }
   } else {
      for (unsigned int r = 0; r < reg_count; r++) {
         struct ra_reg *reg = &regs->regs[r];
         blob_copy_bytes(blob, reg->conflicts, BITSET_WORDS(reg_count) *
                                               sizeof(BITSET_WORD));
      }
   }

   assert(regs->classes == NULL);
   regs->classes = ralloc_array(regs->regs, struct ra_class *, class_count);
   regs->class_count = class_count;

   for (unsigned int c = 0; c < class_count; c++) {
      struct ra_class *class = rzalloc(regs, struct ra_class);
      regs->classes[c] = class;
      class->regset = regs;
      class->index = c;

      class->regs = ralloc_array(class, BITSET_WORD, BITSET_WORDS(reg_count));
      blob_copy_bytes(blob, class->regs, BITSET_WORDS(reg_count) *
                                         sizeof(BITSET_WORD));

      class->contig_len = blob_read_uint32(blob);
      class->p = blob_read_uint32(blob);

      class->q = ralloc_array(regs->classes[c], unsigned int, class_count);
      blob_copy_bytes(blob, class->q, class_count * sizeof(*class->q));
   }

   regs->round_robin = blob_read_uint32(blob);

   return regs;
}
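
/*
 * Example round trip through a blob (a sketch, assuming the usual
 * util/blob.h helpers blob_init() and blob_reader_init()):
 *
 *    struct blob blob;
 *    blob_init(&blob);
 *    ra_set_serialize(regs, &blob);
 *
 *    struct blob_reader reader;
 *    blob_reader_init(&reader, blob.data, blob.size);
 *    struct ra_regs *copy = ra_set_deserialize(mem_ctx, &reader);
 *
 * The deserialized set comes back with p and q already filled in, so there
 * is no need to call ra_set_finalize() on it again.
 */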

static uint64_t
ra_get_num_adjacency_bits(uint64_t n)
{
   return (n * (n - 1)) / 2;
}

static uint64_t
ra_get_adjacency_bit_index(unsigned n1, unsigned n2)
{
   assert(n1 != n2);
   unsigned k1 = MAX2(n1, n2);
   unsigned k2 = MIN2(n1, n2);
   return ra_get_num_adjacency_bits(k1) + k2;
}
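
/*
 * This packs the strictly-lower triangle of the adjacency matrix row by row:
 * bit 0 is edge (1,0), bits 1-2 are edges (2,0) and (2,1), bits 3-5 are
 * edges (3,0), (3,1) and (3,2), and so on. For example, the edge between
 * nodes 1 and 3 uses k1 = 3 and k2 = 1, landing at bit 3*2/2 + 1 = 4.
 */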

static bool
ra_test_adjacency_bit(struct ra_graph *g, unsigned n1, unsigned n2)
{
   uint64_t index = ra_get_adjacency_bit_index(n1, n2);
   return BITSET_TEST(g->adjacency, index);
}

static void
ra_set_adjacency_bit(struct ra_graph *g, unsigned n1, unsigned n2)
{
   unsigned index = ra_get_adjacency_bit_index(n1, n2);
   BITSET_SET(g->adjacency, index);
}

static void
ra_clear_adjacency_bit(struct ra_graph *g, unsigned n1, unsigned n2)
{
   unsigned index = ra_get_adjacency_bit_index(n1, n2);
   BITSET_CLEAR(g->adjacency, index);
}

static void
ra_add_node_adjacency(struct ra_graph *g, unsigned int n1, unsigned int n2)
{
   assert(n1 != n2);

   int n1_class = g->nodes[n1].class;
   int n2_class = g->nodes[n2].class;
   g->nodes[n1].q_total += g->regs->classes[n1_class]->q[n2_class];

   struct ra_list *adj = &g->nodes[n1].adjacency;
   if (adj->size == adj->cap) {
      adj->cap = MAX2(64, adj->cap * 2);
      adj->elems = reralloc(g, adj->elems, unsigned int, adj->cap);
   }
   adj->elems[adj->size++] = n2;
}

static void
ra_node_remove_adjacency(struct ra_graph *g, unsigned int n1, unsigned int n2)
{
   assert(n1 != n2);
   ra_clear_adjacency_bit(g, n1, n2);

   int n1_class = g->nodes[n1].class;
   int n2_class = g->nodes[n2].class;
   g->nodes[n1].q_total -= g->regs->classes[n1_class]->q[n2_class];

   struct ra_list *adj = &g->nodes[n1].adjacency;
   for (unsigned i = 0; i < adj->size; i++) {
      if (adj->elems[i] == n2) {
         adj->elems[i] = adj->elems[adj->size - 1];
         adj->size--;
         break;
      }
   }
}

static void
ra_realloc_interference_graph(struct ra_graph *g, unsigned int alloc)
{
   if (alloc <= g->alloc)
      return;

   /* If we always have a whole number of BITSET_WORDs, it makes it much
    * easier to memset the top of the growing bitsets.
    */
   assert(g->alloc % BITSET_WORDBITS == 0);
   alloc = align(alloc, BITSET_WORDBITS);
   g->nodes = rerzalloc(g, g->nodes, struct ra_node, g->alloc, alloc);
   g->nodes_extra = rerzalloc(g, g->nodes_extra, struct ra_node_extra, g->alloc, alloc);
   g->adjacency = rerzalloc(g, g->adjacency, BITSET_WORD,
                            BITSET_WORDS(ra_get_num_adjacency_bits(g->alloc)),
                            BITSET_WORDS(ra_get_num_adjacency_bits(alloc)));

   /* Initialize new nodes. */
   for (unsigned i = g->alloc; i < alloc; i++) {
      struct ra_node *node = g->nodes + i;
      node->q_total = 0;
      node->reg = NO_REG;
      g->nodes_extra[i].forced_reg = NO_REG;
   }

   /* These are scratch values and don't need to be zeroed. We'll clear them
    * as part of ra_select() setup.
    */
   unsigned bitset_count = BITSET_WORDS(alloc);
   g->tmp.stack = reralloc(g, g->tmp.stack, unsigned int, alloc);
   g->tmp.in_stack = reralloc(g, g->tmp.in_stack, BITSET_WORD, bitset_count);

   g->tmp.reg_assigned = reralloc(g, g->tmp.reg_assigned, BITSET_WORD,
                                  bitset_count);
   g->tmp.pq_test = reralloc(g, g->tmp.pq_test, BITSET_WORD, bitset_count);
   g->tmp.min_q_total = reralloc(g, g->tmp.min_q_total, unsigned int,
                                 bitset_count);
   g->tmp.min_q_node = reralloc(g, g->tmp.min_q_node, unsigned int,
                                bitset_count);

   g->alloc = alloc;
}

struct ra_graph *
ra_alloc_interference_graph(struct ra_regs *regs, unsigned int count)
{
   struct ra_graph *g;

   g = rzalloc(NULL, struct ra_graph);
   g->regs = regs;
   g->count = count;
   ra_realloc_interference_graph(g, count);

   return g;
}

void
ra_resize_interference_graph(struct ra_graph *g, unsigned int count)
{
   g->count = count;
   if (count > g->alloc)
      ra_realloc_interference_graph(g, g->alloc * 2);
}

void ra_set_select_reg_callback(struct ra_graph *g,
                                ra_select_reg_callback callback,
                                void *data)
{
   g->select_reg_callback = callback;
   g->select_reg_callback_data = data;
}
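
/*
 * Example callback (a sketch; the callback signature follows how it is
 * invoked in ra_select() below): pick the highest available register rather
 * than the lowest. Here the register count is passed through the data
 * pointer, which is entirely up to the caller.
 *
 *    static unsigned int
 *    pick_highest_reg(unsigned int n, BITSET_WORD *regs, void *data)
 *    {
 *       unsigned int reg_count = *(unsigned int *)data;
 *       for (int r = reg_count - 1; r >= 0; r--) {
 *          if (BITSET_TEST(regs, r))
 *             return r;
 *       }
 *       unreachable("ra_select() only calls back with a non-empty set");
 *    }
 *
 *    ra_set_select_reg_callback(g, pick_highest_reg, &reg_count);
 */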

void
ra_set_node_class(struct ra_graph *g,
                  unsigned int n, struct ra_class *class)
{
   g->nodes[n].class = class->index;
}

struct ra_class *
ra_get_node_class(struct ra_graph *g,
                  unsigned int n)
{
   return g->regs->classes[g->nodes[n].class];
}

unsigned int
ra_add_node(struct ra_graph *g, struct ra_class *class)
{
   unsigned int n = g->count;
   ra_resize_interference_graph(g, g->count + 1);

   ra_set_node_class(g, n, class);

   return n;
}

void
ra_add_node_interference(struct ra_graph *g,
                         unsigned int n1, unsigned int n2)
{
   assert(n1 < g->count && n2 < g->count);
   if (n1 != n2 && !ra_test_adjacency_bit(g, n1, n2)) {
      ra_set_adjacency_bit(g, n1, n2);
      ra_add_node_adjacency(g, n1, n2);
      ra_add_node_adjacency(g, n2, n1);
   }
}
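
/*
 * Typical graph construction (an illustrative sketch; num_values, the
 * interference pairs and hw_reg[] come from the driver's own live-range
 * analysis):
 *
 *    struct ra_graph *g = ra_alloc_interference_graph(regs, num_values);
 *    for (unsigned i = 0; i < num_values; i++)
 *       ra_set_node_class(g, i, scalar);
 *    // ...for every pair (a, b) of values whose live ranges overlap:
 *    ra_add_node_interference(g, a, b);
 *
 *    if (ra_allocate(g)) {
 *       for (unsigned i = 0; i < num_values; i++)
 *          hw_reg[i] = ra_get_node_reg(g, i);
 *    }
 *    ralloc_free(g);
 */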

void
ra_reset_node_interference(struct ra_graph *g, unsigned int n)
{
   struct ra_list *adj = &g->nodes[n].adjacency;

   for (unsigned i = 0; i < adj->size; i++)
      ra_node_remove_adjacency(g, adj->elems[i], n);

   adj->size = 0;
}

static void
update_pq_info(struct ra_graph *g, unsigned int n)
{
   int i = n / BITSET_WORDBITS;
   int n_class = g->nodes[n].class;
   if (g->nodes[n].tmp.q_total < g->regs->classes[n_class]->p) {
      BITSET_SET(g->tmp.pq_test, n);
   } else if (g->tmp.min_q_total[i] != UINT_MAX) {
      /* Only update min_q_total and min_q_node if min_q_total != UINT_MAX so
       * that we don't update while we have stale data and accidentally mark
       * it as non-stale. Also, in order to remain consistent with the old
       * naive implementation of the algorithm, we do a lexicographical sort
       * to ensure that we always choose the node with the highest node index.
       */
      if (g->nodes[n].tmp.q_total < g->tmp.min_q_total[i] ||
          (g->nodes[n].tmp.q_total == g->tmp.min_q_total[i] &&
           n > g->tmp.min_q_node[i])) {
         g->tmp.min_q_total[i] = g->nodes[n].tmp.q_total;
         g->tmp.min_q_node[i] = n;
      }
   }
}

static void
add_node_to_stack(struct ra_graph *g, unsigned int n)
{
   int n_class = g->nodes[n].class;

   assert(!BITSET_TEST(g->tmp.in_stack, n));

   struct ra_list *adj = &g->nodes[n].adjacency;
   for (unsigned i = 0; i < adj->size; i++) {
      unsigned int n2 = adj->elems[i];
      unsigned int n2_class = g->nodes[n2].class;

      if (!BITSET_TEST(g->tmp.in_stack, n2) &&
          !BITSET_TEST(g->tmp.reg_assigned, n2)) {
         assert(g->nodes[n2].tmp.q_total >= g->regs->classes[n2_class]->q[n_class]);
         g->nodes[n2].tmp.q_total -= g->regs->classes[n2_class]->q[n_class];
         update_pq_info(g, n2);
      }
   }

   g->tmp.stack[g->tmp.stack_count] = n;
   g->tmp.stack_count++;
   BITSET_SET(g->tmp.in_stack, n);

   /* Flag the min_q_total for n's block as dirty so it gets recalculated */
   g->tmp.min_q_total[n / BITSET_WORDBITS] = UINT_MAX;
}

/**
 * Simplifies the interference graph by pushing all
 * trivially-colorable nodes into a stack of nodes to be colored,
 * removing them from the graph, and rinsing and repeating.
 *
 * If we encounter a case where we can't push any nodes on the stack, then
 * we optimistically choose a node and push it on the stack. We heuristically
 * push the node with the lowest total q value, since it has the fewest
 * neighbors and therefore is most likely to be allocated.
 */
static void
ra_simplify(struct ra_graph *g)
{
   bool progress = true;
   unsigned int stack_optimistic_start = UINT_MAX;

   /* Figure out the high bit and bit mask for the first iteration of a loop
    * over BITSET_WORDs.
    */
   const unsigned int top_word_high_bit = (g->count - 1) % BITSET_WORDBITS;

   /* Do a quick pre-pass to set things up */
   g->tmp.stack_count = 0;
   for (int i = BITSET_WORDS(g->count) - 1, high_bit = top_word_high_bit;
        i >= 0; i--, high_bit = BITSET_WORDBITS - 1) {
      g->tmp.in_stack[i] = 0;
      g->tmp.reg_assigned[i] = 0;
      g->tmp.pq_test[i] = 0;
      g->tmp.min_q_total[i] = UINT_MAX;
      g->tmp.min_q_node[i] = UINT_MAX;
      for (int j = high_bit; j >= 0; j--) {
         unsigned int n = i * BITSET_WORDBITS + j;
         g->nodes[n].reg = g->nodes_extra[n].forced_reg;
         g->nodes[n].tmp.q_total = g->nodes[n].q_total;
         if (g->nodes[n].reg != NO_REG)
            g->tmp.reg_assigned[i] |= BITSET_BIT(j);
         update_pq_info(g, n);
      }
   }

   while (progress) {
      unsigned int min_q_total = UINT_MAX;
      unsigned int min_q_node = UINT_MAX;

      progress = false;

      for (int i = BITSET_WORDS(g->count) - 1, high_bit = top_word_high_bit;
           i >= 0; i--, high_bit = BITSET_WORDBITS - 1) {
         BITSET_WORD mask = ~(BITSET_WORD)0 >> (31 - high_bit);

         BITSET_WORD skip = g->tmp.in_stack[i] | g->tmp.reg_assigned[i];
         if (skip == mask)
            continue;

         BITSET_WORD pq = g->tmp.pq_test[i] & ~skip;
         if (pq) {
            /* In this case, we have nodes we can immediately push onto the
             * stack. This also means that we're guaranteed to make progress
             * and we don't need to bother updating lowest_q_total because we
             * know we're going to loop again before attempting to do anything
             * optimistic.
             */
            for (int j = high_bit; j >= 0; j--) {
               if (pq & BITSET_BIT(j)) {
                  unsigned int n = i * BITSET_WORDBITS + j;
                  assert(n < g->count);
                  add_node_to_stack(g, n);
                  /* add_node_to_stack() may update pq_test for this word so
                   * we need to update our local copy.
                   */
                  pq = g->tmp.pq_test[i] & ~skip;
                  progress = true;
               }
            }
         } else if (!progress) {
            if (g->tmp.min_q_total[i] == UINT_MAX) {
               /* The min_q_total and min_q_node are dirty because we added
                * one of these nodes to the stack. It needs to be
                * recalculated.
                */
               for (int j = high_bit; j >= 0; j--) {
                  if (skip & BITSET_BIT(j))
                     continue;

                  unsigned int n = i * BITSET_WORDBITS + j;
                  assert(n < g->count);
                  if (g->nodes[n].tmp.q_total < g->tmp.min_q_total[i]) {
                     g->tmp.min_q_total[i] = g->nodes[n].tmp.q_total;
                     g->tmp.min_q_node[i] = n;
                  }
               }
            }
            if (g->tmp.min_q_total[i] < min_q_total) {
               min_q_node = g->tmp.min_q_node[i];
               min_q_total = g->tmp.min_q_total[i];
            }
         }
      }

      if (!progress && min_q_total != UINT_MAX) {
         if (stack_optimistic_start == UINT_MAX)
            stack_optimistic_start = g->tmp.stack_count;

         add_node_to_stack(g, min_q_node);
         progress = true;
      }
   }

   g->tmp.stack_optimistic_start = stack_optimistic_start;
}

bool
ra_class_allocations_conflict(struct ra_class *c1, unsigned int r1,
                              struct ra_class *c2, unsigned int r2)
{
   if (c1->contig_len) {
      assert(c2->contig_len);

      int r1_end = r1 + c1->contig_len;
      int r2_end = r2 + c2->contig_len;
      return !(r2 >= r1_end || r1 >= r2_end);
   } else {
      return BITSET_TEST(c1->regset->regs[r1].conflicts, r2);
   }
}
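
/*
 * For example, with contiguous classes: an allocation of length 2 at r1 = 2
 * occupies {2,3} and an allocation of length 2 at r2 = 3 occupies {3,4};
 * since r2 < r1_end (3 < 4) and r1 < r2_end (2 < 5), they conflict. Moving
 * the second allocation to r2 = 4 gives {4,5}, which does not.
 */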

static struct ra_node *
ra_find_conflicting_neighbor(struct ra_graph *g, unsigned int n, unsigned int r)
{
   struct ra_list *adj = &g->nodes[n].adjacency;
   for (unsigned i = 0; i < adj->size; i++) {
      unsigned int n2 = adj->elems[i];

      /* If our adjacent node is in the stack, it's not allocated yet. */
      if (!BITSET_TEST(g->tmp.in_stack, n2) &&
          ra_class_allocations_conflict(g->regs->classes[g->nodes[n].class], r,
                                        g->regs->classes[g->nodes[n2].class], g->nodes[n2].reg)) {
         return &g->nodes[n2];
      }
   }

   return NULL;
}

/* Computes a bitfield of what regs are available for a given register
 * selection.
 *
 * This lets drivers implement a more complicated policy than our simple first
 * or round robin policies (which don't require knowing the whole bitset)
 */
static bool
ra_compute_available_regs(struct ra_graph *g, unsigned int n, BITSET_WORD *regs)
{
   struct ra_class *c = g->regs->classes[g->nodes[n].class];

   /* Populate with the set of regs that are in the node's class. */
   memcpy(regs, c->regs, BITSET_WORDS(g->regs->count) * sizeof(BITSET_WORD));

   /* Remove any regs that conflict with nodes that we're adjacent to and have
    * already colored.
    */
   struct ra_list *adj = &g->nodes[n].adjacency;
   for (unsigned i = 0; i < adj->size; i++) {
      struct ra_node *n2 = &g->nodes[adj->elems[i]];
      struct ra_class *n2c = g->regs->classes[n2->class];

      if (!BITSET_TEST(g->tmp.in_stack, adj->elems[i])) {
         if (c->contig_len) {
            int start = MAX2(0, (int)n2->reg - c->contig_len + 1);
            int end = MIN2(g->regs->count, n2->reg + n2c->contig_len);
            for (int r = start; r < end; r++)
               BITSET_CLEAR(regs, r);
         } else {
            for (int j = 0; j < BITSET_WORDS(g->regs->count); j++)
               regs[j] &= ~g->regs->regs[n2->reg].conflicts[j];
         }
      }
   }

   for (int i = 0; i < BITSET_WORDS(g->regs->count); i++) {
      if (regs[i])
         return true;
   }

   return false;
}

/**
 * Pops nodes from the stack back into the graph, coloring them with
 * registers as they go.
 *
 * If all nodes were trivially colorable, then this must succeed. If
 * not (optimistic coloring), then it may return false.
 */
static bool
ra_select(struct ra_graph *g)
{
   int start_search_reg = 0;
   BITSET_WORD *select_regs = NULL;

   if (g->select_reg_callback)
      select_regs = malloc(BITSET_WORDS(g->regs->count) * sizeof(BITSET_WORD));

   while (g->tmp.stack_count != 0) {
      unsigned int ri;
      unsigned int r = -1;
      int n = g->tmp.stack[g->tmp.stack_count - 1];
      struct ra_class *c = g->regs->classes[g->nodes[n].class];

      /* set this to false even if we return here so that
       * ra_get_best_spill_node() considers this node later.
       */
      BITSET_CLEAR(g->tmp.in_stack, n);

      if (g->select_reg_callback) {
         if (!ra_compute_available_regs(g, n, select_regs)) {
            free(select_regs);
            return false;
         }

         r = g->select_reg_callback(n, select_regs, g->select_reg_callback_data);
         assert(r < g->regs->count);
      } else {
         /* Find the lowest-numbered reg which is not used by a member
          * of the graph adjacent to us.
          */
         for (ri = 0; ri < g->regs->count; ri++) {
            r = (start_search_reg + ri) % g->regs->count;
            if (!reg_belongs_to_class(r, c))
               continue;

            struct ra_node *conflicting = ra_find_conflicting_neighbor(g, n, r);
            if (!conflicting) {
               /* Found a reg! */
               break;
            }
            if (g->regs->classes[conflicting->class]->contig_len) {
               /* Skip to point at the last base reg of the conflicting reg
                * allocation -- the loop will increment us to check the next reg
                * after the conflicting allocation.
                */
               unsigned conflicting_end = (conflicting->reg +
                                           g->regs->classes[conflicting->class]->contig_len - 1);
               assert(conflicting_end >= r);
               ri += conflicting_end - r;
            }
         }

         if (ri >= g->regs->count)
            return false;
      }

      g->nodes[n].reg = r;
      g->tmp.stack_count--;

      /* Rotate the starting point except for any nodes above the lowest
       * optimistically colorable node. The likelihood that we will succeed
       * at allocating optimistically colorable nodes is highly dependent on
       * the way that the previous nodes popped off the stack are laid out.
       * The round-robin strategy increases the fragmentation of the register
       * file and decreases the number of nearby nodes assigned to the same
       * color, which increases the likelihood of spilling with respect to the
       * dense packing strategy.
       */
      if (g->regs->round_robin &&
          g->tmp.stack_count - 1 <= g->tmp.stack_optimistic_start)
         start_search_reg = r + 1;
   }

   free(select_regs);

   return true;
}

bool
ra_allocate(struct ra_graph *g)
{
   ra_simplify(g);
   return ra_select(g);
}

unsigned int
ra_get_node_reg(struct ra_graph *g, unsigned int n)
{
   if (g->nodes_extra[n].forced_reg != NO_REG)
      return g->nodes_extra[n].forced_reg;
   else
      return g->nodes[n].reg;
}

/**
 * Forces a node to a specific register. This can be used to avoid
 * creating a register class containing one node when handling data
 * that must live in a fixed location and is known to not conflict
 * with other forced register assignments (as is common with shader
 * input data). These nodes do not end up in the stack during
 * ra_simplify(), and thus at ra_select() time it is as if they were
 * the first popped off the stack and assigned their fixed locations.
 * Nodes that use this function do not need to be assigned a register
 * class.
 *
 * Must be called before ra_simplify().
 */
void
ra_set_node_reg(struct ra_graph *g, unsigned int n, unsigned int reg)
{
   g->nodes_extra[n].forced_reg = reg;
}
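
/*
 * For example (hypothetical numbering): a shader payload value that must
 * live in hardware register 0 can simply be pinned with
 *
 *    ra_set_node_reg(g, payload_node, 0);
 *
 * and the allocator will color every other node around it.
 */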

static float
ra_get_spill_benefit(struct ra_graph *g, unsigned int n)
{
   float benefit = 0;
   int n_class = g->nodes[n].class;

   /* Define the benefit of eliminating an interference between n, n2
    * through spilling as q(C, B) / p(C). This is similar to the
    * "count number of edges" approach of traditional graph coloring,
    * but takes classes into account.
    */
   struct ra_list *adj = &g->nodes[n].adjacency;
   for (unsigned i = 0; i < adj->size; i++) {
      unsigned int n2 = adj->elems[i];
      unsigned int n2_class = g->nodes[n2].class;
      benefit += ((float)g->regs->classes[n_class]->q[n2_class] /
                  g->regs->classes[n_class]->p);
   }

   return benefit;
}

float
ra_debug_get_spill_benefit(struct ra_graph *g, unsigned int n)
{
   return ra_get_spill_benefit(g, n);
}

/**
 * Returns a node number to be spilled according to the cost/benefit using
 * the pq test, or -1 if there are no spillable nodes.
 */
int
ra_get_best_spill_node(struct ra_graph *g)
{
   unsigned int best_node = -1;
   float best_benefit = 0.0;
   unsigned int n;

   /* Consider any nodes that we colored successfully or the node we failed to
    * color for spilling. When we failed to color a node in ra_select(), we
    * only considered these nodes, so spilling any other ones would not result
    * in us making progress.
    */
   for (n = 0; n < g->count; n++) {
      float cost = g->nodes_extra[n].spill_cost;
      float benefit;

      if (cost <= 0.0f)
         continue;

      if (BITSET_TEST(g->tmp.in_stack, n))
         continue;

      benefit = ra_get_spill_benefit(g, n);

      if (benefit / cost > best_benefit) {
         best_benefit = benefit / cost;
         best_node = n;
      }
   }

   return best_node;
}
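
/*
 * A typical spill loop looks like this (a sketch; spill_value() stands for
 * the driver-specific rewrite of a value into loads/stores, and the graph or
 * its interference has to be rebuilt afterwards):
 *
 *    while (!ra_allocate(g)) {
 *       int n = ra_get_best_spill_node(g);
 *       if (n == -1)
 *          return false;
 *       spill_value(n);
 *       // rebuild interference for the rewritten program
 *    }
 */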

/**
 * Only nodes with a spill cost set (cost != 0.0) will be considered
 * for register spilling.
 */
void
ra_set_node_spill_cost(struct ra_graph *g, unsigned int n, float cost)
{
   g->nodes_extra[n].spill_cost = cost;
}

float
ra_debug_get_node_spill_cost(struct ra_graph *g, unsigned int n)
{
   return g->nodes_extra[n].spill_cost;
}
