1 /*
2 * Copyright (c) 2017 Lima Project
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sub license,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the
12 * next paragraph) shall be included in all copies or substantial portions
13 * of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 *
23 */
24
25 #include "util/ralloc.h"
26 #include "util/register_allocate.h"
27 #include "util/u_debug.h"
28
29 #include "ppir.h"
30 #include "lima_context.h"
31
32 #define PPIR_REG_COUNT (6 * 4)
33
/* Register classes used with the Mesa register allocator.
 * The PP register file is modelled as vec4 physical registers (see
 * PPIR_REG_COUNT and the stride-4 loops in ppir_regalloc_init); a
 * virtual vecN value occupies N contiguous channels of one of them. */
enum ppir_ra_reg_class {
   /* vecN classes: may start at any channel that leaves room for N
    * contiguous components within a physical register */
   ppir_ra_reg_class_vec1,
   ppir_ra_reg_class_vec2,
   ppir_ra_reg_class_vec3,
   ppir_ra_reg_class_vec4,

   /* 4 reg class for load/store instr regs:
    * load/store instr has no swizzle field, so the (virtual) register
    * must be allocated at the beginning of a (physical) register,
    */
   ppir_ra_reg_class_head_vec1,
   ppir_ra_reg_class_head_vec2,
   ppir_ra_reg_class_head_vec3,
   ppir_ra_reg_class_head_vec4,

   /* class count; not a real class */
   ppir_ra_reg_class_num,
};
51
ppir_regalloc_init(void * mem_ctx)52 struct ra_regs *ppir_regalloc_init(void *mem_ctx)
53 {
54 struct ra_regs *ret = ra_alloc_reg_set(mem_ctx, PPIR_REG_COUNT, false);
55 if (!ret)
56 return NULL;
57
58 /* Classes for contiguous 1-4 channel groups anywhere within a register. */
59 struct ra_class *classes[ppir_ra_reg_class_num];
60 for (int i = 0; i < ppir_ra_reg_class_head_vec1; i++) {
61 classes[i] = ra_alloc_contig_reg_class(ret, i + 1);
62
63 for (int j = 0; j < PPIR_REG_COUNT; j += 4) {
64 for (int swiz = 0; swiz < (4 - i); swiz++)
65 ra_class_add_reg(classes[i], j + swiz);
66 }
67 }
68
69 /* Classes for contiguous 1-4 channels with a start channel of .x */
70 for (int i = ppir_ra_reg_class_head_vec1; i < ppir_ra_reg_class_num; i++) {
71 classes[i] = ra_alloc_contig_reg_class(ret, i - ppir_ra_reg_class_head_vec1 + 1);
72
73 for (int j = 0; j < PPIR_REG_COUNT; j += 4)
74 ra_class_add_reg(classes[i], j);
75 }
76
77 ra_set_finalize(ret, NULL);
78 return ret;
79 }
80
/* Append every SSA destination in the program to the compiler-wide
 * register list (comp->reg_list) and bump comp->reg_num accordingly,
 * so SSA values participate in register allocation.  End nodes,
 * unscheduled nodes and constants are skipped. */
static void ppir_regalloc_update_reglist_ssa(ppir_compiler *comp)
{
   list_for_each_entry(ppir_block, block, &comp->block_list, list) {
      list_for_each_entry(ppir_node, node, &block->node_list, list) {
         if (node->is_end || !node->instr || node->op == ppir_op_const)
            continue;

         ppir_dest *dest = ppir_node_get_dest(node);
         if (!dest || dest->type != ppir_target_ssa)
            continue;

         list_addtail(&dest->ssa.list, &comp->reg_list);
         comp->reg_num++;
      }
   }
}
104
/* Dump the register-allocation result: for each instruction, every
 * occupied slot is printed as "(node_index|dest_reg|src_regs...)".
 * Only called when LIMA_DEBUG_PP is enabled (see ppir_regalloc_prog_try). */
static void ppir_regalloc_print_result(ppir_compiler *comp)
{
   printf("======ppir regalloc result======\n");
   list_for_each_entry(ppir_block, block, &comp->block_list, list) {
      list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {
         printf("%03d:", instr->index);
         for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) {
            ppir_node *node = instr->slots[i];
            if (!node)
               continue;

            printf(" (%d|", node->index);

            ppir_dest *dest = ppir_node_get_dest(node);
            if (dest)
               printf("%d", ppir_target_get_dest_reg_index(dest));

            printf("|");

            /* 'j' here: the original declared a second 'i', shadowing
             * the slot loop variable above */
            for (int j = 0; j < ppir_node_get_src_num(node); j++) {
               if (j)
                  printf(" ");
               printf("%d", ppir_target_get_src_reg_index(ppir_node_get_src(node, j)));
            }

            printf(")");
         }
         printf("\n");
      }
   }
   printf("--------------------------\n");
}
137
create_new_instr_after(ppir_block * block,ppir_instr * ref,ppir_node * node)138 static bool create_new_instr_after(ppir_block *block, ppir_instr *ref,
139 ppir_node *node)
140 {
141 ppir_instr *newinstr = ppir_instr_create(block);
142 if (unlikely(!newinstr))
143 return false;
144
145 list_del(&newinstr->list);
146 list_add(&newinstr->list, &ref->list);
147
148 if (!ppir_instr_insert_node(newinstr, node))
149 return false;
150
151 list_for_each_entry_from(ppir_instr, instr, ref, &block->instr_list, list) {
152 instr->seq++;
153 }
154 newinstr->seq = ref->seq+1;
155 newinstr->scheduled = true;
156 return true;
157 }
158
create_new_instr_before(ppir_block * block,ppir_instr * ref,ppir_node * node)159 static bool create_new_instr_before(ppir_block *block, ppir_instr *ref,
160 ppir_node *node)
161 {
162 ppir_instr *newinstr = ppir_instr_create(block);
163 if (unlikely(!newinstr))
164 return false;
165
166 list_del(&newinstr->list);
167 list_addtail(&newinstr->list, &ref->list);
168
169 if (!ppir_instr_insert_node(newinstr, node))
170 return false;
171
172 list_for_each_entry_from(ppir_instr, instr, ref, &block->instr_list, list) {
173 instr->seq++;
174 }
175 newinstr->seq = ref->seq-1;
176 newinstr->scheduled = true;
177 return true;
178 }
179
/* Rewrite a source operand that reads a spilled register so it fetches
 * the value back from temporary memory.
 *
 * A load_temp is emitted for the consuming node.  If the node's
 * instruction has a free uniform slot (and the node is not in the
 * varying/texld slot), the source reads the uniform pipeline register
 * directly; otherwise the load goes into a new instruction and a mov
 * copies the pipeline register into a fresh SSA value.
 *
 * *fill_node caches the fill produced for this node so additional
 * sources referencing the same spilled register reuse it instead of
 * loading again.  Returns false on allocation/insertion failure. */
static bool ppir_update_spilled_src(ppir_compiler *comp, ppir_block *block,
                                    ppir_node *node, ppir_src *src,
                                    ppir_node **fill_node)
{
   /* nodes might have multiple references to the same value.
    * avoid creating unnecessary loads for the same fill by
    * saving the node resulting from the temporary load */
   if (*fill_node)
      goto update_src;

   int num_components = src->reg->num_components;

   /* alloc new node to load value */
   ppir_node *load_node = ppir_node_create(block, ppir_op_load_temp, -1, 0);
   if (!load_node)
      return false;
   list_addtail(&load_node->list, &node->list);
   comp->num_fills++;

   ppir_load_node *load = ppir_node_to_load(load_node);

   load->index = -comp->prog->state.stack_size; /* index sizes are negative */
   load->num_components = num_components;

   /* load_temp writes through the uniform pipeline register */
   ppir_dest *ld_dest = &load->dest;
   ld_dest->type = ppir_target_pipeline;
   ld_dest->pipeline = ppir_pipeline_reg_uniform;
   ld_dest->write_mask = u_bit_consecutive(0, num_components);

   /* If the uniform slot is empty, we can insert the load_temp
    * there and use it directly. Exceptionally, if the node is in the
    * varying or texld slot, this doesn't work. */
   if (!node->instr->slots[PPIR_INSTR_SLOT_UNIFORM] &&
       node->instr_pos != PPIR_INSTR_SLOT_VARYING &&
       node->instr_pos != PPIR_INSTR_SLOT_TEXLD) {
      ppir_node_target_assign(src, load_node);
      *fill_node = load_node;
      return ppir_instr_insert_node(node->instr, load_node);
   }

   /* Uniform slot was taken, so fall back to a new instruction with a mov */
   if (!create_new_instr_before(block, node->instr, load_node))
      return false;

   /* Create move node */
   ppir_node *move_node = ppir_node_create(block, ppir_op_mov, -1 , 0);
   if (unlikely(!move_node))
      return false;
   list_addtail(&move_node->list, &node->list);

   ppir_alu_node *move_alu = ppir_node_to_alu(move_node);

   /* the mov reads the uniform pipeline register with identity swizzle */
   move_alu->num_src = 1;
   move_alu->src->type = ppir_target_pipeline;
   move_alu->src->pipeline = ppir_pipeline_reg_uniform;
   for (int i = 0; i < 4; i++)
      move_alu->src->swizzle[i] = i;

   ppir_dest *alu_dest = &move_alu->dest;
   alu_dest->type = ppir_target_ssa;
   alu_dest->ssa.num_components = num_components;
   /* spilled = true gives this register spill cost 0, so it is never
    * chosen for spilling again (see ppir_regalloc_choose_spill_node) */
   alu_dest->ssa.spilled = true;
   alu_dest->write_mask = u_bit_consecutive(0, num_components);

   /* register the new SSA value with the allocator */
   list_addtail(&alu_dest->ssa.list, &comp->reg_list);
   comp->reg_num++;

   if (!ppir_instr_insert_node(load_node->instr, move_node))
      return false;

   /* insert the new node as predecessor */
   ppir_node_foreach_pred_safe(node, dep) {
      ppir_node *pred = dep->pred;
      ppir_node_remove_dep(dep);
      ppir_node_add_dep(load_node, pred, ppir_dep_src);
   }
   ppir_node_add_dep(node, move_node, ppir_dep_src);
   ppir_node_add_dep(move_node, load_node, ppir_dep_src);

   *fill_node = move_node;

update_src:
   /* switch node src to use the fill node dest */
   ppir_node_target_assign(src, *fill_node);

   return true;
}
267
ppir_update_spilled_dest_load(ppir_compiler * comp,ppir_block * block,ppir_node * node)268 static bool ppir_update_spilled_dest_load(ppir_compiler *comp, ppir_block *block,
269 ppir_node *node)
270 {
271 ppir_dest *dest = ppir_node_get_dest(node);
272 assert(dest != NULL);
273 assert(dest->type == ppir_target_register);
274 ppir_reg *reg = dest->reg;
275 int num_components = reg->num_components;
276
277 /* alloc new node to load value */
278 ppir_node *load_node = ppir_node_create(block, ppir_op_load_temp, -1, 0);
279 if (!load_node)
280 return NULL;
281 list_addtail(&load_node->list, &node->list);
282 comp->num_fills++;
283
284 ppir_load_node *load = ppir_node_to_load(load_node);
285
286 load->index = -comp->prog->state.stack_size; /* index sizes are negative */
287 load->num_components = num_components;
288
289 load->dest.type = ppir_target_pipeline;
290 load->dest.pipeline = ppir_pipeline_reg_uniform;
291 load->dest.write_mask = u_bit_consecutive(0, num_components);
292
293 /* New instruction is needed since we're updating a dest register
294 * and we can't write to the uniform pipeline reg */
295 if (!create_new_instr_before(block, node->instr, load_node))
296 return false;
297
298 /* Create move node */
299 ppir_node *move_node = ppir_node_create(block, ppir_op_mov, -1 , 0);
300 if (unlikely(!move_node))
301 return false;
302 list_addtail(&move_node->list, &node->list);
303
304 ppir_alu_node *move_alu = ppir_node_to_alu(move_node);
305
306 move_alu->num_src = 1;
307 move_alu->src->type = ppir_target_pipeline;
308 move_alu->src->pipeline = ppir_pipeline_reg_uniform;
309 for (int i = 0; i < 4; i++)
310 move_alu->src->swizzle[i] = i;
311
312 move_alu->dest.type = ppir_target_register;
313 move_alu->dest.reg = reg;
314 move_alu->dest.write_mask = u_bit_consecutive(0, num_components);
315
316 if (!ppir_instr_insert_node(load_node->instr, move_node))
317 return false;
318
319 ppir_node_foreach_pred_safe(node, dep) {
320 ppir_node *pred = dep->pred;
321 ppir_node_remove_dep(dep);
322 ppir_node_add_dep(load_node, pred, ppir_dep_src);
323 }
324 ppir_node_add_dep(node, move_node, ppir_dep_src);
325 ppir_node_add_dep(move_node, load_node, ppir_dep_src);
326
327 return true;
328 }
329
/* Emit a store_temp that writes the node's (spilled) destination value
 * out to temporary memory after the node executes.
 *
 * The store reuses the node's own instruction when its store-temp slot
 * is free (and the node is not in the combine slot); otherwise it goes
 * into a new instruction right after.  Returns false on
 * allocation/insertion failure. */
static bool ppir_update_spilled_dest(ppir_compiler *comp, ppir_block *block,
                                     ppir_node *node)
{
   ppir_dest *dest = ppir_node_get_dest(node);
   assert(dest != NULL);
   ppir_reg *reg = ppir_dest_get_reg(dest);

   /* alloc new node to store value */
   ppir_node *store_node = ppir_node_create(block, ppir_op_store_temp, -1, 0);
   if (!store_node)
      return false;
   list_addtail(&store_node->list, &node->list);
   comp->num_spills++;

   ppir_store_node *store = ppir_node_to_store(store_node);

   store->index = -comp->prog->state.stack_size; /* index sizes are negative */

   ppir_node_target_assign(&store->src, node);
   store->num_components = reg->num_components;

   /* insert the new node as successor */
   ppir_node_foreach_succ_safe(node, dep) {
      ppir_node *succ = dep->succ;
      ppir_node_remove_dep(dep);
      ppir_node_add_dep(succ, store_node, ppir_dep_src);
   }
   ppir_node_add_dep(store_node, node, ppir_dep_src);

   /* If the store temp slot is empty, we can insert the store_temp
    * there and use it directly. Exceptionally, if the node is in the
    * combine slot, this doesn't work. */
   if (!node->instr->slots[PPIR_INSTR_SLOT_STORE_TEMP] &&
       node->instr_pos != PPIR_INSTR_SLOT_ALU_COMBINE)
      return ppir_instr_insert_node(node->instr, store_node);

   /* Not possible to merge store, so fall back to a new instruction */
   return create_new_instr_after(block, node->instr, store_node);
}
369
/* Spill 'chosen': rewrite every definition of the register to store
 * into temporary memory and every use to load it back.
 * Returns false if any of the rewrite helpers fails. */
static bool ppir_regalloc_spill_reg(ppir_compiler *comp, ppir_reg *chosen)
{
   list_for_each_entry(ppir_block, block, &comp->block_list, list) {
      list_for_each_entry(ppir_node, node, &block->node_list, list) {

         ppir_dest *dest = ppir_node_get_dest(node);
         if (dest && ppir_dest_get_reg(dest) == chosen) {
            /* If dest is a register, it might be updating only some its
             * components, so need to load the existing value first */
            if (dest->type == ppir_target_register) {
               if (!ppir_update_spilled_dest_load(comp, block, node))
                  return false;
            }
            if (!ppir_update_spilled_dest(comp, block, node))
               return false;
         }

         /* fill_node is shared across this node's sources (see below) */
         ppir_node *fill_node = NULL;
         /* nodes might have multiple references to the same value.
          * avoid creating unnecessary loads for the same fill by
          * saving the node resulting from the temporary load */
         for (int i = 0; i < ppir_node_get_src_num(node); i++) {
            ppir_src *src = ppir_node_get_src(node, i);
            ppir_reg *reg = ppir_src_get_reg(src);
            if (reg == chosen) {
               if (!ppir_update_spilled_src(comp, block, node, src, &fill_node))
                  return false;
            }
         }
      }
   }

   return true;
}
404
/* Choose which register to spill, via ra_get_best_spill_node() with a
 * cost model layered on top:
 *  - base cost 4 / num_components, so wide registers (which free more
 *    channels) are cheaper to spill;
 *  - costs are scaled up for registers read in instructions whose
 *    uniform slot is already taken, or written in instructions whose
 *    store-temp slot is already taken, since spilling those needs
 *    extra instructions (see the fallback paths in the spill helpers);
 *  - registers produced by earlier spills get cost 0 and are never
 *    picked again.
 * Returns the chosen register, marked spilled and head, or NULL when
 * ra has no candidate. */
static ppir_reg *ppir_regalloc_choose_spill_node(ppir_compiler *comp,
                                                 struct ra_graph *g)
{
   /* indexed by reg->regalloc_index (densely assigned in
    * ppir_regalloc_reset_liveness_info) */
   float spill_costs[comp->reg_num];
   /* experimentally determined, it seems to be worth scaling cost of
    * regs in instructions that have used uniform/store_temp slots,
    * but not too much as to offset the num_components base cost. */
   const float slot_scale = 1.1f;

   list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
      if (reg->spilled) {
         /* not considered for spilling */
         spill_costs[reg->regalloc_index] = 0.0f;
         continue;
      }

      /* It is beneficial to spill registers with higher component number,
       * so increase the cost of spilling registers with few components */
      float spill_cost = 4.0f / (float)reg->num_components;
      spill_costs[reg->regalloc_index] = spill_cost;
   }

   list_for_each_entry(ppir_block, block, &comp->block_list, list) {
      list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {
         if (instr->slots[PPIR_INSTR_SLOT_UNIFORM]) {
            /* uniform slot busy: filling any source read here would
             * need a new instruction, so make those regs pricier */
            for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) {
               ppir_node *node = instr->slots[i];
               if (!node)
                  continue;
               for (int j = 0; j < ppir_node_get_src_num(node); j++) {
                  ppir_src *src = ppir_node_get_src(node, j);
                  if (!src)
                     continue;
                  ppir_reg *reg = ppir_src_get_reg(src);
                  if (!reg)
                     continue;

                  spill_costs[reg->regalloc_index] *= slot_scale;
               }
            }
         }
         if (instr->slots[PPIR_INSTR_SLOT_STORE_TEMP]) {
            /* store-temp slot busy: spilling any dest written here
             * would need a new instruction, so make those regs pricier */
            for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) {
               ppir_node *node = instr->slots[i];
               if (!node)
                  continue;
               ppir_dest *dest = ppir_node_get_dest(node);
               if (!dest)
                  continue;
               ppir_reg *reg = ppir_dest_get_reg(dest);
               if (!reg)
                  continue;

               spill_costs[reg->regalloc_index] *= slot_scale;
            }
         }
      }
   }

   for (int i = 0; i < comp->reg_num; i++)
      ra_set_node_spill_cost(g, i, spill_costs[i]);

   int r = ra_get_best_spill_node(g);
   if (r == -1)
      return NULL;

   /* ra node r corresponds to the r-th entry of comp->reg_list (same
    * order used when building the graph in ppir_regalloc_prog_try) */
   ppir_reg *chosen = NULL;
   int i = 0;
   list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
      if (i++ == r) {
         chosen = reg;
         break;
      }
   }
   assert(chosen);
   chosen->spilled = true;
   chosen->is_head = true; /* store_temp unable to do swizzle */

   return chosen;
}
485
/* Renumber all registers densely (reg->regalloc_index) and reallocate
 * every instruction's liveness storage sized for the current register
 * count.  Called at the start of each allocation attempt, since
 * spilling adds new registers and grows comp->reg_num. */
static void ppir_regalloc_reset_liveness_info(ppir_compiler *comp)
{
   int idx = 0;

   list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
      reg->regalloc_index = idx++;
   }

   list_for_each_entry(ppir_block, block, &comp->block_list, list) {
      list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {
         /* ralloc_free() is a no-op on NULL, so the previous explicit
          * NULL guards were redundant */
         ralloc_free(instr->live_mask);
         instr->live_mask = rzalloc_array(comp, uint8_t,
                                          reg_mask_size(comp->reg_num));

         ralloc_free(instr->live_set);
         instr->live_set = rzalloc_array(comp, BITSET_WORD, comp->reg_num);

         ralloc_free(instr->live_internal);
         instr->live_internal = rzalloc_array(comp, BITSET_WORD, comp->reg_num);
      }
   }
}
512
/* Add pairwise interference to the ra graph between every pair of
 * registers simultaneously set in 'liveness'.
 *
 * NOTE: the set is consumed — each outer bit is cleared after its
 * pairs are recorded, so every unordered pair is visited once (plus a
 * harmless self-interference per bit). */
static void ppir_all_interference(ppir_compiler *comp, struct ra_graph *g,
                                  BITSET_WORD *liveness)
{
   int i, j;
   BITSET_FOREACH_SET(i, liveness, comp->reg_num) {
      BITSET_FOREACH_SET(j, liveness, comp->reg_num) {
         ra_add_node_interference(g, i, j);
      }
      BITSET_CLEAR(liveness, i);
   }
}
524
/* Debug knob: number of extra spills to force per compile; set from an
 * environment variable in lima_screen.c (see ppir_regalloc_prog). */
int lima_ppir_force_spilling = 0;
526
/* One register-allocation attempt.
 *
 * Rebuilds liveness storage, assigns each register its ra class
 * (head classes — offset by 4 — for registers that must start at .x),
 * runs liveness analysis, builds the interference graph from each
 * instruction's live set, and runs ra_allocate().
 *
 * On success, writes the allocated physical index into every reg and
 * returns true.  On allocation failure (or while force_spilling is
 * still positive), spills one register, sets *spilled = true and
 * returns false so the caller retries; returns false with
 * *spilled == false on a hard failure. */
static bool ppir_regalloc_prog_try(ppir_compiler *comp, bool *spilled)
{
   ppir_regalloc_reset_liveness_info(comp);

   struct ra_graph *g = ra_alloc_interference_graph(
      comp->ra, comp->reg_num);

   /* ra node n corresponds to the n-th entry of comp->reg_list; this
    * ordering is relied on by ppir_regalloc_choose_spill_node */
   int n = 0;
   list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
      int c = ppir_ra_reg_class_vec1 + (reg->num_components - 1);
      if (reg->is_head)
         c += 4;
      ra_set_node_class(g, n++, ra_get_class_from_index(comp->ra, c));
   }

   ppir_liveness_analysis(comp);

   list_for_each_entry(ppir_block, block, &comp->block_list, list) {
      list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {
         int i;
         /* registers live only within this instruction still interfere
          * with everything live across it */
         BITSET_FOREACH_SET(i, instr->live_internal, comp->reg_num) {
            BITSET_SET(instr->live_set, i);
         }
         ppir_all_interference(comp, g, instr->live_set);
      }
   }

   *spilled = false;
   bool ok = ra_allocate(g);
   if (!ok || (comp->force_spilling-- > 0)) {
      ppir_reg *chosen = ppir_regalloc_choose_spill_node(comp, g);
      if (chosen) {
         /* stack_size will be used to assemble the frame reg in lima_draw.
          * It is also be used in the spilling code, as negative indices
          * starting from -1, to create stack addresses. */
         comp->prog->state.stack_size++;
         if (!ppir_regalloc_spill_reg(comp, chosen))
            goto err_out;
         /* Ask the outer loop to call back in. */
         *spilled = true;

         ppir_debug("spilled register %d/%d, num_components: %d\n",
                    chosen->regalloc_index, comp->reg_num,
                    chosen->num_components);
         goto err_out;
      }

      ppir_error("regalloc fail\n");
      goto err_out;
   }

   /* success: copy the allocated indices back, same list order as above */
   n = 0;
   list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
      reg->index = ra_get_node_reg(g, n++);
   }

   ralloc_free(g);

   if (lima_debug & LIMA_DEBUG_PP)
      ppir_regalloc_print_result(comp);

   return true;

err_out:
   ralloc_free(g);
   return false;
}
594
ppir_regalloc_prog(ppir_compiler * comp)595 bool ppir_regalloc_prog(ppir_compiler *comp)
596 {
597 bool spilled = false;
598 comp->prog->state.stack_size = 0;
599
600 /* Set from an environment variable to force spilling
601 * for debugging purposes, see lima_screen.c */
602 comp->force_spilling = lima_ppir_force_spilling;
603
604 ppir_regalloc_update_reglist_ssa(comp);
605
606 /* No registers? Probably shader consists of discard instruction */
607 if (list_is_empty(&comp->reg_list))
608 return true;
609
610 /* this will most likely succeed in the first
611 * try, except for very complicated shaders */
612 while (!ppir_regalloc_prog_try(comp, &spilled))
613 if (!spilled)
614 return false;
615
616 return true;
617 }
618