• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2017 Lima Project
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sub license,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the
12  * next paragraph) shall be included in all copies or substantial portions
13  * of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  *
23  */
24 
25 #include "util/ralloc.h"
26 #include "util/register_allocate.h"
27 #include "util/u_debug.h"
28 
29 #include "ppir.h"
30 #include "lima_context.h"
31 
32 #define PPIR_REG_COUNT  (6 * 4)
33 
34 enum ppir_ra_reg_class {
35    ppir_ra_reg_class_vec1,
36    ppir_ra_reg_class_vec2,
37    ppir_ra_reg_class_vec3,
38    ppir_ra_reg_class_vec4,
39 
40    /* 4 reg class for load/store instr regs:
41     * load/store instr has no swizzle field, so the (virtual) register
42     * must be allocated at the beginning of a (physical) register,
43     */
44    ppir_ra_reg_class_head_vec1,
45    ppir_ra_reg_class_head_vec2,
46    ppir_ra_reg_class_head_vec3,
47    ppir_ra_reg_class_head_vec4,
48 
49    ppir_ra_reg_class_num,
50 };
51 
ppir_regalloc_init(void * mem_ctx)52 struct ra_regs *ppir_regalloc_init(void *mem_ctx)
53 {
54    struct ra_regs *ret = ra_alloc_reg_set(mem_ctx, PPIR_REG_COUNT, false);
55    if (!ret)
56       return NULL;
57 
58    /* Classes for contiguous 1-4 channel groups anywhere within a register. */
59    struct ra_class *classes[ppir_ra_reg_class_num];
60    for (int i = 0; i < ppir_ra_reg_class_head_vec1; i++) {
61       classes[i] = ra_alloc_contig_reg_class(ret, i + 1);
62 
63       for (int j = 0; j < PPIR_REG_COUNT; j += 4) {
64          for (int swiz = 0; swiz < (4 - i); swiz++)
65             ra_class_add_reg(classes[i], j + swiz);
66       }
67    }
68 
69    /* Classes for contiguous 1-4 channels with a start channel of .x */
70    for (int i = ppir_ra_reg_class_head_vec1; i < ppir_ra_reg_class_num; i++) {
71       classes[i] = ra_alloc_contig_reg_class(ret, i - ppir_ra_reg_class_head_vec1 + 1);
72 
73       for (int j = 0; j < PPIR_REG_COUNT; j += 4)
74          ra_class_add_reg(classes[i], j);
75    }
76 
77    ra_set_finalize(ret, NULL);
78    return ret;
79 }
80 
ppir_regalloc_update_reglist_ssa(ppir_compiler * comp)81 static void ppir_regalloc_update_reglist_ssa(ppir_compiler *comp)
82 {
83    list_for_each_entry(ppir_block, block, &comp->block_list, list) {
84       list_for_each_entry(ppir_node, node, &block->node_list, list) {
85          if (node->is_end)
86             continue;
87 
88          if (!node->instr || node->op == ppir_op_const)
89             continue;
90 
91          ppir_dest *dest = ppir_node_get_dest(node);
92          if (dest) {
93             ppir_reg *reg = NULL;
94 
95             if (dest->type == ppir_target_ssa) {
96                reg = &dest->ssa;
97                list_addtail(&reg->list, &comp->reg_list);
98                comp->reg_num++;
99             }
100          }
101       }
102    }
103 }
104 
ppir_regalloc_print_result(ppir_compiler * comp)105 static void ppir_regalloc_print_result(ppir_compiler *comp)
106 {
107    printf("======ppir regalloc result======\n");
108    list_for_each_entry(ppir_block, block, &comp->block_list, list) {
109       list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {
110          printf("%03d:", instr->index);
111          for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) {
112             ppir_node *node = instr->slots[i];
113             if (!node)
114                continue;
115 
116             printf(" (%d|", node->index);
117 
118             ppir_dest *dest = ppir_node_get_dest(node);
119             if (dest)
120                printf("%d", ppir_target_get_dest_reg_index(dest));
121 
122             printf("|");
123 
124             for (int i = 0; i < ppir_node_get_src_num(node); i++) {
125                if (i)
126                   printf(" ");
127                printf("%d", ppir_target_get_src_reg_index(ppir_node_get_src(node, i)));
128             }
129 
130             printf(")");
131          }
132          printf("\n");
133       }
134    }
135    printf("--------------------------\n");
136 }
137 
create_new_instr_after(ppir_block * block,ppir_instr * ref,ppir_node * node)138 static bool create_new_instr_after(ppir_block *block, ppir_instr *ref,
139                                    ppir_node *node)
140 {
141    ppir_instr *newinstr = ppir_instr_create(block);
142    if (unlikely(!newinstr))
143       return false;
144 
145    list_del(&newinstr->list);
146    list_add(&newinstr->list, &ref->list);
147 
148    if (!ppir_instr_insert_node(newinstr, node))
149       return false;
150 
151    list_for_each_entry_from(ppir_instr, instr, ref, &block->instr_list, list) {
152       instr->seq++;
153    }
154    newinstr->seq = ref->seq+1;
155    newinstr->scheduled = true;
156    return true;
157 }
158 
create_new_instr_before(ppir_block * block,ppir_instr * ref,ppir_node * node)159 static bool create_new_instr_before(ppir_block *block, ppir_instr *ref,
160                                     ppir_node *node)
161 {
162    ppir_instr *newinstr = ppir_instr_create(block);
163    if (unlikely(!newinstr))
164       return false;
165 
166    list_del(&newinstr->list);
167    list_addtail(&newinstr->list, &ref->list);
168 
169    if (!ppir_instr_insert_node(newinstr, node))
170       return false;
171 
172    list_for_each_entry_from(ppir_instr, instr, ref, &block->instr_list, list) {
173       instr->seq++;
174    }
175    newinstr->seq = ref->seq-1;
176    newinstr->scheduled = true;
177    return true;
178 }
179 
ppir_update_spilled_src(ppir_compiler * comp,ppir_block * block,ppir_node * node,ppir_src * src,ppir_node ** fill_node)180 static bool ppir_update_spilled_src(ppir_compiler *comp, ppir_block *block,
181                                     ppir_node *node, ppir_src *src,
182                                     ppir_node **fill_node)
183 {
184    /* nodes might have multiple references to the same value.
185     * avoid creating unnecessary loads for the same fill by
186     * saving the node resulting from the temporary load */
187    if (*fill_node)
188       goto update_src;
189 
190    int num_components = src->reg->num_components;
191 
192    /* alloc new node to load value */
193    ppir_node *load_node = ppir_node_create(block, ppir_op_load_temp, -1, 0);
194    if (!load_node)
195       return false;
196    list_addtail(&load_node->list, &node->list);
197    comp->num_fills++;
198 
199    ppir_load_node *load = ppir_node_to_load(load_node);
200 
201    load->index = -comp->prog->state.stack_size; /* index sizes are negative */
202    load->num_components = num_components;
203 
204    ppir_dest *ld_dest = &load->dest;
205    ld_dest->type = ppir_target_pipeline;
206    ld_dest->pipeline = ppir_pipeline_reg_uniform;
207    ld_dest->write_mask = u_bit_consecutive(0, num_components);
208 
209    /* If the uniform slot is empty, we can insert the load_temp
210     * there and use it directly. Exceptionally, if the node is in the
211     * varying or texld slot, this doesn't work. */
212    if (!node->instr->slots[PPIR_INSTR_SLOT_UNIFORM] &&
213         node->instr_pos != PPIR_INSTR_SLOT_VARYING &&
214         node->instr_pos != PPIR_INSTR_SLOT_TEXLD) {
215       ppir_node_target_assign(src, load_node);
216       *fill_node = load_node;
217       return ppir_instr_insert_node(node->instr, load_node);
218    }
219 
220    /* Uniform slot was taken, so fall back to a new instruction with a mov */
221    if (!create_new_instr_before(block, node->instr, load_node))
222       return false;
223 
224    /* Create move node */
225    ppir_node *move_node = ppir_node_create(block, ppir_op_mov, -1 , 0);
226    if (unlikely(!move_node))
227       return false;
228    list_addtail(&move_node->list, &node->list);
229 
230    ppir_alu_node *move_alu = ppir_node_to_alu(move_node);
231 
232    move_alu->num_src = 1;
233    move_alu->src->type = ppir_target_pipeline;
234    move_alu->src->pipeline = ppir_pipeline_reg_uniform;
235    for (int i = 0; i < 4; i++)
236       move_alu->src->swizzle[i] = i;
237 
238    ppir_dest *alu_dest = &move_alu->dest;
239    alu_dest->type = ppir_target_ssa;
240    alu_dest->ssa.num_components = num_components;
241    alu_dest->ssa.spilled = true;
242    alu_dest->write_mask = u_bit_consecutive(0, num_components);
243 
244    list_addtail(&alu_dest->ssa.list, &comp->reg_list);
245    comp->reg_num++;
246 
247    if (!ppir_instr_insert_node(load_node->instr, move_node))
248       return false;
249 
250    /* insert the new node as predecessor */
251    ppir_node_foreach_pred_safe(node, dep) {
252       ppir_node *pred = dep->pred;
253       ppir_node_remove_dep(dep);
254       ppir_node_add_dep(load_node, pred, ppir_dep_src);
255    }
256    ppir_node_add_dep(node, move_node, ppir_dep_src);
257    ppir_node_add_dep(move_node, load_node, ppir_dep_src);
258 
259    *fill_node = move_node;
260 
261 update_src:
262    /* switch node src to use the fill node dest */
263    ppir_node_target_assign(src, *fill_node);
264 
265    return true;
266 }
267 
ppir_update_spilled_dest_load(ppir_compiler * comp,ppir_block * block,ppir_node * node)268 static bool ppir_update_spilled_dest_load(ppir_compiler *comp, ppir_block *block,
269                                           ppir_node *node)
270 {
271    ppir_dest *dest = ppir_node_get_dest(node);
272    assert(dest != NULL);
273    assert(dest->type == ppir_target_register);
274    ppir_reg *reg = dest->reg;
275    int num_components = reg->num_components;
276 
277    /* alloc new node to load value */
278    ppir_node *load_node = ppir_node_create(block, ppir_op_load_temp, -1, 0);
279    if (!load_node)
280       return NULL;
281    list_addtail(&load_node->list, &node->list);
282    comp->num_fills++;
283 
284    ppir_load_node *load = ppir_node_to_load(load_node);
285 
286    load->index = -comp->prog->state.stack_size; /* index sizes are negative */
287    load->num_components = num_components;
288 
289    load->dest.type = ppir_target_pipeline;
290    load->dest.pipeline = ppir_pipeline_reg_uniform;
291    load->dest.write_mask = u_bit_consecutive(0, num_components);
292 
293    /* New instruction is needed since we're updating a dest register
294     * and we can't write to the uniform pipeline reg */
295    if (!create_new_instr_before(block, node->instr, load_node))
296       return false;
297 
298    /* Create move node */
299    ppir_node *move_node = ppir_node_create(block, ppir_op_mov, -1 , 0);
300    if (unlikely(!move_node))
301       return false;
302    list_addtail(&move_node->list, &node->list);
303 
304    ppir_alu_node *move_alu = ppir_node_to_alu(move_node);
305 
306    move_alu->num_src = 1;
307    move_alu->src->type = ppir_target_pipeline;
308    move_alu->src->pipeline = ppir_pipeline_reg_uniform;
309    for (int i = 0; i < 4; i++)
310       move_alu->src->swizzle[i] = i;
311 
312    move_alu->dest.type = ppir_target_register;
313    move_alu->dest.reg = reg;
314    move_alu->dest.write_mask = u_bit_consecutive(0, num_components);
315 
316    if (!ppir_instr_insert_node(load_node->instr, move_node))
317       return false;
318 
319    ppir_node_foreach_pred_safe(node, dep) {
320       ppir_node *pred = dep->pred;
321       ppir_node_remove_dep(dep);
322       ppir_node_add_dep(load_node, pred, ppir_dep_src);
323    }
324    ppir_node_add_dep(node, move_node, ppir_dep_src);
325    ppir_node_add_dep(move_node, load_node, ppir_dep_src);
326 
327    return true;
328 }
329 
ppir_update_spilled_dest(ppir_compiler * comp,ppir_block * block,ppir_node * node)330 static bool ppir_update_spilled_dest(ppir_compiler *comp, ppir_block *block,
331                                      ppir_node *node)
332 {
333    ppir_dest *dest = ppir_node_get_dest(node);
334    assert(dest != NULL);
335    ppir_reg *reg = ppir_dest_get_reg(dest);
336 
337    /* alloc new node to store value */
338    ppir_node *store_node = ppir_node_create(block, ppir_op_store_temp, -1, 0);
339    if (!store_node)
340       return false;
341    list_addtail(&store_node->list, &node->list);
342    comp->num_spills++;
343 
344    ppir_store_node *store = ppir_node_to_store(store_node);
345 
346    store->index = -comp->prog->state.stack_size; /* index sizes are negative */
347 
348    ppir_node_target_assign(&store->src, node);
349    store->num_components = reg->num_components;
350 
351    /* insert the new node as successor */
352    ppir_node_foreach_succ_safe(node, dep) {
353       ppir_node *succ = dep->succ;
354       ppir_node_remove_dep(dep);
355       ppir_node_add_dep(succ, store_node, ppir_dep_src);
356    }
357    ppir_node_add_dep(store_node, node, ppir_dep_src);
358 
359    /* If the store temp slot is empty, we can insert the store_temp
360     * there and use it directly. Exceptionally, if the node is in the
361     * combine slot, this doesn't work. */
362    if (!node->instr->slots[PPIR_INSTR_SLOT_STORE_TEMP] &&
363         node->instr_pos != PPIR_INSTR_SLOT_ALU_COMBINE)
364       return ppir_instr_insert_node(node->instr, store_node);
365 
366    /* Not possible to merge store, so fall back to a new instruction */
367    return create_new_instr_after(block, node->instr, store_node);
368 }
369 
ppir_regalloc_spill_reg(ppir_compiler * comp,ppir_reg * chosen)370 static bool ppir_regalloc_spill_reg(ppir_compiler *comp, ppir_reg *chosen)
371 {
372    list_for_each_entry(ppir_block, block, &comp->block_list, list) {
373       list_for_each_entry(ppir_node, node, &block->node_list, list) {
374 
375          ppir_dest *dest = ppir_node_get_dest(node);
376          if (dest && ppir_dest_get_reg(dest) == chosen) {
377             /* If dest is a register, it might be updating only some its
378              * components, so need to load the existing value first */
379             if (dest->type == ppir_target_register) {
380                if (!ppir_update_spilled_dest_load(comp, block, node))
381                   return false;
382             }
383             if (!ppir_update_spilled_dest(comp, block, node))
384                return false;
385          }
386 
387          ppir_node *fill_node = NULL;
388          /* nodes might have multiple references to the same value.
389           * avoid creating unnecessary loads for the same fill by
390           * saving the node resulting from the temporary load */
391          for (int i = 0; i < ppir_node_get_src_num(node); i++) {
392             ppir_src *src = ppir_node_get_src(node, i);
393             ppir_reg *reg = ppir_src_get_reg(src);
394             if (reg == chosen) {
395                if (!ppir_update_spilled_src(comp, block, node, src, &fill_node))
396                   return false;
397             }
398          }
399       }
400    }
401 
402    return true;
403 }
404 
ppir_regalloc_choose_spill_node(ppir_compiler * comp,struct ra_graph * g)405 static ppir_reg *ppir_regalloc_choose_spill_node(ppir_compiler *comp,
406                                                  struct ra_graph *g)
407 {
408    float spill_costs[comp->reg_num];
409    /* experimentally determined, it seems to be worth scaling cost of
410     * regs in instructions that have used uniform/store_temp slots,
411     * but not too much as to offset the num_components base cost. */
412    const float slot_scale = 1.1f;
413 
414    list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
415       if (reg->spilled) {
416          /* not considered for spilling */
417          spill_costs[reg->regalloc_index] = 0.0f;
418          continue;
419       }
420 
421       /* It is beneficial to spill registers with higher component number,
422        * so increase the cost of spilling registers with few components */
423       float spill_cost = 4.0f / (float)reg->num_components;
424       spill_costs[reg->regalloc_index] = spill_cost;
425    }
426 
427    list_for_each_entry(ppir_block, block, &comp->block_list, list) {
428       list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {
429          if (instr->slots[PPIR_INSTR_SLOT_UNIFORM]) {
430             for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) {
431                ppir_node *node = instr->slots[i];
432                if (!node)
433                   continue;
434                for (int j = 0; j < ppir_node_get_src_num(node); j++) {
435                   ppir_src *src = ppir_node_get_src(node, j);
436                   if (!src)
437                      continue;
438                   ppir_reg *reg = ppir_src_get_reg(src);
439                   if (!reg)
440                      continue;
441 
442                   spill_costs[reg->regalloc_index] *= slot_scale;
443                }
444             }
445          }
446          if (instr->slots[PPIR_INSTR_SLOT_STORE_TEMP]) {
447             for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) {
448                ppir_node *node = instr->slots[i];
449                if (!node)
450                   continue;
451                ppir_dest *dest = ppir_node_get_dest(node);
452                if (!dest)
453                   continue;
454                ppir_reg *reg = ppir_dest_get_reg(dest);
455                if (!reg)
456                   continue;
457 
458                spill_costs[reg->regalloc_index] *= slot_scale;
459             }
460          }
461       }
462    }
463 
464    for (int i = 0; i < comp->reg_num; i++)
465       ra_set_node_spill_cost(g, i, spill_costs[i]);
466 
467    int r = ra_get_best_spill_node(g);
468    if (r == -1)
469       return NULL;
470 
471    ppir_reg *chosen = NULL;
472    int i = 0;
473    list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
474       if (i++ == r) {
475          chosen = reg;
476          break;
477       }
478    }
479    assert(chosen);
480    chosen->spilled = true;
481    chosen->is_head = true; /* store_temp unable to do swizzle */
482 
483    return chosen;
484 }
485 
ppir_regalloc_reset_liveness_info(ppir_compiler * comp)486 static void ppir_regalloc_reset_liveness_info(ppir_compiler *comp)
487 {
488    int idx = 0;
489 
490    list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
491       reg->regalloc_index = idx++;
492    }
493 
494    list_for_each_entry(ppir_block, block, &comp->block_list, list) {
495       list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {
496 
497          if (instr->live_mask)
498             ralloc_free(instr->live_mask);
499          instr->live_mask = rzalloc_array(comp, uint8_t,
500                                           reg_mask_size(comp->reg_num));
501 
502          if (instr->live_set)
503             ralloc_free(instr->live_set);
504          instr->live_set = rzalloc_array(comp, BITSET_WORD, comp->reg_num);
505 
506          if (instr->live_internal)
507             ralloc_free(instr->live_internal);
508          instr->live_internal = rzalloc_array(comp, BITSET_WORD, comp->reg_num);
509       }
510    }
511 }
512 
ppir_all_interference(ppir_compiler * comp,struct ra_graph * g,BITSET_WORD * liveness)513 static void ppir_all_interference(ppir_compiler *comp, struct ra_graph *g,
514                                   BITSET_WORD *liveness)
515 {
516    int i, j;
517    BITSET_FOREACH_SET(i, liveness, comp->reg_num) {
518       BITSET_FOREACH_SET(j, liveness, comp->reg_num) {
519          ra_add_node_interference(g, i, j);
520       }
521       BITSET_CLEAR(liveness, i);
522    }
523 }
524 
525 int lima_ppir_force_spilling = 0;
526 
ppir_regalloc_prog_try(ppir_compiler * comp,bool * spilled)527 static bool ppir_regalloc_prog_try(ppir_compiler *comp, bool *spilled)
528 {
529    ppir_regalloc_reset_liveness_info(comp);
530 
531    struct ra_graph *g = ra_alloc_interference_graph(
532       comp->ra, comp->reg_num);
533 
534    int n = 0;
535    list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
536       int c = ppir_ra_reg_class_vec1 + (reg->num_components - 1);
537       if (reg->is_head)
538          c += 4;
539       ra_set_node_class(g, n++, ra_get_class_from_index(comp->ra, c));
540    }
541 
542    ppir_liveness_analysis(comp);
543 
544    list_for_each_entry(ppir_block, block, &comp->block_list, list) {
545       list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {
546          int i;
547          BITSET_FOREACH_SET(i, instr->live_internal, comp->reg_num) {
548             BITSET_SET(instr->live_set, i);
549          }
550          ppir_all_interference(comp, g, instr->live_set);
551       }
552    }
553 
554    *spilled = false;
555    bool ok = ra_allocate(g);
556    if (!ok || (comp->force_spilling-- > 0)) {
557       ppir_reg *chosen = ppir_regalloc_choose_spill_node(comp, g);
558       if (chosen) {
559          /* stack_size will be used to assemble the frame reg in lima_draw.
560           * It is also be used in the spilling code, as negative indices
561           * starting from -1, to create stack addresses. */
562          comp->prog->state.stack_size++;
563          if (!ppir_regalloc_spill_reg(comp, chosen))
564             goto err_out;
565          /* Ask the outer loop to call back in. */
566          *spilled = true;
567 
568          ppir_debug("spilled register %d/%d, num_components: %d\n",
569                     chosen->regalloc_index, comp->reg_num,
570                     chosen->num_components);
571          goto err_out;
572       }
573 
574       ppir_error("regalloc fail\n");
575       goto err_out;
576    }
577 
578    n = 0;
579    list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
580       reg->index = ra_get_node_reg(g, n++);
581    }
582 
583    ralloc_free(g);
584 
585    if (lima_debug & LIMA_DEBUG_PP)
586       ppir_regalloc_print_result(comp);
587 
588    return true;
589 
590 err_out:
591    ralloc_free(g);
592    return false;
593 }
594 
ppir_regalloc_prog(ppir_compiler * comp)595 bool ppir_regalloc_prog(ppir_compiler *comp)
596 {
597    bool spilled = false;
598    comp->prog->state.stack_size = 0;
599 
600    /* Set from an environment variable to force spilling
601     * for debugging purposes, see lima_screen.c */
602    comp->force_spilling = lima_ppir_force_spilling;
603 
604    ppir_regalloc_update_reglist_ssa(comp);
605 
606    /* No registers? Probably shader consists of discard instruction */
607    if (list_is_empty(&comp->reg_list))
608       return true;
609 
610    /* this will most likely succeed in the first
611     * try, except for very complicated shaders */
612    while (!ppir_regalloc_prog_try(comp, &spilled))
613       if (!spilled)
614          return false;
615 
616    return true;
617 }
618