/*
 * Copyright (C) 2022 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

/* Bottom-up local scheduler to reduce register pressure */
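
/*
 * The pass works per block in two phases. First, build a DAG over the
 * block's instructions whose edges encode data hazards and the required
 * ordering of side effects. Second, replay the block bottom-up to record
 * the peak register pressure of the existing order, then repeatedly pull a
 * DAG head (an instruction that no unscheduled instruction depends on)
 * with the smallest pressure delta to form a new bottom-up order. The new
 * schedule is kept only if its peak pressure is strictly lower.
 */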

#include "compiler.h"
#include "util/dag.h"

struct sched_ctx {
        /* Dependency graph */
        struct dag *dag;

        /* Live set */
        uint8_t *live;

        /* Size of the live set */
        unsigned max;
};

struct sched_node {
        struct dag_node dag;

        /* Instruction this node represents */
        bi_instr *instr;
};

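/* Map SSA values and nir_registers into a single flat index space, with
 * registers placed after the ssa_alloc SSA values. create_dag sizes its
 * last_read/last_write arrays with one slot per such index.
 */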
static unsigned
label_index(bi_context *ctx, bi_index idx)
{
        if (idx.reg) {
                assert(idx.value < ctx->reg_alloc);
                return idx.value + ctx->ssa_alloc;
        } else {
                assert(idx.value < ctx->ssa_alloc);
                return idx.value;
        }
}

static void
add_dep(struct sched_node *a, struct sched_node *b)
{
        if (a && b)
                dag_add_edge(&a->dag, &b->dag, 0);
}

static struct dag *
create_dag(bi_context *ctx, bi_block *block, void *memctx)
{
        struct dag *dag = dag_create(ctx);

        unsigned count = ctx->ssa_alloc + ctx->reg_alloc;
        struct sched_node **last_read =
                calloc(count, sizeof(struct sched_node *));
        struct sched_node **last_write =
                calloc(count, sizeof(struct sched_node *));
        struct sched_node *coverage = NULL;
        struct sched_node *preload = NULL;

        /* Last memory load, to serialize stores against */
        struct sched_node *memory_load = NULL;

        /* Last memory store, to serialize loads and stores against */
        struct sched_node *memory_store = NULL;

        bi_foreach_instr_in_block(block, I) {
                /* Leave branches at the end */
                if (I->op == BI_OPCODE_JUMP || bi_opcode_props[I->op].branch)
                        break;

                assert(I->branch_target == NULL);

                struct sched_node *node = rzalloc(memctx, struct sched_node);
                node->instr = I;
                dag_init_node(dag, &node->dag);

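                /* Track the last reader and writer of each index so that
                 * read-after-write, write-after-read and write-after-write
                 * hazards become DAG edges.
                 */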
                /* Reads depend on writes */
                bi_foreach_src(I, s) {
                        bi_index src = I->src[s];

                        if (src.type == BI_INDEX_NORMAL) {
                                add_dep(node, last_write[label_index(ctx, src)]);

                                /* Serialize access to nir_register for
                                 * simplicity. We could do better.
                                 */
                                if (src.reg)
                                        add_dep(node, last_read[label_index(ctx, src)]);
                        }
                }

                /* Writes depend on reads and writes */
                bi_foreach_dest(I, s) {
                        bi_index dest = I->dest[s];

                        if (dest.type == BI_INDEX_NORMAL) {
                                add_dep(node, last_read[label_index(ctx, dest)]);
                                add_dep(node, last_write[label_index(ctx, dest)]);

                                last_write[label_index(ctx, dest)] = node;
                        }
                }

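                /* last_read is updated only after all of I's dependencies
                 * are added, so an instruction never gains an edge to
                 * itself.
                 */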
                bi_foreach_src(I, s) {
                        bi_index src = I->src[s];

                        if (src.type == BI_INDEX_NORMAL) {
                                last_read[label_index(ctx, src)] = node;
                        }
                }

                switch (bi_opcode_props[I->op].message) {
                case BIFROST_MESSAGE_LOAD:
                        /* Regular memory loads need to be serialized against
                         * other memory accesses. However, UBO memory is
                         * read-only, so it can be moved around freely.
                         */
                        if (I->seg != BI_SEG_UBO) {
                                add_dep(node, memory_store);
                                memory_load = node;
                        }

                        break;

                case BIFROST_MESSAGE_ATTRIBUTE:
                        /* Regular attribute loads can be reordered, but
                         * writeable attributes can't be. Our one use of
                         * writeable attributes is images.
                         */
                        if ((I->op == BI_OPCODE_LD_TEX) ||
                            (I->op == BI_OPCODE_LD_TEX_IMM) ||
                            (I->op == BI_OPCODE_LD_ATTR_TEX)) {
                                add_dep(node, memory_store);
                                memory_load = node;
                        }

                        break;

                case BIFROST_MESSAGE_STORE:
                        assert(I->seg != BI_SEG_UBO);
                        add_dep(node, memory_load);
                        add_dep(node, memory_store);
                        memory_store = node;
                        break;

                case BIFROST_MESSAGE_ATOMIC:
                case BIFROST_MESSAGE_BARRIER:
                        add_dep(node, memory_load);
                        add_dep(node, memory_store);
                        memory_load = node;
                        memory_store = node;
                        break;

                case BIFROST_MESSAGE_BLEND:
                case BIFROST_MESSAGE_Z_STENCIL:
                case BIFROST_MESSAGE_TILE:
                        add_dep(node, coverage);
                        coverage = node;
                        break;

                case BIFROST_MESSAGE_ATEST:
                        /* ATEST signals the end of shader side effects */
                        add_dep(node, memory_store);
                        memory_store = node;

                        /* ATEST also updates coverage */
                        add_dep(node, coverage);
                        coverage = node;
                        break;
                default:
                        break;
                }

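                /* Every instruction is ordered after the most recent
                 * register preload mov, so values read from fixed registers
                 * stay ahead of all later work.
                 */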
                add_dep(node, preload);

                if (I->op == BI_OPCODE_DISCARD_F32) {
                        /* Serialize against ATEST */
                        add_dep(node, coverage);
                        coverage = node;

                        /* Also serialize against memory and barriers */
                        add_dep(node, memory_load);
                        add_dep(node, memory_store);
                        memory_load = node;
                        memory_store = node;
                } else if (I->op == BI_OPCODE_MOV_I32 &&
                           I->src[0].type == BI_INDEX_REGISTER) {
                        preload = node;
                }
        }

        free(last_read);
        free(last_write);

        return dag;
}

/*
 * Calculate the change in register pressure from scheduling a given
 * instruction. Equivalently, calculate the difference in the number of live
 * registers before and after the instruction, given the live set after the
 * instruction. This calculation follows immediately from the dataflow
 * definition of liveness:
 *
 *      live_in = (live_out - KILL) + GEN
 */
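/*
 * Worked example (assuming each value occupies a single register): for
 * c = a + b with live_out = {c, d}, the destination c is live and killed
 * (KILL = {c}) while the sources a and b become live (GEN = {a, b}), so
 * live_in = {a, b, d} and the pressure delta is +2 - 1 = +1.
 */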
static signed
calculate_pressure_delta(bi_instr *I, uint8_t *live, unsigned max)
{
        signed delta = 0;

        /* Destinations must be unique */
        bi_foreach_dest(I, d) {
                unsigned node = bi_get_node(I->dest[d]);

                if (node < max && live[node])
                        delta -= bi_count_write_registers(I, d);
        }

        bi_foreach_src(I, src) {
                unsigned node = bi_get_node(I->src[src]);
                if (node >= max)
                        continue;

                /* Filter duplicates */
                bool dupe = false;

                for (unsigned i = 0; i < src; ++i) {
                        if (bi_get_node(I->src[i]) == node) {
                                dupe = true;
                                break;
                        }
                }

                if (!dupe && !live[node])
                        delta += bi_count_read_registers(I, src);
        }

        return delta;
}

/*
 * Choose the next instruction, bottom-up. For now we use a simple greedy
 * heuristic: choose the instruction that has the best effect on liveness.
 */
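/*
 * Ties are broken by position in the heads list: only a strictly smaller
 * delta replaces the current best candidate.
 */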
static struct sched_node *
choose_instr(struct sched_ctx *s)
{
        int32_t min_delta = INT32_MAX;
        struct sched_node *best = NULL;

        list_for_each_entry(struct sched_node, n, &s->dag->heads, dag.link) {
                int32_t delta = calculate_pressure_delta(n->instr, s->live, s->max);

                if (delta < min_delta) {
                        best = n;
                        min_delta = delta;
                }
        }

        return best;
}

static void
pressure_schedule_block(bi_context *ctx, bi_block *block, struct sched_ctx *s)
{
        /* Pressure is counted relative to the block's live-out set, so it is
         * off by a constant. That's ok: both schedules share the same offset
         * and only their maxima are compared.
         */
        signed pressure = 0;
        signed orig_max_pressure = 0;
        unsigned nr_ins = 0;

        memcpy(s->live, block->live_out, s->max);

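        /* Measure the peak pressure of the existing schedule, bottom-up */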
        bi_foreach_instr_in_block_rev(block, I) {
                pressure += calculate_pressure_delta(I, s->live, s->max);
                orig_max_pressure = MAX2(pressure, orig_max_pressure);
                bi_liveness_ins_update(s->live, I, s->max);
                nr_ins++;
        }

        memcpy(s->live, block->live_out, s->max);

        /* off by a constant, that's ok */
        signed max_pressure = 0;
        pressure = 0;

        struct sched_node **schedule = calloc(nr_ins, sizeof(struct sched_node *));
        nr_ins = 0;

        while (!list_is_empty(&s->dag->heads)) {
                struct sched_node *node = choose_instr(s);
                pressure += calculate_pressure_delta(node->instr, s->live, s->max);
                max_pressure = MAX2(pressure, max_pressure);
                dag_prune_head(s->dag, &node->dag);

                schedule[nr_ins++] = node;
                bi_liveness_ins_update(s->live, node->instr, s->max);
        }

        /* Bail if it looks like it's worse */
        if (max_pressure >= orig_max_pressure) {
                free(schedule);
                return;
        }

        /* Apply the schedule */
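        /* Instructions were picked bottom-up, so adding each one at the head
         * of the block rebuilds the list in top-down order.
         */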
        for (unsigned i = 0; i < nr_ins; ++i) {
                bi_remove_instruction(schedule[i]->instr);
                list_add(&schedule[i]->instr->link, &block->instructions);
        }

        free(schedule);
}

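/* Entry point: computes liveness once, then schedules each block
 * independently, reusing a single scratch live set.
 */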
void
bi_pressure_schedule(bi_context *ctx)
{
        bi_compute_liveness(ctx);
        unsigned temp_count = bi_max_temp(ctx);
        void *memctx = ralloc_context(ctx);
        uint8_t *live = ralloc_array(memctx, uint8_t, temp_count);

        bi_foreach_block(ctx, block) {
                struct sched_ctx sctx = {
                        .dag = create_dag(ctx, block, memctx),
                        .max = temp_count,
                        .live = live
                };

                pressure_schedule_block(ctx, block, &sctx);
        }

        ralloc_free(memctx);
}