/*
 * Copyright (C) 2022 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

/* Bottom-up local scheduler to reduce register pressure */
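/*
 * The pass builds a dependency DAG for each block, walks it bottom-up, and
 * greedily picks whichever ready instruction minimizes the change in register
 * pressure. The new order is applied only if its estimated maximum pressure
 * is strictly lower than that of the original order.
 */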

#include "compiler.h"
#include "util/dag.h"

struct sched_ctx {
   /* Dependency graph */
   struct dag *dag;

   /* Live set */
   uint8_t *live;

   /* Size of the live set */
   unsigned max;
};

struct sched_node {
   struct dag_node dag;

   /* Instruction this node represents */
   bi_instr *instr;
};

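/*
 * Map an SSA value or nir_register index to a flat label in
 * [0, ssa_alloc + reg_alloc), suitable for indexing the last_read/last_write
 * tracking arrays.
 */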
static unsigned
label_index(bi_context *ctx, bi_index idx)
{
   if (idx.reg) {
      assert(idx.value < ctx->reg_alloc);
      return idx.value + ctx->ssa_alloc;
   } else {
      assert(idx.value < ctx->ssa_alloc);
      return idx.value;
   }
}

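/*
 * Add a dependency edge between two nodes. Either node may be NULL, meaning
 * there is no prior access to depend on.
 */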
static void
add_dep(struct sched_node *a, struct sched_node *b)
{
   if (a && b)
      dag_add_edge(&a->dag, &b->dag, 0);
}

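/*
 * Build the dependency DAG for a block. True, anti, and output dependencies
 * are tracked per value via the last_read/last_write arrays, while additional
 * chains serialize memory access, coverage-updating messages, and register
 * preloads. Branches are left in place at the end of the block.
 */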
static struct dag *
create_dag(bi_context *ctx, bi_block *block, void *memctx)
{
   struct dag *dag = dag_create(ctx);

   unsigned count = ctx->ssa_alloc + ctx->reg_alloc;
   struct sched_node **last_read =
      calloc(count, sizeof(struct sched_node *));
   struct sched_node **last_write =
      calloc(count, sizeof(struct sched_node *));
   struct sched_node *coverage = NULL;
   struct sched_node *preload = NULL;

   /* Last memory load, to serialize stores against */
   struct sched_node *memory_load = NULL;

   /* Last memory store, to serialize loads and stores against */
   struct sched_node *memory_store = NULL;

   bi_foreach_instr_in_block(block, I) {
      /* Leave branches at the end */
      if (I->op == BI_OPCODE_JUMP || bi_opcode_props[I->op].branch)
         break;

      assert(I->branch_target == NULL);

      struct sched_node *node = rzalloc(memctx, struct sched_node);
      node->instr = I;
      dag_init_node(dag, &node->dag);

      /* Reads depend on writes */
      bi_foreach_src(I, s) {
         bi_index src = I->src[s];

         if (src.type == BI_INDEX_NORMAL) {
            add_dep(node, last_write[label_index(ctx, src)]);

            /* Serialize access to nir_register for
             * simplicity. We could do better.
             */
            if (src.reg)
               add_dep(node, last_read[label_index(ctx, src)]);
         }
      }

      /* Writes depend on reads and writes */
      bi_foreach_dest(I, s) {
         bi_index dest = I->dest[s];

         if (dest.type == BI_INDEX_NORMAL) {
            add_dep(node, last_read[label_index(ctx, dest)]);
            add_dep(node, last_write[label_index(ctx, dest)]);

            last_write[label_index(ctx, dest)] = node;
         }
      }

      bi_foreach_src(I, s) {
         bi_index src = I->src[s];

         if (src.type == BI_INDEX_NORMAL) {
            last_read[label_index(ctx, src)] = node;
         }
      }

      switch (bi_opcode_props[I->op].message) {
      case BIFROST_MESSAGE_LOAD:
         /* Regular memory loads need to be serialized against
          * other memory accesses. However, UBO memory is read-only,
          * so UBO loads can be moved around freely.
          */
         if (I->seg != BI_SEG_UBO) {
            add_dep(node, memory_store);
            memory_load = node;
         }

         break;

      case BIFROST_MESSAGE_ATTRIBUTE:
         /* Regular attribute loads can be reordered, but
          * writeable attributes can't be. Our one use of
          * writeable attributes is images.
          */
         if ((I->op == BI_OPCODE_LD_TEX) ||
             (I->op == BI_OPCODE_LD_TEX_IMM) ||
             (I->op == BI_OPCODE_LD_ATTR_TEX)) {
            add_dep(node, memory_store);
            memory_load = node;
         }

         break;

      case BIFROST_MESSAGE_STORE:
         assert(I->seg != BI_SEG_UBO);
         add_dep(node, memory_load);
         add_dep(node, memory_store);
         memory_store = node;
         break;

      case BIFROST_MESSAGE_ATOMIC:
      case BIFROST_MESSAGE_BARRIER:
         add_dep(node, memory_load);
         add_dep(node, memory_store);
         memory_load = node;
         memory_store = node;
         break;

      case BIFROST_MESSAGE_BLEND:
      case BIFROST_MESSAGE_Z_STENCIL:
      case BIFROST_MESSAGE_TILE:
         add_dep(node, coverage);
         coverage = node;
         break;

      case BIFROST_MESSAGE_ATEST:
         /* ATEST signals the end of shader side effects */
         add_dep(node, memory_store);
         memory_store = node;

         /* ATEST also updates coverage */
         add_dep(node, coverage);
         coverage = node;
         break;
      default:
         break;
      }

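      /* Make every instruction depend on the most recent register preload,
       * so that moves from fixed hardware registers stay at the top of the
       * block.
       */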
      add_dep(node, preload);

      if (I->op == BI_OPCODE_DISCARD_F32) {
         /* Serialize against ATEST */
         add_dep(node, coverage);
         coverage = node;

         /* Also serialize against memory and barriers */
         add_dep(node, memory_load);
         add_dep(node, memory_store);
         memory_load = node;
         memory_store = node;
      } else if (I->op == BI_OPCODE_MOV_I32 &&
                 I->src[0].type == BI_INDEX_REGISTER) {
         preload = node;
      }
   }

   free(last_read);
   free(last_write);

   return dag;
}

/*
 * Calculate the change in register pressure from scheduling a given
 * instruction. Equivalently, calculate the difference in the number of live
 * registers before and after the instruction, given the live set after the
 * instruction. This calculation follows immediately from the dataflow
 * definition of liveness:
 *
 *      live_in = (live_out - KILL) + GEN
 */
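/*
 * For example (a sketch, assuming simple 32-bit, single-register operands):
 * scheduling c = a + b bottom-up with c live below the instruction and a, b
 * not yet live kills one register for the destination (delta -= 1) and makes
 * the two sources live (delta += 2), so pressure above the instruction is one
 * register higher than below it.
 */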
static signed
calculate_pressure_delta(bi_instr *I, uint8_t *live, unsigned max)
{
   signed delta = 0;

   /* Destinations must be unique */
   bi_foreach_dest(I, d) {
      unsigned node = bi_get_node(I->dest[d]);

      if (node < max && live[node])
         delta -= bi_count_write_registers(I, d);
   }

   bi_foreach_src(I, src) {
      unsigned node = bi_get_node(I->src[src]);
      if (node >= max)
         continue;

      /* Filter duplicates */
      bool dupe = false;

      for (unsigned i = 0; i < src; ++i) {
         if (bi_get_node(I->src[i]) == node) {
            dupe = true;
            break;
         }
      }

      if (!dupe && !live[node])
         delta += bi_count_read_registers(I, src);
   }

   return delta;
}

/*
 * Choose the next instruction, bottom-up. For now we use a simple greedy
 * heuristic: choose the instruction that has the best effect on liveness.
 */
static struct sched_node *
choose_instr(struct sched_ctx *s)
{
   int32_t min_delta = INT32_MAX;
   struct sched_node *best = NULL;

   list_for_each_entry(struct sched_node, n, &s->dag->heads, dag.link) {
      int32_t delta = calculate_pressure_delta(n->instr, s->live, s->max);

      if (delta < min_delta) {
         best = n;
         min_delta = delta;
      }
   }

   return best;
}

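/*
 * Reschedule a single block. First estimate the maximum pressure of the
 * existing order with a reverse liveness walk, then build a greedy bottom-up
 * schedule and apply it only if its estimated maximum pressure is lower.
 */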
static void
pressure_schedule_block(bi_context *ctx, bi_block *block, struct sched_ctx *s)
{
   /* off by a constant, that's ok */
   signed pressure = 0;
   signed orig_max_pressure = 0;
   unsigned nr_ins = 0;

   memcpy(s->live, block->live_out, s->max);

   bi_foreach_instr_in_block_rev(block, I) {
      pressure += calculate_pressure_delta(I, s->live, s->max);
      orig_max_pressure = MAX2(pressure, orig_max_pressure);
      bi_liveness_ins_update(s->live, I, s->max);
      nr_ins++;
   }

   memcpy(s->live, block->live_out, s->max);

   /* off by a constant, that's ok */
   signed max_pressure = 0;
   pressure = 0;

   struct sched_node **schedule = calloc(nr_ins, sizeof(struct sched_node *));
   nr_ins = 0;

   while (!list_is_empty(&s->dag->heads)) {
      struct sched_node *node = choose_instr(s);
      pressure += calculate_pressure_delta(node->instr, s->live, s->max);
      max_pressure = MAX2(pressure, max_pressure);
      dag_prune_head(s->dag, &node->dag);

      schedule[nr_ins++] = node;
      bi_liveness_ins_update(s->live, node->instr, s->max);
   }

   /* Bail if it looks like it's worse */
   if (max_pressure >= orig_max_pressure) {
      free(schedule);
      return;
   }

   /* Apply the schedule. The nodes were scheduled bottom-up and list_add
    * prepends to the block, so inserting them in schedule order rebuilds the
    * block in forward program order.
    */
   for (unsigned i = 0; i < nr_ins; ++i) {
      bi_remove_instruction(schedule[i]->instr);
      list_add(&schedule[i]->instr->link, &block->instructions);
   }

   free(schedule);
}

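/* Entry point: compute liveness, then try to reschedule each block. */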
void
bi_pressure_schedule(bi_context *ctx)
{
   bi_compute_liveness(ctx);
   unsigned temp_count = bi_max_temp(ctx);
   void *memctx = ralloc_context(ctx);
   uint8_t *live = ralloc_array(memctx, uint8_t, temp_count);

   bi_foreach_block(ctx, block) {
      struct sched_ctx sctx = {
         .dag = create_dag(ctx, block, memctx),
         .max = temp_count,
         .live = live
      };

      pressure_schedule_block(ctx, block, &sctx);
   }

   ralloc_free(memctx);
}