1 /*
2 * Copyright (C) 2022 Collabora Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors (Collabora):
24 * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
25 */
26
27 /* Bottom-up local scheduler to reduce register pressure */
28
29 #include "util/dag.h"
30 #include "compiler.h"
31
/* Per-block state threaded through the bottom-up scheduler */
struct sched_ctx {
   /* Dependency graph of the block's instructions (built by create_dag);
    * any topological order is a legal schedule
    */
   struct dag *dag;

   /* Working live set, one bit per SSA value; updated as instructions are
    * scheduled bottom-up
    */
   BITSET_WORD *live;
};
39
/* One DAG node wrapping a single instruction */
struct sched_node {
   /* Embedded node; edges point from consumers to their dependencies */
   struct dag_node dag;

   /* Instruction this node represents */
   bi_instr *instr;
};
46
47 static void
add_dep(struct sched_node * a,struct sched_node * b)48 add_dep(struct sched_node *a, struct sched_node *b)
49 {
50 if (a && b)
51 dag_add_edge(&a->dag, &b->dag, 0);
52 }
53
/*
 * Build the dependency DAG for a single block. One node is created per
 * instruction up to (but not including) the block's terminating branch;
 * nodes are allocated out of memctx. Edges encode read-after-write SSA
 * dependencies plus serialization of memory accesses, coverage updates,
 * and register preloads, so any topological order of the DAG is a legal
 * schedule for the block.
 */
static struct dag *
create_dag(bi_context *ctx, bi_block *block, void *memctx)
{
   struct dag *dag = dag_create(ctx);

   /* Most recent writer of each SSA value, indexed by value number */
   struct sched_node **last_write =
      calloc(ctx->ssa_alloc, sizeof(struct sched_node *));

   /* Most recent instruction that updated coverage (blend/ATEST/discard) */
   struct sched_node *coverage = NULL;

   /* Most recent register preload (PHI or MOV from a register); every
    * later instruction is chained after it
    */
   struct sched_node *preload = NULL;

   /* Last memory load, to serialize stores against.
    *
    * NOTE(review): only the single most recent load is remembered, so a
    * store is serialized against the last load only, not against every
    * load since the previous store -- confirm this is sufficient.
    */
   struct sched_node *memory_load = NULL;

   /* Last memory store, to serialize loads and stores against */
   struct sched_node *memory_store = NULL;

   bi_foreach_instr_in_block(block, I) {
      /* Leave branches at the end; they get no DAG node and are not
       * reordered by the scheduler
       */
      if (I->op == BI_OPCODE_JUMP || bi_opcode_props[I->op].branch)
         break;

      assert(I->branch_target == NULL);

      struct sched_node *node = rzalloc(memctx, struct sched_node);
      node->instr = I;
      dag_init_node(dag, &node->dag);

      /* Reads depend on writes, no other hazards in SSA */
      bi_foreach_ssa_src(I, s)
         add_dep(node, last_write[I->src[s].value]);

      bi_foreach_dest(I, d)
         last_write[I->dest[d].value] = node;

      /* Everything stays after the most recent preload */
      add_dep(node, preload);

      switch (bi_opcode_props[I->op].message) {
      case BIFROST_MESSAGE_LOAD:
         /* Regular memory loads needs to be serialized against
          * other memory access. However, UBO memory is read-only
          * so it can be moved around freely.
          */
         if (I->seg != BI_SEG_UBO) {
            add_dep(node, memory_store);
            memory_load = node;
         }

         break;

      case BIFROST_MESSAGE_ATTRIBUTE:
         /* Regular attribute loads can be reordered, but
          * writeable attributes can't be. Our one use of
          * writeable attributes are images.
          */
         if ((I->op == BI_OPCODE_LD_TEX) || (I->op == BI_OPCODE_LD_TEX_IMM) ||
             (I->op == BI_OPCODE_LD_ATTR_TEX)) {
            add_dep(node, memory_store);
            memory_load = node;
         }

         break;

      case BIFROST_MESSAGE_STORE:
         /* UBO memory is read-only, so a UBO store is nonsensical */
         assert(I->seg != BI_SEG_UBO);
         add_dep(node, memory_load);
         add_dep(node, memory_store);
         memory_store = node;
         break;

      case BIFROST_MESSAGE_ATOMIC:
      case BIFROST_MESSAGE_BARRIER:
         /* Treated as both a load and a store: ordered against all
          * surrounding memory traffic
          */
         add_dep(node, memory_load);
         add_dep(node, memory_store);
         memory_load = node;
         memory_store = node;
         break;

      case BIFROST_MESSAGE_BLEND:
      case BIFROST_MESSAGE_Z_STENCIL:
      case BIFROST_MESSAGE_TILE:
         /* Framebuffer accesses are kept in order with respect to the
          * coverage chain
          */
         add_dep(node, coverage);
         coverage = node;
         break;

      case BIFROST_MESSAGE_ATEST:
         /* ATEST signals the end of shader side effects */
         add_dep(node, memory_store);
         memory_store = node;

         /* ATEST also updates coverage */
         add_dep(node, coverage);
         coverage = node;
         break;
      default:
         break;
      }

      if (I->op == BI_OPCODE_DISCARD_F32) {
         /* Serialize against ATEST */
         add_dep(node, coverage);
         coverage = node;
      }
      if (I->op == BI_OPCODE_DISCARD_F32 ||
          I->op == BI_OPCODE_MEMORY_BARRIER) {
         /* Serialize against memory operations and barriers */
         add_dep(node, memory_load);
         add_dep(node, memory_store);
         memory_load = node;
         memory_store = node;
      }
      if ((I->op == BI_OPCODE_PHI) ||
          (I->op == BI_OPCODE_MOV_I32 &&
           I->src[0].type == BI_INDEX_REGISTER)) {
         /* Remember register preloads so the add_dep above keeps later
          * instructions after them
          */
         preload = node;
      }
   }

   free(last_write);

   return dag;
}
175
176 /*
177 * Calculate the change in register pressure from scheduling a given
178 * instruction. Equivalently, calculate the difference in the number of live
179 * registers before and after the instruction, given the live set after the
180 * instruction. This calculation follows immediately from the dataflow
181 * definition of liveness:
182 *
183 * live_in = (live_out - KILL) + GEN
184 */
185 static signed
calculate_pressure_delta(bi_instr * I,BITSET_WORD * live)186 calculate_pressure_delta(bi_instr *I, BITSET_WORD *live)
187 {
188 signed delta = 0;
189
190 /* Destinations must be unique */
191 bi_foreach_dest(I, d) {
192 if (BITSET_TEST(live, I->dest[d].value))
193 delta -= bi_count_write_registers(I, d);
194 }
195
196 bi_foreach_ssa_src(I, src) {
197 /* Filter duplicates */
198 bool dupe = false;
199
200 for (unsigned i = 0; i < src; ++i) {
201 if (bi_is_equiv(I->src[i], I->src[src])) {
202 dupe = true;
203 break;
204 }
205 }
206
207 if (!dupe && !BITSET_TEST(live, I->src[src].value))
208 delta += bi_count_read_registers(I, src);
209 }
210
211 return delta;
212 }
213
214 /*
215 * Choose the next instruction, bottom-up. For now we use a simple greedy
216 * heuristic: choose the instruction that has the best effect on liveness.
217 */
218 static struct sched_node *
choose_instr(struct sched_ctx * s)219 choose_instr(struct sched_ctx *s)
220 {
221 int32_t min_delta = INT32_MAX;
222 struct sched_node *best = NULL;
223
224 list_for_each_entry(struct sched_node, n, &s->dag->heads, dag.link) {
225 int32_t delta = calculate_pressure_delta(n->instr, s->live);
226
227 if (delta < min_delta) {
228 best = n;
229 min_delta = delta;
230 }
231 }
232
233 return best;
234 }
235
/*
 * Greedily reschedule one block bottom-up to reduce register pressure,
 * using the dependency DAG in s->dag. The candidate schedule is applied
 * only if its maximum pressure estimate is strictly lower than that of
 * the original order; otherwise the block is left untouched.
 */
static void
pressure_schedule_block(bi_context *ctx, bi_block *block, struct sched_ctx *s)
{
   /* off by a constant, that's ok */
   signed pressure = 0;
   signed orig_max_pressure = 0;
   unsigned nr_ins = 0;

   /* First pass: walk the existing order bottom-up to measure its maximum
    * pressure and count the instructions
    */
   memcpy(s->live, block->ssa_live_out,
          BITSET_WORDS(ctx->ssa_alloc) * sizeof(BITSET_WORD));

   bi_foreach_instr_in_block_rev(block, I) {
      pressure += calculate_pressure_delta(I, s->live);
      orig_max_pressure = MAX2(pressure, orig_max_pressure);
      bi_liveness_ins_update_ssa(s->live, I);
      nr_ins++;
   }

   /* Reset the live set for the second pass */
   memcpy(s->live, block->ssa_live_out,
          BITSET_WORDS(ctx->ssa_alloc) * sizeof(BITSET_WORD));

   /* off by a constant, that's ok */
   signed max_pressure = 0;
   pressure = 0;

   /* Second pass: greedily pick ready instructions bottom-up, recording
    * the candidate order (last instruction first) and its max pressure.
    *
    * NOTE(review): nr_ins counted every instruction above, but branches
    * get no DAG node (see create_dag), so schedule[] may be slightly
    * larger than needed -- harmless.
    */
   struct sched_node **schedule = calloc(nr_ins, sizeof(struct sched_node *));
   nr_ins = 0;

   while (!list_is_empty(&s->dag->heads)) {
      struct sched_node *node = choose_instr(s);
      pressure += calculate_pressure_delta(node->instr, s->live);
      max_pressure = MAX2(pressure, max_pressure);
      dag_prune_head(s->dag, &node->dag);

      schedule[nr_ins++] = node;
      bi_liveness_ins_update_ssa(s->live, node->instr);
   }

   /* Bail if it looks like it's worse */
   if (max_pressure >= orig_max_pressure) {
      free(schedule);
      return;
   }

   /* Apply the schedule: schedule[] holds the last instruction first and
    * each list_add inserts at the list head, so iterating forward rebuilds
    * the block in the new top-to-bottom order. Branches were never removed
    * (no DAG node), so they keep their place at the end.
    */
   for (unsigned i = 0; i < nr_ins; ++i) {
      bi_remove_instruction(schedule[i]->instr);
      list_add(&schedule[i]->instr->link, &block->instructions);
   }

   free(schedule);
}
288
289 void
bi_pressure_schedule(bi_context * ctx)290 bi_pressure_schedule(bi_context *ctx)
291 {
292 bi_compute_liveness_ssa(ctx);
293 void *memctx = ralloc_context(ctx);
294 BITSET_WORD *live =
295 ralloc_array(memctx, BITSET_WORD, BITSET_WORDS(ctx->ssa_alloc));
296
297 bi_foreach_block(ctx, block) {
298 struct sched_ctx sctx = {.dag = create_dag(ctx, block, memctx),
299 .live = live};
300
301 pressure_schedule_block(ctx, block, &sctx);
302 }
303
304 ralloc_free(memctx);
305 }
306