/*
 * Copyright (C) 2022 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "bi_builder.h"
#include "va_compiler.h"
#include "valhall_enums.h"

/*
 * Insert flow control into a scheduled and register allocated shader. This
 * pass runs after scheduling and register allocation. This pass only
 * inserts NOPs with the appropriate flow control modifiers. It should be
 * followed by a cleanup pass to merge flow control modifiers on adjacent
 * instructions, eliminating the NOPs. This decouples optimization from
 * correctness, simplifying both passes.
 *
 * This pass is responsible for calculating dependencies, according to the
 * rules:
 *
 * 1. An instruction that depends on the results of a previous asynchronous
 *    instruction must first wait for that instruction's slot, unless all
 *    reaching code paths already depended on it.
 * 2. More generally, any dependencies must be encoded. This includes
 *    Write-After-Write and Write-After-Read hazards with LOAD/STORE to memory.
 * 3. The shader must wait on slot #6 before running BLEND, ATEST.
 * 4. The shader must wait on slot #7 before running BLEND, ST_TILE.
 * 5. BARRIER must wait on every active slot.
 *
 * Unlike Bifrost, it is not necessary to worry about outbound staging
 * registers, as the hardware stalls reading staging registers when issuing
 * asynchronous instructions. So we don't track reads in our model of the
 * hardware scoreboard. This makes things a bit simpler.
 *
 * We may reuse slots for multiple asynchronous instructions, though there may
 * be a performance penalty.
 */
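
/*
 * Rough sketch of the intended output (hypothetical pseudo-assembly, not taken
 * from a real shader): a memory load issued on slot #0 whose result is read by
 * the next instruction would come out of this pass as
 *
 *     LOAD.i32.slot0 ...
 *     NOP.wait0
 *     IADD.s32 ...
 *
 * and the follow-up merge pass folds the wait into the consumer
 * (IADD.s32.wait0 ...), eliminating the NOP.
 */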

#define BI_NUM_REGISTERS 64

/*
 * Insert a NOP instruction with given flow control.
 */
static void
bi_flow(bi_context *ctx, bi_cursor cursor, enum va_flow flow)
{
   bi_builder b = bi_init_builder(ctx, cursor);

   bi_nop(&b)->flow = flow;
}

static uint64_t
bi_read_mask(bi_instr *I)
{
   uint64_t mask = 0;

   bi_foreach_src(I, s) {
      if (I->src[s].type == BI_INDEX_REGISTER) {
         unsigned reg = I->src[s].value;
         unsigned count = bi_count_read_registers(I, s);

         mask |= (BITFIELD64_MASK(count) << reg);
      }
   }

   return mask;
}

static uint64_t
bi_write_mask(bi_instr *I)
{
   uint64_t mask = 0;

   bi_foreach_dest(I, d) {
      assert(I->dest[d].type == BI_INDEX_REGISTER);

      unsigned reg = I->dest[d].value;
      unsigned count = bi_count_write_registers(I, d);

      mask |= (BITFIELD64_MASK(count) << reg);
   }

   return mask;
}
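
/*
 * Worked example of the masks above (illustrative): a destination at register
 * r4 that writes four consecutive registers yields
 * BITFIELD64_MASK(4) << 4 = 0xf0, i.e. bits 4-7 set, covering r4-r7.
 */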

static bool
bi_ld_vary_writes_hidden_register(const bi_instr *I)
{
   /* Only varying loads can write the hidden register */
   if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_VARYING)
      return false;

   /* They only write in some update modes */
   return (I->update == BI_UPDATE_STORE) || (I->update == BI_UPDATE_CLOBBER);
}

static bool
bi_is_memory_access(const bi_instr *I)
{
   /* On the attribute unit but functionally a general memory load */
   if (I->op == BI_OPCODE_LD_ATTR_TEX)
      return true;

   /* UBOs are read-only so there are no ordering constraints */
   if (I->seg == BI_SEG_UBO)
      return false;

   switch (bi_opcode_props[I->op].message) {
   case BIFROST_MESSAGE_LOAD:
   case BIFROST_MESSAGE_STORE:
   case BIFROST_MESSAGE_ATOMIC:
      return true;
   default:
      return false;
   }
}

/* Update the scoreboard model to assign an instruction to a given slot */

static void
bi_push_instr(struct bi_scoreboard_state *st, bi_instr *I)
{
   if (bi_opcode_props[I->op].sr_write)
      st->write[I->slot] |= bi_write_mask(I);

   if (bi_is_memory_access(I))
      st->memory |= BITFIELD_BIT(I->slot);

   if (bi_opcode_props[I->op].message == BIFROST_MESSAGE_VARYING)
      st->varying |= BITFIELD_BIT(I->slot);
}

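/*
 * Wait on a slot: clear its pending state from the scoreboard model and return
 * the corresponding wait bit, to be ORed into an instruction's flow (hence
 * MUST_CHECK).
 */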
static uint8_t MUST_CHECK
bi_pop_slot(struct bi_scoreboard_state *st, unsigned slot)
{
   st->write[slot] = 0;
   st->varying &= ~BITFIELD_BIT(slot);
   st->memory &= ~BITFIELD_BIT(slot);

   return BITFIELD_BIT(slot);
}

/* Adds a dependency on each slot writing any specified register */

static uint8_t MUST_CHECK
bi_depend_on_writers(struct bi_scoreboard_state *st, uint64_t regmask)
{
   uint8_t slots = 0;

   for (unsigned slot = 0; slot < ARRAY_SIZE(st->write); ++slot) {
      if (st->write[slot] & regmask)
         slots |= bi_pop_slot(st, slot);
   }

   return slots;
}

/* Sets the dependencies for a given clause, updating the model */

static void
bi_set_dependencies(bi_block *block, bi_instr *I,
                    struct bi_scoreboard_state *st)
{
   /* Depend on writers to handle read-after-write and write-after-write
    * dependencies. Write-after-read dependencies are handled in the hardware
    * where necessary, so we don't worry about them.
    */
   I->flow |= bi_depend_on_writers(st, bi_read_mask(I) | bi_write_mask(I));

   /* Handle write-after-write and write-after-read dependencies for the
    * varying hidden registers. Read-after-write dependencies are handled in
    * hardware.
    */
   if (bi_ld_vary_writes_hidden_register(I)) {
      u_foreach_bit(slot, st->varying)
         I->flow |= bi_pop_slot(st, slot);
   }

   /* For now, serialize all memory access */
   if (bi_is_memory_access(I)) {
      u_foreach_bit(slot, st->memory)
         I->flow |= bi_pop_slot(st, slot);
   }

   /* We need to wait for all general slots before a barrier. The reason is
    * unknown. In theory, this is redundant, since the BARRIER instruction will
    * be followed immediately by .wait which waits for all slots. However, that
    * doesn't seem to work properly in practice.
    *
    * The DDK is observed to use the same workaround, going so far as
    * introducing a NOP before a BARRIER at the beginning of a basic block when
    * there are outstanding stores.
    *
    *     NOP.wait12
    *     BARRIER.slot7.wait
    *
    * Luckily, this situation is pretty rare. The wait introduced here can
    * usually be merged into the preceding instruction.
    *
    * We also use the same workaround to serialize all async instructions when
    * debugging this pass with the BIFROST_MESA_DEBUG=nosb option.
    */
   if (I->op == BI_OPCODE_BARRIER || (bifrost_debug & BIFROST_DBG_NOSB)) {
      for (unsigned i = 0; i < VA_NUM_GENERAL_SLOTS; ++i) {
         if (st->write[i] || ((st->varying | st->memory) & BITFIELD_BIT(i)))
            I->flow |= bi_pop_slot(st, i);
      }
   }
}

static bool
scoreboard_block_update(bi_context *ctx, bi_block *blk)
{
   bool progress = false;

   /* pending_in[s] = sum { p in pred[s] } ( pending_out[p] ) */
   bi_foreach_predecessor(blk, pred) {
      for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) {
         blk->scoreboard_in.read[i] |= (*pred)->scoreboard_out.read[i];
         blk->scoreboard_in.write[i] |= (*pred)->scoreboard_out.write[i];
         blk->scoreboard_in.varying |= (*pred)->scoreboard_out.varying;
         blk->scoreboard_in.memory |= (*pred)->scoreboard_out.memory;
      }
   }

   struct bi_scoreboard_state state = blk->scoreboard_in;

   /* Assign locally */

   bi_foreach_instr_in_block(blk, I) {
      bi_set_dependencies(blk, I, &state);
      bi_push_instr(&state, I);
   }

   /* Insert a wait for varyings at the end of the block.
    *
    * A varying load with .store has to wait for all other varying loads
    * in the quad to complete. The bad case looks like:
    *
    *     if (dynamic) {
    *        x = ld_var()
    *     } else {
    *        x = ld_var()
    *     }
    *
    * Logically, a given thread executes only a single ld_var instruction. But
    * if the quad diverges, the second ld_var has to wait for the first ld_var.
    * For correct handling, we need to maintain a physical control flow graph
    * and do the dataflow analysis on that instead of the logical control flow
    * graph. However, this probably doesn't matter much in practice. This seems
    * like a decent compromise for now.
    *
    * TODO: Consider optimizing this case.
    */
   if (state.varying) {
      uint8_t flow = 0;

      u_foreach_bit(slot, state.varying)
         flow |= bi_pop_slot(&state, slot);

      bi_flow(ctx, bi_after_block(blk), flow);
   }

   /* To figure out progress, diff scoreboard_out */
   progress = !!memcmp(&state, &blk->scoreboard_out, sizeof(state));

   blk->scoreboard_out = state;

   return progress;
}

static void
va_assign_scoreboard(bi_context *ctx)
{
   u_worklist worklist;
   bi_worklist_init(ctx, &worklist);

   bi_foreach_block(ctx, block) {
      bi_worklist_push_tail(&worklist, block);
   }

   /* Perform forward data flow analysis to calculate dependencies */
   while (!u_worklist_is_empty(&worklist)) {
      /* Pop from the front for forward analysis */
      bi_block *blk = bi_worklist_pop_head(&worklist);

      if (scoreboard_block_update(ctx, blk)) {
         bi_foreach_successor(blk, succ)
            bi_worklist_push_tail(&worklist, succ);
      }
   }

   u_worklist_fini(&worklist);
}

/*
 * Determine if execution should terminate after a given block. Execution
 * cannot terminate within a basic block.
 */
static bool
va_should_end(bi_block *block)
{
   /* Don't return if we're succeeded by instructions */
   for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
      bi_block *succ = block->successors[i];

      if (succ)
         return false;
   }

   return true;
}

/*
 * We should discard helper invocations as soon as helper invocations die after
 * their last use. Either they die after an instruction using helper
 * invocations, or they die along a control flow edge. The former is handled by
 * discarding appropriately after instructions. The latter is handled by
 * inserting a discard at the _start_ of some blocks:
 *
 * Lemma: If a non-critical edge discards helpers, it is the only edge that
 * enters its destination.
 *
 * Proof: An edge discards helpers if helpers are live at the end of the source
 * block and dead at the start of the destination block. By definition, helpers
 * are live at the end of a block iff they are live at the start of some
 * successor of the block. The source block therefore has a successor with
 * helpers live at the start and a successor with helpers dead at the start. As
 * the source block has at least two successors, the edge is NOT the only edge
 * exiting its source. Hence it is the only edge entering the destination,
 * otherwise the edge would be critical.
 *
 * By corollary, we may handle discards on control flow edges by discarding at
 * the start of blocks with a single predecessor.
 *
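 * Example (illustrative): in the diamond CFG A -> {B, C}, B -> D, C -> D,
 * where helpers are used only in B, helpers are live at the end of A (they are
 * live at the start of B) but dead at the start of C, so the edge A->C
 * discards. A has two successors, so A->C is non-critical only if it is the
 * sole edge entering C, in which case we discard at the start of C.
 *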
 * This routine tests if a block should discard helper invocations at its start.
 */
static bool
va_discard_before_block(bi_block *block)
{
   /* Do not discard if the block requires helpers at the start */
   if (block->pass_flags)
      return false;

   /* By the lemma, if we need to discard, there is a unique predecessor */
   if (bi_num_predecessors(block) != 1)
      return false;

   bi_block *pred = *util_dynarray_element(&block->predecessors, bi_block *, 0);

   /* Discard if helpers are live at the end of the predecessor, due to helpers
    * live at the start of some (other) successor.
    */
   bi_foreach_successor(pred, succ) {
      if (succ->pass_flags)
         return true;
   }

   return false;
}

/*
 * Test if a program is empty, in the sense of having zero instructions. Empty
 * shaders get special handling.
 */
static bool
bi_is_empty(bi_context *ctx)
{
   bi_foreach_instr_global(ctx, _)
      return false;

   return true;
}

/*
 * Given a program with no flow control modifiers, insert NOPs signaling the
 * required flow control. Not much optimization happens here.
 */
void
va_insert_flow_control_nops(bi_context *ctx)
{
   /* Special case: if a program is empty, leave it empty. In particular, do not
    * insert NOP.end. There is special handling in the driver for skipping empty
    * shaders for certain shader stages. The .end is not necessary and disrupts
    * optimizations.
    */
   if (bi_is_empty(ctx))
      return;

   /* First do dataflow analysis for the scoreboard. This populates I->flow with
    * a bitmap of slots to wait on.
    *
    * Also do dataflow analysis for helper invocations in fragment shaders. This
    * populates block->pass_flags with helper invocation information.
    */
   va_assign_scoreboard(ctx);
   bi_analyze_helper_terminate(ctx);

   bi_foreach_block(ctx, block) {
      /* Handle discards along control flow edges */
      if (va_discard_before_block(block))
         bi_flow(ctx, bi_before_block(block), VA_FLOW_DISCARD);

      bi_foreach_instr_in_block_safe(block, I) {
         switch (I->op) {
         /* Signal barriers immediately */
         case BI_OPCODE_BARRIER:
            bi_flow(ctx, bi_after_instr(I), VA_FLOW_WAIT);
            break;

         /* Insert waits for tilebuffer and depth/stencil instructions. These
          * only happen in regular fragment shaders, as the required waits are
          * assumed to already have happened in blend shaders.
          *
          * For discarded thread handling, ATEST must be serialized against all
          * other asynchronous instructions, and subsequent instructions should
          * be serialized against the ATEST itself, so also wait on slot #0
          * immediately after the ATEST.
          */
         case BI_OPCODE_BLEND:
         case BI_OPCODE_LD_TILE:
         case BI_OPCODE_ST_TILE:
            if (!ctx->inputs->is_blend)
               bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT);
            break;
         case BI_OPCODE_ATEST:
            bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT0126);
            bi_flow(ctx, bi_after_instr(I), VA_FLOW_WAIT0);
            break;
         case BI_OPCODE_ZS_EMIT:
            if (!ctx->inputs->is_blend)
               bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT0126);
            break;

         default:
            break;
         }

         if (I->flow && I->op != BI_OPCODE_NOP) {
            /* Wait on the results of asynchronous instructions.
             *
             * The bitmap of general slots lines up with the encoding of
             * va_flow for waits on general slots. The dataflow analysis should
             * be ignoring the special slots #6 and #7, which are handled
             * separately.
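             *
             * For example (following the mapping above), a dependency on slots
             * #0 and #2 yields I->flow = 0b101, which va_flow interprets as a
             * wait on exactly those two slots.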
             */
            assert((I->flow & ~BITFIELD_MASK(VA_NUM_GENERAL_SLOTS)) == 0);

            bi_flow(ctx, bi_before_instr(I), I->flow);
            I->flow = 0;
         }
      }

      /* Terminate helpers after the last use */
      if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend &&
          block->pass_flags && bi_block_terminates_helpers(block)) {

         bi_foreach_instr_in_block_safe_rev(block, I) {
            if (bi_instr_uses_helpers(I)) {
               bi_flow(ctx, bi_after_instr(I), VA_FLOW_DISCARD);
               break;
            }
         }
      }

      /* End execution at the end of the block if needed, or reconverge if we
       * continue but we don't need to end execution.
       */
      if (va_should_end(block) || block->needs_nop) {
         /* Don't bother adding a NOP into an unreachable block */
         if (block == bi_start_block(&ctx->blocks) ||
             bi_num_predecessors(block))
            bi_flow(ctx, bi_after_block(block), VA_FLOW_END);
      } else if (bi_reconverge_branches(block)) {
         /* TODO: Do we ever need to reconverge from an empty block? */
         if (!list_is_empty(&block->instructions))
            bi_flow(ctx, bi_after_block(block), VA_FLOW_RECONVERGE);
      }
   }

   /* If helpers are not used anywhere, they are not used at the start, so we
    * terminate at the start. Else, helpers are used somewhere in the shader and
    * are terminated after their last use.
    */
   bi_block *start = bi_start_block(&ctx->blocks);
   bool frag = (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend);

   if (frag && !start->pass_flags)
      bi_flow(ctx, bi_before_block(start), VA_FLOW_DISCARD);
}

/*
 * Assign slots to all asynchronous instructions. A few special instructions
 * require specific slots. For the rest, we assign slots in a round-robin
 * fashion to reduce false dependencies when encoding waits.
 *
 * This should be called before va_insert_flow_control_nops.
 */
void
va_assign_slots(bi_context *ctx)
{
   unsigned counter = 0;

   bi_foreach_instr_global(ctx, I) {
      if (I->op == BI_OPCODE_BARRIER) {
         I->slot = 7;
      } else if (I->op == BI_OPCODE_ZS_EMIT || I->op == BI_OPCODE_ATEST) {
         I->slot = 0;
      } else if (bi_opcode_props[I->op].message) {
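         /* Round-robin the remaining message-passing instructions over the
          * general slots #0-#2: e.g. three back-to-back loads would land on
          * slots 0, 1 and 2, and a fourth would reuse slot 0.
          */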
         I->slot = counter++;

         if (counter == 3)
            counter = 0;
      }
   }
}