1 /*
2 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Rob Clark <robclark@freedesktop.org>
25 */
26
27 #include "util/ralloc.h"
28 #include "util/u_math.h"
29
30 #include "ir3.h"
31 #include "ir3_shader.h"
32
33 /*
34 * Legalize:
35 *
36 * The legalize pass handles ensuring sufficient nop's and sync flags for
37 * correct execution.
38 *
39 * 1) Iteratively determine where sync ((sy)/(ss)) flags are needed,
40 * based on state flowing out of predecessor blocks until there is
41 * no further change. In some cases this requires inserting nops.
42 * 2) Mark (ei) on last varying input
43 * 3) Final nop scheduling for instruction latency
44 * 4) Resolve jumps and schedule blocks, marking potential convergence
45 * points with (jp)
46 */
47
/* Pass-wide context for the legalize pass. */
struct ir3_legalize_ctx {
   struct ir3_compiler *compiler;
   struct ir3_shader_variant *so;
   gl_shader_stage type;
   /* Highest varying input location (inloc immediate) consumed so far: */
   int max_bary;
   /* Whether varying storage may be released ((ei) flag) at the last
    * input instruction rather than at the end of the program:
    */
   bool early_input_release;
   /* Whether the shader has any varying inputs at all: */
   bool has_inputs;
};
56
/* Dataflow state tracked per block: which registers have in-flight
 * results that require an (ss) or (sy) sync before the next access.
 */
struct ir3_legalize_state {
   regmask_t needs_ss;
   regmask_t needs_ss_war; /* write after read */
   regmask_t needs_sy;
   /* True when a const-file write (stsc) is pending, so the next
    * const-file read needs (ss):
    */
   bool needs_ss_for_const;
};
63
/* Per-block data for the fixed-point iteration: 'valid' marks a block
 * whose output state has been computed; it is cleared again on all
 * successors whenever a predecessor's output state changes.
 */
struct ir3_legalize_block_data {
   bool valid;
   struct ir3_legalize_state state;
};
68
69 static inline void
apply_ss(struct ir3_instruction * instr,struct ir3_legalize_state * state,bool mergedregs)70 apply_ss(struct ir3_instruction *instr,
71 struct ir3_legalize_state *state,
72 bool mergedregs)
73 {
74 instr->flags |= IR3_INSTR_SS;
75 regmask_init(&state->needs_ss_war, mergedregs);
76 regmask_init(&state->needs_ss, mergedregs);
77 state->needs_ss_for_const = false;
78 }
79
80 static inline void
apply_sy(struct ir3_instruction * instr,struct ir3_legalize_state * state,bool mergedregs)81 apply_sy(struct ir3_instruction *instr,
82 struct ir3_legalize_state *state,
83 bool mergedregs)
84 {
85 instr->flags |= IR3_INSTR_SY;
86 regmask_init(&state->needs_sy, mergedregs);
87 }
88
89 /* We want to evaluate each block from the position of any other
90 * predecessor block, in order that the flags set are the union of
91 * all possible program paths.
92 *
93 * To do this, we need to know the output state (needs_ss/ss_war/sy)
94 * of all predecessor blocks. The tricky thing is loops, which mean
95 * that we can't simply recursively process each predecessor block
96 * before legalizing the current block.
97 *
98 * How we handle that is by looping over all the blocks until the
99 * results converge. If the output state of a given block changes
100 * in a given pass, this means that all successor blocks are not
101 * yet fully legalized.
102 */
103
/* Legalize one block: compute its input sync state from its predecessors,
 * then walk the instructions applying (ss)/(sy) flags, inserting nops
 * where a flag can't be carried, expanding samgq, and handling last-input
 * (ei) release. Returns false if the block was already converged (valid),
 * true if it was (re)processed.
 */
static bool
legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
{
   struct ir3_legalize_block_data *bd = block->data;

   /* Already converged, nothing to do: */
   if (bd->valid)
      return false;

   struct ir3_instruction *last_n = NULL;
   struct list_head instr_list;
   /* Snapshot the previous output state so we can detect changes below: */
   struct ir3_legalize_state prev_state = bd->state;
   struct ir3_legalize_state *state = &bd->state;
   bool last_input_needs_ss = false;
   bool has_tex_prefetch = false;
   bool mergedregs = ctx->so->mergedregs;

   /* our input state is the OR of all predecessor blocks' state: */
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      struct ir3_block *predecessor = block->predecessors[i];
      struct ir3_legalize_block_data *pbd = predecessor->data;
      struct ir3_legalize_state *pstate = &pbd->state;

      /* Our input (ss)/(sy) state is based on OR'ing the output
       * state of all our predecessor blocks
       */
      regmask_or(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
      regmask_or(&state->needs_ss_war, &state->needs_ss_war,
                 &pstate->needs_ss_war);
      regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy);
      state->needs_ss_for_const |= pstate->needs_ss_for_const;
   }

   /* We need to take physical-only edges into account when tracking shared
    * registers.
    */
   for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
      struct ir3_block *predecessor = block->physical_predecessors[i];
      struct ir3_legalize_block_data *pbd = predecessor->data;
      struct ir3_legalize_state *pstate = &pbd->state;

      regmask_or_shared(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
   }

   /* Count inputs up front so we can recognize the last one below: */
   unsigned input_count = 0;

   foreach_instr (n, &block->instr_list) {
      if (is_input(n)) {
         input_count++;
      }
   }

   unsigned inputs_remaining = input_count;

   /* Either inputs are in the first block or we expect inputs to be released
    * with the end of the program.
    */
   assert(input_count == 0 || !ctx->early_input_release ||
          block == ir3_after_preamble(block->shader));

   /* remove all the instructions from the list, we'll be adding
    * them back in as we go
    */
   list_replace(&block->instr_list, &instr_list);
   list_inithead(&block->instr_list);

   foreach_instr_safe (n, &instr_list) {
      unsigned i;

      /* Start from a clean slate, sync flags are recomputed each pass: */
      n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);

      /* _meta::tex_prefetch instructions removed later in
       * collect_tex_prefetches()
       */
      if (is_meta(n) && (n->opc != OPC_META_TEX_PREFETCH))
         continue;

      /* Track the highest varying location fetched: */
      if (is_input(n)) {
         struct ir3_register *inloc = n->srcs[0];
         assert(inloc->flags & IR3_REG_IMMED);
         ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val);
      }

      /* After a barrier (or at shpe, the end of the preamble) everything
       * in flight must have completed, so force both syncs:
       */
      if ((last_n && is_barrier(last_n)) || n->opc == OPC_SHPE) {
         apply_ss(n, state, mergedregs);
         apply_sy(n, state, mergedregs);
         last_input_needs_ss = false;
      }

      /* The first instruction after a predt gets (ss): */
      if (last_n && (last_n->opc == OPC_PREDT)) {
         apply_ss(n, state, mergedregs);
      }

      /* NOTE: consider dst register too.. it could happen that
       * texture sample instruction (for example) writes some
       * components which are unused. A subsequent instruction
       * that writes the same register can race w/ the sam instr
       * resulting in undefined results:
       */
      for (i = 0; i < n->dsts_count + n->srcs_count; i++) {
         struct ir3_register *reg;
         if (i < n->dsts_count)
            reg = n->dsts[i];
         else
            reg = n->srcs[i - n->dsts_count];

         if (reg_gpr(reg)) {

            /* TODO: we probably only need (ss) for alu
             * instr consuming sfu result.. need to make
             * some tests for both this and (sy)..
             */
            if (regmask_get(&state->needs_ss, reg)) {
               apply_ss(n, state, mergedregs);
               last_input_needs_ss = false;
            }

            if (regmask_get(&state->needs_sy, reg)) {
               apply_sy(n, state, mergedregs);
            }
         } else if ((reg->flags & IR3_REG_CONST)) {
            /* Const-file reads must wait for any pending stsc (see
             * OPC_PUSH_CONSTS_LOAD_MACRO handling below):
             */
            if (state->needs_ss_for_const) {
               apply_ss(n, state, mergedregs);
               last_input_needs_ss = false;
            }
         }
      }

      /* Write-after-read hazards also require (ss): */
      foreach_dst (reg, n) {
         if (regmask_get(&state->needs_ss_war, reg)) {
            apply_ss(n, state, mergedregs);
            last_input_needs_ss = false;
         }
      }

      /* cat5+ does not have an (ss) bit, if needed we need to
       * insert a nop to carry the sync flag. Would be kinda
       * clever if we were aware of this during scheduling, but
       * this should be a pretty rare case:
       */
      if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) {
         struct ir3_instruction *nop;
         nop = ir3_NOP(block);
         nop->flags |= IR3_INSTR_SS;
         n->flags &= ~IR3_INSTR_SS;
      }

      /* need to be able to set (ss) on first instruction: */
      if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5) && !is_meta(n))
         ir3_NOP(block);

      if (ctx->compiler->samgq_workaround &&
          ctx->type != MESA_SHADER_FRAGMENT &&
          ctx->type != MESA_SHADER_COMPUTE && n->opc == OPC_SAMGQ) {
         /* Expand samgq into four samgp0..3 clones; the original samgq
          * is dropped (not re-added to the rebuilt list):
          */
         struct ir3_instruction *samgp;

         list_delinit(&n->node);

         for (i = 0; i < 4; i++) {
            samgp = ir3_instr_clone(n);
            samgp->opc = OPC_SAMGP0 + i;
            if (i > 1)
               samgp->flags |= IR3_INSTR_SY;
         }
      } else {
         /* move instruction into the rebuilt block instr list: */
         list_delinit(&n->node);
         list_addtail(&n->node, &block->instr_list);
      }

      /* Record the hazards this instruction creates for later consumers: */
      if (is_sfu(n))
         regmask_set(&state->needs_ss, n->dsts[0]);

      foreach_dst (dst, n) {
         if (dst->flags & IR3_REG_SHARED)
            regmask_set(&state->needs_ss, dst);
      }

      if (is_tex_or_prefetch(n)) {
         regmask_set(&state->needs_sy, n->dsts[0]);
         if (n->opc == OPC_META_TEX_PREFETCH)
            has_tex_prefetch = true;
      } else if (n->opc == OPC_RESINFO) {
         /* resinfo apparently needs an immediate (ss) nop after it: */
         regmask_set(&state->needs_ss, n->dsts[0]);
         ir3_NOP(block)->flags |= IR3_INSTR_SS;
         last_input_needs_ss = false;
      } else if (is_load(n)) {
         /* local memory loads sync via (ss), everything else via (sy): */
         if (is_local_mem_load(n))
            regmask_set(&state->needs_ss, n->dsts[0]);
         else
            regmask_set(&state->needs_sy, n->dsts[0]);
      } else if (is_atomic(n->opc)) {
         if (is_bindless_atomic(n->opc)) {
            regmask_set(&state->needs_sy, n->srcs[2]);
         } else if (is_global_a3xx_atomic(n->opc) ||
                    is_global_a6xx_atomic(n->opc)) {
            regmask_set(&state->needs_sy, n->dsts[0]);
         } else {
            regmask_set(&state->needs_ss, n->dsts[0]);
         }
      } else if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
         state->needs_ss_for_const = true;
      }

      if (is_ssbo(n->opc) || is_global_a3xx_atomic(n->opc) ||
          is_bindless_atomic(n->opc))
         ctx->so->has_ssbo = true;

      /* both tex/sfu appear to not always immediately consume
       * their src register(s):
       */
      if (is_tex(n) || is_sfu(n) || is_mem(n)) {
         foreach_src (reg, n) {
            regmask_set(&state->needs_ss_war, reg);
         }
      }

      if (ctx->early_input_release && is_input(n)) {
         last_input_needs_ss |= (n->opc == OPC_LDLV);

         assert(inputs_remaining > 0);
         inputs_remaining--;
         if (inputs_remaining == 0) {
            /* This is the last input. We add the (ei) flag to release
             * varying memory after this executes. If it's an ldlv,
             * however, we need to insert a dummy bary.f on which we can
             * set the (ei) flag. We may also need to insert an (ss) to
             * guarantee that all ldlv's have finished fetching their
             * results before releasing the varying memory.
             */
            struct ir3_instruction *last_input = n;
            if (n->opc == OPC_LDLV) {
               struct ir3_instruction *baryf;

               /* (ss)bary.f (ei)r63.x, 0, r0.x */
               baryf = ir3_instr_create(block, OPC_BARY_F, 1, 2);
               ir3_dst_create(baryf, regid(63, 0), 0);
               ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
               ir3_src_create(baryf, regid(0, 0), 0);

               last_input = baryf;
            }

            last_input->dsts[0]->flags |= IR3_REG_EI;
            if (last_input_needs_ss) {
               apply_ss(last_input, state, mergedregs);
            }
         }
      }

      last_n = n;
   }

   assert(inputs_remaining == 0 || !ctx->early_input_release);

   if (has_tex_prefetch && !ctx->has_inputs) {
      /* texture prefetch, but *no* inputs.. we need to insert a
       * dummy bary.f at the top of the shader to unblock varying
       * storage:
       */
      struct ir3_instruction *baryf;

      /* (ss)bary.f (ei)r63.x, 0, r0.x */
      baryf = ir3_instr_create(block, OPC_BARY_F, 1, 2);
      ir3_dst_create(baryf, regid(63, 0), 0)->flags |= IR3_REG_EI;
      ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
      ir3_src_create(baryf, regid(0, 0), 0);

      /* insert the dummy bary.f at head: */
      list_delinit(&baryf->node);
      list_add(&baryf->node, &block->instr_list);
   }

   bd->valid = true;

   if (memcmp(&prev_state, state, sizeof(*state))) {
      /* our output state changed, this invalidates all of our
       * successors:
       */
      for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
         if (!block->successors[i])
            break;
         struct ir3_legalize_block_data *pbd = block->successors[i]->data;
         pbd->valid = false;
      }
   }

   return true;
}
391
392 /* Expands dsxpp and dsypp macros to:
393 *
394 * dsxpp.1 dst, src
395 * dsxpp.1.p dst, src
396 *
397 * We apply this after flags syncing, as we don't want to sync in between the
398 * two (which might happen if dst == src). We do it before nop scheduling
399 * because that needs to count actual instructions.
400 */
static bool
apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
{
   struct list_head instr_list;

   /* remove all the instructions from the list, we'll be adding
    * them back in as we go
    */
   list_replace(&block->instr_list, &instr_list);
   list_inithead(&block->instr_list);

   foreach_instr_safe (n, &instr_list) {
      list_addtail(&n->node, &block->instr_list);

      if (n->opc == OPC_DSXPP_MACRO || n->opc == OPC_DSYPP_MACRO) {
         /* rewrite the macro in place as the first half of the pair: */
         n->opc = (n->opc == OPC_DSXPP_MACRO) ? OPC_DSXPP_1 : OPC_DSYPP_1;

         /* second (.p) half -- NOTE(review): relies on ir3_instr_clone
          * appending the clone to the block's instr_list, i.e. right
          * after n, which was just added at the tail -- confirm.
          */
         struct ir3_instruction *op_p = ir3_instr_clone(n);
         op_p->flags = IR3_INSTR_P;

         /* fine derivatives need all four fibers of the quad alive: */
         ctx->so->need_full_quad = true;
      }
   }

   return true;
}
427
/* Expand a push_consts_load macro (which must appear before any real,
 * non-meta instruction in the block) into a real stsc (store to const
 * file) instruction. At most one macro per block is expanded.
 */
static void
apply_push_consts_load_macro(struct ir3_legalize_ctx *ctx,
                             struct ir3_block *block)
{
   foreach_instr (n, &block->instr_list) {
      if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
         struct ir3_instruction *stsc = ir3_instr_create(block, OPC_STSC, 0, 2);
         ir3_instr_move_after(stsc, n);
         /* src[0]: destination base in the const file: */
         ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val =
            n->push_consts.dst_base;
         /* src[1]: source base: */
         ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val =
            n->push_consts.src_base;
         stsc->cat6.iim_val = n->push_consts.src_size;
         stsc->cat6.type = TYPE_U32;

         if (ctx->compiler->stsc_duplication_quirk) {
            /* hw quirk: issue the stsc twice with an (ss) nop between: */
            struct ir3_instruction *nop = ir3_NOP(block);
            ir3_instr_move_after(nop, stsc);
            nop->flags |= IR3_INSTR_SS;
            ir3_instr_move_after(ir3_instr_clone(stsc), nop);
         }

         /* drop the now-expanded macro: */
         list_delinit(&n->node);
         break;
      } else if (!is_meta(n)) {
         /* macro is only expected before any real instructions: */
         break;
      }
   }
}
457
458 /* NOTE: branch instructions are always the last instruction(s)
459 * in the block. We take advantage of this as we resolve the
460 * branches, since "if (foo) break;" constructs turn into
461 * something like:
462 *
463 * block3 {
464 * ...
465 * 0029:021: mov.s32s32 r62.x, r1.y
466 * 0082:022: br !p0.x, target=block5
467 * 0083:023: br p0.x, target=block4
468 * // succs: if _[0029:021: mov.s32s32] block4; else block5;
469 * }
470 * block4 {
471 * 0084:024: jump, target=block6
472 * // succs: block6;
473 * }
474 * block5 {
475 * 0085:025: jump, target=block7
476 * // succs: block7;
477 * }
478 *
479 * ie. only instruction in block4/block5 is a jump, so when
480 * resolving branches we can easily detect this by checking
481 * that the first instruction in the target block is itself
482 * a jump, and setup the br directly to the jump's target
483 * (and strip back out the now unreached jump)
484 *
485 * TODO sometimes we end up with things like:
486 *
487 * br !p0.x, #2
488 * br p0.x, #12
489 * add.u r0.y, r0.y, 1
490 *
491 * If we swapped the order of the branches, we could drop one.
492 */
493 static struct ir3_block *
resolve_dest_block(struct ir3_block * block)494 resolve_dest_block(struct ir3_block *block)
495 {
496 /* special case for last block: */
497 if (!block->successors[0])
498 return block;
499
500 /* NOTE that we may or may not have inserted the jump
501 * in the target block yet, so conditions to resolve
502 * the dest to the dest block's successor are:
503 *
504 * (1) successor[1] == NULL &&
505 * (2) (block-is-empty || only-instr-is-jump)
506 */
507 if (block->successors[1] == NULL) {
508 if (list_is_empty(&block->instr_list)) {
509 return block->successors[0];
510 } else if (list_length(&block->instr_list) == 1) {
511 struct ir3_instruction *instr =
512 list_first_entry(&block->instr_list, struct ir3_instruction, node);
513 if (instr->opc == OPC_JUMP) {
514 /* If this jump is backwards, then we will probably convert
515 * the jump being resolved to a backwards jump, which will
516 * change a loop-with-continue or loop-with-if into a
517 * doubly-nested loop and change the convergence behavior.
518 * Disallow this here.
519 */
520 if (block->successors[0]->index <= block->index)
521 return block;
522 return block->successors[0];
523 }
524 }
525 }
526 return block;
527 }
528
529 static void
remove_unused_block(struct ir3_block * old_target)530 remove_unused_block(struct ir3_block *old_target)
531 {
532 list_delinit(&old_target->node);
533
534 /* cleanup dangling predecessors: */
535 for (unsigned i = 0; i < ARRAY_SIZE(old_target->successors); i++) {
536 if (old_target->successors[i]) {
537 struct ir3_block *succ = old_target->successors[i];
538 ir3_block_remove_predecessor(succ, old_target);
539 }
540 }
541 }
542
543 static bool
retarget_jump(struct ir3_instruction * instr,struct ir3_block * new_target)544 retarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target)
545 {
546 struct ir3_block *old_target = instr->cat0.target;
547 struct ir3_block *cur_block = instr->block;
548
549 /* update current blocks successors to reflect the retargetting: */
550 if (cur_block->successors[0] == old_target) {
551 cur_block->successors[0] = new_target;
552 } else {
553 assert(cur_block->successors[1] == old_target);
554 cur_block->successors[1] = new_target;
555 }
556
557 /* update new target's predecessors: */
558 ir3_block_add_predecessor(new_target, cur_block);
559
560 /* and remove old_target's predecessor: */
561 ir3_block_remove_predecessor(old_target, cur_block);
562
563 /* If we reconverged at the old target, we'll reconverge at the new target
564 * too:
565 */
566 new_target->reconvergence_point |= old_target->reconvergence_point;
567
568 instr->cat0.target = new_target;
569
570 if (old_target->predecessors_count == 0) {
571 remove_unused_block(old_target);
572 return true;
573 }
574
575 return false;
576 }
577
/* Jump optimization: retarget branches that point at trivial jump-only
 * blocks directly at the final destination, and delete jumps whose
 * target is simply the next block (fall-through). Returns true if
 * anything changed.
 */
static bool
opt_jump(struct ir3 *ir)
{
   bool progress = false;

   /* number the blocks so resolve_dest_block() can detect backwards
    * jumps by comparing indices:
    */
   unsigned index = 0;
   foreach_block (block, &ir->block_list)
      block->index = index++;

   foreach_block (block, &ir->block_list) {
      /* This pass destroys the physical CFG so don't keep it around to avoid
       * validation errors.
       */
      block->physical_successors_count = 0;
      block->physical_predecessors_count = 0;

      foreach_instr (instr, &block->instr_list) {
         if (!is_flow(instr) || !instr->cat0.target)
            continue;

         struct ir3_block *tblock = resolve_dest_block(instr->cat0.target);
         if (tblock != instr->cat0.target) {
            progress = true;

            /* Exit early if we deleted a block to avoid iterator
             * weirdness/assert fails
             */
            if (retarget_jump(instr, tblock))
               return true;
         }
      }

      /* Detect the case where the block ends either with:
       * - A single unconditional jump to the next block.
       * - Two jump instructions with opposite conditions, and one of the
       *   them jumps to the next block.
       * We can remove the one that jumps to the next block in either case.
       */
      if (list_is_empty(&block->instr_list))
         continue;

      /* jumps[0] is the last instruction, jumps[1] the one before it
       * (if any):
       */
      struct ir3_instruction *jumps[2] = {NULL, NULL};
      jumps[0] =
         list_last_entry(&block->instr_list, struct ir3_instruction, node);
      if (!list_is_singular(&block->instr_list))
         jumps[1] =
            list_last_entry(&jumps[0]->node, struct ir3_instruction, node);

      /* only consider an unconditional jump alone, or a conditional
       * branch (OPC_B) pair:
       */
      if (jumps[0]->opc == OPC_JUMP)
         jumps[1] = NULL;
      else if (jumps[0]->opc != OPC_B || !jumps[1] || jumps[1]->opc != OPC_B)
         continue;

      for (unsigned i = 0; i < 2; i++) {
         if (!jumps[i])
            continue;

         /* drop the jump whose target is the fall-through block: */
         struct ir3_block *tblock = jumps[i]->cat0.target;
         if (&tblock->node == block->node.next) {
            list_delinit(&jumps[i]->node);
            progress = true;
            break;
         }
      }
   }

   return progress;
}
646
647 static void
resolve_jumps(struct ir3 * ir)648 resolve_jumps(struct ir3 *ir)
649 {
650 foreach_block (block, &ir->block_list)
651 foreach_instr (instr, &block->instr_list)
652 if (is_flow(instr) && instr->cat0.target) {
653 struct ir3_instruction *target = list_first_entry(
654 &instr->cat0.target->instr_list, struct ir3_instruction, node);
655
656 instr->cat0.immed = (int)target->ip - (int)instr->ip;
657 }
658 }
659
660 static void
mark_jp(struct ir3_block * block)661 mark_jp(struct ir3_block *block)
662 {
663 /* We only call this on the end block (in kill_sched) or after retargeting
664 * all jumps to empty blocks (in mark_xvergence_points) so there's no need to
665 * worry about empty blocks.
666 */
667 assert(!list_is_empty(&block->instr_list));
668
669 struct ir3_instruction *target =
670 list_first_entry(&block->instr_list, struct ir3_instruction, node);
671 target->flags |= IR3_INSTR_JP;
672 }
673
674 /* Mark points where control flow reconverges.
675 *
676 * Re-convergence points are where "parked" threads are reconverged with threads
677 * that took the opposite path last time around. We already calculated them, we
678 * just need to mark them with (jp).
679 */
680 static void
mark_xvergence_points(struct ir3 * ir)681 mark_xvergence_points(struct ir3 *ir)
682 {
683 foreach_block (block, &ir->block_list) {
684 if (block->reconvergence_point)
685 mark_jp(block);
686 }
687 }
688
689 /* Insert the branch/jump instructions for flow control between blocks.
690 * Initially this is done naively, without considering if the successor
691 * block immediately follows the current block (ie. so no jump required),
692 * but that is cleaned up in opt_jump().
693 *
694 * TODO what ensures that the last write to p0.x in a block is the
695 * branch condition? Have we been getting lucky all this time?
696 */
static void
block_sched(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      if (block->successors[1]) {
         /* if/else, conditional branches to "then" or "else": */
         struct ir3_instruction *br1, *br2;

         if (block->brtype == IR3_BRANCH_GETONE ||
             block->brtype == IR3_BRANCH_GETLAST ||
             block->brtype == IR3_BRANCH_SHPS) {
            /* getone/shps can't be inverted, and it wouldn't even make sense
             * to follow it with an inverted branch, so follow it by an
             * unconditional branch.
             */
            assert(!block->condition);
            if (block->brtype == IR3_BRANCH_GETONE)
               br1 = ir3_GETONE(block);
            else if (block->brtype == IR3_BRANCH_GETLAST)
               br1 = ir3_GETLAST(block);
            else
               br1 = ir3_SHPS(block);
            br1->cat0.target = block->successors[1];

            br2 = ir3_JUMP(block);
            br2->cat0.target = block->successors[0];
         } else {
            assert(block->condition);

            /* create "else" branch first (since "then" block should
             * frequently/always end up being a fall-thru):
             */
            br1 = ir3_instr_create(block, OPC_B, 0, 1);
            ir3_src_create(br1, regid(REG_P0, 0), 0)->def =
               block->condition->dsts[0];
            br1->cat0.inv1 = true;
            br1->cat0.target = block->successors[1];

            /* "then" branch: */
            br2 = ir3_instr_create(block, OPC_B, 0, 1);
            ir3_src_create(br2, regid(REG_P0, 0), 0)->def =
               block->condition->dsts[0];
            br2->cat0.target = block->successors[0];

            /* note: the inverted (br1) form of an ALL branch is encoded
             * as ANY and vice versa:
             */
            switch (block->brtype) {
            case IR3_BRANCH_COND:
               br1->cat0.brtype = br2->cat0.brtype = BRANCH_PLAIN;
               break;
            case IR3_BRANCH_ALL:
               br1->cat0.brtype = BRANCH_ANY;
               br2->cat0.brtype = BRANCH_ALL;
               break;
            case IR3_BRANCH_ANY:
               br1->cat0.brtype = BRANCH_ALL;
               br2->cat0.brtype = BRANCH_ANY;
               break;
            case IR3_BRANCH_GETONE:
            case IR3_BRANCH_GETLAST:
            case IR3_BRANCH_SHPS:
               unreachable("can't get here");
            }
         }
      } else if (block->successors[0]) {
         /* otherwise unconditional jump to next block: */
         struct ir3_instruction *jmp;

         jmp = ir3_JUMP(block);
         jmp->cat0.target = block->successors[0];
      }
   }
}
768
769 /* Here we workaround the fact that kill doesn't actually kill the thread as
770 * GL expects. The last instruction always needs to be an end instruction,
771 * which means that if we're stuck in a loop where kill is the only way out,
772 * then we may have to jump out to the end. kill may also have the d3d
773 * semantics of converting the thread to a helper thread, rather than setting
774 * the exec mask to 0, in which case the helper thread could get stuck in an
775 * infinite loop.
776 *
777 * We do this late, both to give the scheduler the opportunity to reschedule
778 * kill instructions earlier and to avoid having to create a separate basic
779 * block.
780 *
781 * TODO: Assuming that the wavefront doesn't stop as soon as all threads are
782 * killed, we might benefit by doing this more aggressively when the remaining
783 * part of the program after the kill is large, since that would let us
784 * skip over the instructions when there are no non-killed threads left.
785 */
static void
kill_sched(struct ir3 *ir, struct ir3_shader_variant *so)
{
   /* True if we know that this block will always eventually lead to the end
    * block:
    */
   bool always_ends = true;
   bool added = false;
   struct ir3_block *last_block =
      list_last_entry(&ir->block_list, struct ir3_block, node);

   foreach_block_rev (block, &ir->block_list) {
      /* A successor at or before this block is a backwards edge (loop).
       * Note always_ends is a latch: once it goes false while walking
       * backwards, all earlier blocks are treated as possibly looping.
       */
      for (unsigned i = 0; i < 2 && block->successors[i]; i++) {
         if (block->successors[i]->start_ip <= block->end_ip)
            always_ends = false;
      }

      if (always_ends)
         continue;

      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->opc != OPC_KILL)
            continue;

         /* conditional branch (same predicate source as the kill) to the
          * end block:
          */
         struct ir3_instruction *br = ir3_instr_create(block, OPC_B, 0, 1);
         ir3_src_create(br, instr->srcs[0]->num, instr->srcs[0]->flags)->wrmask =
            1;
         br->cat0.target =
            list_last_entry(&ir->block_list, struct ir3_block, node);

         /* ir3_instr_create appended br at the block tail; move it to
          * immediately after the kill instead:
          */
         list_del(&br->node);
         list_add(&br->node, &instr->node);

         added = true;
      }
   }

   if (added) {
      /* I'm not entirely sure how the branchstack works, but we probably
       * need to add at least one entry for the divergence which is resolved
       * at the end:
       */
      so->branchstack++;

      /* We don't update predecessors/successors, so we have to do this
       * manually:
       */
      mark_jp(last_block);
   }
}
836
837 /* Insert nop's required to make this a legal/valid shader program: */
static void
nop_sched(struct ir3 *ir, struct ir3_shader_variant *so)
{
   foreach_block (block, &ir->block_list) {
      struct ir3_instruction *last = NULL;
      struct list_head instr_list;

      /* remove all the instructions from the list, we'll be adding
       * them back in as we go
       */
      list_replace(&block->instr_list, &instr_list);
      list_inithead(&block->instr_list);

      foreach_instr_safe (instr, &instr_list) {
         /* cycles of latency this instr needs relative to what has
          * already been emitted:
          */
         unsigned delay = ir3_delay_calc(block, instr, so->mergedregs);

         /* NOTE: I think the nopN encoding works for a5xx and
          * probably a4xx, but not a3xx. So far only tested on
          * a6xx.
          */

         /* first, try folding the delay into the previous cat2/cat3
          * instruction's nopN field:
          */
         if ((delay > 0) && (ir->compiler->gen >= 6) && last &&
             ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3)) &&
             (last->repeat == 0)) {
            /* the previous cat2/cat3 instruction can encode at most 3 nop's: */
            unsigned transfer = MIN2(delay, 3 - last->nop);
            last->nop += transfer;
            delay -= transfer;
         }

         /* next, try extending a previous nop's repeat count: */
         if ((delay > 0) && last && (last->opc == OPC_NOP)) {
            /* the previous nop can encode at most 5 repeats: */
            unsigned transfer = MIN2(delay, 5 - last->repeat);
            last->repeat += transfer;
            delay -= transfer;
         }

         /* whatever remains becomes a fresh (rptN) nop, which executes
          * N+1 times:
          */
         if (delay > 0) {
            assert(delay <= 6);
            ir3_NOP(block)->repeat = delay - 1;
         }

         list_addtail(&instr->node, &block->instr_list);
         last = instr;
      }
   }
}
885
886 static void
dbg_sync_sched(struct ir3 * ir,struct ir3_shader_variant * so)887 dbg_sync_sched(struct ir3 *ir, struct ir3_shader_variant *so)
888 {
889 foreach_block (block, &ir->block_list) {
890 foreach_instr_safe (instr, &block->instr_list) {
891 if (opc_cat(instr->opc) == 4 || opc_cat(instr->opc) == 5 ||
892 opc_cat(instr->opc) == 6) {
893 struct ir3_instruction *nop = ir3_NOP(block);
894 nop->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
895 ir3_instr_move_after(nop, instr);
896 }
897 }
898 }
899 }
900
901 static void
dbg_nop_sched(struct ir3 * ir,struct ir3_shader_variant * so)902 dbg_nop_sched(struct ir3 *ir, struct ir3_shader_variant *so)
903 {
904 foreach_block (block, &ir->block_list) {
905 foreach_instr_safe (instr, &block->instr_list) {
906 struct ir3_instruction *nop = ir3_NOP(block);
907 nop->repeat = 5;
908 ir3_instr_move_before(nop, instr);
909 }
910 }
911 }
912
/* Per-block state for the backwards helper-invocation dataflow analysis
 * in helper_sched().
 */
struct ir3_helper_block_data {
   /* Whether helper invocations may be used on any path starting at the
    * beginning of the block.
    */
   bool uses_helpers_beginning;

   /* Whether helper invocations may be used by the end of the block. Branch
    * instructions are considered to be "between" blocks, because (eq) has to be
    * inserted after them in the successor blocks, so branch instructions using
    * helpers will result in uses_helpers_end = true for their block.
    */
   bool uses_helpers_end;
};
926
/* Insert (eq) after the last instruction using the results of helper
 * invocations. Use a backwards dataflow analysis to determine at which points
 * in the program helper invocations are definitely never used, and then insert
 * (eq) at the point where we cross from a point where they may be used to a
 * point where they are never used.
 */
static void
helper_sched(struct ir3_legalize_ctx *ctx, struct ir3 *ir,
             struct ir3_shader_variant *so)
{
   bool non_prefetch_helpers = false;

   /* Pass 1: seed per-block state. A block locally "uses helpers at its
    * beginning" if any of its instructions consumes helper-invocation
    * results. Track separately whether any non-prefetch instruction needs
    * helpers, since prefetch-only use can be handled by a register setting
    * instead of (eq).
    */
   foreach_block (block, &ir->block_list) {
      struct ir3_helper_block_data *bd =
         rzalloc(ctx, struct ir3_helper_block_data);
      foreach_instr (instr, &block->instr_list) {
         if (uses_helpers(instr)) {
            bd->uses_helpers_beginning = true;
            if (instr->opc != OPC_META_TEX_PREFETCH) {
               non_prefetch_helpers = true;
               break;
            }
         }

         if (instr->opc == OPC_SHPE) {
            /* (eq) is not allowed in preambles, mark the whole preamble as
             * requiring helpers to avoid putting it there.
             */
            bd->uses_helpers_beginning = true;
            bd->uses_helpers_end = true;
         }
      }

      /* Subgroup-wide branches need every invocation (helpers included)
       * alive through the branch, so treat these blocks as using helpers
       * from start to end.
       */
      if (block->brtype == IR3_BRANCH_ALL ||
          block->brtype == IR3_BRANCH_ANY ||
          block->brtype == IR3_BRANCH_GETONE) {
         bd->uses_helpers_beginning = true;
         bd->uses_helpers_end = true;
      }

      block->data = bd;
   }

   /* If only prefetches use helpers then we can disable them in the shader via
    * a register setting.
    */
   if (!non_prefetch_helpers) {
      so->prefetch_end_of_quad = true;
      return;
   }

   /* Pass 2: backwards dataflow to a fixpoint. If a block may use helpers at
    * its beginning, every predecessor must keep helpers alive to its end
    * (and therefore from its beginning too). Only a change to
    * uses_helpers_beginning drives another iteration: uses_helpers_end is a
    * derived fact that is always (re)established in the same visit, so it
    * never needs to trigger progress on its own.
    */
   bool progress;
   do {
      progress = false;
      foreach_block_rev (block, &ir->block_list) {
         struct ir3_helper_block_data *bd = block->data;

         if (!bd->uses_helpers_beginning)
            continue;

         for (unsigned i = 0; i < block->predecessors_count; i++) {
            struct ir3_block *pred = block->predecessors[i];
            struct ir3_helper_block_data *pred_bd = pred->data;
            if (!pred_bd->uses_helpers_end) {
               pred_bd->uses_helpers_end = true;
            }
            if (!pred_bd->uses_helpers_beginning) {
               pred_bd->uses_helpers_beginning = true;
               progress = true;
            }
         }
      }
   } while (progress);

   /* Now, we need to determine the points where helper invocations become
    * unused.
    */
   foreach_block (block, &ir->block_list) {
      struct ir3_helper_block_data *bd = block->data;
      if (bd->uses_helpers_end)
         continue;

      /* We need to check the predecessors because of situations with critical
       * edges like this that can occur after optimizing jumps:
       *
       *    br p0.x, #endif
       *    ...
       *    sam ...
       *    ...
       *    endif:
       *    ...
       *    end
       *
       * The endif block will have uses_helpers_beginning = false and
       * uses_helpers_end = false, but because we jump to there from the
       * beginning of the if where uses_helpers_end = true, we still want to
       * add an (eq) at the beginning of the block:
       *
       *    br p0.x, #endif
       *    ...
       *    sam ...
       *    (eq)nop
       *    ...
       *    endif:
       *    (eq)nop
       *    ...
       *    end
       *
       * This adds an extra nop in the case where the branch isn't taken, but
       * that's probably preferable to adding an extra jump instruction which
       * is what would happen if we ran this pass before optimizing jumps:
       *
       *    br p0.x, #else
       *    ...
       *    sam ...
       *    (eq)nop
       *    ...
       *    jump #endif
       *    else:
       *    (eq)nop
       *    endif:
       *    ...
       *    end
       *
       * We also need this to make sure we insert (eq) after branches which use
       * helper invocations.
       */
      bool pred_uses_helpers = bd->uses_helpers_beginning;
      for (unsigned i = 0; i < block->predecessors_count; i++) {
         struct ir3_block *pred = block->predecessors[i];
         struct ir3_helper_block_data *pred_bd = pred->data;
         if (pred_bd->uses_helpers_end) {
            pred_uses_helpers = true;
            break;
         }
      }

      /* Neither this block nor any path into it uses helpers, so the
       * transition happened strictly earlier — nothing to place here.
       */
      if (!pred_uses_helpers)
         continue;

      /* The last use of helpers is somewhere between the beginning and the
       * end. first_instr will be the first instruction where helpers are no
       * longer required, or NULL if helpers are not required just at the end.
       */
      struct ir3_instruction *first_instr = NULL;
      foreach_instr_rev (instr, &block->instr_list) {
         /* Skip prefetches because they actually execute before the block
          * starts and at this stage they aren't guaranteed to be at the start
          * of the block.
          */
         if (uses_helpers(instr) && instr->opc != OPC_META_TEX_PREFETCH)
            break;
         first_instr = instr;
      }

      bool killed = false;
      bool expensive_instruction_in_block = false;
      if (first_instr) {
         foreach_instr_from (instr, first_instr, &block->instr_list) {
            /* If there's already a nop, we don't have to worry about whether to
             * insert one.
             */
            if (instr->opc == OPC_NOP) {
               instr->flags |= IR3_INSTR_EQ;
               killed = true;
               break;
            }

            /* ALU and SFU instructions probably aren't going to benefit much
             * from killing helper invocations, because they complete at least
             * an entire quad in a cycle and don't access any quad-divergent
             * memory, so delay emitting (eq) in the hopes that we find a nop
             * afterwards.
             */
            if (is_alu(instr) || is_sfu(instr))
               continue;

            expensive_instruction_in_block = true;
            break;
         }
      }

      /* If this block isn't the last block before the end instruction, assume
       * that there may be expensive instructions in later blocks so it's worth
       * it to insert a nop.
       */
      if (!killed && (expensive_instruction_in_block ||
                      block->successors[0] != ir3_end_block(ir))) {
         struct ir3_instruction *nop = ir3_NOP(block);
         nop->flags |= IR3_INSTR_EQ;
         if (first_instr)
            ir3_instr_move_before(nop, first_instr);
      }
   }
}
1122
1123 bool
ir3_legalize(struct ir3 * ir,struct ir3_shader_variant * so,int * max_bary)1124 ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
1125 {
1126 struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx);
1127 bool mergedregs = so->mergedregs;
1128 bool progress;
1129
1130 ctx->so = so;
1131 ctx->max_bary = -1;
1132 ctx->compiler = ir->compiler;
1133 ctx->type = ir->type;
1134
1135 /* allocate per-block data: */
1136 foreach_block (block, &ir->block_list) {
1137 struct ir3_legalize_block_data *bd =
1138 rzalloc(ctx, struct ir3_legalize_block_data);
1139
1140 regmask_init(&bd->state.needs_ss_war, mergedregs);
1141 regmask_init(&bd->state.needs_ss, mergedregs);
1142 regmask_init(&bd->state.needs_sy, mergedregs);
1143
1144 block->data = bd;
1145 }
1146
1147 /* We may have failed to pull all input loads into the first block.
1148 * In such case at the moment we aren't able to find a better place
1149 * to for (ei) than the end of the program.
1150 * a5xx and a6xx do automatically release varying storage at the end.
1151 */
1152 ctx->early_input_release = true;
1153 struct ir3_block *start_block = ir3_after_preamble(ir);
1154 foreach_block (block, &ir->block_list) {
1155 foreach_instr (instr, &block->instr_list) {
1156 if (is_input(instr)) {
1157 ctx->has_inputs = true;
1158 if (block != start_block) {
1159 ctx->early_input_release = false;
1160 break;
1161 }
1162 }
1163 }
1164 }
1165
1166 assert(ctx->early_input_release || ctx->compiler->gen >= 5);
1167
1168 /* process each block: */
1169 do {
1170 progress = false;
1171 foreach_block (block, &ir->block_list) {
1172 progress |= legalize_block(ctx, block);
1173 }
1174 } while (progress);
1175
1176 *max_bary = ctx->max_bary;
1177
1178 block_sched(ir);
1179 if (so->type == MESA_SHADER_FRAGMENT)
1180 kill_sched(ir, so);
1181
1182 foreach_block (block, &ir->block_list) {
1183 progress |= apply_fine_deriv_macro(ctx, block);
1184 }
1185
1186 foreach_block (block, &ir->block_list) {
1187 if (block->brtype == IR3_BRANCH_GETONE) {
1188 apply_push_consts_load_macro(ctx, block->successors[0]);
1189 break;
1190 }
1191 }
1192
1193 nop_sched(ir, so);
1194
1195 if (ir3_shader_debug & IR3_DBG_FULLSYNC) {
1196 dbg_sync_sched(ir, so);
1197 }
1198
1199 if (ir3_shader_debug & IR3_DBG_FULLNOP) {
1200 dbg_nop_sched(ir, so);
1201 }
1202
1203 while (opt_jump(ir))
1204 ;
1205
1206 /* TODO: does (eq) exist before a6xx? */
1207 if (so->type == MESA_SHADER_FRAGMENT && so->need_pixlod &&
1208 so->compiler->gen >= 6)
1209 helper_sched(ctx, ir, so);
1210
1211 ir3_count_instructions(ir);
1212 resolve_jumps(ir);
1213
1214 mark_xvergence_points(ir);
1215
1216 ralloc_free(ctx);
1217
1218 return true;
1219 }
1220