/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/ralloc.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_shader.h"

/*
 * Legalize:
 *
 * The legalize pass handles ensuring sufficient nop's and sync flags for
 * correct execution.
 *
 * 1) Iteratively determine where sync ((sy)/(ss)) flags are needed,
 *    based on state flowing out of predecessor blocks until there is
 *    no further change.  In some cases this requires inserting nops.
 * 2) Mark (ei) on last varying input
 * 3) Final nop scheduling for instruction latency
 * 4) Resolve jumps and schedule blocks, marking potential convergence
 *    points with (jp)
 */
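/* For example (a rough illustration, not exact disassembly): a texture fetch
 * result has to be consumed behind a (sy), and an SFU result behind an (ss):
 *
 *    sam (f32)(xyzw)r0.x, r0.z, s#0, t#0
 *    (sy)add.f r1.x, r0.x, c0.x
 *
 *    rcp r0.x, r0.y
 *    (ss)mul.f r1.x, r0.x, c0.x
 */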

struct ir3_legalize_ctx {
   struct ir3_compiler *compiler;
   struct ir3_shader_variant *so;
   gl_shader_stage type;
   int max_bary;
   bool early_input_release;
   bool has_inputs;
};

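/* State tracked per block while legalizing:
 *
 *  - needs_ss:  registers written by producers whose consumers must carry an
 *    (ss) (SFU results, shared regs, local memory loads, ...)
 *  - needs_ss_war:  registers read by tex/SFU/mem instructions which must not
 *    be overwritten without an (ss) (write-after-read hazard)
 *  - needs_sy:  registers written by producers whose consumers must carry a
 *    (sy) (texture fetches, global memory loads, ...)
 *  - needs_ss_for_const:  an in-flight write to the const file (from the
 *    push-consts macro) that const-reading instructions must (ss) on
 */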
struct ir3_legalize_state {
   regmask_t needs_ss;
   regmask_t needs_ss_war; /* write after read */
   regmask_t needs_sy;
   bool needs_ss_for_const;
};

struct ir3_legalize_block_data {
   bool valid;
   struct ir3_legalize_state state;
};

static inline void
apply_ss(struct ir3_instruction *instr,
         struct ir3_legalize_state *state,
         bool mergedregs)
{
   instr->flags |= IR3_INSTR_SS;
   regmask_init(&state->needs_ss_war, mergedregs);
   regmask_init(&state->needs_ss, mergedregs);
   state->needs_ss_for_const = false;
}

static inline void
apply_sy(struct ir3_instruction *instr,
         struct ir3_legalize_state *state,
         bool mergedregs)
{
   instr->flags |= IR3_INSTR_SY;
   regmask_init(&state->needs_sy, mergedregs);
}

/* We want to evaluate each block from the position of any other
 * predecessor block, in order that the flags set are the union of
 * all possible program paths.
 *
 * To do this, we need to know the output state (needs_ss/ss_war/sy)
 * of all predecessor blocks.  The tricky thing is loops, which mean
 * that we can't simply recursively process each predecessor block
 * before legalizing the current block.
 *
 * How we handle that is by looping over all the blocks until the
 * results converge.  If the output state of a given block changes
 * in a given pass, this means that all successor blocks are not
 * yet fully legalized.
 */
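/* The bd->valid flag below is the per-block "converged" marker: it is set once
 * a block has been legalized against its current input state, and cleared for
 * all successors whenever a block's output state changes, so the do/while loop
 * in ir3_legalize() keeps iterating until nothing changes.
 */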

static bool
legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
{
   struct ir3_legalize_block_data *bd = block->data;

   if (bd->valid)
      return false;

   struct ir3_instruction *last_n = NULL;
   struct list_head instr_list;
   struct ir3_legalize_state prev_state = bd->state;
   struct ir3_legalize_state *state = &bd->state;
   bool last_input_needs_ss = false;
   bool has_tex_prefetch = false;
   bool mergedregs = ctx->so->mergedregs;

   /* our input state is the OR of all predecessor blocks' state: */
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      struct ir3_block *predecessor = block->predecessors[i];
      struct ir3_legalize_block_data *pbd = predecessor->data;
      struct ir3_legalize_state *pstate = &pbd->state;

      /* Our input (ss)/(sy) state is based on OR'ing the output
       * state of all our predecessor blocks
       */
      regmask_or(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
      regmask_or(&state->needs_ss_war, &state->needs_ss_war,
                 &pstate->needs_ss_war);
      regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy);
      state->needs_ss_for_const |= pstate->needs_ss_for_const;
   }

   /* We need to take physical-only edges into account when tracking shared
    * registers.
    */
   for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
      struct ir3_block *predecessor = block->physical_predecessors[i];
      struct ir3_legalize_block_data *pbd = predecessor->data;
      struct ir3_legalize_state *pstate = &pbd->state;

      regmask_or_shared(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
   }

   unsigned input_count = 0;

   foreach_instr (n, &block->instr_list) {
      if (is_input(n)) {
         input_count++;
      }
   }

   unsigned inputs_remaining = input_count;

   /* Either inputs are in the first block or we expect inputs to be released
    * with the end of the program.
    */
   assert(input_count == 0 || !ctx->early_input_release ||
          block == ir3_after_preamble(block->shader));

   /* remove all the instructions from the list, we'll be adding
    * them back in as we go
    */
   list_replace(&block->instr_list, &instr_list);
   list_inithead(&block->instr_list);

   foreach_instr_safe (n, &instr_list) {
      unsigned i;

      n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);

      /* _meta::tex_prefetch instructions removed later in
       * collect_tex_prefetches()
       */
      if (is_meta(n) && (n->opc != OPC_META_TEX_PREFETCH))
         continue;

      if (is_input(n)) {
         struct ir3_register *inloc = n->srcs[0];
         assert(inloc->flags & IR3_REG_IMMED);
         ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val);
      }

      if ((last_n && is_barrier(last_n)) || n->opc == OPC_SHPE) {
         apply_ss(n, state, mergedregs);
         apply_sy(n, state, mergedregs);
         last_input_needs_ss = false;
      }

      if (last_n && (last_n->opc == OPC_PREDT)) {
         apply_ss(n, state, mergedregs);
      }

      /* NOTE: consider dst register too.. it could happen that
       * texture sample instruction (for example) writes some
       * components which are unused.  A subsequent instruction
       * that writes the same register can race w/ the sam instr
       * resulting in undefined results:
       */
      for (i = 0; i < n->dsts_count + n->srcs_count; i++) {
         struct ir3_register *reg;
         if (i < n->dsts_count)
            reg = n->dsts[i];
         else
            reg = n->srcs[i - n->dsts_count];

         if (reg_gpr(reg)) {

            /* TODO: we probably only need (ss) for alu
             * instr consuming sfu result.. need to make
             * some tests for both this and (sy)..
             */
            if (regmask_get(&state->needs_ss, reg)) {
               apply_ss(n, state, mergedregs);
               last_input_needs_ss = false;
            }

            if (regmask_get(&state->needs_sy, reg)) {
               apply_sy(n, state, mergedregs);
            }
         } else if ((reg->flags & IR3_REG_CONST)) {
            if (state->needs_ss_for_const) {
               apply_ss(n, state, mergedregs);
               last_input_needs_ss = false;
            }
         }
      }

      foreach_dst (reg, n) {
         if (regmask_get(&state->needs_ss_war, reg)) {
            apply_ss(n, state, mergedregs);
            last_input_needs_ss = false;
         }
      }

      /* cat5+ does not have an (ss) bit, if needed we need to
       * insert a nop to carry the sync flag.  Would be kinda
       * clever if we were aware of this during scheduling, but
       * this should be a pretty rare case:
       */
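      /* (i.e. an "(ss)sam ..." becomes an "(ss)nop" followed by a plain
       * "sam ..."):
       */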
      if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) {
         struct ir3_instruction *nop;
         nop = ir3_NOP(block);
         nop->flags |= IR3_INSTR_SS;
         n->flags &= ~IR3_INSTR_SS;
      }

      /* need to be able to set (ss) on first instruction: */
      if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5) && !is_meta(n))
         ir3_NOP(block);

      if (ctx->compiler->samgq_workaround &&
          ctx->type != MESA_SHADER_FRAGMENT &&
          ctx->type != MESA_SHADER_COMPUTE && n->opc == OPC_SAMGQ) {
         struct ir3_instruction *samgp;

         list_delinit(&n->node);

         for (i = 0; i < 4; i++) {
            samgp = ir3_instr_clone(n);
            samgp->opc = OPC_SAMGP0 + i;
            if (i > 1)
               samgp->flags |= IR3_INSTR_SY;
         }
      } else {
         list_delinit(&n->node);
         list_addtail(&n->node, &block->instr_list);
      }

      if (is_sfu(n))
         regmask_set(&state->needs_ss, n->dsts[0]);

      foreach_dst (dst, n) {
         if (dst->flags & IR3_REG_SHARED)
            regmask_set(&state->needs_ss, dst);
      }

      if (is_tex_or_prefetch(n)) {
         regmask_set(&state->needs_sy, n->dsts[0]);
         if (n->opc == OPC_META_TEX_PREFETCH)
            has_tex_prefetch = true;
      } else if (n->opc == OPC_RESINFO) {
         regmask_set(&state->needs_ss, n->dsts[0]);
         ir3_NOP(block)->flags |= IR3_INSTR_SS;
         last_input_needs_ss = false;
      } else if (is_load(n)) {
         if (is_local_mem_load(n))
            regmask_set(&state->needs_ss, n->dsts[0]);
         else
            regmask_set(&state->needs_sy, n->dsts[0]);
      } else if (is_atomic(n->opc)) {
         if (is_bindless_atomic(n->opc)) {
            regmask_set(&state->needs_sy, n->srcs[2]);
         } else if (is_global_a3xx_atomic(n->opc) ||
                    is_global_a6xx_atomic(n->opc)) {
            regmask_set(&state->needs_sy, n->dsts[0]);
         } else {
            regmask_set(&state->needs_ss, n->dsts[0]);
         }
      } else if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
         state->needs_ss_for_const = true;
      }

      if (is_ssbo(n->opc) || is_global_a3xx_atomic(n->opc) ||
          is_bindless_atomic(n->opc))
         ctx->so->has_ssbo = true;

      /* both tex/sfu appear to not always immediately consume
       * their src register(s):
       */
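      /* (i.e. a later write to one of these source registers is a
       * write-after-read hazard; the needs_ss_war check on dsts above forces
       * an (ss) on the overwriting instruction)
       */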
      if (is_tex(n) || is_sfu(n) || is_mem(n)) {
         foreach_src (reg, n) {
            regmask_set(&state->needs_ss_war, reg);
         }
      }

      if (ctx->early_input_release && is_input(n)) {
         last_input_needs_ss |= (n->opc == OPC_LDLV);

         assert(inputs_remaining > 0);
         inputs_remaining--;
         if (inputs_remaining == 0) {
            /* This is the last input. We add the (ei) flag to release
             * varying memory after this executes. If it's an ldlv,
             * however, we need to insert a dummy bary.f on which we can
             * set the (ei) flag. We may also need to insert an (ss) to
             * guarantee that all ldlv's have finished fetching their
             * results before releasing the varying memory.
             */
            struct ir3_instruction *last_input = n;
            if (n->opc == OPC_LDLV) {
               struct ir3_instruction *baryf;

               /* (ss)bary.f (ei)r63.x, 0, r0.x */
               baryf = ir3_instr_create(block, OPC_BARY_F, 1, 2);
               ir3_dst_create(baryf, regid(63, 0), 0);
               ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
               ir3_src_create(baryf, regid(0, 0), 0);

               last_input = baryf;
            }

            last_input->dsts[0]->flags |= IR3_REG_EI;
            if (last_input_needs_ss) {
               apply_ss(last_input, state, mergedregs);
            }
         }
      }

      last_n = n;
   }

   assert(inputs_remaining == 0 || !ctx->early_input_release);

   if (has_tex_prefetch && !ctx->has_inputs) {
      /* texture prefetch, but *no* inputs.. we need to insert a
       * dummy bary.f at the top of the shader to unblock varying
       * storage:
       */
      struct ir3_instruction *baryf;

      /* (ss)bary.f (ei)r63.x, 0, r0.x */
      baryf = ir3_instr_create(block, OPC_BARY_F, 1, 2);
      ir3_dst_create(baryf, regid(63, 0), 0)->flags |= IR3_REG_EI;
      ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
      ir3_src_create(baryf, regid(0, 0), 0);

      /* insert the dummy bary.f at head: */
      list_delinit(&baryf->node);
      list_add(&baryf->node, &block->instr_list);
   }

   bd->valid = true;

   if (memcmp(&prev_state, state, sizeof(*state))) {
      /* our output state changed, this invalidates all of our
       * successors:
       */
      for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
         if (!block->successors[i])
            break;
         struct ir3_legalize_block_data *pbd = block->successors[i]->data;
         pbd->valid = false;
      }
   }

   return true;
}

/* Expands dsxpp and dsypp macros to:
 *
 * dsxpp.1 dst, src
 * dsxpp.1.p dst, src
 *
 * We apply this after flags syncing, as we don't want to sync in between the
 * two (which might happen if dst == src).  We do it before nop scheduling
 * because that needs to count actual instructions.
 */
static bool
apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
{
   struct list_head instr_list;

   /* remove all the instructions from the list, we'll be adding
    * them back in as we go
    */
   list_replace(&block->instr_list, &instr_list);
   list_inithead(&block->instr_list);

   foreach_instr_safe (n, &instr_list) {
      list_addtail(&n->node, &block->instr_list);

      if (n->opc == OPC_DSXPP_MACRO || n->opc == OPC_DSYPP_MACRO) {
         n->opc = (n->opc == OPC_DSXPP_MACRO) ? OPC_DSXPP_1 : OPC_DSYPP_1;

         struct ir3_instruction *op_p = ir3_instr_clone(n);
         op_p->flags = IR3_INSTR_P;

         ctx->so->need_full_quad = true;
      }
   }

   return true;
}

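/* Expand the push-consts macro at the start of the block into the actual stsc
 * (store to const file) instruction, copying push_consts.src_size dwords from
 * src_base to dst_base, plus the extra (ss) nop and duplicated stsc required
 * on parts with the stsc duplication quirk.
 */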
static void
apply_push_consts_load_macro(struct ir3_legalize_ctx *ctx,
                             struct ir3_block *block)
{
   foreach_instr (n, &block->instr_list) {
      if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
         struct ir3_instruction *stsc = ir3_instr_create(block, OPC_STSC, 0, 2);
         ir3_instr_move_after(stsc, n);
         ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val =
            n->push_consts.dst_base;
         ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val =
            n->push_consts.src_base;
         stsc->cat6.iim_val = n->push_consts.src_size;
         stsc->cat6.type = TYPE_U32;

         if (ctx->compiler->stsc_duplication_quirk) {
            struct ir3_instruction *nop = ir3_NOP(block);
            ir3_instr_move_after(nop, stsc);
            nop->flags |= IR3_INSTR_SS;
            ir3_instr_move_after(ir3_instr_clone(stsc), nop);
         }

         list_delinit(&n->node);
         break;
      } else if (!is_meta(n)) {
         break;
      }
   }
}

/* NOTE: branch instructions are always the last instruction(s)
 * in the block.  We take advantage of this as we resolve the
 * branches, since "if (foo) break;" constructs turn into
 * something like:
 *
 *   block3 {
 *      ...
 *      0029:021: mov.s32s32 r62.x, r1.y
 *      0082:022: br !p0.x, target=block5
 *      0083:023: br p0.x, target=block4
 *      // succs: if _[0029:021: mov.s32s32] block4; else block5;
 *   }
 *   block4 {
 *      0084:024: jump, target=block6
 *      // succs: block6;
 *   }
 *   block5 {
 *      0085:025: jump, target=block7
 *      // succs: block7;
 *   }
 *
 * ie. only instruction in block4/block5 is a jump, so when
 * resolving branches we can easily detect this by checking
 * that the first instruction in the target block is itself
 * a jump, and setup the br directly to the jump's target
 * (and strip back out the now unreached jump)
 *
 * TODO sometimes we end up with things like:
 *
 *    br !p0.x, #2
 *    br p0.x, #12
 *    add.u r0.y, r0.y, 1
 *
 * If we swapped the order of the branches, we could drop one.
 */
static struct ir3_block *
resolve_dest_block(struct ir3_block *block)
{
   /* special case for last block: */
   if (!block->successors[0])
      return block;

   /* NOTE that we may or may not have inserted the jump
    * in the target block yet, so conditions to resolve
    * the dest to the dest block's successor are:
    *
    *   (1) successor[1] == NULL &&
    *   (2) (block-is-empty || only-instr-is-jump)
    */
   if (block->successors[1] == NULL) {
      if (list_is_empty(&block->instr_list)) {
         return block->successors[0];
      } else if (list_length(&block->instr_list) == 1) {
         struct ir3_instruction *instr =
            list_first_entry(&block->instr_list, struct ir3_instruction, node);
         if (instr->opc == OPC_JUMP) {
            /* If this jump is backwards, then we will probably convert
             * the jump being resolved to a backwards jump, which will
             * change a loop-with-continue or loop-with-if into a
             * doubly-nested loop and change the convergence behavior.
             * Disallow this here.
             */
            if (block->successors[0]->index <= block->index)
               return block;
            return block->successors[0];
         }
      }
   }
   return block;
}

static void
remove_unused_block(struct ir3_block *old_target)
{
   list_delinit(&old_target->node);

   /* cleanup dangling predecessors: */
   for (unsigned i = 0; i < ARRAY_SIZE(old_target->successors); i++) {
      if (old_target->successors[i]) {
         struct ir3_block *succ = old_target->successors[i];
         ir3_block_remove_predecessor(succ, old_target);
      }
   }
}

static bool
retarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target)
{
   struct ir3_block *old_target = instr->cat0.target;
   struct ir3_block *cur_block = instr->block;

   /* update current block's successors to reflect the retargeting: */
   if (cur_block->successors[0] == old_target) {
      cur_block->successors[0] = new_target;
   } else {
      assert(cur_block->successors[1] == old_target);
      cur_block->successors[1] = new_target;
   }

   /* update new target's predecessors: */
   ir3_block_add_predecessor(new_target, cur_block);

   /* and remove old_target's predecessor: */
   ir3_block_remove_predecessor(old_target, cur_block);

   /* If we reconverged at the old target, we'll reconverge at the new target
    * too:
    */
   new_target->reconvergence_point |= old_target->reconvergence_point;

   instr->cat0.target = new_target;

   if (old_target->predecessors_count == 0) {
      remove_unused_block(old_target);
      return true;
   }

   return false;
}

static bool
opt_jump(struct ir3 *ir)
{
   bool progress = false;

   unsigned index = 0;
   foreach_block (block, &ir->block_list)
      block->index = index++;

   foreach_block (block, &ir->block_list) {
      /* This pass destroys the physical CFG so don't keep it around to avoid
       * validation errors.
       */
      block->physical_successors_count = 0;
      block->physical_predecessors_count = 0;

      foreach_instr (instr, &block->instr_list) {
         if (!is_flow(instr) || !instr->cat0.target)
            continue;

         struct ir3_block *tblock = resolve_dest_block(instr->cat0.target);
         if (tblock != instr->cat0.target) {
            progress = true;

            /* Exit early if we deleted a block to avoid iterator
             * weirdness/assert fails
             */
            if (retarget_jump(instr, tblock))
               return true;
         }
      }

      /* Detect the case where the block ends either with:
       * - A single unconditional jump to the next block.
       * - Two jump instructions with opposite conditions, where one of
       *   them jumps to the next block.
       * We can remove the one that jumps to the next block in either case.
       */
      if (list_is_empty(&block->instr_list))
         continue;

      struct ir3_instruction *jumps[2] = {NULL, NULL};
      jumps[0] =
         list_last_entry(&block->instr_list, struct ir3_instruction, node);
      if (!list_is_singular(&block->instr_list))
         jumps[1] =
            list_last_entry(&jumps[0]->node, struct ir3_instruction, node);

      if (jumps[0]->opc == OPC_JUMP)
         jumps[1] = NULL;
      else if (jumps[0]->opc != OPC_B || !jumps[1] || jumps[1]->opc != OPC_B)
         continue;

      for (unsigned i = 0; i < 2; i++) {
         if (!jumps[i])
            continue;

         struct ir3_block *tblock = jumps[i]->cat0.target;
         if (&tblock->node == block->node.next) {
            list_delinit(&jumps[i]->node);
            progress = true;
            break;
         }
      }
   }

   return progress;
}

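/* Convert branch targets from block pointers into instruction-relative
 * offsets.  This relies on instruction ip's already being assigned (see the
 * ir3_count_instructions() call in ir3_legalize()); e.g. a branch at ip 100
 * whose target block starts at ip 104 ends up with immed = 4.
 */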
static void
resolve_jumps(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list)
      foreach_instr (instr, &block->instr_list)
         if (is_flow(instr) && instr->cat0.target) {
            struct ir3_instruction *target = list_first_entry(
               &instr->cat0.target->instr_list, struct ir3_instruction, node);

            instr->cat0.immed = (int)target->ip - (int)instr->ip;
         }
}

static void
mark_jp(struct ir3_block *block)
{
   /* We only call this on the end block (in kill_sched) or after retargeting
    * all jumps to empty blocks (in mark_xvergence_points) so there's no need to
    * worry about empty blocks.
    */
   assert(!list_is_empty(&block->instr_list));

   struct ir3_instruction *target =
      list_first_entry(&block->instr_list, struct ir3_instruction, node);
   target->flags |= IR3_INSTR_JP;
}

/* Mark points where control flow reconverges.
 *
 * Re-convergence points are where "parked" threads are reconverged with threads
 * that took the opposite path last time around. We already calculated them, we
 * just need to mark them with (jp).
 */
static void
mark_xvergence_points(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      if (block->reconvergence_point)
         mark_jp(block);
   }
}

/* Insert the branch/jump instructions for flow control between blocks.
 * Initially this is done naively, without considering if the successor
 * block immediately follows the current block (ie. so no jump required),
 * but that is cleaned up in opt_jump().
 *
 * TODO what ensures that the last write to p0.x in a block is the
 * branch condition?  Have we been getting lucky all this time?
 */
static void
block_sched(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      if (block->successors[1]) {
         /* if/else, conditional branches to "then" or "else": */
         struct ir3_instruction *br1, *br2;

         if (block->brtype == IR3_BRANCH_GETONE ||
             block->brtype == IR3_BRANCH_GETLAST ||
             block->brtype == IR3_BRANCH_SHPS) {
            /* getone/shps can't be inverted, and it wouldn't even make sense
             * to follow it with an inverted branch, so follow it by an
             * unconditional branch.
             */
            assert(!block->condition);
            if (block->brtype == IR3_BRANCH_GETONE)
               br1 = ir3_GETONE(block);
            else if (block->brtype == IR3_BRANCH_GETLAST)
               br1 = ir3_GETLAST(block);
            else
               br1 = ir3_SHPS(block);
            br1->cat0.target = block->successors[1];

            br2 = ir3_JUMP(block);
            br2->cat0.target = block->successors[0];
         } else {
            assert(block->condition);

            /* create "else" branch first (since "then" block should
             * frequently/always end up being a fall-thru):
             */
            br1 = ir3_instr_create(block, OPC_B, 0, 1);
            ir3_src_create(br1, regid(REG_P0, 0), 0)->def =
               block->condition->dsts[0];
            br1->cat0.inv1 = true;
            br1->cat0.target = block->successors[1];

            /* "then" branch: */
            br2 = ir3_instr_create(block, OPC_B, 0, 1);
            ir3_src_create(br2, regid(REG_P0, 0), 0)->def =
               block->condition->dsts[0];
            br2->cat0.target = block->successors[0];

            switch (block->brtype) {
            case IR3_BRANCH_COND:
               br1->cat0.brtype = br2->cat0.brtype = BRANCH_PLAIN;
               break;
            case IR3_BRANCH_ALL:
               br1->cat0.brtype = BRANCH_ANY;
               br2->cat0.brtype = BRANCH_ALL;
               break;
            case IR3_BRANCH_ANY:
               br1->cat0.brtype = BRANCH_ALL;
               br2->cat0.brtype = BRANCH_ANY;
               break;
            case IR3_BRANCH_GETONE:
            case IR3_BRANCH_GETLAST:
            case IR3_BRANCH_SHPS:
               unreachable("can't get here");
            }
         }
      } else if (block->successors[0]) {
         /* otherwise unconditional jump to next block: */
         struct ir3_instruction *jmp;

         jmp = ir3_JUMP(block);
         jmp->cat0.target = block->successors[0];
      }
   }
}

/* Here we work around the fact that kill doesn't actually kill the thread as
 * GL expects. The last instruction always needs to be an end instruction,
 * which means that if we're stuck in a loop where kill is the only way out,
 * then we may have to jump out to the end. kill may also have the d3d
 * semantics of converting the thread to a helper thread, rather than setting
 * the exec mask to 0, in which case the helper thread could get stuck in an
 * infinite loop.
 *
 * We do this late, both to give the scheduler the opportunity to reschedule
 * kill instructions earlier and to avoid having to create a separate basic
 * block.
 *
 * TODO: Assuming that the wavefront doesn't stop as soon as all threads are
 * killed, we might benefit by doing this more aggressively when the remaining
 * part of the program after the kill is large, since that would let us
 * skip over the instructions when there are no non-killed threads left.
 */
static void
kill_sched(struct ir3 *ir, struct ir3_shader_variant *so)
{
   /* True if we know that this block will always eventually lead to the end
    * block:
    */
   bool always_ends = true;
   bool added = false;
   struct ir3_block *last_block =
      list_last_entry(&ir->block_list, struct ir3_block, node);

   foreach_block_rev (block, &ir->block_list) {
      for (unsigned i = 0; i < 2 && block->successors[i]; i++) {
         if (block->successors[i]->start_ip <= block->end_ip)
            always_ends = false;
      }

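      /* Once any block has a backwards successor edge (i.e. we are inside a
       * loop), neither it nor any block before it in program order is
       * guaranteed to reach the end block, so kills in those blocks get an
       * explicit branch out:
       */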
      if (always_ends)
         continue;

      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->opc != OPC_KILL)
            continue;

         struct ir3_instruction *br = ir3_instr_create(block, OPC_B, 0, 1);
         ir3_src_create(br, instr->srcs[0]->num, instr->srcs[0]->flags)->wrmask =
            1;
         br->cat0.target =
            list_last_entry(&ir->block_list, struct ir3_block, node);

         list_del(&br->node);
         list_add(&br->node, &instr->node);

         added = true;
      }
   }

   if (added) {
      /* I'm not entirely sure how the branchstack works, but we probably
       * need to add at least one entry for the divergence which is resolved
       * at the end:
       */
      so->branchstack++;

      /* We don't update predecessors/successors, so we have to do this
       * manually:
       */
      mark_jp(last_block);
   }
}

/* Insert nop's required to make this a legal/valid shader program: */
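/* Where possible the required delay is folded into a preceding instruction
 * instead of emitting standalone nops: up to 3 cycles as (nopN) on a
 * non-repeated cat2/cat3 instruction (gen6+), up to 5 more as (rptN) on a
 * preceding nop, and only the remainder becomes a fresh nop.
 */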
static void
nop_sched(struct ir3 *ir, struct ir3_shader_variant *so)
{
   foreach_block (block, &ir->block_list) {
      struct ir3_instruction *last = NULL;
      struct list_head instr_list;

      /* remove all the instructions from the list, we'll be adding
       * them back in as we go
       */
      list_replace(&block->instr_list, &instr_list);
      list_inithead(&block->instr_list);

      foreach_instr_safe (instr, &instr_list) {
         unsigned delay = ir3_delay_calc(block, instr, so->mergedregs);

         /* NOTE: I think the nopN encoding works for a5xx and
          * probably a4xx, but not a3xx.  So far only tested on
          * a6xx.
          */

         if ((delay > 0) && (ir->compiler->gen >= 6) && last &&
             ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3)) &&
             (last->repeat == 0)) {
            /* the previous cat2/cat3 instruction can encode at most 3 nop's: */
            unsigned transfer = MIN2(delay, 3 - last->nop);
            last->nop += transfer;
            delay -= transfer;
         }

         if ((delay > 0) && last && (last->opc == OPC_NOP)) {
            /* the previous nop can encode at most 5 repeats: */
            unsigned transfer = MIN2(delay, 5 - last->repeat);
            last->repeat += transfer;
            delay -= transfer;
         }

         if (delay > 0) {
            assert(delay <= 6);
            ir3_NOP(block)->repeat = delay - 1;
         }

         list_addtail(&instr->node, &block->instr_list);
         last = instr;
      }
   }
}

static void
dbg_sync_sched(struct ir3 *ir, struct ir3_shader_variant *so)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         if (opc_cat(instr->opc) == 4 || opc_cat(instr->opc) == 5 ||
             opc_cat(instr->opc) == 6) {
            struct ir3_instruction *nop = ir3_NOP(block);
            nop->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
            ir3_instr_move_after(nop, instr);
         }
      }
   }
}

static void
dbg_nop_sched(struct ir3 *ir, struct ir3_shader_variant *so)
{
   foreach_block (block, &ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         struct ir3_instruction *nop = ir3_NOP(block);
         nop->repeat = 5;
         ir3_instr_move_before(nop, instr);
      }
   }
}

struct ir3_helper_block_data {
   /* Whether helper invocations may be used on any path starting at the
    * beginning of the block.
    */
   bool uses_helpers_beginning;

   /* Whether helper invocations may be used by the end of the block. Branch
    * instructions are considered to be "between" blocks, because (eq) has to be
    * inserted after them in the successor blocks, so branch instructions using
    * helpers will result in uses_helpers_end = true for their block.
    */
   bool uses_helpers_end;
};

/* Insert (eq) after the last instruction using the results of helper
 * invocations. Use a backwards dataflow analysis to determine at which points
 * in the program helper invocations are definitely never used, and then insert
 * (eq) at the point where we cross from a point where they may be used to a
 * point where they are never used.
 */
static void
helper_sched(struct ir3_legalize_ctx *ctx, struct ir3 *ir,
             struct ir3_shader_variant *so)
{
   bool non_prefetch_helpers = false;

   foreach_block (block, &ir->block_list) {
      struct ir3_helper_block_data *bd =
         rzalloc(ctx, struct ir3_helper_block_data);
      foreach_instr (instr, &block->instr_list) {
         if (uses_helpers(instr)) {
            bd->uses_helpers_beginning = true;
            if (instr->opc != OPC_META_TEX_PREFETCH) {
               non_prefetch_helpers = true;
               break;
            }
         }

         if (instr->opc == OPC_SHPE) {
            /* (eq) is not allowed in preambles, mark the whole preamble as
             * requiring helpers to avoid putting it there.
             */
            bd->uses_helpers_beginning = true;
            bd->uses_helpers_end = true;
         }
      }

      if (block->brtype == IR3_BRANCH_ALL ||
          block->brtype == IR3_BRANCH_ANY ||
          block->brtype == IR3_BRANCH_GETONE) {
         bd->uses_helpers_beginning = true;
         bd->uses_helpers_end = true;
      }

      block->data = bd;
   }

   /* If only prefetches use helpers then we can disable them in the shader via
    * a register setting.
    */
   if (!non_prefetch_helpers) {
      so->prefetch_end_of_quad = true;
      return;
   }

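   /* Backwards dataflow: if a block may use helpers somewhere after its
    * beginning, helpers may still be live at the end of each of its
    * predecessors (and, transitively, from their beginnings), iterated to a
    * fixed point.
    */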
   bool progress;
   do {
      progress = false;
      foreach_block_rev (block, &ir->block_list) {
         struct ir3_helper_block_data *bd = block->data;

         if (!bd->uses_helpers_beginning)
            continue;

         for (unsigned i = 0; i < block->predecessors_count; i++) {
            struct ir3_block *pred = block->predecessors[i];
            struct ir3_helper_block_data *pred_bd = pred->data;
            if (!pred_bd->uses_helpers_end) {
               pred_bd->uses_helpers_end = true;
            }
            if (!pred_bd->uses_helpers_beginning) {
               pred_bd->uses_helpers_beginning = true;
               progress = true;
            }
         }
      }
   } while (progress);

   /* Now, we need to determine the points where helper invocations become
    * unused.
    */
   foreach_block (block, &ir->block_list) {
      struct ir3_helper_block_data *bd = block->data;
      if (bd->uses_helpers_end)
         continue;

      /* We need to check the predecessors because of situations with critical
       * edges like this that can occur after optimizing jumps:
       *
       *    br p0.x, #endif
       *    ...
       *    sam ...
       *    ...
       *    endif:
       *    ...
       *    end
       *
       * The endif block will have uses_helpers_beginning = false and
       * uses_helpers_end = false, but because we jump to there from the
       * beginning of the if where uses_helpers_end = true, we still want to
       * add an (eq) at the beginning of the block:
       *
       *    br p0.x, #endif
       *    ...
       *    sam ...
       *    (eq)nop
       *    ...
       *    endif:
       *    (eq)nop
       *    ...
       *    end
       *
       * This adds an extra nop in the case where the branch isn't taken, but that's
       * probably preferable to adding an extra jump instruction which is what
       * would happen if we ran this pass before optimizing jumps:
       *
       *    br p0.x, #else
       *    ...
       *    sam ...
       *    (eq)nop
       *    ...
       *    jump #endif
       *    else:
       *    (eq)nop
       *    endif:
       *    ...
       *    end
       *
       * We also need this to make sure we insert (eq) after branches which use
       * helper invocations.
       */
      bool pred_uses_helpers = bd->uses_helpers_beginning;
      for (unsigned i = 0; i < block->predecessors_count; i++) {
         struct ir3_block *pred = block->predecessors[i];
         struct ir3_helper_block_data *pred_bd = pred->data;
         if (pred_bd->uses_helpers_end) {
            pred_uses_helpers = true;
            break;
         }
      }

      if (!pred_uses_helpers)
         continue;

      /* The last use of helpers is somewhere between the beginning and the
       * end. first_instr will be the first instruction where helpers are no
       * longer required, or NULL if helpers are still required at the very
       * end of the block.
       */
      struct ir3_instruction *first_instr = NULL;
      foreach_instr_rev (instr, &block->instr_list) {
         /* Skip prefetches because they actually execute before the block
          * starts and at this stage they aren't guaranteed to be at the start
          * of the block.
          */
         if (uses_helpers(instr) && instr->opc != OPC_META_TEX_PREFETCH)
            break;
         first_instr = instr;
      }

      bool killed = false;
      bool expensive_instruction_in_block = false;
      if (first_instr) {
         foreach_instr_from (instr, first_instr, &block->instr_list) {
            /* If there's already a nop, we don't have to worry about whether to
             * insert one.
             */
            if (instr->opc == OPC_NOP) {
               instr->flags |= IR3_INSTR_EQ;
               killed = true;
               break;
            }

            /* ALU and SFU instructions probably aren't going to benefit much
             * from killing helper invocations, because they complete at least
             * an entire quad in a cycle and don't access any quad-divergent
             * memory, so delay emitting (eq) in the hopes that we find a nop
             * afterwards.
             */
            if (is_alu(instr) || is_sfu(instr))
               continue;

            expensive_instruction_in_block = true;
            break;
         }
      }

      /* If this block isn't the last block before the end instruction, assume
       * that there may be expensive instructions in later blocks so it's worth
       * it to insert a nop.
       */
      if (!killed && (expensive_instruction_in_block ||
                      block->successors[0] != ir3_end_block(ir))) {
         struct ir3_instruction *nop = ir3_NOP(block);
         nop->flags |= IR3_INSTR_EQ;
         if (first_instr)
            ir3_instr_move_before(nop, first_instr);
      }
   }
}

bool
ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
{
   struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx);
   bool mergedregs = so->mergedregs;
   bool progress;

   ctx->so = so;
   ctx->max_bary = -1;
   ctx->compiler = ir->compiler;
   ctx->type = ir->type;

   /* allocate per-block data: */
   foreach_block (block, &ir->block_list) {
      struct ir3_legalize_block_data *bd =
         rzalloc(ctx, struct ir3_legalize_block_data);

      regmask_init(&bd->state.needs_ss_war, mergedregs);
      regmask_init(&bd->state.needs_ss, mergedregs);
      regmask_init(&bd->state.needs_sy, mergedregs);

      block->data = bd;
   }

   /* We may have failed to pull all input loads into the first block.
    * In such a case, at the moment we aren't able to find a better place
    * for (ei) than the end of the program.
    * a5xx and a6xx do automatically release varying storage at the end.
    */
   ctx->early_input_release = true;
   struct ir3_block *start_block = ir3_after_preamble(ir);
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (is_input(instr)) {
            ctx->has_inputs = true;
            if (block != start_block) {
               ctx->early_input_release = false;
               break;
            }
         }
      }
   }

   assert(ctx->early_input_release || ctx->compiler->gen >= 5);

   /* process each block: */
   do {
      progress = false;
      foreach_block (block, &ir->block_list) {
         progress |= legalize_block(ctx, block);
      }
   } while (progress);

   *max_bary = ctx->max_bary;

   block_sched(ir);
   if (so->type == MESA_SHADER_FRAGMENT)
      kill_sched(ir, so);

   foreach_block (block, &ir->block_list) {
      progress |= apply_fine_deriv_macro(ctx, block);
   }

   foreach_block (block, &ir->block_list) {
      if (block->brtype == IR3_BRANCH_GETONE) {
         apply_push_consts_load_macro(ctx, block->successors[0]);
         break;
      }
   }

   nop_sched(ir, so);

   if (ir3_shader_debug & IR3_DBG_FULLSYNC) {
      dbg_sync_sched(ir, so);
   }

   if (ir3_shader_debug & IR3_DBG_FULLNOP) {
      dbg_nop_sched(ir, so);
   }

   while (opt_jump(ir))
      ;

   /* TODO: does (eq) exist before a6xx? */
   if (so->type == MESA_SHADER_FRAGMENT && so->need_pixlod &&
       so->compiler->gen >= 6)
      helper_sched(ctx, ir, so);

   ir3_count_instructions(ir);
   resolve_jumps(ir);

   mark_xvergence_points(ir);

   ralloc_free(ctx);

   return true;
}