1 /*
2  * Copyright © 2014 Rob Clark <robclark@freedesktop.org>
3  * SPDX-License-Identifier: MIT
4  *
5  * Authors:
6  *    Rob Clark <robclark@freedesktop.org>
7  */
8 
9 #include "util/ralloc.h"
10 #include "util/u_math.h"
11 
12 #include "ir3.h"
13 #include "ir3_shader.h"
14 
15 /*
16  * Legalize:
17  *
18  * The legalize pass handles ensuring sufficient nop's and sync flags for
19  * correct execution.
20  *
21  * 1) Iteratively determine where sync ((sy)/(ss)) flags are needed,
22  *    based on state flowing out of predecessor blocks until there is
23  *    no further change.  In some cases this requires inserting nops.
24  * 2) Mark (ei) on last varying input
25  * 3) Final nop scheduling for instruction latency
26  * 4) Resolve jumps and schedule blocks, marking potential convergence
27  *    points with (jp)
28  */
29 
30 struct ir3_legalize_ctx {
31    struct ir3_compiler *compiler;
32    struct ir3_shader_variant *so;
33    gl_shader_stage type;
34    int max_bary;
35    bool early_input_release;
36    bool has_inputs;
37    bool has_tex_prefetch;
38 };
39 
40 struct ir3_nop_state {
41    unsigned full_ready[GPR_REG_SIZE];
42    unsigned half_ready[GPR_REG_SIZE];
43 };
44 
45 struct ir3_legalize_state {
46    regmask_t needs_ss;
47    regmask_t needs_ss_scalar_full; /* half scalar ALU producer -> full scalar ALU consumer */
48    regmask_t needs_ss_scalar_half; /* full scalar ALU producer -> half scalar ALU consumer */
49    regmask_t needs_ss_war; /* write after read */
50    regmask_t needs_ss_or_sy_war;  /* WAR for sy-producer sources */
51    regmask_t needs_ss_scalar_war; /* reg read late by scalar ALU -> later vector ALU write (WAR) */
52    regmask_t needs_ss_or_sy_scalar_war;
53    regmask_t needs_sy;
54    bool needs_ss_for_const;
55    bool needs_sy_for_const;
56 
57    /* Each of these arrays contains the cycle when the corresponding register
58     * becomes "ready" i.e. does not require any more nops. There is a special
59     * mechanism to let ALU instructions read compatible (i.e. same halfness)
60     * destinations of another ALU instruction with less delay, so this can
61     * depend on what type the consuming instruction is, which is why there are
62     * multiple arrays. The cycle is counted relative to the start of the block.
63     */
64 
65    /* When ALU instructions reading the given full/half register will be ready.
66     */
67    struct ir3_nop_state alu_nop;
68 
69    /* When non-ALU (e.g. cat5) instructions reading the given full/half register
70     * will be ready.
71     */
72    struct ir3_nop_state non_alu_nop;
73 
74    /* When p0.x-w, a0.x, and a1.x are ready. */
75    unsigned pred_ready[4];
76    unsigned addr_ready[2];
77 };
78 
79 struct ir3_legalize_block_data {
80    bool valid;
81    struct ir3_legalize_state begin_state;
82    struct ir3_legalize_state state;
83 };
84 
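/* Check whether writing "dst" would be a write-after-read hazard against a
 * still-outstanding read, i.e. one that must be resolved with (ss). Scalar ALU
 * writers can ignore the scalar WAR masks since scalar ALU instructions
 * execute in-order with respect to each other.
 */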
85 static inline bool
86 needs_ss_war(struct ir3_legalize_state *state, struct ir3_register *dst,
87              bool is_scalar_alu)
88 {
89    if (regmask_get(&state->needs_ss_war, dst))
90       return true;
91    if (regmask_get(&state->needs_ss_or_sy_war, dst))
92       return true;
93 
94    if (!is_scalar_alu) {
95       if (regmask_get(&state->needs_ss_scalar_war, dst))
96          return true;
97       if (regmask_get(&state->needs_ss_or_sy_scalar_war, dst))
98          return true;
99    }
100 
101    return false;
102 }
103 
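/* Set (ss) on "instr" and reset all of the state that an (ss) resolves. */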
104 static inline void
105 apply_ss(struct ir3_instruction *instr,
106          struct ir3_legalize_state *state,
107          bool mergedregs)
108 {
109    instr->flags |= IR3_INSTR_SS;
110    regmask_init(&state->needs_ss_war, mergedregs);
111    regmask_init(&state->needs_ss_or_sy_war, mergedregs);
112    regmask_init(&state->needs_ss, mergedregs);
113    regmask_init(&state->needs_ss_scalar_war, mergedregs);
114    regmask_init(&state->needs_ss_or_sy_scalar_war, mergedregs);
115    regmask_init(&state->needs_ss_scalar_full, mergedregs);
116    regmask_init(&state->needs_ss_scalar_half, mergedregs);
117    state->needs_ss_for_const = false;
118 }
119 
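/* Set (sy) on "instr" and reset all of the state that a (sy) resolves. */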
120 static inline void
121 apply_sy(struct ir3_instruction *instr,
122          struct ir3_legalize_state *state,
123          bool mergedregs)
124 {
125    instr->flags |= IR3_INSTR_SY;
126    regmask_init(&state->needs_sy, mergedregs);
127    regmask_init(&state->needs_ss_or_sy_war, mergedregs);
128    regmask_init(&state->needs_ss_or_sy_scalar_war, mergedregs);
129    state->needs_sy_for_const = false;
130 }
131 
132 static bool
133 count_instruction(struct ir3_instruction *n, struct ir3_compiler *compiler)
134 {
135    /* NOTE: don't count branch/jump since we don't know yet if they will
136     * be eliminated later in resolve_jumps().. really should do that
137     * earlier so we don't have this constraint.
138     */
139    return (is_alu(n) && !is_scalar_alu(n, compiler)) ||
140       (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) &&
141            (n->opc != OPC_BRAA) && (n->opc != OPC_BRAO));
142 }
143 
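/* Return a pointer to the tracked ready-cycle for element "num" of "reg",
 * selected by whether the consumer is an ALU instruction and whether producer
 * and consumer use the same register size. Returns NULL for a mismatched-size
 * query on a full register that cannot alias any half register.
 */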
144 static unsigned *
145 get_ready_slot(struct ir3_legalize_state *state,
146                struct ir3_register *reg, unsigned num,
147                bool consumer_alu, bool matching_size)
148 {
149    if (reg->flags & IR3_REG_PREDICATE) {
150       assert(num == reg->num);
151       assert(reg_num(reg) == REG_P0);
152       return &state->pred_ready[reg_comp(reg)];
153    }
154    if (reg->num == regid(REG_A0, 0))
155       return &state->addr_ready[0];
156    if (reg->num == regid(REG_A0, 1))
157       return &state->addr_ready[1];
158    struct ir3_nop_state *nop =
159       consumer_alu ? &state->alu_nop : &state->non_alu_nop;
160    assert(!(reg->flags & IR3_REG_SHARED));
161    if (reg->flags & IR3_REG_HALF) {
162       if (matching_size)
163          return &nop->half_ready[num];
164       else
165          return &nop->full_ready[num / 2];
166    } else {
167       if (matching_size)
168          return &nop->full_ready[num];
169       /* If "num" is large enough, then it can't alias a half-reg because only
170        * the first half of the full reg space aliases half regs. Return NULL in
171        * this case.
172        */
173       else if (num * 2 < ARRAY_SIZE(nop->half_ready))
174          return &nop->half_ready[num * 2];
175       else
176          return NULL;
177    }
178 }
179 
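/* Calculate how many cycles of delay (i.e. nops) are needed before "instr",
 * issued at "cycle", so that all of its sources are ready.
 */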
180 static unsigned
181 delay_calc(struct ir3_legalize_ctx *ctx,
182            struct ir3_legalize_state *state,
183            struct ir3_instruction *instr,
184            unsigned cycle)
185 {
186    /* As far as we know, shader outputs don't need any delay. */
187    if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
188       return 0;
189 
190    unsigned delay = 0;
191    foreach_src_n (src, n, instr) {
192       if (src->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED))
193          continue;
194 
195       unsigned elems = post_ra_reg_elems(src);
196       unsigned num = post_ra_reg_num(src);
197       unsigned src_cycle = cycle + ir3_src_read_delay(ctx->compiler, instr, n);
198 
199       for (unsigned elem = 0; elem < elems; elem++, num++) {
200          unsigned ready_cycle =
201             *get_ready_slot(state, src, num, is_alu(instr), true);
202          delay = MAX2(delay, MAX2(ready_cycle, src_cycle) - src_cycle);
203 
204          /* Increment cycle for ALU instructions with (rptN) where sources are
205           * read each subsequent cycle.
206           */
207          if (instr->repeat && !(src->flags & IR3_REG_RELATIV))
208             src_cycle++;
209       }
210    }
211 
212    return delay;
213 }
214 
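/* Record the cycle at which each register written by "instr" becomes ready,
 * for use by delay_calc() on later instructions.
 */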
215 static void
216 delay_update(struct ir3_legalize_ctx *ctx,
217              struct ir3_legalize_state *state,
218              struct ir3_instruction *instr,
219              unsigned cycle,
220              bool mergedregs)
221 {
222    if (writes_addr1(instr) && instr->block->in_early_preamble)
223       return;
224 
225    foreach_dst_n (dst, n, instr) {
226       if (dst->flags & IR3_REG_RT)
227          continue;
228 
229       unsigned elems = post_ra_reg_elems(dst);
230       unsigned num = post_ra_reg_num(dst);
231       unsigned dst_cycle = cycle;
232 
233       /* sct and swz have scalar destinations and each destination is written in
234        * a subsequent cycle.
235        */
236       if (instr->opc == OPC_SCT || instr->opc == OPC_SWZ)
237          dst_cycle += n;
238 
239       /* For relative accesses with (rptN), we have no way of knowing which
240        * component is accessed when, so we have to assume the worst and mark
241        * every array member as being written at the end.
242        */
243       if (dst->flags & IR3_REG_RELATIV)
244          dst_cycle += instr->repeat;
245 
246       if (dst->flags & IR3_REG_SHARED)
247          continue;
248 
249       for (unsigned elem = 0; elem < elems; elem++, num++) {
250          for (unsigned consumer_alu = 0; consumer_alu < 2; consumer_alu++) {
251             for (unsigned matching_size = 0; matching_size < 2; matching_size++) {
252                unsigned *ready_slot =
253                   get_ready_slot(state, dst, num, consumer_alu, matching_size);
254 
255                if (!ready_slot)
256                   continue;
257 
258                bool reset_ready_slot = false;
259                unsigned delay = 0;
260                if (!is_alu(instr)) {
261                   /* Apparently writes that require (ss) or (sy) are
262                    * synchronized against previous writes, so consumers don't
263                    * have to wait for any previous overlapping ALU instructions
264                    * to complete.
265                    */
266                   reset_ready_slot = true;
267                } else if ((dst->flags & IR3_REG_PREDICATE) ||
268                           reg_num(dst) == REG_A0) {
269                   delay = ctx->compiler->delay_slots.non_alu;
270                   if (!matching_size)
271                      continue;
272                } else {
273                   delay = (consumer_alu && matching_size)
274                              ? ctx->compiler->delay_slots.alu_to_alu
275                              : ctx->compiler->delay_slots.non_alu;
276                }
277 
278                if (!matching_size) {
279                   for (unsigned i = 0; i < reg_elem_size(dst); i++) {
280                      ready_slot[i] =
281                         reset_ready_slot ? 0 :
282                         MAX2(ready_slot[i], dst_cycle + delay);
283                   }
284                } else {
285                   *ready_slot =
286                      reset_ready_slot ? 0 :
287                      MAX2(*ready_slot, dst_cycle + delay);
288                }
289             }
290          }
291 
292          /* Increment cycle for ALU instructions with (rptN) where destinations
293           * are written each subsequent cycle.
294           */
295          if (instr->repeat && !(dst->flags & IR3_REG_RELATIV))
296             dst_cycle++;
297       }
298    }
299 }
300 
301 /* We want to evaluate each block from the position of any other
302  * predecessor block, in order that the flags set are the union of
303  * all possible program paths.
304  *
305  * To do this, we need to know the output state (needs_ss/ss_war/sy)
306  * of all predecessor blocks.  The tricky thing is loops, which mean
307  * that we can't simply recursively process each predecessor block
308  * before legalizing the current block.
309  *
310  * How we handle that is by looping over all the blocks until the
311  * results converge.  If the output state of a given block changes
312  * in a given pass, this means that all successor blocks are not
313  * yet fully legalized.
314  */
315 
316 static bool
317 legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
318 {
319    struct ir3_legalize_block_data *bd = block->data;
320 
321    if (bd->valid)
322       return false;
323 
324    struct ir3_instruction *last_n = NULL;
325    struct list_head instr_list;
326    struct ir3_legalize_state prev_state = bd->state;
327    struct ir3_legalize_state *state = &bd->begin_state;
328    bool last_input_needs_ss = false;
329    bool mergedregs = ctx->so->mergedregs;
330    struct ir3_builder build = ir3_builder_at(ir3_after_block(block));
331 
332    /* Our input state is the OR of all predecessor blocks' state.
333     *
334     * Why don't we just zero the state at the beginning before merging in the
335     * predecessors? Because otherwise updates may not be a "lattice refinement",
336     * i.e. needs_ss may go from true to false for some register due to a (ss) we
337     * inserted the second time around (and the same for (sy)). This means that
338     * there's no solid guarantee the algorithm will converge, and in theory
339     * there may be infinite loops where we fight over the placement of an (ss).
340     */
341    for (unsigned i = 0; i < block->predecessors_count; i++) {
342       struct ir3_block *predecessor = block->predecessors[i];
343       struct ir3_legalize_block_data *pbd = predecessor->data;
344       struct ir3_legalize_state *pstate = &pbd->state;
345 
346       /* Our input (ss)/(sy) state is based on OR'ing the output
347        * state of all our predecessor blocks
348        */
349       regmask_or(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
350       regmask_or(&state->needs_ss_war, &state->needs_ss_war,
351                  &pstate->needs_ss_war);
352       regmask_or(&state->needs_ss_or_sy_war, &state->needs_ss_or_sy_war,
353                  &pstate->needs_ss_or_sy_war);
354       regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy);
355       state->needs_ss_for_const |= pstate->needs_ss_for_const;
356       state->needs_sy_for_const |= pstate->needs_sy_for_const;
357 
358       /* Our nop state is the max of the predecessor blocks */
359       for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++)
360          state->pred_ready[i] = MAX2(state->pred_ready[i],
361                                      pstate->pred_ready[i]);
362       for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) {
363          state->alu_nop.full_ready[i] = MAX2(state->alu_nop.full_ready[i],
364                                              pstate->alu_nop.full_ready[i]);
365          state->alu_nop.half_ready[i] = MAX2(state->alu_nop.half_ready[i],
366                                              pstate->alu_nop.half_ready[i]);
367          state->non_alu_nop.full_ready[i] = MAX2(state->non_alu_nop.full_ready[i],
368                                                  pstate->non_alu_nop.full_ready[i]);
369          state->non_alu_nop.half_ready[i] = MAX2(state->non_alu_nop.half_ready[i],
370                                                  pstate->non_alu_nop.half_ready[i]);
371       }
372    }
373 
374    /* We need to take physical-only edges into account when tracking shared
375     * registers.
376     */
377    for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
378       struct ir3_block *predecessor = block->physical_predecessors[i];
379       struct ir3_legalize_block_data *pbd = predecessor->data;
380       struct ir3_legalize_state *pstate = &pbd->state;
381 
382       regmask_or_shared(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
383       regmask_or_shared(&state->needs_ss_scalar_full,
384                         &state->needs_ss_scalar_full,
385                         &pstate->needs_ss_scalar_full);
386       regmask_or_shared(&state->needs_ss_scalar_half,
387                         &state->needs_ss_scalar_half,
388                         &pstate->needs_ss_scalar_half);
389       regmask_or_shared(&state->needs_ss_scalar_war, &state->needs_ss_scalar_war,
390                         &pstate->needs_ss_scalar_war);
391       regmask_or_shared(&state->needs_ss_or_sy_scalar_war,
392                         &state->needs_ss_or_sy_scalar_war,
393                         &pstate->needs_ss_or_sy_scalar_war);
394    }
395 
396    memcpy(&bd->state, state, sizeof(*state));
397    state = &bd->state;
398 
399    unsigned input_count = 0;
400 
401    foreach_instr (n, &block->instr_list) {
402       if (is_input(n)) {
403          input_count++;
404       }
405    }
406 
407    unsigned inputs_remaining = input_count;
408 
409    /* Either inputs are in the first block or we expect inputs to be released
410     * with the end of the program.
411     */
412    assert(input_count == 0 || !ctx->early_input_release ||
413           block == ir3_after_preamble(block->shader));
414 
415    /* remove all the instructions from the list, we'll be adding
416     * them back in as we go
417     */
418    list_replace(&block->instr_list, &instr_list);
419    list_inithead(&block->instr_list);
420 
421    unsigned cycle = 0;
422 
423    foreach_instr_safe (n, &instr_list) {
424       unsigned i;
425 
426       n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);
427 
428       /* _meta::tex_prefetch instructions removed later in
429        * collect_tex_prefetches()
430        */
431       if (is_meta(n) && (n->opc != OPC_META_TEX_PREFETCH))
432          continue;
433 
434       if (is_input(n)) {
435          struct ir3_register *inloc = n->srcs[0];
436          assert(inloc->flags & IR3_REG_IMMED);
437 
438          int last_inloc =
439             inloc->iim_val + ((inloc->flags & IR3_REG_R) ? n->repeat : 0);
440          ctx->max_bary = MAX2(ctx->max_bary, last_inloc);
441       }
442 
443       if ((last_n && is_barrier(last_n)) || n->opc == OPC_SHPE) {
444          apply_ss(n, state, mergedregs);
445          apply_sy(n, state, mergedregs);
446          last_input_needs_ss = false;
447       }
448 
449       if (last_n && (last_n->opc == OPC_PREDT)) {
450          apply_ss(n, state, mergedregs);
451       }
452 
453       bool n_is_scalar_alu = is_scalar_alu(n, ctx->compiler);
454 
455       /* NOTE: consider dst register too.. it could happen that
456        * texture sample instruction (for example) writes some
457        * components which are unused.  A subsequent instruction
458        * that writes the same register can race w/ the sam instr
459        * resulting in undefined results:
460        */
461       for (i = 0; i < n->dsts_count + n->srcs_count; i++) {
462          struct ir3_register *reg;
463          if (i < n->dsts_count)
464             reg = n->dsts[i];
465          else
466             reg = n->srcs[i - n->dsts_count];
467 
468          if (is_reg_gpr(reg)) {
469 
470             /* TODO: we probably only need (ss) for alu
471              * instr consuming sfu result.. need to make
472              * some tests for both this and (sy)..
473              */
474             if (regmask_get(&state->needs_ss, reg)) {
475                apply_ss(n, state, mergedregs);
476                last_input_needs_ss = false;
477             }
478 
479             /* There is a fast feedback path for scalar ALU instructions which
480              * only takes 1 cycle of latency, similar to the normal 3 cycle
481              * latency path for ALU instructions. For this fast path the
482              * producer and consumer must use the same register size (i.e. no
483              * writing a full register and then reading half of it or vice
484              * versa). If we don't hit this path, either because of a mismatched
485              * size or a read via the regular ALU, then the write latency is
486              * variable and we must use (ss) to wait for the scalar ALU. This is
487              * different from the fixed 6 cycle latency for mismatched vector
488              * ALU accesses.
489              */
490             if (n_is_scalar_alu) {
491                /* Check if we have a mismatched size RaW dependency */
492                if (regmask_get((reg->flags & IR3_REG_HALF) ?
493                                &state->needs_ss_scalar_half :
494                                &state->needs_ss_scalar_full, reg)) {
495                   apply_ss(n, state, mergedregs);
496                   last_input_needs_ss = false;
497                }
498             } else {
499                /* check if we have a scalar -> vector RaW dependency */
500                if (regmask_get(&state->needs_ss_scalar_half, reg) ||
501                    regmask_get(&state->needs_ss_scalar_full, reg)) {
502                   apply_ss(n, state, mergedregs);
503                   last_input_needs_ss = false;
504                }
505             }
506 
507             if (regmask_get(&state->needs_sy, reg)) {
508                apply_sy(n, state, mergedregs);
509             }
510          } else if ((reg->flags & IR3_REG_CONST)) {
511             if (state->needs_ss_for_const) {
512                apply_ss(n, state, mergedregs);
513                last_input_needs_ss = false;
514             }
515             if (state->needs_sy_for_const) {
516                apply_sy(n, state, mergedregs);
517             }
518          } else if (reg_is_addr1(reg) && block->in_early_preamble) {
519             if (regmask_get(&state->needs_ss, reg)) {
520                apply_ss(n, state, mergedregs);
521                last_input_needs_ss = false;
522             }
523          }
524       }
525 
526       foreach_dst (reg, n) {
527          if (reg->flags & IR3_REG_RT)
528             continue;
529          if (needs_ss_war(state, reg, n_is_scalar_alu)) {
530             apply_ss(n, state, mergedregs);
531             last_input_needs_ss = false;
532          }
533       }
534 
535       /* I'm not exactly sure what this is for, but it seems we need this on every
536        * mova1 in early preambles.
537        */
538       if (writes_addr1(n) && block->in_early_preamble)
539          n->srcs[0]->flags |= IR3_REG_R;
540 
541       /* cat5+ does not have an (ss) bit, if needed we need to
542        * insert a nop to carry the sync flag.  Would be kinda
543        * clever if we were aware of this during scheduling, but
544        * this should be a pretty rare case:
545        */
546       if ((n->flags & IR3_INSTR_SS) && !supports_ss(n)) {
547          struct ir3_instruction *nop;
548          nop = ir3_NOP(&build);
549          nop->flags |= IR3_INSTR_SS;
550          n->flags &= ~IR3_INSTR_SS;
551          last_n = nop;
552          cycle++;
553       }
554 
555       unsigned delay = delay_calc(ctx, state, n, cycle);
556 
557       /* NOTE: I think the nopN encoding works for a5xx and
558        * probably a4xx, but not a3xx.  So far only tested on
559        * a6xx.
560        */
561 
562       if ((delay > 0) && (ctx->compiler->gen >= 6) && last_n &&
563           !n_is_scalar_alu &&
564           ((opc_cat(last_n->opc) == 2) || (opc_cat(last_n->opc) == 3)) &&
565           (last_n->repeat == 0)) {
566          /* the previous cat2/cat3 instruction can encode at most 3 nop's: */
567          unsigned transfer = MIN2(delay, 3 - last_n->nop);
568          last_n->nop += transfer;
569          delay -= transfer;
570          cycle += transfer;
571       }
572 
573       if ((delay > 0) && last_n && (last_n->opc == OPC_NOP)) {
574          /* the previous nop can encode at most 5 repeats: */
575          unsigned transfer = MIN2(delay, 5 - last_n->repeat);
576          last_n->repeat += transfer;
577          delay -= transfer;
578          cycle += transfer;
579       }
580 
581       if (delay > 0) {
582          assert(delay <= 6);
583          ir3_NOP(&build)->repeat = delay - 1;
584          cycle += delay;
585       }
586 
587       if (ctx->compiler->samgq_workaround &&
588           ctx->type != MESA_SHADER_FRAGMENT &&
589           ctx->type != MESA_SHADER_COMPUTE && n->opc == OPC_SAMGQ) {
590          struct ir3_instruction *samgp;
591 
592          list_delinit(&n->node);
593 
594          for (i = 0; i < 4; i++) {
595             samgp = ir3_instr_clone(n);
596             samgp->opc = OPC_SAMGP0 + i;
597             if (i > 1)
598                samgp->flags |= IR3_INSTR_SY;
599          }
600       } else {
601          list_delinit(&n->node);
602          list_addtail(&n->node, &block->instr_list);
603       }
604 
605       if (is_sfu(n) || n->opc == OPC_SHFL)
606          regmask_set(&state->needs_ss, n->dsts[0]);
607 
608       foreach_dst (dst, n) {
609          if (dst->flags & IR3_REG_SHARED) {
610             if (n_is_scalar_alu) {
611                if (dst->flags & IR3_REG_HALF)
612                   regmask_set(&state->needs_ss_scalar_full, dst);
613                else
614                   regmask_set(&state->needs_ss_scalar_half, dst);
615             } else {
616                regmask_set(&state->needs_ss, dst);
617             }
618          } else if (reg_is_addr1(dst) && block->in_early_preamble) {
619             regmask_set(&state->needs_ss, dst);
620          }
621       }
622 
623       if (is_tex_or_prefetch(n) && n->dsts_count > 0) {
624          regmask_set(&state->needs_sy, n->dsts[0]);
625          if (n->opc == OPC_META_TEX_PREFETCH)
626             ctx->has_tex_prefetch = true;
627       } else if (n->opc == OPC_RESINFO && n->dsts_count > 0) {
628          regmask_set(&state->needs_ss, n->dsts[0]);
629          ir3_NOP(&build)->flags |= IR3_INSTR_SS;
630          last_input_needs_ss = false;
631       } else if (is_load(n)) {
632          if (is_local_mem_load(n))
633             regmask_set(&state->needs_ss, n->dsts[0]);
634          else
635             regmask_set(&state->needs_sy, n->dsts[0]);
636       } else if (is_atomic(n->opc)) {
637          if (is_bindless_atomic(n->opc)) {
638             regmask_set(&state->needs_sy, n->srcs[2]);
639          } else if (is_global_a3xx_atomic(n->opc) ||
640                     is_global_a6xx_atomic(n->opc)) {
641             regmask_set(&state->needs_sy, n->dsts[0]);
642          } else {
643             regmask_set(&state->needs_ss, n->dsts[0]);
644          }
645       } else if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO || n->opc == OPC_STC) {
646          state->needs_ss_for_const = true;
647       } else if (n->opc == OPC_LDC_K) {
648          state->needs_sy_for_const = true;
649       }
650 
651       if (is_ssbo(n->opc) || is_global_a3xx_atomic(n->opc) ||
652           is_bindless_atomic(n->opc))
653          ctx->so->has_ssbo = true;
654 
655       /* both tex/sfu appear to not always immediately consume
656        * their src register(s):
657        */
658       if (is_war_hazard_producer(n)) {
659          /* These WAR hazards can always be resolved with (ss). However, when
660           * the reader is a sy-producer, they can also be resolved using (sy)
661           * because once we have synced the reader's results using (sy), its
662           * sources have definitely been consumed. We track the two cases
663           * separately so that we don't add an unnecessary (ss) if a (sy) sync
664           * already happened.
665           * For example, this prevents adding the unnecessary (ss) in the
666           * following sequence:
667           * sam rd, rs, ...
668           * (sy)... ; sam synced so consumed its sources
669           * (ss)write rs ; (ss) unnecessary since rs has been consumed already
670           */
671          bool needs_ss = is_ss_producer(n) || is_store(n) || n->opc == OPC_STC;
672 
673          if (n_is_scalar_alu) {
674             /* Scalar ALU also does not immediately read its source because it
675              * is not executed right away, but scalar ALU instructions are
676              * executed in-order so subsequent scalar ALU instructions don't
677              * need to wait for previous ones.
678              */
679             regmask_t *mask = needs_ss ? &state->needs_ss_scalar_war
680                                        : &state->needs_ss_or_sy_scalar_war;
681 
682             foreach_src (reg, n) {
683                if ((reg->flags & IR3_REG_SHARED) || is_reg_a0(reg)) {
684                   regmask_set(mask, reg);
685                }
686             }
687          } else {
688             regmask_t *mask =
689                needs_ss ? &state->needs_ss_war : &state->needs_ss_or_sy_war;
690 
691             foreach_src (reg, n) {
692                if (!(reg->flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
693                   regmask_set(mask, reg);
694                }
695             }
696          }
697       }
698 
699       bool count = count_instruction(n, ctx->compiler);
700       if (count)
701          cycle += 1;
702 
703       delay_update(ctx, state, n, cycle, mergedregs);
704 
705       if (count)
706          cycle += n->repeat + n->nop;
707 
708       if (ctx->early_input_release && is_input(n)) {
709          last_input_needs_ss |= (n->opc == OPC_LDLV);
710 
711          assert(inputs_remaining > 0);
712          inputs_remaining--;
713          if (inputs_remaining == 0) {
714             /* This is the last input. We add the (ei) flag to release
715              * varying memory after this executes. If it's an ldlv,
716              * however, we need to insert a dummy bary.f on which we can
717              * set the (ei) flag. We may also need to insert an (ss) to
718              * guarantee that all ldlv's have finished fetching their
719              * results before releasing the varying memory.
720              */
721             struct ir3_instruction *last_input = n;
722             if (n->opc == OPC_LDLV) {
723                struct ir3_instruction *baryf;
724 
725                /* (ss)bary.f (ei)r63.x, 0, r0.x */
726                baryf = ir3_build_instr(&build, OPC_BARY_F, 1, 2);
727                ir3_dst_create(baryf, regid(63, 0), 0);
728                ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
729                ir3_src_create(baryf, regid(0, 0), 0);
730 
731                last_input = baryf;
732             }
733 
734             last_input->dsts[0]->flags |= IR3_REG_EI;
735             if (last_input_needs_ss) {
736                apply_ss(last_input, state, mergedregs);
737             }
738          }
739       }
740 
741       last_n = n;
742    }
743 
744    assert(inputs_remaining == 0 || !ctx->early_input_release);
745 
746    if (block == ir3_after_preamble(ctx->so->ir) &&
747        ctx->has_tex_prefetch && !ctx->has_inputs) {
748       /* texture prefetch, but *no* inputs.. we need to insert a
749        * dummy bary.f at the top of the shader to unblock varying
750        * storage:
751        */
752       struct ir3_instruction *baryf;
753 
754       /* (ss)bary.f (ei)r63.x, 0, r0.x */
755       baryf = ir3_build_instr(&build, OPC_BARY_F, 1, 2);
756       ir3_dst_create(baryf, regid(63, 0), 0)->flags |= IR3_REG_EI;
757       ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
758       ir3_src_create(baryf, regid(0, 0), 0);
759 
760       /* insert the dummy bary.f at head: */
761       list_delinit(&baryf->node);
762       list_add(&baryf->node, &block->instr_list);
763    }
764 
765    /* Currently our nop state contains the cycle offset from the start of this
766     * block when each register becomes ready. But successor blocks need the
767     * cycle offset from their start, which is this block's end. Translate the
768     * cycle offset.
769     */
770    for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++)
771       state->pred_ready[i] = MAX2(state->pred_ready[i], cycle) - cycle;
772    for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) {
773       state->alu_nop.full_ready[i] =
774          MAX2(state->alu_nop.full_ready[i], cycle) - cycle;
775       state->alu_nop.half_ready[i] =
776          MAX2(state->alu_nop.half_ready[i], cycle) - cycle;
777       state->non_alu_nop.full_ready[i] =
778          MAX2(state->non_alu_nop.full_ready[i], cycle) - cycle;
779       state->non_alu_nop.half_ready[i] =
780          MAX2(state->non_alu_nop.half_ready[i], cycle) - cycle;
781    }
782 
783    bd->valid = true;
784 
785    if (memcmp(&prev_state, state, sizeof(*state))) {
786       /* our output state changed, this invalidates all of our
787        * successors:
788        */
789       for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
790          if (!block->successors[i])
791             break;
792          struct ir3_legalize_block_data *pbd = block->successors[i]->data;
793          pbd->valid = false;
794       }
795    }
796 
797    return true;
798 }
799 
800 /* Expands dsxpp and dsypp macros to:
801  *
802  * dsxpp.1 dst, src
803  * dsxpp.1.p dst, src
804  *
805  * We apply this after flags syncing, as we don't want to sync in between the
806  * two (which might happen if dst == src).
807  */
808 static bool
809 apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
810 {
811    struct list_head instr_list;
812 
813    /* remove all the instructions from the list, we'll be adding
814     * them back in as we go
815     */
816    list_replace(&block->instr_list, &instr_list);
817    list_inithead(&block->instr_list);
818 
819    foreach_instr_safe (n, &instr_list) {
820       list_addtail(&n->node, &block->instr_list);
821 
822       if (n->opc == OPC_DSXPP_MACRO || n->opc == OPC_DSYPP_MACRO) {
823          n->opc = (n->opc == OPC_DSXPP_MACRO) ? OPC_DSXPP_1 : OPC_DSYPP_1;
824 
825          struct ir3_instruction *op_p = ir3_instr_clone(n);
826          op_p->flags = IR3_INSTR_P;
827 
828          ctx->so->need_full_quad = true;
829       }
830    }
831 
832    return true;
833 }
834 
835 /* Some instructions can take a dummy destination of r63.x, which we model as it
836  * not having a destination in the IR to avoid having special code to handle
837  * this. Insert the dummy destination after everything else is done.
838  */
839 static bool
840 expand_dummy_dests(struct ir3_block *block)
841 {
842    foreach_instr (n, &block->instr_list) {
843       if ((n->opc == OPC_SAM || n->opc == OPC_LDC || n->opc == OPC_RESINFO) &&
844           n->dsts_count == 0) {
845          struct ir3_register *dst = ir3_dst_create(n, INVALID_REG, 0);
846          /* Copy the blob's writemask */
847          if (n->opc == OPC_SAM)
848             dst->wrmask = 0b1111;
849       }
850    }
851    return true;
852 }
853 
854 static void
855 apply_push_consts_load_macro(struct ir3_legalize_ctx *ctx,
856                              struct ir3_block *block)
857 {
858    foreach_instr (n, &block->instr_list) {
859       if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
860          struct ir3_instruction *stsc =
861             ir3_instr_create_at(ir3_after_instr(n), OPC_STSC, 0, 2);
862          ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val =
863             n->push_consts.dst_base;
864          ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val =
865             n->push_consts.src_base;
866          stsc->cat6.iim_val = n->push_consts.src_size;
867          stsc->cat6.type = TYPE_U32;
868 
869          if (ctx->compiler->stsc_duplication_quirk) {
870             struct ir3_builder build = ir3_builder_at(ir3_after_instr(stsc));
871             struct ir3_instruction *nop = ir3_NOP(&build);
872             nop->flags |= IR3_INSTR_SS;
873             ir3_instr_move_after(ir3_instr_clone(stsc), nop);
874          }
875 
876          list_delinit(&n->node);
877          break;
878       } else if (!is_meta(n)) {
879          break;
880       }
881    }
882 }
883 
884 /* NOTE: branch instructions are always the last instruction(s)
885  * in the block.  We take advantage of this as we resolve the
886  * branches, since "if (foo) break;" constructs turn into
887  * something like:
888  *
889  *   block3 {
890  *   	...
891  *   	0029:021: mov.s32s32 r62.x, r1.y
892  *   	0082:022: br !p0.x, target=block5
893  *   	0083:023: br p0.x, target=block4
894  *   	// succs: if _[0029:021: mov.s32s32] block4; else block5;
895  *   }
896  *   block4 {
897  *   	0084:024: jump, target=block6
898  *   	// succs: block6;
899  *   }
900  *   block5 {
901  *   	0085:025: jump, target=block7
902  *   	// succs: block7;
903  *   }
904  *
905  * ie. only instruction in block4/block5 is a jump, so when
906  * resolving branches we can easily detect this by checking
907  * that the first instruction in the target block is itself
908  * a jump, and setup the br directly to the jump's target
909  * (and strip back out the now unreached jump)
910  *
911  * TODO sometimes we end up with things like:
912  *
913  *    br !p0.x, #2
914  *    br p0.x, #12
915  *    add.u r0.y, r0.y, 1
916  *
917  * If we swapped the order of the branches, we could drop one.
918  */
919 static struct ir3_block *
920 resolve_dest_block(struct ir3_block *block)
921 {
922    /* special case for last block: */
923    if (!block->successors[0])
924       return block;
925 
926    /* NOTE that we may or may not have inserted the jump
927     * in the target block yet, so conditions to resolve
928     * the dest to the dest block's successor are:
929     *
930     *   (1) successor[1] == NULL &&
931     *   (2) (block-is-empty || only-instr-is-jump)
932     */
933    if (block->successors[1] == NULL) {
934       if (list_is_empty(&block->instr_list)) {
935          return block->successors[0];
936       } else if (list_length(&block->instr_list) == 1) {
937          struct ir3_instruction *instr =
938             list_first_entry(&block->instr_list, struct ir3_instruction, node);
939          if (instr->opc == OPC_JUMP) {
940             /* If this jump is backwards, then we will probably convert
941              * the jump being resolved to a backwards jump, which will
942              * change a loop-with-continue or loop-with-if into a
943              * doubly-nested loop and change the convergence behavior.
944              * Disallow this here.
945              */
946             if (block->successors[0]->index <= block->index)
947                return block;
948             return block->successors[0];
949          }
950       }
951    }
952    return block;
953 }
954 
955 static void
956 remove_unused_block(struct ir3_block *old_target)
957 {
958    list_delinit(&old_target->node);
959 
960    /* cleanup dangling predecessors: */
961    for (unsigned i = 0; i < ARRAY_SIZE(old_target->successors); i++) {
962       if (old_target->successors[i]) {
963          struct ir3_block *succ = old_target->successors[i];
964          ir3_block_remove_predecessor(succ, old_target);
965       }
966    }
967 }
968 
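/* Retarget "instr" to "new_target", updating successor/predecessor links.
 * Returns true if the old target became unreachable and was removed, in which
 * case the caller must not continue iterating over blocks.
 */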
969 static bool
970 retarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target)
971 {
972    struct ir3_block *old_target = instr->cat0.target;
973    struct ir3_block *cur_block = instr->block;
974 
975    /* update current block's successors to reflect the retargeting: */
976    if (cur_block->successors[0] == old_target) {
977       cur_block->successors[0] = new_target;
978    } else {
979       assert(cur_block->successors[1] == old_target);
980       cur_block->successors[1] = new_target;
981    }
982 
983    /* update new target's predecessors: */
984    ir3_block_add_predecessor(new_target, cur_block);
985 
986    /* and remove old_target's predecessor: */
987    ir3_block_remove_predecessor(old_target, cur_block);
988 
989    instr->cat0.target = new_target;
990 
991    if (old_target->predecessors_count == 0) {
992       remove_unused_block(old_target);
993       return true;
994    }
995 
996    return false;
997 }
998 
999 static bool
1000 is_invertible_branch(struct ir3_instruction *instr)
1001 {
1002    switch (instr->opc) {
1003    case OPC_BR:
1004    case OPC_BRAA:
1005    case OPC_BRAO:
1006    case OPC_BANY:
1007    case OPC_BALL:
1008       return true;
1009    default:
1010       return false;
1011    }
1012 }
1013 
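/* Retarget branches that point at trivial jump-only blocks directly at their
 * final destination, and remove jumps/branches to the immediately following
 * block. Returns true if any progress was made.
 */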
1014 static bool
1015 opt_jump(struct ir3 *ir)
1016 {
1017    bool progress = false;
1018 
1019    unsigned index = 0;
1020    foreach_block (block, &ir->block_list)
1021       block->index = index++;
1022 
1023    foreach_block (block, &ir->block_list) {
1024       /* This pass destroys the physical CFG so don't keep it around to avoid
1025        * validation errors.
1026        */
1027       block->physical_successors_count = 0;
1028       block->physical_predecessors_count = 0;
1029 
1030       foreach_instr (instr, &block->instr_list) {
1031          if (!is_flow(instr) || !instr->cat0.target)
1032             continue;
1033 
1034          struct ir3_block *tblock = resolve_dest_block(instr->cat0.target);
1035          if (tblock != instr->cat0.target) {
1036             progress = true;
1037 
1038             /* Exit early if we deleted a block to avoid iterator
1039              * weirdness/assert fails
1040              */
1041             if (retarget_jump(instr, tblock))
1042                return true;
1043          }
1044       }
1045 
1046       /* Detect the case where the block ends either with:
1047        * - A single unconditional jump to the next block.
1048        * - Two jump instructions with opposite conditions, and one of the
1049        *   them jumps to the next block.
1050        * We can remove the one that jumps to the next block in either case.
1051        */
1052       if (list_is_empty(&block->instr_list))
1053          continue;
1054 
1055       struct ir3_instruction *jumps[2] = {NULL, NULL};
1056       jumps[0] =
1057          list_last_entry(&block->instr_list, struct ir3_instruction, node);
1058       if (!list_is_singular(&block->instr_list))
1059          jumps[1] =
1060             list_last_entry(&jumps[0]->node, struct ir3_instruction, node);
1061 
1062       if (jumps[0]->opc == OPC_JUMP)
1063          jumps[1] = NULL;
1064       else if (!is_invertible_branch(jumps[0]) || !jumps[1] ||
1065                !is_invertible_branch(jumps[1])) {
1066          continue;
1067       }
1068 
1069       for (unsigned i = 0; i < 2; i++) {
1070          if (!jumps[i])
1071             continue;
1072          struct ir3_block *tblock = jumps[i]->cat0.target;
1073          if (&tblock->node == block->node.next) {
1074             list_delinit(&jumps[i]->node);
1075             progress = true;
1076             break;
1077          }
1078       }
1079    }
1080 
1081    return progress;
1082 }
1083 
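/* Convert each branch/jump target from a block pointer into a relative
 * instruction offset (target ip - branch ip), now that ips are known.
 */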
1084 static void
1085 resolve_jumps(struct ir3 *ir)
1086 {
1087    foreach_block (block, &ir->block_list)
1088       foreach_instr (instr, &block->instr_list)
1089          if (is_flow(instr) && instr->cat0.target) {
1090             struct ir3_instruction *target = list_first_entry(
1091                &instr->cat0.target->instr_list, struct ir3_instruction, node);
1092 
1093             instr->cat0.immed = (int)target->ip - (int)instr->ip;
1094          }
1095 }
1096 
1097 static void
1098 mark_jp(struct ir3_block *block)
1099 {
1100    /* We only call this on the end block (in kill_sched) or after retargeting
1101     * all jumps to empty blocks (in mark_xvergence_points) so there's no need to
1102     * worry about empty blocks.
1103     */
1104    assert(!list_is_empty(&block->instr_list));
1105 
1106    struct ir3_instruction *target =
1107       list_first_entry(&block->instr_list, struct ir3_instruction, node);
1108    target->flags |= IR3_INSTR_JP;
1109 }
1110 
1111 /* Mark points where control flow reconverges.
1112  *
1113  * Re-convergence points are where "parked" threads are reconverged with threads
1114  * that took the opposite path last time around. We already calculated them, we
1115  * just need to mark them with (jp).
1116  */
1117 static void
1118 mark_xvergence_points(struct ir3 *ir)
1119 {
1120    foreach_block (block, &ir->block_list) {
1121       if (block->reconvergence_point)
1122          mark_jp(block);
1123    }
1124 }
1125 
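/* Invert a conditional branch so that it branches to successors[1] instead,
 * swapping the opcode and inverting the predicate bits as needed.
 */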
1126 static void
1127 invert_branch(struct ir3_instruction *branch)
1128 {
1129    switch (branch->opc) {
1130    case OPC_BR:
1131       break;
1132    case OPC_BALL:
1133       branch->opc = OPC_BANY;
1134       break;
1135    case OPC_BANY:
1136       branch->opc = OPC_BALL;
1137       break;
1138    case OPC_BRAA:
1139       branch->opc = OPC_BRAO;
1140       break;
1141    case OPC_BRAO:
1142       branch->opc = OPC_BRAA;
1143       break;
1144    default:
1145       unreachable("can't get here");
1146    }
1147 
1148    branch->cat0.inv1 = !branch->cat0.inv1;
1149    branch->cat0.inv2 = !branch->cat0.inv2;
1150    branch->cat0.target = branch->block->successors[1];
1151 }
1152 
1153 /* Insert the branch/jump instructions for flow control between blocks.
1154  * Initially this is done naively, without considering if the successor
1155  * block immediately follows the current block (ie. so no jump required),
1156  * but that is cleaned up in opt_jump().
1157  */
1158 static void
1159 block_sched(struct ir3 *ir)
1160 {
1161    foreach_block (block, &ir->block_list) {
1162       struct ir3_instruction *terminator = ir3_block_get_terminator(block);
1163 
1164       if (block->successors[1]) {
1165          /* if/else, conditional branches to "then" or "else": */
1166          struct ir3_instruction *br1, *br2;
1167 
1168          assert(terminator);
1169          unsigned opc = terminator->opc;
1170 
1171          if (opc == OPC_GETONE || opc == OPC_SHPS || opc == OPC_GETLAST) {
1172             /* getone/shps can't be inverted, and it wouldn't even make sense
1173              * to follow it with an inverted branch, so follow it by an
1174              * unconditional branch.
1175              */
1176             assert(terminator->srcs_count == 0);
1177             br1 = terminator;
1178             br1->cat0.target = block->successors[1];
1179 
1180             struct ir3_builder build = ir3_builder_at(ir3_after_block(block));
1181             br2 = ir3_JUMP(&build);
1182             br2->cat0.target = block->successors[0];
1183          } else if (opc == OPC_BR || opc == OPC_BRAA || opc == OPC_BRAO ||
1184                     opc == OPC_BALL || opc == OPC_BANY) {
1185             /* create "else" branch first (since "then" block should
1186              * frequently/always end up being a fall-thru):
1187              */
1188             br1 = terminator;
1189             br2 = ir3_instr_clone(br1);
1190             invert_branch(br1);
1191             br2->cat0.target = block->successors[0];
1192          } else {
1193             assert(opc == OPC_PREDT || opc == OPC_PREDF);
1194 
1195             /* Handled by prede_sched. */
1196             terminator->cat0.target = block->successors[0];
1197             continue;
1198          }
1199 
1200          /* Creating br2 caused it to be moved before the terminator br1, move it
1201           * back.
1202           */
1203          ir3_instr_move_after(br2, br1);
1204       } else if (block->successors[0]) {
1205          /* otherwise unconditional jump or predt/predf to next block which
1206           * should already have been inserted.
1207           */
1208          assert(terminator);
1209          assert(terminator->opc == OPC_JUMP || terminator->opc == OPC_PREDT ||
1210                 terminator->opc == OPC_PREDF);
1211          terminator->cat0.target = block->successors[0];
1212       }
1213    }
1214 }
1215 
1216 /* Some gens have a hardware issue that needs to be worked around by 1)
1217  * inserting 4 nops after the second pred[tf] of a pred[tf]/pred[ft] pair and/or
1218  * 2) inserting 6 nops after prede.
1219  *
1220  * This function should be called with the second pred[tf] of such a pair and
1221  * NULL if there is only one pred[tf].
1222  */
1223 static void
1224 add_predication_workaround(struct ir3_compiler *compiler,
1225                            struct ir3_instruction *predtf,
1226                            struct ir3_instruction *prede)
1227 {
1228    if (predtf && compiler->predtf_nop_quirk) {
1229       struct ir3_builder build = ir3_builder_at(ir3_after_block(predtf->block));
1230       struct ir3_instruction *nop = ir3_NOP(&build);
1231       nop->repeat = 4;
1232       ir3_instr_move_after(nop, predtf);
1233    }
1234 
1235    if (compiler->prede_nop_quirk) {
1236       struct ir3_builder build = ir3_builder_at(ir3_after_block(prede->block));
1237       struct ir3_instruction *nop = ir3_NOP(&build);
1238       nop->repeat = 6;
1239       ir3_instr_move_after(nop, prede);
1240    }
1241 }
1242 
1243 static void
1244 prede_sched(struct ir3 *ir)
1245 {
1246    unsigned index = 0;
1247    foreach_block (block, &ir->block_list)
1248       block->index = index++;
1249 
1250    foreach_block (block, &ir->block_list) {
1251       /* Look for the following pattern generated by NIR lowering. The numbers
1252        * at the top of blocks are their index.
1253        *        |--- i ----|
1254        *        |   ...    |
1255        *        | pred[tf] |
1256        *        |----------|
1257        *      succ0 /   \ succ1
1258        * |-- i+1 ---| |-- i+2 ---|
1259        * |    ...   | |   ...    |
1260        * | pred[ft] | |   ...    |
1261        * |----------| |----------|
1262        *     succ0 \   / succ0
1263        *        |--- j ----|
1264        *        |   ...    |
1265        *        |----------|
1266        */
1267       struct ir3_block *succ0 = block->successors[0];
1268       struct ir3_block *succ1 = block->successors[1];
1269 
1270       if (!succ1)
1271          continue;
1272 
1273       struct ir3_instruction *terminator = ir3_block_get_terminator(block);
1274       if (!terminator)
1275          continue;
1276       if (terminator->opc != OPC_PREDT && terminator->opc != OPC_PREDF)
1277          continue;
1278 
1279       assert(!succ0->successors[1] && !succ1->successors[1]);
1280       assert(succ0->successors[0] == succ1->successors[0]);
1281       assert(succ0->predecessors_count == 1 && succ1->predecessors_count == 1);
1282       assert(succ0->index == (block->index + 1));
1283       assert(succ1->index == (block->index + 2));
1284 
1285       struct ir3_instruction *succ0_terminator =
1286          ir3_block_get_terminator(succ0);
1287       assert(succ0_terminator);
1288       assert(succ0_terminator->opc ==
1289              (terminator->opc == OPC_PREDT ? OPC_PREDF : OPC_PREDT));
1290 
1291       ASSERTED struct ir3_instruction *succ1_terminator =
1292          ir3_block_get_terminator(succ1);
1293       assert(!succ1_terminator || (succ1_terminator->opc == OPC_JUMP));
1294 
1295       /* Simple case: both successors contain instructions. Keep both blocks and
1296        * insert prede before the second successor's terminator:
1297        *        |--- i ----|
1298        *        |   ...    |
1299        *        | pred[tf] |
1300        *        |----------|
1301        *      succ0 /   \ succ1
1302        * |-- i+1 ---| |-- i+2 ---|
1303        * |    ...   | |   ...    |
1304        * | pred[ft] | | prede    |
1305        * |----------| |----------|
1306        *     succ0 \   / succ0
1307        *        |--- j ----|
1308        *        |   ...    |
1309        *        |----------|
1310        */
1311       if (!list_is_empty(&succ1->instr_list)) {
1312          struct ir3_builder build =
1313             ir3_builder_at(ir3_before_terminator(succ1));
1314          struct ir3_instruction *prede = ir3_PREDE(&build);
1315          add_predication_workaround(ir->compiler, succ0_terminator, prede);
1316          continue;
1317       }
1318 
1319       /* Second successor is empty so we can remove it:
1320        *        |--- i ----|
1321        *        |   ...    |
1322        *        | pred[tf] |
1323        *        |----------|
1324        *      succ0 /   \ succ1
1325        * |-- i+1 ---|   |
1326        * |    ...   |   |
1327        * |   prede  |   |
1328        * |----------|   |
1329        *     succ0 \    /
1330        *        |--- j ----|
1331        *        |   ...    |
1332        *        |----------|
1333        */
1334       list_delinit(&succ0_terminator->node);
1335       struct ir3_builder build = ir3_builder_at(ir3_before_terminator(succ0));
1336       struct ir3_instruction *prede = ir3_PREDE(&build);
1337       add_predication_workaround(ir->compiler, NULL, prede);
1338       remove_unused_block(succ1);
1339       block->successors[1] = succ0->successors[0];
1340       ir3_block_add_predecessor(succ0->successors[0], block);
1341    }
1342 }
1343 
1344 /* Here we workaround the fact that kill doesn't actually kill the thread as
1345  * GL expects. The last instruction always needs to be an end instruction,
1346  * which means that if we're stuck in a loop where kill is the only way out,
1347  * then we may have to jump out to the end. kill may also have the d3d
1348  * semantics of converting the thread to a helper thread, rather than setting
1349  * the exec mask to 0, in which case the helper thread could get stuck in an
1350  * infinite loop.
1351  *
1352  * We do this late, both to give the scheduler the opportunity to reschedule
1353  * kill instructions earlier and to avoid having to create a separate basic
1354  * block.
1355  *
1356  * TODO: Assuming that the wavefront doesn't stop as soon as all threads are
1357  * killed, we might benefit by doing this more aggressively when the remaining
1358  * part of the program after the kill is large, since that would let us
1359  * skip over the instructions when there are no non-killed threads left.
1360  */
1361 static void
1362 kill_sched(struct ir3 *ir, struct ir3_shader_variant *so)
1363 {
1364    ir3_count_instructions(ir);
1365 
1366    /* True if we know that this block will always eventually lead to the end
1367     * block:
1368     */
1369    bool always_ends = true;
1370    bool added = false;
1371    struct ir3_block *last_block =
1372       list_last_entry(&ir->block_list, struct ir3_block, node);
1373 
1374    foreach_block_rev (block, &ir->block_list) {
1375       for (unsigned i = 0; i < 2 && block->successors[i]; i++) {
1376          if (block->successors[i]->start_ip <= block->end_ip)
1377             always_ends = false;
1378       }
1379 
1380       if (always_ends)
1381          continue;
1382 
1383       foreach_instr_safe (instr, &block->instr_list) {
1384          if (instr->opc != OPC_KILL)
1385             continue;
1386 
1387          struct ir3_instruction *br =
1388             ir3_instr_create_at(ir3_after_instr(instr), OPC_BR, 0, 1);
1389          ir3_src_create(br, instr->srcs[0]->num, instr->srcs[0]->flags)->wrmask =
1390             1;
1391          br->cat0.target =
1392             list_last_entry(&ir->block_list, struct ir3_block, node);
1393 
1394          added = true;
1395       }
1396    }
1397 
1398    if (added) {
1399       /* I'm not entirely sure how the branchstack works, but we probably
1400        * need to add at least one entry for the divergence which is resolved
1401        * at the end:
1402        */
1403       so->branchstack++;
1404 
1405       /* We don't update predecessors/successors, so we have to do this
1406        * manually:
1407        */
1408       mark_jp(last_block);
1409    }
1410 }
1411 
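/* Debug aid: insert a nop with (ss)(sy) set after every (ss)/(sy) producer so
 * that its results are synced immediately.
 */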
1412 static void
1413 dbg_sync_sched(struct ir3 *ir, struct ir3_shader_variant *so)
1414 {
1415    foreach_block (block, &ir->block_list) {
1416       foreach_instr_safe (instr, &block->instr_list) {
1417          if (is_ss_producer(instr) || is_sy_producer(instr)) {
1418             struct ir3_builder build = ir3_builder_at(ir3_after_instr(instr));
1419             struct ir3_instruction *nop = ir3_NOP(&build);
1420             nop->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
1421          }
1422       }
1423    }
1424 }
1425 
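/* Debug aid used with IR3_DBG_FULLNOP: insert a (rpt5) nop (six nops worth of
 * delay) before every instruction.
 */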
1426 static void
1427 dbg_nop_sched(struct ir3 *ir, struct ir3_shader_variant *so)
1428 {
1429    foreach_block (block, &ir->block_list) {
1430       foreach_instr_safe (instr, &block->instr_list) {
1431          struct ir3_builder build = ir3_builder_at(ir3_before_instr(instr));
1432          struct ir3_instruction *nop = ir3_NOP(&build);
1433          nop->repeat = 5;
1434       }
1435    }
1436 }
1437 
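/* Debug aid used with IR3_DBG_EXPANDRPT: expand each (rptN) instruction into
 * N+1 separate copies, incrementing the destination register and any
 * (r)-flagged source register for each copy.  nop, swz, gat and sct keep
 * their repeat flags.
 */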
1438 static void
1439 dbg_expand_rpt(struct ir3 *ir)
1440 {
1441    foreach_block (block, &ir->block_list) {
1442       foreach_instr_safe (instr, &block->instr_list) {
1443          if (instr->repeat == 0 || instr->opc == OPC_NOP ||
1444              instr->opc == OPC_SWZ || instr->opc == OPC_GAT ||
1445              instr->opc == OPC_SCT) {
1446             continue;
1447          }
1448 
1449          for (unsigned i = 0; i <= instr->repeat; ++i) {
1450             struct ir3_instruction *rpt = ir3_instr_clone(instr);
1451             ir3_instr_move_before(rpt, instr);
1452             rpt->repeat = 0;
1453 
1454             foreach_dst (dst, rpt) {
1455                dst->num += i;
1456                dst->wrmask = 1;
1457             }
1458 
1459             foreach_src (src, rpt) {
1460                if (!(src->flags & IR3_REG_R))
1461                   continue;
1462 
1463                src->num += i;
1464                src->uim_val += i;
1465                src->wrmask = 1;
1466                src->flags &= ~IR3_REG_R;
1467             }
1468          }
1469 
1470          list_delinit(&instr->node);
1471       }
1472    }
1473 }
1474 
1475 struct ir3_helper_block_data {
1476    /* Whether helper invocations may be used on any path starting at the
1477     * beginning of the block.
1478     */
1479    bool uses_helpers_beginning;
1480 
1481    /* Whether helper invocations may be used by the end of the block. Branch
1482     * instructions are considered to be "between" blocks, because (eq) has to be
1483     * inserted after them in the successor blocks, so branch instructions using
1484     * helpers will result in uses_helpers_end = true for their block.
1485     */
1486    bool uses_helpers_end;
1487 };
1488 
1489 /* Insert (eq) after the last instruction using the results of helper
1490  * invocations. Use a backwards dataflow analysis to determine at which points
1491  * in the program helper invocations are definitely never used, and then insert
1492  * (eq) at the point where we cross from a point where they may be used to a
1493  * point where they are never used.
1494  */
1495 static void
1496 helper_sched(struct ir3_legalize_ctx *ctx, struct ir3 *ir,
1497              struct ir3_shader_variant *so)
1498 {
1499    bool non_prefetch_helpers = false;
1500 
1501    foreach_block (block, &ir->block_list) {
1502       struct ir3_helper_block_data *bd =
1503          rzalloc(ctx, struct ir3_helper_block_data);
1504       foreach_instr (instr, &block->instr_list) {
1505          if (uses_helpers(instr)) {
1506             bd->uses_helpers_beginning = true;
1507             if (instr->opc != OPC_META_TEX_PREFETCH) {
1508                non_prefetch_helpers = true;
1509             }
1510          }
1511 
1512          if (instr->opc == OPC_SHPE) {
1513             /* (eq) is not allowed in preambles, so mark the whole preamble as
1514              * requiring helpers to avoid putting it there.
1515              */
1516             bd->uses_helpers_beginning = true;
1517             bd->uses_helpers_end = true;
1518          }
1519       }
1520 
1521       struct ir3_instruction *terminator = ir3_block_get_terminator(block);
1522       if (terminator) {
1523          if (terminator->opc == OPC_BALL || terminator->opc == OPC_BANY ||
1524              (terminator->opc == OPC_GETONE &&
1525               (terminator->flags & IR3_INSTR_NEEDS_HELPERS))) {
1526             bd->uses_helpers_beginning = true;
1527             bd->uses_helpers_end = true;
1528             non_prefetch_helpers = true;
1529          }
1530       }
1531 
1532       block->data = bd;
1533    }
1534 
1535    /* If only prefetches use helpers then we can disable them in the shader via
1536     * a register setting.
1537     */
1538    if (!non_prefetch_helpers) {
1539       so->prefetch_end_of_quad = true;
1540       return;
1541    }
1542 
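   /* Backwards dataflow: if a block may use helpers at its beginning, then
    * every physical predecessor may still need helpers at its end (and so at
    * its beginning too).  Iterate until a fixed point is reached.
    */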
1543    bool progress;
1544    do {
1545       progress = false;
1546       foreach_block_rev (block, &ir->block_list) {
1547          struct ir3_helper_block_data *bd = block->data;
1548 
1549          if (!bd->uses_helpers_beginning)
1550             continue;
1551 
1552          for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
1553             struct ir3_block *pred = block->physical_predecessors[i];
1554             struct ir3_helper_block_data *pred_bd = pred->data;
1555             if (!pred_bd->uses_helpers_end) {
1556                pred_bd->uses_helpers_end = true;
1557             }
1558             if (!pred_bd->uses_helpers_beginning) {
1559                pred_bd->uses_helpers_beginning = true;
1560                progress = true;
1561             }
1562          }
1563       }
1564    } while (progress);
1565 
1566    /* Now, we need to determine the points where helper invocations become
1567     * unused.
1568     */
1569    foreach_block (block, &ir->block_list) {
1570       struct ir3_helper_block_data *bd = block->data;
1571       if (bd->uses_helpers_end)
1572          continue;
1573 
1574       /* We need to check the predecessors because of situations with critical
1575        * edges like this that can occur after optimizing jumps:
1576        *
1577        *    br p0.x, #endif
1578        *    ...
1579        *    sam ...
1580        *    ...
1581        *    endif:
1582        *    ...
1583        *    end
1584        *
1585        * The endif block will have uses_helpers_beginning = false and
1586        * uses_helpers_end = false, but because we jump to there from the
1587        * beginning of the if where uses_helpers_end = true, we still want to
1588        * add an (eq) at the beginning of the block:
1589        *
1590        *    br p0.x, #endif
1591        *    ...
1592        *    sam ...
1593        *    (eq)nop
1594        *    ...
1595        *    endif:
1596        *    (eq)nop
1597        *    ...
1598        *    end
1599        *
1600        * This adds an extra nop in the case where the branch isn't taken, but that's
1601        * probably preferable to adding an extra jump instruction which is what
1602        * would happen if we ran this pass before optimizing jumps:
1603        *
1604        *    br p0.x, #else
1605        *    ...
1606        *    sam ...
1607        *    (eq)nop
1608        *    ...
1609        *    jump #endif
1610        *    else:
1611        *    (eq)nop
1612        *    endif:
1613        *    ...
1614        *    end
1615        *
1616        * We also need this to make sure we insert (eq) after branches which use
1617        * helper invocations.
1618        */
1619       bool pred_uses_helpers = bd->uses_helpers_beginning;
1620       for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
1621          struct ir3_block *pred = block->physical_predecessors[i];
1622          struct ir3_helper_block_data *pred_bd = pred->data;
1623          if (pred_bd->uses_helpers_end) {
1624             pred_uses_helpers = true;
1625             break;
1626          }
1627       }
1628 
1629       if (!pred_uses_helpers)
1630          continue;
1631 
1632       /* The last use of helpers is somewhere between the beginning and the
1633        * end. first_instr will be the first instruction where helpers are no
1634        * longer required, or NULL if helpers are still required at the block's end.
1635        */
1636       struct ir3_instruction *first_instr = NULL;
1637       foreach_instr_rev (instr, &block->instr_list) {
1638          /* Skip prefetches because they actually execute before the block
1639           * starts and at this stage they aren't guaranteed to be at the start
1640           * of the block.
1641           */
1642          if (uses_helpers(instr) && instr->opc != OPC_META_TEX_PREFETCH)
1643             break;
1644          first_instr = instr;
1645       }
1646 
1647       bool killed = false;
1648       bool expensive_instruction_in_block = false;
1649       if (first_instr) {
1650          foreach_instr_from (instr, first_instr, &block->instr_list) {
1651             /* If there's already a nop, we don't have to worry about whether to
1652              * insert one.
1653              */
1654             if (instr->opc == OPC_NOP) {
1655                instr->flags |= IR3_INSTR_EQ;
1656                killed = true;
1657                break;
1658             }
1659 
1660             /* ALU and SFU instructions probably aren't going to benefit much
1661              * from killing helper invocations, because they complete at least
1662              * an entire quad in a cycle and don't access any quad-divergent
1663              * memory, so delay emitting (eq) in the hopes that we find a nop
1664              * afterwards.
1665              */
1666             if (is_alu(instr) || is_sfu(instr))
1667                continue;
1668             if (instr->opc == OPC_PREDE)
1669                continue;
1670 
1671             expensive_instruction_in_block = true;
1672             break;
1673          }
1674       }
1675 
1676       /* If this block isn't the last block before the end instruction, assume
1677        * that there may be expensive instructions in later blocks so it's worth
1678        * it to insert a nop.
1679        */
1680       if (!killed && (expensive_instruction_in_block ||
1681                       block->successors[0] != ir3_end_block(ir))) {
1682          struct ir3_cursor cursor = first_instr ? ir3_before_instr(first_instr)
1683                                                 : ir3_before_terminator(block);
1684          struct ir3_builder build = ir3_builder_at(cursor);
1685          struct ir3_instruction *nop = ir3_NOP(&build);
1686          nop->flags |= IR3_INSTR_EQ;
1687       }
1688    }
1689 }
1690 
1691 bool
1692 ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
1693 {
1694    struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx);
1695    bool mergedregs = so->mergedregs;
1696    bool progress;
1697 
1698    ctx->so = so;
1699    ctx->max_bary = -1;
1700    ctx->compiler = ir->compiler;
1701    ctx->type = ir->type;
1702 
1703    /* allocate per-block data: */
1704    foreach_block (block, &ir->block_list) {
1705       struct ir3_legalize_block_data *bd =
1706          rzalloc(ctx, struct ir3_legalize_block_data);
1707 
1708       regmask_init(&bd->state.needs_ss_war, mergedregs);
1709       regmask_init(&bd->state.needs_ss_or_sy_war, mergedregs);
1710       regmask_init(&bd->state.needs_ss_scalar_war, mergedregs);
1711       regmask_init(&bd->state.needs_ss_or_sy_scalar_war, mergedregs);
1712       regmask_init(&bd->state.needs_ss_scalar_full, mergedregs);
1713       regmask_init(&bd->state.needs_ss_scalar_half, mergedregs);
1714       regmask_init(&bd->state.needs_ss, mergedregs);
1715       regmask_init(&bd->state.needs_sy, mergedregs);
1716       regmask_init(&bd->begin_state.needs_ss_war, mergedregs);
1717       regmask_init(&bd->begin_state.needs_ss_or_sy_war, mergedregs);
1718       regmask_init(&bd->begin_state.needs_ss_scalar_war, mergedregs);
1719       regmask_init(&bd->begin_state.needs_ss_or_sy_scalar_war, mergedregs);
1720       regmask_init(&bd->begin_state.needs_ss_scalar_full, mergedregs);
1721       regmask_init(&bd->begin_state.needs_ss_scalar_half, mergedregs);
1722       regmask_init(&bd->begin_state.needs_ss, mergedregs);
1723       regmask_init(&bd->begin_state.needs_sy, mergedregs);
1724 
1725       block->data = bd;
1726    }
1727 
1728    /* We may have failed to pull all input loads into the first block.
1729     * In such a case, at the moment, we aren't able to find a better place
1730     * for (ei) than the end of the program.
1731     * a5xx and a6xx do automatically release varying storage at the end.
1732     */
1733    ctx->early_input_release = true;
1734 
1735    struct ir3_block *start_block = ir3_after_preamble(ir);
1736 
1737    /* Gather information to determine whether we can enable early preamble.
1738     */
1739    bool gpr_in_preamble = false;
1740    bool pred_in_preamble = false;
1741    bool relative_in_preamble = false;
1742    bool in_preamble = start_block != ir3_start_block(ir);
1743    bool has_preamble = start_block != ir3_start_block(ir);
1744 
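   /* in_preamble is cleared once the walk reaches the first block after the
    * preamble, so the preamble-only checks below (GPR, predicate and relative
    * accesses) only consider preamble instructions.
    */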
1745    foreach_block (block, &ir->block_list) {
1746       if (block == start_block)
1747          in_preamble = false;
1748 
1749       foreach_instr (instr, &block->instr_list) {
1750          if (is_input(instr)) {
1751             ctx->has_inputs = true;
1752             if (block != start_block) {
1753                ctx->early_input_release = false;
1754             }
1755          }
1756 
1757          if (is_meta(instr))
1758             continue;
1759 
1760          foreach_src (reg, instr) {
1761             if (in_preamble) {
1762                if (!(reg->flags & IR3_REG_SHARED) && is_reg_gpr(reg))
1763                   gpr_in_preamble = true;
1764                if (reg->flags & IR3_REG_RELATIV)
1765                   relative_in_preamble = true;
1766             }
1767          }
1768 
1769          foreach_dst (reg, instr) {
1770             if (is_dest_gpr(reg)) {
1771                if (in_preamble) {
1772                   if (!(reg->flags & IR3_REG_SHARED))
1773                      gpr_in_preamble = true;
1774                   if (reg->flags & IR3_REG_RELATIV)
1775                      relative_in_preamble = true;
1776                }
1777             }
1778          }
1779 
1780          if (in_preamble && writes_pred(instr)) {
1781             pred_in_preamble = true;
1782          }
1783       }
1784    }
1785 
1786    so->early_preamble = has_preamble && !gpr_in_preamble &&
1787       !pred_in_preamble && !relative_in_preamble &&
1788       ir->compiler->has_early_preamble &&
1789       !(ir3_shader_debug & IR3_DBG_NOEARLYPREAMBLE);
1790 
1791    /* On a7xx, sync behavior for a1.x is different in the early preamble. RaW
1792     * dependencies must be synchronized with (ss), and there must be an extra
1793     * (r) on the source of the mova1 instruction.
1794     */
1795    if (so->early_preamble && ir->compiler->gen >= 7) {
1796       foreach_block (block, &ir->block_list) {
1797          if (block == start_block)
1798             break;
1799          block->in_early_preamble = true;
1800       }
1801    }
1802 
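   /* Pre-a5xx there is no automatic varying release at the end of the
    * program, so on those GPUs the inputs must have been released early.
    */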
1803    assert(ctx->early_input_release || ctx->compiler->gen >= 5);
1804 
1805    if (ir3_shader_debug & IR3_DBG_EXPANDRPT) {
1806       dbg_expand_rpt(ir);
1807    }
1808 
1809    /* process each block: */
1810    do {
1811       progress = false;
1812       foreach_block (block, &ir->block_list) {
1813          progress |= legalize_block(ctx, block);
1814       }
1815    } while (progress);
1816 
1817    *max_bary = ctx->max_bary;
1818 
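   /* Expand the push-const load macro in the successor of the first block
    * terminated by a (getone).
    */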
1819    foreach_block (block, &ir->block_list) {
1820       struct ir3_instruction *terminator = ir3_block_get_terminator(block);
1821       if (terminator && terminator->opc == OPC_GETONE) {
1822          apply_push_consts_load_macro(ctx, block->successors[0]);
1823          break;
1824       }
1825    }
1826 
1827    block_sched(ir);
1828 
1829    foreach_block (block, &ir->block_list) {
1830       progress |= apply_fine_deriv_macro(ctx, block);
1831    }
1832 
1833    if (ir3_shader_debug & IR3_DBG_FULLSYNC) {
1834       dbg_sync_sched(ir, so);
1835    }
1836 
1837    if (ir3_shader_debug & IR3_DBG_FULLNOP) {
1838       dbg_nop_sched(ir, so);
1839    }
1840 
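   /* opt_jump() may change the control flow; if it did, reconvergence
    * information is recalculated below.
    */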
1841    bool cfg_changed = false;
1842    while (opt_jump(ir))
1843       cfg_changed = true;
1844 
1845    prede_sched(ir);
1846 
1847    if (cfg_changed)
1848       ir3_calc_reconvergence(so);
1849 
1850    if (so->type == MESA_SHADER_FRAGMENT)
1851       kill_sched(ir, so);
1852 
1853    /* TODO: does (eq) exist before a6xx? */
1854    if (so->type == MESA_SHADER_FRAGMENT && so->need_pixlod &&
1855        so->compiler->gen >= 6)
1856       helper_sched(ctx, ir, so);
1857 
1858    foreach_block (block, &ir->block_list) {
1859       progress |= expand_dummy_dests(block);
1860    }
1861 
1862    ir3_insert_alias_tex(ir);
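   /* Branch offsets are resolved from instruction counts, so recount before
    * resolving jumps.
    */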
1863    ir3_count_instructions(ir);
1864    resolve_jumps(ir);
1865 
1866    mark_xvergence_points(ir);
1867 
1868    ralloc_free(ctx);
1869 
1870    return true;
1871 }
1872