1 /*
2 * Copyright © 2014 Rob Clark <robclark@freedesktop.org>
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Rob Clark <robclark@freedesktop.org>
7 */
8
9 #include "util/ralloc.h"
10 #include "util/u_math.h"
11
12 #include "ir3.h"
13 #include "ir3_shader.h"
14
15 /*
16 * Legalize:
17 *
18 * The legalize pass handles ensuring sufficient nop's and sync flags for
19 * correct execution.
20 *
21 * 1) Iteratively determine where sync ((sy)/(ss)) flags are needed,
22 * based on state flowing out of predecessor blocks until there is
23 * no further change. In some cases this requires inserting nops.
24 * 2) Mark (ei) on last varying input
25 * 3) Final nop scheduling for instruction latency
26 * 4) Resolve jumps and schedule blocks, marking potential convergence
27 * points with (jp)
28 */
29
30 struct ir3_legalize_ctx {
31 struct ir3_compiler *compiler;
32 struct ir3_shader_variant *so;
33 gl_shader_stage type;
34 int max_bary;
35 bool early_input_release;
36 bool has_inputs;
37 bool has_tex_prefetch;
38 };
39
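/* Per-component "ready" cycles for full and half GPRs. See the comments on
 * struct ir3_legalize_state below for how these arrays are interpreted.
 */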
40 struct ir3_nop_state {
41 unsigned full_ready[GPR_REG_SIZE];
42 unsigned half_ready[GPR_REG_SIZE];
43 };
44
45 struct ir3_legalize_state {
46 regmask_t needs_ss;
47 regmask_t needs_ss_scalar_full; /* half scalar ALU producer -> full scalar ALU consumer */
48 regmask_t needs_ss_scalar_half; /* full scalar ALU producer -> half scalar ALU consumer */
49 regmask_t needs_ss_war; /* write after read */
50 regmask_t needs_ss_or_sy_war; /* WAR for sy-producer sources */
51 regmask_t needs_ss_scalar_war; /* scalar ALU write -> ALU write */
52 regmask_t needs_ss_or_sy_scalar_war;
53 regmask_t needs_sy;
54 bool needs_ss_for_const;
55 bool needs_sy_for_const;
56
57 /* Each of these arrays contains the cycle when the corresponding register
58 * becomes "ready" i.e. does not require any more nops. There is a special
59 * mechanism to let ALU instructions read compatible (i.e. same halfness)
60 * destinations of another ALU instruction with less delay, so this can
61 * depend on what type the consuming instruction is, which is why there are
62 * multiple arrays. The cycle is counted relative to the start of the block.
63 */
64
65 /* When ALU instructions reading the given full/half register will be ready.
66 */
67 struct ir3_nop_state alu_nop;
68
69 /* When non-ALU (e.g. cat5) instructions reading the given full/half register
70 * will be ready.
71 */
72 struct ir3_nop_state non_alu_nop;
73
74 /* When p0.x-w, a0.x, and a1.x are ready. */
75 unsigned pred_ready[4];
76 unsigned addr_ready[2];
77 };
78
79 struct ir3_legalize_block_data {
80 bool valid;
81 struct ir3_legalize_state begin_state;
82 struct ir3_legalize_state state;
83 };
84
85 static inline bool
86 needs_ss_war(struct ir3_legalize_state *state, struct ir3_register *dst,
87 bool is_scalar_alu)
88 {
89 if (regmask_get(&state->needs_ss_war, dst))
90 return true;
91 if (regmask_get(&state->needs_ss_or_sy_war, dst))
92 return true;
93
94 if (!is_scalar_alu) {
95 if (regmask_get(&state->needs_ss_scalar_war, dst))
96 return true;
97 if (regmask_get(&state->needs_ss_or_sy_scalar_war, dst))
98 return true;
99 }
100
101 return false;
102 }
103
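/* Applying (ss) makes the instruction wait for all outstanding (ss) producers,
 * which also resolves any pending WAR hazards, so every (ss)-related mask can
 * be reset.
 */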
104 static inline void
105 apply_ss(struct ir3_instruction *instr,
106 struct ir3_legalize_state *state,
107 bool mergedregs)
108 {
109 instr->flags |= IR3_INSTR_SS;
110 regmask_init(&state->needs_ss_war, mergedregs);
111 regmask_init(&state->needs_ss_or_sy_war, mergedregs);
112 regmask_init(&state->needs_ss, mergedregs);
113 regmask_init(&state->needs_ss_scalar_war, mergedregs);
114 regmask_init(&state->needs_ss_or_sy_scalar_war, mergedregs);
115 regmask_init(&state->needs_ss_scalar_full, mergedregs);
116 regmask_init(&state->needs_ss_scalar_half, mergedregs);
117 state->needs_ss_for_const = false;
118 }
119
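/* Likewise, applying (sy) waits for all outstanding (sy) producers, so the
 * pending (sy) state (including the WAR hazards that a (sy) can resolve) is
 * reset.
 */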
120 static inline void
121 apply_sy(struct ir3_instruction *instr,
122 struct ir3_legalize_state *state,
123 bool mergedregs)
124 {
125 instr->flags |= IR3_INSTR_SY;
126 regmask_init(&state->needs_sy, mergedregs);
127 regmask_init(&state->needs_ss_or_sy_war, mergedregs);
128 regmask_init(&state->needs_ss_or_sy_scalar_war, mergedregs);
129 state->needs_sy_for_const = false;
130 }
131
132 static bool
133 count_instruction(struct ir3_instruction *n, struct ir3_compiler *compiler)
134 {
135 /* NOTE: don't count branch/jump since we don't know yet if they will
136 * be eliminated later in resolve_jumps().. really should do that
137 * earlier so we don't have this constraint.
138 */
139 return (is_alu(n) && !is_scalar_alu(n, compiler)) ||
140 (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) &&
141 (n->opc != OPC_BRAA) && (n->opc != OPC_BRAO));
142 }
143
144 static unsigned *
145 get_ready_slot(struct ir3_legalize_state *state,
146 struct ir3_register *reg, unsigned num,
147 bool consumer_alu, bool matching_size)
148 {
149 if (reg->flags & IR3_REG_PREDICATE) {
150 assert(num == reg->num);
151 assert(reg_num(reg) == REG_P0);
152 return &state->pred_ready[reg_comp(reg)];
153 }
154 if (reg->num == regid(REG_A0, 0))
155 return &state->addr_ready[0];
156 if (reg->num == regid(REG_A0, 1))
157 return &state->addr_ready[1];
158 struct ir3_nop_state *nop =
159 consumer_alu ? &state->alu_nop : &state->non_alu_nop;
160 assert(!(reg->flags & IR3_REG_SHARED));
161 if (reg->flags & IR3_REG_HALF) {
162 if (matching_size)
163 return &nop->half_ready[num];
164 else
165 return &nop->full_ready[num / 2];
166 } else {
167 if (matching_size)
168 return &nop->full_ready[num];
169 /* If "num" is large enough, then it can't alias a half-reg because only
170 * the first half of the full reg space aliases half regs. Return NULL in
171 * this case.
172 */
173 else if (num * 2 < ARRAY_SIZE(nop->half_ready))
174 return &nop->half_ready[num * 2];
175 else
176 return NULL;
177 }
178 }
179
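/* Compute how many nops are needed before instr so that all of its GPR
 * sources are ready, given the cycle at which instr would otherwise issue.
 */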
180 static unsigned
181 delay_calc(struct ir3_legalize_ctx *ctx,
182 struct ir3_legalize_state *state,
183 struct ir3_instruction *instr,
184 unsigned cycle)
185 {
186 /* As far as we know, shader outputs don't need any delay. */
187 if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
188 return 0;
189
190 unsigned delay = 0;
191 foreach_src_n (src, n, instr) {
192 if (src->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED))
193 continue;
194
195 unsigned elems = post_ra_reg_elems(src);
196 unsigned num = post_ra_reg_num(src);
197 unsigned src_cycle = cycle + ir3_src_read_delay(ctx->compiler, instr, n);
198
199 for (unsigned elem = 0; elem < elems; elem++, num++) {
200 unsigned ready_cycle =
201 *get_ready_slot(state, src, num, is_alu(instr), true);
202 delay = MAX2(delay, MAX2(ready_cycle, src_cycle) - src_cycle);
203
204 /* Increment cycle for ALU instructions with (rptN) where sources are
205 * read each subsequent cycle.
206 */
207 if (instr->repeat && !(src->flags & IR3_REG_RELATIV))
208 src_cycle++;
209 }
210 }
211
212 return delay;
213 }
214
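/* Record, for each destination written by instr, the cycle at which each kind
 * of consumer (ALU vs. non-ALU, matching vs. mismatching register size) may
 * read it without additional nops.
 */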
215 static void
216 delay_update(struct ir3_legalize_ctx *ctx,
217 struct ir3_legalize_state *state,
218 struct ir3_instruction *instr,
219 unsigned cycle,
220 bool mergedregs)
221 {
222 if (writes_addr1(instr) && instr->block->in_early_preamble)
223 return;
224
225 foreach_dst_n (dst, n, instr) {
226 if (dst->flags & IR3_REG_RT)
227 continue;
228
229 unsigned elems = post_ra_reg_elems(dst);
230 unsigned num = post_ra_reg_num(dst);
231 unsigned dst_cycle = cycle;
232
233 /* sct and swz have scalar destinations and each destination is written in
234 * a subsequent cycle.
235 */
236 if (instr->opc == OPC_SCT || instr->opc == OPC_SWZ)
237 dst_cycle += n;
238
239 /* For relative accesses with (rptN), we have no way of knowing which
240 * component is accessed when, so we have to assume the worst and mark
241 * every array member as being written at the end.
242 */
243 if (dst->flags & IR3_REG_RELATIV)
244 dst_cycle += instr->repeat;
245
246 if (dst->flags & IR3_REG_SHARED)
247 continue;
248
249 for (unsigned elem = 0; elem < elems; elem++, num++) {
250 for (unsigned consumer_alu = 0; consumer_alu < 2; consumer_alu++) {
251 for (unsigned matching_size = 0; matching_size < 2; matching_size++) {
252 unsigned *ready_slot =
253 get_ready_slot(state, dst, num, consumer_alu, matching_size);
254
255 if (!ready_slot)
256 continue;
257
258 bool reset_ready_slot = false;
259 unsigned delay = 0;
260 if (!is_alu(instr)) {
261 /* Apparently writes that require (ss) or (sy) are
262 * synchronized against previous writes, so consumers don't
263 * have to wait for any previous overlapping ALU instructions
264 * to complete.
265 */
266 reset_ready_slot = true;
267 } else if ((dst->flags & IR3_REG_PREDICATE) ||
268 reg_num(dst) == REG_A0) {
269 delay = ctx->compiler->delay_slots.non_alu;
270 if (!matching_size)
271 continue;
272 } else {
273 delay = (consumer_alu && matching_size)
274 ? ctx->compiler->delay_slots.alu_to_alu
275 : ctx->compiler->delay_slots.non_alu;
276 }
277
278 if (!matching_size) {
279 for (unsigned i = 0; i < reg_elem_size(dst); i++) {
280 ready_slot[i] =
281 reset_ready_slot ? 0 :
282 MAX2(ready_slot[i], dst_cycle + delay);
283 }
284 } else {
285 *ready_slot =
286 reset_ready_slot ? 0 :
287 MAX2(*ready_slot, dst_cycle + delay);
288 }
289 }
290 }
291
292 /* Increment cycle for ALU instructions with (rptN) where destinations
293 * are written each subsequent cycle.
294 */
295 if (instr->repeat && !(dst->flags & IR3_REG_RELATIV))
296 dst_cycle++;
297 }
298 }
299 }
300
301 /* We want to evaluate each block from the perspective of every possible
302  * predecessor block, so that the flags we set are correct for the union
303  * of all possible program paths.
304 *
305 * To do this, we need to know the output state (needs_ss/ss_war/sy)
306 * of all predecessor blocks. The tricky thing is loops, which mean
307 * that we can't simply recursively process each predecessor block
308 * before legalizing the current block.
309 *
310 * How we handle that is by looping over all the blocks until the
311 * results converge. If the output state of a given block changes
312 * in a given pass, this means that all successor blocks are not
313 * yet fully legalized.
314 */
315
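/* Legalize a single block. Returns true if the block was (re)processed;
 * ir3_legalize() keeps iterating over all blocks until every block's cached
 * output state is valid and no block needs reprocessing.
 */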
316 static bool
317 legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
318 {
319 struct ir3_legalize_block_data *bd = block->data;
320
321 if (bd->valid)
322 return false;
323
324 struct ir3_instruction *last_n = NULL;
325 struct list_head instr_list;
326 struct ir3_legalize_state prev_state = bd->state;
327 struct ir3_legalize_state *state = &bd->begin_state;
328 bool last_input_needs_ss = false;
329 bool mergedregs = ctx->so->mergedregs;
330 struct ir3_builder build = ir3_builder_at(ir3_after_block(block));
331
332 /* Our input state is the OR of all predecessor blocks' state.
333 *
334 * Why don't we just zero the state at the beginning before merging in the
335 * predecessors? Because otherwise updates may not be a "lattice refinement",
336 * i.e. needs_ss may go from true to false for some register due to a (ss) we
337 * inserted the second time around (and the same for (sy)). This means that
338 * there's no solid guarantee the algorithm will converge, and in theory
339 * there may be infinite loops where we fight over the placement of an (ss).
340 */
341 for (unsigned i = 0; i < block->predecessors_count; i++) {
342 struct ir3_block *predecessor = block->predecessors[i];
343 struct ir3_legalize_block_data *pbd = predecessor->data;
344 struct ir3_legalize_state *pstate = &pbd->state;
345
346 /* Our input (ss)/(sy) state is based on OR'ing the output
347 * state of all our predecessor blocks
348 */
349 regmask_or(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
350 regmask_or(&state->needs_ss_war, &state->needs_ss_war,
351 &pstate->needs_ss_war);
352 regmask_or(&state->needs_ss_or_sy_war, &state->needs_ss_or_sy_war,
353 &pstate->needs_ss_or_sy_war);
354 regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy);
355 state->needs_ss_for_const |= pstate->needs_ss_for_const;
356 state->needs_sy_for_const |= pstate->needs_sy_for_const;
357
358 /* Our nop state is the max of the predecessor blocks */
359 for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++)
360 state->pred_ready[i] = MAX2(state->pred_ready[i],
361 pstate->pred_ready[i]);
362 for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) {
363 state->alu_nop.full_ready[i] = MAX2(state->alu_nop.full_ready[i],
364 pstate->alu_nop.full_ready[i]);
365 state->alu_nop.half_ready[i] = MAX2(state->alu_nop.half_ready[i],
366 pstate->alu_nop.half_ready[i]);
367 state->non_alu_nop.full_ready[i] = MAX2(state->non_alu_nop.full_ready[i],
368 pstate->non_alu_nop.full_ready[i]);
369 state->non_alu_nop.half_ready[i] = MAX2(state->non_alu_nop.half_ready[i],
370 pstate->non_alu_nop.half_ready[i]);
371 }
372 }
373
374 /* We need to take physical-only edges into account when tracking shared
375 * registers.
376 */
377 for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
378 struct ir3_block *predecessor = block->physical_predecessors[i];
379 struct ir3_legalize_block_data *pbd = predecessor->data;
380 struct ir3_legalize_state *pstate = &pbd->state;
381
382 regmask_or_shared(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
383 regmask_or_shared(&state->needs_ss_scalar_full,
384 &state->needs_ss_scalar_full,
385 &pstate->needs_ss_scalar_full);
386 regmask_or_shared(&state->needs_ss_scalar_half,
387 &state->needs_ss_scalar_half,
388 &pstate->needs_ss_scalar_half);
389 regmask_or_shared(&state->needs_ss_scalar_war, &state->needs_ss_scalar_war,
390 &pstate->needs_ss_scalar_war);
391 regmask_or_shared(&state->needs_ss_or_sy_scalar_war,
392 &state->needs_ss_or_sy_scalar_war,
393 &pstate->needs_ss_or_sy_scalar_war);
394 }
395
396 memcpy(&bd->state, state, sizeof(*state));
397 state = &bd->state;
398
399 unsigned input_count = 0;
400
401 foreach_instr (n, &block->instr_list) {
402 if (is_input(n)) {
403 input_count++;
404 }
405 }
406
407 unsigned inputs_remaining = input_count;
408
409 /* Either inputs are in the first block or we expect inputs to be released
410 * with the end of the program.
411 */
412 assert(input_count == 0 || !ctx->early_input_release ||
413 block == ir3_after_preamble(block->shader));
414
415 /* remove all the instructions from the list, we'll be adding
416 * them back in as we go
417 */
418 list_replace(&block->instr_list, &instr_list);
419 list_inithead(&block->instr_list);
420
421 unsigned cycle = 0;
422
423 foreach_instr_safe (n, &instr_list) {
424 unsigned i;
425
426 n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);
427
428 /* _meta::tex_prefetch instructions are removed later in
429 * collect_tex_prefetches()
430 */
431 if (is_meta(n) && (n->opc != OPC_META_TEX_PREFETCH))
432 continue;
433
434 if (is_input(n)) {
435 struct ir3_register *inloc = n->srcs[0];
436 assert(inloc->flags & IR3_REG_IMMED);
437
438 int last_inloc =
439 inloc->iim_val + ((inloc->flags & IR3_REG_R) ? n->repeat : 0);
440 ctx->max_bary = MAX2(ctx->max_bary, last_inloc);
441 }
442
443 if ((last_n && is_barrier(last_n)) || n->opc == OPC_SHPE) {
444 apply_ss(n, state, mergedregs);
445 apply_sy(n, state, mergedregs);
446 last_input_needs_ss = false;
447 }
448
449 if (last_n && (last_n->opc == OPC_PREDT)) {
450 apply_ss(n, state, mergedregs);
451 }
452
453 bool n_is_scalar_alu = is_scalar_alu(n, ctx->compiler);
454
455 /* NOTE: consider dst register too.. it could happen that
456 * texture sample instruction (for example) writes some
457 * components which are unused. A subsequent instruction
458 * that writes the same register can race w/ the sam instr
459 * resulting in undefined results:
460 */
461 for (i = 0; i < n->dsts_count + n->srcs_count; i++) {
462 struct ir3_register *reg;
463 if (i < n->dsts_count)
464 reg = n->dsts[i];
465 else
466 reg = n->srcs[i - n->dsts_count];
467
468 if (is_reg_gpr(reg)) {
469
470 /* TODO: we probably only need (ss) for alu
471 * instr consuming sfu result.. need to make
472 * some tests for both this and (sy)..
473 */
474 if (regmask_get(&state->needs_ss, reg)) {
475 apply_ss(n, state, mergedregs);
476 last_input_needs_ss = false;
477 }
478
479 /* There is a fast feedback path for scalar ALU instructions which
480 * only takes 1 cycle of latency, similar to the normal 3 cycle
481 * latency path for ALU instructions. For this fast path the
482 * producer and consumer must use the same register size (i.e. no
483 * writing a full register and then reading half of it or vice
484 * versa). If we don't hit this path, either because of a mismatched
485 * size or a read via the regular ALU, then the write latency is
486 * variable and we must use (ss) to wait for the scalar ALU. This is
487 * different from the fixed 6 cycle latency for mismatched vector
488 * ALU accesses.
489 */
490 if (n_is_scalar_alu) {
491 /* Check if we have a mismatched size RaW dependency */
492 if (regmask_get((reg->flags & IR3_REG_HALF) ?
493 &state->needs_ss_scalar_half :
494 &state->needs_ss_scalar_full, reg)) {
495 apply_ss(n, state, mergedregs);
496 last_input_needs_ss = false;
497 }
498 } else {
499 /* check if we have a scalar -> vector RaW dependency */
500 if (regmask_get(&state->needs_ss_scalar_half, reg) ||
501 regmask_get(&state->needs_ss_scalar_full, reg)) {
502 apply_ss(n, state, mergedregs);
503 last_input_needs_ss = false;
504 }
505 }
506
507 if (regmask_get(&state->needs_sy, reg)) {
508 apply_sy(n, state, mergedregs);
509 }
510 } else if ((reg->flags & IR3_REG_CONST)) {
511 if (state->needs_ss_for_const) {
512 apply_ss(n, state, mergedregs);
513 last_input_needs_ss = false;
514 }
515 if (state->needs_sy_for_const) {
516 apply_sy(n, state, mergedregs);
517 }
518 } else if (reg_is_addr1(reg) && block->in_early_preamble) {
519 if (regmask_get(&state->needs_ss, reg)) {
520 apply_ss(n, state, mergedregs);
521 last_input_needs_ss = false;
522 }
523 }
524 }
525
526 foreach_dst (reg, n) {
527 if (reg->flags & IR3_REG_RT)
528 continue;
529 if (needs_ss_war(state, reg, n_is_scalar_alu)) {
530 apply_ss(n, state, mergedregs);
531 last_input_needs_ss = false;
532 }
533 }
534
535 /* I'm not exactly sure what this is for, but it seems we need this on every
536 * mova1 in early preambles.
537 */
538 if (writes_addr1(n) && block->in_early_preamble)
539 n->srcs[0]->flags |= IR3_REG_R;
540
541 /* cat5+ does not have an (ss) bit, if needed we need to
542 * insert a nop to carry the sync flag. Would be kinda
543 * clever if we were aware of this during scheduling, but
544 * this should be a pretty rare case:
545 */
546 if ((n->flags & IR3_INSTR_SS) && !supports_ss(n)) {
547 struct ir3_instruction *nop;
548 nop = ir3_NOP(&build);
549 nop->flags |= IR3_INSTR_SS;
550 n->flags &= ~IR3_INSTR_SS;
551 last_n = nop;
552 cycle++;
553 }
554
555 unsigned delay = delay_calc(ctx, state, n, cycle);
556
557 /* NOTE: I think the nopN encoding works for a5xx and
558 * probably a4xx, but not a3xx. So far only tested on
559 * a6xx.
560 */
561
562 if ((delay > 0) && (ctx->compiler->gen >= 6) && last_n &&
563 !n_is_scalar_alu &&
564 ((opc_cat(last_n->opc) == 2) || (opc_cat(last_n->opc) == 3)) &&
565 (last_n->repeat == 0)) {
566 /* the previous cat2/cat3 instruction can encode at most 3 nop's: */
567 unsigned transfer = MIN2(delay, 3 - last_n->nop);
568 last_n->nop += transfer;
569 delay -= transfer;
570 cycle += transfer;
571 }
572
573 if ((delay > 0) && last_n && (last_n->opc == OPC_NOP)) {
574 /* the previous nop can encode at most 5 repeats: */
575 unsigned transfer = MIN2(delay, 5 - last_n->repeat);
576 last_n->repeat += transfer;
577 delay -= transfer;
578 cycle += transfer;
579 }
580
581 if (delay > 0) {
582 assert(delay <= 6);
583 ir3_NOP(&build)->repeat = delay - 1;
584 cycle += delay;
585 }
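      /* Example of the folding above: with delay == 4 and an un-repeated
       * cat2/cat3 as last_n, three slots are folded into last_n->nop and a
       * single nop (repeat 0) covers the remaining cycle.
       */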
586
587 if (ctx->compiler->samgq_workaround &&
588 ctx->type != MESA_SHADER_FRAGMENT &&
589 ctx->type != MESA_SHADER_COMPUTE && n->opc == OPC_SAMGQ) {
590 struct ir3_instruction *samgp;
591
592 list_delinit(&n->node);
593
594 for (i = 0; i < 4; i++) {
595 samgp = ir3_instr_clone(n);
596 samgp->opc = OPC_SAMGP0 + i;
597 if (i > 1)
598 samgp->flags |= IR3_INSTR_SY;
599 }
600 } else {
601 list_delinit(&n->node);
602 list_addtail(&n->node, &block->instr_list);
603 }
604
605 if (is_sfu(n) || n->opc == OPC_SHFL)
606 regmask_set(&state->needs_ss, n->dsts[0]);
607
608 foreach_dst (dst, n) {
609 if (dst->flags & IR3_REG_SHARED) {
610 if (n_is_scalar_alu) {
611 if (dst->flags & IR3_REG_HALF)
612 regmask_set(&state->needs_ss_scalar_full, dst);
613 else
614 regmask_set(&state->needs_ss_scalar_half, dst);
615 } else {
616 regmask_set(&state->needs_ss, dst);
617 }
618 } else if (reg_is_addr1(dst) && block->in_early_preamble) {
619 regmask_set(&state->needs_ss, dst);
620 }
621 }
622
623 if (is_tex_or_prefetch(n) && n->dsts_count > 0) {
624 regmask_set(&state->needs_sy, n->dsts[0]);
625 if (n->opc == OPC_META_TEX_PREFETCH)
626 ctx->has_tex_prefetch = true;
627 } else if (n->opc == OPC_RESINFO && n->dsts_count > 0) {
628 regmask_set(&state->needs_ss, n->dsts[0]);
629 ir3_NOP(&build)->flags |= IR3_INSTR_SS;
630 last_input_needs_ss = false;
631 } else if (is_load(n)) {
632 if (is_local_mem_load(n))
633 regmask_set(&state->needs_ss, n->dsts[0]);
634 else
635 regmask_set(&state->needs_sy, n->dsts[0]);
636 } else if (is_atomic(n->opc)) {
637 if (is_bindless_atomic(n->opc)) {
638 regmask_set(&state->needs_sy, n->srcs[2]);
639 } else if (is_global_a3xx_atomic(n->opc) ||
640 is_global_a6xx_atomic(n->opc)) {
641 regmask_set(&state->needs_sy, n->dsts[0]);
642 } else {
643 regmask_set(&state->needs_ss, n->dsts[0]);
644 }
645 } else if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO || n->opc == OPC_STC) {
646 state->needs_ss_for_const = true;
647 } else if (n->opc == OPC_LDC_K) {
648 state->needs_sy_for_const = true;
649 }
650
651 if (is_ssbo(n->opc) || is_global_a3xx_atomic(n->opc) ||
652 is_bindless_atomic(n->opc))
653 ctx->so->has_ssbo = true;
654
655 /* both tex/sfu appear to not always immediately consume
656 * their src register(s):
657 */
658 if (is_war_hazard_producer(n)) {
659 /* These WAR hazards can always be resolved with (ss). However, when
660 * the reader is a sy-producer, they can also be resolved using (sy)
661 * because once we have synced the reader's results using (sy), its
662 * sources have definitely been consumed. We track the two cases
663 * separately so that we don't add an unnecessary (ss) if a (sy) sync
664 * already happened.
665 * For example, this prevents adding the unnecessary (ss) in the
666 * following sequence:
667 * sam rd, rs, ...
668 * (sy)... ; sam synced so consumed its sources
669 * (ss)write rs ; (ss) unnecessary since rs has been consumed already
670 */
671 bool needs_ss = is_ss_producer(n) || is_store(n) || n->opc == OPC_STC;
672
673 if (n_is_scalar_alu) {
674 /* Scalar ALU also does not immediately read its source because it
675 * is not executed right away, but scalar ALU instructions are
676 * executed in-order so subsequent scalar ALU instructions don't
677 * need to wait for previous ones.
678 */
679 regmask_t *mask = needs_ss ? &state->needs_ss_scalar_war
680 : &state->needs_ss_or_sy_scalar_war;
681
682 foreach_src (reg, n) {
683 if ((reg->flags & IR3_REG_SHARED) || is_reg_a0(reg)) {
684 regmask_set(mask, reg);
685 }
686 }
687 } else {
688 regmask_t *mask =
689 needs_ss ? &state->needs_ss_war : &state->needs_ss_or_sy_war;
690
691 foreach_src (reg, n) {
692 if (!(reg->flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
693 regmask_set(mask, reg);
694 }
695 }
696 }
697 }
698
699 bool count = count_instruction(n, ctx->compiler);
700 if (count)
701 cycle += 1;
702
703 delay_update(ctx, state, n, cycle, mergedregs);
704
705 if (count)
706 cycle += n->repeat + n->nop;
707
708 if (ctx->early_input_release && is_input(n)) {
709 last_input_needs_ss |= (n->opc == OPC_LDLV);
710
711 assert(inputs_remaining > 0);
712 inputs_remaining--;
713 if (inputs_remaining == 0) {
714 /* This is the last input. We add the (ei) flag to release
715 * varying memory after this executes. If it's an ldlv,
716 * however, we need to insert a dummy bary.f on which we can
717 * set the (ei) flag. We may also need to insert an (ss) to
718 * guarantee that all ldlv's have finished fetching their
719 * results before releasing the varying memory.
720 */
721 struct ir3_instruction *last_input = n;
722 if (n->opc == OPC_LDLV) {
723 struct ir3_instruction *baryf;
724
725 /* (ss)bary.f (ei)r63.x, 0, r0.x */
726 baryf = ir3_build_instr(&build, OPC_BARY_F, 1, 2);
727 ir3_dst_create(baryf, regid(63, 0), 0);
728 ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
729 ir3_src_create(baryf, regid(0, 0), 0);
730
731 last_input = baryf;
732 }
733
734 last_input->dsts[0]->flags |= IR3_REG_EI;
735 if (last_input_needs_ss) {
736 apply_ss(last_input, state, mergedregs);
737 }
738 }
739 }
740
741 last_n = n;
742 }
743
744 assert(inputs_remaining == 0 || !ctx->early_input_release);
745
746 if (block == ir3_after_preamble(ctx->so->ir) &&
747 ctx->has_tex_prefetch && !ctx->has_inputs) {
748 /* texture prefetch, but *no* inputs.. we need to insert a
749 * dummy bary.f at the top of the shader to unblock varying
750 * storage:
751 */
752 struct ir3_instruction *baryf;
753
754 /* (ss)bary.f (ei)r63.x, 0, r0.x */
755 baryf = ir3_build_instr(&build, OPC_BARY_F, 1, 2);
756 ir3_dst_create(baryf, regid(63, 0), 0)->flags |= IR3_REG_EI;
757 ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
758 ir3_src_create(baryf, regid(0, 0), 0);
759
760 /* insert the dummy bary.f at head: */
761 list_delinit(&baryf->node);
762 list_add(&baryf->node, &block->instr_list);
763 }
764
765 /* Currently our nop state contains the cycle offset from the start of this
766 * block when each register becomes ready. But successor blocks need the
767 * cycle offset from their start, which is this block's end. Translate the
768 * cycle offset.
769 */
770 for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++)
771 state->pred_ready[i] = MAX2(state->pred_ready[i], cycle) - cycle;
772 for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) {
773 state->alu_nop.full_ready[i] =
774 MAX2(state->alu_nop.full_ready[i], cycle) - cycle;
775 state->alu_nop.half_ready[i] =
776 MAX2(state->alu_nop.half_ready[i], cycle) - cycle;
777 state->non_alu_nop.full_ready[i] =
778 MAX2(state->non_alu_nop.full_ready[i], cycle) - cycle;
779 state->non_alu_nop.half_ready[i] =
780 MAX2(state->non_alu_nop.half_ready[i], cycle) - cycle;
781 }
782
783 bd->valid = true;
784
785 if (memcmp(&prev_state, state, sizeof(*state))) {
786 /* our output state changed, this invalidates all of our
787 * successors:
788 */
789 for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
790 if (!block->successors[i])
791 break;
792 struct ir3_legalize_block_data *pbd = block->successors[i]->data;
793 pbd->valid = false;
794 }
795 }
796
797 return true;
798 }
799
800 /* Expands dsxpp and dsypp macros to:
801 *
802 * dsxpp.1 dst, src
803 * dsxpp.1.p dst, src
804 *
805 * We apply this after flags syncing, as we don't want to sync in between the
806 * two (which might happen if dst == src).
807 */
808 static bool
809 apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
810 {
811 struct list_head instr_list;
812
813 /* remove all the instructions from the list, we'll be adding
814 * them back in as we go
815 */
816 list_replace(&block->instr_list, &instr_list);
817 list_inithead(&block->instr_list);
818
819 foreach_instr_safe (n, &instr_list) {
820 list_addtail(&n->node, &block->instr_list);
821
822 if (n->opc == OPC_DSXPP_MACRO || n->opc == OPC_DSYPP_MACRO) {
823 n->opc = (n->opc == OPC_DSXPP_MACRO) ? OPC_DSXPP_1 : OPC_DSYPP_1;
824
825 struct ir3_instruction *op_p = ir3_instr_clone(n);
826 op_p->flags = IR3_INSTR_P;
827
828 ctx->so->need_full_quad = true;
829 }
830 }
831
832 return true;
833 }
834
835 /* Some instructions can take a dummy destination of r63.x, which we model by
836  * not having a destination in the IR to avoid needing special code to handle
837 * this. Insert the dummy destination after everything else is done.
838 */
839 static bool
840 expand_dummy_dests(struct ir3_block *block)
841 {
842 foreach_instr (n, &block->instr_list) {
843 if ((n->opc == OPC_SAM || n->opc == OPC_LDC || n->opc == OPC_RESINFO) &&
844 n->dsts_count == 0) {
845 struct ir3_register *dst = ir3_dst_create(n, INVALID_REG, 0);
846 /* Copy the blob's writemask */
847 if (n->opc == OPC_SAM)
848 dst->wrmask = 0b1111;
849 }
850 }
851 return true;
852 }
853
854 static void
855 apply_push_consts_load_macro(struct ir3_legalize_ctx *ctx,
856 struct ir3_block *block)
857 {
858 foreach_instr (n, &block->instr_list) {
859 if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
860 struct ir3_instruction *stsc =
861 ir3_instr_create_at(ir3_after_instr(n), OPC_STSC, 0, 2);
862 ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val =
863 n->push_consts.dst_base;
864 ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val =
865 n->push_consts.src_base;
866 stsc->cat6.iim_val = n->push_consts.src_size;
867 stsc->cat6.type = TYPE_U32;
868
869 if (ctx->compiler->stsc_duplication_quirk) {
870 struct ir3_builder build = ir3_builder_at(ir3_after_instr(stsc));
871 struct ir3_instruction *nop = ir3_NOP(&build);
872 nop->flags |= IR3_INSTR_SS;
873 ir3_instr_move_after(ir3_instr_clone(stsc), nop);
874 }
875
876 list_delinit(&n->node);
877 break;
878 } else if (!is_meta(n)) {
879 break;
880 }
881 }
882 }
883
884 /* NOTE: branch instructions are always the last instruction(s)
885 * in the block. We take advantage of this as we resolve the
886 * branches, since "if (foo) break;" constructs turn into
887 * something like:
888 *
889 * block3 {
890 * ...
891 * 0029:021: mov.s32s32 r62.x, r1.y
892 * 0082:022: br !p0.x, target=block5
893 * 0083:023: br p0.x, target=block4
894 * // succs: if _[0029:021: mov.s32s32] block4; else block5;
895 * }
896 * block4 {
897 * 0084:024: jump, target=block6
898 * // succs: block6;
899 * }
900 * block5 {
901 * 0085:025: jump, target=block7
902 * // succs: block7;
903 * }
904 *
905 * ie. only instruction in block4/block5 is a jump, so when
906 * resolving branches we can easily detect this by checking
907 * that the first instruction in the target block is itself
908 * a jump, and setup the br directly to the jump's target
909 * (and strip back out the now unreached jump)
910 *
911 * TODO sometimes we end up with things like:
912 *
913 * br !p0.x, #2
914 * br p0.x, #12
915 * add.u r0.y, r0.y, 1
916 *
917 * If we swapped the order of the branches, we could drop one.
918 */
919 static struct ir3_block *
920 resolve_dest_block(struct ir3_block *block)
921 {
922 /* special case for last block: */
923 if (!block->successors[0])
924 return block;
925
926 /* NOTE that we may or may not have inserted the jump
927 * in the target block yet, so conditions to resolve
928 * the dest to the dest block's successor are:
929 *
930 * (1) successor[1] == NULL &&
931 * (2) (block-is-empty || only-instr-is-jump)
932 */
933 if (block->successors[1] == NULL) {
934 if (list_is_empty(&block->instr_list)) {
935 return block->successors[0];
936 } else if (list_length(&block->instr_list) == 1) {
937 struct ir3_instruction *instr =
938 list_first_entry(&block->instr_list, struct ir3_instruction, node);
939 if (instr->opc == OPC_JUMP) {
940 /* If this jump is backwards, then we will probably convert
941 * the jump being resolved to a backwards jump, which will
942 * change a loop-with-continue or loop-with-if into a
943 * doubly-nested loop and change the convergence behavior.
944 * Disallow this here.
945 */
946 if (block->successors[0]->index <= block->index)
947 return block;
948 return block->successors[0];
949 }
950 }
951 }
952 return block;
953 }
954
955 static void
956 remove_unused_block(struct ir3_block *old_target)
957 {
958 list_delinit(&old_target->node);
959
960 /* cleanup dangling predecessors: */
961 for (unsigned i = 0; i < ARRAY_SIZE(old_target->successors); i++) {
962 if (old_target->successors[i]) {
963 struct ir3_block *succ = old_target->successors[i];
964 ir3_block_remove_predecessor(succ, old_target);
965 }
966 }
967 }
968
969 static bool
970 retarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target)
971 {
972 struct ir3_block *old_target = instr->cat0.target;
973 struct ir3_block *cur_block = instr->block;
974
975 /* update current block's successors to reflect the retargeting: */
976 if (cur_block->successors[0] == old_target) {
977 cur_block->successors[0] = new_target;
978 } else {
979 assert(cur_block->successors[1] == old_target);
980 cur_block->successors[1] = new_target;
981 }
982
983 /* update new target's predecessors: */
984 ir3_block_add_predecessor(new_target, cur_block);
985
986 /* and remove old_target's predecessor: */
987 ir3_block_remove_predecessor(old_target, cur_block);
988
989 instr->cat0.target = new_target;
990
991 if (old_target->predecessors_count == 0) {
992 remove_unused_block(old_target);
993 return true;
994 }
995
996 return false;
997 }
998
999 static bool
1000 is_invertible_branch(struct ir3_instruction *instr)
1001 {
1002 switch (instr->opc) {
1003 case OPC_BR:
1004 case OPC_BRAA:
1005 case OPC_BRAO:
1006 case OPC_BANY:
1007 case OPC_BALL:
1008 return true;
1009 default:
1010 return false;
1011 }
1012 }
1013
1014 static bool
1015 opt_jump(struct ir3 *ir)
1016 {
1017 bool progress = false;
1018
1019 unsigned index = 0;
1020 foreach_block (block, &ir->block_list)
1021 block->index = index++;
1022
1023 foreach_block (block, &ir->block_list) {
1024 /* This pass destroys the physical CFG so don't keep it around to avoid
1025 * validation errors.
1026 */
1027 block->physical_successors_count = 0;
1028 block->physical_predecessors_count = 0;
1029
1030 foreach_instr (instr, &block->instr_list) {
1031 if (!is_flow(instr) || !instr->cat0.target)
1032 continue;
1033
1034 struct ir3_block *tblock = resolve_dest_block(instr->cat0.target);
1035 if (tblock != instr->cat0.target) {
1036 progress = true;
1037
1038 /* Exit early if we deleted a block to avoid iterator
1039 * weirdness/assert fails
1040 */
1041 if (retarget_jump(instr, tblock))
1042 return true;
1043 }
1044 }
1045
1046 /* Detect the case where the block ends either with:
1047 * - A single unconditional jump to the next block.
1048 * - Two jump instructions with opposite conditions, and one of the
1049 * them jumps to the next block.
1050 * We can remove the one that jumps to the next block in either case.
1051 */
1052 if (list_is_empty(&block->instr_list))
1053 continue;
1054
1055 struct ir3_instruction *jumps[2] = {NULL, NULL};
1056 jumps[0] =
1057 list_last_entry(&block->instr_list, struct ir3_instruction, node);
1058 if (!list_is_singular(&block->instr_list))
1059 jumps[1] =
1060 list_last_entry(&jumps[0]->node, struct ir3_instruction, node);
1061
1062 if (jumps[0]->opc == OPC_JUMP)
1063 jumps[1] = NULL;
1064 else if (!is_invertible_branch(jumps[0]) || !jumps[1] ||
1065 !is_invertible_branch(jumps[1])) {
1066 continue;
1067 }
1068
1069 for (unsigned i = 0; i < 2; i++) {
1070 if (!jumps[i])
1071 continue;
1072 struct ir3_block *tblock = jumps[i]->cat0.target;
1073 if (&tblock->node == block->node.next) {
1074 list_delinit(&jumps[i]->node);
1075 progress = true;
1076 break;
1077 }
1078 }
1079 }
1080
1081 return progress;
1082 }
1083
1084 static void
1085 resolve_jumps(struct ir3 *ir)
1086 {
1087 foreach_block (block, &ir->block_list)
1088 foreach_instr (instr, &block->instr_list)
1089 if (is_flow(instr) && instr->cat0.target) {
1090 struct ir3_instruction *target = list_first_entry(
1091 &instr->cat0.target->instr_list, struct ir3_instruction, node);
1092
1093 instr->cat0.immed = (int)target->ip - (int)instr->ip;
1094 }
1095 }
1096
1097 static void
1098 mark_jp(struct ir3_block *block)
1099 {
1100 /* We only call this on the end block (in kill_sched) or after retargeting
1101 * all jumps to empty blocks (in mark_xvergence_points) so there's no need to
1102 * worry about empty blocks.
1103 */
1104 assert(!list_is_empty(&block->instr_list));
1105
1106 struct ir3_instruction *target =
1107 list_first_entry(&block->instr_list, struct ir3_instruction, node);
1108 target->flags |= IR3_INSTR_JP;
1109 }
1110
1111 /* Mark points where control flow reconverges.
1112 *
1113 * Re-convergence points are where "parked" threads are reconverged with threads
1114 * that took the opposite path last time around. We already calculated them, we
1115 * just need to mark them with (jp).
1116 */
1117 static void
1118 mark_xvergence_points(struct ir3 *ir)
1119 {
1120 foreach_block (block, &ir->block_list) {
1121 if (block->reconvergence_point)
1122 mark_jp(block);
1123 }
1124 }
1125
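/* Invert the sense of a conditional branch (toggling its inv bits and, for
 * ball/bany and braa/brao, swapping the opcode) and retarget it at the
 * block's second successor.
 */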
1126 static void
1127 invert_branch(struct ir3_instruction *branch)
1128 {
1129 switch (branch->opc) {
1130 case OPC_BR:
1131 break;
1132 case OPC_BALL:
1133 branch->opc = OPC_BANY;
1134 break;
1135 case OPC_BANY:
1136 branch->opc = OPC_BALL;
1137 break;
1138 case OPC_BRAA:
1139 branch->opc = OPC_BRAO;
1140 break;
1141 case OPC_BRAO:
1142 branch->opc = OPC_BRAA;
1143 break;
1144 default:
1145 unreachable("can't get here");
1146 }
1147
1148 branch->cat0.inv1 = !branch->cat0.inv1;
1149 branch->cat0.inv2 = !branch->cat0.inv2;
1150 branch->cat0.target = branch->block->successors[1];
1151 }
1152
1153 /* Insert the branch/jump instructions for flow control between blocks.
1154 * Initially this is done naively, without considering if the successor
1155 * block immediately follows the current block (ie. so no jump required),
1156 * but that is cleaned up in opt_jump().
1157 */
1158 static void
1159 block_sched(struct ir3 *ir)
1160 {
1161 foreach_block (block, &ir->block_list) {
1162 struct ir3_instruction *terminator = ir3_block_get_terminator(block);
1163
1164 if (block->successors[1]) {
1165 /* if/else, conditional branches to "then" or "else": */
1166 struct ir3_instruction *br1, *br2;
1167
1168 assert(terminator);
1169 unsigned opc = terminator->opc;
1170
1171 if (opc == OPC_GETONE || opc == OPC_SHPS || opc == OPC_GETLAST) {
1172 /* getone/shps can't be inverted, and it wouldn't even make sense
1173 * to follow it with an inverted branch, so follow it by an
1174 * unconditional branch.
1175 */
1176 assert(terminator->srcs_count == 0);
1177 br1 = terminator;
1178 br1->cat0.target = block->successors[1];
1179
1180 struct ir3_builder build = ir3_builder_at(ir3_after_block(block));
1181 br2 = ir3_JUMP(&build);
1182 br2->cat0.target = block->successors[0];
1183 } else if (opc == OPC_BR || opc == OPC_BRAA || opc == OPC_BRAO ||
1184 opc == OPC_BALL || opc == OPC_BANY) {
1185 /* create "else" branch first (since "then" block should
1186 * frequently/always end up being a fall-thru):
1187 */
1188 br1 = terminator;
1189 br2 = ir3_instr_clone(br1);
1190 invert_branch(br1);
1191 br2->cat0.target = block->successors[0];
1192 } else {
1193 assert(opc == OPC_PREDT || opc == OPC_PREDF);
1194
1195 /* Handled by prede_sched. */
1196 terminator->cat0.target = block->successors[0];
1197 continue;
1198 }
1199
1200 /* Creating br2 caused it to be moved before the terminator br1, so move it
1201  * back.
1202 */
1203 ir3_instr_move_after(br2, br1);
1204 } else if (block->successors[0]) {
1205 /* otherwise unconditional jump or predt/predf to next block which
1206 * should already have been inserted.
1207 */
1208 assert(terminator);
1209 assert(terminator->opc == OPC_JUMP || terminator->opc == OPC_PREDT ||
1210 terminator->opc == OPC_PREDF);
1211 terminator->cat0.target = block->successors[0];
1212 }
1213 }
1214 }
1215
1216 /* Some gens have a hardware issue that needs to be worked around by
1217  * 1) inserting 4 nops after the second pred[tf] of a pred[tf]/pred[ft] pair
1218  * and/or 2) inserting 6 nops after prede.
1219 *
1220 * This function should be called with the second pred[tf] of such a pair and
1221 * NULL if there is only one pred[tf].
1222 */
1223 static void
1224 add_predication_workaround(struct ir3_compiler *compiler,
1225 struct ir3_instruction *predtf,
1226 struct ir3_instruction *prede)
1227 {
1228 if (predtf && compiler->predtf_nop_quirk) {
1229 struct ir3_builder build = ir3_builder_at(ir3_after_block(predtf->block));
1230 struct ir3_instruction *nop = ir3_NOP(&build);
1231 nop->repeat = 4;
1232 ir3_instr_move_after(nop, predtf);
1233 }
1234
1235 if (compiler->prede_nop_quirk) {
1236 struct ir3_builder build = ir3_builder_at(ir3_after_block(prede->block));
1237 struct ir3_instruction *nop = ir3_NOP(&build);
1238 nop->repeat = 6;
1239 ir3_instr_move_after(nop, prede);
1240 }
1241 }
1242
1243 static void
1244 prede_sched(struct ir3 *ir)
1245 {
1246 unsigned index = 0;
1247 foreach_block (block, &ir->block_list)
1248 block->index = index++;
1249
1250 foreach_block (block, &ir->block_list) {
1251 /* Look for the following pattern generated by NIR lowering. The numbers
1252 * at the top of blocks are their index.
1253 * |--- i ----|
1254 * | ... |
1255 * | pred[tf] |
1256 * |----------|
1257 * succ0 / \ succ1
1258 * |-- i+1 ---| |-- i+2 ---|
1259 * | ... | | ... |
1260 * | pred[ft] | | ... |
1261 * |----------| |----------|
1262 * succ0 \ / succ0
1263 * |--- j ----|
1264 * | ... |
1265 * |----------|
1266 */
1267 struct ir3_block *succ0 = block->successors[0];
1268 struct ir3_block *succ1 = block->successors[1];
1269
1270 if (!succ1)
1271 continue;
1272
1273 struct ir3_instruction *terminator = ir3_block_get_terminator(block);
1274 if (!terminator)
1275 continue;
1276 if (terminator->opc != OPC_PREDT && terminator->opc != OPC_PREDF)
1277 continue;
1278
1279 assert(!succ0->successors[1] && !succ1->successors[1]);
1280 assert(succ0->successors[0] == succ1->successors[0]);
1281 assert(succ0->predecessors_count == 1 && succ1->predecessors_count == 1);
1282 assert(succ0->index == (block->index + 1));
1283 assert(succ1->index == (block->index + 2));
1284
1285 struct ir3_instruction *succ0_terminator =
1286 ir3_block_get_terminator(succ0);
1287 assert(succ0_terminator);
1288 assert(succ0_terminator->opc ==
1289 (terminator->opc == OPC_PREDT ? OPC_PREDF : OPC_PREDT));
1290
1291 ASSERTED struct ir3_instruction *succ1_terminator =
1292 ir3_block_get_terminator(succ1);
1293 assert(!succ1_terminator || (succ1_terminator->opc == OPC_JUMP));
1294
1295 /* Simple case: both successors contain instructions. Keep both blocks and
1296 * insert prede before the second successor's terminator:
1297 * |--- i ----|
1298 * | ... |
1299 * | pred[tf] |
1300 * |----------|
1301 * succ0 / \ succ1
1302 * |-- i+1 ---| |-- i+2 ---|
1303 * | ... | | ... |
1304 * | pred[ft] | | prede |
1305 * |----------| |----------|
1306 * succ0 \ / succ0
1307 * |--- j ----|
1308 * | ... |
1309 * |----------|
1310 */
1311 if (!list_is_empty(&succ1->instr_list)) {
1312 struct ir3_builder build =
1313 ir3_builder_at(ir3_before_terminator(succ1));
1314 struct ir3_instruction *prede = ir3_PREDE(&build);
1315 add_predication_workaround(ir->compiler, succ0_terminator, prede);
1316 continue;
1317 }
1318
1319 /* Second successor is empty so we can remove it:
1320 * |--- i ----|
1321 * | ... |
1322 * | pred[tf] |
1323 * |----------|
1324 * succ0 / \ succ1
1325 * |-- i+1 ---| |
1326 * | ... | |
1327 * | prede | |
1328 * |----------| |
1329 * succ0 \ /
1330 * |--- j ----|
1331 * | ... |
1332 * |----------|
1333 */
1334 list_delinit(&succ0_terminator->node);
1335 struct ir3_builder build = ir3_builder_at(ir3_before_terminator(succ0));
1336 struct ir3_instruction *prede = ir3_PREDE(&build);
1337 add_predication_workaround(ir->compiler, NULL, prede);
1338 remove_unused_block(succ1);
1339 block->successors[1] = succ0->successors[0];
1340 ir3_block_add_predecessor(succ0->successors[0], block);
1341 }
1342 }
1343
1344 /* Here we workaround the fact that kill doesn't actually kill the thread as
1345 * GL expects. The last instruction always needs to be an end instruction,
1346 * which means that if we're stuck in a loop where kill is the only way out,
1347 * then we may have to jump out to the end. kill may also have the d3d
1348 * semantics of converting the thread to a helper thread, rather than setting
1349 * the exec mask to 0, in which case the helper thread could get stuck in an
1350 * infinite loop.
1351 *
1352 * We do this late, both to give the scheduler the opportunity to reschedule
1353 * kill instructions earlier and to avoid having to create a separate basic
1354 * block.
1355 *
1356 * TODO: Assuming that the wavefront doesn't stop as soon as all threads are
1357 * killed, we might benefit by doing this more aggressively when the remaining
1358 * part of the program after the kill is large, since that would let us
1359 * skip over the instructions when there are no non-killed threads left.
1360 */
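/* Concretely: walking the blocks in reverse, once a backwards edge is seen we
 * can no longer prove that execution always reaches the end block, so from
 * that point on every kill gets a "br" on the kill's predicate that targets
 * the last block.
 */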
1361 static void
1362 kill_sched(struct ir3 *ir, struct ir3_shader_variant *so)
1363 {
1364 ir3_count_instructions(ir);
1365
1366 /* True if we know that this block will always eventually lead to the end
1367 * block:
1368 */
1369 bool always_ends = true;
1370 bool added = false;
1371 struct ir3_block *last_block =
1372 list_last_entry(&ir->block_list, struct ir3_block, node);
1373
1374 foreach_block_rev (block, &ir->block_list) {
1375 for (unsigned i = 0; i < 2 && block->successors[i]; i++) {
1376 if (block->successors[i]->start_ip <= block->end_ip)
1377 always_ends = false;
1378 }
1379
1380 if (always_ends)
1381 continue;
1382
1383 foreach_instr_safe (instr, &block->instr_list) {
1384 if (instr->opc != OPC_KILL)
1385 continue;
1386
1387 struct ir3_instruction *br =
1388 ir3_instr_create_at(ir3_after_instr(instr), OPC_BR, 0, 1);
1389 ir3_src_create(br, instr->srcs[0]->num, instr->srcs[0]->flags)->wrmask =
1390 1;
1391 br->cat0.target =
1392 list_last_entry(&ir->block_list, struct ir3_block, node);
1393
1394 added = true;
1395 }
1396 }
1397
1398 if (added) {
1399 /* I'm not entirely sure how the branchstack works, but we probably
1400 * need to add at least one entry for the divergence which is resolved
1401 * at the end:
1402 */
1403 so->branchstack++;
1404
1405 /* We don't update predecessors/successors, so we have to do this
1406 * manually:
1407 */
1408 mark_jp(last_block);
1409 }
1410 }
1411
1412 static void
1413 dbg_sync_sched(struct ir3 *ir, struct ir3_shader_variant *so)
1414 {
1415 foreach_block (block, &ir->block_list) {
1416 foreach_instr_safe (instr, &block->instr_list) {
1417 if (is_ss_producer(instr) || is_sy_producer(instr)) {
1418 struct ir3_builder build = ir3_builder_at(ir3_after_instr(instr));
1419 struct ir3_instruction *nop = ir3_NOP(&build);
1420 nop->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
1421 }
1422 }
1423 }
1424 }
1425
1426 static void
1427 dbg_nop_sched(struct ir3 *ir, struct ir3_shader_variant *so)
1428 {
1429 foreach_block (block, &ir->block_list) {
1430 foreach_instr_safe (instr, &block->instr_list) {
1431 struct ir3_builder build = ir3_builder_at(ir3_before_instr(instr));
1432 struct ir3_instruction *nop = ir3_NOP(&build);
1433 nop->repeat = 5;
1434 }
1435 }
1436 }
1437
1438 static void
1439 dbg_expand_rpt(struct ir3 *ir)
1440 {
1441 foreach_block (block, &ir->block_list) {
1442 foreach_instr_safe (instr, &block->instr_list) {
1443 if (instr->repeat == 0 || instr->opc == OPC_NOP ||
1444 instr->opc == OPC_SWZ || instr->opc == OPC_GAT ||
1445 instr->opc == OPC_SCT) {
1446 continue;
1447 }
1448
1449 for (unsigned i = 0; i <= instr->repeat; ++i) {
1450 struct ir3_instruction *rpt = ir3_instr_clone(instr);
1451 ir3_instr_move_before(rpt, instr);
1452 rpt->repeat = 0;
1453
1454 foreach_dst (dst, rpt) {
1455 dst->num += i;
1456 dst->wrmask = 1;
1457 }
1458
1459 foreach_src (src, rpt) {
1460 if (!(src->flags & IR3_REG_R))
1461 continue;
1462
1463 src->num += i;
1464 src->uim_val += i;
1465 src->wrmask = 1;
1466 src->flags &= ~IR3_REG_R;
1467 }
1468 }
1469
1470 list_delinit(&instr->node);
1471 }
1472 }
1473 }
1474
1475 struct ir3_helper_block_data {
1476 /* Whether helper invocations may be used on any path starting at the
1477 * beginning of the block.
1478 */
1479 bool uses_helpers_beginning;
1480
1481 /* Whether helper invocations may be used by the end of the block. Branch
1482 * instructions are considered to be "between" blocks, because (eq) has to be
1483 * inserted after them in the successor blocks, so branch instructions using
1484 * helpers will result in uses_helpers_end = true for their block.
1485 */
1486 bool uses_helpers_end;
1487 };
1488
1489 /* Insert (eq) after the last instruction using the results of helper
1490 * invocations. Use a backwards dataflow analysis to determine at which points
1491 * in the program helper invocations are definitely never used, and then insert
1492 * (eq) at the point where we cross from a point where they may be used to a
1493 * point where they are never used.
1494 */
1495 static void
1496 helper_sched(struct ir3_legalize_ctx *ctx, struct ir3 *ir,
1497 struct ir3_shader_variant *so)
1498 {
1499 bool non_prefetch_helpers = false;
1500
1501 foreach_block (block, &ir->block_list) {
1502 struct ir3_helper_block_data *bd =
1503 rzalloc(ctx, struct ir3_helper_block_data);
1504 foreach_instr (instr, &block->instr_list) {
1505 if (uses_helpers(instr)) {
1506 bd->uses_helpers_beginning = true;
1507 if (instr->opc != OPC_META_TEX_PREFETCH) {
1508 non_prefetch_helpers = true;
1509 }
1510 }
1511
1512 if (instr->opc == OPC_SHPE) {
1513 /* (eq) is not allowed in preambles, mark the whole preamble as
1514 * requiring helpers to avoid putting it there.
1515 */
1516 bd->uses_helpers_beginning = true;
1517 bd->uses_helpers_end = true;
1518 }
1519 }
1520
1521 struct ir3_instruction *terminator = ir3_block_get_terminator(block);
1522 if (terminator) {
1523 if (terminator->opc == OPC_BALL || terminator->opc == OPC_BANY ||
1524 (terminator->opc == OPC_GETONE &&
1525 (terminator->flags & IR3_INSTR_NEEDS_HELPERS))) {
1526 bd->uses_helpers_beginning = true;
1527 bd->uses_helpers_end = true;
1528 non_prefetch_helpers = true;
1529 }
1530 }
1531
1532 block->data = bd;
1533 }
1534
1535 /* If only prefetches use helpers then we can disable them in the shader via
1536 * a register setting.
1537 */
1538 if (!non_prefetch_helpers) {
1539 so->prefetch_end_of_quad = true;
1540 return;
1541 }
1542
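   /* Backwards dataflow: propagate "helpers may be used" from the beginning of
    * a block to the end (and hence the beginning) of each of its physical
    * predecessors, iterating until a fixed point is reached.
    */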
1543 bool progress;
1544 do {
1545 progress = false;
1546 foreach_block_rev (block, &ir->block_list) {
1547 struct ir3_helper_block_data *bd = block->data;
1548
1549 if (!bd->uses_helpers_beginning)
1550 continue;
1551
1552 for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
1553 struct ir3_block *pred = block->physical_predecessors[i];
1554 struct ir3_helper_block_data *pred_bd = pred->data;
1555 if (!pred_bd->uses_helpers_end) {
1556 pred_bd->uses_helpers_end = true;
1557 }
1558 if (!pred_bd->uses_helpers_beginning) {
1559 pred_bd->uses_helpers_beginning = true;
1560 progress = true;
1561 }
1562 }
1563 }
1564 } while (progress);
1565
1566 /* Now, we need to determine the points where helper invocations become
1567 * unused.
1568 */
1569 foreach_block (block, &ir->block_list) {
1570 struct ir3_helper_block_data *bd = block->data;
1571 if (bd->uses_helpers_end)
1572 continue;
1573
1574 /* We need to check the predecessors because of situations with critical
1575 * edges like this that can occur after optimizing jumps:
1576 *
1577 * br p0.x, #endif
1578 * ...
1579 * sam ...
1580 * ...
1581 * endif:
1582 * ...
1583 * end
1584 *
1585 * The endif block will have uses_helpers_beginning = false and
1586 * uses_helpers_end = false, but because we jump to there from the
1587 * beginning of the if where uses_helpers_end = true, we still want to
1588 * add an (eq) at the beginning of the block:
1589 *
1590 * br p0.x, #endif
1591 * ...
1592 * sam ...
1593 * (eq)nop
1594 * ...
1595 * endif:
1596 * (eq)nop
1597 * ...
1598 * end
1599 *
1600 * This adds an extra nop in the case where the branch isn't taken, but that's
1601 * probably preferable to adding an extra jump instruction which is what
1602 * would happen if we ran this pass before optimizing jumps:
1603 *
1604 * br p0.x, #else
1605 * ...
1606 * sam ...
1607 * (eq)nop
1608 * ...
1609 * jump #endif
1610 * else:
1611 * (eq)nop
1612 * endif:
1613 * ...
1614 * end
1615 *
1616 * We also need this to make sure we insert (eq) after branches which use
1617 * helper invocations.
1618 */
1619 bool pred_uses_helpers = bd->uses_helpers_beginning;
1620 for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
1621 struct ir3_block *pred = block->physical_predecessors[i];
1622 struct ir3_helper_block_data *pred_bd = pred->data;
1623 if (pred_bd->uses_helpers_end) {
1624 pred_uses_helpers = true;
1625 break;
1626 }
1627 }
1628
1629 if (!pred_uses_helpers)
1630 continue;
1631
1632 /* The last use of helpers is somewhere between the beginning and the
1633 * end. first_instr will be the first instruction where helpers are no
1634 * longer required, or NULL if helpers are not required just at the end.
1635 */
1636 struct ir3_instruction *first_instr = NULL;
1637 foreach_instr_rev (instr, &block->instr_list) {
1638 /* Skip prefetches because they actually execute before the block
1639 * starts and at this stage they aren't guaranteed to be at the start
1640 * of the block.
1641 */
1642 if (uses_helpers(instr) && instr->opc != OPC_META_TEX_PREFETCH)
1643 break;
1644 first_instr = instr;
1645 }
1646
1647 bool killed = false;
1648 bool expensive_instruction_in_block = false;
1649 if (first_instr) {
1650 foreach_instr_from (instr, first_instr, &block->instr_list) {
1651 /* If there's already a nop, we don't have to worry about whether to
1652 * insert one.
1653 */
1654 if (instr->opc == OPC_NOP) {
1655 instr->flags |= IR3_INSTR_EQ;
1656 killed = true;
1657 break;
1658 }
1659
1660 /* ALU and SFU instructions probably aren't going to benefit much
1661 * from killing helper invocations, because they complete at least
1662 * an entire quad in a cycle and don't access any quad-divergent
1663 * memory, so delay emitting (eq) in the hopes that we find a nop
1664 * afterwards.
1665 */
1666 if (is_alu(instr) || is_sfu(instr))
1667 continue;
1668 if (instr->opc == OPC_PREDE)
1669 continue;
1670
1671 expensive_instruction_in_block = true;
1672 break;
1673 }
1674 }
1675
1676 /* If this block isn't the last block before the end instruction, assume
1677 * that there may be expensive instructions in later blocks so it's worth
1678 * it to insert a nop.
1679 */
1680 if (!killed && (expensive_instruction_in_block ||
1681 block->successors[0] != ir3_end_block(ir))) {
1682 struct ir3_cursor cursor = first_instr ? ir3_before_instr(first_instr)
1683 : ir3_before_terminator(block);
1684 struct ir3_builder build = ir3_builder_at(cursor);
1685 struct ir3_instruction *nop = ir3_NOP(&build);
1686 nop->flags |= IR3_INSTR_EQ;
1687 }
1688 }
1689 }
1690
1691 bool
1692 ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
1693 {
1694 struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx);
1695 bool mergedregs = so->mergedregs;
1696 bool progress;
1697
1698 ctx->so = so;
1699 ctx->max_bary = -1;
1700 ctx->compiler = ir->compiler;
1701 ctx->type = ir->type;
1702
1703 /* allocate per-block data: */
1704 foreach_block (block, &ir->block_list) {
1705 struct ir3_legalize_block_data *bd =
1706 rzalloc(ctx, struct ir3_legalize_block_data);
1707
1708 regmask_init(&bd->state.needs_ss_war, mergedregs);
1709 regmask_init(&bd->state.needs_ss_or_sy_war, mergedregs);
1710 regmask_init(&bd->state.needs_ss_scalar_war, mergedregs);
1711 regmask_init(&bd->state.needs_ss_or_sy_scalar_war, mergedregs);
1712 regmask_init(&bd->state.needs_ss_scalar_full, mergedregs);
1713 regmask_init(&bd->state.needs_ss_scalar_half, mergedregs);
1714 regmask_init(&bd->state.needs_ss, mergedregs);
1715 regmask_init(&bd->state.needs_sy, mergedregs);
1716 regmask_init(&bd->begin_state.needs_ss_war, mergedregs);
1717 regmask_init(&bd->begin_state.needs_ss_or_sy_war, mergedregs);
1718 regmask_init(&bd->begin_state.needs_ss_scalar_war, mergedregs);
1719 regmask_init(&bd->begin_state.needs_ss_or_sy_scalar_war, mergedregs);
1720 regmask_init(&bd->begin_state.needs_ss_scalar_full, mergedregs);
1721 regmask_init(&bd->begin_state.needs_ss_scalar_half, mergedregs);
1722 regmask_init(&bd->begin_state.needs_ss, mergedregs);
1723 regmask_init(&bd->begin_state.needs_sy, mergedregs);
1724
1725 block->data = bd;
1726 }
1727
1728 /* We may have failed to pull all input loads into the first block.
1729 * In such a case, at the moment we aren't able to find a better place
1730 * for (ei) than the end of the program.
1731 * a5xx and a6xx do automatically release varying storage at the end.
1732 */
1733 ctx->early_input_release = true;
1734
1735 struct ir3_block *start_block = ir3_after_preamble(ir);
1736
1737 /* Gather information to determine whether we can enable early preamble.
1738 */
1739 bool gpr_in_preamble = false;
1740 bool pred_in_preamble = false;
1741 bool relative_in_preamble = false;
1742 bool in_preamble = start_block != ir3_start_block(ir);
1743 bool has_preamble = start_block != ir3_start_block(ir);
1744
1745 foreach_block (block, &ir->block_list) {
1746 if (block == start_block)
1747 in_preamble = false;
1748
1749 foreach_instr (instr, &block->instr_list) {
1750 if (is_input(instr)) {
1751 ctx->has_inputs = true;
1752 if (block != start_block) {
1753 ctx->early_input_release = false;
1754 }
1755 }
1756
1757 if (is_meta(instr))
1758 continue;
1759
1760 foreach_src (reg, instr) {
1761 if (in_preamble) {
1762 if (!(reg->flags & IR3_REG_SHARED) && is_reg_gpr(reg))
1763 gpr_in_preamble = true;
1764 if (reg->flags & IR3_REG_RELATIV)
1765 relative_in_preamble = true;
1766 }
1767 }
1768
1769 foreach_dst (reg, instr) {
1770 if (is_dest_gpr(reg)) {
1771 if (in_preamble) {
1772 if (!(reg->flags & IR3_REG_SHARED))
1773 gpr_in_preamble = true;
1774 if (reg->flags & IR3_REG_RELATIV)
1775 relative_in_preamble = true;
1776 }
1777 }
1778 }
1779
1780 if (in_preamble && writes_pred(instr)) {
1781 pred_in_preamble = true;
1782 }
1783 }
1784 }
1785
1786 so->early_preamble = has_preamble && !gpr_in_preamble &&
1787 !pred_in_preamble && !relative_in_preamble &&
1788 ir->compiler->has_early_preamble &&
1789 !(ir3_shader_debug & IR3_DBG_NOEARLYPREAMBLE);
1790
1791 /* On a7xx, sync behavior for a1.x is different in the early preamble. RaW
1792 * dependencies must be synchronized with (ss), and there must be an extra
1793 * (r) on the source of the mova1 instruction.
1794 */
1795 if (so->early_preamble && ir->compiler->gen >= 7) {
1796 foreach_block (block, &ir->block_list) {
1797 if (block == start_block)
1798 break;
1799 block->in_early_preamble = true;
1800 }
1801 }
1802
1803 assert(ctx->early_input_release || ctx->compiler->gen >= 5);
1804
1805 if (ir3_shader_debug & IR3_DBG_EXPANDRPT) {
1806 dbg_expand_rpt(ir);
1807 }
1808
1809 /* process each block: */
1810 do {
1811 progress = false;
1812 foreach_block (block, &ir->block_list) {
1813 progress |= legalize_block(ctx, block);
1814 }
1815 } while (progress);
1816
1817 *max_bary = ctx->max_bary;
1818
1819 foreach_block (block, &ir->block_list) {
1820 struct ir3_instruction *terminator = ir3_block_get_terminator(block);
1821 if (terminator && terminator->opc == OPC_GETONE) {
1822 apply_push_consts_load_macro(ctx, block->successors[0]);
1823 break;
1824 }
1825 }
1826
1827 block_sched(ir);
1828
1829 foreach_block (block, &ir->block_list) {
1830 progress |= apply_fine_deriv_macro(ctx, block);
1831 }
1832
1833 if (ir3_shader_debug & IR3_DBG_FULLSYNC) {
1834 dbg_sync_sched(ir, so);
1835 }
1836
1837 if (ir3_shader_debug & IR3_DBG_FULLNOP) {
1838 dbg_nop_sched(ir, so);
1839 }
1840
1841 bool cfg_changed = false;
1842 while (opt_jump(ir))
1843 cfg_changed = true;
1844
1845 prede_sched(ir);
1846
1847 if (cfg_changed)
1848 ir3_calc_reconvergence(so);
1849
1850 if (so->type == MESA_SHADER_FRAGMENT)
1851 kill_sched(ir, so);
1852
1853 /* TODO: does (eq) exist before a6xx? */
1854 if (so->type == MESA_SHADER_FRAGMENT && so->need_pixlod &&
1855 so->compiler->gen >= 6)
1856 helper_sched(ctx, ir, so);
1857
1858 foreach_block (block, &ir->block_list) {
1859 progress |= expand_dummy_dests(block);
1860 }
1861
1862 ir3_insert_alias_tex(ir);
1863 ir3_count_instructions(ir);
1864 resolve_jumps(ir);
1865
1866 mark_xvergence_points(ir);
1867
1868 ralloc_free(ctx);
1869
1870 return true;
1871 }
1872