/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/ralloc.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_shader.h"

/*
 * Legalize:
 *
 * The legalize pass ensures sufficient nops and sync flags for
 * correct execution.
 *
 * 1) Iteratively determine where sync ((sy)/(ss)) flags are needed,
 *    based on state flowing out of predecessor blocks, until there is
 *    no further change.  In some cases this requires inserting nops.
 * 2) Mark (ei) on the last varying input, and (ul) on the last use of a0.x
 * 3) Final nop scheduling for instruction latency
 * 4) Resolve jumps and schedule blocks, marking potential convergence
 *    points with (jp)
 */

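/* Pass-wide state: compiler/variant info for the shader being legalized,
 * plus the highest varying input location (inloc) seen, reported back to
 * the caller via max_bary.
 */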
struct ir3_legalize_ctx {
	struct ir3_compiler *compiler;
	struct ir3_shader_variant *so;
	gl_shader_stage type;
	int max_bary;
};

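/* Per-block dataflow state: registers with an outstanding producer that
 * will need an (ss) or (sy) before their next read, and registers with a
 * pending tex/sfu/mem read that need (ss) before being overwritten
 * (write-after-read).
 */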
struct ir3_legalize_state {
	regmask_t needs_ss;
	regmask_t needs_ss_war;       /* write after read */
	regmask_t needs_sy;
};

struct ir3_legalize_block_data {
	bool valid;
	struct ir3_legalize_state state;
};

/* We want to evaluate each block from the perspective of every possible
 * predecessor block, so that the flags set are the union of all possible
 * program paths.
 *
 * To do this, we need to know the output state (needs_ss/ss_war/sy)
 * of all predecessor blocks.  The tricky thing is loops, which mean
 * that we can't simply recursively process each predecessor block
 * before legalizing the current block.
 *
 * How we handle that is by looping over all the blocks until the
 * results converge.  If the output state of a given block changes
 * in a given pass, this means that all successor blocks are not
 * yet fully legalized.
 */

static bool
legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
{
	struct ir3_legalize_block_data *bd = block->data;

	if (bd->valid)
		return false;

	struct ir3_instruction *last_input = NULL;
	struct ir3_instruction *last_rel = NULL;
	struct ir3_instruction *last_n = NULL;
	struct list_head instr_list;
	struct ir3_legalize_state prev_state = bd->state;
	struct ir3_legalize_state *state = &bd->state;
	bool last_input_needs_ss = false;
	bool has_tex_prefetch = false;
	bool mergedregs = ctx->so->mergedregs;

	/* our input state is the OR of all predecessor blocks' state: */
	set_foreach(block->predecessors, entry) {
		struct ir3_block *predecessor = (struct ir3_block *)entry->key;
		struct ir3_legalize_block_data *pbd = predecessor->data;
		struct ir3_legalize_state *pstate = &pbd->state;

		/* Our input (ss)/(sy) state is based on OR'ing the output
		 * state of all our predecessor blocks
		 */
		regmask_or(&state->needs_ss,
				&state->needs_ss, &pstate->needs_ss);
		regmask_or(&state->needs_ss_war,
				&state->needs_ss_war, &pstate->needs_ss_war);
		regmask_or(&state->needs_sy,
				&state->needs_sy, &pstate->needs_sy);
	}

	/* remove all the instructions from the list, we'll be adding
	 * them back in as we go
	 */
	list_replace(&block->instr_list, &instr_list);
	list_inithead(&block->instr_list);

	foreach_instr_safe (n, &instr_list) {
		unsigned i;

		n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);

		/* _meta::tex_prefetch instructions removed later in
		 * collect_tex_prefetches()
		 */
		if (is_meta(n) && (n->opc != OPC_META_TEX_PREFETCH))
			continue;

		if (is_input(n)) {
			struct ir3_register *inloc = n->regs[1];
			assert(inloc->flags & IR3_REG_IMMED);
			ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val);
		}

		if (last_n && is_barrier(last_n)) {
			n->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
			last_input_needs_ss = false;
			regmask_init(&state->needs_ss_war, mergedregs);
			regmask_init(&state->needs_ss, mergedregs);
			regmask_init(&state->needs_sy, mergedregs);
		}

		if (last_n && (last_n->opc == OPC_PREDT)) {
			n->flags |= IR3_INSTR_SS;
			regmask_init(&state->needs_ss_war, mergedregs);
			regmask_init(&state->needs_ss, mergedregs);
		}

		/* NOTE: consider the dst register too.. it could happen that
		 * a texture sample instruction (for example) writes some
		 * components which are unused.  A subsequent instruction
		 * that writes the same register can race w/ the sam instr,
		 * resulting in undefined results:
		 */
		for (i = 0; i < n->regs_count; i++) {
			struct ir3_register *reg = n->regs[i];

			if (reg_gpr(reg)) {

				/* TODO: we probably only need (ss) for alu
				 * instr consuming sfu result.. need to make
				 * some tests for both this and (sy)..
				 */
				if (regmask_get(&state->needs_ss, reg)) {
					n->flags |= IR3_INSTR_SS;
					last_input_needs_ss = false;
					regmask_init(&state->needs_ss_war, mergedregs);
					regmask_init(&state->needs_ss, mergedregs);
				}

				if (regmask_get(&state->needs_sy, reg)) {
					n->flags |= IR3_INSTR_SY;
					regmask_init(&state->needs_sy, mergedregs);
				}
			}

			/* TODO: is it valid to have address reg loaded from a
			 * relative src (ie. mova a0, c<a0.x+4>)?  If so, the
			 * last_rel check below should be moved ahead of this:
			 */
			if (reg->flags & IR3_REG_RELATIV)
				last_rel = n;
		}

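		/* dst checks: overwriting a register with a pending tex/sfu/mem
		 * read needs (ss) first (WAR hazard), and a write to a0.x means
		 * the previous relative access was the last use of the old a0.x
		 * value, so mark it (ul):
		 */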
		if (n->regs_count > 0) {
			struct ir3_register *reg = n->regs[0];
			if (regmask_get(&state->needs_ss_war, reg)) {
				n->flags |= IR3_INSTR_SS;
				last_input_needs_ss = false;
				regmask_init(&state->needs_ss_war, mergedregs);
				regmask_init(&state->needs_ss, mergedregs);
			}

			if (last_rel && (reg->num == regid(REG_A0, 0))) {
				last_rel->flags |= IR3_INSTR_UL;
				last_rel = NULL;
			}
		}

		/* cat5+ does not have an (ss) bit; if needed we insert a nop
		 * to carry the sync flag.  Would be kinda clever if we were
		 * aware of this during scheduling, but this should be a
		 * pretty rare case:
		 */
		if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) {
			struct ir3_instruction *nop;
			nop = ir3_NOP(block);
			nop->flags |= IR3_INSTR_SS;
			n->flags &= ~IR3_INSTR_SS;
		}

		/* need to be able to set (ss) on first instruction: */
		if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
			ir3_NOP(block);

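		/* samgq workaround: replace a single samgq with four samgp0..3
		 * clones (the last two flagged (sy)), dropping the original
		 * instruction:
		 */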
		if (ctx->compiler->samgq_workaround &&
			ctx->type == MESA_SHADER_VERTEX && n->opc == OPC_SAMGQ) {
			struct ir3_instruction *samgp;

			list_delinit(&n->node);

			for (i = 0; i < 4; i++) {
				samgp = ir3_instr_clone(n);
				samgp->opc = OPC_SAMGP0 + i;
				if (i > 1)
					samgp->flags |= IR3_INSTR_SY;
			}
		} else {
			list_addtail(&n->node, &block->instr_list);
		}

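		/* Record the result registers of long-latency instructions so
		 * that later reads get the right sync flag: (ss) for sfu
		 * results, (sy) for tex/load results (with the exceptions
		 * handled below):
		 */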
		if (is_sfu(n))
			regmask_set(&state->needs_ss, n->regs[0]);

		if (is_tex_or_prefetch(n)) {
			regmask_set(&state->needs_sy, n->regs[0]);
			if (n->opc == OPC_META_TEX_PREFETCH)
				has_tex_prefetch = true;
		} else if (n->opc == OPC_RESINFO) {
			regmask_set(&state->needs_ss, n->regs[0]);
			ir3_NOP(block)->flags |= IR3_INSTR_SS;
			last_input_needs_ss = false;
		} else if (is_load(n)) {
			/* seems like ldlv needs (ss) bit instead?  which is odd but
			 * makes a bunch of flat-varying tests start working on a4xx.
			 */
			if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL) || (n->opc == OPC_LDLW))
				regmask_set(&state->needs_ss, n->regs[0]);
			else
				regmask_set(&state->needs_sy, n->regs[0]);
		} else if (is_atomic(n->opc)) {
			if (n->flags & IR3_INSTR_G) {
				if (ctx->compiler->gpu_id >= 600) {
					/* New encoding, returns result via second src: */
					regmask_set(&state->needs_sy, n->regs[3]);
				} else {
					regmask_set(&state->needs_sy, n->regs[0]);
				}
			} else {
				regmask_set(&state->needs_ss, n->regs[0]);
			}
		}

		if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G)))
			ctx->so->has_ssbo = true;

		/* both tex/sfu appear to not always immediately consume
		 * their src register(s):
		 */
		if (is_tex(n) || is_sfu(n) || is_mem(n)) {
			foreach_src (reg, n) {
				if (reg_gpr(reg))
					regmask_set(&state->needs_ss_war, reg);
			}
		}

		if (is_input(n)) {
			last_input = n;
			last_input_needs_ss |= (n->opc == OPC_LDLV);
		}

		last_n = n;
	}

	if (last_input) {
		assert(block == list_first_entry(&block->shader->block_list,
				struct ir3_block, node));
		/* special hack.. if using ldlv to bypass interpolation,
		 * we need to insert a dummy bary.f on which we can set
		 * the (ei) flag:
		 */
		if (is_mem(last_input) && (last_input->opc == OPC_LDLV)) {
			struct ir3_instruction *baryf;

			/* (ss)bary.f (ei)r63.x, 0, r0.x */
			baryf = ir3_instr_create(block, OPC_BARY_F);
			ir3_reg_create(baryf, regid(63, 0), 0);
			ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
			ir3_reg_create(baryf, regid(0, 0), 0);

			/* insert the dummy bary.f after last_input: */
			ir3_instr_move_after(baryf, last_input);

			last_input = baryf;

			/* by definition, we need (ss) since we are inserting
			 * the dummy bary.f immediately after the ldlv:
			 */
			last_input_needs_ss = true;
		}
		last_input->regs[0]->flags |= IR3_REG_EI;
		if (last_input_needs_ss)
			last_input->flags |= IR3_INSTR_SS;
	} else if (has_tex_prefetch) {
		/* texture prefetch, but *no* inputs.. we need to insert a
		 * dummy bary.f at the top of the shader to unblock varying
		 * storage:
		 */
		struct ir3_instruction *baryf;

		/* (ss)bary.f (ei)r63.x, 0, r0.x */
		baryf = ir3_instr_create(block, OPC_BARY_F);
		ir3_reg_create(baryf, regid(63, 0), 0)->flags |= IR3_REG_EI;
		ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
		ir3_reg_create(baryf, regid(0, 0), 0);

		/* insert the dummy bary.f at head: */
		list_delinit(&baryf->node);
		list_add(&baryf->node, &block->instr_list);
	}

	if (last_rel)
		last_rel->flags |= IR3_INSTR_UL;

	bd->valid = true;

	if (memcmp(&prev_state, state, sizeof(*state))) {
		/* our output state changed, this invalidates all of our
		 * successors:
		 */
		for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
			if (!block->successors[i])
				break;
			struct ir3_legalize_block_data *pbd = block->successors[i]->data;
			pbd->valid = false;
		}
	}

	return true;
}

/* Expands dsxpp and dsypp macros to:
 *
 * dsxpp.1 dst, src
 * dsxpp.1.p dst, src
 *
 * We apply this after flags syncing, as we don't want to sync in between the
 * two (which might happen if dst == src).  We do it before nop scheduling
 * because that needs to count actual instructions.
 */
static bool
apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
{
	struct list_head instr_list;

	/* remove all the instructions from the list, we'll be adding
	 * them back in as we go
	 */
	list_replace(&block->instr_list, &instr_list);
	list_inithead(&block->instr_list);

	foreach_instr_safe (n, &instr_list) {
		list_addtail(&n->node, &block->instr_list);

		if (n->opc == OPC_DSXPP_MACRO || n->opc == OPC_DSYPP_MACRO) {
			n->opc = (n->opc == OPC_DSXPP_MACRO) ? OPC_DSXPP_1 : OPC_DSYPP_1;

			struct ir3_instruction *op_p = ir3_instr_clone(n);
			op_p->flags = IR3_INSTR_P;

			ctx->so->need_fine_derivatives = true;
		}
	}

	return true;
}

/* NOTE: branch instructions are always the last instruction(s)
 * in the block.  We take advantage of this as we resolve the
 * branches, since "if (foo) break;" constructs turn into
 * something like:
 *
 *   block3 {
 *   	...
 *   	0029:021: mov.s32s32 r62.x, r1.y
 *   	0082:022: br !p0.x, target=block5
 *   	0083:023: br p0.x, target=block4
 *   	// succs: if _[0029:021: mov.s32s32] block4; else block5;
 *   }
 *   block4 {
 *   	0084:024: jump, target=block6
 *   	// succs: block6;
 *   }
 *   block5 {
 *   	0085:025: jump, target=block7
 *   	// succs: block7;
 *   }
 *
 * ie. the only instruction in block4/block5 is a jump, so when
 * resolving branches we can easily detect this by checking that
 * the first instruction in the target block is itself a jump, and
 * set up the br directly to the jump's target (and strip back out
 * the now unreached jump)
 *
 * TODO sometimes we end up with things like:
 *
 *    br !p0.x, #2
 *    br p0.x, #12
 *    add.u r0.y, r0.y, 1
 *
 * If we swapped the order of the branches, we could drop one.
 */
static struct ir3_block *
resolve_dest_block(struct ir3_block *block)
{
	/* special case for last block: */
	if (!block->successors[0])
		return block;

	/* NOTE that we may or may not have inserted the jump
	 * in the target block yet, so conditions to resolve
	 * the dest to the dest block's successor are:
	 *
	 *   (1) successor[1] == NULL &&
	 *   (2) (block-is-empty || only-instr-is-jump)
	 */
	if (block->successors[1] == NULL) {
		if (list_is_empty(&block->instr_list)) {
			return block->successors[0];
		} else if (list_length(&block->instr_list) == 1) {
			struct ir3_instruction *instr = list_first_entry(
					&block->instr_list, struct ir3_instruction, node);
			if (instr->opc == OPC_JUMP)
				return block->successors[0];
		}
	}
	return block;
}

static void
remove_unused_block(struct ir3_block *old_target)
{
	list_delinit(&old_target->node);

	/* cleanup dangling predecessors: */
	for (unsigned i = 0; i < ARRAY_SIZE(old_target->successors); i++) {
		if (old_target->successors[i]) {
			struct ir3_block *succ = old_target->successors[i];
			_mesa_set_remove_key(succ->predecessors, old_target);
		}
	}
}

static void
retarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target)
{
	struct ir3_block *old_target = instr->cat0.target;
	struct ir3_block *cur_block = instr->block;

	/* update current block's successors to reflect the retargeting: */
	if (cur_block->successors[0] == old_target) {
		cur_block->successors[0] = new_target;
	} else {
		debug_assert(cur_block->successors[1] == old_target);
		cur_block->successors[1] = new_target;
	}

	/* update new target's predecessors: */
	_mesa_set_add(new_target->predecessors, cur_block);

	/* and remove old_target's predecessor: */
	debug_assert(_mesa_set_search(old_target->predecessors, cur_block));
	_mesa_set_remove_key(old_target->predecessors, cur_block);

	if (old_target->predecessors->entries == 0)
		remove_unused_block(old_target);

	instr->cat0.target = new_target;
}

static bool
resolve_jump(struct ir3_instruction *instr)
{
	struct ir3_block *tblock =
		resolve_dest_block(instr->cat0.target);
	struct ir3_instruction *target;

	if (tblock != instr->cat0.target) {
		retarget_jump(instr, tblock);
		return true;
	}

	target = list_first_entry(&tblock->instr_list,
				struct ir3_instruction, node);

	/* TODO maybe a less fragile way to do this.  But we are expecting
	 * a pattern from sched_block() that looks like:
	 *
	 *   br !p0.x, #else-block
	 *   br p0.x, #if-block
	 *
	 * if the first branch target is +2, or if the 2nd branch target is +1,
	 * then we can just drop the jump.
	 */
	unsigned next_block;
	if (instr->cat0.inv == true)
		next_block = 2;
	else
		next_block = 1;

	if (target->ip == (instr->ip + next_block)) {
		list_delinit(&instr->node);
		return true;
	} else {
		instr->cat0.immed =
			(int)target->ip - (int)instr->ip;
	}
	return false;
}

/* Resolve jumps, removing jumps/branches to the immediately following
 * instruction, which we end up with from earlier stages.  Since removing
 * an instruction can invalidate earlier instructions' branch offsets, we
 * need to do this iteratively until no more branches are removed.
 */
static bool
resolve_jumps(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list)
		foreach_instr (instr, &block->instr_list)
			if (is_flow(instr) && instr->cat0.target)
				if (resolve_jump(instr))
					return true;

	return false;
}

static void mark_jp(struct ir3_block *block)
{
	struct ir3_instruction *target = list_first_entry(&block->instr_list,
			struct ir3_instruction, node);
	target->flags |= IR3_INSTR_JP;
}

/* Mark points where control flow converges or diverges.
 *
 * Divergence points could actually be re-convergence points where
 * "parked" threads are re-converged with threads that took the opposite
 * path last time around.  Possibly it is easier to think of (jp) as
 * "the execution mask might have changed".
 */
static void
mark_xvergence_points(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		if (block->predecessors->entries > 1) {
			/* if a block has more than one possible predecessor, then
			 * the first instruction is a convergence point.
			 */
			mark_jp(block);
		} else if (block->predecessors->entries == 1) {
			/* If a block has one predecessor, which has multiple possible
			 * successors, it is a divergence point.
			 */
			set_foreach(block->predecessors, entry) {
				struct ir3_block *predecessor = (struct ir3_block *)entry->key;
				if (predecessor->successors[1]) {
					mark_jp(block);
				}
			}
		}
	}
}

/* Insert the branch/jump instructions for flow control between blocks.
 * Initially this is done naively, without considering if the successor
 * block immediately follows the current block (ie. so no jump required),
 * but that is cleaned up in resolve_jumps().
 *
 * TODO what ensures that the last write to p0.x in a block is the
 * branch condition?  Have we been getting lucky all this time?
 */
static void
block_sched(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		if (block->successors[1]) {
			/* if/else, conditional branches to "then" or "else": */
			struct ir3_instruction *br;

			debug_assert(block->condition);

			/* create "else" branch first (since "then" block should
			 * frequently/always end up being a fall-thru):
			 */
			br = ir3_B(block, block->condition, 0);
			br->cat0.inv = true;
			br->cat0.target = block->successors[1];

			/* "then" branch: */
			br = ir3_B(block, block->condition, 0);
			br->cat0.target = block->successors[0];

		} else if (block->successors[0]) {
			/* otherwise unconditional jump to next block: */
			struct ir3_instruction *jmp;

			jmp = ir3_JUMP(block);
			jmp->cat0.target = block->successors[0];
		}
	}
}

/* Here we work around the fact that kill doesn't actually kill the thread as
 * GL expects. The last instruction always needs to be an end instruction,
 * which means that if we're stuck in a loop where kill is the only way out,
 * then we may have to jump out to the end. kill may also have the d3d
 * semantics of converting the thread to a helper thread, rather than setting
 * the exec mask to 0, in which case the helper thread could get stuck in an
 * infinite loop.
 *
 * We do this late, both to give the scheduler the opportunity to reschedule
 * kill instructions earlier and to avoid having to create a separate basic
 * block.
 *
 * TODO: Assuming that the wavefront doesn't stop as soon as all threads are
 * killed, we might benefit by doing this more aggressively when the remaining
 * part of the program after the kill is large, since that would let us
 * skip over the instructions when there are no non-killed threads left.
 */
static void
kill_sched(struct ir3 *ir, struct ir3_shader_variant *so)
{
	/* True if we know that this block will always eventually lead to the end
	 * block:
	 */
	bool always_ends = true;
	bool added = false;
	struct ir3_block *last_block =
		list_last_entry(&ir->block_list, struct ir3_block, node);

	foreach_block_rev (block, &ir->block_list) {
		for (unsigned i = 0; i < 2 && block->successors[i]; i++) {
			if (block->successors[i]->start_ip <= block->end_ip)
				always_ends = false;
		}

		if (always_ends)
			continue;

		foreach_instr_safe (instr, &block->instr_list) {
			if (instr->opc != OPC_KILL)
				continue;

			struct ir3_instruction *br = ir3_instr_create(block, OPC_B);
			br->regs[1] = instr->regs[1];
			br->cat0.target =
				list_last_entry(&ir->block_list, struct ir3_block, node);

			list_del(&br->node);
			list_add(&br->node, &instr->node);

			added = true;
		}
	}

	if (added) {
		/* I'm not entirely sure how the branchstack works, but we probably
		 * need to add at least one entry for the divergence which is resolved
		 * at the end:
		 */
		so->branchstack++;

		/* We don't update predecessors/successors, so we have to do this
		 * manually:
		 */
		mark_jp(last_block);
	}
}

/* Insert nops required to make this a legal/valid shader program: */
static void
nop_sched(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		struct ir3_instruction *last = NULL;
		struct list_head instr_list;

		/* remove all the instructions from the list, we'll be adding
		 * them back in as we go
		 */
		list_replace(&block->instr_list, &instr_list);
		list_inithead(&block->instr_list);

		foreach_instr_safe (instr, &instr_list) {
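			/* nop cycles ir3_delay_calc() reports are still required
			 * before this instruction; absorbed into the previous
			 * instruction's nopN/repeat fields or emitted as explicit
			 * nops below:
			 */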
			unsigned delay = ir3_delay_calc(block, instr, false, true);

			/* NOTE: I think the nopN encoding works for a5xx and
			 * probably a4xx, but not a3xx.  So far only tested on
			 * a6xx.
			 */

			if ((delay > 0) && (ir->compiler->gpu_id >= 600) && last &&
					((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3)) &&
					(last->repeat == 0)) {
				/* the previous cat2/cat3 instruction can encode at most 3 nops: */
				unsigned transfer = MIN2(delay, 3 - last->nop);
				last->nop += transfer;
				delay -= transfer;
			}

			if ((delay > 0) && last && (last->opc == OPC_NOP)) {
				/* the previous nop can encode at most 5 repeats: */
				unsigned transfer = MIN2(delay, 5 - last->repeat);
				last->repeat += transfer;
				delay -= transfer;
			}

			if (delay > 0) {
				debug_assert(delay <= 6);
				ir3_NOP(block)->repeat = delay - 1;
			}

			list_addtail(&instr->node, &block->instr_list);
			last = instr;
		}
	}
}

bool
ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
{
	struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx);
	bool mergedregs = so->mergedregs;
	bool progress;

	ctx->so = so;
	ctx->max_bary = -1;
	ctx->compiler = ir->compiler;
	ctx->type = ir->type;

	/* allocate per-block data: */
	foreach_block (block, &ir->block_list) {
		struct ir3_legalize_block_data *bd =
				rzalloc(ctx, struct ir3_legalize_block_data);

		regmask_init(&bd->state.needs_ss_war, mergedregs);
		regmask_init(&bd->state.needs_ss, mergedregs);
		regmask_init(&bd->state.needs_sy, mergedregs);

		block->data = bd;
	}

	ir3_remove_nops(ir);

	/* process each block: */
	do {
		progress = false;
		foreach_block (block, &ir->block_list) {
			progress |= legalize_block(ctx, block);
		}
	} while (progress);

	*max_bary = ctx->max_bary;

	block_sched(ir);
	if (so->type == MESA_SHADER_FRAGMENT)
		kill_sched(ir, so);

	foreach_block (block, &ir->block_list) {
		progress |= apply_fine_deriv_macro(ctx, block);
	}

	nop_sched(ir);

	do {
		ir3_count_instructions(ir);
	} while (resolve_jumps(ir));

	mark_xvergence_points(ir);

	ralloc_free(ctx);

	return true;
}