1 /*
2  * Copyright (C) 2019 Google, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors:
24  *    Rob Clark <robclark@freedesktop.org>
25  */
26 
27 
28 #include "util/dag.h"
29 #include "util/u_math.h"
30 
31 #include "ir3.h"
32 #include "ir3_compiler.h"
33 #include "ir3_context.h"
34 
/* Scheduler tracing is only compiled in for DEBUG builds, and even then
 * only fires when the IR3_DBG_SCHEDMSGS bit is set in ir3_shader_debug.
 */
#if defined(DEBUG)
#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
#else
#define SCHED_DEBUG 0
#endif

/* Print a "PSCHED: " prefixed, newline terminated trace message: */
#define d(fmt, ...) \
	do { \
		if (SCHED_DEBUG) { \
			printf("PSCHED: "fmt"\n", ##__VA_ARGS__); \
		} \
	} while (0)

/* Print a "PSCHED: " prefixed trace message, and then hand the
 * instruction to ir3_print_instr():
 */
#define di(instr, fmt, ...) \
	do { \
		if (SCHED_DEBUG) { \
			printf("PSCHED: "fmt": ", ##__VA_ARGS__); \
			ir3_print_instr(instr); \
		} \
	} while (0)
48 
49 /*
50  * Post RA Instruction Scheduling
51  */
52 
53 struct ir3_postsched_ctx {
54 	struct ir3 *ir;
55 
56 	struct ir3_shader_variant *v;
57 
58 	void *mem_ctx;
59 	struct ir3_block *block;           /* the current block */
60 	struct dag *dag;
61 
62 	struct list_head unscheduled_list; /* unscheduled instructions */
63 
64 	int sfu_delay;
65 	int tex_delay;
66 };
67 
68 struct ir3_postsched_node {
69 	struct dag_node dag;     /* must be first for util_dynarray_foreach */
70 	struct ir3_instruction *instr;
71 	bool partially_evaluated_path;
72 
73 	unsigned delay;
74 	unsigned max_delay;
75 };
76 
/* Walk the sched nodes that are linked (via their embedded dag.link)
 * into the given list, such as the dag's list of heads:
 */
#define foreach_sched_node(__n, __list) \
	list_for_each_entry(struct ir3_postsched_node, __n, __list, dag.link)

/* Assign to 'b' each successive u_bit_scan() result for a private copy
 * of 'mask', until the copy is exhausted.  The mask is debug_assert()ed
 * to be non-zero.
 */
#define foreach_bit(b, mask) \
	for (uint32_t _m = ({ debug_assert((mask) >= 1); (mask); }); \
	     _m && ({ (b) = u_bit_scan(&_m); 1; }); )
82 
83 static void
schedule(struct ir3_postsched_ctx * ctx,struct ir3_instruction * instr)84 schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
85 {
86 	debug_assert(ctx->block == instr->block);
87 
88 	/* remove from unscheduled_list:
89 	 */
90 	list_delinit(&instr->node);
91 
92 	di(instr, "schedule");
93 
94 	list_addtail(&instr->node, &instr->block->instr_list);
95 
96 	struct ir3_postsched_node *n = instr->data;
97 	dag_prune_head(ctx->dag, &n->dag);
98 
99 	if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
100 		return;
101 
102 	if (is_sfu(instr)) {
103 		ctx->sfu_delay = 8;
104 	} else if (check_src_cond(instr, is_sfu)) {
105 		ctx->sfu_delay = 0;
106 	} else if (ctx->sfu_delay > 0) {
107 		ctx->sfu_delay--;
108 	}
109 
110 	if (is_tex_or_prefetch(instr)) {
111 		ctx->tex_delay = 10;
112 	} else if (check_src_cond(instr, is_tex_or_prefetch)) {
113 		ctx->tex_delay = 0;
114 	} else if (ctx->tex_delay > 0) {
115 		ctx->tex_delay--;
116 	}
117 }
118 
119 static void
dump_state(struct ir3_postsched_ctx * ctx)120 dump_state(struct ir3_postsched_ctx *ctx)
121 {
122 	if (!SCHED_DEBUG)
123 		return;
124 
125 	foreach_sched_node (n, &ctx->dag->heads) {
126 		di(n->instr, "maxdel=%3d    ", n->max_delay);
127 
128 		util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
129 			struct ir3_postsched_node *child =
130 				(struct ir3_postsched_node *)edge->child;
131 
132 			di(child->instr, " -> (%d parents) ", child->dag.parent_count);
133 		}
134 	}
135 }
136 
137 /* Determine if this is an instruction that we'd prefer not to schedule
138  * yet, in order to avoid an (ss) sync.  This is limited by the sfu_delay
139  * counter, ie. the more cycles it has been since the last SFU, the less
140  * costly a sync would be.
141  */
142 static bool
would_sync(struct ir3_postsched_ctx * ctx,struct ir3_instruction * instr)143 would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
144 {
145 	if (ctx->sfu_delay) {
146 		if (check_src_cond(instr, is_sfu))
147 			return true;
148 	}
149 
150 	if (ctx->tex_delay) {
151 		if (check_src_cond(instr, is_tex_or_prefetch))
152 			return true;
153 	}
154 
155 	return false;
156 }
157 
158 /* find instruction to schedule: */
159 static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx * ctx)160 choose_instr(struct ir3_postsched_ctx *ctx)
161 {
162 	struct ir3_postsched_node *chosen = NULL;
163 
164 	dump_state(ctx);
165 
166 	foreach_sched_node (n, &ctx->dag->heads) {
167 		if (!is_meta(n->instr))
168 			continue;
169 
170 		if (!chosen || (chosen->max_delay < n->max_delay))
171 			chosen = n;
172 	}
173 
174 	if (chosen) {
175 		di(chosen->instr, "prio: chose (meta)");
176 		return chosen->instr;
177 	}
178 
179 	/* Try to schedule inputs with a higher priority, if possible, as
180 	 * the last bary.f unlocks varying storage to unblock more VS
181 	 * warps.
182 	 */
183 	foreach_sched_node (n, &ctx->dag->heads) {
184 		if (!is_input(n->instr))
185 			continue;
186 
187 		if (!chosen || (chosen->max_delay < n->max_delay))
188 			chosen = n;
189 	}
190 
191 	if (chosen) {
192 		di(chosen->instr, "prio: chose (input)");
193 		return chosen->instr;
194 	}
195 
196 	/* Next prioritize discards: */
197 	foreach_sched_node (n, &ctx->dag->heads) {
198 		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
199 
200 		if (d > 0)
201 			continue;
202 
203 		if (!is_kill(n->instr))
204 			continue;
205 
206 		if (!chosen || (chosen->max_delay < n->max_delay))
207 			chosen = n;
208 	}
209 
210 	if (chosen) {
211 		di(chosen->instr, "csp: chose (kill, hard ready)");
212 		return chosen->instr;
213 	}
214 
215 	/* Next prioritize expensive instructions: */
216 	foreach_sched_node (n, &ctx->dag->heads) {
217 		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
218 
219 		if (d > 0)
220 			continue;
221 
222 		if (!(is_sfu(n->instr) || is_tex(n->instr)))
223 			continue;
224 
225 		if (!chosen || (chosen->max_delay < n->max_delay))
226 			chosen = n;
227 	}
228 
229 	if (chosen) {
230 		di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
231 		return chosen->instr;
232 	}
233 
234 	/*
235 	 * Sometimes be better to take a nop, rather than scheduling an
236 	 * instruction that would require an (ss) shortly after another
237 	 * SFU..  ie. if last SFU was just one or two instr ago, and we
238 	 * could choose between taking a nop and then scheduling
239 	 * something else, vs scheduling the immed avail instruction that
240 	 * would require (ss), we are better with the nop.
241 	 */
242 	for (unsigned delay = 0; delay < 4; delay++) {
243 		foreach_sched_node (n, &ctx->dag->heads) {
244 			if (would_sync(ctx, n->instr))
245 				continue;
246 
247 			unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);
248 
249 			if (d > delay)
250 				continue;
251 
252 			if (!chosen || (chosen->max_delay < n->max_delay))
253 				chosen = n;
254 		}
255 
256 		if (chosen) {
257 			di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
258 			return chosen->instr;
259 		}
260 	}
261 
262 	/* Next try to find a ready leader w/ soft delay (ie. including extra
263 	 * delay for things like tex fetch which can be synchronized w/ sync
264 	 * bit (but we probably do want to schedule some other instructions
265 	 * while we wait)
266 	 */
267 	foreach_sched_node (n, &ctx->dag->heads) {
268 		unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);
269 
270 		if (d > 0)
271 			continue;
272 
273 		if (!chosen || (chosen->max_delay < n->max_delay))
274 			chosen = n;
275 	}
276 
277 	if (chosen) {
278 		di(chosen->instr, "csp: chose (soft ready)");
279 		return chosen->instr;
280 	}
281 
282 	/* Next try to find a ready leader that can be scheduled without nop's,
283 	 * which in the case of things that need (sy)/(ss) could result in
284 	 * stalls.. but we've already decided there is not a better option.
285 	 */
286 	foreach_sched_node (n, &ctx->dag->heads) {
287 		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
288 
289 		if (d > 0)
290 			continue;
291 
292 		if (!chosen || (chosen->max_delay < n->max_delay))
293 			chosen = n;
294 	}
295 
296 	if (chosen) {
297 		di(chosen->instr, "csp: chose (hard ready)");
298 		return chosen->instr;
299 	}
300 
301 	/* Otherwise choose leader with maximum cost:
302 	 *
303 	 * TODO should we try to balance cost and delays?  I guess it is
304 	 * a balance between now-nop's and future-nop's?
305 	 */
306 	foreach_sched_node (n, &ctx->dag->heads) {
307 		if (!chosen || chosen->max_delay < n->max_delay)
308 			chosen = n;
309 	}
310 
311 	if (chosen) {
312 		di(chosen->instr, "csp: chose (leader)");
313 		return chosen->instr;
314 	}
315 
316 	return NULL;
317 }
318 
/* State for a single dependency-calculation walk over a block. */
struct ir3_postsched_deps_state {
	struct ir3_postsched_ctx *ctx;

	/* F for the forward walk over the block, R for the reverse walk: */
	enum { F, R } direction;

	/* do half and full precision registers share (conflict in) one
	 * register file?
	 */
	bool merged;

	/* Maps a register to the sched node (instruction) that last wrote
	 * it, in whichever direction the block is being walked.
	 *
	 * The table has twice as many slots as there are regs, in order to
	 * cope with half-precision regs.  How the slots are used depends on
	 * whether the half and full precision register files are "merged"
	 * (conflicting, ie. a6xx+):
	 *
	 *  - merged: slots are indexed in half-reg units, and each full
	 *    precision reg is treated as two half-precision regs
	 *  - separate (older, non-conflicting): the first half of the
	 *    table holds full precision regs and the 2nd half holds
	 *    half-precision regs
	 */
	struct ir3_postsched_node *regs[2 * 256];
};
340 
/* Bounds-checked accessor for the state's regs[] table, usable both as
 * an rvalue and as the target of an assignment.  The table lives on the
 * stack, so an out-of-bounds access would be a particularly bad day.
 */
#define dep_reg(state, idx) \
	(*({ \
		assert((idx) < ARRAY_SIZE((state)->regs)); \
		&(state)->regs[(idx)]; \
	}))
348 
349 static void
add_dep(struct ir3_postsched_deps_state * state,struct ir3_postsched_node * before,struct ir3_postsched_node * after)350 add_dep(struct ir3_postsched_deps_state *state,
351 		struct ir3_postsched_node *before,
352 		struct ir3_postsched_node *after)
353 {
354 	if (!before || !after)
355 		return;
356 
357 	assert(before != after);
358 
359 	if (state->direction == F) {
360 		dag_add_edge(&before->dag, &after->dag, NULL);
361 	} else {
362 		dag_add_edge(&after->dag, &before->dag, NULL);
363 	}
364 }
365 
366 static void
add_single_reg_dep(struct ir3_postsched_deps_state * state,struct ir3_postsched_node * node,unsigned num,bool write)367 add_single_reg_dep(struct ir3_postsched_deps_state *state,
368 		struct ir3_postsched_node *node, unsigned num, bool write)
369 {
370 	add_dep(state, dep_reg(state, num), node);
371 	if (write) {
372 		dep_reg(state, num) = node;
373 	}
374 }
375 
376 /* This is where we handled full vs half-precision, and potential conflicts
377  * between half and full precision that result in additional dependencies.
378  * The 'reg' arg is really just to know half vs full precision.
379  */
380 static void
add_reg_dep(struct ir3_postsched_deps_state * state,struct ir3_postsched_node * node,const struct ir3_register * reg,unsigned num,bool write)381 add_reg_dep(struct ir3_postsched_deps_state *state,
382 		struct ir3_postsched_node *node, const struct ir3_register *reg,
383 		unsigned num, bool write)
384 {
385 	if (state->merged) {
386 		if (reg->flags & IR3_REG_HALF) {
387 			/* single conflict in half-reg space: */
388 			add_single_reg_dep(state, node, num, write);
389 		} else {
390 			/* two conflicts in half-reg space: */
391 			add_single_reg_dep(state, node, 2 * num + 0, write);
392 			add_single_reg_dep(state, node, 2 * num + 1, write);
393 		}
394 	} else {
395 		if (reg->flags & IR3_REG_HALF)
396 			num += ARRAY_SIZE(state->regs) / 2;
397 		add_single_reg_dep(state, node, num, write);
398 	}
399 }
400 
401 static void
calculate_deps(struct ir3_postsched_deps_state * state,struct ir3_postsched_node * node)402 calculate_deps(struct ir3_postsched_deps_state *state,
403 		struct ir3_postsched_node *node)
404 {
405 	int b;
406 
407 	/* Add dependencies on instructions that previously (or next,
408 	 * in the reverse direction) wrote any of our src registers:
409 	 */
410 	foreach_src_n (reg, i, node->instr) {
411 		if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
412 			continue;
413 
414 		if (reg->flags & IR3_REG_RELATIV) {
415 			/* mark entire array as read: */
416 			struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
417 			for (unsigned i = 0; i < arr->length; i++) {
418 				add_reg_dep(state, node, reg, arr->reg + i, false);
419 			}
420 		} else {
421 			foreach_bit (b, reg->wrmask) {
422 				add_reg_dep(state, node, reg, reg->num + b, false);
423 
424 				struct ir3_postsched_node *dep = dep_reg(state, reg->num + b);
425 				if (dep && (state->direction == F)) {
426 					unsigned d = ir3_delayslots(dep->instr, node->instr, i, true);
427 					node->delay = MAX2(node->delay, d);
428 				}
429 			}
430 		}
431 	}
432 
433 	if (node->instr->address) {
434 		add_reg_dep(state, node, node->instr->address->regs[0],
435 					node->instr->address->regs[0]->num,
436 					false);
437 	}
438 
439 	if (dest_regs(node->instr) == 0)
440 		return;
441 
442 	/* And then after we update the state for what this instruction
443 	 * wrote:
444 	 */
445 	struct ir3_register *reg = node->instr->regs[0];
446 	if (reg->flags & IR3_REG_RELATIV) {
447 		/* mark the entire array as written: */
448 		struct ir3_array *arr = ir3_lookup_array(state->ctx->ir, reg->array.id);
449 		for (unsigned i = 0; i < arr->length; i++) {
450 			add_reg_dep(state, node, reg, arr->reg + i, true);
451 		}
452 	} else {
453 		foreach_bit (b, reg->wrmask) {
454 			add_reg_dep(state, node, reg, reg->num + b, true);
455 		}
456 	}
457 }
458 
459 static void
calculate_forward_deps(struct ir3_postsched_ctx * ctx)460 calculate_forward_deps(struct ir3_postsched_ctx *ctx)
461 {
462 	struct ir3_postsched_deps_state state = {
463 			.ctx = ctx,
464 			.direction = F,
465 			.merged = ctx->v->mergedregs,
466 	};
467 
468 	foreach_instr (instr, &ctx->unscheduled_list) {
469 		calculate_deps(&state, instr->data);
470 	}
471 }
472 
473 static void
calculate_reverse_deps(struct ir3_postsched_ctx * ctx)474 calculate_reverse_deps(struct ir3_postsched_ctx *ctx)
475 {
476 	struct ir3_postsched_deps_state state = {
477 			.ctx = ctx,
478 			.direction = R,
479 			.merged = ctx->v->mergedregs,
480 	};
481 
482 	foreach_instr_rev (instr, &ctx->unscheduled_list) {
483 		calculate_deps(&state, instr->data);
484 	}
485 }
486 
487 static void
sched_node_init(struct ir3_postsched_ctx * ctx,struct ir3_instruction * instr)488 sched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
489 {
490 	struct ir3_postsched_node *n = rzalloc(ctx->mem_ctx, struct ir3_postsched_node);
491 
492 	dag_init_node(ctx->dag, &n->dag);
493 
494 	n->instr = instr;
495 	instr->data = n;
496 }
497 
498 static void
sched_dag_max_delay_cb(struct dag_node * node,void * state)499 sched_dag_max_delay_cb(struct dag_node *node, void *state)
500 {
501 	struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
502 	uint32_t max_delay = 0;
503 
504 	util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
505 		struct ir3_postsched_node *child = (struct ir3_postsched_node *)edge->child;
506 		max_delay = MAX2(child->max_delay, max_delay);
507 	}
508 
509 	n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
510 }
511 
512 static void
sched_dag_init(struct ir3_postsched_ctx * ctx)513 sched_dag_init(struct ir3_postsched_ctx *ctx)
514 {
515 	ctx->mem_ctx = ralloc_context(NULL);
516 
517 	ctx->dag = dag_create(ctx->mem_ctx);
518 
519 	foreach_instr (instr, &ctx->unscheduled_list)
520 		sched_node_init(ctx, instr);
521 
522 	calculate_forward_deps(ctx);
523 	calculate_reverse_deps(ctx);
524 
525 	/*
526 	 * To avoid expensive texture fetches, etc, from being moved ahead
527 	 * of kills, track the kills we've seen so far, so we can add an
528 	 * extra dependency on them for tex/mem instructions
529 	 */
530 	struct util_dynarray kills;
531 	util_dynarray_init(&kills, ctx->mem_ctx);
532 
533 	/*
534 	 * Normal srcs won't be in SSA at this point, those are dealt with in
535 	 * calculate_forward_deps() and calculate_reverse_deps().  But we still
536 	 * have the false-dep information in SSA form, so go ahead and add
537 	 * dependencies for that here:
538 	 */
539 	foreach_instr (instr, &ctx->unscheduled_list) {
540 		struct ir3_postsched_node *n = instr->data;
541 
542 		foreach_ssa_src_n (src, i, instr) {
543 			if (src->block != instr->block)
544 				continue;
545 
546 			/* we can end up with unused false-deps.. just skip them: */
547 			if (src->flags & IR3_INSTR_UNUSED)
548 				continue;
549 
550 			struct ir3_postsched_node *sn = src->data;
551 
552 			/* don't consider dependencies in other blocks: */
553 			if (src->block != instr->block)
554 				continue;
555 
556 			dag_add_edge(&sn->dag, &n->dag, NULL);
557 		}
558 
559 		if (is_kill(instr)) {
560 			util_dynarray_append(&kills, struct ir3_instruction *, instr);
561 		} else if (is_tex(instr) || is_mem(instr)) {
562 			util_dynarray_foreach(&kills, struct ir3_instruction *, instrp) {
563 				struct ir3_instruction *kill = *instrp;
564 				struct ir3_postsched_node *kn = kill->data;
565 				dag_add_edge(&kn->dag, &n->dag, NULL);
566 			}
567 		}
568 	}
569 
570 	// TODO do we want to do this after reverse-dependencies?
571 	dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
572 }
573 
574 static void
sched_dag_destroy(struct ir3_postsched_ctx * ctx)575 sched_dag_destroy(struct ir3_postsched_ctx *ctx)
576 {
577 	ralloc_free(ctx->mem_ctx);
578 	ctx->mem_ctx = NULL;
579 	ctx->dag = NULL;
580 }
581 
582 static void
sched_block(struct ir3_postsched_ctx * ctx,struct ir3_block * block)583 sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
584 {
585 	ctx->block = block;
586 	ctx->tex_delay = 0;
587 	ctx->sfu_delay = 0;
588 
589 	/* move all instructions to the unscheduled list, and
590 	 * empty the block's instruction list (to which we will
591 	 * be inserting).
592 	 */
593 	list_replace(&block->instr_list, &ctx->unscheduled_list);
594 	list_inithead(&block->instr_list);
595 
596 	// TODO once we are using post-sched for everything we can
597 	// just not stick in NOP's prior to post-sched, and drop this.
598 	// for now keep this, since it makes post-sched optional:
599 	foreach_instr_safe (instr, &ctx->unscheduled_list) {
600 		switch (instr->opc) {
601 		case OPC_NOP:
602 		case OPC_B:
603 		case OPC_JUMP:
604 			list_delinit(&instr->node);
605 			break;
606 		default:
607 			break;
608 		}
609 	}
610 
611 	sched_dag_init(ctx);
612 
613 	/* First schedule all meta:input instructions, followed by
614 	 * tex-prefetch.  We want all of the instructions that load
615 	 * values into registers before the shader starts to go
616 	 * before any other instructions.  But in particular we
617 	 * want inputs to come before prefetches.  This is because
618 	 * a FS's bary_ij input may not actually be live in the
619 	 * shader, but it should not be scheduled on top of any
620 	 * other input (but can be overwritten by a tex prefetch)
621 	 */
622 	foreach_instr_safe (instr, &ctx->unscheduled_list)
623 		if (instr->opc == OPC_META_INPUT)
624 			schedule(ctx, instr);
625 
626 	foreach_instr_safe (instr, &ctx->unscheduled_list)
627 		if (instr->opc == OPC_META_TEX_PREFETCH)
628 			schedule(ctx, instr);
629 
630 	while (!list_is_empty(&ctx->unscheduled_list)) {
631 		struct ir3_instruction *instr = choose_instr(ctx);
632 
633 		unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
634 		d("delay=%u", delay);
635 
636 		/* and if we run out of instructions that can be scheduled,
637 		 * then it is time for nop's:
638 		 */
639 		debug_assert(delay <= 6);
640 		while (delay > 0) {
641 			ir3_NOP(block);
642 			delay--;
643 		}
644 
645 		schedule(ctx, instr);
646 	}
647 
648 	sched_dag_destroy(ctx);
649 }
650 
651 
652 static bool
is_self_mov(struct ir3_instruction * instr)653 is_self_mov(struct ir3_instruction *instr)
654 {
655 	if (!is_same_type_mov(instr))
656 		return false;
657 
658 	if (instr->regs[0]->num != instr->regs[1]->num)
659 		return false;
660 
661 	if (instr->regs[0]->flags & IR3_REG_RELATIV)
662 		return false;
663 
664 	if (instr->regs[1]->flags & (IR3_REG_CONST | IR3_REG_IMMED |
665 			IR3_REG_RELATIV | IR3_REG_FNEG | IR3_REG_FABS |
666 			IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT |
667 			IR3_REG_EVEN | IR3_REG_POS_INF))
668 		return false;
669 
670 	return true;
671 }
672 
673 /* sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y
674  * as a result of places were before RA we are not sure that it is
675  * safe to eliminate.  We could eliminate these earlier, but sometimes
676  * they are tangled up in false-dep's, etc, so it is easier just to
677  * let them exist until after RA
678  */
679 static void
cleanup_self_movs(struct ir3 * ir)680 cleanup_self_movs(struct ir3 *ir)
681 {
682 	foreach_block (block, &ir->block_list) {
683 		foreach_instr_safe (instr, &block->instr_list) {
684 
685 			foreach_src (reg, instr) {
686 				if (!reg->instr)
687 					continue;
688 
689 				if (is_self_mov(reg->instr)) {
690 					list_delinit(®->instr->node);
691 					reg->instr = reg->instr->regs[1]->instr;
692 				}
693 			}
694 
695 			for (unsigned i = 0; i < instr->deps_count; i++) {
696 				if (instr->deps[i] && is_self_mov(instr->deps[i])) {
697 					list_delinit(&instr->deps[i]->node);
698 					instr->deps[i] = instr->deps[i]->regs[1]->instr;
699 				}
700 			}
701 		}
702 	}
703 }
704 
705 bool
ir3_postsched(struct ir3 * ir,struct ir3_shader_variant * v)706 ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
707 {
708 	struct ir3_postsched_ctx ctx = {
709 			.ir = ir,
710 			.v  = v,
711 	};
712 
713 	ir3_remove_nops(ir);
714 	cleanup_self_movs(ir);
715 
716 	foreach_block (block, &ir->block_list) {
717 		sched_block(&ctx, block);
718 	}
719 
720 	return true;
721 }
722