1 /*
2 * Copyright © 2019 Google, Inc.
3 * SPDX-License-Identifier: MIT
4 *
5 * Authors:
6 * Rob Clark <robclark@freedesktop.org>
7 */
8
9 #include "ir3.h"
10
11 #include "ir3_compiler.h"
12
13 /* The maximum number of nop's we may need to insert between two instructions.
14 */
15 #define MAX_NOPS 6
16
17 /*
18 * Helpers to figure out the necessary delay slots between instructions. Used
19 * both in scheduling pass(es) and the final pass to insert any required nop's
20 * so that the shader program is valid.
21 *
22 * Note that this needs to work both pre and post RA, so we can't assume ssa
23 * src iterators work.
24 */
25
26 /* Return the number of cycles from the start of the instruction until src_n is
27 * read.
28 */
29 unsigned
ir3_src_read_delay(struct ir3_compiler * compiler,struct ir3_instruction * instr,unsigned src_n)30 ir3_src_read_delay(struct ir3_compiler *compiler, struct ir3_instruction *instr,
31 unsigned src_n)
32 {
33 /* gat and swz have scalar sources and each source is read in a subsequent
34 * cycle.
35 */
36 if (instr->opc == OPC_GAT || instr->opc == OPC_SWZ) {
37 return src_n;
38 }
39
40 /* cat3 instructions consume their last source one or two cycles later. Note
41 * that not all cat3 instructions seem to do this pre-a7xx.
42 */
43 bool cat3_reads_later = compiler->gen >= 7
44 ? (opc_cat(instr->opc) == 3)
45 : (is_mad(instr->opc) || is_madsh(instr->opc));
46 if (cat3_reads_later && src_n == 2) {
47 return compiler->delay_slots.cat3_src2_read;
48 }
49
50 return 0;
51 }
52
53 /* calculate required # of delay slots between the instruction that
54 * assigns a value and the one that consumes
55 */
56 int
ir3_delayslots(struct ir3_compiler * compiler,struct ir3_instruction * assigner,struct ir3_instruction * consumer,unsigned n,bool soft)57 ir3_delayslots(struct ir3_compiler *compiler,
58 struct ir3_instruction *assigner,
59 struct ir3_instruction *consumer, unsigned n, bool soft)
60 {
61 /* generally don't count false dependencies, since this can just be
62 * something like a barrier, or SSBO store.
63 */
64 if (__is_false_dep(consumer, n))
65 return 0;
66
67 /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
68 * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
69 * handled with sync bits
70 */
71
72 if (is_meta(assigner) || is_meta(consumer))
73 return 0;
74
75 if (writes_addr0(assigner) || writes_addr1(assigner))
76 return compiler->delay_slots.non_alu;
77
78 if (soft && needs_ss(compiler, assigner, consumer))
79 return soft_ss_delay(assigner);
80
81 /* handled via sync flags: */
82 if (needs_ss(compiler, assigner, consumer) ||
83 is_sy_producer(assigner))
84 return 0;
85
86 /* scalar ALU -> scalar ALU depdendencies where the source and destination
87 * register sizes match don't require any nops.
88 */
89 if (is_scalar_alu(assigner, compiler)) {
90 assert(is_scalar_alu(consumer, compiler));
91 /* If the sizes don't match then we need (ss) and needs_ss() should've
92 * returned above.
93 */
94 assert((assigner->dsts[0]->flags & IR3_REG_HALF) ==
95 (consumer->srcs[n]->flags & IR3_REG_HALF));
96 return 0;
97 }
98
99 /* As far as we know, shader outputs don't need any delay. */
100 if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
101 return 0;
102
103 /* assigner must be alu: */
104 if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
105 is_mem(consumer)) {
106 return compiler->delay_slots.non_alu;
107 } else {
108 /* In mergedregs mode, there is an extra 2-cycle penalty when half of
109 * a full-reg is read as a half-reg or when a half-reg is read as a
110 * full-reg.
111 */
112 bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) !=
113 (consumer->srcs[n]->flags & IR3_REG_HALF);
114 unsigned penalty = mismatched_half ? 3 : 0;
115 return compiler->delay_slots.alu_to_alu + penalty -
116 ir3_src_read_delay(compiler, consumer, n);
117 }
118 }
119
/* Like ir3_delayslots(), but additionally accounts for (rpt) on either the
 * assigner or the consumer: a repeated instruction behaves as a sequence of
 * sub-instructions, so only the sub-instructions whose registers actually
 * conflict constrain the required gap.
 *
 * NOTE(review): this uses post_ra_reg_num(), so register numbers are
 * interpreted as assigned (post-RA) values — confirm callers only use this
 * after RA.
 */
unsigned
ir3_delayslots_with_repeat(struct ir3_compiler *compiler,
                           struct ir3_instruction *assigner,
                           struct ir3_instruction *consumer,
                           unsigned assigner_n, unsigned consumer_n)
{
   /* Baseline, rpt-unaware delay (hard delay, soft=false). */
   unsigned delay = ir3_delayslots(compiler, assigner, consumer, consumer_n, false);

   struct ir3_register *src = consumer->srcs[consumer_n];
   struct ir3_register *dst = assigner->dsts[assigner_n];

   /* No (rpt) on either side: nothing to adjust. */
   if (assigner->repeat == 0 && consumer->repeat == 0)
      return delay;

   /* Byte offsets of the first element of each register. */
   unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
   unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);

   /* If either side is a relative access, we can't really apply most of the
    * reasoning below because we don't know which component aliases which.
    * Just bail in this case.
    */
   if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
      return delay;

   /* MOVMSK seems to require that all users wait until the entire
    * instruction is finished, so just bail here.
    */
   if (assigner->opc == OPC_MOVMSK)
      return delay;

   /* TODO: Handle the combination of (rpt) and different component sizes
    * better like below. This complicates things significantly because the
    * components don't line up.
    */
   if ((src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF))
      return delay;

   /* If an instruction has a (rpt), then it acts as a sequence of
    * instructions, reading its non-(r) sources at each cycle. First, get the
    * register num for the first instruction where they interfere:
    */

   unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);

   /* Now, for that first conflicting half/full register, figure out the
    * sub-instruction within assigner/consumer it corresponds to. For (r)
    * sources, this should already return the correct answer of 0. However we
    * have to special-case the multi-mov instructions, where the
    * sub-instructions sometimes come from the src/dst indices instead.
    */
   unsigned first_src_instr;
   if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT)
      first_src_instr = consumer_n;
   else
      first_src_instr = first_num - src->num;

   unsigned first_dst_instr;
   if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT)
      first_dst_instr = assigner_n;
   else
      first_dst_instr = first_num - dst->num;

   /* The delay we return is relative to the *end* of assigner and the
    * *beginning* of consumer, because it's the number of nops (or other
    * things) needed between them. Any instructions after first_dst_instr
    * subtract from the delay, and so do any instructions before
    * first_src_instr. Calculate an offset to subtract from the non-rpt-aware
    * delay to account for that.
    *
    * Now, a priori, we need to go through this process for every
    * conflicting regnum and take the minimum of the offsets to make sure
    * that the appropriate number of nop's is inserted for every conflicting
    * pair of sub-instructions. However, as we go to the next conflicting
    * regnum (if any), the number of instructions after first_dst_instr
    * decreases by 1 and the number of source instructions before
    * first_src_instr correspondingly increases by 1, so the offset stays the
    * same for all conflicting registers.
    */
   unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
   /* Clamp at 0: the rpt overlap can never make the delay negative. */
   return offset > delay ? 0 : delay - offset;
}
201
202