1 /*
2 * Copyright (C) 2019 Google, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Rob Clark <robclark@freedesktop.org>
25 */
26
27 #include "ir3.h"
28
29 /*
30 * Helpers to figure out the necessary delay slots between instructions. Used
31 * both in scheduling pass(es) and the final pass to insert any required nop's
32 * so that the shader program is valid.
33 *
34 * Note that this needs to work both pre and post RA, so we can't assume ssa
35 * src iterators work.
36 */
37
38 /* generally don't count false dependencies, since this can just be
39 * something like a barrier, or SSBO store. The exception is array
40 * dependencies if the assigner is an array write and the consumer
41 * reads the same array.
42 */
43 static bool
ignore_dep(struct ir3_instruction * assigner,struct ir3_instruction * consumer,unsigned n)44 ignore_dep(struct ir3_instruction *assigner,
45 struct ir3_instruction *consumer, unsigned n)
46 {
47 if (!__is_false_dep(consumer, n))
48 return false;
49
50 if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) {
51 struct ir3_register *dst = assigner->regs[0];
52
53 debug_assert(dst->flags & IR3_REG_ARRAY);
54
55 foreach_src (src, consumer) {
56 if ((src->flags & IR3_REG_ARRAY) &&
57 (dst->array.id == src->array.id)) {
58 return false;
59 }
60 }
61 }
62
63 return true;
64 }
65
66 /* calculate required # of delay slots between the instruction that
67 * assigns a value and the one that consumes
68 */
69 int
ir3_delayslots(struct ir3_instruction * assigner,struct ir3_instruction * consumer,unsigned n,bool soft)70 ir3_delayslots(struct ir3_instruction *assigner,
71 struct ir3_instruction *consumer, unsigned n, bool soft)
72 {
73 if (ignore_dep(assigner, consumer, n))
74 return 0;
75
76 /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
77 * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
78 * handled with sync bits
79 */
80
81 if (is_meta(assigner) || is_meta(consumer))
82 return 0;
83
84 if (writes_addr0(assigner) || writes_addr1(assigner))
85 return 6;
86
87 /* On a6xx, it takes the number of delay slots to get a SFU result
88 * back (ie. using nop's instead of (ss) is:
89 *
90 * 8 - single warp
91 * 9 - two warps
92 * 10 - four warps
93 *
94 * and so on. Not quite sure where it tapers out (ie. how many
95 * warps share an SFU unit). But 10 seems like a reasonable #
96 * to choose:
97 */
98 if (soft && is_sfu(assigner))
99 return 10;
100
101 /* handled via sync flags: */
102 if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
103 return 0;
104
105 /* assigner must be alu: */
106 if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
107 is_mem(consumer)) {
108 return 6;
109 } else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
110 (n == 3)) {
111 /* special case, 3rd src to cat3 not required on first cycle */
112 return 1;
113 } else {
114 return 3;
115 }
116 }
117
118 static bool
count_instruction(struct ir3_instruction * n)119 count_instruction(struct ir3_instruction *n)
120 {
121 /* NOTE: don't count branch/jump since we don't know yet if they will
122 * be eliminated later in resolve_jumps().. really should do that
123 * earlier so we don't have this constraint.
124 */
125 return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
126 }
127
128 /**
129 * @block: the block to search in, starting from end; in first pass,
130 * this will be the block the instruction would be inserted into
131 * (but has not yet, ie. it only contains already scheduled
132 * instructions). For intra-block scheduling (second pass), this
133 * would be one of the predecessor blocks.
134 * @instr: the instruction to search for
135 * @maxd: max distance, bail after searching this # of instruction
136 * slots, since it means the instruction we are looking for is
137 * far enough away
138 * @pred: if true, recursively search into predecessor blocks to
139 * find the worst case (shortest) distance (only possible after
140 * individual blocks are all scheduled)
141 */
142 static unsigned
distance(struct ir3_block * block,struct ir3_instruction * instr,unsigned maxd,bool pred)143 distance(struct ir3_block *block, struct ir3_instruction *instr,
144 unsigned maxd, bool pred)
145 {
146 unsigned d = 0;
147
148 /* Note that this relies on incrementally building up the block's
149 * instruction list.. but this is how scheduling and nopsched
150 * work.
151 */
152 foreach_instr_rev (n, &block->instr_list) {
153 if ((n == instr) || (d >= maxd))
154 return MIN2(maxd, d + n->nop);
155 if (count_instruction(n))
156 d = MIN2(maxd, d + 1 + n->repeat + n->nop);
157 }
158
159 /* if coming from a predecessor block, assume it is assigned far
160 * enough away.. we'll fix up later.
161 */
162 if (!pred)
163 return maxd;
164
165 if (pred && (block->data != block)) {
166 /* Search into predecessor blocks, finding the one with the
167 * shortest distance, since that will be the worst case
168 */
169 unsigned min = maxd - d;
170
171 /* (ab)use block->data to prevent recursion: */
172 block->data = block;
173
174 set_foreach (block->predecessors, entry) {
175 struct ir3_block *pred = (struct ir3_block *)entry->key;
176 unsigned n;
177
178 n = distance(pred, instr, min, pred);
179
180 min = MIN2(min, n);
181 }
182
183 block->data = NULL;
184 d += min;
185 }
186
187 return d;
188 }
189
190 /* calculate delay for specified src: */
191 static unsigned
delay_calc_srcn(struct ir3_block * block,struct ir3_instruction * assigner,struct ir3_instruction * consumer,unsigned srcn,bool soft,bool pred)192 delay_calc_srcn(struct ir3_block *block,
193 struct ir3_instruction *assigner,
194 struct ir3_instruction *consumer,
195 unsigned srcn, bool soft, bool pred)
196 {
197 unsigned delay = 0;
198
199 if (is_meta(assigner)) {
200 foreach_src_n (src, n, assigner) {
201 unsigned d;
202
203 if (!src->instr)
204 continue;
205
206 d = delay_calc_srcn(block, src->instr, consumer, srcn, soft, pred);
207
208 /* A (rptN) instruction executes in consecutive cycles so
209 * it's outputs are written in successive cycles. And
210 * likewise for it's (r)'d (incremented) inputs, they are
211 * read on successive cycles.
212 *
213 * So we need to adjust the delay for (rptN)'s assigners
214 * and consumers accordingly.
215 *
216 * Note that the dst of a (rptN) instruction is implicitly
217 * (r) (the assigner case), although that is not the case
218 * for src registers. There is exactly one case, bary.f,
219 * which has a vecN (collect) src that is not (r)'d.
220 */
221 if ((assigner->opc == OPC_META_SPLIT) && src->instr->repeat) {
222 /* (rptN) assigner case: */
223 d -= MIN2(d, src->instr->repeat - assigner->split.off);
224 } else if ((assigner->opc == OPC_META_COLLECT) && consumer->repeat &&
225 (consumer->regs[srcn]->flags & IR3_REG_R)) {
226 d -= MIN2(d, n);
227 }
228
229 delay = MAX2(delay, d);
230 }
231 } else {
232 delay = ir3_delayslots(assigner, consumer, srcn, soft);
233 delay -= distance(block, assigner, delay, pred);
234 }
235
236 return delay;
237 }
238
239 static struct ir3_instruction *
find_array_write(struct ir3_block * block,unsigned array_id,unsigned maxd)240 find_array_write(struct ir3_block *block, unsigned array_id, unsigned maxd)
241 {
242 unsigned d = 0;
243
244 /* Note that this relies on incrementally building up the block's
245 * instruction list.. but this is how scheduling and nopsched
246 * work.
247 */
248 foreach_instr_rev (n, &block->instr_list) {
249 if (d >= maxd)
250 return NULL;
251 if (count_instruction(n))
252 d++;
253 if (dest_regs(n) == 0)
254 continue;
255
256 /* note that a dest reg will never be an immediate */
257 if (n->regs[0]->array.id == array_id)
258 return n;
259 }
260
261 return NULL;
262 }
263
264 /* like list_length() but only counts instructions which count in the
265 * delay determination:
266 */
267 static unsigned
count_block_delay(struct ir3_block * block)268 count_block_delay(struct ir3_block *block)
269 {
270 unsigned delay = 0;
271 foreach_instr (n, &block->instr_list) {
272 if (!count_instruction(n))
273 continue;
274 delay++;
275 }
276 return delay;
277 }
278
279 static unsigned
delay_calc_array(struct ir3_block * block,unsigned array_id,struct ir3_instruction * consumer,unsigned srcn,bool soft,bool pred,unsigned maxd)280 delay_calc_array(struct ir3_block *block, unsigned array_id,
281 struct ir3_instruction *consumer, unsigned srcn,
282 bool soft, bool pred, unsigned maxd)
283 {
284 struct ir3_instruction *assigner;
285
286 assigner = find_array_write(block, array_id, maxd);
287 if (assigner)
288 return delay_calc_srcn(block, assigner, consumer, srcn, soft, pred);
289
290 if (!pred)
291 return 0;
292
293 unsigned len = count_block_delay(block);
294 if (maxd <= len)
295 return 0;
296
297 maxd -= len;
298
299 if (block->data == block) {
300 /* we have a loop, return worst case: */
301 return maxd;
302 }
303
304 /* If we need to search into predecessors, find the one with the
305 * max delay.. the resulting delay is that minus the number of
306 * counted instructions in this block:
307 */
308 unsigned max = 0;
309
310 /* (ab)use block->data to prevent recursion: */
311 block->data = block;
312
313 set_foreach (block->predecessors, entry) {
314 struct ir3_block *pred = (struct ir3_block *)entry->key;
315 unsigned delay =
316 delay_calc_array(pred, array_id, consumer, srcn, soft, pred, maxd);
317
318 max = MAX2(max, delay);
319 }
320
321 block->data = NULL;
322
323 if (max < len)
324 return 0;
325
326 return max - len;
327 }
328
329 /**
330 * Calculate delay for instruction (maximum of delay for all srcs):
331 *
332 * @soft: If true, add additional delay for situations where they
333 * would not be strictly required because a sync flag would be
334 * used (but scheduler would prefer to schedule some other
335 * instructions first to avoid stalling on sync flag)
336 * @pred: If true, recurse into predecessor blocks
337 */
338 unsigned
ir3_delay_calc(struct ir3_block * block,struct ir3_instruction * instr,bool soft,bool pred)339 ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
340 bool soft, bool pred)
341 {
342 unsigned delay = 0;
343
344 foreach_src_n (src, i, instr) {
345 unsigned d = 0;
346
347 if ((src->flags & IR3_REG_RELATIV) && !(src->flags & IR3_REG_CONST)) {
348 d = delay_calc_array(block, src->array.id, instr, i+1, soft, pred, 6);
349 } else if (src->instr) {
350 d = delay_calc_srcn(block, src->instr, instr, i+1, soft, pred);
351 }
352
353 delay = MAX2(delay, d);
354 }
355
356 if (instr->address) {
357 unsigned d = delay_calc_srcn(block, instr->address, instr, 0, soft, pred);
358 delay = MAX2(delay, d);
359 }
360
361 return delay;
362 }
363
364 /**
365 * Remove nop instructions. The scheduler can insert placeholder nop's
366 * so that ir3_delay_calc() can account for nop's that won't be needed
367 * due to nop's triggered by a previous instruction. However, before
368 * legalize, we want to remove these. The legalize pass can insert
369 * some nop's if needed to hold (for example) sync flags. This final
370 * remaining nops are inserted by legalize after this.
371 */
372 void
ir3_remove_nops(struct ir3 * ir)373 ir3_remove_nops(struct ir3 *ir)
374 {
375 foreach_block (block, &ir->block_list) {
376 foreach_instr_safe (instr, &block->instr_list) {
377 if (instr->opc == OPC_NOP) {
378 list_del(&instr->node);
379 }
380 }
381 }
382
383 }
384