/*
 * Copyright (C) 2019 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "ir3.h"

/*
 * Helpers to figure out the necessary delay slots between instructions.  Used
 * both in scheduling pass(es) and the final pass to insert any required nop's
 * so that the shader program is valid.
 *
 * Note that this needs to work both pre and post RA, so we can't assume ssa
 * src iterators work.
 */

/* Generally we don't count false dependencies, since they can just be
 * something like a barrier or an SSBO store.  The exception is array
 * dependencies, where the assigner is an array write and the consumer
 * reads the same array.
 */
static bool
ignore_dep(struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned n)
{
	if (!__is_false_dep(consumer, n))
		return false;

	if (assigner->barrier_class & IR3_BARRIER_ARRAY_W) {
		struct ir3_register *dst = assigner->regs[0];

		debug_assert(dst->flags & IR3_REG_ARRAY);

		foreach_src (src, consumer) {
			if ((src->flags & IR3_REG_ARRAY) &&
					(dst->array.id == src->array.id)) {
				return false;
			}
		}
	}

	return true;
}
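
/* Illustrative example (not from this file): a false dep used purely to
 * order a consumer after an SSBO store or barrier is ignored here, since
 * that ordering is enforced by other means (e.g. sync flags).  But a dep
 * from an array write (IR3_BARRIER_ARRAY_W) to a consumer src with the
 * same array.id is kept, so real delay slots get counted for the array
 * access.
 */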

/* calculate required # of delay slots between the instruction that
 * assigns a value and the one that consumes it:
 */
int
ir3_delayslots(struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned n, bool soft)
{
	if (ignore_dep(assigner, consumer, n))
		return 0;

	/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles; normal
	 * alu -> alu needs 3 cycles, and cat4 -> alu and texture fetch
	 * are handled with sync bits
	 */

	if (is_meta(assigner) || is_meta(consumer))
		return 0;

	if (writes_addr0(assigner) || writes_addr1(assigner))
		return 6;

	/* On a6xx, the number of delay slots needed to get an SFU result
	 * back (ie. using nop's instead of (ss)) is:
	 *
	 *     8 - single warp
	 *     9 - two warps
	 *    10 - four warps
	 *
	 * and so on.  Not quite sure where it tapers out (ie. how many
	 * warps share an SFU unit).  But 10 seems like a reasonable #
	 * to choose:
	 */
	if (soft && is_sfu(assigner))
		return 10;

	/* handled via sync flags: */
	if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
		return 0;

	/* assigner must be alu: */
	if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
			is_mem(consumer)) {
		return 6;
	} else if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) &&
			(n == 3)) {
		/* special case, 3rd src to cat3 not required on first cycle */
		return 1;
	} else {
		return 3;
	}
}
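
/* Worked example of the rules above: an alu instruction feeding a flow,
 * sfu, tex, or mem consumer needs 6 delay slots; alu feeding alu needs 3,
 * unless the value is the 3rd src of a cat3 (mad/madsh), where a single
 * slot suffices since that src is not needed on the first cycle.
 */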

static bool
count_instruction(struct ir3_instruction *n)
{
	/* NOTE: don't count branch/jump since we don't know yet if they will
	 * be eliminated later in resolve_jumps().. really should do that
	 * earlier so we don't have this constraint.
	 */
	return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
}

/**
 * @block: the block to search in, starting from end; in the first pass,
 *    this will be the block the instruction would be inserted into
 *    (but has not yet been, ie. it only contains already scheduled
 *    instructions).  For the inter-block search (second pass), this
 *    would be one of the predecessor blocks.
 * @instr: the instruction to search for
 * @maxd:  max distance; bail after searching this # of instruction
 *    slots, since it means the instruction we are looking for is
 *    far enough away
 * @pred:  if true, recursively search into predecessor blocks to
 *    find the worst case (shortest) distance (only possible after
 *    individual blocks are all scheduled)
 */
static unsigned
distance(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned maxd, bool pred)
{
	unsigned d = 0;

	/* Note that this relies on incrementally building up the block's
	 * instruction list.. but this is how scheduling and nopsched
	 * work.
	 */
	foreach_instr_rev (n, &block->instr_list) {
		if ((n == instr) || (d >= maxd))
			return MIN2(maxd, d + n->nop);
		if (count_instruction(n))
			d = MIN2(maxd, d + 1 + n->repeat + n->nop);
	}

	/* if coming from a predecessor block, assume it is assigned far
	 * enough away.. we'll fix up later.
	 */
	if (!pred)
		return maxd;

	if (block->data != block) {
		/* Search into predecessor blocks, finding the one with the
		 * shortest distance, since that will be the worst case
		 */
		unsigned min = maxd - d;

		/* (ab)use block->data to prevent recursion: */
		block->data = block;

		set_foreach (block->predecessors, entry) {
			struct ir3_block *pred_block = (struct ir3_block *)entry->key;
			unsigned n;

			n = distance(pred_block, instr, min, pred);

			min = MIN2(min, n);
		}

		block->data = NULL;
		d += min;
	}

	return d;
}
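
/* Worked example: if the block's instruction list ends with
 * "assigner; a; b" (a and b being counted, plain instructions with no
 * (rptN) or encoded nop's), distance(block, assigner, 6, false) walks
 * b (d=1) then a (d=2) and returns 2; (rptN) repeats and encoded nop
 * cycles on the intervening instructions would increase d accordingly.
 */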

/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_block *block,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer,
		unsigned srcn, bool soft, bool pred)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		foreach_src_n (src, n, assigner) {
			unsigned d;

			if (!src->instr)
				continue;

			d = delay_calc_srcn(block, src->instr, consumer, srcn, soft, pred);

			/* A (rptN) instruction executes in consecutive cycles so
			 * its outputs are written in successive cycles.  And
			 * likewise for its (r)'d (incremented) inputs, which are
			 * read on successive cycles.
			 *
			 * So we need to adjust the delay for (rptN)'s assigners
			 * and consumers accordingly.
			 *
			 * Note that the dst of a (rptN) instruction is implicitly
			 * (r) (the assigner case), although that is not the case
			 * for src registers.  There is exactly one case, bary.f,
			 * which has a vecN (collect) src that is not (r)'d.
			 */
			if ((assigner->opc == OPC_META_SPLIT) && src->instr->repeat) {
				/* (rptN) assigner case: */
				d -= MIN2(d, src->instr->repeat - assigner->split.off);
			} else if ((assigner->opc == OPC_META_COLLECT) && consumer->repeat &&
					(consumer->regs[srcn]->flags & IR3_REG_R)) {
				d -= MIN2(d, n);
			}

			delay = MAX2(delay, d);
		}
	} else {
		delay = ir3_delayslots(assigner, consumer, srcn, soft);
		delay -= distance(block, assigner, delay, pred);
	}

	return delay;
}
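
/* Worked example of the (rptN) adjustment above: a (rpt2) instruction
 * writes its three dst components on successive cycles, so a split with
 * split.off == 0 consumes a value that was ready repeat - off == 2
 * cycles before the instruction completed, and the remaining delay is
 * reduced by that amount.
 */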

static struct ir3_instruction *
find_array_write(struct ir3_block *block, unsigned array_id, unsigned maxd)
{
	unsigned d = 0;

	/* Note that this relies on incrementally building up the block's
	 * instruction list.. but this is how scheduling and nopsched
	 * work.
	 */
	foreach_instr_rev (n, &block->instr_list) {
		if (d >= maxd)
			return NULL;
		if (count_instruction(n))
			d++;
		if (dest_regs(n) == 0)
			continue;

		/* note that a dest reg will never be an immediate */
		if (n->regs[0]->array.id == array_id)
			return n;
	}

	return NULL;
}

/* like list_length() but only counts instructions which count in the
 * delay determination:
 */
static unsigned
count_block_delay(struct ir3_block *block)
{
	unsigned delay = 0;
	foreach_instr (n, &block->instr_list) {
		if (!count_instruction(n))
			continue;
		delay++;
	}
	return delay;
}

static unsigned
delay_calc_array(struct ir3_block *block, unsigned array_id,
		struct ir3_instruction *consumer, unsigned srcn,
		bool soft, bool pred, unsigned maxd)
{
	struct ir3_instruction *assigner;

	assigner = find_array_write(block, array_id, maxd);
	if (assigner)
		return delay_calc_srcn(block, assigner, consumer, srcn, soft, pred);

	if (!pred)
		return 0;

	unsigned len = count_block_delay(block);
	if (maxd <= len)
		return 0;

	maxd -= len;

	if (block->data == block) {
		/* we have a loop, return worst case: */
		return maxd;
	}

	/* If we need to search into predecessors, find the one with the
	 * max delay.. the resulting delay is that minus the number of
	 * counted instructions in this block:
	 */
	unsigned max = 0;

	/* (ab)use block->data to prevent recursion: */
	block->data = block;

	set_foreach (block->predecessors, entry) {
		struct ir3_block *pred_block = (struct ir3_block *)entry->key;
		unsigned delay =
			delay_calc_array(pred_block, array_id, consumer, srcn, soft, pred, maxd);

		max = MAX2(max, delay);
	}

	block->data = NULL;

	if (max < len)
		return 0;

	return max - len;
}
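
/* Note: callers only ever pass a small maxd here (see the maxd=6 in
 * ir3_delay_calc()), since an array write further back than the worst
 * case alu delay cannot require any additional nop's.
 */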

/**
 * Calculate delay for instruction (maximum of delay for all srcs):
 *
 * @soft:  If true, add additional delay for situations where a sync
 *    flag would be used and extra delay slots are therefore not
 *    strictly required (but the scheduler would prefer to schedule
 *    some other instructions first, to avoid stalling on the sync
 *    flag)
 * @pred:  If true, recurse into predecessor blocks
 */
unsigned
ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
		bool soft, bool pred)
{
	unsigned delay = 0;

	foreach_src_n (src, i, instr) {
		unsigned d = 0;

		if ((src->flags & IR3_REG_RELATIV) && !(src->flags & IR3_REG_CONST)) {
			d = delay_calc_array(block, src->array.id, instr, i+1, soft, pred, 6);
		} else if (src->instr) {
			d = delay_calc_srcn(block, src->instr, instr, i+1, soft, pred);
		}

		delay = MAX2(delay, d);
	}

	if (instr->address) {
		unsigned d = delay_calc_srcn(block, instr->address, instr, 0, soft, pred);
		delay = MAX2(delay, d);
	}

	return delay;
}
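
/* Usage sketch (assumed from how the rest of ir3 fits together, not
 * verbatim from any caller): the scheduler passes soft=true to estimate
 * how long a candidate instruction would stall, while legalize passes
 * soft=false and emits the hard delay as nop's, roughly:
 *
 *    unsigned delay = ir3_delay_calc(block, instr, false, true);
 *    if (delay > 0)
 *       ir3_NOP(block)->repeat = delay - 1;
 *
 * (ir3_NOP() is the ir3 instruction builder for nop; real callers also
 * cap the (rptN) count at the hw limit.)
 */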

/**
 * Remove nop instructions.  The scheduler can insert placeholder nop's
 * so that ir3_delay_calc() can account for nop's that won't be needed
 * due to nop's triggered by a previous instruction.  However, before
 * legalize, we want to remove these.  The legalize pass can insert
 * some nop's if needed to hold (for example) sync flags.  Any nop's
 * still required after that are inserted by legalize itself.
 */
void
ir3_remove_nops(struct ir3 *ir)
{
	foreach_block (block, &ir->block_list) {
		foreach_instr_safe (instr, &block->instr_list) {
			if (instr->opc == OPC_NOP) {
				list_del(&instr->node);
			}
		}
	}
}