• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2019 Google, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors:
24  *    Rob Clark <robclark@freedesktop.org>
25  */
26 
27 #include "ir3.h"
28 
29 /* The maximum number of nop's we may need to insert between two instructions.
30  */
31 #define MAX_NOPS 6
32 
33 /*
34  * Helpers to figure out the necessary delay slots between instructions.  Used
35  * both in scheduling pass(es) and the final pass to insert any required nop's
36  * so that the shader program is valid.
37  *
38  * Note that this needs to work both pre and post RA, so we can't assume ssa
39  * src iterators work.
40  */
41 
42 /* calculate required # of delay slots between the instruction that
43  * assigns a value and the one that consumes
44  */
45 int
ir3_delayslots(struct ir3_instruction * assigner,struct ir3_instruction * consumer,unsigned n,bool soft)46 ir3_delayslots(struct ir3_instruction *assigner,
47                struct ir3_instruction *consumer, unsigned n, bool soft)
48 {
49    /* generally don't count false dependencies, since this can just be
50     * something like a barrier, or SSBO store.
51     */
52    if (__is_false_dep(consumer, n))
53       return 0;
54 
55    /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
56     * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
57     * handled with sync bits
58     */
59 
60    if (is_meta(assigner) || is_meta(consumer))
61       return 0;
62 
63    if (writes_addr0(assigner) || writes_addr1(assigner))
64       return 6;
65 
66    if (soft && is_ss_producer(assigner))
67       return soft_ss_delay(assigner);
68 
69    /* handled via sync flags: */
70    if (is_ss_producer(assigner) || is_sy_producer(assigner))
71       return 0;
72 
73    /* As far as we know, shader outputs don't need any delay. */
74    if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
75       return 0;
76 
77    /* assigner must be alu: */
78    if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
79        is_mem(consumer)) {
80       return 6;
81    } else {
82       /* In mergedregs mode, there is an extra 2-cycle penalty when half of
83        * a full-reg is read as a half-reg or when a half-reg is read as a
84        * full-reg.
85        */
86       bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) !=
87                              (consumer->srcs[n]->flags & IR3_REG_HALF);
88       unsigned penalty = mismatched_half ? 3 : 0;
89       if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && (n == 2)) {
90          /* special case, 3rd src to cat3 not required on first cycle */
91          return 1 + penalty;
92       } else {
93          return 3 + penalty;
94       }
95    }
96 }
97 
98 static bool
count_instruction(struct ir3_instruction * n)99 count_instruction(struct ir3_instruction *n)
100 {
101    /* NOTE: don't count branch/jump since we don't know yet if they will
102     * be eliminated later in resolve_jumps().. really should do that
103     * earlier so we don't have this constraint.
104     */
105    return is_alu(n) ||
106           (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
107 }
108 
109 /* Post-RA, we don't have arrays any more, so we have to be a bit careful here
110  * and have to handle relative accesses specially.
111  */
112 
113 static unsigned
post_ra_reg_elems(struct ir3_register * reg)114 post_ra_reg_elems(struct ir3_register *reg)
115 {
116    if (reg->flags & IR3_REG_RELATIV)
117       return reg->size;
118    return reg_elems(reg);
119 }
120 
121 static unsigned
post_ra_reg_num(struct ir3_register * reg)122 post_ra_reg_num(struct ir3_register *reg)
123 {
124    if (reg->flags & IR3_REG_RELATIV)
125       return reg->array.base;
126    return reg->num;
127 }
128 
129 unsigned
ir3_delayslots_with_repeat(struct ir3_instruction * assigner,struct ir3_instruction * consumer,unsigned assigner_n,unsigned consumer_n)130 ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
131                            struct ir3_instruction *consumer,
132                            unsigned assigner_n, unsigned consumer_n)
133 {
134    unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, false);
135 
136    struct ir3_register *src = consumer->srcs[consumer_n];
137    struct ir3_register *dst = assigner->dsts[assigner_n];
138 
139    if (assigner->repeat == 0 && consumer->repeat == 0)
140       return delay;
141 
142    unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
143    unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
144 
145    /* If either side is a relative access, we can't really apply most of the
146     * reasoning below because we don't know which component aliases which.
147     * Just bail in this case.
148     */
149    if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
150       return delay;
151 
152    /* MOVMSK seems to require that all users wait until the entire
153     * instruction is finished, so just bail here.
154     */
155    if (assigner->opc == OPC_MOVMSK)
156       return delay;
157 
158    bool mismatched_half =
159       (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
160 
161    /* TODO: Handle the combination of (rpt) and different component sizes
162     * better like below. This complicates things significantly because the
163     * components don't line up.
164     */
165    if (mismatched_half)
166       return delay;
167 
168    /* If an instruction has a (rpt), then it acts as a sequence of
169     * instructions, reading its non-(r) sources at each cycle. First, get the
170     * register num for the first instruction where they interfere:
171     */
172 
173    unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);
174 
175    /* Now, for that first conflicting half/full register, figure out the
176     * sub-instruction within assigner/consumer it corresponds to. For (r)
177     * sources, this should already return the correct answer of 0. However we
178     * have to special-case the multi-mov instructions, where the
179     * sub-instructions sometimes come from the src/dst indices instead.
180     */
181    unsigned first_src_instr;
182    if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT)
183       first_src_instr = consumer_n;
184    else
185       first_src_instr = first_num - src->num;
186 
187    unsigned first_dst_instr;
188    if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT)
189       first_dst_instr = assigner_n;
190    else
191       first_dst_instr = first_num - dst->num;
192 
193    /* The delay we return is relative to the *end* of assigner and the
194     * *beginning* of consumer, because it's the number of nops (or other
195     * things) needed between them. Any instructions after first_dst_instr
196     * subtract from the delay, and so do any instructions before
197     * first_src_instr. Calculate an offset to subtract from the non-rpt-aware
198     * delay to account for that.
199     *
200     * Now, a priori, we need to go through this process for every
201     * conflicting regnum and take the minimum of the offsets to make sure
202     * that the appropriate number of nop's is inserted for every conflicting
203     * pair of sub-instructions. However, as we go to the next conflicting
204     * regnum (if any), the number of instructions after first_dst_instr
205     * decreases by 1 and the number of source instructions before
206     * first_src_instr correspondingly increases by 1, so the offset stays the
207     * same for all conflicting registers.
208     */
209    unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
210    return offset > delay ? 0 : delay - offset;
211 }
212 
213 static unsigned
delay_calc_srcn(struct ir3_instruction * assigner,struct ir3_instruction * consumer,unsigned assigner_n,unsigned consumer_n,bool mergedregs)214 delay_calc_srcn(struct ir3_instruction *assigner,
215                 struct ir3_instruction *consumer, unsigned assigner_n,
216                 unsigned consumer_n, bool mergedregs)
217 {
218    struct ir3_register *src = consumer->srcs[consumer_n];
219    struct ir3_register *dst = assigner->dsts[assigner_n];
220    bool mismatched_half =
221       (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
222 
223    /* In the mergedregs case or when the register is a special register,
224     * half-registers do not alias with full registers.
225     */
226    if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
227        mismatched_half)
228       return 0;
229 
230    unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
231    unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
232    unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
233    unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
234 
235    if (dst_start >= src_end || src_start >= dst_end)
236       return 0;
237 
238    return ir3_delayslots_with_repeat(assigner, consumer, assigner_n, consumer_n);
239 }
240 
241 static unsigned
delay_calc(struct ir3_block * block,struct ir3_instruction * start,struct ir3_instruction * consumer,unsigned distance,regmask_t * in_mask,bool mergedregs)242 delay_calc(struct ir3_block *block, struct ir3_instruction *start,
243            struct ir3_instruction *consumer, unsigned distance,
244            regmask_t *in_mask, bool mergedregs)
245 {
246    regmask_t mask;
247    memcpy(&mask, in_mask, sizeof(mask));
248 
249    unsigned delay = 0;
250    /* Search backwards starting at the instruction before start, unless it's
251     * NULL then search backwards from the block end.
252     */
253    struct list_head *start_list =
254       start ? start->node.prev : block->instr_list.prev;
255    list_for_each_entry_from_rev (struct ir3_instruction, assigner, start_list,
256                                  &block->instr_list, node) {
257       if (count_instruction(assigner))
258          distance += assigner->nop;
259 
260       if (distance + delay >= MAX_NOPS)
261          return delay;
262 
263       if (is_meta(assigner))
264          continue;
265 
266       unsigned new_delay = 0;
267 
268       foreach_dst_n (dst, dst_n, assigner) {
269          if (dst->wrmask == 0)
270             continue;
271          if (!regmask_get(&mask, dst))
272             continue;
273          foreach_src_n (src, src_n, consumer) {
274             if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
275                continue;
276 
277             unsigned src_delay = delay_calc_srcn(
278                assigner, consumer, dst_n, src_n, mergedregs);
279             new_delay = MAX2(new_delay, src_delay);
280          }
281          regmask_clear(&mask, dst);
282       }
283 
284       new_delay = new_delay > distance ? new_delay - distance : 0;
285       delay = MAX2(delay, new_delay);
286 
287       if (count_instruction(assigner))
288          distance += 1 + assigner->repeat;
289    }
290 
291    /* Note: this allows recursion into "block" if it has already been
292     * visited, but *not* recursion into its predecessors. We may have to
293     * visit the original block twice, for the loop case where we have to
294     * consider definititons in an earlier iterations of the same loop:
295     *
296     * while (...) {
297     *		mov.u32u32 ..., r0.x
298     *		...
299     *		mov.u32u32 r0.x, ...
300     * }
301     *
302     * However any other recursion would be unnecessary.
303     */
304 
305    if (block->data != block) {
306       block->data = block;
307 
308       for (unsigned i = 0; i < block->predecessors_count; i++) {
309          struct ir3_block *pred = block->predecessors[i];
310          unsigned pred_delay = delay_calc(pred, NULL, consumer, distance,
311                                           &mask, mergedregs);
312          delay = MAX2(delay, pred_delay);
313       }
314 
315       block->data = NULL;
316    }
317 
318    return delay;
319 }
320 
321 /**
322  * Calculate delay for nop insertion. This must exactly match hardware
323  * requirements, including recursing into predecessor blocks.
324  */
325 unsigned
ir3_delay_calc(struct ir3_block * block,struct ir3_instruction * instr,bool mergedregs)326 ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
327                bool mergedregs)
328 {
329    regmask_t mask;
330    regmask_init(&mask, mergedregs);
331    foreach_src (src, instr) {
332       if (!(src->flags & (IR3_REG_IMMED | IR3_REG_CONST)))
333          regmask_set(&mask, src);
334    }
335 
336    return delay_calc(block, NULL, instr, 0, &mask, mergedregs);
337 }
338