• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2019 Google, Inc.
3  * SPDX-License-Identifier: MIT
4  *
5  * Authors:
6  *    Rob Clark <robclark@freedesktop.org>
7  */
8 
9 #include "ir3.h"
10 
11 #include "ir3_compiler.h"
12 
13 /* The maximum number of nop's we may need to insert between two instructions.
14  */
15 #define MAX_NOPS 6
16 
17 /*
18  * Helpers to figure out the necessary delay slots between instructions.  Used
19  * both in scheduling pass(es) and the final pass to insert any required nop's
20  * so that the shader program is valid.
21  *
22  * Note that this needs to work both pre and post RA, so we can't assume ssa
23  * src iterators work.
24  */
25 
26 /* Return the number of cycles from the start of the instruction until src_n is
27  * read.
28  */
29 unsigned
ir3_src_read_delay(struct ir3_compiler * compiler,struct ir3_instruction * instr,unsigned src_n)30 ir3_src_read_delay(struct ir3_compiler *compiler, struct ir3_instruction *instr,
31                    unsigned src_n)
32 {
33    /* gat and swz have scalar sources and each source is read in a subsequent
34     * cycle.
35     */
36    if (instr->opc == OPC_GAT || instr->opc == OPC_SWZ) {
37       return src_n;
38    }
39 
40    /* cat3 instructions consume their last source one or two cycles later. Note
41     * that not all cat3 instructions seem to do this pre-a7xx.
42     */
43    bool cat3_reads_later = compiler->gen >= 7
44                               ? (opc_cat(instr->opc) == 3)
45                               : (is_mad(instr->opc) || is_madsh(instr->opc));
46    if (cat3_reads_later && src_n == 2) {
47       return compiler->delay_slots.cat3_src2_read;
48    }
49 
50    return 0;
51 }
52 
53 /* calculate required # of delay slots between the instruction that
54  * assigns a value and the one that consumes
55  */
56 int
ir3_delayslots(struct ir3_compiler * compiler,struct ir3_instruction * assigner,struct ir3_instruction * consumer,unsigned n,bool soft)57 ir3_delayslots(struct ir3_compiler *compiler,
58                struct ir3_instruction *assigner,
59                struct ir3_instruction *consumer, unsigned n, bool soft)
60 {
61    /* generally don't count false dependencies, since this can just be
62     * something like a barrier, or SSBO store.
63     */
64    if (__is_false_dep(consumer, n))
65       return 0;
66 
67    /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
68     * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
69     * handled with sync bits
70     */
71 
72    if (is_meta(assigner) || is_meta(consumer))
73       return 0;
74 
75    if (writes_addr0(assigner) || writes_addr1(assigner))
76       return compiler->delay_slots.non_alu;
77 
78    if (soft && needs_ss(compiler, assigner, consumer))
79       return soft_ss_delay(assigner);
80 
81    /* handled via sync flags: */
82    if (needs_ss(compiler, assigner, consumer) ||
83        is_sy_producer(assigner))
84       return 0;
85 
86    /* scalar ALU -> scalar ALU depdendencies where the source and destination
87     * register sizes match don't require any nops.
88     */
89    if (is_scalar_alu(assigner, compiler)) {
90       assert(is_scalar_alu(consumer, compiler));
91       /* If the sizes don't match then we need (ss) and needs_ss() should've
92        * returned above.
93        */
94       assert((assigner->dsts[0]->flags & IR3_REG_HALF) ==
95              (consumer->srcs[n]->flags & IR3_REG_HALF));
96       return 0;
97    }
98 
99    /* As far as we know, shader outputs don't need any delay. */
100    if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
101       return 0;
102 
103    /* assigner must be alu: */
104    if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
105        is_mem(consumer)) {
106       return compiler->delay_slots.non_alu;
107    } else {
108       /* In mergedregs mode, there is an extra 2-cycle penalty when half of
109        * a full-reg is read as a half-reg or when a half-reg is read as a
110        * full-reg.
111        */
112       bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) !=
113                              (consumer->srcs[n]->flags & IR3_REG_HALF);
114       unsigned penalty = mismatched_half ? 3 : 0;
115       return compiler->delay_slots.alu_to_alu + penalty -
116              ir3_src_read_delay(compiler, consumer, n);
117    }
118 }
119 
120 unsigned
ir3_delayslots_with_repeat(struct ir3_compiler * compiler,struct ir3_instruction * assigner,struct ir3_instruction * consumer,unsigned assigner_n,unsigned consumer_n)121 ir3_delayslots_with_repeat(struct ir3_compiler *compiler,
122                            struct ir3_instruction *assigner,
123                            struct ir3_instruction *consumer,
124                            unsigned assigner_n, unsigned consumer_n)
125 {
126    unsigned delay = ir3_delayslots(compiler, assigner, consumer, consumer_n, false);
127 
128    struct ir3_register *src = consumer->srcs[consumer_n];
129    struct ir3_register *dst = assigner->dsts[assigner_n];
130 
131    if (assigner->repeat == 0 && consumer->repeat == 0)
132       return delay;
133 
134    unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
135    unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
136 
137    /* If either side is a relative access, we can't really apply most of the
138     * reasoning below because we don't know which component aliases which.
139     * Just bail in this case.
140     */
141    if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
142       return delay;
143 
144    /* MOVMSK seems to require that all users wait until the entire
145     * instruction is finished, so just bail here.
146     */
147    if (assigner->opc == OPC_MOVMSK)
148       return delay;
149 
150    /* TODO: Handle the combination of (rpt) and different component sizes
151     * better like below. This complicates things significantly because the
152     * components don't line up.
153     */
154    if ((src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF))
155       return delay;
156 
157    /* If an instruction has a (rpt), then it acts as a sequence of
158     * instructions, reading its non-(r) sources at each cycle. First, get the
159     * register num for the first instruction where they interfere:
160     */
161 
162    unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);
163 
164    /* Now, for that first conflicting half/full register, figure out the
165     * sub-instruction within assigner/consumer it corresponds to. For (r)
166     * sources, this should already return the correct answer of 0. However we
167     * have to special-case the multi-mov instructions, where the
168     * sub-instructions sometimes come from the src/dst indices instead.
169     */
170    unsigned first_src_instr;
171    if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT)
172       first_src_instr = consumer_n;
173    else
174       first_src_instr = first_num - src->num;
175 
176    unsigned first_dst_instr;
177    if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT)
178       first_dst_instr = assigner_n;
179    else
180       first_dst_instr = first_num - dst->num;
181 
182    /* The delay we return is relative to the *end* of assigner and the
183     * *beginning* of consumer, because it's the number of nops (or other
184     * things) needed between them. Any instructions after first_dst_instr
185     * subtract from the delay, and so do any instructions before
186     * first_src_instr. Calculate an offset to subtract from the non-rpt-aware
187     * delay to account for that.
188     *
189     * Now, a priori, we need to go through this process for every
190     * conflicting regnum and take the minimum of the offsets to make sure
191     * that the appropriate number of nop's is inserted for every conflicting
192     * pair of sub-instructions. However, as we go to the next conflicting
193     * regnum (if any), the number of instructions after first_dst_instr
194     * decreases by 1 and the number of source instructions before
195     * first_src_instr correspondingly increases by 1, so the offset stays the
196     * same for all conflicting registers.
197     */
198    unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
199    return offset > delay ? 0 : delay - offset;
200 }
201 
202