• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2020 Collabora Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors (Collabora):
24  *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
25  */
26 
27 #include "compiler.h"
28 
/* Assign dependency slots to each clause and calculate dependencies. This pass
 * must be run after scheduling.
31  *
32  * 1. A clause that does not produce a message must use the sentinel slot #0
33  * 2a. A clause that depends on the results of a previous message-passing
34  * instruction must depend on that instruction's dependency slot, unless all
35  * reaching code paths already depended on it.
36  * 2b. More generally, any dependencies must be encoded. This includes
37  * Write-After-Write and Write-After-Read hazards with LOAD/STORE to memory.
38  * 3. The shader must wait on slot #6 before running BLEND, ATEST
39  * 4. The shader must wait on slot #7 before running BLEND, ST_TILE
40  * 5. ATEST, ZS_EMIT must be issued with slot #0
41  * 6. BARRIER must be issued with slot #7 and wait on every active slot.
42  * 7. Only slots #0 through #5 may be used for clauses not otherwise specified.
43  * 8. If a clause writes to a read staging register of an unresolved
44  * dependency, it must set a staging barrier.
45  *
46  * Note it _is_ legal to reuse slots for multiple message passing instructions
47  * with overlapping liveness, albeit with a slight performance penalty. As such
48  * the problem is significantly easier than register allocation, rather than
 * spilling we may simply reuse slots. (TODO: does this have an optimal
 * linear-time solution?)
51  *
52  * Within these constraints we are free to assign slots as we like. This pass
53  * attempts to minimize stalls (TODO).
54  */
55 
56 #define BI_NUM_GENERAL_SLOTS 6
57 #define BI_NUM_SLOTS 8
58 #define BI_NUM_REGISTERS 64
59 #define BI_SLOT_SERIAL 0 /* arbitrary */
60 
61 /*
62  * Due to the crude scoreboarding we do, we need to serialize varying loads and
63  * memory access. Identify these instructions here.
64  */
65 static bool
bi_should_serialize(bi_instr * I)66 bi_should_serialize(bi_instr *I)
67 {
68         /* For debug, serialize everything to disable scoreboard opts */
69         if (bifrost_debug & BIFROST_DBG_NOSB)
70                 return true;
71 
72         /* Although nominally on the attribute unit, image loads have the same
73          * coherency requirements as general memory loads. Serialize them for
74          * now until we can do something more clever.
75          */
76         if (I->op == BI_OPCODE_LD_ATTR_TEX)
77                 return true;
78 
79         switch (bi_opcode_props[I->op].message) {
80         case BIFROST_MESSAGE_VARYING:
81         case BIFROST_MESSAGE_LOAD:
82         case BIFROST_MESSAGE_STORE:
83         case BIFROST_MESSAGE_ATOMIC:
84                 return true;
85         default:
86                 return false;
87         }
88 }
89 
90 /* Given a scoreboard model, choose a slot for a clause wrapping a given
91  * message passing instruction. No side effects. */
92 
93 static unsigned
bi_choose_scoreboard_slot(bi_instr * message)94 bi_choose_scoreboard_slot(bi_instr *message)
95 {
96         /* ATEST, ZS_EMIT must be issued with slot #0 */
97         if (message->op == BI_OPCODE_ATEST || message->op == BI_OPCODE_ZS_EMIT)
98                 return 0;
99 
100         /* BARRIER must be issued with slot #7 */
101         if (message->op == BI_OPCODE_BARRIER)
102                 return 7;
103 
104         /* For now, make serialization is easy */
105         if (bi_should_serialize(message))
106                 return BI_SLOT_SERIAL;
107 
108         return 0;
109 }
110 
111 static uint64_t
bi_read_mask(bi_instr * I,bool staging_only)112 bi_read_mask(bi_instr *I, bool staging_only)
113 {
114         uint64_t mask = 0;
115 
116         if (staging_only && !bi_opcode_props[I->op].sr_read)
117                 return mask;
118 
119         bi_foreach_src(I, s) {
120                 if (I->src[s].type == BI_INDEX_REGISTER) {
121                         unsigned reg = I->src[s].value;
122                         unsigned count = bi_count_read_registers(I, s);
123 
124                         mask |= (BITFIELD64_MASK(count) << reg);
125                 }
126 
127                 if (staging_only)
128                         break;
129         }
130 
131         return mask;
132 }
133 
134 static uint64_t
bi_write_mask(bi_instr * I)135 bi_write_mask(bi_instr *I)
136 {
137         uint64_t mask = 0;
138 
139         bi_foreach_dest(I, d) {
140                 if (bi_is_null(I->dest[d])) continue;
141 
142                 assert(I->dest[d].type == BI_INDEX_REGISTER);
143 
144                 unsigned reg = I->dest[d].value;
145                 unsigned count = bi_count_write_registers(I, d);
146 
147                 mask |= (BITFIELD64_MASK(count) << reg);
148         }
149 
150         /* Instructions like AXCHG.i32 unconditionally both read and write
151          * staging registers. Even if we discard the result, the write still
152          * happens logically and needs to be included in our calculations.
153          * Obscurely, ATOM_CX is sr_write but can ignore the staging register in
154          * certain circumstances; this does not require consideration.
155          */
156         if (bi_opcode_props[I->op].sr_write && bi_is_null(I->dest[0]) &&
157             !bi_is_null(I->src[0])) {
158 
159                 unsigned reg = I->src[0].value;
160                 unsigned count = bi_count_write_registers(I, 0);
161 
162                 mask |= (BITFIELD64_MASK(count) << reg);
163         }
164 
165         return mask;
166 }
167 
168 /* Update the scoreboard model to assign an instruction to a given slot */
169 
170 static void
bi_push_clause(struct bi_scoreboard_state * st,bi_clause * clause)171 bi_push_clause(struct bi_scoreboard_state *st, bi_clause *clause)
172 {
173         bi_instr *I = clause->message;
174         unsigned slot = clause->scoreboard_id;
175 
176         if (!I)
177                 return;
178 
179         st->read[slot] |= bi_read_mask(I, true);
180 
181         if (bi_opcode_props[I->op].sr_write)
182                 st->write[slot] |= bi_write_mask(I);
183 }
184 
185 /* Adds a dependency on each slot writing any specified register */
186 
187 static void
bi_depend_on_writers(bi_clause * clause,struct bi_scoreboard_state * st,uint64_t regmask)188 bi_depend_on_writers(bi_clause *clause, struct bi_scoreboard_state *st, uint64_t regmask)
189 {
190         for (unsigned slot = 0; slot < ARRAY_SIZE(st->write); ++slot) {
191                 if (!(st->write[slot] & regmask))
192                         continue;
193 
194                 st->write[slot] = 0;
195                 st->read[slot] = 0;
196 
197                 clause->dependencies |= BITFIELD_BIT(slot);
198         }
199 }
200 
201 static void
bi_set_staging_barrier(bi_clause * clause,struct bi_scoreboard_state * st,uint64_t regmask)202 bi_set_staging_barrier(bi_clause *clause, struct bi_scoreboard_state *st, uint64_t regmask)
203 {
204         for (unsigned slot = 0; slot < ARRAY_SIZE(st->read); ++slot) {
205                 if (!(st->read[slot] & regmask))
206                         continue;
207 
208                 st->read[slot] = 0;
209                 clause->staging_barrier = true;
210         }
211 }
212 
213 /* Sets the dependencies for a given clause, updating the model */
214 
215 static void
bi_set_dependencies(bi_block * block,bi_clause * clause,struct bi_scoreboard_state * st)216 bi_set_dependencies(bi_block *block, bi_clause *clause, struct bi_scoreboard_state *st)
217 {
218         bi_foreach_instr_in_clause(block, clause, I) {
219                 uint64_t read = bi_read_mask(I, false);
220                 uint64_t written = bi_write_mask(I);
221 
222                 /* Read-after-write; write-after-write */
223                 bi_depend_on_writers(clause, st, read | written);
224 
225                 /* Write-after-read */
226                 bi_set_staging_barrier(clause, st, written);
227         }
228 
229         /* LD_VAR instructions must be serialized per-quad. Just always depend
230          * on any LD_VAR instructions. This isn't optimal, but doing better
231          * requires divergence-aware data flow analysis.
232          *
233          * Similarly, memory loads/stores need to be synchronized. For now,
234          * force them to be serialized. This is not optimal.
235          */
236         if (clause->message && bi_should_serialize(clause->message))
237                 clause->dependencies |= BITFIELD_BIT(BI_SLOT_SERIAL);
238 
239         /* Barriers must wait on all slots to flush existing work. It might be
240          * possible to skip this with more information about the barrier. For
241          * now, be conservative.
242          */
243         if (clause->message && clause->message->op == BI_OPCODE_BARRIER)
244                 clause->dependencies |= BITFIELD_MASK(BI_NUM_GENERAL_SLOTS);
245 }
246 
247 static bool
scoreboard_block_update(bi_block * blk)248 scoreboard_block_update(bi_block *blk)
249 {
250         bool progress = false;
251 
252         /* pending_in[s] = sum { p in pred[s] } ( pending_out[p] ) */
253         bi_foreach_predecessor(blk, pred) {
254                 for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) {
255                         blk->scoreboard_in.read[i] |= (*pred)->scoreboard_out.read[i];
256                         blk->scoreboard_in.write[i] |= (*pred)->scoreboard_out.write[i];
257                 }
258         }
259 
260         struct bi_scoreboard_state state = blk->scoreboard_in;
261 
262         /* Assign locally */
263 
264         bi_foreach_clause_in_block(blk, clause) {
265                 bi_set_dependencies(blk, clause, &state);
266                 bi_push_clause(&state, clause);
267         }
268 
269         /* To figure out progress, diff scoreboard_out */
270 
271         for (unsigned i = 0; i < BI_NUM_SLOTS; ++i)
272                 progress |= !!memcmp(&state, &blk->scoreboard_out, sizeof(state));
273 
274         blk->scoreboard_out = state;
275 
276         return progress;
277 }
278 
279 void
bi_assign_scoreboard(bi_context * ctx)280 bi_assign_scoreboard(bi_context *ctx)
281 {
282         u_worklist worklist;
283         bi_worklist_init(ctx, &worklist);
284 
285         /* First, assign slots. */
286         bi_foreach_block(ctx, block) {
287                 bi_foreach_clause_in_block(block, clause) {
288                         if (clause->message) {
289                                 unsigned slot = bi_choose_scoreboard_slot(clause->message);
290                                 clause->scoreboard_id = slot;
291                         }
292                 }
293 
294                 bi_worklist_push_tail(&worklist, block);
295         }
296 
297         /* Next, perform forward data flow analysis to calculate dependencies */
298         while (!u_worklist_is_empty(&worklist)) {
299                 /* Pop from the front for forward analysis */
300                 bi_block *blk = bi_worklist_pop_head(&worklist);
301 
302                 if (scoreboard_block_update(blk)) {
303                         bi_foreach_successor(blk, succ)
304                                 bi_worklist_push_tail(&worklist, succ);
305                 }
306         }
307 
308         u_worklist_fini(&worklist);
309 }
310