• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2019-2021 Collabora, Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors (Collabora):
24  *    Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
25  */
26 
27 #include "compiler.h"
28 
29 /* Bifrost texture operations have a `skip` bit, instructing helper invocations
30  * to skip execution. Each clause has a `terminate_discarded_threads` bit,
31  * which will terminate helper invocations.
32  *
33  * The terminate bit should be set on the last clause requiring helper
34  * invocations. Without control flow, that's the last source-order instruction;
35  * with control flow, there may be multiple such instructions (with ifs) or no
36  * such instruction (with loops).
37  *
38  * The skip bit should be set unless the value of this instruction is required
39  * by a future instruction requiring helper invocations. Consider:
40  *
41  *      0 = texture ...
42  *      1 = fmul 0, #10
43  *      2 = dfdx 1
44  *      store 2
45  *
46  * Since the derivative calculation 2 requires helper invocations, the value 1
47  * must be calculated by helper invocations, and since it depends on 0, 0 must
48  * be calculated by helpers. Hence the texture op does NOT have the skip bit
49  * set, and the clause containing the derivative has the terminate bit set.
50  *
51  * Calculating the terminate bit occurs by forward dataflow analysis to
52  * determine which blocks require helper invocations. A block requires
53  * invocations if any of its instructions use helper invocations, or if it
54  * depends on a block that requires invocation. With that analysis, the
55  * terminate bit is set on the last instruction using invocations within any
56  * block that does *not* require invocations out.
57  *
58  * Likewise, calculating the execute bit requires backward dataflow analysis
59  * with union as the join operation and the generating set being the union of
60  * sources of instructions writing executed values. The skip bit is the inverse
61  * of the execute bit.
62  */
63 
64 static bool
bi_has_skip_bit(enum bi_opcode op)65 bi_has_skip_bit(enum bi_opcode op)
66 {
67         switch (op) {
68         case BI_OPCODE_TEX_SINGLE:
69         case BI_OPCODE_TEXC:
70         case BI_OPCODE_TEXS_2D_F16:
71         case BI_OPCODE_TEXS_2D_F32:
72         case BI_OPCODE_TEXS_CUBE_F16:
73         case BI_OPCODE_TEXS_CUBE_F32:
74         case BI_OPCODE_VAR_TEX_F16:
75         case BI_OPCODE_VAR_TEX_F32:
76                 return true;
77         default:
78                 return false;
79         }
80 }
81 
82 /* Does a given instruction require helper threads to be active (because it
83  * reads from other subgroup lanes)? This only applies to fragment shaders.
84  * Other shader stages do not have a notion of helper threads. */
85 
86 bool
bi_instr_uses_helpers(bi_instr * I)87 bi_instr_uses_helpers(bi_instr *I)
88 {
89         switch (I->op) {
90         case BI_OPCODE_TEXC:
91         case BI_OPCODE_TEXS_2D_F16:
92         case BI_OPCODE_TEXS_2D_F32:
93         case BI_OPCODE_TEXS_CUBE_F16:
94         case BI_OPCODE_TEXS_CUBE_F32:
95         case BI_OPCODE_VAR_TEX_F16:
96         case BI_OPCODE_VAR_TEX_F32:
97                 return !I->lod_mode; /* set for zero, clear for computed */
98         case BI_OPCODE_TEX_SINGLE:
99                 return (I->va_lod_mode == BI_VA_LOD_MODE_COMPUTED_LOD) ||
100                        (I->va_lod_mode == BI_VA_LOD_MODE_COMPUTED_BIAS);
101         case BI_OPCODE_CLPER_I32:
102         case BI_OPCODE_CLPER_OLD_I32:
103                 /* Fragment shaders require helpers to implement derivatives.
104                  * Other shader stages don't have helpers at all */
105                 return true;
106         default:
107                 return false;
108         }
109 }
110 
111 /* Does a block use helpers directly */
112 static bool
bi_block_uses_helpers(bi_block * block)113 bi_block_uses_helpers(bi_block *block)
114 {
115         bi_foreach_instr_in_block(block, I) {
116                 if (bi_instr_uses_helpers(I))
117                         return true;
118         }
119 
120         return false;
121 }
122 
123 bool
bi_block_terminates_helpers(bi_block * block)124 bi_block_terminates_helpers(bi_block *block)
125 {
126         /* Can't terminate if a successor needs helpers */
127         bi_foreach_successor(block, succ) {
128                 if (succ->pass_flags & 1)
129                         return false;
130         }
131 
132         /* Otherwise we terminate */
133         return true;
134 }
135 
136 /*
137  * Propagate the pass flag up the control flow graph by performing depth-first
138  * search on the directed control flow graph.
139  */
140 static void
bi_propagate_pass_flag(bi_block * block)141 bi_propagate_pass_flag(bi_block *block)
142 {
143         block->pass_flags = 1;
144 
145         bi_foreach_predecessor(block, pred) {
146                 if ((*pred)->pass_flags == 0)
147                         bi_propagate_pass_flag(*pred);
148         }
149 }
150 
151 void
bi_analyze_helper_terminate(bi_context * ctx)152 bi_analyze_helper_terminate(bi_context *ctx)
153 {
154         /* Other shader stages do not have a notion of helper threads, so we
155          * can skip the analysis. Don't run for blend shaders, either, since
156          * they run in the context of another shader that we don't see. */
157         if (ctx->stage != MESA_SHADER_FRAGMENT || ctx->inputs->is_blend)
158                 return;
159 
160         /* Clear flags */
161         bi_foreach_block(ctx, block)
162                 block->pass_flags = 0;
163 
164         /* For each block, check if it uses helpers and propagate that fact if
165          * so. We walk in reverse order to minimize the number of blocks tested:
166          * if the (unique) last block uses helpers, only that block is tested.
167          */
168         bi_foreach_block_rev(ctx, block) {
169                 if (block->pass_flags == 0 && bi_block_uses_helpers(block))
170                         bi_propagate_pass_flag(block);
171         }
172 }
173 
174 void
bi_mark_clauses_td(bi_context * ctx)175 bi_mark_clauses_td(bi_context *ctx)
176 {
177         if (ctx->stage != MESA_SHADER_FRAGMENT || ctx->inputs->is_blend)
178                 return;
179 
180         /* Finally, mark clauses requiring helpers */
181         bi_foreach_block(ctx, block) {
182                 /* At the end, there are helpers iff we don't terminate */
183                 bool helpers = !bi_block_terminates_helpers(block);
184 
185                 bi_foreach_clause_in_block_rev(block, clause) {
186                         bi_foreach_instr_in_clause_rev(block, clause, I) {
187                                 helpers |= bi_instr_uses_helpers(I);
188                         }
189 
190                         clause->td = !helpers;
191                 }
192         }
193 }
194 
195 static bool
bi_helper_block_update(BITSET_WORD * deps,bi_block * block)196 bi_helper_block_update(BITSET_WORD *deps, bi_block *block)
197 {
198         bool progress = false;
199 
200         bi_foreach_instr_in_block_rev(block, I) {
201                 /* If a destination is required by helper invocation... */
202                 bi_foreach_dest(I, d) {
203                         if (bi_is_null(I->dest[d]))
204                                 continue;
205 
206                         if (!BITSET_TEST(deps, bi_get_node(I->dest[d])))
207                                 continue;
208 
209                         /* ...so are the sources */
210                         bi_foreach_src(I, s) {
211                                 if (I->src[s].type == BI_INDEX_NORMAL) {
212                                         unsigned node = bi_get_node(I->src[s]);
213                                         progress |= !BITSET_TEST(deps, node);
214                                         BITSET_SET(deps, node);
215                                 }
216                         }
217 
218                         break;
219                 }
220         }
221 
222         return progress;
223 }
224 
225 void
bi_analyze_helper_requirements(bi_context * ctx)226 bi_analyze_helper_requirements(bi_context *ctx)
227 {
228         unsigned temp_count = bi_max_temp(ctx);
229         BITSET_WORD *deps = calloc(sizeof(BITSET_WORD), BITSET_WORDS(temp_count));
230 
231         /* Initialize with the sources of instructions consuming
232          * derivatives */
233 
234         bi_foreach_instr_global(ctx, I) {
235                 if (!bi_instr_uses_helpers(I)) continue;
236 
237                 bi_foreach_src(I, s) {
238                         if (I->src[s].type == BI_INDEX_NORMAL)
239                                 BITSET_SET(deps, bi_get_node(I->src[s]));
240                 }
241         }
242 
243         /* Propagate that up */
244         u_worklist worklist;
245         bi_worklist_init(ctx, &worklist);
246 
247         bi_foreach_block(ctx, block) {
248                 bi_worklist_push_tail(&worklist, block);
249         }
250 
251         while (!u_worklist_is_empty(&worklist)) {
252                 bi_block *blk = bi_worklist_pop_tail(&worklist);
253 
254                 if (bi_helper_block_update(deps, blk)) {
255                         bi_foreach_predecessor(blk, pred)
256                                 bi_worklist_push_head(&worklist, *pred);
257                 }
258         }
259 
260         u_worklist_fini(&worklist);
261 
262         /* Set the execute bits */
263 
264         bi_foreach_instr_global(ctx, I) {
265                 if (!bi_has_skip_bit(I->op)) continue;
266 
267                 bool exec = false;
268 
269                 bi_foreach_dest(I, d) {
270                         if (I->dest[d].type == BI_INDEX_NORMAL)
271                                 exec |= BITSET_TEST(deps, bi_get_node(I->dest[d]));
272                 }
273 
274                 I->skip = !exec;
275         }
276 
277         free(deps);
278 }
279