1 /*
2 * Copyright (C) 2019-2021 Collabora, Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors (Collabora):
24 * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
25 */
26
27 #include "compiler.h"
28
29 /* Bifrost texture operations have a `skip` bit, instructing helper invocations
30 * to skip execution. Each clause has a `terminate_discarded_threads` bit,
31 * which will terminate helper invocations.
32 *
33 * The terminate bit should be set on the last clause requiring helper
34 * invocations. Without control flow, that's the last source-order instruction;
35 * with control flow, there may be multiple such instructions (with ifs) or no
36 * such instruction (with loops).
37 *
38 * The skip bit should be set unless the value of this instruction is required
39 * by a future instruction requiring helper invocations. Consider:
40 *
41 * 0 = texture ...
42 * 1 = fmul 0, #10
43 * 2 = dfdx 1
44 * store 2
45 *
46 * Since the derivative calculation 2 requires helper invocations, the value 1
47 * must be calculated by helper invocations, and since it depends on 0, 0 must
48 * be calculated by helpers. Hence the texture op does NOT have the skip bit
49 * set, and the clause containing the derivative has the terminate bit set.
50 *
 * Calculating the terminate bit occurs by dataflow analysis over the control
 * flow graph to determine which blocks require helper invocations. A block
 * requires helper invocations if any of its instructions use them, or if any
 * block reachable from it requires them (the requirement is propagated from
 * each block to its predecessors). With that analysis, the terminate bit is
 * set on the clauses after the last instruction using helper invocations
 * within any block whose successors do *not* require helper invocations.
57 *
58 * Likewise, calculating the execute bit requires backward dataflow analysis
59 * with union as the join operation and the generating set being the union of
60 * sources of instructions writing executed values. The skip bit is the inverse
61 * of the execute bit.
62 */
63
64 static bool
bi_has_skip_bit(enum bi_opcode op)65 bi_has_skip_bit(enum bi_opcode op)
66 {
67 switch (op) {
68 case BI_OPCODE_TEX_SINGLE:
69 case BI_OPCODE_TEXC:
70 case BI_OPCODE_TEXC_DUAL:
71 case BI_OPCODE_TEXS_2D_F16:
72 case BI_OPCODE_TEXS_2D_F32:
73 case BI_OPCODE_TEXS_CUBE_F16:
74 case BI_OPCODE_TEXS_CUBE_F32:
75 case BI_OPCODE_VAR_TEX_F16:
76 case BI_OPCODE_VAR_TEX_F32:
77 return true;
78 default:
79 return false;
80 }
81 }
82
83 /* Does a given instruction require helper threads to be active (because it
84 * reads from other subgroup lanes)? This only applies to fragment shaders.
85 * Other shader stages do not have a notion of helper threads. */
86
87 bool
bi_instr_uses_helpers(bi_instr * I)88 bi_instr_uses_helpers(bi_instr *I)
89 {
90 switch (I->op) {
91 case BI_OPCODE_TEXC:
92 case BI_OPCODE_TEXC_DUAL:
93 case BI_OPCODE_TEXS_2D_F16:
94 case BI_OPCODE_TEXS_2D_F32:
95 case BI_OPCODE_TEXS_CUBE_F16:
96 case BI_OPCODE_TEXS_CUBE_F32:
97 case BI_OPCODE_VAR_TEX_F16:
98 case BI_OPCODE_VAR_TEX_F32:
99 return !I->lod_mode; /* set for zero, clear for computed */
100 case BI_OPCODE_TEX_SINGLE:
101 return (I->va_lod_mode == BI_VA_LOD_MODE_COMPUTED_LOD) ||
102 (I->va_lod_mode == BI_VA_LOD_MODE_COMPUTED_BIAS);
103 case BI_OPCODE_CLPER_I32:
104 case BI_OPCODE_CLPER_OLD_I32:
105 /* Fragment shaders require helpers to implement derivatives.
106 * Other shader stages don't have helpers at all */
107 return true;
108 default:
109 return false;
110 }
111 }
112
113 /* Does a block use helpers directly */
114 static bool
bi_block_uses_helpers(bi_block * block)115 bi_block_uses_helpers(bi_block *block)
116 {
117 bi_foreach_instr_in_block(block, I) {
118 if (bi_instr_uses_helpers(I))
119 return true;
120 }
121
122 return false;
123 }
124
125 bool
bi_block_terminates_helpers(bi_block * block)126 bi_block_terminates_helpers(bi_block *block)
127 {
128 /* Can't terminate if a successor needs helpers */
129 bi_foreach_successor(block, succ) {
130 if (succ->pass_flags & 1)
131 return false;
132 }
133
134 /* Otherwise we terminate */
135 return true;
136 }
137
138 /*
139 * Propagate the pass flag up the control flow graph by performing depth-first
140 * search on the directed control flow graph.
141 */
142 static void
bi_propagate_pass_flag(bi_block * block)143 bi_propagate_pass_flag(bi_block *block)
144 {
145 block->pass_flags = 1;
146
147 bi_foreach_predecessor(block, pred) {
148 if ((*pred)->pass_flags == 0)
149 bi_propagate_pass_flag(*pred);
150 }
151 }
152
153 void
bi_analyze_helper_terminate(bi_context * ctx)154 bi_analyze_helper_terminate(bi_context *ctx)
155 {
156 /* Other shader stages do not have a notion of helper threads, so we
157 * can skip the analysis. Don't run for blend shaders, either, since
158 * they run in the context of another shader that we don't see. */
159 if (ctx->stage != MESA_SHADER_FRAGMENT || ctx->inputs->is_blend)
160 return;
161
162 /* Clear flags */
163 bi_foreach_block(ctx, block)
164 block->pass_flags = 0;
165
166 /* For each block, check if it uses helpers and propagate that fact if
167 * so. We walk in reverse order to minimize the number of blocks tested:
168 * if the (unique) last block uses helpers, only that block is tested.
169 */
170 bi_foreach_block_rev(ctx, block) {
171 if (block->pass_flags == 0 && bi_block_uses_helpers(block))
172 bi_propagate_pass_flag(block);
173 }
174 }
175
176 void
bi_mark_clauses_td(bi_context * ctx)177 bi_mark_clauses_td(bi_context *ctx)
178 {
179 if (ctx->stage != MESA_SHADER_FRAGMENT || ctx->inputs->is_blend)
180 return;
181
182 /* Finally, mark clauses requiring helpers */
183 bi_foreach_block(ctx, block) {
184 /* At the end, there are helpers iff we don't terminate */
185 bool helpers = !bi_block_terminates_helpers(block);
186
187 bi_foreach_clause_in_block_rev(block, clause) {
188 bi_foreach_instr_in_clause_rev(block, clause, I) {
189 helpers |= bi_instr_uses_helpers(I);
190 }
191
192 clause->td = !helpers;
193 }
194 }
195 }
196
197 static bool
bi_helper_block_update(BITSET_WORD * deps,bi_block * block)198 bi_helper_block_update(BITSET_WORD *deps, bi_block *block)
199 {
200 bool progress = false;
201
202 bi_foreach_instr_in_block_rev(block, I) {
203 /* If a destination is required by helper invocation... */
204 bi_foreach_dest(I, d) {
205 if (!BITSET_TEST(deps, I->dest[d].value))
206 continue;
207
208 /* ...so are the sources */
209 bi_foreach_ssa_src(I, s) {
210 progress |= !BITSET_TEST(deps, I->src[s].value);
211 BITSET_SET(deps, I->src[s].value);
212 }
213
214 break;
215 }
216 }
217
218 return progress;
219 }
220
221 void
bi_analyze_helper_requirements(bi_context * ctx)222 bi_analyze_helper_requirements(bi_context *ctx)
223 {
224 BITSET_WORD *deps = calloc(sizeof(BITSET_WORD), ctx->ssa_alloc);
225
226 /* Initialize with the sources of instructions consuming
227 * derivatives */
228
229 bi_foreach_instr_global(ctx, I) {
230 if (!bi_instr_uses_helpers(I))
231 continue;
232
233 bi_foreach_ssa_src(I, s)
234 BITSET_SET(deps, I->src[s].value);
235 }
236
237 /* Propagate that up */
238 u_worklist worklist;
239 bi_worklist_init(ctx, &worklist);
240
241 bi_foreach_block(ctx, block) {
242 bi_worklist_push_tail(&worklist, block);
243 }
244
245 while (!u_worklist_is_empty(&worklist)) {
246 bi_block *blk = bi_worklist_pop_tail(&worklist);
247
248 if (bi_helper_block_update(deps, blk)) {
249 bi_foreach_predecessor(blk, pred)
250 bi_worklist_push_head(&worklist, *pred);
251 }
252 }
253
254 u_worklist_fini(&worklist);
255
256 /* Set the execute bits */
257
258 bi_foreach_instr_global(ctx, I) {
259 if (!bi_has_skip_bit(I->op))
260 continue;
261
262 bool exec = false;
263
264 bi_foreach_dest(I, d)
265 exec |= BITSET_TEST(deps, I->dest[d].value);
266
267 I->skip = !exec;
268 }
269
270 free(deps);
271 }
272