/*
 * Copyright (C) 2023 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/* The pass uses information on which branches are divergent in order to
 * determine which blocks are "reconvergence points" where parked threads may
 * become reactivated, as well as to add "physical" edges where the machine may
 * fall through to the next reconvergence point. Reconvergence points need a
 * (jp) added in the assembly, and physical edges are needed to model shared
 * register liveness correctly. Reconvergence happens in the following two
 * scenarios:
 *
 * 1. When there is a divergent branch, the later of the two block destinations
 *    becomes a reconvergence point.
 * 2. When a forward edge crosses over a reconvergence point that may be
 *    outstanding at the start of the edge, we need to park the threads that
 *    take the edge and resume execution at the reconvergence point. This means
 *    that there is a physical edge from the start of the edge to the
 *    reconvergence point, and the destination of the edge becomes a new
 *    reconvergence point.
 *
 * For example, consider this simple if-else:
 *
 * bb0:
 * ...
 * br p0.x, #bb1, #bb2
 * bb1:
 * ...
 * jump bb3
 * bb2:
 * ...
 * jump bb3
 * bb3:
 * ...
 *
 * The divergent branch at the end of bb0 makes bb2 a reconvergence point
 * following (1), which starts being outstanding after the branch at the end of
 * bb0. The jump to bb3 at the end of bb1 goes over bb2 while it is outstanding,
 * so there is a physical edge from bb1 to bb2 and bb3 is a reconvergence point
 * following (2).
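 *
 * To summarize the example: bb2 is a reconvergence point that may be
 * outstanding over the open range (bb0, bb2), bb3 is a reconvergence point
 * that may be outstanding over (bb1, bb3), and one extra physical edge
 * bb1 -> bb2 is added on top of the physical edges that mirror the logical
 * edges.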
 *
 * Note that (2) can apply recursively. To handle this efficiently we build an
 * interval tree of forward edges that cross other blocks, and whenever a block
 * becomes an RP we use the tree to iterate over the edges that jump across it.
 * We also need to keep track of the range where each RP may be "outstanding."
 * An RP becomes outstanding after a branch to it parks its threads there. This
 * range may grow as we discover more and more branches to it that may park
 * their threads there.
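 *
 * Concretely, in the example above the edge from bb1 to bb3 is stored in the
 * tree with the interval [bb2, bb2] (the blocks it jumps over), so when bb2
 * becomes an RP, a query for intervals containing bb2 finds it.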
 *
 * Finally, we need to compute the branchstack value, which is the maximum
 * number of outstanding reconvergence points. For the if-else, the branchstack
 * is 2, because after the jump at the end of bb1 both reconvergence points are
 * outstanding (although the first is removed immediately afterwards). Because
 * we already computed the range where each RP is outstanding, this part is
 * relatively straightforward.
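 *
 * Tracing the example: the count of outstanding RPs goes to 1 after bb0 (bb2's
 * range opens), to 2 after bb1 (bb3's range opens), back to 1 when bb2 is
 * reached, and to 0 at bb3, so the maximum is 2.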
 */

#include <limits.h>

#include "ir3_shader.h"

#include "util/rb_tree.h"
#include "util/u_worklist.h"
#include "util/ralloc.h"

struct logical_edge {
   struct uinterval_node node;

   /* The endpoints of the edge in block emission order: for a backward edge,
    * start_block is the destination and end_block is the source.
    */
   struct ir3_block *start_block;
   struct ir3_block *end_block;
};

struct block_data {
   /* For a reconvergence point, the index of the first block where, upon
    * exiting, the RP may be outstanding. Normally this is a predecessor, but
    * for loops it may be the block before a loop header.
    */
   unsigned first_divergent_pred;

   /* The last processed first_divergent_pred. */
   unsigned first_processed_divergent_pred;

   /* The number of blocks that have this block as a first_divergent_pred. */
   unsigned divergence_count;
};

void
ir3_calc_reconvergence(struct ir3_shader_variant *so)
{
   void *mem_ctx = ralloc_context(NULL);

   /* It's important that the index we use corresponds to the final order
    * blocks are emitted in!
    */
   unsigned index = 0;
   foreach_block (block, &so->ir->block_list) {
      block->index = index++;
   }

   /* Set up the tree of edges. */
   unsigned edge_count = 0;
   foreach_block (block, &so->ir->block_list) {
      if (block->successors[0])
         edge_count++;
      if (block->successors[1])
         edge_count++;
   }

   struct rb_tree forward_edges, backward_edges;
   rb_tree_init(&forward_edges);
   rb_tree_init(&backward_edges);

   unsigned edge = 0;
   struct logical_edge *edges =
      ralloc_array(mem_ctx, struct logical_edge, edge_count);
   struct block_data *blocks =
      ralloc_array(mem_ctx, struct block_data, index);
   foreach_block (block, &so->ir->block_list) {
      blocks[block->index].divergence_count = 0;
      blocks[block->index].first_divergent_pred = UINT_MAX;
      blocks[block->index].first_processed_divergent_pred = UINT_MAX;
      for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
         if (block->successors[i]) {
            ir3_block_link_physical(block, block->successors[i]);

            if (block->successors[i]->index > block->index + 1) {
               edges[edge] = (struct logical_edge) {
                  .node = {
                     .interval = {
                        block->index + 1,
                        block->successors[i]->index - 1
                     },
                  },
                  .start_block = block,
                  .end_block = block->successors[i],
               };

               uinterval_tree_insert(&forward_edges, &edges[edge++].node);
            } else if (block->successors[i]->index < block->index - 1) {
               edges[edge] = (struct logical_edge) {
                  .node = {
                     .interval = {
                        block->successors[i]->index - 1,
                        block->index + 1
                     },
                  },
                  .start_block = block->successors[i],
                  .end_block = block,
               };

               uinterval_tree_insert(&backward_edges, &edges[edge++].node);
            }
         }
      }
   }

   assert(edge <= edge_count);

   u_worklist worklist;
   u_worklist_init(&worklist, index, mem_ctx);

   /* First, find and mark divergent branches. The later destination will be
    * the reconvergence point.
    */
   foreach_block (block, &so->ir->block_list) {
      if (block->successors[0] && block->successors[1]) {
         unsigned idx = block->successors[0]->index >
            block->successors[1]->index ? 0 : 1;
         block->successors[idx]->reconvergence_point = true;
         blocks[block->successors[idx]->index].first_divergent_pred =
            block->index;
         u_worklist_push_tail(&worklist, block->successors[idx], index);
      }
   }

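   /* Iteratively apply (2): pop a reconvergence point, walk the forward edges
    * that jump over it while it may be outstanding, and turn each such edge's
    * destination into a new reconvergence point. Destinations whose range
    * grows are pushed back onto the worklist, which handles the recursive
    * case.
    */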
   while (!u_worklist_is_empty(&worklist)) {
      struct ir3_block *block =
         u_worklist_pop_head(&worklist, struct ir3_block, index);
      assert(block->reconvergence_point);

      /* Iterate over all edges stepping over the block. */
      struct uinterval interval = { block->index, block->index };
      struct logical_edge *prev = NULL;
      uinterval_tree_foreach (struct logical_edge, edge, interval,
                              &forward_edges, node) {
         /* If "block" definitely isn't outstanding when the branch
          * corresponding to "edge" is taken, then we don't need to park
          * "edge->end_block" and we can ignore this.
          *
          * TODO: add uinterval_tree_foreach_from() and use that instead.
          */
         if (edge->start_block->index <=
             blocks[block->index].first_divergent_pred)
            continue;

         /* If we've already processed this edge + RP pair, don't process it
          * again. Because edges are ordered by start point, we must have
          * processed every edge after this too.
          */
         if (edge->start_block->index >
             blocks[block->index].first_processed_divergent_pred)
            break;

         edge->end_block->reconvergence_point = true;
         if (blocks[edge->end_block->index].first_divergent_pred >
             edge->start_block->index) {
            blocks[edge->end_block->index].first_divergent_pred =
               edge->start_block->index;
            u_worklist_push_tail(&worklist, edge->end_block, index);
         }

         /* Backwards branches extend the range of divergence. For example, a
          * divergent break creates a reconvergence point after the loop that
          * stays outstanding throughout subsequent iterations, even at points
          * before the break. This takes that into account.
          *
          * More precisely, a backwards edge that originates between the start
          * and end of "edge" extends the divergence range to the beginning of
          * its destination if it is taken, or alternatively to the end of the
          * block before its destination.
          *
          * TODO: in case we ever start accepting weird non-structured control
          * flow, we may also need to handle this above if a divergent branch
          * crosses over a backwards edge.
          */
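         /* As a concrete illustration: for a loop spanning bb1..bb5 with a
          * break that jumps from bb3 over the loop end to bb6, the back edge
          * bb5 -> bb1 originates between bb3 and bb6, so bb6's
          * first_divergent_pred is pulled back to bb0 and bb6 is treated as
          * outstanding across the whole loop, not just after the break.
          */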
         struct uinterval interval2 =
            { edge->start_block->index, edge->start_block->index };
         uinterval_tree_foreach (struct logical_edge, back_edge, interval2,
                                 &backward_edges, node) {
            if (back_edge->end_block->index < edge->end_block->index) {
               if (blocks[edge->end_block->index].first_divergent_pred >
                   back_edge->start_block->index - 1) {
                  blocks[edge->end_block->index].first_divergent_pred =
                     back_edge->start_block->index - 1;
                  u_worklist_push_tail(&worklist, edge->end_block, index);
               }
            }
         }

         if (!prev || prev->start_block != edge->start_block) {
            /* Multiple edges may share a start point, but we only need to add
             * the physical edge once. Because edges are sorted by start
             * point, edges with the same start point are adjacent in the
             * iteration, so comparing against the previous edge is enough to
             * avoid adding redundant physical edges.
             */
            for (unsigned i = 0; i < block->physical_predecessors_count; i++)
               assert(block->physical_predecessors[i] != edge->start_block);
            ir3_block_link_physical(edge->start_block, block);
         }
         prev = edge;
      }


      blocks[block->index].first_processed_divergent_pred =
         blocks[block->index].first_divergent_pred;
   }

   /* For each reconvergence point p we have an open range
    * (p->first_divergent_pred, p) where p may be outstanding. We need to keep
    * track of the number of outstanding RPs and calculate the maximum.
    */
   foreach_block (block, &so->ir->block_list) {
      if (block->reconvergence_point) {
         blocks[blocks[block->index].first_divergent_pred].divergence_count++;
      }
   }

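   /* Sweep the blocks in emission order, tracking how many ranges are
    * currently open: a range opens after its first_divergent_pred block and
    * closes at the reconvergence point itself. branchstack is the maximum,
    * with extra levels reserved for macros that are lowered to divergent
    * control flow later.
    */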
   unsigned rc_level = 0;
   so->branchstack = 0;
   foreach_block (block, &so->ir->block_list) {
      if (block->reconvergence_point)
         rc_level--;

      /* Account for lowerings that produce divergent control flow. */
      foreach_instr (instr, &block->instr_list) {
         switch (instr->opc) {
         case OPC_SCAN_MACRO:
            so->branchstack = MAX2(so->branchstack, rc_level + 2);
            break;
         case OPC_BALLOT_MACRO:
         case OPC_READ_COND_MACRO:
         case OPC_ELECT_MACRO:
         case OPC_READ_FIRST_MACRO:
         case OPC_SWZ_SHARED_MACRO:
            so->branchstack = MAX2(so->branchstack, rc_level + 1);
            break;
         default:
            break;
         }
      }

      rc_level += blocks[block->index].divergence_count;

      so->branchstack = MAX2(so->branchstack, rc_level);
   }
   assert(rc_level == 0);

   ralloc_free(mem_ctx);
}