/*
 * Copyright (C) 2023 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/* This pass uses information about which branches are divergent to determine
 * which blocks are "reconvergence points," where parked threads may become
 * reactivated, and to add "physical" edges where the machine may fall through
 * to the next reconvergence point. Reconvergence points need a (jp) added in
 * the assembly, and physical edges are needed to model shared register
 * liveness correctly. Reconvergence happens in the following two scenarios:
 *
 * 1. When there is a divergent branch, the later of the two block destinations
 *    becomes a reconvergence point.
 * 2. When a forward edge crosses over a reconvergence point that may be
 *    outstanding at the start of the edge, we need to park the threads that
 *    take the edge and resume execution at the reconvergence point. This means
 *    that there is a physical edge from the start of the edge to the
 *    reconvergence point, and the destination of the edge becomes a new
 *    reconvergence point.
 *
 * For example, consider this simple if-else:
 *
 *    bb0:
 *    ...
 *    br p0.x, #bb1, #bb2
 *    bb1:
 *    ...
 *    jump bb3
 *    bb2:
 *    ...
 *    jump bb3
 *    bb3:
 *    ...
 *
 * The divergent branch at the end of bb0 makes bb2 a reconvergence point
 * following (1), which starts being outstanding after the branch at the end
 * of bb0. The jump to bb3 at the end of bb1 goes over bb2 while it is
 * outstanding, so there is a physical edge from bb1 to bb2 and bb3 becomes a
 * reconvergence point following (2).
 *
 * Note that (2) can apply recursively. To handle this efficiently, we build
 * an interval tree of forward edges that cross other blocks; whenever a block
 * becomes an RP, we use the tree to iterate over the edges jumping across it.
 * We also need to keep track of the range where each RP may be "outstanding."
 * An RP becomes outstanding after a branch to it parks its threads there, and
 * this range may grow as we discover more branches that may park their
 * threads there.
 *
 * Finally, we need to compute the branchstack value, which is the maximum
 * number of outstanding reconvergence points. For the if-else, the
 * branchstack is 2, because after the jump at the end of bb1 both
 * reconvergence points are outstanding (although the first is removed
 * immediately afterwards). Because we already computed the range where each
 * RP is outstanding, this part is relatively straightforward.
 */

#include <limits.h>

#include "ir3_shader.h"

#include "util/rb_tree.h"
#include "util/u_worklist.h"
#include "util/ralloc.h"

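/* A logical CFG edge, kept in an interval tree keyed on the range of blocks
 * the edge jumps across. start_block always holds the earlier of the two
 * blocks, even for backward edges.
 */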
struct logical_edge {
   struct uinterval_node node;
   struct ir3_block *start_block;
   struct ir3_block *end_block;
};

struct block_data {
   /* For a reconvergence point, the index of the first block where, upon
    * exiting, the RP may be outstanding. Normally this is a predecessor, but
    * for loops it may be a loop header.
    */
   unsigned first_divergent_pred;

   /* The last processed first_divergent_pred. */
   unsigned first_processed_divergent_pred;

   /* The number of blocks that have this block as a first_divergent_pred. */
   unsigned divergence_count;
};

void
ir3_calc_reconvergence(struct ir3_shader_variant *so)
{
   void *mem_ctx = ralloc_context(NULL);

   /* It's important that the index we use corresponds to the final order
    * blocks are emitted in!
    */
   unsigned index = 0;
   foreach_block (block, &so->ir->block_list) {
      block->index = index++;
   }

   /* Set up the trees of edges. */
   unsigned edge_count = 0;
   foreach_block (block, &so->ir->block_list) {
      if (block->successors[0])
         edge_count++;
      if (block->successors[1])
         edge_count++;
   }

   struct rb_tree forward_edges, backward_edges;
   rb_tree_init(&forward_edges);
   rb_tree_init(&backward_edges);

   unsigned edge = 0;
   struct logical_edge *edges =
      ralloc_array(mem_ctx, struct logical_edge, edge_count);
   struct block_data *blocks =
      ralloc_array(mem_ctx, struct block_data, index);
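   /* Each logical successor is also a physical successor, since the machine
    * may fall through the edge. Forward edges are keyed on the blocks they
    * jump across: an edge from S to D gets the interval [S + 1, D - 1], and a
    * backward edge from T back to H gets [H - 1, T + 1], with start_block
    * always holding the earlier block.
    */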
   foreach_block (block, &so->ir->block_list) {
      blocks[block->index].divergence_count = 0;
      blocks[block->index].first_divergent_pred = UINT_MAX;
      blocks[block->index].first_processed_divergent_pred = UINT_MAX;
      for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
         if (block->successors[i]) {
            ir3_block_link_physical(block, block->successors[i]);

            if (block->successors[i]->index > block->index + 1) {
               edges[edge] = (struct logical_edge) {
                  .node = {
                     .interval = {
                        block->index + 1,
                        block->successors[i]->index - 1
                     },
                  },
                  .start_block = block,
                  .end_block = block->successors[i],
               };

               uinterval_tree_insert(&forward_edges, &edges[edge++].node);
            } else if (block->successors[i]->index < block->index - 1) {
               edges[edge] = (struct logical_edge) {
                  .node = {
                     .interval = {
                        block->successors[i]->index - 1,
                        block->index + 1
                     },
                  },
                  .start_block = block->successors[i],
                  .end_block = block,
               };

               uinterval_tree_insert(&backward_edges, &edges[edge++].node);
            }
         }
      }
   }

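   /* Edges to an adjacent block jump over nothing and are never inserted
    * into the trees, so "edge" may end up smaller than "edge_count".
    */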
   assert(edge <= edge_count);

   u_worklist worklist;
   u_worklist_init(&worklist, index, mem_ctx);

   /* First, find and mark divergent branches. The later destination will be
    * the reconvergence point.
    */
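   /* For the if-else example at the top of the file, this marks bb2 as a
    * reconvergence point with bb0 as its first_divergent_pred.
    */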
   foreach_block (block, &so->ir->block_list) {
      if (block->successors[0] && block->successors[1]) {
         unsigned idx = block->successors[0]->index >
            block->successors[1]->index ? 0 : 1;
         block->successors[idx]->reconvergence_point = true;
         blocks[block->successors[idx]->index].first_divergent_pred =
            block->index;
         u_worklist_push_tail(&worklist, block->successors[idx], index);
      }
   }

   while (!u_worklist_is_empty(&worklist)) {
      struct ir3_block *block =
         u_worklist_pop_head(&worklist, struct ir3_block, index);
      assert(block->reconvergence_point);

      /* Iterate over all edges stepping over the block. */
      struct uinterval interval = { block->index, block->index };
      struct logical_edge *prev = NULL;
      uinterval_tree_foreach (struct logical_edge, edge, interval, &forward_edges,
                              node) {
         /* If "block" definitely isn't outstanding when the branch
          * corresponding to "edge" is taken, then we don't need to park
          * "edge->end_block" and we can ignore this.
          *
          * TODO: add uinterval_tree_foreach_from() and use that instead.
          */
         if (edge->start_block->index <= blocks[block->index].first_divergent_pred)
            continue;

         /* If we've already processed this edge + RP pair, don't process it
          * again. Because edges are ordered by start point, we must have
          * processed every edge after this too.
          */
         if (edge->start_block->index >
             blocks[block->index].first_processed_divergent_pred)
            break;

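         /* This is scenario (2) from the comment at the top of the file: the
          * edge's destination becomes a reconvergence point, and it may be
          * outstanding from the end of the edge's start block onwards.
          */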
         edge->end_block->reconvergence_point = true;
         if (blocks[edge->end_block->index].first_divergent_pred >
             edge->start_block->index) {
            blocks[edge->end_block->index].first_divergent_pred =
               edge->start_block->index;
            u_worklist_push_tail(&worklist, edge->end_block, index);
         }

         /* Backwards branches extend the range of divergence. For example, a
          * divergent break creates a reconvergence point after the loop that
          * stays outstanding throughout subsequent iterations, even at points
          * before the break. This takes that into account.
          *
          * More precisely, a backwards edge that originates between the start
          * and end of "edge" extends the divergence range to the beginning of
          * its destination if it is taken, or alternatively to the end of the
          * block before its destination.
          *
          * TODO: in case we ever start accepting weird non-structured control
          * flow, we may also need to handle this above if a divergent branch
          * crosses over a backwards edge.
          */
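         /* For example, consider this illustrative sketch of a loop with a
          * divergent break (same notation as the comment at the top of the
          * file):
          *
          *    bb0:
          *    ...
          *    br p0.x, #bb1, #bb2
          *    bb1:
          *    jump bb3
          *    bb2:
          *    ...
          *    jump bb0
          *    bb3:
          *    ...
          *
          * bb2 is a reconvergence point following (1), and the break (the
          * jump from bb1 to bb3) crosses it while outstanding, so bb3 becomes
          * a reconvergence point following (2). The backedge at the end of
          * bb2 originates between bb1 and bb3 and branches to a block before
          * bb3, so it extends bb3's range back to the end of the block before
          * bb0: threads parked at bb3 stay parked while the remaining threads
          * keep iterating through bb0-bb2.
          */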
         struct uinterval interval2 = { edge->start_block->index,
                                        edge->start_block->index };
         uinterval_tree_foreach (struct logical_edge, back_edge, interval2, &backward_edges,
                                 node) {
            if (back_edge->end_block->index < edge->end_block->index) {
               if (blocks[edge->end_block->index].first_divergent_pred >
                   back_edge->start_block->index - 1) {
                  blocks[edge->end_block->index].first_divergent_pred =
                     back_edge->start_block->index - 1;
                  u_worklist_push_tail(&worklist, edge->end_block, index);
               }
            }
         }

         if (!prev || prev->start_block != edge->start_block) {
            /* Each edge + block combination is processed at most once. Edges
             * are sorted by start point, so when several edges share the same
             * start block, comparing with the previous edge is enough to
             * avoid adding the same physical edge twice.
             */
            for (unsigned i = 0; i < block->physical_predecessors_count; i++)
               assert(block->physical_predecessors[i] != edge->start_block);
            ir3_block_link_physical(edge->start_block, block);
         }
         prev = edge;
      }

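      /* Remember how far we've processed. If this block is pushed onto the
       * worklist again with a smaller first_divergent_pred, only the edges
       * whose start lies in the newly uncovered range get processed.
       */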
      blocks[block->index].first_processed_divergent_pred =
         blocks[block->index].first_divergent_pred;
   }

   /* For each reconvergence point p we have an open range
    * (p->first_divergent_pred, p) where p may be outstanding. We need to keep
    * track of the number of outstanding RPs and calculate the maximum.
    */
   foreach_block (block, &so->ir->block_list) {
      if (block->reconvergence_point) {
         blocks[blocks[block->index].first_divergent_pred].divergence_count++;
      }
   }

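   /* Walk the blocks in emission order; entering a reconvergence point closes
    * its outstanding range and exiting a block opens the ranges of all RPs
    * that have it as their first_divergent_pred, so rc_level tracks the
    * number of currently outstanding RPs. For the if-else example at the top
    * of the file: rc_level is 1 after bb0, 2 after bb1, 1 after bb2 and 0
    * after bb3, giving branchstack = 2.
    */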
   unsigned rc_level = 0;
   so->branchstack = 0;
   foreach_block (block, &so->ir->block_list) {
      if (block->reconvergence_point)
         rc_level--;

      /* Account for lowerings that produce divergent control flow. */
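      /* (Reading off the values below: the scan macro's lowering may leave
       * up to two extra reconvergence points outstanding, the other macros
       * at most one.)
       */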
      foreach_instr (instr, &block->instr_list) {
         switch (instr->opc) {
         case OPC_SCAN_MACRO:
            so->branchstack = MAX2(so->branchstack, rc_level + 2);
            break;
         case OPC_BALLOT_MACRO:
         case OPC_READ_COND_MACRO:
         case OPC_ELECT_MACRO:
         case OPC_READ_FIRST_MACRO:
         case OPC_SWZ_SHARED_MACRO:
            so->branchstack = MAX2(so->branchstack, rc_level + 1);
            break;
         default:
            break;
         }
      }

      rc_level += blocks[block->index].divergence_count;

      so->branchstack = MAX2(so->branchstack, rc_level);
   }
   assert(rc_level == 0);

   ralloc_free(mem_ctx);
}