/*
 * Copyright 2023 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "agx_builder.h"
#include "agx_compiler.h"
#include "agx_opcodes.h"

/*
 * AGX control flow instructions predicate out threads. No forward branches
 * are inserted during instruction selection; before this pass, the only
 * branches are the backwards branches at the ends of loops. That means that,
 * prior to this pass, we would always execute both sides of an if.
 *
 * To improve performance, this pass inserts conservative forward branches
 * after if_*cmp, else_*cmp, and break_if_*cmp instructions, jumping the
 * subgroup to the instruction's logical destination if all threads in the
 * subgroup are inactive. This has the effect of skipping over the unexecuted
 * half of an if, which makes this pass critical for control flow performance.
 */
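
/* To illustrate (a sketch in pseudo-IR with hypothetical block names, not
 * exact AGX syntax): before this pass, an if runs straight through both
 * sides, with threads merely predicated off:
 *
 *    if_icmp   ..., target = B_else_end   // predicate out failing threads
 *    ...then side...                      // issued even if no thread active
 *    else_icmp ..., target = B_end        // flip the predicate
 *    ...else side...
 *    pop_exec                             // restore the predicate
 *
 * After this pass, a jmp_exec_none inserted after the if_icmp (and likewise
 * after the else_icmp) lets the whole subgroup skip a side whenever every
 * thread in it is inactive.
 */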

/* Estimated cost of inserting a jmp_exec_none. This value is tuned to Dolphin
 * ubershaders. It needs to be retuned in lockstep with changes to the cost
 * estimation heuristic.
 */
#define COST_JMP (19)

static uint32_t
cost_instr(agx_instr *I)
{
   /* TODO: Better heuristic */
   switch (I->op) {
   case AGX_OPCODE_DEVICE_LOAD:
   case AGX_OPCODE_TEXTURE_LOAD:
   case AGX_OPCODE_TEXTURE_SAMPLE:
      return 10;
   default:
      return 1;
   }
}

/*
 * Estimate the cost between the instruction and the branch target. This is an
 * input for our heuristic. The branch is guaranteed to be a forward branch.
 */
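
/* For example (hypothetical blocks): jumping from the middle of block A over
 * block B to the start of block C costs the instructions remaining in A plus
 * everything in B. With skip_to_end_of_target, C's own instructions are
 * counted too, up to but not including its last instruction.
 */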
static uint32_t
cost_between(agx_context *ctx, agx_block *from, agx_instr *from_I,
             agx_block *target, bool skip_to_end_of_target)
{
   uint32_t cost = 0;

   /* Consider the cost in the rest of this block */
   if (from_I != agx_last_instr(from)) {
      agx_foreach_instr_in_block_from(from, J, from_I) {
         /* If we reach the end, we're done */
         if (from == target && skip_to_end_of_target &&
             J == agx_last_instr(target))
            break;

         cost += cost_instr(J);
      }
   }

   if (from == target)
      return cost;

   /* Consider the cost in the subsequent blocks */
   agx_foreach_block_from(ctx, from, block) {
      if (block == from)
         continue;

      if (block == target && !skip_to_end_of_target)
         break;

      agx_foreach_instr_in_block(block, I) {
         if (block == target && I == agx_last_instr(target))
            break;

         cost += cost_instr(I);
      }

      if (block == target) {
         assert(skip_to_end_of_target);
         break;
      }
   }

   return cost;
}

static void
try_insert_jmp(agx_context *ctx, agx_block *from, agx_instr *from_I,
               agx_block *target, bool skip_to_end_of_target,
               unsigned inverse_probability)
{
   agx_builder b = agx_init_builder(ctx, agx_after_instr(from_I));

   /* If the control flow instruction was only inserted for its side effects,
    * there is nowhere to jump. Bail.
    */
   if (!target)
      return;

   /* If we do not insert a jump, we execute the predicated instructions
    * unconditionally, with an expected cost C.
    *
    * If we do insert a jump, then we pay the cost J of the jump, AND if we do
    * not take the jump, also the cost of the instructions C. The expected
    * cost if we insert a jump is therefore J + P(not all threads inactive) C.
    *
    * Therefore, we should insert a jump if:
    *
    *    J + P(not all threads inactive) C < C
    *
    * To model the implicit (i-cache, etc) costs of inserting a jump
    * instruction, we tie-break conservatively, comparing with < instead of
    * <=.
    *
    * Rearranging terms, we should NOT insert a jump if:
    *
    *    C < J / P(all threads inactive)
    */
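
   /* Worked example with the constants used by the callers below: for an
    * if/else we pass inverse_probability = 2, so with COST_JMP = 19 we only
    * insert a jump once the skippable instructions cost at least 38 units,
    * i.e. roughly four texture loads or 38 ALU instructions under
    * cost_instr's weights.
    */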
   uint32_t cost_instructions =
      cost_between(ctx, from, from_I, target, skip_to_end_of_target);

   if (cost_instructions < COST_JMP * inverse_probability)
      return;

   /* It looks like inserting a jump will be a win. Do so. */
   if (skip_to_end_of_target)
      agx_jmp_exec_none_after(&b, target);
   else
      agx_jmp_exec_none(&b, target);
}

void
agx_opt_jmp_none(agx_context *ctx)
{
   agx_foreach_block(ctx, blk) {
      /* Handle the beginning of blocks */
      agx_instr *first_ = agx_first_instr(blk);
      if (first_ && (first_->op == AGX_OPCODE_ELSE_ICMP ||
                     first_->op == AGX_OPCODE_ELSE_FCMP)) {

         /* The target of the else is the last block of the else, so we skip
          * to the end of that block (to start execution with the pop_exec).
          */
         try_insert_jmp(ctx, blk, first_, first_->target, true, 2);
      } else if (first_ &&
                 (first_->op == AGX_OPCODE_BREAK_IF_ICMP ||
                  first_->op == AGX_OPCODE_BREAK_IF_FCMP) &&
                 first_->nest == 1) {
         /* The target of the break is the block immediately after the end of
          * the loop, so jump to the end of the previous block to get the
          * appropriate pop_exec.
          *
          * Also, note we only do this for nest=1, to ensure we don't insert
          * jumps for breaks that sit inside if-statements within the loop. We
          * can't insert a jmp_exec_none inside the if because it would break
          * out of the loop for threads that are still running the loop but
          * merely predicated out due to the if-condition. This is similarly
          * why we don't bother handling unconditional break.
          *
          * TODO: This is not optimal, but fixing this would require
          * considerably more CFG gymnastics.
          */
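
         /* Illustrative layout (hypothetical block names, a sketch only):
          *
          *    B_break: break_if_icmp, target = B_after ...
          *    ...
          *    B_last:  last block of the loop, ends in the pop_exec
          *    B_after: first block after the loop (= first_->target)
          *
          * We jump to the end of B_last, i.e. agx_prev_block(first_->target),
          * so the pop_exec still executes.
          */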
         agx_block *target = agx_prev_block(first_->target);
         try_insert_jmp(ctx, blk, first_, target, true, 10);
      }

      /* Handle end of block instructions */
      agx_foreach_instr_in_block_rev(blk, I) {
         if (!instr_after_logical_end(I))
            break;

         if (I->op == AGX_OPCODE_IF_ICMP || I->op == AGX_OPCODE_IF_FCMP) {
            try_insert_jmp(ctx, blk, I, I->target, false, 2);
            break;
         }
      }
   }
}