1 /*
2 * Copyright (C) 2022 Collabora Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include "va_compiler.h"
25 #include "valhall_enums.h"
26 #include "bi_builder.h"
27
28 /*
29 * Merge NOPs with flow control with nearby instructions to eliminate the NOPs,
30 * according to the following rules:
31 *
32 * 1. Waits may be combined by waiting on a union of the slots.
33 * 2. Waits may be moved up as far as the first (last) asynchronous instruction
34 * writing to a slot waited on, at a performance cost.
35 * 3. Discard may be moved down, at a performance cost.
36 * 4. Reconverge must be on the last instruction of the block.
37 * 5. End must be on the last instruction of the block.
38 *
39 * For simplicity, this pass only merges within a single basic block. This
40 * should be sufficient. The algorithm works as follows.
41 *
42 * Because reconverge/end must be on the last instruction, there may be only
43 * one of these in a block. First check for a NOP at the end, and if it has
44 * reconverge/end flow control, merge with the penultimate instruction. Now we
45 * only have to worry about waits and discard.
46 *
47 * The cost of moving a wait up is assumed to be greater than the cost of moving
48 * a discard down, so we next move waits while we have more freedom. For each
49 * wait, merge with an instruction above that either is free or has another
50 * wait, and merge the flow control, aborting if we hit a message signaling the
51 * relevant slot. By maintaining the candidate instruction at all steps, this is
52 * implemented in a linear time walk.
53 *
54 * Finally, all that's left are discards. Move discards down to merge with a
55 * free instruction with a similar linear time walk.
56 *
57 * Since discarding helper threads is an optimization (not required for
58 * correctness), we may remove discards if we believe them harmful to the
59 * performance of the shader. Keeping with the local structure of the pass, we
60 * remove discards only if they are at the end of the last block and cannot be
61 * merged with any other instruction. Such a condition means every following
62 * instruction has incompatible flow control (wait and end). In practice, this
63 * means BLEND and ATEST run for helper threads in certain shaders to save a
64 * NOP, but BLEND is a no-op for helper threads anyway.
65 *
66 * One case we don't handle is merging "foo, bar, wait, reconverge" to
67 * "foo.wait, bar.reconverge". This sequence is rarely generated by the dataflow
68 * analysis, so we don't care for the complexity.
69 */
70
71 /*
72 * If the last instruction has reconverge or end, try to merge with the
73 * second to last instruction.
74 *
75 * Precondition: block has at least 2 instructions.
76 */
77 static void
merge_end_reconverge(bi_block * block)78 merge_end_reconverge(bi_block *block)
79 {
80 bi_instr *last = list_last_entry(&block->instructions, bi_instr, link);
81 bi_instr *penult = bi_prev_op(last);
82
83 if (last->op != BI_OPCODE_NOP) return;
84 if (last->flow != VA_FLOW_RECONVERGE && last->flow != VA_FLOW_END) return;
85
86 /* End implies all other flow control except for waiting on barriers (slot
87 * #7, with VA_FLOW_WAIT), so remove blocking flow control.
88 */
89 if (last->flow == VA_FLOW_END) {
90 while (penult->op == BI_OPCODE_NOP && penult->flow != VA_FLOW_WAIT) {
91 bi_remove_instruction(penult);
92
93 /* There may be nothing left */
94 if (list_is_singular(&block->instructions))
95 return;
96
97 penult = bi_prev_op(last);
98 }
99 }
100
101 /* If there is blocking flow control, we can't merge */
102 if (penult->flow != VA_FLOW_NONE) return;
103
104 /* Else, merge */
105 penult->flow = last->flow;
106 bi_remove_instruction(last);
107 }
108
109 /*
110 * Calculate the union of two waits. We may wait on any combination of slots #0,
111 * #1, #2 or the entirety of 0126 and 01267. If we wait on the entirety, the
112 * union is trivial. If we do not wait on slot #6 (by extension slot #7), we
113 * wait only on slots #0, #1, and #2, for which the waits are encoded as a
114 * bitset and the union is just a bitwise OR.
115 */
116 static enum va_flow
union_waits(enum va_flow x,enum va_flow y)117 union_waits(enum va_flow x, enum va_flow y)
118 {
119 assert(va_flow_is_wait_or_none(x) && va_flow_is_wait_or_none(y));
120
121 if ((x == VA_FLOW_WAIT) || (y == VA_FLOW_WAIT))
122 return VA_FLOW_WAIT;
123 else if ((x == VA_FLOW_WAIT0126) || (y == VA_FLOW_WAIT0126))
124 return VA_FLOW_WAIT0126;
125 else
126 return x | y;
127 }
128
129 static void
merge_waits(bi_block * block)130 merge_waits(bi_block *block)
131 {
132 /* Most recent instruction with which we can merge, or NULL if none */
133 bi_instr *last_free = NULL;
134
135 bi_foreach_instr_in_block_safe(block, I) {
136 if (last_free != NULL &&
137 I->op == BI_OPCODE_NOP && va_flow_is_wait_or_none(I->flow)) {
138
139 /* Merge waits with compatible instructions */
140 last_free->flow = union_waits(last_free->flow, I->flow);
141 bi_remove_instruction(I);
142 continue;
143 }
144
145 /* Don't move waits past async instructions, since they might be what
146 * we're waiting for. If we wanted to optimize this case, we could check
147 * the signaled slots.
148 */
149 if (bi_opcode_props[I->op].message)
150 last_free = NULL;
151
152 /* We can only merge with instructions whose flow control is a wait.
153 * This includes such an instruction after merging in a wait. It also
154 * includes async instructions.
155 */
156 if (va_flow_is_wait_or_none(I->flow))
157 last_free = I;
158 }
159 }
160
161 static bool
bi_is_first_instr(bi_block * block,bi_instr * I)162 bi_is_first_instr(bi_block *block, bi_instr *I)
163 {
164 return block->instructions.next == &I->link;
165 }
166
167 static void
merge_discard(bi_block * block)168 merge_discard(bi_block *block)
169 {
170 bi_instr *last_free = NULL;
171
172 bi_foreach_instr_in_block_safe_rev(block, I) {
173 if ((I->op == BI_OPCODE_NOP) && (I->flow == VA_FLOW_DISCARD)) {
174 /* Try to merge with the instruction *preceding* discard, because
175 * because flow control happens at the end of an instruction and
176 * discard is a NOP.
177 */
178 if (!bi_is_first_instr(block, I)) {
179 bi_instr *prev = bi_prev_op(I);
180
181 if (prev->flow == VA_FLOW_NONE) {
182 prev->flow = VA_FLOW_DISCARD;
183 bi_remove_instruction(I);
184 continue;
185 }
186 }
187
188 /* Or try to merge with the next instruction with no flow control */
189 if (last_free != NULL) {
190 last_free->flow = VA_FLOW_DISCARD;
191 bi_remove_instruction(I);
192 continue;
193 }
194
195 /* If there's nowhere to merge and this is the end of the shader, just
196 * remove the discard.
197 */
198 if (!block->successors[0] && !block->successors[1]) {
199 bi_remove_instruction(I);
200 continue;
201 }
202 }
203
204 /* Allow merging into free instructions */
205 if (I->flow == VA_FLOW_NONE)
206 last_free = I;
207 }
208 }
209
210 void
va_merge_flow(bi_context * ctx)211 va_merge_flow(bi_context *ctx)
212 {
213 bi_foreach_block(ctx, block) {
214 /* If there are less than 2 instructions, there's nothing to merge */
215 if (list_is_empty(&block->instructions)) continue;
216 if (list_is_singular(&block->instructions)) continue;
217
218 merge_end_reconverge(block);
219 merge_waits(block);
220
221 if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend)
222 merge_discard(block);
223 }
224 }
225