/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir_instr_set.h"
#include "nir_search_helpers.h"
#include "nir_builder.h"
#include "util/u_vector.h"

/* Partial redundancy elimination of compares
 *
 * Searches for comparisons of the form 'a cmp b' that dominate arithmetic
 * instructions like 'b - a'.  The comparison is replaced by the arithmetic
 * instruction, and the result is compared with zero.  For example,
 *
 *    vec1 32 ssa_111 = flt 0.37, ssa_110.w
 *    if ssa_111 {
 *       block block_1:
 *       vec1 32 ssa_112 = fadd ssa_110.w, -0.37
 *       ...
 *
 * becomes
 *
 *    vec1 32 ssa_111 = fadd ssa_110.w, -0.37
 *    vec1 32 ssa_112 = flt 0.0, ssa_111
 *    if ssa_112 {
 *       block block_1:
 *       ...
 */

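/* A minimal usage sketch (the surrounding loop is illustrative, assuming a
 * typical driver-style optimization loop; only nir_opt_comparison_pre is
 * defined in this file):
 *
 *    bool progress;
 *    do {
 *       progress = false;
 *       NIR_PASS(progress, shader, nir_opt_comparison_pre);
 *    } while (progress);
 *
 * Running to a fixed point this way may also pick up the repeated-compare
 * opportunities noted in the FINISHME below.
 */
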
struct block_queue {
   /**
    * Stack of blocks from the current location in the CFG to the entry point
    * of the function.
    *
    * This is sort of a poor man's dominator tree.
    */
   struct exec_list blocks;

   /** List of freed block_instructions structures that can be reused. */
   struct exec_list reusable_blocks;
};

struct block_instructions {
   struct exec_node node;

   /**
    * Set of comparison instructions from the block that are candidates for
    * being replaced by add instructions.
    */
   struct u_vector instructions;
};

static void
block_queue_init(struct block_queue *bq)
{
   exec_list_make_empty(&bq->blocks);
   exec_list_make_empty(&bq->reusable_blocks);
}

static void
block_queue_finish(struct block_queue *bq)
{
   struct block_instructions *n;

   while ((n = (struct block_instructions *) exec_list_pop_head(&bq->blocks)) != NULL) {
      u_vector_finish(&n->instructions);
      free(n);
   }

   while ((n = (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks)) != NULL) {
      free(n);
   }
}

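/* Allocate a block_instructions structure, reusing one from the free list
 * when possible, and push it onto the stack of blocks.  Returns NULL if
 * either the allocation or the instruction vector initialization fails.
 */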
static struct block_instructions *
push_block(struct block_queue *bq)
{
   struct block_instructions *bi =
      (struct block_instructions *) exec_list_pop_head(&bq->reusable_blocks);

   if (bi == NULL) {
      bi = calloc(1, sizeof(struct block_instructions));

      if (bi == NULL)
         return NULL;
   }

   if (!u_vector_init_pow2(&bi->instructions, 8, sizeof(nir_alu_instr *))) {
      free(bi);
      return NULL;
   }

   exec_list_push_tail(&bq->blocks, &bi->node);

   return bi;
}

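/* Remove a block_instructions structure from the stack of blocks and put it
 * on the reusable_blocks free list for a later push_block call.
 */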
static void
pop_block(struct block_queue *bq, struct block_instructions *bi)
{
   u_vector_finish(&bi->instructions);
   exec_node_remove(&bi->node);
   exec_list_push_head(&bq->reusable_blocks, &bi->node);
}

static void
add_instruction_for_block(struct block_instructions *bi,
                          nir_alu_instr *alu)
{
   nir_alu_instr **data =
      u_vector_add(&bi->instructions);

   *data = alu;
}

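/* Rewrite orig_cmp as a comparison of orig_add's result against zero,
 * materializing a fresh fadd at the compare's location.  For example, with
 * zero_on_left == false, a pair like
 *
 *    ssa_1 = flt a, b
 *    ...
 *    ssa_2 = fadd a, -b
 *
 * becomes
 *
 *    ssa_3 = fadd a, -b
 *    ssa_4 = flt ssa_3, 0.0
 *
 * and all uses of the original values are rewritten (through moves that
 * preserve the original writemasks) to the new ones.
 */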
static void
rewrite_compare_instruction(nir_builder *bld, nir_alu_instr *orig_cmp,
                            nir_alu_instr *orig_add, bool zero_on_left)
{
   bld->cursor = nir_before_instr(&orig_cmp->instr);

   /* This is somewhat tricky.  The compare instruction may be something like
    * (fcmp, a, b) while the add instruction is something like (fadd, fneg(a),
    * b).  This is problematic because the SSA value for the fneg(a) may not
    * exist yet at the compare instruction.
    *
    * We fabricate the operands of the new add.  This is done using
    * information provided by zero_on_left.  If zero_on_left is true, we know
    * the resulting compare instruction is (fcmp, 0.0, (fadd, x, y)).  If the
    * original compare instruction was (fcmp, a, b), x = b and y = -a.  If
    * zero_on_left is false, the resulting compare instruction is (fcmp,
    * (fadd, x, y), 0.0) and x = a and y = -b.
    */
   nir_ssa_def *const a = nir_ssa_for_alu_src(bld, orig_cmp, 0);
   nir_ssa_def *const b = nir_ssa_for_alu_src(bld, orig_cmp, 1);

   nir_ssa_def *const fadd = zero_on_left
      ? nir_fadd(bld, b, nir_fneg(bld, a))
      : nir_fadd(bld, a, nir_fneg(bld, b));

   nir_ssa_def *const zero =
      nir_imm_floatN_t(bld, 0.0, orig_add->dest.dest.ssa.bit_size);

   nir_ssa_def *const cmp = zero_on_left
      ? nir_build_alu(bld, orig_cmp->op, zero, fadd, NULL, NULL)
      : nir_build_alu(bld, orig_cmp->op, fadd, zero, NULL, NULL);

   /* Generating extra moves of the results is the easy way to make sure the
    * writemasks match the original instructions.  Later optimization passes
    * will clean these up.  This is similar to nir_replace_instr (in
    * nir_search.c).
    */
   nir_alu_instr *mov_add = nir_alu_instr_create(bld->shader, nir_op_mov);
   mov_add->dest.write_mask = orig_add->dest.write_mask;
   nir_ssa_dest_init(&mov_add->instr, &mov_add->dest.dest,
                     orig_add->dest.dest.ssa.num_components,
                     orig_add->dest.dest.ssa.bit_size, NULL);
   mov_add->src[0].src = nir_src_for_ssa(fadd);

   nir_builder_instr_insert(bld, &mov_add->instr);

   nir_alu_instr *mov_cmp = nir_alu_instr_create(bld->shader, nir_op_mov);
   mov_cmp->dest.write_mask = orig_cmp->dest.write_mask;
   nir_ssa_dest_init(&mov_cmp->instr, &mov_cmp->dest.dest,
                     orig_cmp->dest.dest.ssa.num_components,
                     orig_cmp->dest.dest.ssa.bit_size, NULL);
   mov_cmp->src[0].src = nir_src_for_ssa(cmp);

   nir_builder_instr_insert(bld, &mov_cmp->instr);

   nir_ssa_def_rewrite_uses(&orig_cmp->dest.dest.ssa,
                            &mov_cmp->dest.dest.ssa);
   nir_ssa_def_rewrite_uses(&orig_add->dest.dest.ssa,
                            &mov_add->dest.dest.ssa);

   /* We know these have no more uses because we just rewrote them all, so we
    * can remove them.
    */
   nir_instr_remove(&orig_cmp->instr);
   nir_instr_remove(&orig_add->instr);
}

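/* Process 'block' and recurse into every block it dominates: collect
 * candidate comparisons from this block and rewrite any fadd that matches a
 * dominating comparison.  Returns true if any rewrite was made.
 */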
static bool
comparison_pre_block(nir_block *block, struct block_queue *bq, nir_builder *bld)
{
   bool progress = false;

   struct block_instructions *bi = push_block(bq);
   if (bi == NULL)
      return false;

   /* Starting with the current block, examine each instruction.  If the
    * instruction is a comparison that matches the '±a cmp ±b' pattern, add
    * it to the block_instructions::instructions set.  If the instruction is
    * an add instruction, walk up the block queue looking at the stored
    * instructions.  If a matching comparison is found, move the addition and
    * replace the comparison with a different comparison based on the result
    * of the addition.  All of the blocks in the queue are guaranteed to be
    * dominators of the current block.
    *
    * After processing the current block, recurse into the blocks dominated
    * by the current block.
    */
   nir_foreach_instr_safe(instr, block) {
      if (instr->type != nir_instr_type_alu)
         continue;

      nir_alu_instr *const alu = nir_instr_as_alu(instr);

      if (alu->dest.dest.ssa.num_components != 1)
         continue;

      if (alu->dest.saturate)
         continue;

      /* All-zero (identity) swizzle for the is_not_const_zero calls below.
       * Only scalar instructions reach this point, so only component 0 of
       * each source is ever examined.
       */
      static const uint8_t swizzle[NIR_MAX_VEC_COMPONENTS] = {0};

      switch (alu->op) {
      case nir_op_fadd: {
         /* If the instruction is fadd, check it against comparison
          * instructions that dominate it.
          */
         struct block_instructions *b =
            (struct block_instructions *) exec_list_get_head_raw(&bq->blocks);

         while (b->node.next != NULL) {
            nir_alu_instr **a;
            bool rewrote_compare = false;

            u_vector_foreach(a, &b->instructions) {
               nir_alu_instr *const cmp = *a;

               if (cmp == NULL)
                  continue;

               /* The operands of both instructions are, with some liberty,
                * commutative.  Check all four permutations.  The third and
                * fourth permutations are negations of the first two.
                */
               if ((nir_alu_srcs_equal(cmp, alu, 0, 0) &&
                    nir_alu_srcs_negative_equal(cmp, alu, 1, 1)) ||
                   (nir_alu_srcs_equal(cmp, alu, 0, 1) &&
                    nir_alu_srcs_negative_equal(cmp, alu, 1, 0))) {
                  /* These are the cases where (A cmp B) matches either (A +
                   * -B) or (-B + A)
                   *
                   *    A cmp B <=> A + -B cmp 0
                   */
                  rewrite_compare_instruction(bld, cmp, alu, false);

                  *a = NULL;
                  rewrote_compare = true;
                  break;
               } else if ((nir_alu_srcs_equal(cmp, alu, 1, 0) &&
                           nir_alu_srcs_negative_equal(cmp, alu, 0, 1)) ||
                          (nir_alu_srcs_equal(cmp, alu, 1, 1) &&
                           nir_alu_srcs_negative_equal(cmp, alu, 0, 0))) {
                  /* This is the case where (A cmp B) matches (B + -A) or (-A
                   * + B).
                   *
                   *    A cmp B <=> 0 cmp B + -A
                   */
                  rewrite_compare_instruction(bld, cmp, alu, true);

                  *a = NULL;
                  rewrote_compare = true;
                  break;
               }
            }

            /* Bail after a compare in the most dominating block is found.
             * This is necessary because 'alu' has been removed from the
             * instruction stream.  Should there be a matching compare in
             * another block, calling rewrite_compare_instruction again will
             * try to operate on a node that is not in the list as if it were
             * in the list.
             *
             * FINISHME: There may be opportunity for additional optimization
             * here.  I discovered this problem due to a shader in Guacamelee.
             * It may be possible to rewrite the matching compares that are
             * encountered later to reuse the result from the compare that
             * was first rewritten.  It's also possible that this is just
             * taken care of by calling the optimization pass repeatedly.
             */
            if (rewrote_compare) {
               progress = true;
               break;
            }

            b = (struct block_instructions *) b->node.next;
         }

         break;
      }

      case nir_op_flt:
      case nir_op_fge:
      case nir_op_fneu:
      case nir_op_feq:
         /* If the instruction is a comparison that is used by an
          * if-statement and neither operand is immediate value 0, add it to
          * the set.
          */
         if (is_used_by_if(alu) &&
             is_not_const_zero(NULL, alu, 0, 1, swizzle) &&
             is_not_const_zero(NULL, alu, 1, 1, swizzle))
            add_instruction_for_block(bi, alu);

         break;

      default:
         break;
      }
   }

   for (unsigned i = 0; i < block->num_dom_children; i++) {
      nir_block *child = block->dom_children[i];

      if (comparison_pre_block(child, bq, bld))
         progress = true;
   }

   pop_block(bq, bi);

   return progress;
}

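/* Run the pass on a single function implementation.  Dominance metadata is
 * required (and requested here); block index and dominance are preserved
 * when the function is changed.
 */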
bool
nir_opt_comparison_pre_impl(nir_function_impl *impl)
{
   struct block_queue bq;
   nir_builder bld;

   block_queue_init(&bq);
   nir_builder_init(&bld, impl);

   nir_metadata_require(impl, nir_metadata_dominance);

   const bool progress =
      comparison_pre_block(nir_start_block(impl), &bq, &bld);

   block_queue_finish(&bq);

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}

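/* Shader-level entry point: runs the pass on every function that has an
 * implementation and reports whether anything changed.
 */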
bool
nir_opt_comparison_pre(nir_shader *shader)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= nir_opt_comparison_pre_impl(function->impl);
   }

   return progress;
}