/*
 * Copyright 2021 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "util/macros.h"
#include "agx_builder.h"
#include "agx_compiler.h"
#include "agx_minifloat.h"
#include "agx_opcodes.h"

/* AGX peephole optimizer responsible for instruction combining. It operates in
 * a forward direction and a backward direction, in each case traversing in
 * source order. SSA means the forward pass satisfies the invariant:
 *
 *    Every def is visited before any of its uses.
 *
 * Dually, the backward pass satisfies the invariant:
 *
 *    Every use of a def is visited before the def.
 *
 * This means the forward pass can propagate modifiers forward, whereas the
 * backwards pass propagates modifiers backward. Consider an example:
 *
 *    1 = fabs 0
 *    2 = fround 1
 *    3 = fsat 2
 *
 * The forwards pass would propagate the fabs to the fround (since we can
 * look up the fabs from the fround source and do the replacement). By contrast
 * the backwards pass would propagate the fsat back to the fround (since when
 * we see the fround we know it has only a single user, fsat). Propagatable
 * instructions have natural directions (like pushforwards and pullbacks).
 *
 * We are careful to update the tracked state whenever we modify an instruction
 * to ensure the passes are linear-time and converge in a single iteration.
 *
 * Size conversions are worth special discussion. Consider the snippet:
 *
 *    2 = fadd 0, 1
 *    3 = f2f16 2
 *    4 = fround 3
 *
 * A priori, we can move the f2f16 in either direction. But the two choices are
 * not equivalent -- if we move it up to the fadd, we get FP16 for two
 * instructions, whereas if we push it into the fround, we effectively get FP32
 * for two instructions. So f2f16 is backwards. Likewise, consider
 *
 *    2 = fadd 0, 1
 *    3 = f2f32 2
 *    4 = fround 3
 *
 * This time if we move f2f32 up to the fadd, we get FP32 for two, but if we
 * move it down to the fround, we get FP16 for two. So f2f32 is forwards.
 */

/* A floating-point move is written as fadd/hadd with a -0.0 source, since
 * adding -0.0 is the identity. Both passes pattern-match that form.
 */
static bool
agx_is_fmov(agx_instr *def)
{
   return (def->op == AGX_OPCODE_FADD || def->op == AGX_OPCODE_HADD) &&
          agx_is_equiv(def->src[1], agx_negzero());
}

/* Compose floating-point modifiers with floating-point sources */
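/* The modifiers on "to" are applied on top of those already on "from": an
 * outer abs absorbs any inner negate (|-x| = |x|), and the remaining negates
 * combine by XOR.
 */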

static agx_index
agx_compose_float_src(agx_index to, agx_index from)
{
   if (to.abs) {
      from.neg = false;
      from.abs = true;
   }

   from.neg ^= to.neg;

   return from;
}

static void
agx_optimizer_fmov(agx_instr **defs, agx_instr *ins)
{
   agx_foreach_ssa_src(ins, s) {
      agx_index src = ins->src[s];
      agx_instr *def = defs[src.value];

      if (def == NULL)
         continue; /* happens for phis in loops */
      if (!agx_is_fmov(def))
         continue;
      if (def->saturate)
         continue;
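      /* Sources 2 and 3 of fcmpsel are the values being selected rather than
       * float operands, so don't push float modifiers into them.
       */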
      if (ins->op == AGX_OPCODE_FCMPSEL && s >= 2)
         continue;

      /* We can fold f2f32 into 32-bit instructions, but we can't fold f2f16
       * into 16-bit instructions, since the latter would implicitly promote to
       * a 32-bit instruction which is not exact.
       */
      assert(def->src[0].size == AGX_SIZE_32 ||
             def->src[0].size == AGX_SIZE_16);
      assert(src.size == AGX_SIZE_32 || src.size == AGX_SIZE_16);

      if (src.size == AGX_SIZE_16 && def->src[0].size == AGX_SIZE_32)
         continue;

      ins->src[s] = agx_compose_float_src(src, def->src[0]);
   }
}

static bool
image_write_source_can_be_immediate(agx_instr *I, unsigned s)
{
   bool block = I->op == AGX_OPCODE_BLOCK_IMAGE_STORE;
   assert(I->op == AGX_OPCODE_IMAGE_WRITE || block);

   /* LOD can always be immediate. Actually, it's just zero so far; we don't
    * support nonzero LOD for images yet.
    */
   if (s == 2 && !block)
      return true;

   /* If the "bindless" source (source 3) is an immediate, it means we don't
    * have a bindless image, instead we have a texture state index. We're
    * allowed to have immediate texture state registers (source 4). However,
    * we're not allowed to have immediate bindless offsets (also source 4).
    */
   unsigned base = block ? 0 : 3;
   bool is_texture_state = (I->src[base].type == AGX_INDEX_IMMEDIATE);
   if (s == (base + 1) && is_texture_state)
      return true;

   /* Otherwise, must be from a register */
   return false;
}

static void
agx_optimizer_inline_imm(agx_instr **defs, agx_instr *I)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      if (src.neg)
         continue;

      agx_instr *def = defs[src.value];
      if (!def || def->op != AGX_OPCODE_MOV_IMM)
         continue;

      uint8_t value = def->imm;
      uint16_t value_u16 = def->imm;
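      /* Comparing the truncated copies back against def->imm below tells us
       * whether the constant fits losslessly in 8 or 16 bits.
       */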

      bool float_src = agx_is_float_src(I, s);

      if (I->op == AGX_OPCODE_ST_TILE && s == 0)
         continue;
      if (I->op == AGX_OPCODE_ZS_EMIT && s != 0)
         continue;
      if (I->op == AGX_OPCODE_TEXTURE_SAMPLE && s != 4)
         continue;
      if ((I->op == AGX_OPCODE_DEVICE_STORE ||
           I->op == AGX_OPCODE_LOCAL_STORE || I->op == AGX_OPCODE_ATOMIC ||
           I->op == AGX_OPCODE_LOCAL_ATOMIC) &&
          s != 2)
         continue;
      if (I->op == AGX_OPCODE_ST_VARY && s != 0)
         continue;
      if ((I->op == AGX_OPCODE_LOCAL_LOAD || I->op == AGX_OPCODE_DEVICE_LOAD ||
           I->op == AGX_OPCODE_STACK_STORE) &&
          s != 1)
         continue;
      if (I->op == AGX_OPCODE_SPLIT)
         continue;

      if ((I->op == AGX_OPCODE_IMAGE_WRITE ||
           I->op == AGX_OPCODE_BLOCK_IMAGE_STORE) &&
          !image_write_source_can_be_immediate(I, s))
         continue;

      if (float_src) {
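         /* Only constants that are exactly representable in the hardware's
          * small float immediate encoding can be inlined, hence the
          * agx_minifloat_exact check.
          */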
         bool fp16 = (def->dest[0].size == AGX_SIZE_16);
         assert(fp16 || (def->dest[0].size == AGX_SIZE_32));

         float f = fp16 ? _mesa_half_to_float(def->imm) : uif(def->imm);
         if (!agx_minifloat_exact(f))
            continue;

         I->src[s] = agx_immediate_f(f);
      } else if (value == def->imm) {
         I->src[s] = agx_immediate(value);
      } else if (value_u16 == def->imm && agx_allows_16bit_immediate(I)) {
         I->src[s] = agx_abs(agx_immediate(value_u16));
      } else if ((I->op == AGX_OPCODE_IADD || I->op == AGX_OPCODE_IMAD) &&
                 s == agx_negate_src_index(I)) {
         unsigned bits = agx_size_align_16(def->dest[0].size) * 16;
         uint64_t mask = BITFIELD64_MASK(bits);
         uint64_t negated = (-def->imm) & mask;
         value = negated;

         /* Use the negated immediate if it still fits in 8 bits */
         if (value == negated) {
            I->src[s] = agx_neg(agx_immediate(value));
         }
      }
   }
}

/*
 * Fuse not into and/or/xor. Specifically, acts on not and fuses:
 *
 *    not(and(x, y)) -> nand(x, y)
 *    not(or(x, y))  -> nor(x, y)
 *    not(xor(x, y)) -> xnor(x, y)
 */
static bool
agx_optimizer_not(agx_instr *I, agx_instr *use)
{
   /* Check for bit op and use of not op */
   if (I->op != AGX_OPCODE_BITOP || use->op != AGX_OPCODE_NOT)
      return false;

   /* Complementing the truth table inverts the result, turning and/or/xor
    * into nand/nor/xnor.
    */
   I->truth_table ^= 0xF;
   I->dest[0] = use->dest[0];

   return true;
}

static bool
agx_optimizer_fmov_rev(agx_instr *I, agx_instr *use)
{
   if (!agx_is_fmov(use))
      return false;
   if (use->src[0].neg || use->src[0].abs)
      return false;

   /* We can fold f2f16 into 32-bit instructions, but we can't fold f2f32 into
    * 16-bit instructions, since the latter would implicitly promote to a
    * 32-bit instruction which is not exact.
    */
   assert(use->dest[0].size == AGX_SIZE_32 || use->dest[0].size == AGX_SIZE_16);
   assert(I->dest[0].size == AGX_SIZE_32 || I->dest[0].size == AGX_SIZE_16);

   if (I->dest[0].size == AGX_SIZE_16 && use->dest[0].size == AGX_SIZE_32)
      return false;

   /* saturate(saturate(x)) = saturate(x) */
   I->saturate |= use->saturate;
   I->dest[0] = use->dest[0];
   return true;
}

static bool
agx_supports_zext(agx_instr *I, unsigned s)
{
   switch (I->op) {
   case AGX_OPCODE_IADD:
   case AGX_OPCODE_IMAD:
   case AGX_OPCODE_ICMP:
   case AGX_OPCODE_INTL:
   case AGX_OPCODE_FFS:
   case AGX_OPCODE_BITREV:
   case AGX_OPCODE_BFI:
   case AGX_OPCODE_BFEIL:
   case AGX_OPCODE_EXTR:
   case AGX_OPCODE_BITOP:
   case AGX_OPCODE_WHILE_ICMP:
   case AGX_OPCODE_IF_ICMP:
   case AGX_OPCODE_ELSE_ICMP:
   case AGX_OPCODE_BREAK_IF_ICMP:
   case AGX_OPCODE_ICMP_BALLOT:
   case AGX_OPCODE_ICMP_QUAD_BALLOT:
      return true;
   case AGX_OPCODE_ICMPSEL:
      /* Only the comparisons can be extended, not the selection */
      return s < 2;
   default:
      return false;
   }
}

static void
agx_optimizer_copyprop(agx_context *ctx, agx_instr **defs, agx_instr *I)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      agx_instr *def = defs[src.value];

      if (def == NULL)
         continue; /* happens for phis in loops */
      if (def->op != AGX_OPCODE_MOV)
         continue;

      /* At the moment, not all instructions support size conversions. Notably
       * RA pseudo instructions don't handle size conversions. This should be
       * refined in the future.
       */
      if (def->src[0].size != src.size &&
          !(def->src[0].size < src.size && agx_supports_zext(I, s)))
         continue;

      /* Optimize split(64-bit uniform) so we can get better copyprop of the
       * 32-bit uniform parts. This helps reduce moves with 64-bit uniforms.
       */
      if (I->op == AGX_OPCODE_SPLIT && def->src[0].type == AGX_INDEX_UNIFORM &&
          src.size == AGX_SIZE_64 && I->dest[0].size == AGX_SIZE_32) {

         assert(I->nr_dests == 2 && "decomposing a 64-bit scalar");
         agx_builder b = agx_init_builder(ctx, agx_before_instr(I));

         agx_index lo = def->src[0];
         lo.size = AGX_SIZE_32;

         agx_index hi = lo;
         hi.value += 2 /* half of 64-bits = 32-bits = 2 x 16-bits */;

         defs[I->dest[0].value] = agx_mov_to(&b, I->dest[0], lo);
         defs[I->dest[1].value] = agx_mov_to(&b, I->dest[1], hi);

         agx_remove_instruction(I);
         continue;
      }

      /* Immediate inlining happens elsewhere */
      if (def->src[0].type == AGX_INDEX_IMMEDIATE)
         continue;

      /* ALU instructions cannot take 64-bit */
      if (def->src[0].size == AGX_SIZE_64 &&
          !(I->op == AGX_OPCODE_DEVICE_LOAD && s == 0) &&
          !(I->op == AGX_OPCODE_DEVICE_STORE && s == 1) &&
          !(I->op == AGX_OPCODE_ATOMIC && s == 1))
         continue;

      agx_replace_src(I, s, def->src[0]);

      /* If we are zero-extending into an instruction that distinguishes sign
       * and zero extend, make sure we pick zero-extend.
       */
      if (def->src[0].size < src.size &&
          (I->op == AGX_OPCODE_IMAD || I->op == AGX_OPCODE_IADD)) {

         assert(agx_supports_zext(I, s));
         I->src[s].abs = true;
      }
   }
}

/*
 * Fuse conditions into if. Specifically, acts on if_icmp and fuses:
 *
 *    if_icmp(cmp(x, y, *), 0, ne/eq) -> if_cmp(x, y, *)
 */
static void
agx_optimizer_if_cmp(agx_instr **defs, agx_instr *I)
{
   /* Check for unfused if */
   if (!agx_is_equiv(I->src[1], agx_zero()) || I->icond != AGX_ICOND_UEQ ||
       I->src[0].type != AGX_INDEX_NORMAL)
      return;

   /* Check for condition */
   agx_instr *def = defs[I->src[0].value];
   if (def->op != AGX_OPCODE_ICMP && def->op != AGX_OPCODE_FCMP)
      return;

   /* Fuse */
   I->src[0] = def->src[0];
   I->src[1] = def->src[1];
   I->invert_cond = def->invert_cond ^ !I->invert_cond;

   if (def->op == AGX_OPCODE_ICMP) {
      I->op = AGX_OPCODE_IF_ICMP;
      I->icond = def->icond;
   } else {
      I->op = AGX_OPCODE_IF_FCMP;
      I->fcond = def->fcond;
   }
}

/*
 * Fuse invert into if. Acts on if_icmp and fuses:
 *
 *    if_icmp(xor(x, 1), 0, ne) -> if_cmp(x, 0, eq)
 */
static void
agx_optimizer_if_not(agx_instr **defs, agx_instr *I)
{
   /* Check for unfused if */
   if (!agx_is_equiv(I->src[1], agx_zero()) || I->icond != AGX_ICOND_UEQ ||
       I->src[0].type != AGX_INDEX_NORMAL)
      return;

   /* Check for invert */
   agx_instr *def = defs[I->src[0].value];
   if (def->op != AGX_OPCODE_BITOP ||
       !agx_is_equiv(def->src[1], agx_immediate(1)) ||
       def->truth_table != AGX_BITOP_XOR)
      return;

   /* Fuse */
   I->src[0] = def->src[0];
   I->invert_cond = !I->invert_cond;
}

/*
 * Fuse conditions into select. Specifically, acts on icmpsel and fuses:
 *
 *    icmpsel(cmp(x, y, *), 0, z, w, eq) -> cmpsel(x, y, w, z, *)
 *
 * Care must be taken to invert the condition by swapping cmpsel arguments.
 */
static void
agx_optimizer_cmpsel(agx_instr **defs, agx_instr *I)
{
   /* Check for unfused select */
   if (!agx_is_equiv(I->src[1], agx_zero()) || I->icond != AGX_ICOND_UEQ ||
       I->src[0].type != AGX_INDEX_NORMAL)
      return;

   /* Check for condition */
   agx_instr *def = defs[I->src[0].value];
   if (def->op != AGX_OPCODE_ICMP && def->op != AGX_OPCODE_FCMP)
      return;

   /* Fuse */
   I->src[0] = def->src[0];
   I->src[1] = def->src[1];

   /* In the unfused select, the condition is inverted due to the form:
    *
    *    (cond == 0) ? x : y
    *
    * So we need to swap the arguments when fusing to become cond ? y : x. If
    * the condition was supposed to be inverted, we don't swap since it's
    * already inverted. cmpsel does not have an invert_cond bit to use.
    */
   if (!def->invert_cond) {
      agx_index temp = I->src[2];
      I->src[2] = I->src[3];
      I->src[3] = temp;
   }

   if (def->op == AGX_OPCODE_ICMP) {
      I->op = AGX_OPCODE_ICMPSEL;
      I->icond = def->icond;
   } else {
      I->op = AGX_OPCODE_FCMPSEL;
      I->fcond = def->fcond;
   }
}

/*
 * Fuse conditions into ballots:
 *
 *    ballot(cmp(x, y)) -> ballot_cmp(x, y)
 */
static void
agx_optimizer_ballot(agx_context *ctx, agx_instr **defs, agx_instr *I)
{
   if (I->src[0].type != AGX_INDEX_NORMAL)
      return;

   agx_instr *def = defs[I->src[0].value];
   if (!def || (def->op != AGX_OPCODE_ICMP && def->op != AGX_OPCODE_FCMP))
      return;

   bool quad = I->op == AGX_OPCODE_QUAD_BALLOT;
   assert(quad || I->op == AGX_OPCODE_BALLOT);

   /* Replace with a fused instruction since the # of sources changes */
   agx_builder b = agx_init_builder(ctx, agx_before_instr(I));

   agx_instr *fused = agx_icmp_ballot_to(
      &b, I->dest[0], def->src[0], def->src[1], def->icond, def->invert_cond);

   if (def->op == AGX_OPCODE_ICMP) {
      fused->op = quad ? AGX_OPCODE_ICMP_QUAD_BALLOT : AGX_OPCODE_ICMP_BALLOT;
   } else {
      fused->op = quad ? AGX_OPCODE_FCMP_QUAD_BALLOT : AGX_OPCODE_FCMP_BALLOT;
   }

   agx_remove_instruction(I);
}

/*
 * Fuse not srcs into bitop.
 */
static void
agx_optimizer_bitop(agx_instr **defs, agx_instr *I)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      agx_instr *def = defs[src.value];

      /* Check for not src */
      if (def->op != AGX_OPCODE_NOT)
         continue;

      /* Fold the not into the truth table by remapping its bits: inverting
       * src0 swaps adjacent bits, inverting src1 swaps the two bit pairs.
       */
      if (s == 0) {
         I->truth_table =
            ((I->truth_table & 0x5) << 1) | ((I->truth_table & 0xa) >> 1);
      } else if (s == 1) {
         I->truth_table = ((I->truth_table & 0x3) << 2) | (I->truth_table >> 2);
      }

      /* Fuse */
      I->src[s] = def->src[0];
   }
}

/*
 * Fuse sign-extends into addition-like instructions.
 */
static void
agx_optimizer_signext(agx_instr **defs, agx_instr *I)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      agx_instr *def = defs[src.value];

      if (def == NULL || def->op != AGX_OPCODE_SIGNEXT)
         continue;

      agx_replace_src(I, s, def->src[0]);
      assert(!I->src[s].abs && "sign-extended");
   }
}

void
agx_optimizer_forward(agx_context *ctx)
{
   agx_instr **defs = calloc(ctx->alloc, sizeof(*defs));
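   /* defs[v] maps each SSA value v to its defining instruction, filled in as
    * we walk the program in source order.
    */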

   agx_foreach_instr_global_safe(ctx, I) {
      struct agx_opcode_info info = agx_opcodes_info[I->op];

      agx_foreach_ssa_dest(I, d) {
         defs[I->dest[d].value] = I;
      }

      /* Optimize moves */
      agx_optimizer_copyprop(ctx, defs, I);

      /* Propagate fmov down */
      if (info.is_float || I->op == AGX_OPCODE_FCMPSEL ||
          I->op == AGX_OPCODE_FCMP)
         agx_optimizer_fmov(defs, I);

      /* Inline immediates if we can. TODO: systematic */
      if (I->op != AGX_OPCODE_COLLECT && I->op != AGX_OPCODE_IMAGE_LOAD &&
          I->op != AGX_OPCODE_TEXTURE_LOAD &&
          I->op != AGX_OPCODE_UNIFORM_STORE && I->op != AGX_OPCODE_EXPORT)
         agx_optimizer_inline_imm(defs, I);

      if (I->op == AGX_OPCODE_IF_ICMP) {
         agx_optimizer_if_not(defs, I);
         agx_optimizer_if_cmp(defs, I);
      } else if (I->op == AGX_OPCODE_ICMPSEL) {
         agx_optimizer_cmpsel(defs, I);
      } else if (I->op == AGX_OPCODE_BALLOT ||
                 I->op == AGX_OPCODE_QUAD_BALLOT) {
         agx_optimizer_ballot(ctx, defs, I);
      } else if (I->op == AGX_OPCODE_BITOP) {
         agx_optimizer_bitop(defs, I);
      } else if (I->op == AGX_OPCODE_IADD || I->op == AGX_OPCODE_IMAD) {
         agx_optimizer_signext(defs, I);
      }
   }

   free(defs);
}

static void
record_use(agx_instr **uses, BITSET_WORD *multiple, agx_instr *I, unsigned src)
{
   unsigned v = I->src[src].value;

   if (uses[v])
      BITSET_SET(multiple, v);
   else
      uses[v] = I;
}

void
agx_optimizer_backward(agx_context *ctx)
{
   agx_instr **uses = calloc(ctx->alloc, sizeof(*uses));
   BITSET_WORD *multiple = calloc(BITSET_WORDS(ctx->alloc), sizeof(*multiple));
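   /* uses[v] records the single use of SSA value v seen so far; values with
    * more than one use are flagged in `multiple` and left alone below.
    */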

   agx_foreach_block_rev(ctx, block) {
      /* Phi sources are logically read at the end of the predecessor, so
       * process our sources in our successors' phis first. This ensures we
       * set `multiple` correctly with phi sources.
       */
      agx_foreach_successor(block, succ) {
         unsigned s = agx_predecessor_index(succ, block);

         agx_foreach_phi_in_block(succ, phi) {
            record_use(uses, multiple, phi, s);
         }
      }

      agx_foreach_instr_in_block_rev(block, I) {
         struct agx_opcode_info info = agx_opcodes_info[I->op];

         /* Skip phis, they're handled specially */
         if (I->op == AGX_OPCODE_PHI) {
            continue;
         }

         agx_foreach_ssa_src(I, s) {
            record_use(uses, multiple, I, s);
         }

         if (info.nr_dests != 1)
            continue;

         if (I->dest[0].type != AGX_INDEX_NORMAL)
            continue;

         agx_instr *use = uses[I->dest[0].value];

         if (!use || BITSET_TEST(multiple, I->dest[0].value))
            continue;

         if (agx_optimizer_not(I, use)) {
            agx_remove_instruction(use);
            continue;
         }

         /* Destination has a single use, try to propagate */
         if (info.is_float && agx_optimizer_fmov_rev(I, use)) {
            agx_remove_instruction(use);
            continue;
         }
      }
   }

   free(uses);
   free(multiple);
}