/*
 * Copyright 2021 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "util/macros.h"
#include "agx_builder.h"
#include "agx_compiler.h"
#include "agx_minifloat.h"
#include "agx_opcodes.h"

/* AGX peephole optimizer responsible for instruction combining. It operates in
 * a forward direction and a backward direction, in each case traversing in
 * source order. SSA means the forward pass satisfies the invariant:
 *
 *    Every def is visited before any of its uses.
 *
 * Dually, the backward pass satisfies the invariant:
 *
 *    Every use of a def is visited before the def.
 *
 * This means the forward pass can propagate modifiers forward, whereas the
 * backward pass propagates modifiers backward. Consider an example:
 *
 *    1 = fabs 0
 *    2 = fround 1
 *    3 = fsat 2
 *
 * The forward pass would propagate the fabs to the fround (since we can look
 * up the fabs from the fround source and do the replacement). By contrast the
 * backward pass would propagate the fsat back to the fround (since when we
 * see the fround we know it has only a single user, fsat). Propagatable
 * instructions have natural directions (like pushforwards and pullbacks).
 *
 * We are careful to update the tracked state whenever we modify an instruction
 * to ensure the passes are linear-time and converge in a single iteration.
 *
 * Size conversions are worth special discussion. Consider the snippet:
 *
 *    2 = fadd 0, 1
 *    3 = f2f16 2
 *    4 = fround 3
 *
 * A priori, we can move the f2f16 in either direction. But the choices are not
 * equal -- if we move it up to the fadd, we get FP16 for two instructions,
 * whereas if we push it into the fround, we effectively get FP32 for two
 * instructions. So f2f16 is backwards. Likewise, consider
 *
 *    2 = fadd 0, 1
 *    3 = f2f32 2
 *    4 = fround 3
 *
 * This time if we move f2f32 up to the fadd, we get FP32 for two instructions,
 * but if we move it down to the fround, we get FP16 for two. So f2f32 is
 * forwards.
 */

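/*
 * A floating-point move is canonicalized as an add with negative zero:
 * x + (-0.0) == x for every x (including +0.0), so an fadd/hadd whose second
 * source is negzero acts as an fmov carrying abs/neg/saturate modifiers.
 */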
static bool
agx_is_fmov(agx_instr *def)
{
   return (def->op == AGX_OPCODE_FADD || def->op == AGX_OPCODE_HADD) &&
          agx_is_equiv(def->src[1], agx_negzero());
}

/* Compose floating-point modifiers with floating-point sources */

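/*
 * "to" carries the outer modifiers, "from" is the inner source. An outer abs
 * discards an inner negate (abs(-x) == |x|); otherwise negates compose by XOR,
 * so neg(neg(x)) == x and neg(abs(x)) == -|x|.
 */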
static agx_index
agx_compose_float_src(agx_index to, agx_index from)
{
   if (to.abs) {
      from.neg = false;
      from.abs = true;
   }

   from.neg ^= to.neg;

   return from;
}

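/*
 * Forward direction: fold an fmov def (abs/neg copy or size conversion) into
 * the float sources of its users by composing modifiers. Saturating fmovs
 * cannot be folded into a source, and the selected values of fcmpsel (sources
 * 2 and 3) do not take float modifiers.
 */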
static void
agx_optimizer_fmov(agx_instr **defs, agx_instr *ins)
{
   agx_foreach_ssa_src(ins, s) {
      agx_index src = ins->src[s];
      agx_instr *def = defs[src.value];

      if (def == NULL)
         continue; /* happens for phis in loops */
      if (!agx_is_fmov(def))
         continue;
      if (def->saturate)
         continue;
      if (ins->op == AGX_OPCODE_FCMPSEL && s >= 2)
         continue;

      /* We can fold f2f32 into 32-bit instructions, but we can't fold f2f16
       * into 16-bit instructions, since the latter would implicitly promote to
       * a 32-bit instruction which is not exact.
       */
      assert(def->src[0].size == AGX_SIZE_32 ||
             def->src[0].size == AGX_SIZE_16);
      assert(src.size == AGX_SIZE_32 || src.size == AGX_SIZE_16);

      if (src.size == AGX_SIZE_16 && def->src[0].size == AGX_SIZE_32)
         continue;

      ins->src[s] = agx_compose_float_src(src, def->src[0]);
   }
}

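/*
 * Whether a source of image_write/block_image_store may be replaced by an
 * inline immediate. Most sources must stay in registers; see the per-source
 * comments below.
 */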
static bool
image_write_source_can_be_immediate(agx_instr *I, unsigned s)
{
   bool block = I->op == AGX_OPCODE_BLOCK_IMAGE_STORE;
   assert(I->op == AGX_OPCODE_IMAGE_WRITE || block);

   /* LOD can always be immediate. Actually, it's just zero so far, we don't
    * support nonzero LOD for images yet.
    */
   if (s == 2 && !block)
      return true;

   /* If the "bindless" source (source 3) is an immediate, it means we don't
    * have a bindless image, instead we have a texture state index. We're
    * allowed to have immediate texture state registers (source 4). However,
    * we're not allowed to have immediate bindless offsets (also source 4).
    */
   unsigned base = block ? 0 : 3;
   bool is_texture_state = (I->src[base].type == AGX_INDEX_IMMEDIATE);
   if (s == (base + 1) && is_texture_state)
      return true;

   /* Otherwise, must be from a register */
   return false;
}

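/*
 * Fold mov_imm defs into the sources of their users where the encoding allows
 * it: 8-bit immediates for most sources, 16-bit where
 * agx_allows_16bit_immediate says so, 8-bit minifloats for float sources, and
 * negated immediates for the negatable source of iadd/imad. The op-specific
 * checks below skip sources that must stay in registers.
 */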
static void
agx_optimizer_inline_imm(agx_instr **defs, agx_instr *I)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      if (src.neg)
         continue;

      agx_instr *def = defs[src.value];
      if (!def || def->op != AGX_OPCODE_MOV_IMM)
         continue;

      uint8_t value = def->imm;
      uint16_t value_u16 = def->imm;

      bool float_src = agx_is_float_src(I, s);

      if (I->op == AGX_OPCODE_ST_TILE && s == 0)
         continue;
      if (I->op == AGX_OPCODE_ZS_EMIT && s != 0)
         continue;
      if (I->op == AGX_OPCODE_TEXTURE_SAMPLE && s != 4)
         continue;
      if ((I->op == AGX_OPCODE_DEVICE_STORE ||
           I->op == AGX_OPCODE_LOCAL_STORE || I->op == AGX_OPCODE_ATOMIC ||
           I->op == AGX_OPCODE_LOCAL_ATOMIC) &&
          s != 2)
         continue;
      if (I->op == AGX_OPCODE_ST_VARY && s != 0)
         continue;
      if ((I->op == AGX_OPCODE_LOCAL_LOAD || I->op == AGX_OPCODE_DEVICE_LOAD ||
           I->op == AGX_OPCODE_STACK_STORE) &&
          s != 1)
         continue;
      if (I->op == AGX_OPCODE_SPLIT)
         continue;

      if ((I->op == AGX_OPCODE_IMAGE_WRITE ||
           I->op == AGX_OPCODE_BLOCK_IMAGE_STORE) &&
          !image_write_source_can_be_immediate(I, s))
         continue;

      if (float_src) {
         bool fp16 = (def->dest[0].size == AGX_SIZE_16);
         assert(fp16 || (def->dest[0].size == AGX_SIZE_32));

         float f = fp16 ? _mesa_half_to_float(def->imm) : uif(def->imm);
         if (!agx_minifloat_exact(f))
            continue;

         I->src[s] = agx_immediate_f(f);
      } else if (value == def->imm) {
         I->src[s] = agx_immediate(value);
      } else if (value_u16 == def->imm && agx_allows_16bit_immediate(I)) {
         I->src[s] = agx_abs(agx_immediate(value_u16));
      } else if ((I->op == AGX_OPCODE_IADD || I->op == AGX_OPCODE_IMAD) &&
                 s == agx_negate_src_index(I)) {
         unsigned bits = agx_size_align_16(def->dest[0].size) * 16;
         uint64_t mask = BITFIELD64_MASK(bits);
         uint64_t negated = (-def->imm) & mask;
         value = negated;

         /* Try to negate the immediate */
         if (value == negated) {
            I->src[s] = agx_neg(agx_immediate(value));
         }
      }
   }
}

/*
 * Fuse not into and/or/xor. Specifically, acts on not and fuses:
 *
 *    not(and(x, y)) -> nand(x, y)
 *    not(or(x, y))  -> nor(x, y)
 *    not(xor(x, y)) -> xnor(x, y)
 */
static bool
agx_optimizer_not(agx_instr *I, agx_instr *use)
{
   /* Check for bit op and use of not op */
   if (I->op != AGX_OPCODE_BITOP || use->op != AGX_OPCODE_NOT)
      return false;

   /* Remap operation to the appropriate one */
   I->truth_table ^= 0xF;
   I->dest[0] = use->dest[0];

   return true;
}

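/*
 * Backward direction: fold a single-use fmov user (fsat, f2f16, ...) back into
 * its def by taking over the def's destination and saturate flag.
 */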
static bool
agx_optimizer_fmov_rev(agx_instr *I, agx_instr *use)
{
   if (!agx_is_fmov(use))
      return false;
   if (use->src[0].neg || use->src[0].abs)
      return false;

   /* We can fold f2f16 into 32-bit instructions, but we can't fold f2f32 into
    * 16-bit instructions, since the latter would implicitly promote to a 32-bit
    * instruction which is not exact.
    */
   assert(use->dest[0].size == AGX_SIZE_32 || use->dest[0].size == AGX_SIZE_16);
   assert(I->dest[0].size == AGX_SIZE_32 || I->dest[0].size == AGX_SIZE_16);

   if (I->dest[0].size == AGX_SIZE_16 && use->dest[0].size == AGX_SIZE_32)
      return false;

   /* saturate(saturate(x)) = saturate(x) */
   I->saturate |= use->saturate;
   I->dest[0] = use->dest[0];
   return true;
}

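/*
 * Whether source s of I may be replaced by a narrower value that is
 * zero-extended to the source size, used when copy-propagating through
 * size-changing moves.
 */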
static bool
agx_supports_zext(agx_instr *I, unsigned s)
{
   switch (I->op) {
   case AGX_OPCODE_IADD:
   case AGX_OPCODE_IMAD:
   case AGX_OPCODE_ICMP:
   case AGX_OPCODE_INTL:
   case AGX_OPCODE_FFS:
   case AGX_OPCODE_BITREV:
   case AGX_OPCODE_BFI:
   case AGX_OPCODE_BFEIL:
   case AGX_OPCODE_EXTR:
   case AGX_OPCODE_BITOP:
   case AGX_OPCODE_WHILE_ICMP:
   case AGX_OPCODE_IF_ICMP:
   case AGX_OPCODE_ELSE_ICMP:
   case AGX_OPCODE_BREAK_IF_ICMP:
   case AGX_OPCODE_ICMP_BALLOT:
   case AGX_OPCODE_ICMP_QUAD_BALLOT:
      return true;
   case AGX_OPCODE_ICMPSEL:
      /* Only the comparisons can be extended, not the selection */
      return s < 2;
   default:
      return false;
   }
}

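/*
 * Forward-propagate plain movs into their users, with care for size
 * conversions, 64-bit sources, and splits of 64-bit uniforms.
 */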
static void
agx_optimizer_copyprop(agx_context *ctx, agx_instr **defs, agx_instr *I)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      agx_instr *def = defs[src.value];

      if (def == NULL)
         continue; /* happens for phis in loops */
      if (def->op != AGX_OPCODE_MOV)
         continue;

      /* At the moment, not all instructions support size conversions. Notably
       * RA pseudo instructions don't handle size conversions. This should be
       * refined in the future.
       */
      if (def->src[0].size != src.size &&
          !(def->src[0].size < src.size && agx_supports_zext(I, s)))
         continue;

      /* Optimize split(64-bit uniform) so we can get better copyprop of the
       * 32-bit uniform parts. This helps reduce moves with 64-bit uniforms.
       */
      if (I->op == AGX_OPCODE_SPLIT && def->src[0].type == AGX_INDEX_UNIFORM &&
          src.size == AGX_SIZE_64 && I->dest[0].size == AGX_SIZE_32) {

         assert(I->nr_dests == 2 && "decomposing a 64-bit scalar");
         agx_builder b = agx_init_builder(ctx, agx_before_instr(I));

         agx_index lo = def->src[0];
         lo.size = AGX_SIZE_32;

         agx_index hi = lo;
         hi.value += 2 /* half of 64-bits = 32-bits = 2 x 16-bits */;

         defs[I->dest[0].value] = agx_mov_to(&b, I->dest[0], lo);
         defs[I->dest[1].value] = agx_mov_to(&b, I->dest[1], hi);

         agx_remove_instruction(I);
         continue;
      }

      /* Immediate inlining happens elsewhere */
      if (def->src[0].type == AGX_INDEX_IMMEDIATE)
         continue;

      /* ALU instructions cannot take 64-bit */
      if (def->src[0].size == AGX_SIZE_64 &&
          !(I->op == AGX_OPCODE_DEVICE_LOAD && s == 0) &&
          !(I->op == AGX_OPCODE_DEVICE_STORE && s == 1) &&
          !(I->op == AGX_OPCODE_ATOMIC && s == 1))
         continue;

      agx_replace_src(I, s, def->src[0]);

      /* If we are zero-extending into an instruction that distinguishes sign
       * and zero extend, make sure we pick zero-extend.
       */
      if (def->src[0].size < src.size &&
          (I->op == AGX_OPCODE_IMAD || I->op == AGX_OPCODE_IADD)) {

         assert(agx_supports_zext(I, s));
         I->src[s].abs = true;
      }
   }
}

/*
 * Fuse conditions into if. Specifically, acts on if_icmp and fuses:
 *
 *    if_icmp(cmp(x, y, *), 0, ne/eq) -> if_cmp(x, y, *)
 */
static void
agx_optimizer_if_cmp(agx_instr **defs, agx_instr *I)
{
   /* Check for unfused if */
   if (!agx_is_equiv(I->src[1], agx_zero()) || I->icond != AGX_ICOND_UEQ ||
       I->src[0].type != AGX_INDEX_NORMAL)
      return;

   /* Check for condition */
   agx_instr *def = defs[I->src[0].value];
   if (def->op != AGX_OPCODE_ICMP && def->op != AGX_OPCODE_FCMP)
      return;

   /* Fuse */
   I->src[0] = def->src[0];
   I->src[1] = def->src[1];
   I->invert_cond = def->invert_cond ^ !I->invert_cond;

   if (def->op == AGX_OPCODE_ICMP) {
      I->op = AGX_OPCODE_IF_ICMP;
      I->icond = def->icond;
   } else {
      I->op = AGX_OPCODE_IF_FCMP;
      I->fcond = def->fcond;
   }
}

/*
 * Fuse invert into if. Acts on if_icmp and fuses:
 *
 *    if_icmp(xor(x, 1), 0, ne) -> if_cmp(x, 0, eq)
 */
static void
agx_optimizer_if_not(agx_instr **defs, agx_instr *I)
{
   /* Check for unfused if */
   if (!agx_is_equiv(I->src[1], agx_zero()) || I->icond != AGX_ICOND_UEQ ||
       I->src[0].type != AGX_INDEX_NORMAL)
      return;

   /* Check for invert */
   agx_instr *def = defs[I->src[0].value];
   if (def->op != AGX_OPCODE_BITOP ||
       !agx_is_equiv(def->src[1], agx_immediate(1)) ||
       def->truth_table != AGX_BITOP_XOR)
      return;

   /* Fuse */
   I->src[0] = def->src[0];
   I->invert_cond = !I->invert_cond;
}

/*
 * Fuse conditions into select. Specifically, acts on icmpsel and fuses:
 *
 *    icmpsel(cmp(x, y, *), 0, z, w, eq) -> cmpsel(x, y, w, z, *)
 *
 * Care must be taken to invert the condition by swapping cmpsel arguments.
 */
static void
agx_optimizer_cmpsel(agx_instr **defs, agx_instr *I)
{
   /* Check for unfused select */
   if (!agx_is_equiv(I->src[1], agx_zero()) || I->icond != AGX_ICOND_UEQ ||
       I->src[0].type != AGX_INDEX_NORMAL)
      return;

   /* Check for condition */
   agx_instr *def = defs[I->src[0].value];
   if (def->op != AGX_OPCODE_ICMP && def->op != AGX_OPCODE_FCMP)
      return;

   /* Fuse */
   I->src[0] = def->src[0];
   I->src[1] = def->src[1];

   /* In the unfused select, the condition is inverted due to the form:
    *
    *    (cond == 0) ? x : y
    *
    * So we need to swap the arguments when fusing to become cond ? y : x. If
    * the condition was supposed to be inverted, we don't swap since it's
    * already inverted. cmpsel does not have an invert_cond bit to use.
    */
   if (!def->invert_cond) {
      agx_index temp = I->src[2];
      I->src[2] = I->src[3];
      I->src[3] = temp;
   }

   if (def->op == AGX_OPCODE_ICMP) {
      I->op = AGX_OPCODE_ICMPSEL;
      I->icond = def->icond;
   } else {
      I->op = AGX_OPCODE_FCMPSEL;
      I->fcond = def->fcond;
   }
}

/*
 * Fuse conditions into ballots:
 *
 *    ballot(cmp(x, y)) -> ballot_cmp(x, y)
 */
static void
agx_optimizer_ballot(agx_context *ctx, agx_instr **defs, agx_instr *I)
{
   if (I->src[0].type != AGX_INDEX_NORMAL)
      return;

   agx_instr *def = defs[I->src[0].value];
   if (!def || (def->op != AGX_OPCODE_ICMP && def->op != AGX_OPCODE_FCMP))
      return;

   bool quad = I->op == AGX_OPCODE_QUAD_BALLOT;
   assert(quad || I->op == AGX_OPCODE_BALLOT);

   /* Replace with a fused instruction since the # of sources changes */
   agx_builder b = agx_init_builder(ctx, agx_before_instr(I));

   agx_instr *fused = agx_icmp_ballot_to(
      &b, I->dest[0], def->src[0], def->src[1], def->icond, def->invert_cond);

   if (def->op == AGX_OPCODE_ICMP) {
      fused->op = quad ? AGX_OPCODE_ICMP_QUAD_BALLOT : AGX_OPCODE_ICMP_BALLOT;
   } else {
      fused->op = quad ? AGX_OPCODE_FCMP_QUAD_BALLOT : AGX_OPCODE_FCMP_BALLOT;
   }

   agx_remove_instruction(I);
}

/*
 * Fuse not srcs into bitop.
 */
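/* The remaps below are consistent with treating truth_table as a 4-entry LUT
 * indexed with src0 as the low bit: inverting src0 swaps even and odd entries,
 * inverting src1 swaps the low and high pairs, and (in agx_optimizer_not)
 * inverting the result flips all four entries.
 */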
static void
agx_optimizer_bitop(agx_instr **defs, agx_instr *I)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      agx_instr *def = defs[src.value];

      /* Check for not src */
      if (def->op != AGX_OPCODE_NOT)
         continue;

      /* Select new operation */
      if (s == 0) {
         I->truth_table =
            ((I->truth_table & 0x5) << 1) | ((I->truth_table & 0xa) >> 1);
      } else if (s == 1) {
         I->truth_table = ((I->truth_table & 0x3) << 2) | (I->truth_table >> 2);
      }

      /* Fuse */
      I->src[s] = def->src[0];
   }
}

/*
 * Fuse sign-extends into addition-like instructions.
 */
static void
agx_optimizer_signext(agx_instr **defs, agx_instr *I)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      agx_instr *def = defs[src.value];

      if (def == NULL || def->op != AGX_OPCODE_SIGNEXT)
         continue;

      agx_replace_src(I, s, def->src[0]);
      assert(!I->src[s].abs && "sign-extended");
   }
}

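/*
 * Forward pass: walk instructions in source order, recording the def of every
 * SSA value so later uses can look it up and fold copies, modifiers,
 * immediates, and comparisons into their users.
 */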
void
agx_optimizer_forward(agx_context *ctx)
{
   agx_instr **defs = calloc(ctx->alloc, sizeof(*defs));

   agx_foreach_instr_global_safe(ctx, I) {
      struct agx_opcode_info info = agx_opcodes_info[I->op];

      agx_foreach_ssa_dest(I, d) {
         defs[I->dest[d].value] = I;
      }

      /* Optimize moves */
      agx_optimizer_copyprop(ctx, defs, I);

      /* Propagate fmov down */
      if (info.is_float || I->op == AGX_OPCODE_FCMPSEL ||
          I->op == AGX_OPCODE_FCMP)
         agx_optimizer_fmov(defs, I);

      /* Inline immediates if we can. TODO: systematic */
      if (I->op != AGX_OPCODE_COLLECT && I->op != AGX_OPCODE_IMAGE_LOAD &&
          I->op != AGX_OPCODE_TEXTURE_LOAD &&
          I->op != AGX_OPCODE_UNIFORM_STORE && I->op != AGX_OPCODE_EXPORT)
         agx_optimizer_inline_imm(defs, I);

      if (I->op == AGX_OPCODE_IF_ICMP) {
         agx_optimizer_if_not(defs, I);
         agx_optimizer_if_cmp(defs, I);
      } else if (I->op == AGX_OPCODE_ICMPSEL) {
         agx_optimizer_cmpsel(defs, I);
      } else if (I->op == AGX_OPCODE_BALLOT ||
                 I->op == AGX_OPCODE_QUAD_BALLOT) {
         agx_optimizer_ballot(ctx, defs, I);
      } else if (I->op == AGX_OPCODE_BITOP) {
         agx_optimizer_bitop(defs, I);
      } else if (I->op == AGX_OPCODE_IADD || I->op == AGX_OPCODE_IMAD) {
         agx_optimizer_signext(defs, I);
      }
   }

   free(defs);
}

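/*
 * Backward pass bookkeeping: remember the unique use of each SSA value and
 * mark values with more than one use, since only single-use defs may be
 * rewritten by the backward pass.
 */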
static void
record_use(agx_instr **uses, BITSET_WORD *multiple, agx_instr *I, unsigned src)
{
   unsigned v = I->src[src].value;

   if (uses[v])
      BITSET_SET(multiple, v);
   else
      uses[v] = I;
}

void
agx_optimizer_backward(agx_context *ctx)
{
   agx_instr **uses = calloc(ctx->alloc, sizeof(*uses));
   BITSET_WORD *multiple = calloc(BITSET_WORDS(ctx->alloc), sizeof(*multiple));

   agx_foreach_block_rev(ctx, block) {
      /* Phi sources are logically read at the end of the predecessor, so
       * process our sources in our successors' phis first. This ensures we
       * set `multiple` correctly with phi sources.
       */
      agx_foreach_successor(block, succ) {
         unsigned s = agx_predecessor_index(succ, block);

         agx_foreach_phi_in_block(succ, phi) {
            record_use(uses, multiple, phi, s);
         }
      }

      agx_foreach_instr_in_block_rev(block, I) {
         struct agx_opcode_info info = agx_opcodes_info[I->op];

         /* Skip phis, they're handled specially */
         if (I->op == AGX_OPCODE_PHI) {
            continue;
         }

         agx_foreach_ssa_src(I, s) {
            record_use(uses, multiple, I, s);
         }

         if (info.nr_dests != 1)
            continue;

         if (I->dest[0].type != AGX_INDEX_NORMAL)
            continue;

         agx_instr *use = uses[I->dest[0].value];

         if (!use || BITSET_TEST(multiple, I->dest[0].value))
            continue;

         if (agx_optimizer_not(I, use)) {
            agx_remove_instruction(use);
            continue;
         }

         /* Destination has a single use, try to propagate */
         if (info.is_float && agx_optimizer_fmov_rev(I, use)) {
            agx_remove_instruction(use);
            continue;
         }
      }
   }

   free(uses);
   free(multiple);
}
642