/*
 * Copyright 2021 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "agx_builder.h"
#include "agx_compiler.h"
#include "agx_minifloat.h"
#include "agx_opcodes.h"

/* AGX peephole optimizer responsible for instruction combining. It operates in
 * a forward direction and a backward direction, traversing in source order and
 * reverse source order respectively. SSA means the forward pass satisfies the
 * invariant:
 *
 *    Every def is visited before any of its uses.
 *
 * Dually, the backward pass satisfies the invariant:
 *
 *    Every use of a def is visited before the def.
 *
 * This means the forward pass can propagate modifiers forward, whereas the
 * backward pass propagates modifiers backward. Consider an example:
 *
 *    1 = fabs 0
 *    2 = fround 1
 *    3 = fsat 2
 *
 * The forward pass would propagate the fabs to the fround (since we can look
 * up the fabs from the fround source and do the replacement). By contrast, the
 * backward pass would propagate the fsat back to the fround (since when we see
 * the fround we know it has only a single user, fsat). Propagatable
 * instructions have natural directions (like pushforwards and pullbacks).
 *
 * We are careful to update the tracked state whenever we modify an instruction
 * to ensure the passes are linear-time and converge in a single iteration.
 *
 * Size conversions are worth special discussion. Consider the snippet:
 *
 *    2 = fadd 0, 1
 *    3 = f2f16 2
 *    4 = fround 3
 *
 * A priori, we can move the f2f16 in either direction. But the two choices are
 * not equal -- if we move it up to the fadd, we get FP16 for two instructions,
 * whereas if we push it into the fround, we effectively get FP32 for two
 * instructions. So f2f16 is backwards. Likewise, consider
 *
 *    2 = fadd 0, 1
 *    3 = f2f32 2
 *    4 = fround 3
 *
 * This time, if we move the f2f32 up to the fadd, we get FP32 for two
 * instructions, but if we move it down to the fround, we get FP16 for two. So
 * f2f32 is forwards.
 */

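/* There is no dedicated fmov opcode; a floating-point move is canonicalized as
 * "fadd x, -0.0", which is an exact identity (x + -0.0 == x, preserving signed
 * zero), so the passes match that pattern to find moves to propagate.
 */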
static bool
agx_is_fmov(agx_instr *def)
{
   return (def->op == AGX_OPCODE_FADD) &&
          agx_is_equiv(def->src[1], agx_negzero());
}

/* Compose floating-point modifiers with floating-point sources */

static agx_index
agx_compose_float_src(agx_index to, agx_index from)
{
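   /* The use reads "to" (its own abs/neg applied to the moved value) and the
    * fmov read "from", so the composed source is to(from(x)): an outer abs
    * discards any inner negation (|-x| == |x|) and an outer neg toggles the
    * sign, i.e. the neg bits combine by XOR.
    */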
   if (to.abs) {
      from.neg = false;
      from.abs = true;
   }

   from.neg ^= to.neg;

   return from;
}

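/* Forward pass for fmovs: fold an fmov def, along with its abs/neg source
 * modifiers, into each floating-point instruction that uses it, subject to the
 * saturation and size-conversion restrictions checked below.
 */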
static void
agx_optimizer_fmov(agx_instr **defs, agx_instr *ins)
{
   agx_foreach_ssa_src(ins, s) {
      agx_index src = ins->src[s];
      agx_instr *def = defs[src.value];

      if (def == NULL)
         continue; /* happens for phis in loops */
      if (!agx_is_fmov(def))
         continue;
      if (def->saturate)
         continue;
      if (ins->op == AGX_OPCODE_FCMPSEL && s >= 2)
         continue;

      /* We can fold f2f32 into 32-bit instructions, but we can't fold f2f16
       * into 16-bit instructions, since the latter would implicitly promote to
       * a 32-bit instruction which is not exact.
       */
      assert(def->src[0].size == AGX_SIZE_32 ||
             def->src[0].size == AGX_SIZE_16);
      assert(src.size == AGX_SIZE_32 || src.size == AGX_SIZE_16);

      if (src.size == AGX_SIZE_16 && def->src[0].size == AGX_SIZE_32)
         continue;

      ins->src[s] = agx_compose_float_src(src, def->src[0]);
   }
}

static bool
image_write_source_can_be_immediate(agx_instr *I, unsigned s)
{
   assert(I->op == AGX_OPCODE_IMAGE_WRITE);

   /* LOD can always be immediate. Actually, it's just zero so far, we don't
    * support nonzero LOD for images yet.
    */
   if (s == 2)
      return true;

   /* If the "bindless" source (source 3) is an immediate, it means we don't
    * have a bindless image, instead we have a texture state index. We're
    * allowed to have immediate texture state registers (source 4). However,
    * we're not allowed to have immediate bindless offsets (also source 4).
    */
   bool is_texture_state = (I->src[3].type == AGX_INDEX_IMMEDIATE);
   if (s == 4 && is_texture_state)
      return true;

   /* Otherwise, must be from a register */
   return false;
}

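/* Inline immediates: replace sources defined by a mov_imm with an immediate
 * operand where the encoding allows it -- 8-bit immediates for integer
 * sources, 16-bit immediates for the instructions that accept them, and small
 * float immediates when the conversion is exact.
 */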
static void
agx_optimizer_inline_imm(agx_instr **defs, agx_instr *I, bool is_float)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      if (src.neg)
         continue;

      agx_instr *def = defs[src.value];
      if (!def || def->op != AGX_OPCODE_MOV_IMM)
         continue;

      uint8_t value = def->imm;
      uint16_t value_u16 = def->imm;

      bool float_src = is_float;

      /* fcmpsel and fcmp treat their first 2 sources as floats specially */
      if (s < 2 && (I->op == AGX_OPCODE_FCMPSEL || I->op == AGX_OPCODE_FCMP))
         float_src = true;
      if (I->op == AGX_OPCODE_ST_TILE && s == 0)
         continue;
      if (I->op == AGX_OPCODE_ZS_EMIT && s != 0)
         continue;
      if ((I->op == AGX_OPCODE_DEVICE_STORE ||
           I->op == AGX_OPCODE_LOCAL_STORE || I->op == AGX_OPCODE_ATOMIC ||
           I->op == AGX_OPCODE_LOCAL_ATOMIC) &&
          s != 2)
         continue;
      if ((I->op == AGX_OPCODE_LOCAL_LOAD || I->op == AGX_OPCODE_DEVICE_LOAD ||
           I->op == AGX_OPCODE_STACK_STORE) &&
          s != 1)
         continue;
      if (I->op == AGX_OPCODE_SPLIT)
         continue;

      if (I->op == AGX_OPCODE_IMAGE_WRITE &&
          !image_write_source_can_be_immediate(I, s))
         continue;

      if (float_src) {
         bool fp16 = (def->dest[0].size == AGX_SIZE_16);
         assert(fp16 || (def->dest[0].size == AGX_SIZE_32));

         float f = fp16 ? _mesa_half_to_float(def->imm) : uif(def->imm);
         if (!agx_minifloat_exact(f))
            continue;

         I->src[s] = agx_immediate_f(f);
      } else if (value == def->imm) {
         I->src[s] = agx_immediate(value);
      } else if (value_u16 == def->imm && agx_allows_16bit_immediate(I)) {
         I->src[s] = agx_abs(agx_immediate(value_u16));
      }
   }
}

/*
 * Fuse not into and/or/xor. Specifically, acts on not and fuses:
 *
 *    not(and(x, y)) -> nand(x, y)
 *    not(or(x, y)) -> nor(x, y)
 *    not(xor(x, y)) -> xnor(x, y)
 */
static bool
agx_optimizer_not(agx_instr *I, agx_instr *use)
{
   /* Check for bit op and use of not op */
   if (I->op != AGX_OPCODE_BITOP || use->op != AGX_OPCODE_NOT)
      return false;

   /* Remap the operation: complementing all four truth table entries
    * complements the result, turning and/or/xor into nand/nor/xnor.
    */
   I->truth_table ^= 0xF;
   I->dest[0] = use->dest[0];

   return true;
}

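/* Backward counterpart of agx_optimizer_fmov: fold a single-use fmov (a size
 * conversion and/or saturate) back into the floating-point instruction that
 * defines its source.
 */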
static bool
agx_optimizer_fmov_rev(agx_instr *I, agx_instr *use)
{
   if (!agx_is_fmov(use))
      return false;
   if (use->src[0].neg || use->src[0].abs)
      return false;

   /* We can fold f2f16 into 32-bit instructions, but we can't fold f2f32 into
    * 16-bit instructions, since the latter would implicitly promote to a 32-bit
    * instruction which is not exact.
    */
   assert(use->dest[0].size == AGX_SIZE_32 || use->dest[0].size == AGX_SIZE_16);
   assert(I->dest[0].size == AGX_SIZE_32 || I->dest[0].size == AGX_SIZE_16);

   if (I->dest[0].size == AGX_SIZE_16 && use->dest[0].size == AGX_SIZE_32)
      return false;

   /* saturate(saturate(x)) = saturate(x) */
   I->saturate |= use->saturate;
   I->dest[0] = use->dest[0];
   return true;
}

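/* Copy propagation: replace sources defined by a mov with the mov's own
 * source, subject to the size, immediate, and 64-bit restrictions below. Also
 * rewrites split(mov(64-bit uniform)) into two 32-bit moves so the halves
 * copy-propagate.
 */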
static void
agx_optimizer_copyprop(agx_context *ctx, agx_instr **defs, agx_instr *I)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      agx_instr *def = defs[src.value];

      if (def == NULL)
         continue; /* happens for phis in loops */
      if (def->op != AGX_OPCODE_MOV)
         continue;

      /* At the moment, not all instructions support size conversions. Notably
       * RA pseudo instructions don't handle size conversions. This should be
       * refined in the future.
       */
      if (def->src[0].size != src.size)
         continue;

      /* Optimize split(64-bit uniform) so we can get better copyprop of the
       * 32-bit uniform parts. This helps reduce moves with 64-bit uniforms.
       */
      if (I->op == AGX_OPCODE_SPLIT && def->src[0].type == AGX_INDEX_UNIFORM &&
          src.size == AGX_SIZE_64 && I->dest[0].size == AGX_SIZE_32) {

         assert(I->nr_dests == 2 && "decomposing a 64-bit scalar");
         agx_builder b = agx_init_builder(ctx, agx_before_instr(I));

         agx_index lo = def->src[0];
         lo.size = AGX_SIZE_32;

         agx_index hi = lo;
         hi.value += 2 /* half of 64-bits = 32-bits = 2 x 16-bits */;

         defs[I->dest[0].value] = agx_mov_to(&b, I->dest[0], lo);
         defs[I->dest[1].value] = agx_mov_to(&b, I->dest[1], hi);

         agx_remove_instruction(I);
         continue;
      }

      /* Immediate inlining happens elsewhere */
      if (def->src[0].type == AGX_INDEX_IMMEDIATE)
         continue;

      /* ALU instructions cannot take 64-bit */
      if (def->src[0].size == AGX_SIZE_64 &&
          !(I->op == AGX_OPCODE_DEVICE_LOAD && s == 0) &&
          !(I->op == AGX_OPCODE_DEVICE_STORE && s == 1) &&
          !(I->op == AGX_OPCODE_ATOMIC && s == 1))
         continue;

      agx_replace_src(I, s, def->src[0]);
   }
}

/*
 * Fuse conditions into if. Specifically, acts on if_icmp and fuses:
 *
 *    if_icmp(cmp(x, y, *), 0, ne) -> if_cmp(x, y, *)
 */
static void
agx_optimizer_if_cmp(agx_instr **defs, agx_instr *I)
{
   /* Check for unfused if */
   if (!agx_is_equiv(I->src[1], agx_zero()) || I->icond != AGX_ICOND_UEQ ||
       !I->invert_cond || I->src[0].type != AGX_INDEX_NORMAL)
      return;

   /* Check for condition */
   agx_instr *def = defs[I->src[0].value];
   if (def->op != AGX_OPCODE_ICMP && def->op != AGX_OPCODE_FCMP)
      return;

   /* Fuse */
   I->src[0] = def->src[0];
   I->src[1] = def->src[1];
   I->invert_cond = def->invert_cond;

   if (def->op == AGX_OPCODE_ICMP) {
      I->op = AGX_OPCODE_IF_ICMP;
      I->icond = def->icond;
   } else {
      I->op = AGX_OPCODE_IF_FCMP;
      I->fcond = def->fcond;
   }
}

/*
 * Fuse conditions into select. Specifically, acts on icmpsel and fuses:
 *
 *    icmpsel(cmp(x, y, *), 0, z, w, eq) -> cmpsel(x, y, w, z, *)
 *
 * Care must be taken to invert the condition by swapping cmpsel arguments.
 */
static void
agx_optimizer_cmpsel(agx_instr **defs, agx_instr *I)
{
   /* Check for unfused select */
   if (!agx_is_equiv(I->src[1], agx_zero()) || I->icond != AGX_ICOND_UEQ ||
       I->src[0].type != AGX_INDEX_NORMAL)
      return;

   /* Check for condition */
   agx_instr *def = defs[I->src[0].value];
   if (def->op != AGX_OPCODE_ICMP && def->op != AGX_OPCODE_FCMP)
      return;

   /* Fuse */
   I->src[0] = def->src[0];
   I->src[1] = def->src[1];

   /* In the unfused select, the condition is inverted due to the form:
    *
    *    (cond == 0) ? x : y
    *
    * So we need to swap the arguments when fusing to become cond ? y : x. If
    * the condition was supposed to be inverted, we don't swap since it's
    * already inverted. cmpsel does not have an invert_cond bit to use.
    */
   if (!def->invert_cond) {
      agx_index temp = I->src[2];
      I->src[2] = I->src[3];
      I->src[3] = temp;
   }

   if (def->op == AGX_OPCODE_ICMP) {
      I->op = AGX_OPCODE_ICMPSEL;
      I->icond = def->icond;
   } else {
      I->op = AGX_OPCODE_FCMPSEL;
      I->fcond = def->fcond;
   }
}

/*
 * Fuse conditions into ballots:
 *
 *    ballot(cmp(x, y)) -> ballot_cmp(x, y)
 */
static void
agx_optimizer_ballot(agx_context *ctx, agx_instr **defs, agx_instr *I)
{
   agx_instr *def = defs[I->src[0].value];
   if (!def || (def->op != AGX_OPCODE_ICMP && def->op != AGX_OPCODE_FCMP))
      return;

   bool quad = I->op == AGX_OPCODE_QUAD_BALLOT;
   assert(quad || I->op == AGX_OPCODE_BALLOT);

   /* Replace with a fused instruction since the # of sources changes */
   agx_builder b = agx_init_builder(ctx, agx_before_instr(I));

   agx_instr *fused = agx_icmp_ballot_to(
      &b, I->dest[0], def->src[0], def->src[1], def->icond, def->invert_cond);

   if (def->op == AGX_OPCODE_ICMP) {
      fused->op = quad ? AGX_OPCODE_ICMP_QUAD_BALLOT : AGX_OPCODE_ICMP_BALLOT;
   } else {
      fused->op = quad ? AGX_OPCODE_FCMP_QUAD_BALLOT : AGX_OPCODE_FCMP_BALLOT;
   }

   agx_remove_instruction(I);
}

/*
 * Fuse not srcs into bitop.
 */
static void
agx_optimizer_bitop(agx_instr **defs, agx_instr *I)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      agx_instr *def = defs[src.value];

      /* Check for not src */
      if (def->op != AGX_OPCODE_NOT)
         continue;

      /* Select the new operation by permuting the truth table to account for
       * the inverted operand: inverting source 0 swaps adjacent entries
       * (bits 0<->1 and 2<->3), inverting source 1 swaps the low and high
       * pairs.
       */
      if (s == 0) {
         I->truth_table =
            ((I->truth_table & 0x5) << 1) | ((I->truth_table & 0xa) >> 1);
      } else if (s == 1) {
         I->truth_table = ((I->truth_table & 0x3) << 2) | (I->truth_table >> 2);
      }

      /* Fuse */
      I->src[s] = def->src[0];
   }
}

static void
agx_optimizer_forward(agx_context *ctx)
{
   agx_instr **defs = calloc(ctx->alloc, sizeof(*defs));
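   /* defs[v] is the instruction defining SSA value v. The forward walk visits
    * every def before its uses, so lookups in the helpers below always find
    * the defining instruction, except for loop phi sources defined on a
    * backedge, which read NULL and are skipped.
    */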

   agx_foreach_instr_global_safe(ctx, I) {
      struct agx_opcode_info info = agx_opcodes_info[I->op];

      agx_foreach_ssa_dest(I, d) {
         defs[I->dest[d].value] = I;
      }

      /* Optimize moves */
      agx_optimizer_copyprop(ctx, defs, I);

      /* Propagate fmov down */
      if (info.is_float || I->op == AGX_OPCODE_FCMPSEL ||
          I->op == AGX_OPCODE_FCMP)
         agx_optimizer_fmov(defs, I);

      /* Inline immediates if we can. TODO: systematic */
      if (I->op != AGX_OPCODE_ST_VARY && I->op != AGX_OPCODE_COLLECT &&
          I->op != AGX_OPCODE_TEXTURE_SAMPLE &&
          I->op != AGX_OPCODE_IMAGE_LOAD && I->op != AGX_OPCODE_TEXTURE_LOAD &&
          I->op != AGX_OPCODE_UNIFORM_STORE &&
          I->op != AGX_OPCODE_BLOCK_IMAGE_STORE)
         agx_optimizer_inline_imm(defs, I, info.is_float);

      if (I->op == AGX_OPCODE_IF_ICMP)
         agx_optimizer_if_cmp(defs, I);
      else if (I->op == AGX_OPCODE_ICMPSEL)
         agx_optimizer_cmpsel(defs, I);
      else if (I->op == AGX_OPCODE_BALLOT || I->op == AGX_OPCODE_QUAD_BALLOT)
         agx_optimizer_ballot(ctx, defs, I);
      else if (I->op == AGX_OPCODE_BITOP)
         agx_optimizer_bitop(defs, I);
   }

   free(defs);
}

static void
agx_optimizer_backward(agx_context *ctx)
{
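   /* uses[v] records a use of SSA value v seen during the reverse walk, and
    * multiple marks values with more than one use; the rewrites below only
    * fire for values with exactly one use.
    */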
   agx_instr **uses = calloc(ctx->alloc, sizeof(*uses));
   BITSET_WORD *multiple = calloc(BITSET_WORDS(ctx->alloc), sizeof(*multiple));

   agx_foreach_instr_global_rev(ctx, I) {
      struct agx_opcode_info info = agx_opcodes_info[I->op];

      agx_foreach_ssa_src(I, s) {
         if (I->src[s].type == AGX_INDEX_NORMAL) {
            unsigned v = I->src[s].value;

            if (uses[v])
               BITSET_SET(multiple, v);
            else
               uses[v] = I;
         }
      }

      if (info.nr_dests != 1)
         continue;

      if (I->dest[0].type != AGX_INDEX_NORMAL)
         continue;

      agx_instr *use = uses[I->dest[0].value];

      if (!use || BITSET_TEST(multiple, I->dest[0].value))
         continue;

      if (agx_optimizer_not(I, use)) {
         agx_remove_instruction(use);
         continue;
      }

      /* Destination has a single use, try to propagate */
      if (info.is_float && agx_optimizer_fmov_rev(I, use)) {
         agx_remove_instruction(use);
         continue;
      }
   }

   free(uses);
   free(multiple);
}

void
agx_optimizer(agx_context *ctx)
{
   agx_optimizer_backward(ctx);
   agx_optimizer_forward(ctx);
}