/*
 * Copyright 2021 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "agx_builder.h"
#include "agx_compiler.h"
#include "agx_minifloat.h"
#include "agx_opcodes.h"

/* AGX peephole optimizer responsible for instruction combining. It operates in
 * a forward direction and a backward direction, traversing in source order and
 * reverse source order respectively. SSA means the forward pass satisfies the
 * invariant:
 *
 *    Every def is visited before any of its uses.
 *
 * Dually, the backward pass satisfies the invariant:
 *
 *    Every use of a def is visited before the def.
 *
 * This means the forward pass can propagate modifiers forward, whereas the
 * backwards pass propagates modifiers backward. Consider an example:
 *
 *    1 = fabs 0
 *    2 = fround 1
 *    3 = fsat 2
 *
 * The forwards pass would propagate the fabs to the fround (since we can
 * look up the fabs from the fround source and do the replacement). By contrast
 * the backwards pass would propagate the fsat back to the fround (since when
 * we see the fround we know it has only a single user, fsat). Propagatable
 * instructions have natural directions (like pushforwards and pullbacks).
 *
 * We are careful to update the tracked state whenever we modify an instruction
 * to ensure the passes are linear-time and converge in a single iteration.
 *
 * Size conversions are worth special discussion. Consider the snippet:
 *
 *    2 = fadd 0, 1
 *    3 = f2f16 2
 *    4 = fround 3
 *
 * A priori, we can move the f2f16 in either direction. But the two choices are
 * not equivalent -- if we move it up to the fadd, we get FP16 for two
 * instructions, whereas if we push it into the fround, we effectively get FP32
 * for two instructions. So f2f16 is backwards. Likewise, consider
 *
 *    2 = fadd 0, 1
 *    3 = f2f32 2
 *    4 = fround 3
 *
 * This time if we move f2f32 up to the fadd, we get FP32 for two, but if we
 * move it down to the fround, we get FP16 for two. So f2f32 is forwards.
 */

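/* In this backend, an fadd whose second source is negative zero serves as the
 * canonical floating-point move: under IEEE 754 rules x + (-0.0) == x for
 * every x (including both signed zeros), so only the fadd's own saturate flag
 * or size conversion can change the value.
 */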
static bool
agx_is_fmov(agx_instr *def)
{
   return (def->op == AGX_OPCODE_FADD) &&
          agx_is_equiv(def->src[1], agx_negzero());
}

/* Compose floating-point modifiers with floating-point sources */

static agx_index
agx_compose_float_src(agx_index to, agx_index from)
{
   if (to.abs) {
      from.neg = false;
      from.abs = true;
   }

   from.neg ^= to.neg;

   return from;
}
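/* For example, if the use applies fneg(fabs(.)) to an fmov whose source is
 * already negated, the inner negate is dropped (|-x| == |x|) and the outer
 * one kept, yielding -|x|. Two bare negates cancel via the XOR above.
 */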

static void
agx_optimizer_fmov(agx_instr **defs, agx_instr *ins)
{
   agx_foreach_ssa_src(ins, s) {
      agx_index src = ins->src[s];
      agx_instr *def = defs[src.value];

      if (def == NULL)
         continue; /* happens for phis in loops */
      if (!agx_is_fmov(def))
         continue;
      if (def->saturate)
         continue;
      if (ins->op == AGX_OPCODE_FCMPSEL && s >= 2)
         continue;

      /* We can fold f2f32 into 32-bit instructions, but we can't fold f2f16
       * into 16-bit instructions, since the latter would implicitly promote to
       * a 32-bit instruction which is not exact.
       */
      assert(def->src[0].size == AGX_SIZE_32 ||
             def->src[0].size == AGX_SIZE_16);
      assert(src.size == AGX_SIZE_32 || src.size == AGX_SIZE_16);

      if (src.size == AGX_SIZE_16 && def->src[0].size == AGX_SIZE_32)
         continue;

      ins->src[s] = agx_compose_float_src(src, def->src[0]);
   }
}

static bool
image_write_source_can_be_immediate(agx_instr *I, unsigned s)
{
   assert(I->op == AGX_OPCODE_IMAGE_WRITE);

   /* LOD can always be immediate. In practice it is always zero so far, since
    * we don't support nonzero LOD for images yet.
    */
   if (s == 2)
      return true;

   /* If the "bindless" source (source 3) is an immediate, it means we don't
    * have a bindless image, instead we have a texture state index. We're
    * allowed to have immediate texture state registers (source 4). However,
    * we're not allowed to have immediate bindless offsets (also source 4).
    */
   bool is_texture_state = (I->src[3].type == AGX_INDEX_IMMEDIATE);
   if (s == 4 && is_texture_state)
      return true;

   /* Otherwise, must be from a register */
   return false;
}

static void
agx_optimizer_inline_imm(agx_instr **defs, agx_instr *I, bool is_float)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      if (src.neg)
         continue;

      agx_instr *def = defs[src.value];
      if (!def || def->op != AGX_OPCODE_MOV_IMM)
         continue;

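      /* Assigning the immediate to a narrower unsigned type truncates it;
       * comparing the truncated copy back against def->imm below checks
       * whether the value round-trips, i.e. fits in 8 or 16 bits.
       */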
      uint8_t value = def->imm;
      uint16_t value_u16 = def->imm;

      bool float_src = is_float;

      /* fcmpsel and fcmp compare their first two sources as floats */
      if (s < 2 && (I->op == AGX_OPCODE_FCMPSEL || I->op == AGX_OPCODE_FCMP))
         float_src = true;
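      /* The opcodes below accept an immediate only in particular source slots
       * (or not at all), so skip the sources we must not replace.
       */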
      if (I->op == AGX_OPCODE_ST_TILE && s == 0)
         continue;
      if (I->op == AGX_OPCODE_ZS_EMIT && s != 0)
         continue;
      if ((I->op == AGX_OPCODE_DEVICE_STORE ||
           I->op == AGX_OPCODE_LOCAL_STORE || I->op == AGX_OPCODE_ATOMIC ||
           I->op == AGX_OPCODE_LOCAL_ATOMIC) &&
          s != 2)
         continue;
      if ((I->op == AGX_OPCODE_LOCAL_LOAD || I->op == AGX_OPCODE_DEVICE_LOAD ||
           I->op == AGX_OPCODE_STACK_STORE) &&
          s != 1)
         continue;
      if (I->op == AGX_OPCODE_SPLIT)
         continue;

      if (I->op == AGX_OPCODE_IMAGE_WRITE &&
          !image_write_source_can_be_immediate(I, s))
         continue;

      if (float_src) {
         bool fp16 = (def->dest[0].size == AGX_SIZE_16);
         assert(fp16 || (def->dest[0].size == AGX_SIZE_32));

         float f = fp16 ? _mesa_half_to_float(def->imm) : uif(def->imm);
         if (!agx_minifloat_exact(f))
            continue;

         I->src[s] = agx_immediate_f(f);
      } else if (value == def->imm) {
         I->src[s] = agx_immediate(value);
      } else if (value_u16 == def->imm && agx_allows_16bit_immediate(I)) {
         I->src[s] = agx_abs(agx_immediate(value_u16));
      }
   }
}

/*
 * Fuse not into and/or/xor. Specifically, acts on not and fuses:
 *
 *    not(and(x, y)) -> nand(x, y)
 *    not(or(x, y))  -> nor(x, y)
 *    not(xor(x, y)) -> xnor(x, y)
 */
static bool
agx_optimizer_not(agx_instr *I, agx_instr *use)
{
   /* Check for bit op and use of not op */
   if (I->op != AGX_OPCODE_BITOP || use->op != AGX_OPCODE_NOT)
      return false;

   /* Complement the 4-bit truth table, turning and/or/xor into nand/nor/xnor,
    * and write the bitop's result to the not's destination.
    */
   I->truth_table ^= 0xF;
   I->dest[0] = use->dest[0];

   return true;
}

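/* Fold a single-use fmov back into the instruction defining its source,
 * transferring the fmov's saturate flag, size and destination onto the def.
 */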
static bool
agx_optimizer_fmov_rev(agx_instr *I, agx_instr *use)
{
   if (!agx_is_fmov(use))
      return false;
   if (use->src[0].neg || use->src[0].abs)
      return false;

   /* We can fold f2f16 into 32-bit instructions, but we can't fold f2f32 into
    * 16-bit instructions, since the latter would implicitly promote to a
    * 32-bit instruction which is not exact.
    */
   assert(use->dest[0].size == AGX_SIZE_32 || use->dest[0].size == AGX_SIZE_16);
   assert(I->dest[0].size == AGX_SIZE_32 || I->dest[0].size == AGX_SIZE_16);

   if (I->dest[0].size == AGX_SIZE_16 && use->dest[0].size == AGX_SIZE_32)
      return false;

   /* saturate(saturate(x)) = saturate(x) */
   I->saturate |= use->saturate;
   I->dest[0] = use->dest[0];
   return true;
}

static void
agx_optimizer_copyprop(agx_context *ctx, agx_instr **defs, agx_instr *I)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      agx_instr *def = defs[src.value];

      if (def == NULL)
         continue; /* happens for phis in loops */
      if (def->op != AGX_OPCODE_MOV)
         continue;

      /* At the moment, not all instructions support size conversions. Notably
       * RA pseudo instructions don't handle size conversions. This should be
       * refined in the future.
       */
      if (def->src[0].size != src.size)
         continue;

      /* Optimize split(64-bit uniform) so we can get better copyprop of the
       * 32-bit uniform parts. This helps reduce moves with 64-bit uniforms.
       */
      if (I->op == AGX_OPCODE_SPLIT && def->src[0].type == AGX_INDEX_UNIFORM &&
          src.size == AGX_SIZE_64 && I->dest[0].size == AGX_SIZE_32) {

         assert(I->nr_dests == 2 && "decomposing a 64-bit scalar");
         agx_builder b = agx_init_builder(ctx, agx_before_instr(I));

         agx_index lo = def->src[0];
         lo.size = AGX_SIZE_32;

         agx_index hi = lo;
         hi.value += 2 /* half of 64-bits = 32-bits = 2 x 16-bits */;

         defs[I->dest[0].value] = agx_mov_to(&b, I->dest[0], lo);
         defs[I->dest[1].value] = agx_mov_to(&b, I->dest[1], hi);

         agx_remove_instruction(I);
         continue;
      }

      /* Immediate inlining happens elsewhere */
      if (def->src[0].type == AGX_INDEX_IMMEDIATE)
         continue;

      /* ALU instructions cannot take 64-bit */
      if (def->src[0].size == AGX_SIZE_64 &&
          !(I->op == AGX_OPCODE_DEVICE_LOAD && s == 0) &&
          !(I->op == AGX_OPCODE_DEVICE_STORE && s == 1) &&
          !(I->op == AGX_OPCODE_ATOMIC && s == 1))
         continue;

      agx_replace_src(I, s, def->src[0]);
   }
}

/*
 * Fuse conditions into if. Specifically, acts on if_icmp and fuses:
 *
 *    if_icmp(cmp(x, y, *), 0, ne) -> if_cmp(x, y, *)
 */
static void
agx_optimizer_if_cmp(agx_instr **defs, agx_instr *I)
{
   /* Check for unfused if */
   if (!agx_is_equiv(I->src[1], agx_zero()) || I->icond != AGX_ICOND_UEQ ||
       !I->invert_cond || I->src[0].type != AGX_INDEX_NORMAL)
      return;

   /* Check for condition */
   agx_instr *def = defs[I->src[0].value];
   if (def->op != AGX_OPCODE_ICMP && def->op != AGX_OPCODE_FCMP)
      return;

   /* Fuse */
   I->src[0] = def->src[0];
   I->src[1] = def->src[1];
   I->invert_cond = def->invert_cond;

   if (def->op == AGX_OPCODE_ICMP) {
      I->op = AGX_OPCODE_IF_ICMP;
      I->icond = def->icond;
   } else {
      I->op = AGX_OPCODE_IF_FCMP;
      I->fcond = def->fcond;
   }
}

/*
 * Fuse conditions into select. Specifically, acts on icmpsel and fuses:
 *
 *    icmpsel(cmp(x, y, *), 0, z, w, eq) -> cmpsel(x, y, w, z, *)
 *
 * Care must be taken to invert the condition by swapping cmpsel arguments.
 */
static void
agx_optimizer_cmpsel(agx_instr **defs, agx_instr *I)
{
   /* Check for unfused select */
   if (!agx_is_equiv(I->src[1], agx_zero()) || I->icond != AGX_ICOND_UEQ ||
       I->src[0].type != AGX_INDEX_NORMAL)
      return;

   /* Check for condition */
   agx_instr *def = defs[I->src[0].value];
   if (def->op != AGX_OPCODE_ICMP && def->op != AGX_OPCODE_FCMP)
      return;

   /* Fuse */
   I->src[0] = def->src[0];
   I->src[1] = def->src[1];

   /* In the unfused select, the condition is inverted due to the form:
    *
    *    (cond == 0) ? x : y
    *
    * So we need to swap the arguments when fusing to become cond ? y : x. If
    * the condition was supposed to be inverted, we don't swap since it's
    * already inverted. cmpsel does not have an invert_cond bit to use.
    */
   if (!def->invert_cond) {
      agx_index temp = I->src[2];
      I->src[2] = I->src[3];
      I->src[3] = temp;
   }

   if (def->op == AGX_OPCODE_ICMP) {
      I->op = AGX_OPCODE_ICMPSEL;
      I->icond = def->icond;
   } else {
      I->op = AGX_OPCODE_FCMPSEL;
      I->fcond = def->fcond;
   }
}

/*
 * Fuse conditions into ballots:
 *
 *    ballot(cmp(x, y)) -> ballot_cmp(x, y)
 */
static void
agx_optimizer_ballot(agx_context *ctx, agx_instr **defs, agx_instr *I)
{
   agx_instr *def = defs[I->src[0].value];
   if (!def || (def->op != AGX_OPCODE_ICMP && def->op != AGX_OPCODE_FCMP))
      return;

   bool quad = I->op == AGX_OPCODE_QUAD_BALLOT;
   assert(quad || I->op == AGX_OPCODE_BALLOT);

   /* Replace with a fused instruction since the # of sources changes */
   agx_builder b = agx_init_builder(ctx, agx_before_instr(I));

   agx_instr *fused = agx_icmp_ballot_to(
      &b, I->dest[0], def->src[0], def->src[1], def->icond, def->invert_cond);

   if (def->op == AGX_OPCODE_ICMP) {
      fused->op = quad ? AGX_OPCODE_ICMP_QUAD_BALLOT : AGX_OPCODE_ICMP_BALLOT;
   } else {
      fused->op = quad ? AGX_OPCODE_FCMP_QUAD_BALLOT : AGX_OPCODE_FCMP_BALLOT;
   }

   agx_remove_instruction(I);
}

/*
 * Fuse not srcs into bitop.
 */
static void
agx_optimizer_bitop(agx_instr **defs, agx_instr *I)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      agx_instr *def = defs[src.value];

      /* Check for not src */
      if (def->op != AGX_OPCODE_NOT)
         continue;

      /* Select new operation */
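      /* Complementing source 0 swaps truth table entries 0<->1 and 2<->3;
       * complementing source 1 swaps the low pair with the high pair. Either
       * way the fused bitop computes the same result directly from the
       * un-negated source.
       */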
      if (s == 0) {
         I->truth_table =
            ((I->truth_table & 0x5) << 1) | ((I->truth_table & 0xa) >> 1);
      } else if (s == 1) {
         I->truth_table = ((I->truth_table & 0x3) << 2) | (I->truth_table >> 2);
      }

      /* Fuse */
      I->src[s] = def->src[0];
   }
}

static void
agx_optimizer_forward(agx_context *ctx)
{
   agx_instr **defs = calloc(ctx->alloc, sizeof(*defs));
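   /* defs[v] points to the instruction defining SSA value v. Walking in
    * source order guarantees the def is recorded before its uses are visited,
    * except for phi sources coming from loop back edges (left NULL).
    */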

   agx_foreach_instr_global_safe(ctx, I) {
      struct agx_opcode_info info = agx_opcodes_info[I->op];

      agx_foreach_ssa_dest(I, d) {
         defs[I->dest[d].value] = I;
      }

      /* Optimize moves */
      agx_optimizer_copyprop(ctx, defs, I);

      /* Propagate fmov down */
      if (info.is_float || I->op == AGX_OPCODE_FCMPSEL ||
          I->op == AGX_OPCODE_FCMP)
         agx_optimizer_fmov(defs, I);

      /* Inline immediates if we can. TODO: systematic */
      if (I->op != AGX_OPCODE_ST_VARY && I->op != AGX_OPCODE_COLLECT &&
          I->op != AGX_OPCODE_TEXTURE_SAMPLE &&
          I->op != AGX_OPCODE_IMAGE_LOAD && I->op != AGX_OPCODE_TEXTURE_LOAD &&
          I->op != AGX_OPCODE_UNIFORM_STORE &&
          I->op != AGX_OPCODE_BLOCK_IMAGE_STORE)
         agx_optimizer_inline_imm(defs, I, info.is_float);

      if (I->op == AGX_OPCODE_IF_ICMP)
         agx_optimizer_if_cmp(defs, I);
      else if (I->op == AGX_OPCODE_ICMPSEL)
         agx_optimizer_cmpsel(defs, I);
      else if (I->op == AGX_OPCODE_BALLOT || I->op == AGX_OPCODE_QUAD_BALLOT)
         agx_optimizer_ballot(ctx, defs, I);
      else if (I->op == AGX_OPCODE_BITOP)
         agx_optimizer_bitop(defs, I);
   }

   free(defs);
}

static void
agx_optimizer_backward(agx_context *ctx)
{
   agx_instr **uses = calloc(ctx->alloc, sizeof(*uses));
   BITSET_WORD *multiple = calloc(BITSET_WORDS(ctx->alloc), sizeof(*multiple));
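   /* uses[v] records the single use of SSA value v seen so far; the multiple
    * bitset marks values with more than one use, which we must not rewrite.
    */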

   agx_foreach_instr_global_rev(ctx, I) {
      struct agx_opcode_info info = agx_opcodes_info[I->op];

      agx_foreach_ssa_src(I, s) {
         if (I->src[s].type == AGX_INDEX_NORMAL) {
            unsigned v = I->src[s].value;

            if (uses[v])
               BITSET_SET(multiple, v);
            else
               uses[v] = I;
         }
      }

      if (info.nr_dests != 1)
         continue;

      if (I->dest[0].type != AGX_INDEX_NORMAL)
         continue;

      agx_instr *use = uses[I->dest[0].value];

      if (!use || BITSET_TEST(multiple, I->dest[0].value))
         continue;

      if (agx_optimizer_not(I, use)) {
         agx_remove_instruction(use);
         continue;
      }

      /* Destination has a single use, try to propagate */
      if (info.is_float && agx_optimizer_fmov_rev(I, use)) {
         agx_remove_instruction(use);
         continue;
      }
   }

   free(uses);
   free(multiple);
}

void
agx_optimizer(agx_context *ctx)
{
   agx_optimizer_backward(ctx);
   agx_optimizer_forward(ctx);
}