1 /*
2 * Copyright 2021 Alyssa Rosenzweig
3 * Copyright 2020 Collabora Ltd.
4 * Copyright 2016 Broadcom
5 * SPDX-License-Identifier: MIT
6 */
7
8 #include "agx_compile.h"
9 #include "asahi/clc/asahi_clc.h"
10 #include "asahi/layout/layout.h"
11 #include "compiler/nir/nir_builder.h"
12 #include "util/bitset.h"
13 #include "util/glheader.h"
14 #include "util/list.h"
15 #include "util/macros.h"
16 #include "util/u_debug.h"
17 #include "util/u_dynarray.h"
18 #include "agx_builder.h"
19 #include "agx_compiler.h"
20 #include "agx_debug.h"
21 #include "agx_nir.h"
22 #include "agx_opcodes.h"
23 #include "glsl_types.h"
24 #include "nir.h"
25 #include "nir_builtin_builder.h"
26 #include "nir_intrinsics.h"
27 #include "nir_intrinsics_indices.h"
28 #include "shader_enums.h"
29
30 /* Cache-line align shader programs. This matches the prop compiler. */
31 #define AGX_CODE_ALIGN 0x80
32
33 /* clang-format off */
34 static const struct debug_named_value agx_debug_options[] = {
35 {"shaders", AGX_DBG_SHADERS, "Dump shaders in NIR and AIR"},
36 {"shaderdb", AGX_DBG_SHADERDB, "Print statistics"},
37 {"verbose", AGX_DBG_VERBOSE, "Disassemble verbosely"},
38 {"internal", AGX_DBG_INTERNAL, "Dump even internal shaders"},
39 {"novalidate",AGX_DBG_NOVALIDATE,"Skip IR validation in debug builds"},
40 {"noopt", AGX_DBG_NOOPT, "Disable backend optimizations"},
41 {"wait", AGX_DBG_WAIT, "Wait after all async instructions"},
42 {"nopreamble",AGX_DBG_NOPREAMBLE,"Do not use shader preambles"},
43 {"demand", AGX_DBG_DEMAND, "Bound tightly to register demand"},
44 {"nosched", AGX_DBG_NOSCHED, "Do not schedule the shader"},
45 {"spill", AGX_DBG_SPILL, "Spill (almost) everything"},
46 {"nopromote", AGX_DBG_NOPROMOTE, "Do not promote constants to uniforms"},
47 DEBUG_NAMED_VALUE_END
48 };
49 /* clang-format on */
50
51 DEBUG_GET_ONCE_FLAGS_OPTION(agx_compiler_debug, "AGX_MESA_DEBUG",
52 agx_debug_options, 0)
53
54 int agx_compiler_debug = 0;
55
56 /*
57 * Pad binary to a given alignment and return aligned offset into the binary.
58 */
59 static unsigned
60 agx_pad_binary(struct util_dynarray *dyn, uint32_t align)
61 {
62 if (dyn->size % align) {
63 unsigned ngrow = align - (dyn->size % align);
64 memset(util_dynarray_grow_bytes(dyn, ngrow, 1), 0, ngrow);
65 }
66
67 assert((dyn->size % align) == 0);
68 return dyn->size;
69 }
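
/*
 * Worked example (illustrative only, not part of the source): with
 * align = AGX_CODE_ALIGN (0x80), a 0x1a4-byte binary is grown by 0x5c zero
 * bytes and the call returns 0x200, the aligned offset at which the next
 * program can be appended.
 */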
70
71 uint64_t
72 agx_get_compiler_debug(void)
73 {
74 return debug_get_option_agx_compiler_debug();
75 }
76
77 static agx_index
78 agx_cached_preload(agx_context *ctx, unsigned base, enum agx_size size)
79 {
80 if (agx_is_null(ctx->preloaded[base])) {
81 agx_block *block = agx_start_block(ctx);
82 agx_builder b = agx_init_builder(ctx, agx_before_block(block));
83 ctx->preloaded[base] = agx_preload(&b, agx_register(base, size));
84 }
85
86 return ctx->preloaded[base];
87 }
88
89 static agx_index
90 agx_tess_coord_x(agx_builder *b)
91 {
92 return agx_cached_preload(b->shader, 4, AGX_SIZE_32);
93 }
94
95 static agx_index
96 agx_tess_coord_y(agx_builder *b)
97 {
98 return agx_cached_preload(b->shader, 6, AGX_SIZE_32);
99 }
100
101 static agx_index
102 agx_vertex_id(agx_builder *b)
103 {
104 return agx_cached_preload(b->shader, 10, AGX_SIZE_32);
105 }
106
107 static agx_index
108 agx_instance_id(agx_builder *b)
109 {
110 return agx_cached_preload(b->shader, 12, AGX_SIZE_32);
111 }
112
113 #define VARYING_NUM_COMPONENTS (VARYING_SLOT_MAX * 4)
114
115 struct coefficient_info {
116 BITSET_DECLARE(smooth, VARYING_NUM_COMPONENTS);
117 BITSET_DECLARE(flat, VARYING_NUM_COMPONENTS);
118 BITSET_DECLARE(noperspective, VARYING_NUM_COMPONENTS);
119 };
120
121 static BITSET_WORD *
122 bitset_for_interp(struct coefficient_info *info, enum glsl_interp_mode mode)
123 {
124 /* clang-format off */
125 switch (mode) {
126 case INTERP_MODE_NONE:
127 case INTERP_MODE_SMOOTH: return info->smooth;
128 case INTERP_MODE_NOPERSPECTIVE: return info->noperspective;
129 case INTERP_MODE_FLAT: return info->flat;
130 default: unreachable("invalid interp mode");
131 }
132 /* clang-format on */
133 }
134
135 static bool
136 gather_cf(nir_builder *b, nir_intrinsic_instr *intr, void *data)
137 {
138 /* First handle frag coord loads */
139 struct coefficient_info *info = data;
140 if (intr->intrinsic == nir_intrinsic_load_frag_coord_zw) {
141 BITSET_SET(info->noperspective,
142 VARYING_SLOT_POS + nir_intrinsic_component(intr));
143 return false;
144 }
145
146 /* Look for input loads and grab the instruction with the interp mode */
147 nir_intrinsic_instr *bary;
148 unsigned nr = 1;
149
150 if (intr->intrinsic == nir_intrinsic_load_coefficients_agx) {
151 bary = intr;
152 /* Always load a scalar */
153 } else if (intr->intrinsic == nir_intrinsic_load_interpolated_input) {
154 bary = nir_src_as_intrinsic(intr->src[0]);
155 nr = intr->num_components;
156
157 /* Perspective interpolation internally reads W */
158 if (nir_intrinsic_interp_mode(bary) != INTERP_MODE_NOPERSPECTIVE)
159 BITSET_SET(info->noperspective, VARYING_SLOT_POS + 3);
160 } else {
161 return false;
162 }
163
164 BITSET_WORD *set = bitset_for_interp(data, nir_intrinsic_interp_mode(bary));
165 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
166 nir_src *offset = nir_get_io_offset_src(intr);
167
168 /* Mark the exact range for direct loads to minimize CF registers, but mark a
169 * conservative bounding range for indirect array access.
170 */
171 if (nir_src_is_const(*offset)) {
172 unsigned location = sem.location + nir_src_as_uint(*offset);
173 unsigned start_comp = (location * 4) + nir_intrinsic_component(intr);
174
175 BITSET_SET_RANGE(set, start_comp, start_comp + nr - 1);
176 } else {
177 unsigned start_comp = (sem.location * 4) + nir_intrinsic_component(intr);
178 bool compact = sem.location == VARYING_SLOT_CLIP_DIST0 ||
179 sem.location == VARYING_SLOT_CLIP_DIST1;
180 unsigned stride = compact ? 1 : 4;
181
182 /* For now we have to assign CF for the whole vec4 to make indirect
183 * indexing work. This could be optimized later.
184 */
185 nr = stride;
186
187 for (unsigned i = 0; i < sem.num_slots; ++i) {
188 BITSET_SET_RANGE(set, start_comp + (i * stride),
189 start_comp + (i * stride) + nr - 1);
190 }
191 }
192
193 return false;
194 }
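
/*
 * Illustrative sketch (not from the source): a direct load_interpolated_input
 * of a smooth vec3 at VARYING_SLOT_VAR0, component 0, sets bits
 * [VARYING_SLOT_VAR0 * 4, VARYING_SLOT_VAR0 * 4 + 2] in info->smooth, plus
 * the W bit (VARYING_SLOT_POS + 3) in info->noperspective since perspective
 * interpolation reads W. An indirect load instead marks the full vec4 range
 * of every slot it might touch.
 */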
195
196 /*
197 * We assign all coefficient registers up front to ensure we have a consistent
198 * layout required for indirects to work.
199 */
200 static void
201 assign_coefficient_regs(nir_shader *nir, struct agx_varyings_fs *var)
202 {
203 struct coefficient_info info = {0};
204 nir_shader_intrinsics_pass(nir, gather_cf, nir_metadata_all, &info);
205
206 /* W */
207 if (BITSET_TEST(info.noperspective, VARYING_SLOT_POS + 3)) {
208 var->bindings[var->nr_bindings++] = (struct agx_cf_binding){
209 .cf_base = var->nr_cf++,
210 .slot = VARYING_SLOT_POS,
211 .offset = 3,
212 .count = 1,
213 .smooth = true,
214 };
215 }
216
217 /* Z */
218 if (BITSET_TEST(info.noperspective, VARYING_SLOT_POS + 2)) {
219 var->bindings[var->nr_bindings++] = (struct agx_cf_binding){
220 .cf_base = var->nr_cf++,
221 .slot = VARYING_SLOT_POS,
222 .offset = 2,
223 .count = 1,
224 .smooth = true,
225 };
226
227 var->reads_z = true;
228 }
229
230 static_assert(VARYING_SLOT_POS == 0, "special and handled first");
231
232 for (unsigned i = VARYING_SLOT_POS + 1; i < VARYING_SLOT_MAX; ++i) {
233 bool smooth = BITSET_TEST_RANGE(info.smooth, i * 4, (i * 4) + 3);
234 bool flat = BITSET_TEST_RANGE(info.flat, i * 4, (i * 4) + 3);
235 bool noperspective =
236 BITSET_TEST_RANGE(info.noperspective, i * 4, (i * 4) + 3);
237
238 if (!(smooth || flat || noperspective))
239 continue;
240
241 /* From the GLSL 4.60 spec ("Input Layout Qualifiers"):
242 *
243 * when location aliasing, the aliases sharing the location must have
244 * the same underlying numerical type and bit width (floating-point or
245 * integer, 32-bit versus 64-bit, etc.) and the same auxiliary storage
246 * and interpolation qualification.
247 *
248 * SPIR-V should obey this as well although the spec text is muddier.
249 */
250 assert((smooth + flat + noperspective) == 1 &&
251 "slots must have consistent interpolation");
252
253 BITSET_WORD *set = smooth ? info.smooth
254 : flat ? info.flat
255 : info.noperspective;
256
257 /* Find the start offset */
258 unsigned offset = 0;
259 for (offset = 0; offset < 4 && !BITSET_TEST(set, (i * 4) + offset);
260 ++offset)
261 ;
262
263 /* Find the end offset. TODO: Do we ever need to split into two bindings
264 * to handle e.g. x_zw read masks?
265 */
266 unsigned count = 0;
267 for (unsigned c = offset; c < 4; ++c) {
268 if (BITSET_TEST(set, (i * 4) + c))
269 count = c - offset + 1;
270 }
271 assert(count >= 1 && (count + offset) <= 4);
272
273 var->bindings[var->nr_bindings++] = (struct agx_cf_binding){
274 .cf_base = var->nr_cf,
275 .slot = i,
276 .offset = offset,
277 .count = count,
278 .smooth = !flat,
279 .perspective = smooth,
280 };
281
282 var->nr_cf += count;
283 }
284 }
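
/*
 * Example (illustrative, assuming only components y and z of
 * VARYING_SLOT_VAR1 are read with flat interpolation): a single binding
 * { .slot = VARYING_SLOT_VAR1, .offset = 1, .count = 2, .smooth = false }
 * is produced with cf_base at the current nr_cf, and nr_cf advances by 2.
 */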
285
286 static agx_index
287 agx_get_cf(agx_context *ctx, gl_varying_slot slot, unsigned offset)
288 {
289 struct agx_varyings_fs *varyings = &ctx->out->varyings.fs;
290
291 /* We already have an appropriate binding, find it */
292 for (unsigned b = 0; b < varyings->nr_bindings; ++b) {
293 if (varyings->bindings[b].slot == slot &&
294 (slot != VARYING_SLOT_POS ||
295 offset == varyings->bindings[b].offset)) {
296
297 signed cf_offset = offset - varyings->bindings[b].offset;
298 assert(cf_offset >= 0);
299
300 return agx_immediate(varyings->bindings[b].cf_base + cf_offset);
301 }
302 }
303
304 unreachable("all coefficient registers preassigned");
305 }
306
307 /* Builds a 64-bit hash table key for an index */
308 static uint64_t
309 agx_index_to_key(agx_index idx)
310 {
311 STATIC_ASSERT(sizeof(idx) <= sizeof(uint64_t));
312
313 uint64_t key = 0;
314 memcpy(&key, &idx, sizeof(idx));
315 return key;
316 }
317
318 /*
319 * Extract a single channel out of a vector source. We split vectors with
320 * p_split so we can use the split components directly, without emitting a
321 * machine instruction. This has advantages for RA, as the split can usually be
322 * optimized away.
323 */
324 static agx_index
325 agx_emit_extract(agx_builder *b, agx_index vec, unsigned channel)
326 {
327 agx_index *components = _mesa_hash_table_u64_search(b->shader->allocated_vec,
328 agx_index_to_key(vec));
329
330 assert(components != NULL && "missing agx_emit_collect_to");
331
332 return components[channel];
333 }
334
335 static agx_index
336 agx_extract_nir_src(agx_builder *b, nir_src src, unsigned channel)
337 {
338 agx_index idx = agx_src_index(&src);
339
340 /* We only deal with scalars, extract a single scalar if needed */
341 if (nir_src_num_components(src) > 1)
342 return agx_emit_extract(b, idx, channel);
343 else
344 return idx;
345 }
346
347 static void
348 agx_cache_collect(agx_builder *b, agx_index dst, unsigned nr_srcs,
349 agx_index *srcs)
350 {
351 /* Lifetime of a hash table entry has to be at least as long as the table */
352 agx_index *channels = ralloc_array(b->shader, agx_index, nr_srcs);
353
354 for (unsigned i = 0; i < nr_srcs; ++i)
355 channels[i] = srcs[i];
356
357 _mesa_hash_table_u64_insert(b->shader->allocated_vec, agx_index_to_key(dst),
358 channels);
359 }
360
361 /*
362 * Combine multiple scalars into a vector destination. This corresponds to
363 * collect, lowered to moves (a shuffle in general) after register allocation.
364 *
365 * To optimize vector extractions, we record the individual channels
366 */
367 static agx_instr *
368 agx_emit_collect_to(agx_builder *b, agx_index dst, unsigned nr_srcs,
369 agx_index *srcs)
370 {
371 agx_cache_collect(b, dst, nr_srcs, srcs);
372
373 if (nr_srcs == 1)
374 return agx_mov_to(b, dst, srcs[0]);
375
376 agx_instr *I = agx_collect_to(b, dst, nr_srcs);
377
378 agx_foreach_src(I, s)
379 I->src[s] = srcs[s];
380
381 return I;
382 }
383
384 static agx_index
385 agx_emit_collect(agx_builder *b, unsigned nr_srcs, agx_index *srcs)
386 {
387 agx_index dst = agx_vec_temp(b->shader, srcs[0].size, nr_srcs);
388 agx_emit_collect_to(b, dst, nr_srcs, srcs);
389 return dst;
390 }
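
/*
 * Sketch of the caching round trip (illustration only): building a vec2 with
 * agx_vec2(b, x, y) records {x, y} against the destination's key, so a later
 * agx_emit_extract(b, vec, 1) simply returns y without emitting any
 * instruction; only the collect itself survives into the IR.
 */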
391
392 static agx_index
393 agx_vec2(agx_builder *b, agx_index s0, agx_index s1)
394 {
395 return agx_emit_collect(b, 2, (agx_index[]){s0, s1});
396 }
397
398 static agx_index
399 agx_pad_to_32(agx_builder *b, agx_index s)
400 {
401 assert(s.size == AGX_SIZE_16);
402 assert(agx_channels(s) == 1);
403
404 agx_index srcs[2] = {s, agx_undef(AGX_SIZE_16)};
405 agx_index dst = agx_vec_temp(b->shader, AGX_SIZE_32, 1);
406 agx_emit_collect_to(b, dst, 2, srcs);
407 return dst;
408 }
409
410 static agx_index
411 agx_recollect_vector(agx_builder *b, nir_src vec)
412 {
413 agx_index comps[4];
414 unsigned nr = nir_src_num_components(vec);
415
416 for (unsigned i = 0; i < nr; ++i)
417 comps[i] = agx_extract_nir_src(b, vec, i);
418
419 return agx_emit_collect(b, nr, comps);
420 }
421
422 /*
423 * Extract the lower or upper N-bits from a (2*N)-bit quantity. We use a split
424 * without null destinations to let us CSE (and coalesce) the splits when both x
425 * and y are split.
426 */
427 static agx_instr *
428 agx_subdivide_to(agx_builder *b, agx_index dst, agx_index s0, unsigned comp)
429 {
430 assert((s0.size == (dst.size + 1)) && "only 2x subdivide handled");
431 assert((comp == 0 || comp == 1) && "too many components");
432
433 /* Handle immediates specially so we don't have to constant fold splits. */
434 if (s0.type == AGX_INDEX_IMMEDIATE) {
435 unsigned bits = 16 * agx_size_align_16(dst.size);
436 return agx_mov_imm_to(b, dst, (s0.value >> (comp * bits)) & BITFIELD64_MASK(bits));
437 }
438
439 agx_instr *split = agx_split(b, 2, s0);
440 split->dest[comp] = dst;
441 split->dest[1 - comp] = agx_temp(b->shader, dst.size);
442 return split;
443 }
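
/*
 * Worked example (for illustration): subdividing the 32-bit immediate
 * 0x12345678 into 16-bit halves yields 0x5678 for comp = 0 and 0x1234 for
 * comp = 1; register sources instead become one half of a two-destination
 * split.
 */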
444
445 void
446 agx_block_add_successor(agx_block *block, agx_block *successor)
447 {
448 assert(block != NULL && successor != NULL);
449
450 /* Cull impossible edges */
451 if (block->unconditional_jumps)
452 return;
453
454 for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
455 if (block->successors[i]) {
456 if (block->successors[i] == successor)
457 return;
458 else
459 continue;
460 }
461
462 block->successors[i] = successor;
463 util_dynarray_append(&successor->predecessors, agx_block *, block);
464 return;
465 }
466
467 unreachable("Too many successors");
468 }
469
470 /*
471 * Splits an n-component vector (vec) into n scalar destinations (dests) using a
472 * split pseudo-instruction.
473 *
474 * Pre-condition: dests is filled with agx_null().
475 */
476 static void
477 agx_emit_split(agx_builder *b, agx_index *dests, agx_index vec, unsigned n)
478 {
479 agx_instr *I = agx_split(b, n, vec);
480
481 agx_foreach_dest(I, d) {
482 dests[d] = agx_temp(b->shader, vec.size);
483 I->dest[d] = dests[d];
484 }
485 }
486
487 static void
488 agx_emit_cached_split(agx_builder *b, agx_index vec, unsigned n)
489 {
490 agx_index dests[4] = {agx_null(), agx_null(), agx_null(), agx_null()};
491 agx_emit_split(b, dests, vec, n);
492 agx_cache_collect(b, vec, n, dests);
493 }
494
495 static void
496 agx_emit_load_const(agx_builder *b, nir_load_const_instr *instr)
497 {
498 /* Ensure we've been scalarized and bit size lowered */
499 unsigned bit_size = instr->def.bit_size;
500 assert(instr->def.num_components == 1);
501
502 /* Emit move, later passes can inline/push if useful */
503 agx_mov_imm_to(b, agx_def_index(&instr->def),
504 nir_const_value_as_uint(instr->value[0], bit_size));
505 }
506
507 /*
508 * Implement mul_high of 32-bit sources by doing a 32x32->64-bit multiply and
509 * extracting only the high word.
510 */
511 static agx_instr *
512 agx_mul_high_to(agx_builder *b, agx_index dst, agx_index P, agx_index Q,
513 bool is_signed)
514 {
515 assert(P.size == Q.size && "source sizes must match");
516 assert(P.size == dst.size && "dest size must match");
517 assert(P.size != AGX_SIZE_64 && "64x64 multiply should have been lowered");
518
519 static_assert(AGX_SIZE_64 == (AGX_SIZE_32 + 1), "enum wrong");
520 static_assert(AGX_SIZE_32 == (AGX_SIZE_16 + 1), "enum wrong");
521
522 if (!is_signed) {
523 P = agx_abs(P);
524 Q = agx_abs(Q);
525 }
526
527 agx_index product = agx_temp(b->shader, P.size + 1);
528 agx_imad_to(b, product, P, Q, agx_zero(), 0);
529
530 return agx_subdivide_to(b, dst, product, 1);
531 }
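
/*
 * Equivalent C for the unsigned 32-bit case (reference only, never emitted):
 *
 *    uint32_t umul_high(uint32_t p, uint32_t q)
 *    {
 *       return (uint32_t)(((uint64_t)p * q) >> 32);
 *    }
 */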
532
533 static enum agx_format
534 agx_format_for_pipe(enum pipe_format format)
535 {
536 #define CASE(x) \
537 if (format == (enum pipe_format)AIL_ISA_FORMAT_##x) \
538 return AGX_FORMAT_##x;
539
540 CASE(I8);
541 CASE(I16);
542 CASE(I32);
543 CASE(F16);
544 CASE(U8NORM);
545 CASE(S8NORM);
546 CASE(U16NORM);
547 CASE(S16NORM);
548 CASE(RGB10A2);
549 CASE(SRGBA8);
550 CASE(RG11B10F);
551 CASE(RGB9E5);
552
553 #undef CASE
554 unreachable("Invalid format");
555 }
556
557 static agx_index
558 cf_for_intrinsic(agx_builder *b, nir_intrinsic_instr *intr)
559 {
560 /* Determine the base location, taking into account a constant offset */
561 unsigned location = nir_intrinsic_io_semantics(intr).location;
562 bool compact = location == VARYING_SLOT_CLIP_DIST0 ||
563 location == VARYING_SLOT_CLIP_DIST1;
564
565 nir_src *offset = nir_get_io_offset_src(intr);
566 if (nir_src_is_const(*offset)) {
567 /* XXX: NIR is broken and uses constant offsets in slots but dynamic
568 * offsets in scalars for compact varyings. This needs to be fixed
569 * upstream.
570 */
571 location += nir_src_as_uint(*offset);
572 }
573
574 agx_index I = agx_get_cf(b->shader, location, nir_intrinsic_component(intr));
575
576 /* If we have a non-constant offset, we add it to the CF. Offsets are in
577 * vec4 slots (unless we're compact) but the CF is in components, so we need
578 * to shift the offset by 2 before adding.
579 */
580 if (!nir_src_is_const(*offset)) {
581 I = agx_iadd(b, I, agx_src_index(offset), compact ? 0 : 2);
582 }
583
584 return I;
585 }
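
/*
 * Illustrative example (not from the source): for a non-compact input with a
 * dynamic offset d, the returned index is cf_base + (d << 2), since each vec4
 * slot occupies four consecutive coefficient registers; compact clip-distance
 * slots add d directly.
 */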
586
587 static enum agx_interpolation
588 agx_interp_for_bary(nir_intrinsic_instr *bary, agx_index *sample_index)
589 {
590 switch (bary->intrinsic) {
591 case nir_intrinsic_load_barycentric_pixel:
592 return AGX_INTERPOLATION_CENTER;
593
594 case nir_intrinsic_load_barycentric_centroid:
595 return AGX_INTERPOLATION_CENTROID;
596
597 case nir_intrinsic_load_barycentric_at_sample:
598 *sample_index = agx_src_index(&bary->src[0]);
599 return AGX_INTERPOLATION_SAMPLE;
600
601 default:
602 unreachable("should have been lowered");
603 }
604 }
605
606 static void
607 agx_emit_load_vary(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
608 {
609 ASSERTED unsigned components = instr->num_components;
610 nir_intrinsic_instr *bary = nir_src_as_intrinsic(instr->src[0]);
611
612 assert(components >= 1 && components <= 4);
613
614 agx_index sample_index = agx_zero();
615 enum agx_interpolation interp = agx_interp_for_bary(bary, &sample_index);
616
617 bool perspective =
618 nir_intrinsic_interp_mode(bary) != INTERP_MODE_NOPERSPECTIVE;
619
620 agx_index I = cf_for_intrinsic(b, instr);
621
622 /* For perspective interpolation, we project (multiply by 1/W) */
623 if (perspective) {
624 agx_index J = agx_get_cf(b->shader, VARYING_SLOT_POS, 3);
625 agx_iterproj_to(b, dest, I, J, sample_index, components, interp);
626 } else {
627 agx_iter_to(b, dest, I, sample_index, components, interp);
628 }
629
630 agx_emit_cached_split(b, dest, components);
631 }
632
633 static void
634 agx_wait_pixel_mask(agx_builder *b, uint32_t mask)
635 {
636 /* Background programs do not need to wait as they are the eldest pixels */
637 if (b->shader->key->fs.ignore_tib_dependencies) {
638 assert(b->shader->nir->info.internal);
639 return;
640 }
641
642 /* No need to wait twice on a fence */
643 mask &= ~b->shader->already_pixel_waited;
644 if (mask == 0) {
645 return;
646 }
647
648 agx_wait_pix(b, mask);
649
650 /* Only mark the fence as waited if we're not in control flow. Eventually we
651 * should do something smarter with a dataflow.
652 */
653 if (b->shader->total_nesting == 0) {
654 b->shader->already_pixel_waited |= mask;
655 }
656 }
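
/*
 * Example of the caching behaviour (illustration only): two consecutive
 * tilebuffer stores outside of control flow each request mask 0xC, but only
 * the first emits wait_pix; the second sees the bits already recorded in
 * already_pixel_waited and becomes a no-op.
 */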
657
658 static agx_instr *
659 agx_emit_local_store_pixel(agx_builder *b, nir_intrinsic_instr *instr)
660 {
661 bool explicit = nir_intrinsic_explicit_coord(instr);
662
663 /* TODO: Reverse-engineer interactions with MRT */
664 if (b->shader->stage == MESA_SHADER_FRAGMENT) {
665 agx_wait_pixel_mask(b, 0xC);
666 }
667
668 /* Compact the registers according to the mask */
669 agx_index compacted[4] = {agx_null()};
670
671 unsigned compact_count = 0;
672 u_foreach_bit(i, nir_intrinsic_write_mask(instr)) {
673 compacted[compact_count++] = agx_extract_nir_src(b, instr->src[0], i);
674 }
675
676 agx_index collected = agx_emit_collect(b, compact_count, compacted);
677 agx_index coords = explicit ? agx_src_index(&instr->src[2]) : agx_null();
678
679 b->shader->out->tag_write_disable = false;
680 return agx_st_tile(b, collected, agx_src_index(&instr->src[1]), coords,
681 agx_format_for_pipe(nir_intrinsic_format(instr)),
682 nir_intrinsic_write_mask(instr),
683 nir_intrinsic_base(instr), explicit);
684 }
685
686 static agx_instr *
687 agx_emit_store_zs(agx_builder *b, nir_intrinsic_instr *instr)
688 {
689 unsigned base = nir_intrinsic_base(instr);
690 bool write_z = base & 1;
691 bool write_s = base & 2;
692
693 agx_index z = agx_src_index(&instr->src[1]);
694 agx_index s = agx_src_index(&instr->src[2]);
695
696 assert(!write_z || z.size == AGX_SIZE_32);
697 assert(!write_s || s.size == AGX_SIZE_16);
698
699 if (write_z && write_s) {
700 agx_index u2u32 = agx_temp(b->shader, AGX_SIZE_32);
701 agx_mov_to(b, u2u32, s);
702 s = u2u32;
703 }
704
705 agx_index zs = (write_z && write_s) ? agx_vec2(b, z, s) : write_z ? z : s;
706
707 /* Not necessarily a sample mask but overlapping hw mechanism... Should
708 * maybe rename this flag to something more general.
709 */
710 b->shader->out->writes_sample_mask = true;
711
712 agx_wait_pixel_mask(b, 0x1);
713 return agx_zs_emit(b, agx_src_index(&instr->src[0]), zs, base);
714 }
715
716 static void
717 agx_emit_local_load_pixel(agx_builder *b, agx_index dest,
718 nir_intrinsic_instr *instr)
719 {
720 agx_wait_pixel_mask(b, 0x8);
721
722 unsigned nr_comps = instr->def.num_components;
723 agx_ld_tile_to(b, dest, agx_src_index(&instr->src[0]), agx_null(),
724 agx_format_for_pipe(nir_intrinsic_format(instr)),
725 BITFIELD_MASK(nr_comps), nir_intrinsic_base(instr), false);
726 agx_emit_cached_split(b, dest, nr_comps);
727 }
728
729 static bool
730 nir_is_coherent(nir_intrinsic_instr *instr)
731 {
732 return nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE);
733 }
734
735 static void
736 agx_emit_load(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
737 {
738 agx_index addr = agx_src_index(&instr->src[0]);
739 agx_index offset = agx_src_index(&instr->src[1]);
740 enum agx_format fmt = agx_format_for_pipe(nir_intrinsic_format(instr));
741 unsigned shift = nir_intrinsic_base(instr);
742
743 /* Zero-extend offset if we're not sign-extending */
744 if (!nir_intrinsic_sign_extend(instr))
745 offset = agx_abs(offset);
746
747 agx_device_load_to(b, dest, addr, offset, fmt,
748 BITFIELD_MASK(instr->def.num_components), shift,
749 nir_is_coherent(instr));
750 agx_emit_cached_split(b, dest, instr->def.num_components);
751 }
752
753 static void
754 agx_emit_store(agx_builder *b, nir_intrinsic_instr *instr)
755 {
756 agx_index addr = agx_src_index(&instr->src[1]);
757 agx_index offset = agx_src_index(&instr->src[2]);
758 enum agx_format fmt = agx_format_for_pipe(nir_intrinsic_format(instr));
759 unsigned shift = nir_intrinsic_base(instr);
760
761 /* Zero-extend offset if we're not sign-extending */
762 if (!nir_intrinsic_sign_extend(instr))
763 offset = agx_abs(offset);
764
765 agx_device_store(b, agx_recollect_vector(b, instr->src[0]), addr, offset,
766 fmt, BITFIELD_MASK(nir_src_num_components(instr->src[0])),
767 shift, nir_is_coherent(instr));
768 }
769
770 /* Preambles write directly to uniform registers, so move from uniform to GPR */
771 static agx_instr *
772 agx_emit_load_preamble(agx_builder *b, agx_index dst,
773 nir_intrinsic_instr *instr)
774 {
775 agx_index srcs[4] = {agx_null()};
776 unsigned dim = instr->def.num_components;
777 assert(dim <= ARRAY_SIZE(srcs) && "shouldn't see larger vectors");
778
779 unsigned base = nir_intrinsic_base(instr);
780 unsigned stride = agx_size_align_16(dst.size);
781
782 for (unsigned i = 0; i < dim; ++i)
783 srcs[i] = agx_uniform(base + i * stride, dst.size);
784
785 return agx_emit_collect_to(b, dst, dim, srcs);
786 }
787
788 static agx_instr *
789 agx_emit_store_preamble(agx_builder *b, nir_intrinsic_instr *instr)
790 {
791 agx_index vec = agx_src_index(&instr->src[0]);
792 unsigned base = nir_intrinsic_base(instr);
793 unsigned stride = agx_size_align_16(vec.size);
794 unsigned nr = nir_src_num_components(instr->src[0]);
795
796 for (unsigned i = 0; i < nr; i += (4 / stride)) {
797 agx_index data[4] = {0};
798 unsigned count = MIN2(4 / stride, nr - i);
799
800 for (unsigned c = 0; c < count; ++c) {
801 data[c] = agx_extract_nir_src(b, instr->src[0], i + c);
802 }
803
804 agx_uniform_store(b, agx_emit_collect(b, count, data),
805 agx_immediate(base + i * stride), BITFIELD_MASK(count));
806 }
807
808 return NULL;
809 }
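
/*
 * Worked example (illustrative): storing a vec4 of 32-bit values (stride 2 in
 * 16-bit units) issues two uniform_store ops of two components each, at
 * uniform offsets base and base + 4.
 */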
810
811 static enum agx_dim
812 agx_tex_dim(enum glsl_sampler_dim dim, bool array)
813 {
814 switch (dim) {
815 case GLSL_SAMPLER_DIM_1D:
816 return array ? AGX_DIM_1D_ARRAY : AGX_DIM_1D;
817
818 case GLSL_SAMPLER_DIM_2D:
819 case GLSL_SAMPLER_DIM_RECT:
820 case GLSL_SAMPLER_DIM_EXTERNAL:
821 return array ? AGX_DIM_2D_ARRAY : AGX_DIM_2D;
822
823 case GLSL_SAMPLER_DIM_MS:
824 return array ? AGX_DIM_2D_MS_ARRAY : AGX_DIM_2D_MS;
825
826 case GLSL_SAMPLER_DIM_3D:
827 assert(!array && "3D arrays unsupported");
828 return AGX_DIM_3D;
829
830 case GLSL_SAMPLER_DIM_CUBE:
831 return array ? AGX_DIM_CUBE_ARRAY : AGX_DIM_CUBE;
832
833 case GLSL_SAMPLER_DIM_BUF:
834 unreachable("Buffer textures should have been lowered");
835
836 default:
837 unreachable("Invalid sampler dim\n");
838 }
839 }
840
841 /*
842 * In the hardware, bindless texture sources are specified as a 64-bit uniform
843 * base address summed with a 32-bit register index. In NIR, we model this as a
844 * vec2, where the first source is the (constant) uniform register number and
845 * the second source is the (dynamic) byte offset.
846 */
847 static agx_index
848 agx_translate_bindless_handle(agx_builder *b, nir_src *handle, agx_index *base)
849 {
850 nir_scalar base_scalar = nir_scalar_resolved(handle->ssa, 0);
851 assert(nir_scalar_is_const(base_scalar) && "base must be constant");
852
853 unsigned base_uint = nir_scalar_as_uint(base_scalar);
854 *base = agx_uniform(base_uint, AGX_SIZE_64);
855
856 return agx_emit_extract(b, agx_src_index(handle), 1);
857 }
858
859 static agx_instr *
860 agx_emit_block_image_store(agx_builder *b, nir_intrinsic_instr *instr)
861 {
862 agx_index offset = agx_src_index(&instr->src[1]);
863 agx_index coords = agx_src_index(&instr->src[2]);
864 enum agx_format format = agx_format_for_pipe(nir_intrinsic_format(instr));
865
866 bool ms = nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_MS;
867 bool array = nir_intrinsic_image_array(instr);
868 enum agx_dim dim = agx_tex_dim(nir_intrinsic_image_dim(instr), array);
869 bool explicit = nir_intrinsic_explicit_coord(instr);
870
871 /* 32-bit source physically, 16-bit in NIR, top half ignored but needed
872 * logically to ensure alignment.
873 */
874 offset = agx_pad_to_32(b, offset);
875
876 /* Modified coordinate descriptor */
877 if (!explicit) {
878 if (array) {
879 agx_index layer = coords;
880 coords = agx_temp(b->shader, AGX_SIZE_32);
881 agx_emit_collect_to(b, coords, 2,
882 (agx_index[]){
883 ms ? agx_mov_imm(b, 16, 0) : layer,
884 ms ? layer : agx_undef(AGX_SIZE_16),
885 });
886 } else {
887 coords = agx_null();
888 }
889 }
890
891 agx_index base, index;
892 if (instr->intrinsic == nir_intrinsic_bindless_image_store_block_agx) {
893 index = agx_translate_bindless_handle(b, &instr->src[0], &base);
894
895 assert(base.size == AGX_SIZE_64);
896 assert(index.size == AGX_SIZE_32);
897 } else {
898 base = agx_zero();
899 index = agx_src_index(&instr->src[0]);
900
901 assert(index.size == AGX_SIZE_16);
902 }
903
904 // XXX: how does this possibly work
905 if (format == AGX_FORMAT_F16)
906 format = AGX_FORMAT_I16;
907
908 return agx_block_image_store(b, base, index, offset, coords, format, dim,
909 explicit);
910 }
911
912 static agx_instr *
913 agx_load_compute_dimension(agx_builder *b, agx_index dst,
914 nir_intrinsic_instr *instr, enum agx_sr base)
915 {
916 unsigned dim = instr->def.num_components;
917 unsigned size = instr->def.bit_size;
918 assert(size == 16 || size == 32);
919
920 agx_index srcs[] = {
921 agx_get_sr(b, size, base + 0),
922 agx_get_sr(b, size, base + 1),
923 agx_get_sr(b, size, base + 2),
924 };
925
926 return agx_emit_collect_to(b, dst, dim, srcs);
927 }
928
929 static enum agx_atomic_opc
930 translate_atomic_opcode(nir_atomic_op op)
931 {
932 /* clang-format off */
933 switch (op) {
934 case nir_atomic_op_iadd: return AGX_ATOMIC_OPC_ADD;
935 case nir_atomic_op_imin: return AGX_ATOMIC_OPC_IMIN;
936 case nir_atomic_op_umin: return AGX_ATOMIC_OPC_UMIN;
937 case nir_atomic_op_imax: return AGX_ATOMIC_OPC_IMAX;
938 case nir_atomic_op_umax: return AGX_ATOMIC_OPC_UMAX;
939 case nir_atomic_op_iand: return AGX_ATOMIC_OPC_AND;
940 case nir_atomic_op_ior: return AGX_ATOMIC_OPC_OR;
941 case nir_atomic_op_ixor: return AGX_ATOMIC_OPC_XOR;
942 case nir_atomic_op_xchg: return AGX_ATOMIC_OPC_XCHG;
943 case nir_atomic_op_cmpxchg: return AGX_ATOMIC_OPC_CMPXCHG;
944 default: unreachable("unknown atomic opcode");
945 }
946 /* clang-format on */
947 }
948
949 /*
950 * The "base" of a local load/store/atomic can be zero but no other immediates.
951 * This would be a little silly to handle when inlining immediates, so we
952 * instead exclude these ops from immediate inlining and just handle 0 specially
953 * when translating.
954 */
955 static agx_index
956 agx_local_base(nir_src src)
957 {
958 if (nir_src_is_const(src) && nir_src_as_uint(src) == 0)
959 return agx_zero();
960 else
961 return agx_src_index(&src);
962 }
963
964 static void
965 agx_emit_atomic(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr,
966 bool local)
967 {
968 enum agx_atomic_opc op =
969 translate_atomic_opcode(nir_intrinsic_atomic_op(instr));
970 agx_index base =
971 local ? agx_local_base(instr->src[0]) : agx_src_index(&instr->src[0]);
972 agx_index value = agx_src_index(&instr->src[local ? 1 : 2]);
973 agx_index index = local ? agx_zero() : agx_src_index(&instr->src[1]);
974
975 /* cmpxchg (only) takes 2 sources, passed in consecutive registers */
976 if (op == AGX_ATOMIC_OPC_CMPXCHG) {
977 agx_index value2 = agx_src_index(&instr->src[local ? 2 : 3]);
978 value = agx_vec2(b, value2, value);
979 }
980
981 if (local) {
982 assert(base.size == AGX_SIZE_16);
983 agx_local_atomic_to(b, dst, value, base, index, op);
984 } else {
985 assert(base.size == AGX_SIZE_64);
986 agx_atomic_to(b, dst, value, base, index, op);
987 }
988 }
989
990 static enum agx_format
991 format_for_bitsize(unsigned bitsize)
992 {
993 switch (bitsize) {
994 case 8:
995 return AGX_FORMAT_I8;
996 case 16:
997 return AGX_FORMAT_I16;
998 case 32:
999 return AGX_FORMAT_I32;
1000 default:
1001 unreachable("should've been lowered");
1002 }
1003 }
1004
1005 static void
1006 agx_emit_local_load(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr)
1007 {
1008 agx_index base = agx_local_base(instr->src[0]);
1009 agx_index index = agx_zero(); /* TODO: optimize address arithmetic */
1010 assert(base.size == AGX_SIZE_16);
1011
1012 enum agx_format format = format_for_bitsize(instr->def.bit_size);
1013 unsigned nr = instr->def.num_components;
1014 unsigned mask = BITFIELD_MASK(nr);
1015
1016 agx_local_load_to(b, dst, base, index, format, mask);
1017 agx_emit_cached_split(b, dst, nr);
1018 }
1019
1020 static void
1021 agx_emit_local_store(agx_builder *b, nir_intrinsic_instr *instr)
1022 {
1023 agx_index value = agx_src_index(&instr->src[0]);
1024 agx_index base = agx_local_base(instr->src[1]);
1025 agx_index index = agx_zero(); /* TODO: optimize address arithmetic */
1026 assert(base.size == AGX_SIZE_16);
1027
1028 enum agx_format format = format_for_bitsize(nir_src_bit_size(instr->src[0]));
1029 unsigned mask = BITFIELD_MASK(
1030 nir_src_num_components(instr->src[0])); /* XXX: there's a write mask */
1031
1032 agx_local_store(b, value, base, index, format, mask);
1033 }
1034
1035 static void
1036 agx_emit_load_scratch(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr)
1037 {
1038 agx_index offset = agx_src_index(&instr->src[0]);
1039 enum agx_format format = format_for_bitsize(instr->def.bit_size);
1040 unsigned nr = instr->def.num_components;
1041 unsigned mask = BITFIELD_MASK(nr);
1042
1043 agx_stack_load_to(b, dst, offset, format, mask);
1044 agx_emit_cached_split(b, dst, nr);
1045 b->shader->any_scratch = true;
1046 }
1047
1048 static void
1049 agx_emit_store_scratch(agx_builder *b, nir_intrinsic_instr *instr)
1050 {
1051 agx_index value = agx_recollect_vector(b, instr->src[0]);
1052 agx_index offset = agx_src_index(&instr->src[1]);
1053 enum agx_format format = format_for_bitsize(nir_src_bit_size(instr->src[0]));
1054 unsigned mask = BITFIELD_MASK(nir_src_num_components(instr->src[0]));
1055
1056 agx_stack_store(b, value, offset, format, mask);
1057 b->shader->any_scratch = true;
1058 }
1059
1060 static unsigned
1061 agx_expand_tex_to(agx_builder *b, nir_def *def, agx_index src, bool masked)
1062 {
1063 unsigned nr_channels = def->num_components;
1064 nir_component_mask_t mask = nir_def_components_read(def);
1065
1066 if (!masked)
1067 mask = (nir_component_mask_t)BITFIELD_MASK(nr_channels);
1068
1069 agx_index packed_channels[4] = {agx_null()};
1070 agx_index unpacked_channels[4] = {agx_null()};
1071
1072 /* Hardware writes the masked components contiguously, expand out for NIR */
1073 agx_emit_split(b, packed_channels, src, 4 /* XXX: why not nr_channels */);
1074
1075 for (unsigned i = 0; i < nr_channels; ++i) {
1076 unpacked_channels[i] =
1077 (mask & BITFIELD_BIT(i))
1078 ? packed_channels[util_bitcount(mask & BITFIELD_MASK(i))]
1079 : agx_undef(src.size);
1080 }
1081
1082 agx_emit_collect_to(b, agx_def_index(def), nr_channels, unpacked_channels);
1083 return mask;
1084 }
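
/*
 * Worked example (illustration only): if NIR reads only .xz of a 4-component
 * texture result, mask = 0b0101 and the hardware packs the two live channels
 * contiguously; the collect then expands them back as
 * { packed[0], undef, packed[1], undef } to match NIR's component numbering.
 */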
1085
1086 static agx_instr *
1087 agx_emit_image_load(agx_builder *b, agx_index dst, nir_intrinsic_instr *intr)
1088 {
1089 agx_index ms_index = agx_src_index(&intr->src[2]);
1090 agx_index lod = agx_src_index(&intr->src[3]);
1091 enum agx_lod_mode lod_mode = AGX_LOD_MODE_LOD_MIN;
1092
1093 agx_index bindless = agx_immediate(0), texture;
1094 if (intr->intrinsic == nir_intrinsic_bindless_image_load)
1095 texture = agx_translate_bindless_handle(b, &intr->src[0], &bindless);
1096 else if (nir_src_is_const(intr->src[0]) &&
1097 nir_src_as_uint(intr->src[0]) < 0x100)
1098 texture = agx_immediate(nir_src_as_uint(intr->src[0]));
1099 else
1100 texture = agx_src_index(&intr->src[0]);
1101
1102 assert(nir_src_num_components(intr->src[1]) == 4);
1103 agx_index coord[4] = {
1104 agx_extract_nir_src(b, intr->src[1], 0),
1105 agx_extract_nir_src(b, intr->src[1], 1),
1106 agx_extract_nir_src(b, intr->src[1], 2),
1107 agx_extract_nir_src(b, intr->src[1], 3),
1108 };
1109
1110 /* Get the image dimension. Cubes are lowered to 2D, since they are logically
1111 * equivalent for imageLoad, but out-of-bounds behaviour for cubes on G13
1112 * is wrong according to Piglit's arb_shader_image_load_store-invalid.
1113 *
1114 * This requires a matching transform in the driver.
1115 */
1116 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr);
1117 bool is_array = nir_intrinsic_image_array(intr);
1118
1119 if (dim == GLSL_SAMPLER_DIM_CUBE) {
1120 dim = GLSL_SAMPLER_DIM_2D;
1121 is_array = true;
1122 }
1123
1124 bool is_ms = dim == GLSL_SAMPLER_DIM_MS;
1125 unsigned coord_comps = glsl_get_sampler_dim_coordinate_components(dim);
1126 if (is_array && is_ms) {
1127 agx_index layer = agx_temp(b->shader, AGX_SIZE_16);
1128 agx_subdivide_to(b, layer, coord[coord_comps], 0);
1129
1130 assert(ms_index.size == AGX_SIZE_16);
1131 agx_index tmp = agx_temp(b->shader, AGX_SIZE_32);
1132 agx_emit_collect_to(b, tmp, 2, (agx_index[]){ms_index, layer});
1133 coord[coord_comps++] = tmp;
1134 } else if (is_ms) {
1135 agx_index tmp = agx_temp(b->shader, AGX_SIZE_32);
1136 agx_mov_to(b, tmp, ms_index);
1137 coord[coord_comps++] = tmp;
1138 } else if (is_array) {
1139 coord_comps++;
1140 }
1141
1142 /* Multisampled images do not support mipmapping */
1143 if (is_ms) {
1144 lod_mode = AGX_LOD_MODE_AUTO_LOD;
1145 lod = agx_zero();
1146 }
1147
1148 agx_index coords = agx_emit_collect(b, coord_comps, coord);
1149 agx_index tmp = agx_vec_temp(b->shader, dst.size, 4);
1150
1151 agx_instr *I = agx_image_load_to(
1152 b, tmp, coords, lod, bindless, texture, agx_immediate(0), agx_null(),
1153 agx_tex_dim(dim, is_array), lod_mode, 0, false, nir_is_coherent(intr));
1154 I->mask = agx_expand_tex_to(b, &intr->def, tmp, true);
1155
1156 b->shader->out->uses_txf = true;
1157 return NULL;
1158 }
1159
1160 static agx_instr *
1161 agx_emit_export(agx_builder *b, unsigned base, nir_src src)
1162 {
1163 agx_builder b_ = *b;
1164 agx_cursor after_cursor = agx_after_block(agx_exit_block(b->shader));
1165 b_.cursor = after_cursor;
1166
1167 for (unsigned c = 0; c < nir_src_num_components(src); ++c) {
1168 agx_index chan = agx_extract_nir_src(b, src, c);
1169 unsigned stride = agx_size_align_16(chan.size);
1170
1171 agx_export(&b_, chan, base + (c * stride));
1172 }
1173
1174 if (agx_cursors_equal(b->cursor, after_cursor)) {
1175 b->cursor = agx_after_block_logical(b->cursor.block);
1176 }
1177
1178 return NULL;
1179 }
1180
1181 static agx_instr *
1182 agx_load_exported_to(agx_builder *b, agx_index dst, unsigned base, unsigned nr)
1183 {
1184 agx_index chans[4] = {0};
1185 unsigned stride = agx_size_align_16(dst.size);
1186
1187 for (unsigned c = 0; c < nr; ++c) {
1188 chans[c] = agx_cached_preload(b->shader, base + c * stride, dst.size);
1189 }
1190
1191 return agx_emit_collect_to(b, dst, nr, chans);
1192 }
1193
1194 static agx_instr *
1195 agx_emit_image_store(agx_builder *b, nir_intrinsic_instr *instr)
1196 {
1197 /* See remarks in agx_emit_image_load */
1198 enum glsl_sampler_dim glsl_dim = nir_intrinsic_image_dim(instr);
1199 bool is_array = nir_intrinsic_image_array(instr);
1200
1201 if (glsl_dim == GLSL_SAMPLER_DIM_CUBE) {
1202 glsl_dim = GLSL_SAMPLER_DIM_2D;
1203 is_array = true;
1204 }
1205
1206 enum agx_dim dim = agx_tex_dim(glsl_dim, is_array);
1207 assert(glsl_dim != GLSL_SAMPLER_DIM_MS && "needs to be lowered");
1208
1209 agx_index base, index;
1210 if (instr->intrinsic == nir_intrinsic_bindless_image_store) {
1211 index = agx_translate_bindless_handle(b, &instr->src[0], &base);
1212
1213 assert(base.size == AGX_SIZE_64);
1214 assert(index.size == AGX_SIZE_32);
1215 } else {
1216 base = agx_zero();
1217 index = agx_src_index(&instr->src[0]);
1218
1219 assert(index.size == AGX_SIZE_16);
1220 }
1221
1222 agx_index coords4 = agx_src_index(&instr->src[1]);
1223 agx_index lod = agx_src_index(&instr->src[4]);
1224 assert(lod.size == AGX_SIZE_16);
1225
1226 int coord_components = glsl_get_sampler_dim_coordinate_components(glsl_dim);
1227 if (is_array)
1228 coord_components++;
1229
1230 agx_index coord_comps[4] = {};
1231 for (unsigned i = 0; i < coord_components; ++i)
1232 coord_comps[i] = agx_emit_extract(b, coords4, i);
1233
1234 agx_index coords = agx_emit_collect(b, coord_components, coord_comps);
1235 agx_index data = agx_src_index(&instr->src[3]);
1236
1237 /* If the image format has less than 4 components, nir_opt_shrink_stores can
1238 * shrink the store. But the IR still expects 4 components: pad with undef.
1239 */
1240 if (nir_src_num_components(instr->src[3]) < 4) {
1241 agx_index chan[4] = {agx_null()};
1242
1243 for (unsigned i = 0; i < 4; ++i) {
1244 if (i < nir_src_num_components(instr->src[3]))
1245 chan[i] = agx_extract_nir_src(b, instr->src[3], i);
1246 else
1247 chan[i] = agx_undef(data.size);
1248 }
1249
1250 data = agx_emit_collect(b, 4, chan);
1251 }
1252
1253 /* Image stores act like tilebuffer stores when used for tib spilling */
1254 b->shader->out->tag_write_disable = false;
1255
1256 return agx_image_write(b, data, coords, lod, base, index, dim,
1257 nir_is_coherent(instr));
1258 }
1259
1260 static enum agx_simd_op
1261 translate_simd_op(nir_op op)
1262 {
1263 #define CASE(agx_, nir_) \
1264 case nir_op_##nir_: \
1265 return AGX_SIMD_OP_##agx_;
1266
1267 switch (op) {
1268 CASE(AND, iand)
1269 CASE(FADD, fadd)
1270 CASE(OR, ior)
1271 CASE(FMUL, fmul)
1272 CASE(XOR, ixor)
1273 CASE(FMIN, fmin)
1274 CASE(FMAX, fmax)
1275 CASE(IADD, iadd)
1276 CASE(SMIN, imin)
1277 CASE(SMAX, imax)
1278 CASE(UMIN, umin)
1279 CASE(UMAX, umax)
1280 default:
1281 unreachable("unknown simd op");
1282 }
1283 #undef CASE
1284 }
1285
1286 static agx_instr *
1287 agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
1288 {
1289 agx_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest
1290 ? agx_def_index(&instr->def)
1291 : agx_null();
1292 gl_shader_stage stage = b->shader->stage;
1293
1294 switch (instr->intrinsic) {
1295 case nir_intrinsic_load_barycentric_pixel:
1296 case nir_intrinsic_load_barycentric_centroid:
1297 case nir_intrinsic_load_barycentric_at_sample:
1298 case nir_intrinsic_load_barycentric_at_offset:
1299 /* handled later via load_vary */
1300 return NULL;
1301 case nir_intrinsic_load_interpolated_input:
1302 assert(stage == MESA_SHADER_FRAGMENT);
1303 agx_emit_load_vary(b, dst, instr);
1304 return NULL;
1305
1306 case nir_intrinsic_load_coefficients_agx:
1307 assert(stage == MESA_SHADER_FRAGMENT);
1308 agx_ldcf_to(b, dst, cf_for_intrinsic(b, instr), 1);
1309 agx_emit_cached_split(b, dst, 3);
1310 return NULL;
1311
1312 case nir_intrinsic_load_agx:
1313 case nir_intrinsic_load_constant_agx:
1314 agx_emit_load(b, dst, instr);
1315 return NULL;
1316
1317 case nir_intrinsic_store_uvs_agx:
1318 assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL);
1319 return agx_st_vary(b, agx_src_index(&instr->src[1]),
1320 agx_src_index(&instr->src[0]));
1321
1322 case nir_intrinsic_store_agx:
1323 agx_emit_store(b, instr);
1324 return NULL;
1325
1326 case nir_intrinsic_store_shared:
1327 agx_emit_local_store(b, instr);
1328 return NULL;
1329
1330 case nir_intrinsic_load_shared:
1331 agx_emit_local_load(b, dst, instr);
1332 return NULL;
1333
1334 case nir_intrinsic_global_atomic_agx:
1335 case nir_intrinsic_global_atomic_swap_agx:
1336 agx_emit_atomic(b, dst, instr, false);
1337 return NULL;
1338
1339 case nir_intrinsic_shared_atomic:
1340 case nir_intrinsic_shared_atomic_swap:
1341 agx_emit_atomic(b, dst, instr, true);
1342 return NULL;
1343
1344 case nir_intrinsic_store_zs_agx:
1345 assert(stage == MESA_SHADER_FRAGMENT);
1346 return agx_emit_store_zs(b, instr);
1347
1348 case nir_intrinsic_store_local_pixel_agx:
1349 return agx_emit_local_store_pixel(b, instr);
1350
1351 case nir_intrinsic_load_local_pixel_agx:
1352 assert(stage == MESA_SHADER_FRAGMENT);
1353 agx_emit_local_load_pixel(b, dst, instr);
1354 return NULL;
1355
1356 case nir_intrinsic_load_pixel_coord:
1357 return agx_emit_collect_to(
1358 b, dst, 2,
1359 (agx_index[2]){
1360 agx_get_sr(b, 16, AGX_SR_THREAD_POSITION_IN_GRID_X),
1361 agx_get_sr(b, 16, AGX_SR_THREAD_POSITION_IN_GRID_Y),
1362 });
1363
1364 case nir_intrinsic_load_frag_coord_zw: {
1365 agx_index cf = agx_get_cf(b->shader, VARYING_SLOT_POS,
1366 nir_intrinsic_component(instr));
1367
1368 return agx_iter_to(b, dst, cf, agx_zero(), 1, AGX_INTERPOLATION_CENTER);
1369 }
1370
1371 case nir_intrinsic_sample_mask_agx: {
1372 assert(stage == MESA_SHADER_FRAGMENT);
1373 b->shader->out->writes_sample_mask = true;
1374
1375 /* We need to wait_pix before running Z/S tests, but we don't need to
1376 * wait_pix before merely discarding. Omit the wait_pix when the affected
1377 * samples are unconditionally killed.
1378 */
1379 bool no_tests =
1380 nir_src_is_const(instr->src[1]) && nir_src_as_uint(instr->src[1]) == 0;
1381
1382 if (!no_tests)
1383 agx_wait_pixel_mask(b, 0x1);
1384
1385 return agx_sample_mask(b, agx_src_index(&instr->src[0]),
1386 agx_src_index(&instr->src[1]));
1387 }
1388
1389 case nir_intrinsic_load_back_face_agx:
1390 return agx_get_sr_to(b, dst, AGX_SR_BACKFACING);
1391
1392 case nir_intrinsic_load_samples_log2_agx:
1393 return agx_get_sr_to(b, dst, AGX_SR_SAMPLES_LOG2);
1394
1395 case nir_intrinsic_load_sample_mask_in:
1396 return agx_get_sr_to(b, dst, AGX_SR_INPUT_SAMPLE_MASK);
1397
1398 case nir_intrinsic_load_sample_mask:
1399 return agx_get_sr_coverage_to(b, dst, AGX_SR_COVERAGE_MASK);
1400
1401 case nir_intrinsic_load_helper_invocation:
1402 /* Compare special register to zero. We could lower this in NIR (letting
1403 * us fold in an inot) but meh?
1404 */
1405 return agx_icmp_to(b, dst,
1406 agx_get_sr_coverage(b, 32, AGX_SR_IS_ACTIVE_THREAD),
1407 agx_zero(), AGX_ICOND_UEQ, false);
1408
1409 case nir_intrinsic_load_vertex_id:
1410 /* We don't assert the HW stage since we use this same ABI with SW VS */
1411 return agx_mov_to(b, dst, agx_abs(agx_vertex_id(b)));
1412
1413 case nir_intrinsic_load_primitive_id:
1414 assert(stage == MESA_SHADER_TESS_EVAL);
1415 return agx_mov_to(b, dst, agx_abs(agx_vertex_id(b)));
1416
1417 case nir_intrinsic_load_instance_id:
1418 return agx_mov_to(b, dst, agx_abs(agx_instance_id(b)));
1419
1420 case nir_intrinsic_load_tess_coord_xy: {
1421 assert(stage == MESA_SHADER_TESS_EVAL);
1422
1423 agx_index coords[] = {agx_tess_coord_x(b), agx_tess_coord_y(b)};
1424 return agx_emit_collect_to(b, dst, 2, coords);
1425 }
1426
1427 case nir_intrinsic_load_preamble:
1428 return agx_emit_load_preamble(b, dst, instr);
1429
1430 case nir_intrinsic_store_preamble:
1431 return agx_emit_store_preamble(b, instr);
1432
1433 case nir_intrinsic_image_load:
1434 case nir_intrinsic_bindless_image_load:
1435 return agx_emit_image_load(b, dst, instr);
1436
1437 case nir_intrinsic_image_store:
1438 case nir_intrinsic_bindless_image_store:
1439 return agx_emit_image_store(b, instr);
1440
1441 case nir_intrinsic_image_store_block_agx:
1442 case nir_intrinsic_bindless_image_store_block_agx:
1443 return agx_emit_block_image_store(b, instr);
1444
1445 case nir_intrinsic_load_workgroup_id:
1446 return agx_load_compute_dimension(b, dst, instr,
1447 AGX_SR_THREADGROUP_POSITION_IN_GRID_X);
1448
1449 case nir_intrinsic_load_workgroup_size:
1450 return agx_load_compute_dimension(b, dst, instr,
1451 AGX_SR_THREADS_PER_THREADGROUP_X);
1452
1453 case nir_intrinsic_load_global_invocation_id:
1454 return agx_load_compute_dimension(b, dst, instr,
1455 AGX_SR_THREAD_POSITION_IN_GRID_X);
1456
1457 case nir_intrinsic_load_local_invocation_id:
1458 return agx_load_compute_dimension(
1459 b, dst, instr, AGX_SR_THREAD_POSITION_IN_THREADGROUP_X);
1460
1461 case nir_intrinsic_load_local_invocation_index:
1462 return agx_get_sr_to(b, dst, AGX_SR_THREAD_INDEX_IN_THREADGROUP);
1463
1464 case nir_intrinsic_load_layer_id:
1465 return agx_get_sr_to(b, dst, AGX_SR_THREADGROUP_POSITION_IN_GRID_Z);
1466
1467 case nir_intrinsic_barrier: {
1468 assert(!b->shader->is_preamble && "invalid");
1469
1470 bool needs_image_barriers = false;
1471
1472 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE) {
1473 nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
1474
1475 if (modes & (nir_var_mem_global | nir_var_image)) {
1476 agx_memory_barrier(b);
1477
1478 /* Pull out all the big hammers to make cross-workgroup memory
1479 * barriers work.
1480 */
1481 if (nir_intrinsic_memory_scope(instr) >= SCOPE_QUEUE_FAMILY) {
1482 agx_unknown_barrier_1(b);
1483 agx_memory_barrier_2(b);
1484
1485 /* These are observed on G13D. At some point we should figure out
1486 * what the individual opcodes do.
1487 */
1488 agx_device_barrier_2(b);
1489 agx_unknown_barrier_2(b);
1490 agx_memory_barrier_3(b);
1491 agx_device_barrier_1(b);
1492 }
1493 }
1494
1495 if (modes & nir_var_image) {
1496 agx_image_barrier_1(b);
1497 agx_image_barrier_2(b);
1498 needs_image_barriers = true;
1499 }
1500 }
1501
1502 /* Nothing to do for subgroup barriers */
1503 if (nir_intrinsic_execution_scope(instr) >= SCOPE_WORKGROUP) {
1504 assert(gl_shader_stage_is_compute(b->shader->nir->info.stage));
1505
1506 agx_threadgroup_barrier(b);
1507 }
1508
1509 if (needs_image_barriers) {
1510 agx_image_barrier_3(b);
1511 agx_image_barrier_4(b);
1512 }
1513
1514 return NULL;
1515 }
1516
1517 case nir_intrinsic_fence_pbe_to_tex_agx: {
1518 agx_image_barrier_1(b);
1519 agx_image_barrier_2(b);
1520 agx_image_barrier_3(b);
1521 agx_image_barrier_4(b);
1522 return NULL;
1523 }
1524
1525 case nir_intrinsic_fence_mem_to_tex_agx: {
1526 /* Flush out the atomic to main memory... Found experimentally... */
1527 agx_memory_barrier(b);
1528 agx_memory_barrier_2(b);
1529
1530 /* TODO: Which ones do we actually need? */
1531 agx_image_barrier_1(b);
1532 agx_image_barrier_2(b);
1533 agx_image_barrier_3(b);
1534 agx_image_barrier_4(b);
1535
1536 /* Flush out the texture cache */
1537 agx_flush_memory_to_texture(b);
1538 return NULL;
1539 }
1540
1541 case nir_intrinsic_fence_pbe_to_tex_pixel_agx: {
1542 agx_image_barrier_1(b);
1543 agx_image_barrier_2(b);
1544 agx_flush_memory_to_texture(b);
1545 agx_image_barrier_3(b);
1546 return NULL;
1547 }
1548
1549 case nir_intrinsic_fence_helper_exit_agx: {
1550 assert(b->shader->key->is_helper);
1551 agx_memory_barrier(b);
1552 agx_unknown_barrier_1(b);
1553 agx_memory_barrier_2(b);
1554 agx_unknown_barrier_2(b);
1555 agx_memory_barrier_3(b);
1556 return NULL;
1557 }
1558
1559 case nir_intrinsic_begin_invocation_interlock: {
1560 agx_wait_pixel_mask(b, 0xC);
1561 return NULL;
1562 }
1563
1564 case nir_intrinsic_ddx:
1565 case nir_intrinsic_ddx_coarse:
1566 case nir_intrinsic_ddx_fine:
1567 return agx_dfdx_to(b, dst, agx_src_index(&instr->src[0]));
1568
1569 case nir_intrinsic_ddy:
1570 case nir_intrinsic_ddy_coarse:
1571 case nir_intrinsic_ddy_fine:
1572 return agx_dfdy_to(b, dst, agx_src_index(&instr->src[0]));
1573
1574 case nir_intrinsic_load_subgroup_invocation:
1575 return agx_get_sr_to(b, dst, AGX_SR_THREAD_INDEX_IN_SUBGROUP);
1576
1577 case nir_intrinsic_load_subgroup_id:
1578 return agx_get_sr_to(b, dst, AGX_SR_SUBGROUP_INDEX_IN_THREADGROUP);
1579
1580 case nir_intrinsic_load_active_subgroup_invocation_agx:
1581 return agx_get_sr_coverage_to(b, dst,
1582 AGX_SR_ACTIVE_THREAD_INDEX_IN_SUBGROUP);
1583
1584 case nir_intrinsic_load_active_subgroup_count_agx:
1585 return agx_get_sr_coverage_to(b, dst,
1586 AGX_SR_TOTAL_ACTIVE_THREADS_IN_SUBGROUP);
1587
1588 case nir_intrinsic_reduce: {
1589 assert((instr->def.bit_size == 1 || instr->def.bit_size == 16 ||
1590 instr->def.bit_size == 32) &&
1591 "should've been lowered");
1592
1593 unsigned cluster_size = nir_intrinsic_cluster_size(instr);
1594 assert(cluster_size == 0 || cluster_size == 4 || cluster_size >= 32);
1595
1596 enum agx_simd_op op =
1597 translate_simd_op(nir_intrinsic_reduction_op(instr));
1598
1599 agx_index src0 = agx_src_index(&instr->src[0]);
1600
1601 if (cluster_size == 4)
1602 return agx_quad_reduce_to(b, dst, src0, op);
1603 else
1604 return agx_simd_reduce_to(b, dst, src0, op);
1605 }
1606
1607 case nir_intrinsic_exclusive_scan: {
1608 assert((instr->def.bit_size == 1 || instr->def.bit_size == 16 ||
1609 instr->def.bit_size == 32) &&
1610 "should've been lowered");
1611
1612 return agx_simd_prefix_to(
1613 b, dst, agx_src_index(&instr->src[0]),
1614 translate_simd_op(nir_intrinsic_reduction_op(instr)));
1615 }
1616
1617 case nir_intrinsic_read_invocation: {
1618 /* TODO: Check if we're actually inside divergent control flow */
1619 b->shader->any_quad_divergent_shuffle |= b->shader->any_cf;
1620
1621 /* Lane ID guaranteed to be uniform */
1622 return agx_shuffle_to(b, dst, agx_src_index(&instr->src[0]),
1623 agx_src_index(&instr->src[1]));
1624 }
1625
1626 case nir_intrinsic_quad_broadcast: {
1627 /* TODO: Check if we're actually inside divergent control flow */
1628 b->shader->any_quad_divergent_shuffle |= b->shader->any_cf;
1629
1630 /* Lane ID guaranteed to be uniform */
1631 return agx_quad_shuffle_to(b, dst, agx_src_index(&instr->src[0]),
1632 agx_src_index(&instr->src[1]));
1633 }
1634
1635 case nir_intrinsic_quad_swap_horizontal: {
1636 return agx_quad_shuffle_xor_to(b, dst, agx_src_index(&instr->src[0]),
1637 agx_immediate(1));
1638 }
1639
1640 case nir_intrinsic_quad_swap_vertical: {
1641 return agx_quad_shuffle_xor_to(b, dst, agx_src_index(&instr->src[0]),
1642 agx_immediate(2));
1643 }
1644
1645 case nir_intrinsic_quad_swap_diagonal: {
1646 return agx_quad_shuffle_xor_to(b, dst, agx_src_index(&instr->src[0]),
1647 agx_immediate(3));
1648 }
1649
1650 case nir_intrinsic_ballot: {
1651 return agx_ballot_to(b, dst, agx_src_index(&instr->src[0]));
1652 }
1653
1654 case nir_intrinsic_quad_ballot_agx: {
1655 return agx_quad_ballot_to(b, dst, agx_src_index(&instr->src[0]));
1656 }
1657
1658 case nir_intrinsic_doorbell_agx: {
1659 return agx_doorbell(b, nir_src_as_uint(instr->src[0]));
1660 }
1661
1662 case nir_intrinsic_stack_map_agx: {
1663 return agx_stack_map(b, agx_src_index(&instr->src[1]),
1664 nir_src_as_uint(instr->src[0]));
1665 }
1666
1667 case nir_intrinsic_stack_unmap_agx: {
1668 return agx_stack_unmap_to(b, dst, nir_src_as_uint(instr->src[0]));
1669 }
1670
1671 case nir_intrinsic_load_scratch:
1672 agx_emit_load_scratch(b, dst, instr);
1673 return NULL;
1674
1675 case nir_intrinsic_store_scratch:
1676 agx_emit_store_scratch(b, instr);
1677 return NULL;
1678
1679 case nir_intrinsic_load_core_id_agx:
1680 return agx_get_sr_to(b, dst, AGX_SR_CORE_ID);
1681
1682 case nir_intrinsic_load_helper_op_id_agx:
1683 assert(b->shader->key->is_helper);
1684 return agx_get_sr_barrier_to(b, dst, AGX_SR_HELPER_OP);
1685
1686 case nir_intrinsic_load_helper_arg_lo_agx:
1687 assert(b->shader->key->is_helper);
1688 return agx_get_sr_barrier_to(b, dst, AGX_SR_HELPER_ARG_L);
1689
1690 case nir_intrinsic_load_helper_arg_hi_agx:
1691 assert(b->shader->key->is_helper);
1692 return agx_get_sr_barrier_to(b, dst, AGX_SR_HELPER_ARG_H);
1693
1694 case nir_intrinsic_load_exported_agx:
1695 return agx_load_exported_to(b, dst, nir_intrinsic_base(instr),
1696 instr->def.num_components);
1697
1698 case nir_intrinsic_export_agx:
1699 return agx_emit_export(b, nir_intrinsic_base(instr), instr->src[0]);
1700
1701 case nir_intrinsic_load_barycentric_sample:
1702 case nir_intrinsic_load_sample_id:
1703 case nir_intrinsic_load_sample_pos:
1704 unreachable("Sample shading should have been lowered");
1705
1706 default:
1707 fprintf(stderr, "Unhandled intrinsic %s\n",
1708 nir_intrinsic_infos[instr->intrinsic].name);
1709 unreachable("Unhandled intrinsic");
1710 }
1711 }
1712
1713 static agx_index
1714 agx_alu_src_index(agx_builder *b, nir_alu_src src)
1715 {
1716 /* Check well-formedness of the input NIR */
1717 ASSERTED unsigned bitsize = nir_src_bit_size(src.src);
1718 unsigned comps = nir_src_num_components(src.src);
1719 unsigned channel = src.swizzle[0];
1720
1721 assert(bitsize == 1 || bitsize == 8 || bitsize == 16 || bitsize == 32 ||
1722 bitsize == 64);
1723 assert(channel < comps);
1724
1725 return agx_extract_nir_src(b, src.src, channel);
1726 }
1727
1728 /*
1729 * Emit an instruction translating (s0 * s1) + (s2 << s3). Assuming s3 is
1730 * constant, this is an imad instruction. If s1 == 1, then this is optimized to
1731 * an iadd instruction, which is faster.
1732 */
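/*
 * As a rough illustration of the two shapes this can take:
 *
 *    imadshl_agx(a, b, c, 2)  ->  imad dst, a, b, c, lsl 2
 *    imadshl_agx(a, 1, c, 3)  ->  iadd dst, a, c, lsl 3
 *
 * The mnemonics are approximate; the constant shift is restricted to [0, 4]
 * by the NIR producer, matching the assert below.
 */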
1733 static agx_instr *
1734 agx_emit_imadshl_agx(agx_builder *b, nir_alu_instr *alu, agx_index dst,
1735 agx_index s0, agx_index s1, agx_index s2, agx_index s3)
1736 {
1737 /* If the shift is not constant, use a variable shift. This should never
1738 * happen in practice but we don't want to constrain the NIR.
1739 */
1740 unsigned shift;
1741 if (!nir_src_is_const(alu->src[3].src)) {
1742 s2 = agx_bfi(b, agx_immediate(0), s2, s3, 0);
1743 shift = 0;
1744 } else {
1745 shift = nir_alu_src_as_uint(alu->src[3]);
1746 }
1747
1748 assert(shift <= 4 && "domain restriction on the input NIR");
1749
1750 /* Emit iadd if possible, else imad */
1751 if (nir_src_is_const(alu->src[1].src) &&
1752 nir_alu_src_as_uint(alu->src[1]) == 1) {
1753
1754 return agx_iadd_to(b, dst, s0, s2, shift);
1755 } else {
1756 return agx_imad_to(b, dst, s0, s1, s2, shift);
1757 }
1758 }
1759
1760 static bool
1761 is_conversion_to_8bit(nir_op op)
1762 {
1763 switch (op) {
1764 case nir_op_i2i8:
1765 case nir_op_u2u8:
1766 case nir_op_f2i8:
1767 case nir_op_f2u8:
1768 case nir_op_b2i8:
1769 return true;
1770 default:
1771 return false;
1772 }
1773 }
1774
1775 static agx_instr *
1776 agx_fminmax_to(agx_builder *b, agx_index dst, agx_index s0, agx_index s1,
1777 nir_alu_instr *alu)
1778 {
1779 /* The hardware gtn/ltn modes are unfortunately incorrect for signed zeros */
1780 assert(!nir_alu_instr_is_signed_zero_preserve(alu) &&
1781 "should've been lowered");
1782
1783 assert((alu->def.bit_size == 16) ==
1784 (alu->op == nir_op_fmin || alu->op == nir_op_fmax) &&
1785 "fp32 should be lowered");
1786
1787 bool fmax = alu->op == nir_op_fmax || alu->op == nir_op_fmax_agx;
1788 enum agx_fcond fcond = fmax ? AGX_FCOND_GTN : AGX_FCOND_LTN;
1789
1790 /* Calculate min/max with the appropriate hardware instruction. This will not
1791 * handle denorms, but we were already lowered for that.
1792 */
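   /* Concretely, the select below computes "s0 > s1 ? s0 : s1" for fmax and
    * "s0 < s1 ? s0 : s1" for fmin.
    */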
1793 return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, fcond);
1794 }
1795
1796 static agx_instr *
1797 agx_emit_alu(agx_builder *b, nir_alu_instr *instr)
1798 {
1799 unsigned srcs = nir_op_infos[instr->op].num_inputs;
1800 unsigned sz = instr->def.bit_size;
1801 unsigned src_sz = srcs ? nir_src_bit_size(instr->src[0].src) : 0;
1802 ASSERTED unsigned comps = instr->def.num_components;
1803
1804 assert(comps == 1 || nir_op_is_vec_or_mov(instr->op));
1805 assert(sz == 1 ||
1806 ((nir_op_is_vec_or_mov(instr->op) ||
1807 is_conversion_to_8bit(instr->op) || instr->op == nir_op_bcsel) &&
1808 sz == 8) ||
1809 sz == 16 || sz == 32 || sz == 64);
1810
1811 agx_index dst = agx_def_index(&instr->def);
1812 agx_index s0 = srcs > 0 ? agx_alu_src_index(b, instr->src[0]) : agx_null();
1813 agx_index s1 = srcs > 1 ? agx_alu_src_index(b, instr->src[1]) : agx_null();
1814 agx_index s2 = srcs > 2 ? agx_alu_src_index(b, instr->src[2]) : agx_null();
1815 agx_index s3 = srcs > 3 ? agx_alu_src_index(b, instr->src[3]) : agx_null();
1816
1817 agx_index i0 = agx_immediate(0);
1818 agx_index i1 = agx_immediate(1);
1819
1820 #define UNOP(nop, aop) \
1821 case nir_op_##nop: \
1822 return agx_##aop##_to(b, dst, s0);
1823 #define BINOP(nop, aop) \
1824 case nir_op_##nop: \
1825 return agx_##aop##_to(b, dst, s0, s1);
1826
1827 switch (instr->op) {
1828 UNOP(f2f16, fmov);
1829 UNOP(f2f16_rtne, fmov);
1830 UNOP(f2f32, fmov);
1831 UNOP(fround_even, roundeven);
1832 UNOP(ftrunc, trunc);
1833 UNOP(ffloor, floor);
1834 UNOP(fceil, ceil);
1835 UNOP(frcp, rcp);
1836 UNOP(frsq, rsqrt);
1837 UNOP(flog2, log2);
1838 UNOP(fexp2, exp2);
1839
1840 UNOP(mov, mov);
1841 UNOP(u2u32, mov);
1842 UNOP(bitfield_reverse, bitrev);
1843 UNOP(bit_count, popcount);
1844 UNOP(ufind_msb, ffs);
1845 BINOP(iand, and);
1846 BINOP(ior, or);
1847 BINOP(ixor, xor);
1848 BINOP(interleave_agx, intl);
1849
1850 case nir_op_fadd:
1851 if (instr->def.bit_size == 16)
1852 return agx_hadd_to(b, dst, s0, s1);
1853 else
1854 return agx_fadd_to(b, dst, s0, s1);
1855
1856 case nir_op_fmul:
1857 if (instr->def.bit_size == 16)
1858 return agx_hmul_to(b, dst, s0, s1);
1859 else
1860 return agx_fmul_to(b, dst, s0, s1);
1861
1862 case nir_op_ffma:
1863 if (instr->def.bit_size == 16)
1864 return agx_hfma_to(b, dst, s0, s1, s2);
1865 else
1866 return agx_ffma_to(b, dst, s0, s1, s2);
1867
1868 case nir_op_fsat: {
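      /* Adding -0.0 is an exact identity (including for +/-0.0), so this is
       * effectively a saturating move.
       */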
1869 agx_instr *I = agx_fadd_to(b, dst, s0, agx_negzero());
1870 if (instr->def.bit_size == 16)
1871 I->op = AGX_OPCODE_HADD;
1872
1873 I->saturate = true;
1874 return I;
1875 }
1876
1877 case nir_op_feq:
1878 return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_EQ, false);
1879 case nir_op_flt:
1880 return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_LT, false);
1881 case nir_op_fge:
1882 return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_GE, false);
1883 case nir_op_fneu:
1884 return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_EQ, true);
1885
1886 case nir_op_ieq:
1887 return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_UEQ, false);
1888 case nir_op_ine:
1889 return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_UEQ, true);
1890 case nir_op_ilt:
1891 return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_SLT, false);
1892 case nir_op_ige:
1893 return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_SLT, true);
1894 case nir_op_ult:
1895 return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_ULT, false);
1896 case nir_op_uge:
1897 return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_ULT, true);
1898
1899 case nir_op_inot:
1900 if (sz == 1)
1901 return agx_xor_to(b, dst, s0, i1);
1902 else
1903 return agx_not_to(b, dst, s0);
1904
1905 case nir_op_b2b1:
1906 return agx_icmp_to(b, dst, s0, i0, AGX_ICOND_UEQ, true);
1907
1908 case nir_op_fsqrt:
1909 return agx_fmul_to(b, dst, s0, agx_srsqrt(b, s0));
1910 case nir_op_fabs:
1911 return agx_fmov_to(b, dst, agx_abs(s0));
1912 case nir_op_fneg:
1913 return agx_fmov_to(b, dst, agx_neg(s0));
1914
1915 case nir_op_fmin:
1916 case nir_op_fmax:
1917 case nir_op_fmin_agx:
1918 case nir_op_fmax_agx:
1919 return agx_fminmax_to(b, dst, s0, s1, instr);
1920
1921 case nir_op_imin:
1922 return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SLT);
1923 case nir_op_imax:
1924 return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SGT);
1925 case nir_op_umin:
1926 return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_ULT);
1927 case nir_op_umax:
1928 return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_UGT);
1929 case nir_op_bounds_agx:
1930 /* end offset > bound ? 0 : data */
1931 return agx_icmpsel_to(b, dst, s1, s2, agx_zero(), s0, AGX_ICOND_UGT);
1932
1933 case nir_op_iadd:
1934 return agx_iadd_to(b, dst, s0, s1, 0);
1935 case nir_op_imadshl_agx:
1936 return agx_emit_imadshl_agx(b, instr, dst, s0, s1, s2, s3);
1937 case nir_op_imsubshl_agx:
1938 return agx_emit_imadshl_agx(b, instr, dst, s0, s1, agx_neg(s2), s3);
1939 case nir_op_isub:
1940 return agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
1941 case nir_op_ineg:
1942 return agx_iadd_to(b, dst, i0, agx_neg(s0), 0);
1943 case nir_op_imul:
1944 return agx_imad_to(b, dst, s0, s1, i0, 0);
1945 case nir_op_umul_2x32_64:
1946 return agx_imad_to(b, dst, agx_abs(s0), agx_abs(s1), i0, 0);
1947 case nir_op_imul_2x32_64:
1948 return agx_imad_to(b, dst, s0, s1, i0, 0);
1949 case nir_op_umul_high:
1950 return agx_mul_high_to(b, dst, s0, s1, false);
1951 case nir_op_imul_high:
1952 return agx_mul_high_to(b, dst, s0, s1, true);
1953
1954 case nir_op_ishl:
1955 return agx_bfi_to(b, dst, i0, s0, s1, 0);
1956 case nir_op_ushr:
1957 return agx_ushr_to(b, dst, s0, s1);
1958 case nir_op_ishr:
1959 return agx_asr_to(b, dst, s0, s1);
1960
1961 case nir_op_extr_agx:
1962 return agx_extr_to(b, dst, s0, s1, s2,
1963 nir_alu_src_as_uint(instr->src[3]));
1964
1965 case nir_op_ubitfield_extract: {
1966 unsigned m = nir_alu_src_as_uint(instr->src[2]);
1967 assert(m != 0 && "should've been optimized");
1968
1969 /* Disable masking if the whole thing is used */
1970 if (m >= 32)
1971 m = 0;
1972
1973 return agx_bfeil_to(b, dst, i0, s0, s1, m);
1974 }
1975
1976 case nir_op_bcsel:
1977 return agx_icmpsel_to(b, dst, s0, i0, s2, s1, AGX_ICOND_UEQ);
1978
1979 case nir_op_i2i32: {
1980 if (src_sz == 8) {
1981 /* Sign extend in software, NIR likes 8-bit conversions */
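         /* Rough example: with s0 = 0xAB (-85 as int8), the bfi produces
          * 0xAB00 in the 16-bit intermediate (hence the name ishl16) and the
          * arithmetic shift right by 8 gives back -85, now sign-extended to
          * the destination width.
          */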
1982 agx_index ishl16 = agx_bfi(b, i0, s0, agx_immediate(8), 0);
1983 return agx_asr_to(b, dst, ishl16, agx_immediate(8));
1984 } else {
1985 assert(s0.size == AGX_SIZE_16 && "other conversions lowered");
1986 return agx_signext_to(b, dst, s0);
1987 }
1988 }
1989
1990 case nir_op_i2i16: {
1991 if (src_sz == 8) {
1992 /* Sign extend in software, NIR likes 8-bit conversions */
1993 agx_index ishl16 = agx_bfi(b, i0, s0, agx_immediate(8), 0);
1994 return agx_asr_to(b, dst, ishl16, agx_immediate(8));
1995 } else {
1996 assert(s0.size == AGX_SIZE_32 && "other conversions lowered");
1997 return agx_subdivide_to(b, dst, s0, 0);
1998 }
1999 }
2000
2001 case nir_op_u2u16: {
2002 if (s0.size == AGX_SIZE_32)
2003 return agx_subdivide_to(b, dst, s0, 0);
2004 else
2005 return agx_mov_to(b, dst, s0);
2006 }
2007
2008 /* It will be put into a 16-bit register, but zero out the garbage. We could
2009 * optimize this in the future but it ensures correctness for u2u16(u2u8(x))
2010 * sequences.
2011 */
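   /* Illustrative: with x = 0x1234 held in a 16-bit register, the AND leaves
    * 0x0034, so a later u2u16 of the 8-bit value observes 0x34 rather than
    * stale upper bits.
    */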
2012 case nir_op_u2u8:
2013 case nir_op_i2i8:
2014 return agx_and_to(b, dst, s0, agx_immediate(0xFF));
2015
2016 case nir_op_iadd_sat: {
2017 agx_instr *I = agx_iadd_to(b, dst, s0, s1, 0);
2018 I->saturate = true;
2019 return I;
2020 }
2021
2022 case nir_op_isub_sat: {
2023 agx_instr *I = agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
2024 I->saturate = true;
2025 return I;
2026 }
2027
2028 case nir_op_uadd_sat: {
2029 agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_abs(s1), 0);
2030 I->saturate = true;
2031 return I;
2032 }
2033
2034 case nir_op_usub_sat: {
2035 agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_neg(agx_abs(s1)), 0);
2036 I->saturate = true;
2037 return I;
2038 }
2039
2040 case nir_op_fsin_agx: {
2041 agx_index fixup = agx_sin_pt_1(b, s0);
2042 agx_index sinc = agx_sin_pt_2(b, fixup);
2043 return agx_fmul_to(b, dst, sinc, fixup);
2044 }
2045
2046 case nir_op_f2i16:
2047 return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_S16), s0,
2048 AGX_ROUND_RTZ);
2049
2050 case nir_op_f2i32:
2051 return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_S32), s0,
2052 AGX_ROUND_RTZ);
2053
2054 case nir_op_f2u16:
2055 return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_U16), s0,
2056 AGX_ROUND_RTZ);
2057
2058 case nir_op_f2u32:
2059 return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_U32), s0,
2060 AGX_ROUND_RTZ);
2061
2062 case nir_op_u2f16:
2063 case nir_op_u2f32: {
2064 if (src_sz == 64)
2065 unreachable("64-bit conversions unimplemented");
2066
2067 enum agx_convert mode = (src_sz == 32) ? AGX_CONVERT_U32_TO_F
2068 : (src_sz == 16) ? AGX_CONVERT_U16_TO_F
2069 : AGX_CONVERT_U8_TO_F;
2070
2071 return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
2072 }
2073
2074 case nir_op_i2f16:
2075 case nir_op_i2f32: {
2076 if (src_sz == 64)
2077 unreachable("64-bit conversions unimplemented");
2078
2079 enum agx_convert mode = (src_sz == 32) ? AGX_CONVERT_S32_TO_F
2080 : (src_sz == 16) ? AGX_CONVERT_S16_TO_F
2081 : AGX_CONVERT_S8_TO_F;
2082
2083 return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
2084 }
2085
2086 case nir_op_pack_32_2x16_split:
2087 case nir_op_pack_64_2x32_split: {
2088 agx_index idx[] = {s0, s1};
2089 return agx_emit_collect_to(b, dst, 2, idx);
2090 }
2091
2092 case nir_op_unpack_64_2x32_split_x:
2093 case nir_op_unpack_32_2x16_split_x:
2094 return agx_subdivide_to(b, dst, s0, 0);
2095
2096 case nir_op_unpack_64_2x32_split_y:
2097 case nir_op_unpack_32_2x16_split_y:
2098 return agx_subdivide_to(b, dst, s0, 1);
2099
2100 case nir_op_vec2:
2101 case nir_op_vec3:
2102 case nir_op_vec4: {
2103 agx_index idx[] = {s0, s1, s2, s3};
2104 return agx_emit_collect_to(b, dst, srcs, idx);
2105 }
2106
2107 case nir_op_vec8:
2108 case nir_op_vec16:
2109 unreachable("should've been lowered");
2110
2111 default:
2112 fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
2113 unreachable("Unhandled ALU instruction");
2114 }
2115 }
2116
2117 static enum agx_lod_mode
2118 agx_lod_mode_for_nir(nir_texop op, bool biased, bool min_lod, bool lod_is_zero)
2119 {
2120 switch (op) {
2121 case nir_texop_tex:
2122 case nir_texop_tg4:
2123 /* We could support this for tex, but it's never actually seen because tex
2124 * is always turned into txb to implement sampler LOD bias in Vulkan.
2125 */
2126 assert(!min_lod && "unimplemented");
2127
2128 return AGX_LOD_MODE_AUTO_LOD;
2129 case nir_texop_txb:
2130 return min_lod ? AGX_LOD_MODE_AUTO_LOD_BIAS_MIN
2131 : AGX_LOD_MODE_AUTO_LOD_BIAS;
2132 case nir_texop_lod:
2133 assert(!min_lod);
2134 return biased ? AGX_LOD_MODE_AUTO_LOD_BIAS : AGX_LOD_MODE_AUTO_LOD;
2135 case nir_texop_txd:
2136 return min_lod ? AGX_LOD_MODE_LOD_GRAD_MIN : AGX_LOD_MODE_LOD_GRAD;
2137 case nir_texop_txl:
2138 assert(!min_lod);
2139 return AGX_LOD_MODE_LOD_MIN;
2140 case nir_texop_txf:
2141 assert(!min_lod);
2142 return lod_is_zero ? AGX_LOD_MODE_AUTO_LOD : AGX_LOD_MODE_LOD_MIN;
2143 case nir_texop_txf_ms:
2144 assert(!min_lod);
2145 assert(lod_is_zero && "no mipmapping");
2146 return AGX_LOD_MODE_AUTO_LOD;
2147 default:
2148 unreachable("Unhandled texture op");
2149 }
2150 }
2151
2152 static enum agx_gather
2153 agx_gather_for_nir(nir_tex_instr *tex)
2154 {
2155 if (tex->op == nir_texop_tg4) {
2156 enum agx_gather components[] = {
2157 AGX_GATHER_R,
2158 AGX_GATHER_G,
2159 AGX_GATHER_B,
2160 AGX_GATHER_A,
2161 };
2162
2163 assert(tex->component < ARRAY_SIZE(components));
2164 return components[tex->component];
2165 } else {
2166 return AGX_GATHER_NONE;
2167 }
2168 }
2169
2170 static void
2171 agx_emit_tex(agx_builder *b, nir_tex_instr *instr)
2172 {
2173 agx_index coords = agx_null(), bindless = agx_immediate(0),
2174 texture = agx_immediate(instr->texture_index),
2175 sampler = agx_immediate(0), lod = agx_immediate(0),
2176 compare = agx_null(), packed_offset = agx_null(),
2177 min_lod = agx_null();
2178
2179 bool lod_is_zero = true;
2180
2181 for (unsigned i = 0; i < instr->num_srcs; ++i) {
2182 agx_index index = agx_src_index(&instr->src[i].src);
2183
2184 switch (instr->src[i].src_type) {
2185 case nir_tex_src_backend1:
2186 coords = index;
2187 break;
2188
2189 case nir_tex_src_backend2:
2190 packed_offset = index;
2191 break;
2192
2193 case nir_tex_src_lod:
2194 case nir_tex_src_bias:
2195 lod = index;
2196 lod_is_zero = nir_src_is_const(instr->src[i].src) &&
2197 nir_src_as_uint(instr->src[i].src) == 0;
2198 break;
2199
2200 case nir_tex_src_min_lod:
2201 assert(index.size == AGX_SIZE_16);
2202 min_lod = index;
2203 break;
2204
2205 case nir_tex_src_comparator:
2206 assert(index.size == AGX_SIZE_32);
2207 compare = index;
2208 break;
2209
2210 case nir_tex_src_texture_offset:
2211 texture = index;
2212 break;
2213 case nir_tex_src_sampler_handle:
2214 sampler = index;
2215 break;
2216
2217 case nir_tex_src_texture_handle:
2218 texture =
2219 agx_translate_bindless_handle(b, &instr->src[i].src, &bindless);
2220 break;
2221
2222 case nir_tex_src_ddx: {
2223 int y_idx = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
2224 assert(y_idx >= 0 && "we only handle gradients");
2225
2226 int min_idx = nir_tex_instr_src_index(instr, nir_tex_src_min_lod);
2227 bool has_min = min_idx >= 0;
2228 agx_index min;
2229
2230 unsigned n = nir_tex_instr_src_size(instr, y_idx);
2231 assert((n == 2 || n == 3) && "other sizes not supported");
2232
2233 agx_index index2 = agx_src_index(&instr->src[y_idx].src);
2234
2235 if (has_min) {
2236 min = agx_src_index(&instr->src[min_idx].src);
2237
2238 /* Undef extend to 32-bit since our IR is iffy */
2239 min = agx_pad_to_32(b, min);
2240 }
2241
2242 /* We explicitly don't record this vector in the split cache */
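         /* The collect interleaves the gradients per coordinate, appending
          * the optional min LOD, i.e.:
          *
          *    [ddx.x, ddy.x, ddx.y, ddy.y, (ddx.z, ddy.z,) min_lod?]
          */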
2243 unsigned chans = (2 * n) + (has_min ? 1 : 0);
2244 lod = agx_vec_temp(b->shader, AGX_SIZE_32, chans);
2245 agx_instr *I = agx_collect_to(b, lod, chans);
2246
2247 for (unsigned i = 0; i < n; ++i) {
2248 I->src[(2 * i) + 0] = agx_emit_extract(b, index, i);
2249 I->src[(2 * i) + 1] = agx_emit_extract(b, index2, i);
2250 }
2251
2252 if (has_min)
2253 I->src[2 * n] = min;
2254
2255 break;
2256 }
2257
2258 case nir_tex_src_ddy:
2259 /* handled above */
2260 break;
2261
2262 default:
2263 unreachable("Unexpected texture source");
2264 }
2265 }
2266
2267 enum agx_lod_mode lod_mode = agx_lod_mode_for_nir(
2268 instr->op, nir_tex_instr_src_index(instr, nir_tex_src_bias) >= 0,
2269 nir_tex_instr_src_index(instr, nir_tex_src_min_lod) >= 0, lod_is_zero);
2270
2271 if (lod_mode == AGX_LOD_MODE_AUTO_LOD) {
2272 /* Ignored logically but asserted 0 */
2273 lod = agx_immediate(0);
2274 } else if (lod_mode == AGX_LOD_MODE_AUTO_LOD_BIAS_MIN) {
2275 /* Combine min with lod */
2276 lod = agx_vec2(b, lod, min_lod);
2277 }
2278
2279 agx_index dst = agx_def_index(&instr->def);
2280
2281 /* Pack shadow reference value (compare) and packed offset together */
2282 agx_index compare_offset = agx_null();
2283
2284 if (!agx_is_null(compare) && !agx_is_null(packed_offset))
2285 compare_offset = agx_vec2(b, compare, packed_offset);
2286 else if (!agx_is_null(packed_offset))
2287 compare_offset = packed_offset;
2288 else if (!agx_is_null(compare))
2289 compare_offset = compare;
2290
2291 agx_index tmp = agx_vec_temp(b->shader, dst.size, 4);
2292 agx_instr *I = agx_texture_sample_to(
2293 b, tmp, coords, lod, bindless, texture, sampler, compare_offset,
2294 agx_tex_dim(instr->sampler_dim, instr->is_array), lod_mode, 0,
2295 !agx_is_null(packed_offset), !agx_is_null(compare),
2296 instr->op == nir_texop_lod, agx_gather_for_nir(instr));
2297
2298 if (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms) {
2299 I->op = AGX_OPCODE_TEXTURE_LOAD;
2300 b->shader->out->uses_txf = true;
2301 }
2302
2303 /* Destination masking doesn't seem to work properly for gathers (because
2304 * it's mostly pointless), but it does show up in the lowering of
2305 * textureGatherOffsets. Don't try to mask the destination for gathers.
2306 */
2307 bool masked = (instr->op != nir_texop_tg4);
2308 I->mask = agx_expand_tex_to(b, &instr->def, tmp, masked);
2309 }
2310
2311 /*
2312 * Determine if a NIR loop (CF list) uses a continue jump, including within
2313 * if-else statements but not including nested loops.
2314 */
2315 static bool
2316 cf_list_uses_continue(struct exec_list *list)
2317 {
2318 foreach_list_typed(nir_cf_node, node, node, list) {
2319 if (node->type == nir_cf_node_block) {
2320 nir_block *block = nir_cf_node_as_block(node);
2321
2322 nir_foreach_instr(instr, block) {
2323 if (instr->type == nir_instr_type_jump &&
2324 nir_instr_as_jump(instr)->type == nir_jump_continue)
2325 return true;
2326 }
2327 } else if (node->type == nir_cf_node_if) {
2328 nir_if *nif = nir_cf_node_as_if(node);
2329
2330 if (cf_list_uses_continue(&nif->then_list) ||
2331 cf_list_uses_continue(&nif->else_list))
2332 return true;
2333 } else {
2334 assert(node->type == nir_cf_node_loop && "don't care about nesting");
2335 }
2336 }
2337
2338 return false;
2339 }
2340
2341 static bool
2342 loop_uses_continue(nir_loop *loop)
2343 {
2344 return cf_list_uses_continue(&loop->body);
2345 }
2346
2347 /*
2348 * NIR loops are treated as a pair of AGX loops:
2349 *
2350 * do {
2351 * do {
2352 * ...
2353 * } while (0);
2354 * } while (cond);
2355 *
2356 * By manipulating the nesting counter, we may break out of nested loops, so
2357 * under the model, both break and continue may be implemented as breaks, where
2358 * break breaks out of the outer loop (2 layers) and continue breaks out of the
2359 * inner loop (1 layer).
2360 *
2361 * After manipulating the nesting counter directly, pop_exec #0 must be used to
2362 * flush the update to the execution mask.
2363 */
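/*
 * Concretely (see agx_emit_jump below): when a loop uses continue, break pops
 * two nesting levels (out of both AGX loops) while continue pops one (out of
 * the inner loop only, landing at the continue block). When the loop has no
 * continue, only a single level is pushed and break pops just that one.
 */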
2364 static void
2365 agx_emit_jump(agx_builder *b, nir_jump_instr *instr)
2366 {
2367 agx_context *ctx = b->shader;
2368
2369 if (instr->type == nir_jump_halt) {
2370 agx_stop(b);
2371 ctx->current_block->unconditional_jumps = true;
2372 return;
2373 }
2374
2375 assert(instr->type == nir_jump_break || instr->type == nir_jump_continue);
2376
2377 /* Break out of either one or two loops */
2378 unsigned nestings = b->shader->loop_nesting;
2379
2380 if (instr->type == nir_jump_continue) {
2381 nestings += 1;
2382 agx_block_add_successor(ctx->current_block, ctx->continue_block);
2383 } else if (instr->type == nir_jump_break) {
2384 nestings += ctx->loop_continues ? 2 : 1;
2385 agx_block_add_successor(ctx->current_block, ctx->break_block);
2386 }
2387
2388 agx_break(b, nestings, ctx->break_block);
2389 ctx->current_block->unconditional_jumps = true;
2390 }
2391
2392 static void
2393 agx_emit_phi(agx_builder *b, nir_phi_instr *instr)
2394 {
2395 agx_instr *I =
2396 agx_phi_to(b, agx_def_index(&instr->def), exec_list_length(&instr->srcs));
2397
2398 /* Deferred */
2399 I->phi = instr;
2400 }
2401
2402 /* Look up the AGX block corresponding to a given NIR block. Used when
2403 * translating phi nodes after emitting all blocks.
2404 */
2405 static agx_block *
2406 agx_from_nir_block(agx_context *ctx, nir_block *block)
2407 {
2408 return ctx->indexed_nir_blocks[block->index];
2409 }
2410
2411 static void
2412 agx_emit_phi_deferred(agx_context *ctx, agx_block *block, agx_instr *I)
2413 {
2414 nir_phi_instr *phi = I->phi;
2415 I->phi = NULL;
2416
2417 /* Guaranteed by lower_phis_to_scalar */
2418 assert(phi->def.num_components == 1);
2419
2420 nir_foreach_phi_src(src, phi) {
2421 agx_block *pred = agx_from_nir_block(ctx, src->pred);
2422 unsigned i = agx_predecessor_index(block, pred);
2423 assert(i < I->nr_srcs);
2424
2425 I->src[i] = agx_src_index(&src->src);
2426 }
2427 }
2428
2429 static void
2430 agx_emit_phis_deferred(agx_context *ctx)
2431 {
2432 agx_foreach_block(ctx, block) {
2433 agx_foreach_phi_in_block(block, I)
2434 agx_emit_phi_deferred(ctx, block, I);
2435 }
2436 }
2437
2438 static void
2439 agx_emit_undef(agx_builder *b, nir_undef_instr *instr)
2440 {
2441 /* For now, just lower undefs to zero. This doesn't matter too much, since
2442 * the lowering happens in NIR and this just allows for late lowering passes
2443 * to result in undefs.
2444 */
2445 if (instr->def.num_components > 1) {
2446 assert(instr->def.num_components <= 4);
2447 agx_index zero = agx_mov_imm(b, instr->def.bit_size, 0);
2448
2449 agx_emit_collect_to(b, agx_def_index(&instr->def),
2450 instr->def.num_components,
2451 (agx_index[4]){zero, zero, zero, zero});
2452 } else {
2453 agx_mov_imm_to(b, agx_def_index(&instr->def), 0);
2454 }
2455 }
2456
2457 static void
2458 agx_emit_instr(agx_builder *b, struct nir_instr *instr)
2459 {
2460 switch (instr->type) {
2461 case nir_instr_type_load_const:
2462 agx_emit_load_const(b, nir_instr_as_load_const(instr));
2463 break;
2464
2465 case nir_instr_type_intrinsic:
2466 agx_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
2467 break;
2468
2469 case nir_instr_type_alu:
2470 agx_emit_alu(b, nir_instr_as_alu(instr));
2471 break;
2472
2473 case nir_instr_type_tex:
2474 agx_emit_tex(b, nir_instr_as_tex(instr));
2475 break;
2476
2477 case nir_instr_type_jump:
2478 agx_emit_jump(b, nir_instr_as_jump(instr));
2479 break;
2480
2481 case nir_instr_type_phi:
2482 agx_emit_phi(b, nir_instr_as_phi(instr));
2483 break;
2484
2485 case nir_instr_type_undef:
2486 agx_emit_undef(b, nir_instr_as_undef(instr));
2487 break;
2488
2489 default:
2490 unreachable("should've been lowered");
2491 }
2492 }
2493
2494 static agx_block *
2495 agx_create_block(agx_context *ctx)
2496 {
2497 agx_block *blk = rzalloc(ctx, agx_block);
2498
2499 util_dynarray_init(&blk->predecessors, blk);
2500
2501 return blk;
2502 }
2503
2504 static agx_block *
2505 emit_block(agx_context *ctx, nir_block *block)
2506 {
2507 if (ctx->after_block) {
2508 ctx->current_block = ctx->after_block;
2509 ctx->after_block = NULL;
2510 } else {
2511 ctx->current_block = agx_create_block(ctx);
2512 }
2513
2514 agx_block *blk = ctx->current_block;
2515 list_addtail(&blk->link, &ctx->blocks);
2516 list_inithead(&blk->instructions);
2517
2518 ctx->indexed_nir_blocks[block->index] = blk;
2519
2520 agx_builder _b = agx_init_builder(ctx, agx_after_block(blk));
2521
2522 nir_foreach_instr(instr, block) {
2523 agx_emit_instr(&_b, instr);
2524 }
2525
2526 return blk;
2527 }
2528
2529 static agx_block *emit_cf_list(agx_context *ctx, struct exec_list *list);
2530
2531 /* Emit if-else as
2532 *
2533 * if_icmp cond != 0
2534 * ...
2535 * else_icmp cond == 0
2536 * ...
2537 * pop_exec
2538 *
2539 * If the else is empty, we can omit the else_icmp. This happens elsewhere, as
2540 * an empty else block can become nonempty after RA due to phi lowering. This is
2541 * not usually optimal, but it's a start.
2542 */
2543
2544 static void
2545 emit_if(agx_context *ctx, nir_if *nif)
2546 {
2547 agx_block *first_block = ctx->current_block;
2548 agx_builder _b = agx_init_builder(ctx, agx_after_block(first_block));
2549 agx_index cond = agx_src_index(&nif->condition);
2550
2551 agx_instr *if_ = agx_if_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, true,
2552 NULL /* filled in later */);
2553 ctx->loop_nesting++;
2554 ctx->total_nesting++;
2555
2556 /* Emit the two subblocks. */
2557 agx_block *if_block = emit_cf_list(ctx, &nif->then_list);
2558 agx_block *end_then = ctx->current_block;
2559
2560 _b.cursor = agx_after_block(ctx->current_block);
2561
2562 agx_block *else_block = emit_cf_list(ctx, &nif->else_list);
2563 agx_block *end_else = ctx->current_block;
2564
2565 /* If the "if" fails, we fallthrough to the else */
2566 if_->target = else_block;
2567
2568 /* Insert an else instruction at the beginning of the else block. We use
2569 * "else_fcmp 0.0, 0.0, eq" as unconditional else, matching the blob.
2570 *
2571 * If it fails, we fall through to the logical end of the last else block.
2572 */
2573 _b.cursor = agx_before_block(else_block);
2574 agx_else_fcmp(&_b, agx_zero(), agx_zero(), 1, AGX_FCOND_EQ, false, end_else);
2575
2576 ctx->after_block = agx_create_block(ctx);
2577
2578 agx_block_add_successor(first_block, if_block);
2579 agx_block_add_successor(first_block, else_block);
2580 agx_block_add_successor(end_then, ctx->after_block);
2581 agx_block_add_successor(end_else, ctx->after_block);
2582
2583 _b.cursor = agx_after_block(ctx->current_block);
2584 agx_pop_exec(&_b, 1);
2585 ctx->loop_nesting--;
2586 ctx->total_nesting--;
2587 }
2588
2589 static void
2590 emit_loop(agx_context *ctx, nir_loop *nloop)
2591 {
2592 assert(!nir_loop_has_continue_construct(nloop));
2593 /* We only track nesting within the innermost loop, so push and reset */
2594 unsigned pushed_nesting = ctx->loop_nesting;
2595 ctx->loop_nesting = 0;
2596 ctx->total_nesting++;
2597
2598 bool old_continues = ctx->loop_continues;
2599 ctx->loop_continues = loop_uses_continue(nloop);
2600
2601 agx_block *popped_break = ctx->break_block;
2602 agx_block *popped_continue = ctx->continue_block;
2603
2604 ctx->break_block = agx_create_block(ctx);
2605 ctx->continue_block = agx_create_block(ctx);
2606
2607 /* If we are emitting a loop inside other control flow, there might be
2608 * threads masked off (TODO: divergence analysis), so push_exec them so
2609 * we get the lower nesting count values to ourselves.
2610 */
2611 agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
2612 if (ctx->total_nesting > 1)
2613 agx_push_exec(&_b, ctx->loop_continues ? 2 : 1);
2614
2615 /* Fallthrough to body */
2616 agx_block_add_successor(ctx->current_block, ctx->continue_block);
2617
2618 /* Emit the body */
2619 ctx->after_block = ctx->continue_block;
2620 ctx->after_block->loop_header = true;
2621 agx_block *start_block = emit_cf_list(ctx, &nloop->body);
2622
2623 /* If we used any continue jumps, we need to reactivate the continued
2624 * threads. We do this with an always true while_icmp, which behaves like:
2625 *
2626 * if (r0l == 1) {
2627 * r0l = 0;
2628 * }
2629 * update_exec
2630 *
2631 * If we did not use continue, this would be a no-op so it is omitted.
2632 */
2633 _b.cursor = agx_after_block(ctx->current_block);
2634
2635 if (ctx->loop_continues) {
2636 agx_while_icmp(
2637 &_b, agx_zero(), agx_zero(), 2, AGX_ICOND_UEQ, false,
2638 NULL /* no semantic target, used purely for side effects */);
2639 }
2640
2641 agx_jmp_exec_any(&_b, start_block);
2642 agx_pop_exec(&_b, ctx->loop_continues ? 2 : 1);
2643 agx_block_add_successor(ctx->current_block, ctx->continue_block);
2644
2645 /* Pop off */
2646 ctx->after_block = ctx->break_block;
2647 ctx->break_block = popped_break;
2648 ctx->continue_block = popped_continue;
2649
2650 /* Update shader-db stats */
2651 ++ctx->loop_count;
2652
2653 /* All nested control flow must have finished */
2654 assert(ctx->loop_nesting == 0);
2655
2656 /* Restore loop nesting (we might be inside an if inside an outer loop) */
2657 ctx->loop_nesting = pushed_nesting;
2658 ctx->total_nesting--;
2659 ctx->loop_continues = old_continues;
2660 }
2661
2662 /* Before the first control flow structure, the nesting counter needs to be
2663 * zeroed for correct operation. This only happens at most once, since by
2664 * definition this occurs at the end of the first block, which dominates the
2665 * rest of the program. */
2666
2667 static void
2668 emit_first_cf(agx_context *ctx)
2669 {
2670 if (ctx->any_cf)
2671 return;
2672
2673 agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
2674 agx_begin_cf(&_b);
2675 ctx->any_cf = true;
2676 }
2677
2678 static agx_block *
2679 emit_cf_list(agx_context *ctx, struct exec_list *list)
2680 {
2681 agx_block *start_block = NULL;
2682
2683 foreach_list_typed(nir_cf_node, node, node, list) {
2684 switch (node->type) {
2685 case nir_cf_node_block: {
2686 agx_block *block = emit_block(ctx, nir_cf_node_as_block(node));
2687
2688 if (!start_block)
2689 start_block = block;
2690
2691 break;
2692 }
2693
2694 case nir_cf_node_if:
2695 emit_first_cf(ctx);
2696 emit_if(ctx, nir_cf_node_as_if(node));
2697 break;
2698
2699 case nir_cf_node_loop:
2700 emit_first_cf(ctx);
2701 emit_loop(ctx, nir_cf_node_as_loop(node));
2702 break;
2703
2704 default:
2705 unreachable("Unknown control flow");
2706 }
2707 }
2708
2709 return start_block;
2710 }
2711
2712 static void
2713 agx_set_st_vary_final(agx_context *ctx)
2714 {
2715 agx_foreach_instr_global_rev(ctx, I) {
2716 if (I->op == AGX_OPCODE_ST_VARY) {
2717 I->last = true;
2718 return;
2719 }
2720 }
2721
2722 /* If we got here, there was no varying written. We need to mark that. */
2723 agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
2724 agx_builder _b = agx_init_builder(ctx, agx_after_block_logical(last_block));
2725 agx_no_varyings(&_b);
2726 }
2727
2728 static int
2729 agx_dump_stats(agx_context *ctx, unsigned size, char **out)
2730 {
2731 unsigned nr_ins = 0, spills = 0, fills = 0;
2732
2733 /* Count instructions */
2734 agx_foreach_instr_global(ctx, I) {
2735 nr_ins++;
2736
2737 if (I->op == AGX_OPCODE_STACK_STORE)
2738 spills++;
2739 else if (I->op == AGX_OPCODE_STACK_LOAD)
2740 fills++;
2741 }
2742
2743 struct agx_cycle_estimate cycles = agx_estimate_cycles(ctx);
2744
2745 unsigned nr_threads =
2746 agx_occupancy_for_register_count(ctx->max_reg).max_threads;
2747
2748 return asprintf(
2749 out,
2750 "%s shader: %u inst, %u alu, %u fscib, %u ic, %u bytes, %u regs, "
2751 "%u uniforms, %u scratch, %u threads, %u loops, "
2752 "%u:%u spills:fills",
2753 gl_shader_stage_name(ctx->stage), nr_ins, cycles.alu, cycles.f_scib,
2754 cycles.ic, size, ctx->max_reg, ctx->out->push_count, ctx->scratch_size_B,
2755 nr_threads, ctx->loop_count, spills, fills);
2756 }
2757
2758 static bool
2759 agx_lower_sincos_filter(const nir_instr *instr, UNUSED const void *_)
2760 {
2761 if (instr->type != nir_instr_type_alu)
2762 return false;
2763
2764 nir_alu_instr *alu = nir_instr_as_alu(instr);
2765 return alu->op == nir_op_fsin || alu->op == nir_op_fcos;
2766 }
2767
2768 /* Sine and cosine are implemented via the sin_pt_1 and sin_pt_2 opcodes for
2769 * heavy lifting. sin_pt_2 implements sinc in the first quadrant, expressed in
2770 * turns (sin (tau x) / x), while sin_pt_1 implements a piecewise sign/offset
2771 * fixup to transform a quadrant angle [0, 4] to [-1, 1]. The NIR opcode
2772 * fsin_agx models the fixup, sinc, and multiply to obtain sine, so we just
2773 * need to change units from radians to quadrants modulo turns. Cosine is
2774 * implemented by shifting by one quadrant: cos(x) = sin(x + tau/4).
2775 */
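/*
 * Worked example (approximate): x = pi/2 radians gives turns = 0.25, so
 * quadrants = fract(0.25) * 4 = 1.0 and the result is sin(quarter turn) = 1.
 * For cosine, the extra 0.25 turn maps pi/2 to a half turn, quadrants = 2.0,
 * giving cos(pi/2) = 0.
 */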
2776
2777 static nir_def *
2778 agx_lower_sincos_impl(struct nir_builder *b, nir_instr *instr, UNUSED void *_)
2779 {
2780 nir_alu_instr *alu = nir_instr_as_alu(instr);
2781 nir_def *x = nir_mov_alu(b, alu->src[0], 1);
2782 nir_def *turns = nir_fmul_imm(b, x, M_1_PI * 0.5f);
2783
2784 if (alu->op == nir_op_fcos)
2785 turns = nir_fadd_imm(b, turns, 0.25f);
2786
2787 nir_def *quadrants = nir_fmul_imm(b, nir_ffract(b, turns), 4.0);
2788 return nir_fsin_agx(b, quadrants);
2789 }
2790
2791 static bool
2792 agx_lower_sincos(nir_shader *shader)
2793 {
2794 return nir_shader_lower_instructions(shader, agx_lower_sincos_filter,
2795 agx_lower_sincos_impl, NULL);
2796 }
2797
2798 static bool
2799 agx_lower_front_face(struct nir_builder *b, nir_intrinsic_instr *intr,
2800 UNUSED void *data)
2801 {
2802 if (intr->intrinsic != nir_intrinsic_load_front_face)
2803 return false;
2804
2805 nir_def *def = &intr->def;
2806 assert(def->bit_size == 1);
2807
2808 b->cursor = nir_before_instr(&intr->instr);
2809 nir_def_rewrite_uses(def, nir_inot(b, nir_load_back_face_agx(b, 1)));
2810 return true;
2811 }
2812
2813 /*
2814 * Standard NIR optimization loop. This is run in agx_preprocess_nir, then once
2815 * again at shader variant compile time. Unless there was a complex shader key,
2816 * the latter run should be almost a no-op.
2817 */
2818 static void
2819 agx_optimize_loop_nir(nir_shader *nir)
2820 {
2821 bool progress;
2822
2823 do {
2824 progress = false;
2825
2826 NIR_PASS(progress, nir, nir_copy_prop);
2827 NIR_PASS(progress, nir, nir_opt_remove_phis);
2828 NIR_PASS(progress, nir, nir_opt_dce);
2829 NIR_PASS(progress, nir, nir_opt_dead_cf);
2830 NIR_PASS(progress, nir, nir_opt_cse);
2831 NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
2832 NIR_PASS(progress, nir, nir_opt_phi_precision);
2833 NIR_PASS(progress, nir, nir_opt_algebraic);
2834 NIR_PASS(progress, nir, nir_opt_constant_folding);
2835 NIR_PASS(progress, nir, nir_opt_undef);
2836 NIR_PASS(progress, nir, nir_opt_loop_unroll);
2837 } while (progress);
2838 }
2839
2840 bool
2841 agx_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
2842 unsigned bit_size, unsigned num_components,
2843 int64_t hole_size, nir_intrinsic_instr *low,
2844 nir_intrinsic_instr *high, void *data)
2845 {
2846 if (hole_size > 0)
2847 return false;
2848
2849 /* Must be aligned to the size of the load */
2850 unsigned align = nir_combined_align(align_mul, align_offset);
2851 if ((bit_size / 8) > align)
2852 return false;
2853
2854 if (num_components > 4)
2855 return false;
2856
2857 if (bit_size > 32)
2858 return false;
2859
2860 return true;
2861 }
2862
2863 static bool
2864 set_speculate(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *_)
2865 {
2866 if (!nir_intrinsic_has_access(intr))
2867 return false;
2868
2869 nir_intrinsic_set_access(intr,
2870 ACCESS_CAN_SPECULATE | nir_intrinsic_access(intr));
2871 return true;
2872 }
2873
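/*
 * Fold bounds checks into constant loads. If every use of a load_constant_agx
 * is a bounds_agx with a common offset/bounds pair, the check can instead
 * clamp the load's address (see the long comment inside) and the bounds_agx
 * instructions become plain channel reads. Roughly:
 *
 *    v  = load_constant_agx base, index
 *    r0 = bounds_agx v.x, offset, bounds
 *    r1 = bounds_agx v.y, offset, bounds
 *
 * becomes a single clamped load whose channels feed r0/r1 directly.
 */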
2874 static bool
2875 optimize_bounds(nir_builder *b, nir_intrinsic_instr *intr, void *data)
2876 {
2877 if (intr->intrinsic != nir_intrinsic_load_constant_agx)
2878 return false;
2879
2880 assert(intr->def.bit_size > 1 && "no if-uses");
2881 nir_scalar srcs[2] = {{NULL}};
2882 unsigned use_count = 0;
2883
2884 nir_alu_instr *first_use = NULL;
2885
2886 nir_foreach_use(use, &intr->def) {
2887 /* All uses need to be bounds_agx */
2888 nir_instr *parent = nir_src_parent_instr(use);
2889 if (parent->type != nir_instr_type_alu)
2890 return false;
2891
2892 nir_alu_instr *alu = nir_instr_as_alu(parent);
2893 if ((alu->op != nir_op_bounds_agx) || (alu->src[0].src.ssa != &intr->def))
2894 return false;
2895
2896 assert(alu->def.num_components == 1 && alu->def.bit_size == 32);
2897
2898 /* All bounds checks need a common offset and bounds */
2899 for (unsigned s = 0; s < 2; ++s) {
2900 nir_scalar this = nir_scalar_resolved(alu->src[1 + s].src.ssa,
2901 alu->src[1 + s].swizzle[0]);
2902
2903 if (srcs[s].def == NULL)
2904 srcs[s] = this;
2905 else if (!nir_scalar_equal(srcs[s], this))
2906 return false;
2907
2908 /* To avoid dominance problems, we must sink loads. */
2909 if (this.def->parent_instr->block != intr->instr.block) {
2910 return false;
2911 }
2912 }
2913
2914 if (!first_use || first_use->def.index > alu->def.index) {
2915 first_use = alu;
2916 }
2917
2918 ++use_count;
2919 }
2920
2921 /* We've matched. Freeze the set of uses before changing things. */
2922 nir_alu_instr **uses = alloca(sizeof(nir_alu_instr *) * use_count);
2923
2924 unsigned i = 0;
2925 nir_foreach_use(use, &intr->def) {
2926 nir_instr *parent = nir_src_parent_instr(use);
2927 uses[i++] = nir_instr_as_alu(parent);
2928 }
2929 assert(i == use_count && "should not have changed");
2930
2931 /* Sink the load */
2932 nir_instr_remove(&intr->instr);
2933 b->cursor = nir_before_instr(&first_use->instr);
2934 nir_builder_instr_insert(b, &intr->instr);
2935
2936 /* Now start rewriting. Grab some common variables */
2937 b->cursor = nir_before_instr(&intr->instr);
2938 nir_def *offset = nir_channel(b, srcs[0].def, srcs[0].comp);
2939 nir_def *bounds = nir_channel(b, srcs[1].def, srcs[1].comp);
2940
2941 nir_def *in_bounds = nir_uge(b, bounds, offset);
2942 nir_def *zero = nir_imm_int(b, 0);
2943
2944 nir_src *base_src = &intr->src[0];
2945 nir_src *offs_src = &intr->src[1];
2946
2947 nir_def *base_lo = nir_unpack_64_2x32_split_x(b, base_src->ssa);
2948 nir_def *base_hi = nir_unpack_64_2x32_split_y(b, base_src->ssa);
2949
2950 /* Bounds check the base/offset instead. We currently reserve the bottom
2951 * 2^36 of VA (this is driver/compiler ABI). With soft fault enabled, that
2952 * means any read of the lower region will return zero as required.
2953 *
2954 * Therefore, when out-of-bounds, we clamp the index to zero and the high
2955 * half of the address to zero. We don't need to clamp the low half of the
2956 * address. The resulting sum is thus:
2957 *
2958 * 0*(2^32) + lo + (index << shift)
2959 *
2960 * ...which will be in the unmapped zero region provided shift < 4.
2961 */
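      /* Rough numbers: lo is at most 0xffffffff and, when the index is left
       * unclamped below, shift <= 1 keeps (index << shift) under 2^33, so the
       * whole sum stays below 2^34, comfortably inside the reserved 2^36
       * region.
       */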
2962 base_hi = nir_bcsel(b, in_bounds, base_hi, zero);
2963
2964 /* Clamp index if the shift is too large or sign-extension used */
2965 if (nir_intrinsic_base(intr) >= 2 || nir_intrinsic_sign_extend(intr)) {
2966 nir_src_rewrite(offs_src, nir_bcsel(b, in_bounds, offs_src->ssa, zero));
2967 }
2968
2969 nir_src_rewrite(base_src, nir_pack_64_2x32_split(b, base_lo, base_hi));
2970
2971 /* Now that the load itself is bounds checked, all that's left is removing
2972 * the bounds checks on the output. This requires a little care to avoid an
2973 * infinite loop.
2974 *
2975 * Also note we cannot remove the uses here, because it would invalidate the
2976 * iterator inside intrinsics_pass. I hate C, don't you?
2977 */
2978 for (unsigned i = 0; i < use_count; ++i) {
2979 b->cursor = nir_after_instr(&uses[i]->instr);
2980 nir_def *chan = nir_channel(b, &intr->def, uses[i]->src[0].swizzle[0]);
2981 nir_def_rewrite_uses(&uses[i]->def, chan);
2982 }
2983
2984 return true;
2985 }
2986
2987 static void
2988 agx_optimize_nir(nir_shader *nir, bool soft_fault, uint16_t *preamble_size)
2989 {
2990 /* This runs only once up front since other optimizations don't affect it */
2991 NIR_PASS(_, nir, nir_opt_shrink_stores, true);
2992
2993 agx_optimize_loop_nir(nir);
2994
2995 /* If soft fault is enabled, we can freely speculate everything. That lets us
2996 * peephole select and form preambles more aggressively.
2997 */
2998 if (soft_fault) {
2999 NIR_PASS(_, nir, nir_shader_intrinsics_pass, set_speculate,
3000 nir_metadata_control_flow, NULL);
3001 }
3002
3003 /* Peephole select again after setting the speculate flag but before
3004 * vectorizing. This cleans up short-circuit loads in unrolled loops.
3005 *
3006 * XXX: Set indirect_load_ok once we can investigate CTS flakes.
3007 */
3008 NIR_PASS(_, nir, nir_opt_peephole_select, 64, false, true);
3009
3010 NIR_PASS(_, nir, nir_opt_load_store_vectorize,
3011 &(const nir_load_store_vectorize_options){
3012 .modes = nir_var_mem_global | nir_var_mem_constant |
3013 nir_var_shader_temp,
3014 .callback = agx_mem_vectorize_cb,
3015 });
3016 NIR_PASS(_, nir, nir_lower_pack);
3017 NIR_PASS(_, nir, nir_opt_algebraic);
3018
3019 /* Lower addressing modes. The sooner we do this, the sooner we get rid of
3020 * amul/aadd instructions and can let nir_opt_algebraic do its job. But we
3021 * want to vectorize first since nir_opt_load_store_vectorize doesn't know
3022 * how to handle our loads.
3023 */
3024 NIR_PASS(_, nir, agx_nir_lower_address);
3025
3026 NIR_PASS_V(nir, nir_divergence_analysis);
3027 bool progress = false;
3028
3029 static const nir_lower_subgroups_options subgroups_options = {
3030 .ballot_bit_size = 32,
3031 .ballot_components = 1,
3032 .lower_elect = true,
3033 .lower_subgroup_masks = true,
3034 };
3035
3036 NIR_PASS(progress, nir, nir_opt_uniform_atomics, true);
3037 NIR_PASS(progress, nir, nir_opt_uniform_subgroup, &subgroups_options);
3038 if (progress) {
3039 NIR_PASS(_, nir, agx_nir_lower_subgroups);
3040 }
3041
3042 /* The above passes create operations that need lowering/optimizing */
3043 do {
3044 progress = false;
3045
3046 NIR_PASS(progress, nir, nir_opt_algebraic);
3047 NIR_PASS(progress, nir, nir_opt_constant_folding);
3048 NIR_PASS(progress, nir, nir_opt_dce);
3049 } while (progress);
3050
3051 progress = false;
3052
3053 /* If address lowering made progress, clean up before forming preambles.
3054 * Otherwise the optimized preambles might just be constants! Do it before
3055 * lowering int64 too, to avoid lowering constant int64 arithmetic.
3056 */
3057 if (progress) {
3058 NIR_PASS(_, nir, nir_opt_constant_folding);
3059 NIR_PASS(_, nir, nir_opt_dce);
3060 }
3061
3062 /* Only lower int64 after optimizing address arithmetic, so that u2u64/i2i64
3063 * conversions remain.
3064 */
3065 progress = false;
3066 NIR_PASS(progress, nir, nir_lower_int64);
3067
3068 /* If we lowered actual int64 arithmetic (not folded into the address
3069 * calculations), then clean up after the lowering.
3070 */
3071 if (progress) {
3072 do {
3073 progress = false;
3074
3075 NIR_PASS(progress, nir, nir_opt_algebraic);
3076 NIR_PASS(progress, nir, nir_opt_constant_folding);
3077 NIR_PASS(progress, nir, nir_opt_dce);
3078 } while (progress);
3079 }
3080
3081 /* Lower fmin/fmax before optimizing preambles so we can see across uniform
3082 * expressions.
3083 */
3084 NIR_PASS(_, nir, agx_nir_lower_fminmax);
3085
3086 if (preamble_size && (!(agx_compiler_debug & AGX_DBG_NOPREAMBLE))) {
3087 unsigned temp = *preamble_size;
3088 NIR_PASS(_, nir, agx_nir_opt_preamble, &temp);
3089 *preamble_size = temp;
3090 }
3091
3092 /* Forming preambles may dramatically reduce the instruction count
3093 * in certain blocks, causing some if-else statements to become
3094 * trivial. We want to peephole select those, given that control flow
3095 * prediction instructions are costly.
3096 *
3097 * We need to lower int64 again to deal with the resulting 64-bit csels.
3098 */
3099 NIR_PASS(_, nir, nir_opt_peephole_select, 64, false, true);
3100 NIR_PASS(_, nir, nir_lower_int64);
3101
3102 /* We need to lower fmin/fmax again after nir_opt_algebraic_late due to f2fmp
3103 * wackiness. This is usually a no-op but is required for correctness in
3104 * GLES.
3105 */
3106 NIR_PASS(_, nir, nir_opt_algebraic_late);
3107 NIR_PASS(_, nir, agx_nir_lower_fminmax);
3108
3109 /* Fuse add/sub/multiplies/shifts after running opt_algebraic_late to fuse
3110 * isub but before shifts are lowered.
3111 */
3112 do {
3113 progress = false;
3114
3115 NIR_PASS(progress, nir, nir_opt_dce);
3116 NIR_PASS(progress, nir, nir_opt_cse);
3117 NIR_PASS(progress, nir, agx_nir_fuse_algebraic_late);
3118 } while (progress);
3119
3120 /* Before optimizing bounds checks, we need to clean up and index defs so
3121 * optimize_bounds does the right thing.
3122 */
3123 NIR_PASS(_, nir, nir_copy_prop);
3124 NIR_PASS(_, nir, nir_opt_dce);
3125
3126 nir_foreach_function_impl(impl, nir) {
3127 nir_index_ssa_defs(impl);
3128 }
3129
3130 /* TODO: Reenable this pass. It's breaking Fallout 4 in ways I don't
3131 * understand yet.
3132 */
3133 if (soft_fault && 0) {
3134 NIR_PASS(_, nir, nir_shader_intrinsics_pass, optimize_bounds,
3135 nir_metadata_control_flow, NULL);
3136 }
3137
3138 /* Do remaining lowering late, since this inserts &s for shifts so we want to
3139 * do it after fusing constant shifts. Constant folding will clean up.
3140 */
3141 NIR_PASS(_, nir, agx_nir_lower_algebraic_late);
3142 NIR_PASS(_, nir, agx_nir_fuse_selects);
3143 NIR_PASS(_, nir, nir_opt_constant_folding);
3144 NIR_PASS(_, nir, nir_opt_combine_barriers, NULL, NULL);
3145 NIR_PASS(_, nir, nir_copy_prop);
3146 NIR_PASS(_, nir, nir_opt_dce);
3147 NIR_PASS(_, nir, nir_opt_cse);
3148 NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
3149 NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
3150
3151 /* Cleanup optimizations */
3152 nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
3153 nir_move_load_input | nir_move_comparisons |
3154 nir_move_copies | nir_move_load_ssbo |
3155 nir_move_alu;
3156
3157 NIR_PASS(_, nir, nir_opt_sink, move_all);
3158 NIR_PASS(_, nir, nir_opt_move, move_all);
3159 NIR_PASS(_, nir, nir_lower_phis_to_scalar, true);
3160 }
3161
3162 /*
3163 * Varyings that are used as texture coordinates should be kept at fp32, because
3164 * fp16 does not have enough precision for large textures. It's technically
3165 * conformant not to, but every app gets this wrong.
3166 */
3167 static bool
3168 gather_texcoords(nir_builder *b, nir_instr *instr, void *data)
3169 {
3170 uint64_t *mask = data;
3171
3172 if (instr->type != nir_instr_type_tex)
3173 return false;
3174
3175 nir_tex_instr *tex = nir_instr_as_tex(instr);
3176
3177 int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
3178 if (coord_idx < 0)
3179 return false;
3180
3181 nir_src src = tex->src[coord_idx].src;
3182 nir_scalar x = nir_scalar_resolved(src.ssa, 0);
3183 nir_scalar y = nir_scalar_resolved(src.ssa, 1);
3184
3185 if (x.def != y.def)
3186 return false;
3187
3188 nir_instr *parent = x.def->parent_instr;
3189
3190 if (parent->type != nir_instr_type_intrinsic)
3191 return false;
3192
3193 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
3194
3195 if (intr->intrinsic != nir_intrinsic_load_interpolated_input)
3196 return false;
3197
3198 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
3199 *mask |= BITFIELD64_BIT(sem.location);
3200 return false;
3201 }
3202
3203 static bool
3204 gather_interp(nir_builder *b, nir_intrinsic_instr *intr, void *data)
3205 {
3206 struct agx_interp_info *masks = data;
3207
3208 if (intr->intrinsic == nir_intrinsic_load_input) {
3209 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
3210 masks->flat |= BITFIELD64_RANGE(sem.location, sem.num_slots);
3211 } else if (intr->intrinsic == nir_intrinsic_load_interpolated_input &&
3212 nir_intrinsic_interp_mode(nir_src_as_intrinsic(intr->src[0])) ==
3213 INTERP_MODE_NOPERSPECTIVE) {
3214 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
3215 masks->linear |= BITFIELD64_RANGE(sem.location, sem.num_slots);
3216 }
3217
3218 return false;
3219 }
3220
3221 /*
3222 * Build a bit mask of varyings (by location) that are flatshaded and linear
3223 * shaded. This information is needed by the driver.
3224 */
3225 struct agx_interp_info
3226 agx_gather_interp_info(nir_shader *nir)
3227 {
3228 assert(nir->info.stage == MESA_SHADER_FRAGMENT);
3229
3230 struct agx_interp_info masks = {0};
3231 nir_shader_intrinsics_pass(nir, gather_interp, nir_metadata_all, &masks);
3232 return masks;
3233 }
3234
3235 /*
3236 * Build a bit mask of varyings (by location) that are used as texture
3237 * coordinates. This information is needed by lower_mediump_io.
3238 */
3239 uint64_t
3240 agx_gather_texcoords(nir_shader *nir)
3241 {
3242 assert(nir->info.stage == MESA_SHADER_FRAGMENT);
3243
3244 uint64_t mask = 0;
3245 nir_shader_instructions_pass(nir, gather_texcoords, nir_metadata_all, &mask);
3246 return mask;
3247 }
3248
3249 static nir_mem_access_size_align
3250 mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
3251 uint8_t bit_size, uint32_t align,
3252 uint32_t align_offset, bool offset_is_const,
3253 enum gl_access_qualifier access, const void *cb_data)
3254 {
3255 align = nir_combined_align(align, align_offset);
3256
3257 assert(util_is_power_of_two_nonzero(align));
3258
3259 if ((bytes & 1) || (align == 1))
3260 bit_size = 8;
3261 else if ((bytes & 2) || (align == 2))
3262 bit_size = 16;
3263 else if (bit_size >= 32)
3264 bit_size = 32;
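   /* Examples (illustrative): bytes = 12 with align = 4 stays 32-bit and
    * yields 3 components below; bytes = 6 with align = 2 drops to 16-bit with
    * 3 components; any odd size or byte alignment falls back to 8-bit.
    */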
3265
3266 return (nir_mem_access_size_align){
3267 .num_components = MIN2(bytes / (bit_size / 8), 4),
3268 .bit_size = bit_size,
3269 .align = bit_size / 8,
3270 .shift = nir_mem_access_shift_method_scalar,
3271 };
3272 }
3273
3274 static unsigned
3275 lower_bit_size_callback(const nir_instr *instr, UNUSED void *_)
3276 {
3277 if (instr->type == nir_instr_type_intrinsic) {
3278 /* Handle small subgroup ops */
3279 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3280
3281 switch (intr->intrinsic) {
3282 case nir_intrinsic_reduce:
3283 case nir_intrinsic_exclusive_scan:
3284 case nir_intrinsic_inclusive_scan:
3285 /* The identity for iand doesn't work for lowered 1-bit booleans, so
3286 * lower that explicitly.
3287 */
3288 if (nir_intrinsic_reduction_op(intr) == nir_op_iand &&
3289 intr->def.bit_size == 1)
3290 return 16;
3291
3292 /* In general, we have 16-bit ops instead of 8-bit, so lower those. */
3293 return intr->def.bit_size == 8 ? 16 : 0;
3294 default:
3295 return 0;
3296 }
3297 } else if (instr->type == nir_instr_type_alu) {
3298 /* Lower 8-bit ALU to 16-bit. We check the destination, as we do not want
3299 * to lower conversions from 8-bit to larger types. Those conversions get
3300 * implemented natively.
3301 */
3302 nir_alu_instr *alu = nir_instr_as_alu(instr);
3303 if (alu->def.bit_size == 8 && !is_conversion_to_8bit(alu->op))
3304 return 16;
3305 else if (alu->def.bit_size == 1 && alu->src[0].src.ssa->bit_size == 8)
3306 return 16 /* comparisons */;
3307 }
3308
3309 return 0;
3310 }
3311
3312 static bool
3313 lower_load_from_texture_handle(nir_builder *b, nir_intrinsic_instr *intr,
3314 void *data)
3315 {
3316 if (intr->intrinsic != nir_intrinsic_load_from_texture_handle_agx)
3317 return false;
3318
3319 /* Bindless handles are a vec2, where the first source is the (constant)
3320 * uniform register number and the second source is the byte offset.
3321 */
3322 nir_scalar uniform = nir_scalar_resolved(intr->src[0].ssa, 0);
3323 unsigned uniform_idx = nir_scalar_as_uint(uniform);
3324
3325 b->cursor = nir_instr_remove(&intr->instr);
3326 nir_def *base = nir_load_preamble(b, 1, 64, uniform_idx);
3327 nir_def *offset = nir_u2u64(b, nir_channel(b, intr->src[0].ssa, 1));
3328
3329 nir_def_rewrite_uses(&intr->def, nir_iadd(b, base, offset));
3330 return true;
3331 }
3332
3333 static void
3334 agx_remove_unreachable_block(agx_block *block)
3335 {
3336 /* Delete the edges */
3337 agx_foreach_successor(block, succ) {
3338 unsigned block_idx = agx_predecessor_index(succ, block);
3339
3340 /* Remove the corresponding predecessor from the successor */
3341 struct util_dynarray *blocks = &succ->predecessors;
3342 int remaining = agx_num_predecessors(succ) - (block_idx + 1);
3343 assert(remaining >= 0);
3344
3345 memcpy(util_dynarray_element(blocks, agx_block *, block_idx),
3346 util_dynarray_element(blocks, agx_block *, block_idx + 1),
3347 remaining * sizeof(agx_block *));
3348 blocks->size -= sizeof(agx_block *);
3349
3350 /* Remove the corresponding source from the phis */
3351 agx_foreach_phi_in_block(succ, phi) {
3352 assert(block_idx + 1 <= phi->nr_srcs);
3353
         memmove(phi->src + block_idx, phi->src + block_idx + 1,
                 (phi->nr_srcs - (block_idx + 1)) * sizeof(phi->src[0]));

         phi->nr_srcs--;

         /* This might cause phis to become trivial. Lower 1-source phis to
          * moves and let copyprop take it from here.
          */
         if (phi->nr_srcs == 1) {
            phi->op = AGX_OPCODE_MOV;
         }
      }
   }

   /* Remove the successor from the predecessor. */
   block->successors[0] = NULL;
   block->successors[1] = NULL;

   /* Note: we do not remove the block itself, although it is now fully orphaned
    * in the control flow graph. We still need it in source order if it has any
    * pop_exec instructions, for a loop continue block.
    *
    * TODO: Is there a better way to handle this?
    *
    * Affects: dEQP-VK.graphicsfuzz.cov-matching-if-always-true-inside-loop
    */
}

/*
 * NIR sometimes contains unreachable blocks (e.g. due to infinite loops). These
 * blocks have no predecessors, but do have successors and can contribute to
 * phis. They are dead and do not need to be here. Further, they violate the IR
 * invariant:
 *
 *    Live-in sources are live-out in all predecessors.
 *
 * ...which RA depends on when handling live range splits. The simplest solution
 * is to simply delete these dead blocks. Fortunately, because they are
 * unreachable, this does not have any ill effects. Notably, this cannot
 * introduce critical edges.
 *
 * Deleting a block may cause a successor to become unreachable, so we use a
 * fixed-point algorithm to converge.
 */
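/* As an illustration of the fixed point: if unreachable block A is the only
 * predecessor of otherwise-unreachable block B, the first iteration deletes A,
 * leaving B with zero predecessors, and a later iteration then deletes B.
 */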
static void
agx_remove_unreachable_blocks(agx_context *ctx)
{
   agx_block *start = agx_start_block(ctx);
   bool progress;

   do {
      progress = false;

      agx_foreach_block_safe(ctx, pred) {
         if (pred != start && agx_num_predecessors(pred) == 0 &&
             agx_num_successors(pred) > 0) {

            agx_remove_unreachable_block(pred);
            progress = true;
         }
      }
   } while (progress);
}

static bool
agx_should_dump(nir_shader *nir, unsigned agx_dbg_bit)
{
   return (agx_compiler_debug & agx_dbg_bit) &&
          !(nir->info.internal && !(agx_compiler_debug & AGX_DBG_INTERNAL));
}

#define AGX_PASS(shader, pass, ...)                                            \
   do {                                                                        \
      pass(shader, ##__VA_ARGS__);                                             \
      agx_validate(ctx, #pass);                                                \
   } while (0)
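
/* For example, AGX_PASS(ctx, agx_dce, false) expands to agx_dce(ctx, false)
 * followed by agx_validate(ctx, "agx_dce"), so each backend pass below is
 * validated immediately after it runs (validation may be a no-op outside of
 * debug builds).
 */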

static unsigned
agx_compile_function_nir(nir_shader *nir, nir_function_impl *impl,
                         struct agx_shader_key *key,
                         struct util_debug_callback *debug,
                         struct util_dynarray *binary,
                         struct agx_shader_info *out)
{
   nir_index_blocks(impl);
   nir_index_ssa_defs(impl);

   agx_context *ctx = rzalloc(NULL, agx_context);
   ctx->nir = nir;
   ctx->is_preamble = impl->function->is_preamble;
   ctx->out = out;
   ctx->key = key;
   ctx->stage = nir->info.stage;
   ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
   ctx->indexed_nir_blocks = rzalloc_array(ctx, agx_block *, impl->num_blocks);
   list_inithead(&ctx->blocks);

   if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_preamble) {
      ctx->any_cf = key->fs.inside_sample_loop;
   }

   ctx->alloc = impl->ssa_alloc;
   emit_cf_list(ctx, &impl->body);
   agx_emit_phis_deferred(ctx);

   /* Index blocks now that we're done emitting so the order is consistent. Do
    * this before agx_remove_unreachable_blocks so we match NIR indexing. This
    * makes for easier debugging.
    */
   agx_foreach_block(ctx, block) {
      block->index = ctx->num_blocks++;
   }

   agx_remove_unreachable_blocks(ctx);

   /* Only allocate scratch if it's statically used, regardless of whether the
    * NIR info claims otherwise.
    */
   if (ctx->any_scratch) {
      assert(!ctx->is_preamble && "preambles don't use scratch");
      ctx->scratch_size_B = ALIGN(nir->scratch_size, 16);
   }

   /* Stop the main shader or preamble shader after the exit block. For real
    * functions, we would return here.
    */
   if (!ctx->key->no_stop || ctx->is_preamble) {
      agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
      agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block));
      agx_stop(&_b);
   }

   agx_validate(ctx, "IR translation");

   if (likely(!(agx_compiler_debug & AGX_DBG_NOOPT))) {
      /* Eliminate dead instructions before CSE to avoid silly scheduling */
      AGX_PASS(ctx, agx_dce, false);

      /* CSE before eliminating dead destinations so that subdivision is
       * optimized properly.
       */
      AGX_PASS(ctx, agx_opt_cse);

      /* After DCE, use counts are right so we can run the optimizer. */
      AGX_PASS(ctx, agx_optimizer_backward);
      AGX_PASS(ctx, agx_optimizer_forward);
      AGX_PASS(ctx, agx_opt_compact_constants);

      /* After inlining constants, promote what's left */
      if (key->promote_constants && !key->secondary &&
          !(agx_compiler_debug & AGX_DBG_NOPROMOTE)) {
         AGX_PASS(ctx, agx_opt_promote_constants);
      }
   }

   /* Lower uniform sources after copyprop for correctness, since copyprop
    * creates uniform sources. To keep register pressure in check, lower after
    * CSE, since moves are cheaper than registers.
    */
   AGX_PASS(ctx, agx_lower_uniform_sources);

   /* RA correctness depends on DCE */
   AGX_PASS(ctx, agx_dce, true);

   if (agx_should_dump(nir, AGX_DBG_SHADERS))
      agx_print_shader(ctx, stdout);

   if (likely(!(agx_compiler_debug & AGX_DBG_NOSCHED))) {
      AGX_PASS(ctx, agx_pressure_schedule);
   }

   if (agx_should_dump(nir, AGX_DBG_SHADERS))
      agx_print_shader(ctx, stdout);

   AGX_PASS(ctx, agx_ra);
   agx_lower_64bit_postra(ctx);

   if (ctx->scratch_size_B > 0) {
      /* Apple always allocates 40 more bytes in the entrypoint and aligns to
       * 4.
       */
      uint64_t stack_size = ALIGN(DIV_ROUND_UP(ctx->scratch_size_B, 4) + 10, 4);
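      /* For example, 64 bytes of scratch computes ALIGN(16 + 10, 4) = 28,
       * presumably in the same 4-byte units the hardware stack adjustment
       * uses.
       */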

      assert(stack_size < INT16_MAX);

      agx_block *start_block = agx_start_block(ctx);
      agx_builder _b = agx_init_builder(ctx, agx_before_block(start_block));
      agx_stack_adjust(&_b, stack_size);

      /* If we're going to execute multiple times, make sure we clean up after
       * ourselves, else the hardware faults.
       */
      if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_preamble &&
          ctx->key->fs.inside_sample_loop) {

         _b = agx_init_builder(ctx, agx_after_block(agx_end_block(ctx)));
         agx_stack_adjust(&_b, -stack_size);
      }

      if (ctx->is_preamble)
         out->preamble_scratch_size = stack_size;
      else
         out->scratch_size = stack_size;
   }

   if ((ctx->stage == MESA_SHADER_VERTEX ||
        ctx->stage == MESA_SHADER_TESS_EVAL) &&
       !impl->function->is_preamble && !ctx->key->secondary)
      agx_set_st_vary_final(ctx);

   agx_insert_waits(ctx);
   agx_opt_empty_else(ctx);
   agx_opt_break_if(ctx);
   agx_opt_jmp_none(ctx);

   if (ctx->any_quad_divergent_shuffle)
      agx_lower_divergent_shuffle(ctx);

   agx_lower_pseudo(ctx);

   if (agx_should_dump(nir, AGX_DBG_SHADERS))
      agx_print_shader(ctx, stdout);

   /* Upload constants before the binary code instead of after it, to reduce
    * the chance they get prefetched into the i-cache when we want them only in
    * the d-cache. Also helps fill the padding space for small preambles.
    */
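   /* When there is rodata, the resulting layout is: rodata at the next 4-byte
    * aligned offset, then the machine code at the next AGX_CODE_ALIGN
    * (128-byte) boundary.
    */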
   if (ctx->out->rodata.size_16 && !impl->function->is_preamble) {
      ctx->out->rodata.offset = agx_pad_binary(binary, 4);
      unsigned size_16 = ctx->out->rodata.size_16;

      uint16_t *ro = util_dynarray_grow(binary, uint16_t, size_16);
      memcpy(ro, ctx->rodata, size_16 * 2);
   }

   unsigned offset = agx_pad_binary(binary, AGX_CODE_ALIGN);
   agx_pack_binary(ctx, binary);

   unsigned nr_gprs = ctx->max_reg + 1;

   /* If the preamble uses scratch (due to spilling), we need to set maximal
    * GPRs. Do it here so the driver doesn't have to worry about it.
    */
   if (impl->function->is_preamble)
      out->nr_preamble_gprs = ctx->scratch_size_B ? 256 : nr_gprs;
   else
      out->nr_gprs = nr_gprs;

   /* Don't dump statistics for preambles, since they're not worth optimizing */
   if (!impl->function->is_preamble) {
      char *stats;
      int ret = agx_dump_stats(ctx, binary->size, &stats);

      if (ret >= 0) {
         if (agx_should_dump(nir, AGX_DBG_SHADERDB)) {
            fprintf(stderr, "SHADER-DB: %s - %s\n", nir->info.label ?: "",
                    stats);
         }

         if (debug)
            util_debug_message(debug, SHADER_INFO, "%s", stats);

         free(stats);
      }
   }

   ralloc_free(ctx);

   return offset;
}

void
agx_link_libagx(nir_shader *nir, const nir_shader *libagx)
{
   nir_link_shader_functions(nir, libagx);
   NIR_PASS(_, nir, nir_inline_functions);
   nir_remove_non_entrypoints(nir);
   NIR_PASS(_, nir, nir_opt_deref);
   NIR_PASS(_, nir, nir_lower_vars_to_ssa);
   NIR_PASS(_, nir, nir_remove_dead_derefs);
   NIR_PASS(_, nir, nir_remove_dead_variables,
            nir_var_function_temp | nir_var_shader_temp, NULL);
   NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
            nir_var_shader_temp | nir_var_function_temp,
            glsl_get_cl_type_size_align);
}

/*
 * The hardware frcp instruction is sometimes off by 1 ULP. For correctly
 * rounded frcp, a refinement step is required. This routine has been
 * exhaustively tested with a modified math_bruteforce.
 *
 * While Khronos APIs allow 2.5 ULP error for divides, nir_lower_idiv relies on
 * correctly rounded frcp. This is therefore load bearing for integer division
 * on all APIs.
 */
static nir_def *
libagx_frcp(nir_builder *b, nir_def *x)
{
   nir_def *u = nir_frcp(b, x);

   /* Do 1 Newton-Raphson refinement step.
    *
    * Define f(u) = xu - 1. Then f(u) = 0 iff u = 1/x. Newton's method gives:
    *
    *    u_2 = u - f(u) / f'(u) = u - (xu - 1) / x
    *
    * Our original guess is close, so we approximate (1 / x) by u:
    *
    *    u_2 = u - u(xu - 1) = u + u(1 - xu)
    *        = fma(fma(-x, u, 1), u, u)
    */
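   /* One step suffices: if u = (1/x)(1 + e) for a small relative error e,
    * then xu - 1 = e and u_2 = u(1 - e) = (1/x)(1 - e^2), so the error drops
    * quadratically (ignoring rounding of the two fmas).
    */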
   nir_def *one = nir_imm_float(b, 1.0);
   nir_def *u_2 = nir_ffma(b, nir_ffma(b, nir_fneg(b, x), u, one), u, u);

   /* If the original value was infinite, frcp will generate the correct zero.
    * However, the Newton-Raphson step would multiply 0 * Inf and get a NaN. So
    * skip the refinement step for infinite inputs. We do this backwards,
    * checking whether the refined result is NaN, since we can implement this
    * check in a single fcmpsel instruction. The other case where the refinement
    * is NaN is a NaN input, in which case skipping refinement is acceptable.
    */
   return nir_bcsel(b, nir_fisnan(b, u_2), u, u_2);
}

static bool
agx_nir_lower_fdiv(nir_builder *b, nir_alu_instr *alu, void *_)
{
   if (alu->op != nir_op_frcp || !alu->exact || alu->def.bit_size != 32)
      return false;

   b->cursor = nir_before_instr(&alu->instr);
   nir_def_replace(&alu->def, libagx_frcp(b, nir_ssa_for_alu_src(b, alu, 0)));
   return true;
}

/* Preprocess NIR independent of shader state */
void
agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx)
{
   NIR_PASS(_, nir, nir_lower_vars_to_ssa);

   /* Lower large arrays to scratch and small arrays to csel */
   NIR_PASS(_, nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
            glsl_get_natural_size_align_bytes, glsl_get_word_size_align_bytes);
   NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
   NIR_PASS(_, nir, nir_split_var_copies);
   NIR_PASS(_, nir, nir_lower_global_vars_to_local);
   NIR_PASS(_, nir, nir_lower_var_copies);

   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS(_, nir, agx_nir_lower_frag_sidefx);
   }

   /* Clean up deref gunk after lowering I/O */
   NIR_PASS(_, nir, nir_opt_dce);

   agx_link_libagx(nir, libagx);

   /* This must run before we lower away idiv in order to do anything, but
    * after lowering textures, since the cube map array lowering generates
    * division by 6.
    */
   NIR_PASS(_, nir, nir_opt_idiv_const, 16);

   nir_lower_idiv_options idiv_options = {
      .allow_fp16 = true,
   };

   NIR_PASS(_, nir, nir_lower_idiv, &idiv_options);
   NIR_PASS(_, nir, nir_lower_frexp);
   NIR_PASS(_, nir, nir_lower_alu);
   NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
   NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
   NIR_PASS(_, nir, nir_lower_flrp, 16 | 32 | 64, false);
   NIR_PASS(_, nir, agx_lower_sincos);
   NIR_PASS(_, nir, nir_shader_intrinsics_pass, agx_lower_front_face,
            nir_metadata_control_flow, NULL);
   NIR_PASS(_, nir, agx_nir_lower_subgroups);
   NIR_PASS(_, nir, nir_lower_phis_to_scalar, true);
   NIR_PASS(_, nir, nir_shader_alu_pass, agx_nir_lower_fdiv,
            nir_metadata_control_flow, NULL);

   /* After lowering, run through the standard suite of NIR optimizations. We
    * will run through the loop later, once we have the shader key, but if we
    * run now, that run will ideally be almost a no-op.
    */
   agx_optimize_loop_nir(nir);

   NIR_PASS(_, nir, nir_opt_deref);
   NIR_PASS(_, nir, nir_lower_vars_to_ssa);

   /* We've lowered away all variables. Remove them all for smaller shaders. */
   NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_all, NULL);
   nir->info.io_lowered = true;

   /* Move before lowering */
   nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
                               nir_move_load_input | nir_move_comparisons |
                               nir_move_copies | nir_move_load_ssbo |
                               nir_move_alu;

   NIR_PASS(_, nir, nir_opt_sink, move_all);
   NIR_PASS(_, nir, nir_opt_move, move_all);
   NIR_PASS(_, nir, agx_nir_lower_shared_bitsize);
}

void
agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
                       struct util_debug_callback *debug,
                       struct agx_shader_part *out)
{
   agx_compiler_debug = agx_get_compiler_debug();
   struct agx_shader_info *info = &out->info;

   struct util_dynarray binary;
   util_dynarray_init(&binary, NULL);

   memset(out, 0, sizeof *out);

   assert(nir->info.io_lowered &&
          "agx_preprocess_nir is called first, then the shader is specialized, "
          "then the specialized shader is compiled");

   /* If required, tag writes will be enabled by instruction selection */
   if (nir->info.stage == MESA_SHADER_FRAGMENT)
      info->tag_write_disable = !nir->info.writes_memory;

   NIR_PASS(_, nir, nir_lower_printf_buffer, LIBAGX_PRINTF_BUFFER_ADDRESS,
            LIBAGX_PRINTF_BUFFER_SIZE - 8);

   bool needs_libagx = true /* TODO: Optimize */;

   NIR_PASS(_, nir, nir_lower_frag_coord_to_pixel_coord);
   NIR_PASS(_, nir, nir_lower_vars_to_ssa);

   if (needs_libagx) {
      agx_link_libagx(nir, key->libagx);

      NIR_PASS(_, nir, nir_opt_deref);
      NIR_PASS(_, nir, nir_lower_vars_to_ssa);
      NIR_PASS(_, nir, nir_lower_explicit_io,
               nir_var_shader_temp | nir_var_function_temp |
                  nir_var_mem_shared | nir_var_mem_global,
               nir_address_format_62bit_generic);
   }

   /* Late sysval lowering creates large loads. Load lowering creates unpacks */
   nir_lower_mem_access_bit_sizes_options lower_mem_access_options = {
      .modes = nir_var_mem_ssbo | nir_var_mem_constant |
               nir_var_mem_task_payload | nir_var_shader_temp |
               nir_var_function_temp | nir_var_mem_global | nir_var_mem_shared,
      .callback = mem_access_size_align_cb,
   };
   NIR_PASS(_, nir, nir_lower_mem_access_bit_sizes, &lower_mem_access_options);

   /* Optimize scratch access */
   NIR_PASS(_, nir, nir_lower_scratch_to_var);
   NIR_PASS(_, nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
            glsl_get_natural_size_align_bytes,
            glsl_get_natural_size_align_bytes);
   NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);

   /* Cleanup 8-bit math before lowering */
   bool progress;
   do {
      progress = false;

      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_constant_folding);
      NIR_PASS(progress, nir, nir_opt_dce);
   } while (progress);

   NIR_PASS(_, nir, nir_lower_bit_size, lower_bit_size_callback, NULL);

   /* Late blend lowering creates vectors */
   NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
   NIR_PASS(_, nir, nir_lower_load_const_to_scalar);

   /* Late VBO lowering creates constant udiv instructions */
   NIR_PASS(_, nir, nir_opt_idiv_const, 16);

   NIR_PASS(_, nir, nir_opt_constant_folding);
   NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_load_from_texture_handle,
            nir_metadata_control_flow, NULL);

   info->push_count = key->reserved_preamble;
   agx_optimize_nir(nir, key->dev.soft_fault,
                    key->secondary ? NULL : &info->push_count);

   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      info->varyings.fs.nr_cf = key->fs.cf_base;
      assign_coefficient_regs(nir, &info->varyings.fs);
   }

   if (agx_should_dump(nir, AGX_DBG_SHADERS))
      nir_print_shader(nir, stdout);

   info->local_size = nir->info.shared_size;

   nir_foreach_function_with_impl(func, impl, nir) {
      unsigned offset =
         agx_compile_function_nir(nir, impl, key, debug, &binary, &out->info);

      if (func->is_preamble) {
         info->preamble_offset = offset;
         info->has_preamble = true;
      } else if (func->is_entrypoint) {
         info->main_offset = offset;
         info->main_size = binary.size - offset;
      } else {
         unreachable("General functions not yet supported");
      }
   }

   info->stage = nir->info.stage;

   /* Check these outside the stage check since nir->info.stage is the hardware
    * stage and these are read in the vertex *software* stage.
    */
   info->uses_draw_id =
      BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);

   info->uses_base_param =
      BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX) ||
      BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_VERTEX) ||
      BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE);

   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_TESS_EVAL) {
      info->nonzero_viewport = nir->info.outputs_written & VARYING_BIT_VIEWPORT;

      info->writes_layer_viewport =
         nir->info.outputs_written & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);

   } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      info->disable_tri_merging = nir->info.uses_wide_subgroup_intrinsics ||
                                  nir->info.fs.needs_quad_helper_invocations ||
                                  nir->info.writes_memory;

      /* Writing the sample mask requires tag writes */
      info->tag_write_disable &= !info->writes_sample_mask;

      /* Report a canonical depth layout. This happens at the end because the
       * sample mask lowering affects it.
       */
      enum gl_frag_depth_layout layout = nir->info.fs.depth_layout;

      if (!(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)))
         info->depth_layout = FRAG_DEPTH_LAYOUT_UNCHANGED;
      else if (layout == FRAG_DEPTH_LAYOUT_NONE)
         info->depth_layout = FRAG_DEPTH_LAYOUT_ANY;
      else
         info->depth_layout = layout;

      info->reads_tib = nir->info.fs.uses_fbfetch_output;
      info->early_fragment_tests = nir->info.fs.early_fragment_tests;
   } else if (nir->info.stage == MESA_SHADER_COMPUTE) {
      info->imageblock_stride = nir->info.cs.image_block_size_per_thread_agx;

      for (unsigned i = 0; i < 3; ++i) {
         info->workgroup_size[i] = nir->info.workgroup_size[i];
      }
   }

   out->binary = binary.data;
   info->binary_size = binary.size;
}