1 /*
2  * Copyright 2021 Alyssa Rosenzweig
3  * Copyright 2020 Collabora Ltd.
4  * Copyright 2016 Broadcom
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #include "agx_compile.h"
9 #include "asahi/clc/asahi_clc.h"
10 #include "asahi/layout/layout.h"
11 #include "compiler/nir/nir_builder.h"
12 #include "util/bitset.h"
13 #include "util/glheader.h"
14 #include "util/list.h"
15 #include "util/macros.h"
16 #include "util/u_debug.h"
17 #include "util/u_dynarray.h"
18 #include "agx_builder.h"
19 #include "agx_compiler.h"
20 #include "agx_debug.h"
21 #include "agx_nir.h"
22 #include "agx_opcodes.h"
23 #include "glsl_types.h"
24 #include "nir.h"
25 #include "nir_builtin_builder.h"
26 #include "nir_intrinsics.h"
27 #include "nir_intrinsics_indices.h"
28 #include "shader_enums.h"
29 
30 /* Cache-line align shader programs. This matches the prop compiler. */
31 #define AGX_CODE_ALIGN 0x80
32 
33 /* clang-format off */
34 static const struct debug_named_value agx_debug_options[] = {
35    {"shaders",   AGX_DBG_SHADERS,	"Dump shaders in NIR and AIR"},
36    {"shaderdb",  AGX_DBG_SHADERDB,	"Print statistics"},
37    {"verbose",   AGX_DBG_VERBOSE,	"Disassemble verbosely"},
38    {"internal",  AGX_DBG_INTERNAL,	"Dump even internal shaders"},
39    {"novalidate",AGX_DBG_NOVALIDATE,"Skip IR validation in debug builds"},
40    {"noopt",     AGX_DBG_NOOPT,     "Disable backend optimizations"},
41    {"wait",      AGX_DBG_WAIT,      "Wait after all async instructions"},
42    {"nopreamble",AGX_DBG_NOPREAMBLE,"Do not use shader preambles"},
43    {"demand",    AGX_DBG_DEMAND,    "Bound tightly to register demand"},
44    {"nosched",   AGX_DBG_NOSCHED,   "Do not schedule the shader"},
45    {"spill",     AGX_DBG_SPILL,     "Spill (almost) everything"},
46    {"nopromote", AGX_DBG_NOPROMOTE, "Do not promote constants to uniforms"},
47    DEBUG_NAMED_VALUE_END
48 };
49 /* clang-format on */
50 
51 DEBUG_GET_ONCE_FLAGS_OPTION(agx_compiler_debug, "AGX_MESA_DEBUG",
52                             agx_debug_options, 0)
53 
54 int agx_compiler_debug = 0;
55 
56 /*
57  * Pad binary to a given alignment and return aligned offset into the binary.
58  */
59 static unsigned
60 agx_pad_binary(struct util_dynarray *dyn, uint32_t align)
61 {
62    if (dyn->size % align) {
63       unsigned ngrow = align - (dyn->size % align);
64       memset(util_dynarray_grow_bytes(dyn, ngrow, 1), 0, ngrow);
65    }
66 
67    assert((dyn->size % align) == 0);
68    return dyn->size;
69 }
70 
71 uint64_t
72 agx_get_compiler_debug(void)
73 {
74    return debug_get_option_agx_compiler_debug();
75 }
76 
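/*
 * Preload a hardware register into an SSA value at the start of the shader,
 * caching the result per register so repeated queries (e.g. for the vertex ID)
 * reuse a single preload instead of emitting one per use.
 */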
77 static agx_index
78 agx_cached_preload(agx_context *ctx, unsigned base, enum agx_size size)
79 {
80    if (agx_is_null(ctx->preloaded[base])) {
81       agx_block *block = agx_start_block(ctx);
82       agx_builder b = agx_init_builder(ctx, agx_before_block(block));
83       ctx->preloaded[base] = agx_preload(&b, agx_register(base, size));
84    }
85 
86    return ctx->preloaded[base];
87 }
88 
89 static agx_index
90 agx_tess_coord_x(agx_builder *b)
91 {
92    return agx_cached_preload(b->shader, 4, AGX_SIZE_32);
93 }
94 
95 static agx_index
96 agx_tess_coord_y(agx_builder *b)
97 {
98    return agx_cached_preload(b->shader, 6, AGX_SIZE_32);
99 }
100 
101 static agx_index
102 agx_vertex_id(agx_builder *b)
103 {
104    return agx_cached_preload(b->shader, 10, AGX_SIZE_32);
105 }
106 
107 static agx_index
108 agx_instance_id(agx_builder *b)
109 {
110    return agx_cached_preload(b->shader, 12, AGX_SIZE_32);
111 }
112 
113 #define VARYING_NUM_COMPONENTS (VARYING_SLOT_MAX * 4)
114 
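/*
 * Per-interpolation-mode masks of the varying components read by the fragment
 * shader, one bit per (slot, component) pair. Gathered by gather_cf below and
 * consumed by assign_coefficient_regs.
 */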
115 struct coefficient_info {
116    BITSET_DECLARE(smooth, VARYING_NUM_COMPONENTS);
117    BITSET_DECLARE(flat, VARYING_NUM_COMPONENTS);
118    BITSET_DECLARE(noperspective, VARYING_NUM_COMPONENTS);
119 };
120 
121 static BITSET_WORD *
122 bitset_for_interp(struct coefficient_info *info, enum glsl_interp_mode mode)
123 {
124    /* clang-format off */
125    switch (mode) {
126    case INTERP_MODE_NONE:
127    case INTERP_MODE_SMOOTH:         return info->smooth;
128    case INTERP_MODE_NOPERSPECTIVE:  return info->noperspective;
129    case INTERP_MODE_FLAT:           return info->flat;
130    default:                         unreachable("invalid interp mode");
131    }
132    /* clang-format on */
133 }
134 
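/*
 * Intrinsics-pass callback: record which varying components are read and with
 * which interpolation mode, so coefficient registers can be assigned up front.
 */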
135 static bool
136 gather_cf(nir_builder *b, nir_intrinsic_instr *intr, void *data)
137 {
138    /* First handle frag coord loads */
139    struct coefficient_info *info = data;
140    if (intr->intrinsic == nir_intrinsic_load_frag_coord_zw) {
141       BITSET_SET(info->noperspective,
142                  VARYING_SLOT_POS + nir_intrinsic_component(intr));
143       return false;
144    }
145 
146    /* Look for input loads and grab the instruction with the interp mode */
147    nir_intrinsic_instr *bary;
148    unsigned nr = 1;
149 
150    if (intr->intrinsic == nir_intrinsic_load_coefficients_agx) {
151       bary = intr;
152       /* Always load a scalar */
153    } else if (intr->intrinsic == nir_intrinsic_load_interpolated_input) {
154       bary = nir_src_as_intrinsic(intr->src[0]);
155       nr = intr->num_components;
156 
157       /* Perspective interpolation internally reads W */
158       if (nir_intrinsic_interp_mode(bary) != INTERP_MODE_NOPERSPECTIVE)
159          BITSET_SET(info->noperspective, VARYING_SLOT_POS + 3);
160    } else {
161       return false;
162    }
163 
164    BITSET_WORD *set = bitset_for_interp(data, nir_intrinsic_interp_mode(bary));
165    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
166    nir_src *offset = nir_get_io_offset_src(intr);
167 
168    /* Mark the exact range for direct loads to minimize CF registers, but mark a
169     * conservative bounding range for indirect array access.
170     */
171    if (nir_src_is_const(*offset)) {
172       unsigned location = sem.location + nir_src_as_uint(*offset);
173       unsigned start_comp = (location * 4) + nir_intrinsic_component(intr);
174 
175       BITSET_SET_RANGE(set, start_comp, start_comp + nr - 1);
176    } else {
177       unsigned start_comp = (sem.location * 4) + nir_intrinsic_component(intr);
178       bool compact = sem.location == VARYING_SLOT_CLIP_DIST0 ||
179                      sem.location == VARYING_SLOT_CLIP_DIST1;
180       unsigned stride = compact ? 1 : 4;
181 
182       /* For now we have to assign CF for the whole vec4 to make indirect
183        * indexing work. This could be optimized later.
184        */
185       nr = stride;
186 
187       for (unsigned i = 0; i < sem.num_slots; ++i) {
188          BITSET_SET_RANGE(set, start_comp + (i * stride),
189                           start_comp + (i * stride) + nr - 1);
190       }
191    }
192 
193    return false;
194 }
195 
196 /*
197  * We assign all coefficient registers up front to ensure we have a consistent
198  * layout required for indirects to work.
199  */
200 static void
201 assign_coefficient_regs(nir_shader *nir, struct agx_varyings_fs *var)
202 {
203    struct coefficient_info info = {0};
204    nir_shader_intrinsics_pass(nir, gather_cf, nir_metadata_all, &info);
205 
206    /* W */
207    if (BITSET_TEST(info.noperspective, VARYING_SLOT_POS + 3)) {
208       var->bindings[var->nr_bindings++] = (struct agx_cf_binding){
209          .cf_base = var->nr_cf++,
210          .slot = VARYING_SLOT_POS,
211          .offset = 3,
212          .count = 1,
213          .smooth = true,
214       };
215    }
216 
217    /* Z */
218    if (BITSET_TEST(info.noperspective, VARYING_SLOT_POS + 2)) {
219       var->bindings[var->nr_bindings++] = (struct agx_cf_binding){
220          .cf_base = var->nr_cf++,
221          .slot = VARYING_SLOT_POS,
222          .offset = 2,
223          .count = 1,
224          .smooth = true,
225       };
226 
227       var->reads_z = true;
228    }
229 
230    static_assert(VARYING_SLOT_POS == 0, "special and handled first");
231 
232    for (unsigned i = VARYING_SLOT_POS + 1; i < VARYING_SLOT_MAX; ++i) {
233       bool smooth = BITSET_TEST_RANGE(info.smooth, i * 4, (i * 4) + 3);
234       bool flat = BITSET_TEST_RANGE(info.flat, i * 4, (i * 4) + 3);
235       bool noperspective =
236          BITSET_TEST_RANGE(info.noperspective, i * 4, (i * 4) + 3);
237 
238       if (!(smooth || flat || noperspective))
239          continue;
240 
241       /* From the GLSL 4.60 spec ("Input Layout Qualifiers"):
242        *
243        *    when location aliasing, the aliases sharing the location must have
244        *    the same underlying numerical type and bit width (floating-point or
245        *    integer, 32-bit versus 64-bit, etc.) and the same auxiliary storage
246        *    and interpolation qualification.
247        *
248        * SPIR-V should obey this as well although the spec text is muddier.
249        */
250       assert((smooth + flat + noperspective) == 1 &&
251              "slots must have consistent interpolation");
252 
253       BITSET_WORD *set = smooth ? info.smooth
254                          : flat ? info.flat
255                                 : info.noperspective;
256 
257       /* Find the start offset */
258       unsigned offset = 0;
259       for (offset = 0; offset < 4 && !BITSET_TEST(set, (i * 4) + offset);
260            ++offset)
261          ;
262 
263       /* Find the end offset. TODO: Do we ever need to split into two bindings
264        * to handle e.g. x_zw read masks?
265        */
266       unsigned count = 0;
267       for (unsigned c = offset; c < 4; ++c) {
268          if (BITSET_TEST(set, (i * 4) + c))
269             count = c - offset + 1;
270       }
271       assert(count >= 1 && (count + offset) <= 4);
272 
273       var->bindings[var->nr_bindings++] = (struct agx_cf_binding){
274          .cf_base = var->nr_cf,
275          .slot = i,
276          .offset = offset,
277          .count = count,
278          .smooth = !flat,
279          .perspective = smooth,
280       };
281 
282       var->nr_cf += count;
283    }
284 }
285 
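/*
 * Look up the coefficient register previously assigned to a (slot, offset)
 * pair by assign_coefficient_regs, returned as an immediate CF index.
 */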
286 static agx_index
287 agx_get_cf(agx_context *ctx, gl_varying_slot slot, unsigned offset)
288 {
289    struct agx_varyings_fs *varyings = &ctx->out->varyings.fs;
290 
291    /* We already have an appropriate binding, find it */
292    for (unsigned b = 0; b < varyings->nr_bindings; ++b) {
293       if (varyings->bindings[b].slot == slot &&
294           (slot != VARYING_SLOT_POS ||
295            offset == varyings->bindings[b].offset)) {
296 
297          signed cf_offset = offset - varyings->bindings[b].offset;
298          assert(cf_offset >= 0);
299 
300          return agx_immediate(varyings->bindings[b].cf_base + cf_offset);
301       }
302    }
303 
304    unreachable("all coefficient registers preassigned");
305 }
306 
307 /* Builds a 64-bit hash table key for an index */
308 static uint64_t
309 agx_index_to_key(agx_index idx)
310 {
311    STATIC_ASSERT(sizeof(idx) <= sizeof(uint64_t));
312 
313    uint64_t key = 0;
314    memcpy(&key, &idx, sizeof(idx));
315    return key;
316 }
317 
318 /*
319  * Extract a single channel out of a vector source. We split vectors with
320  * p_split so we can use the split components directly, without emitting a
321  * machine instruction. This has advantages of RA, as the split can usually be
322  * optimized away.
323  */
324 static agx_index
325 agx_emit_extract(agx_builder *b, agx_index vec, unsigned channel)
326 {
327    agx_index *components = _mesa_hash_table_u64_search(b->shader->allocated_vec,
328                                                        agx_index_to_key(vec));
329 
330    assert(components != NULL && "missing agx_emit_collect_to");
331 
332    return components[channel];
333 }
334 
335 static agx_index
336 agx_extract_nir_src(agx_builder *b, nir_src src, unsigned channel)
337 {
338    agx_index idx = agx_src_index(&src);
339 
340    /* We only deal with scalars, extract a single scalar if needed */
341    if (nir_src_num_components(src) > 1)
342       return agx_emit_extract(b, idx, channel);
343    else
344       return idx;
345 }
346 
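/*
 * Remember which scalar sources make up a vector, keyed by the vector's index,
 * so later extracts resolve to the original scalars without emitting any
 * instructions. For example, after
 *
 *    agx_index v = agx_vec2(b, x, y);
 *
 * a subsequent agx_emit_extract(b, v, 1) simply returns y.
 */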
347 static void
348 agx_cache_collect(agx_builder *b, agx_index dst, unsigned nr_srcs,
349                   agx_index *srcs)
350 {
351    /* Lifetime of a hash table entry has to be at least as long as the table */
352    agx_index *channels = ralloc_array(b->shader, agx_index, nr_srcs);
353 
354    for (unsigned i = 0; i < nr_srcs; ++i)
355       channels[i] = srcs[i];
356 
357    _mesa_hash_table_u64_insert(b->shader->allocated_vec, agx_index_to_key(dst),
358                                channels);
359 }
360 
361 /*
362  * Combine multiple scalars into a vector destination. This corresponds to
363  * collect, lowered to moves (a shuffle in general) after register allocation.
364  *
365  * To optimize vector extractions, we record the individual channels
366  */
367 static agx_instr *
368 agx_emit_collect_to(agx_builder *b, agx_index dst, unsigned nr_srcs,
369                     agx_index *srcs)
370 {
371    agx_cache_collect(b, dst, nr_srcs, srcs);
372 
373    if (nr_srcs == 1)
374       return agx_mov_to(b, dst, srcs[0]);
375 
376    agx_instr *I = agx_collect_to(b, dst, nr_srcs);
377 
378    agx_foreach_src(I, s)
379       I->src[s] = srcs[s];
380 
381    return I;
382 }
383 
384 static agx_index
385 agx_emit_collect(agx_builder *b, unsigned nr_srcs, agx_index *srcs)
386 {
387    agx_index dst = agx_vec_temp(b->shader, srcs[0].size, nr_srcs);
388    agx_emit_collect_to(b, dst, nr_srcs, srcs);
389    return dst;
390 }
391 
392 static agx_index
393 agx_vec2(agx_builder *b, agx_index s0, agx_index s1)
394 {
395    return agx_emit_collect(b, 2, (agx_index[]){s0, s1});
396 }
397 
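/*
 * Widen a scalar 16-bit value to occupy a full 32-bit register by collecting
 * it with an undefined 16-bit high half.
 */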
398 static agx_index
399 agx_pad_to_32(agx_builder *b, agx_index s)
400 {
401    assert(s.size == AGX_SIZE_16);
402    assert(agx_channels(s) == 1);
403 
404    agx_index srcs[2] = {s, agx_undef(AGX_SIZE_16)};
405    agx_index dst = agx_vec_temp(b->shader, AGX_SIZE_32, 1);
406    agx_emit_collect_to(b, dst, 2, srcs);
407    return dst;
408 }
409 
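/*
 * Rebuild an AGX vector from a NIR vector source by extracting each channel
 * and collecting them again, for instructions that consume a whole vector.
 */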
410 static agx_index
411 agx_recollect_vector(agx_builder *b, nir_src vec)
412 {
413    agx_index comps[4];
414    unsigned nr = nir_src_num_components(vec);
415 
416    for (unsigned i = 0; i < nr; ++i)
417       comps[i] = agx_extract_nir_src(b, vec, i);
418 
419    return agx_emit_collect(b, nr, comps);
420 }
421 
422 /*
423  * Extract the lower or upper N-bits from a (2*N)-bit quantity. We use a split
424  * without null destinations to let us CSE (and coalesce) the splits when both x
425  * and y are split.
426  */
427 static agx_instr *
428 agx_subdivide_to(agx_builder *b, agx_index dst, agx_index s0, unsigned comp)
429 {
430    assert((s0.size == (dst.size + 1)) && "only 2x subdivide handled");
431    assert((comp == 0 || comp == 1) && "too many components");
432 
433    /* Handle immediates specially so we don't have to constant fold splits. */
434    if (s0.type == AGX_INDEX_IMMEDIATE) {
435       unsigned bits = 16 * agx_size_align_16(dst.size);
436       return agx_mov_imm_to(b, dst, (s0.value >> bits) & BITFIELD64_MASK(bits));
437    }
438 
439    agx_instr *split = agx_split(b, 2, s0);
440    split->dest[comp] = dst;
441    split->dest[1 - comp] = agx_temp(b->shader, dst.size);
442    return split;
443 }
444 
445 void
446 agx_block_add_successor(agx_block *block, agx_block *successor)
447 {
448    assert(block != NULL && successor != NULL);
449 
450    /* Cull impossible edges */
451    if (block->unconditional_jumps)
452       return;
453 
454    for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
455       if (block->successors[i]) {
456          if (block->successors[i] == successor)
457             return;
458          else
459             continue;
460       }
461 
462       block->successors[i] = successor;
463       util_dynarray_append(&successor->predecessors, agx_block *, block);
464       return;
465    }
466 
467    unreachable("Too many successors");
468 }
469 
470 /*
471  * Splits an n-component vector (vec) into n scalar destinations (dests) using a
472  * split pseudo-instruction.
473  *
474  * Pre-condition: dests is filled with agx_null().
475  */
476 static void
477 agx_emit_split(agx_builder *b, agx_index *dests, agx_index vec, unsigned n)
478 {
479    agx_instr *I = agx_split(b, n, vec);
480 
481    agx_foreach_dest(I, d) {
482       dests[d] = agx_temp(b->shader, vec.size);
483       I->dest[d] = dests[d];
484    }
485 }
486 
487 static void
488 agx_emit_cached_split(agx_builder *b, agx_index vec, unsigned n)
489 {
490    agx_index dests[4] = {agx_null(), agx_null(), agx_null(), agx_null()};
491    agx_emit_split(b, dests, vec, n);
492    agx_cache_collect(b, vec, n, dests);
493 }
494 
495 static void
496 agx_emit_load_const(agx_builder *b, nir_load_const_instr *instr)
497 {
498    /* Ensure we've been scalarized and bit size lowered */
499    unsigned bit_size = instr->def.bit_size;
500    assert(instr->def.num_components == 1);
501 
502    /* Emit move, later passes can inline/push if useful */
503    agx_mov_imm_to(b, agx_def_index(&instr->def),
504                   nir_const_value_as_uint(instr->value[0], bit_size));
505 }
506 
507 /*
508  * Implement mul_high of 32-bit sources by doing a 32x32->64-bit multiply and
509  * extracting only the high word.
510  */
511 static agx_instr *
512 agx_mul_high_to(agx_builder *b, agx_index dst, agx_index P, agx_index Q,
513                 bool is_signed)
514 {
515    assert(P.size == Q.size && "source sizes must match");
516    assert(P.size == dst.size && "dest size must match");
517    assert(P.size != AGX_SIZE_64 && "64x64 multiply should have been lowered");
518 
519    static_assert(AGX_SIZE_64 == (AGX_SIZE_32 + 1), "enum wrong");
520    static_assert(AGX_SIZE_32 == (AGX_SIZE_16 + 1), "enum wrong");
521 
522    if (!is_signed) {
523       P = agx_abs(P);
524       Q = agx_abs(Q);
525    }
526 
527    agx_index product = agx_temp(b->shader, P.size + 1);
528    agx_imad_to(b, product, P, Q, agx_zero(), 0);
529 
530    return agx_subdivide_to(b, dst, product, 1);
531 }
532 
533 static enum agx_format
534 agx_format_for_pipe(enum pipe_format format)
535 {
536 #define CASE(x)                                                                \
537    if (format == (enum pipe_format)AIL_ISA_FORMAT_##x)                         \
538       return AGX_FORMAT_##x;
539 
540    CASE(I8);
541    CASE(I16);
542    CASE(I32);
543    CASE(F16);
544    CASE(U8NORM);
545    CASE(S8NORM);
546    CASE(U16NORM);
547    CASE(S16NORM);
548    CASE(RGB10A2);
549    CASE(SRGBA8);
550    CASE(RG11B10F);
551    CASE(RGB9E5);
552 
553 #undef CASE
554    unreachable("Invalid format");
555 }
556 
557 static agx_index
558 cf_for_intrinsic(agx_builder *b, nir_intrinsic_instr *intr)
559 {
560    /* Determine the base location, taking into account a constant offset */
561    unsigned location = nir_intrinsic_io_semantics(intr).location;
562    bool compact = location == VARYING_SLOT_CLIP_DIST0 ||
563                   location == VARYING_SLOT_CLIP_DIST1;
564 
565    nir_src *offset = nir_get_io_offset_src(intr);
566    if (nir_src_is_const(*offset)) {
567       /* XXX: NIR is broken and uses constant offsets in slots but dynamic
568        * offsets in scalars for compact varyings. This needs to be fixed
569        * upstream.
570        */
571       location += nir_src_as_uint(*offset);
572    }
573 
574    agx_index I = agx_get_cf(b->shader, location, nir_intrinsic_component(intr));
575 
576    /* If we have a non-constant offset, we add it to the CF. Offsets are in
577     * vec4 slots (unless we're compact) but the CF is in components, so we need
578     * to shift the offset by 2 before adding.
579     */
580    if (!nir_src_is_const(*offset)) {
581       I = agx_iadd(b, I, agx_src_index(offset), compact ? 0 : 2);
582    }
583 
584    return I;
585 }
586 
587 static enum agx_interpolation
588 agx_interp_for_bary(nir_intrinsic_instr *bary, agx_index *sample_index)
589 {
590    switch (bary->intrinsic) {
591    case nir_intrinsic_load_barycentric_pixel:
592       return AGX_INTERPOLATION_CENTER;
593 
594    case nir_intrinsic_load_barycentric_centroid:
595       return AGX_INTERPOLATION_CENTROID;
596 
597    case nir_intrinsic_load_barycentric_at_sample:
598       *sample_index = agx_src_index(&bary->src[0]);
599       return AGX_INTERPOLATION_SAMPLE;
600 
601    default:
602       unreachable("should have been lowered");
603    }
604 }
605 
606 static void
607 agx_emit_load_vary(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
608 {
609    ASSERTED unsigned components = instr->num_components;
610    nir_intrinsic_instr *bary = nir_src_as_intrinsic(instr->src[0]);
611 
612    assert(components >= 1 && components <= 4);
613 
614    agx_index sample_index = agx_zero();
615    enum agx_interpolation interp = agx_interp_for_bary(bary, &sample_index);
616 
617    bool perspective =
618       nir_intrinsic_interp_mode(bary) != INTERP_MODE_NOPERSPECTIVE;
619 
620    agx_index I = cf_for_intrinsic(b, instr);
621 
622    /* For perspective interpolation, we project (multiply by 1/W) */
623    if (perspective) {
624       agx_index J = agx_get_cf(b->shader, VARYING_SLOT_POS, 3);
625       agx_iterproj_to(b, dest, I, J, sample_index, components, interp);
626    } else {
627       agx_iter_to(b, dest, I, sample_index, components, interp);
628    }
629 
630    agx_emit_cached_split(b, dest, components);
631 }
632 
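/*
 * Wait on the given mask of tilebuffer fences before accessing local pixels.
 * Fences already waited on in straight-line code (tracked in
 * already_pixel_waited) are skipped to avoid redundant wait_pix instructions.
 */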
633 static void
634 agx_wait_pixel_mask(agx_builder *b, uint32_t mask)
635 {
636    /* Background programs do not need to wait as they are the eldest pixels */
637    if (b->shader->key->fs.ignore_tib_dependencies) {
638       assert(b->shader->nir->info.internal);
639       return;
640    }
641 
642    /* No need to wait twice on a fence */
643    mask &= ~b->shader->already_pixel_waited;
644    if (mask == 0) {
645       return;
646    }
647 
648    agx_wait_pix(b, mask);
649 
650    /* Only mark the fence as waited if we're not in control flow. Eventually we
651     * should do something smarter with a dataflow.
652     */
653    if (b->shader->total_nesting == 0) {
654       b->shader->already_pixel_waited |= mask;
655    }
656 }
657 
658 static agx_instr *
659 agx_emit_local_store_pixel(agx_builder *b, nir_intrinsic_instr *instr)
660 {
661    bool explicit = nir_intrinsic_explicit_coord(instr);
662 
663    /* TODO: Reverse-engineer interactions with MRT */
664    if (b->shader->stage == MESA_SHADER_FRAGMENT) {
665       agx_wait_pixel_mask(b, 0xC);
666    }
667 
668    /* Compact the registers according to the mask */
669    agx_index compacted[4] = {agx_null()};
670 
671    unsigned compact_count = 0;
672    u_foreach_bit(i, nir_intrinsic_write_mask(instr)) {
673       compacted[compact_count++] = agx_extract_nir_src(b, instr->src[0], i);
674    }
675 
676    agx_index collected = agx_emit_collect(b, compact_count, compacted);
677    agx_index coords = explicit ? agx_src_index(&instr->src[2]) : agx_null();
678 
679    b->shader->out->tag_write_disable = false;
680    return agx_st_tile(b, collected, agx_src_index(&instr->src[1]), coords,
681                       agx_format_for_pipe(nir_intrinsic_format(instr)),
682                       nir_intrinsic_write_mask(instr),
683                       nir_intrinsic_base(instr), explicit);
684 }
685 
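/*
 * Emit a depth/stencil write. The intrinsic's base encodes which of depth
 * (bit 0) and stencil (bit 1) are written; depth is 32-bit, stencil is 16-bit,
 * and when both are written they are packed together for zs_emit.
 */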
686 static agx_instr *
687 agx_emit_store_zs(agx_builder *b, nir_intrinsic_instr *instr)
688 {
689    unsigned base = nir_intrinsic_base(instr);
690    bool write_z = base & 1;
691    bool write_s = base & 2;
692 
693    agx_index z = agx_src_index(&instr->src[1]);
694    agx_index s = agx_src_index(&instr->src[2]);
695 
696    assert(!write_z || z.size == AGX_SIZE_32);
697    assert(!write_s || s.size == AGX_SIZE_16);
698 
699    if (write_z && write_s) {
700       agx_index u2u32 = agx_temp(b->shader, AGX_SIZE_32);
701       agx_mov_to(b, u2u32, s);
702       s = u2u32;
703    }
704 
705    agx_index zs = (write_z && write_s) ? agx_vec2(b, z, s) : write_z ? z : s;
706 
707    /* Not necessarily a sample mask but overlapping hw mechanism... Should
708     * maybe rename this flag to something more general.
709     */
710    b->shader->out->writes_sample_mask = true;
711 
712    agx_wait_pixel_mask(b, 0x1);
713    return agx_zs_emit(b, agx_src_index(&instr->src[0]), zs, base);
714 }
715 
716 static void
717 agx_emit_local_load_pixel(agx_builder *b, agx_index dest,
718                           nir_intrinsic_instr *instr)
719 {
720    agx_wait_pixel_mask(b, 0x8);
721 
722    unsigned nr_comps = instr->def.num_components;
723    agx_ld_tile_to(b, dest, agx_src_index(&instr->src[0]), agx_null(),
724                   agx_format_for_pipe(nir_intrinsic_format(instr)),
725                   BITFIELD_MASK(nr_comps), nir_intrinsic_base(instr), false);
726    agx_emit_cached_split(b, dest, nr_comps);
727 }
728 
729 static bool
730 nir_is_coherent(nir_intrinsic_instr *instr)
731 {
732    return nir_intrinsic_access(instr) & (ACCESS_COHERENT | ACCESS_VOLATILE);
733 }
734 
735 static void
736 agx_emit_load(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
737 {
738    agx_index addr = agx_src_index(&instr->src[0]);
739    agx_index offset = agx_src_index(&instr->src[1]);
740    enum agx_format fmt = agx_format_for_pipe(nir_intrinsic_format(instr));
741    unsigned shift = nir_intrinsic_base(instr);
742 
743    /* Zero-extend offset if we're not sign-extending */
744    if (!nir_intrinsic_sign_extend(instr))
745       offset = agx_abs(offset);
746 
747    agx_device_load_to(b, dest, addr, offset, fmt,
748                       BITFIELD_MASK(instr->def.num_components), shift,
749                       nir_is_coherent(instr));
750    agx_emit_cached_split(b, dest, instr->def.num_components);
751 }
752 
753 static void
754 agx_emit_store(agx_builder *b, nir_intrinsic_instr *instr)
755 {
756    agx_index addr = agx_src_index(&instr->src[1]);
757    agx_index offset = agx_src_index(&instr->src[2]);
758    enum agx_format fmt = agx_format_for_pipe(nir_intrinsic_format(instr));
759    unsigned shift = nir_intrinsic_base(instr);
760 
761    /* Zero-extend offset if we're not sign-extending */
762    if (!nir_intrinsic_sign_extend(instr))
763       offset = agx_abs(offset);
764 
765    agx_device_store(b, agx_recollect_vector(b, instr->src[0]), addr, offset,
766                     fmt, BITFIELD_MASK(nir_src_num_components(instr->src[0])),
767                     shift, nir_is_coherent(instr));
768 }
769 
770 /* Preambles write directly to uniform registers, so move from uniform to GPR */
771 static agx_instr *
772 agx_emit_load_preamble(agx_builder *b, agx_index dst,
773                        nir_intrinsic_instr *instr)
774 {
775    agx_index srcs[4] = {agx_null()};
776    unsigned dim = instr->def.num_components;
777    assert(dim <= ARRAY_SIZE(srcs) && "shouldn't see larger vectors");
778 
779    unsigned base = nir_intrinsic_base(instr);
780    unsigned stride = agx_size_align_16(dst.size);
781 
782    for (unsigned i = 0; i < dim; ++i)
783       srcs[i] = agx_uniform(base + i * stride, dst.size);
784 
785    return agx_emit_collect_to(b, dst, dim, srcs);
786 }
787 
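/*
 * Store a value computed in the preamble to uniform registers, splitting the
 * source into chunks of at most four 16-bit words per uniform_store.
 */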
788 static agx_instr *
789 agx_emit_store_preamble(agx_builder *b, nir_intrinsic_instr *instr)
790 {
791    agx_index vec = agx_src_index(&instr->src[0]);
792    unsigned base = nir_intrinsic_base(instr);
793    unsigned stride = agx_size_align_16(vec.size);
794    unsigned nr = nir_src_num_components(instr->src[0]);
795 
796    for (unsigned i = 0; i < nr; i += (4 / stride)) {
797       agx_index data[4] = {0};
798       unsigned count = MIN2(4 / stride, nr - i);
799 
800       for (unsigned c = 0; c < count; ++c) {
801          data[c] = agx_extract_nir_src(b, instr->src[0], i + c);
802       }
803 
804       agx_uniform_store(b, agx_emit_collect(b, count, data),
805                         agx_immediate(base + i * stride), BITFIELD_MASK(count));
806    }
807 
808    return NULL;
809 }
810 
811 static enum agx_dim
812 agx_tex_dim(enum glsl_sampler_dim dim, bool array)
813 {
814    switch (dim) {
815    case GLSL_SAMPLER_DIM_1D:
816       return array ? AGX_DIM_1D_ARRAY : AGX_DIM_1D;
817 
818    case GLSL_SAMPLER_DIM_2D:
819    case GLSL_SAMPLER_DIM_RECT:
820    case GLSL_SAMPLER_DIM_EXTERNAL:
821       return array ? AGX_DIM_2D_ARRAY : AGX_DIM_2D;
822 
823    case GLSL_SAMPLER_DIM_MS:
824       return array ? AGX_DIM_2D_MS_ARRAY : AGX_DIM_2D_MS;
825 
826    case GLSL_SAMPLER_DIM_3D:
827       assert(!array && "3D arrays unsupported");
828       return AGX_DIM_3D;
829 
830    case GLSL_SAMPLER_DIM_CUBE:
831       return array ? AGX_DIM_CUBE_ARRAY : AGX_DIM_CUBE;
832 
833    case GLSL_SAMPLER_DIM_BUF:
834       unreachable("Buffer textures should have been lowered");
835 
836    default:
837       unreachable("Invalid sampler dim\n");
838    }
839 }
840 
841 /*
842  * In the hardware, bindless texture sources are specified as a 64-bit uniform
843  * base address summed with a 32-bit register index. In NIR, we model this as a
844  * vec2, where the first source is the (constant) uniform register number and
845  * the second source is the (dynamic) byte offset.
846  */
847 static agx_index
848 agx_translate_bindless_handle(agx_builder *b, nir_src *handle, agx_index *base)
849 {
850    nir_scalar base_scalar = nir_scalar_resolved(handle->ssa, 0);
851    assert(nir_scalar_is_const(base_scalar) && "base must be constant");
852 
853    unsigned base_uint = nir_scalar_as_uint(base_scalar);
854    *base = agx_uniform(base_uint, AGX_SIZE_64);
855 
856    return agx_emit_extract(b, agx_src_index(handle), 1);
857 }
858 
859 static agx_instr *
860 agx_emit_block_image_store(agx_builder *b, nir_intrinsic_instr *instr)
861 {
862    agx_index offset = agx_src_index(&instr->src[1]);
863    agx_index coords = agx_src_index(&instr->src[2]);
864    enum agx_format format = agx_format_for_pipe(nir_intrinsic_format(instr));
865 
866    bool ms = nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_MS;
867    bool array = nir_intrinsic_image_array(instr);
868    enum agx_dim dim = agx_tex_dim(nir_intrinsic_image_dim(instr), array);
869    bool explicit = nir_intrinsic_explicit_coord(instr);
870 
871    /* 32-bit source physically, 16-bit in NIR, top half ignored but needed
872     * logically to ensure alignment.
873     */
874    offset = agx_pad_to_32(b, offset);
875 
876    /* Modified coordinate descriptor */
877    if (!explicit) {
878       if (array) {
879          agx_index layer = coords;
880          coords = agx_temp(b->shader, AGX_SIZE_32);
881          agx_emit_collect_to(b, coords, 2,
882                              (agx_index[]){
883                                 ms ? agx_mov_imm(b, 16, 0) : layer,
884                                 ms ? layer : agx_undef(AGX_SIZE_16),
885                              });
886       } else {
887          coords = agx_null();
888       }
889    }
890 
891    agx_index base, index;
892    if (instr->intrinsic == nir_intrinsic_bindless_image_store_block_agx) {
893       index = agx_translate_bindless_handle(b, &instr->src[0], &base);
894 
895       assert(base.size == AGX_SIZE_64);
896       assert(index.size == AGX_SIZE_32);
897    } else {
898       base = agx_zero();
899       index = agx_src_index(&instr->src[0]);
900 
901       assert(index.size == AGX_SIZE_16);
902    }
903 
904    // XXX: how does this possibly work
905    if (format == AGX_FORMAT_F16)
906       format = AGX_FORMAT_I16;
907 
908    return agx_block_image_store(b, base, index, offset, coords, format, dim,
909                                 explicit);
910 }
911 
912 static agx_instr *
913 agx_load_compute_dimension(agx_builder *b, agx_index dst,
914                            nir_intrinsic_instr *instr, enum agx_sr base)
915 {
916    unsigned dim = instr->def.num_components;
917    unsigned size = instr->def.bit_size;
918    assert(size == 16 || size == 32);
919 
920    agx_index srcs[] = {
921       agx_get_sr(b, size, base + 0),
922       agx_get_sr(b, size, base + 1),
923       agx_get_sr(b, size, base + 2),
924    };
925 
926    return agx_emit_collect_to(b, dst, dim, srcs);
927 }
928 
929 static enum agx_atomic_opc
930 translate_atomic_opcode(nir_atomic_op op)
931 {
932    /* clang-format off */
933    switch (op) {
934    case nir_atomic_op_iadd:    return AGX_ATOMIC_OPC_ADD;
935    case nir_atomic_op_imin:    return AGX_ATOMIC_OPC_IMIN;
936    case nir_atomic_op_umin:    return AGX_ATOMIC_OPC_UMIN;
937    case nir_atomic_op_imax:    return AGX_ATOMIC_OPC_IMAX;
938    case nir_atomic_op_umax:    return AGX_ATOMIC_OPC_UMAX;
939    case nir_atomic_op_iand:    return AGX_ATOMIC_OPC_AND;
940    case nir_atomic_op_ior:     return AGX_ATOMIC_OPC_OR;
941    case nir_atomic_op_ixor:    return AGX_ATOMIC_OPC_XOR;
942    case nir_atomic_op_xchg:    return AGX_ATOMIC_OPC_XCHG;
943    case nir_atomic_op_cmpxchg: return AGX_ATOMIC_OPC_CMPXCHG;
944    default: unreachable("unknown atomic opcode");
945    }
946    /* clang-format on */
947 }
948 
949 /*
950  * The "base" of a local load/store/atomic can be zero but no other immediates.
951  * This would be a little silly to handle when inlining immediates, so we
952  * instead exclude these ops from immediate inlining and just handle 0 specially
953  * when translating.
954  */
955 static agx_index
956 agx_local_base(nir_src src)
957 {
958    if (nir_src_is_const(src) && nir_src_as_uint(src) == 0)
959       return agx_zero();
960    else
961       return agx_src_index(&src);
962 }
963 
964 static void
965 agx_emit_atomic(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr,
966                 bool local)
967 {
968    enum agx_atomic_opc op =
969       translate_atomic_opcode(nir_intrinsic_atomic_op(instr));
970    agx_index base =
971       local ? agx_local_base(instr->src[0]) : agx_src_index(&instr->src[0]);
972    agx_index value = agx_src_index(&instr->src[local ? 1 : 2]);
973    agx_index index = local ? agx_zero() : agx_src_index(&instr->src[1]);
974 
975    /* cmpxchg (only) takes 2 sources, passed in consecutive registers */
976    if (op == AGX_ATOMIC_OPC_CMPXCHG) {
977       agx_index value2 = agx_src_index(&instr->src[local ? 2 : 3]);
978       value = agx_vec2(b, value2, value);
979    }
980 
981    if (local) {
982       assert(base.size == AGX_SIZE_16);
983       agx_local_atomic_to(b, dst, value, base, index, op);
984    } else {
985       assert(base.size == AGX_SIZE_64);
986       agx_atomic_to(b, dst, value, base, index, op);
987    }
988 }
989 
990 static enum agx_format
991 format_for_bitsize(unsigned bitsize)
992 {
993    switch (bitsize) {
994    case 8:
995       return AGX_FORMAT_I8;
996    case 16:
997       return AGX_FORMAT_I16;
998    case 32:
999       return AGX_FORMAT_I32;
1000    default:
1001       unreachable("should've been lowered");
1002    }
1003 }
1004 
1005 static void
1006 agx_emit_local_load(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr)
1007 {
1008    agx_index base = agx_local_base(instr->src[0]);
1009    agx_index index = agx_zero(); /* TODO: optimize address arithmetic */
1010    assert(base.size == AGX_SIZE_16);
1011 
1012    enum agx_format format = format_for_bitsize(instr->def.bit_size);
1013    unsigned nr = instr->def.num_components;
1014    unsigned mask = BITFIELD_MASK(nr);
1015 
1016    agx_local_load_to(b, dst, base, index, format, mask);
1017    agx_emit_cached_split(b, dst, nr);
1018 }
1019 
1020 static void
1021 agx_emit_local_store(agx_builder *b, nir_intrinsic_instr *instr)
1022 {
1023    agx_index value = agx_src_index(&instr->src[0]);
1024    agx_index base = agx_local_base(instr->src[1]);
1025    agx_index index = agx_zero(); /* TODO: optimize address arithmetic */
1026    assert(base.size == AGX_SIZE_16);
1027 
1028    enum agx_format format = format_for_bitsize(nir_src_bit_size(instr->src[0]));
1029    unsigned mask = BITFIELD_MASK(
1030       nir_src_num_components(instr->src[0])); /* XXX: there's a write mask */
1031 
1032    agx_local_store(b, value, base, index, format, mask);
1033 }
1034 
1035 static void
1036 agx_emit_load_scratch(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr)
1037 {
1038    agx_index offset = agx_src_index(&instr->src[0]);
1039    enum agx_format format = format_for_bitsize(instr->def.bit_size);
1040    unsigned nr = instr->def.num_components;
1041    unsigned mask = BITFIELD_MASK(nr);
1042 
1043    agx_stack_load_to(b, dst, offset, format, mask);
1044    agx_emit_cached_split(b, dst, nr);
1045    b->shader->any_scratch = true;
1046 }
1047 
1048 static void
1049 agx_emit_store_scratch(agx_builder *b, nir_intrinsic_instr *instr)
1050 {
1051    agx_index value = agx_recollect_vector(b, instr->src[0]);
1052    agx_index offset = agx_src_index(&instr->src[1]);
1053    enum agx_format format = format_for_bitsize(nir_src_bit_size(instr->src[0]));
1054    unsigned mask = BITFIELD_MASK(nir_src_num_components(instr->src[0]));
1055 
1056    agx_stack_store(b, value, offset, format, mask);
1057    b->shader->any_scratch = true;
1058 }
1059 
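/*
 * Texture/image loads write only the channels in the mask, packed
 * contiguously. Expand the packed result back out to NIR's component order,
 * filling unread channels with undef, and return the mask actually used.
 */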
1060 static unsigned
1061 agx_expand_tex_to(agx_builder *b, nir_def *def, agx_index src, bool masked)
1062 {
1063    unsigned nr_channels = def->num_components;
1064    nir_component_mask_t mask = nir_def_components_read(def);
1065 
1066    if (!masked)
1067       mask = (nir_component_mask_t)BITFIELD_MASK(nr_channels);
1068 
1069    agx_index packed_channels[4] = {agx_null()};
1070    agx_index unpacked_channels[4] = {agx_null()};
1071 
1072    /* Hardware writes the masked components contiguously, expand out for NIR */
1073    agx_emit_split(b, packed_channels, src, 4 /* XXX: why not nr_channels */);
1074 
1075    for (unsigned i = 0; i < nr_channels; ++i) {
1076       unpacked_channels[i] =
1077          (mask & BITFIELD_BIT(i))
1078             ? packed_channels[util_bitcount(mask & BITFIELD_MASK(i))]
1079             : agx_undef(src.size);
1080    }
1081 
1082    agx_emit_collect_to(b, agx_def_index(def), nr_channels, unpacked_channels);
1083    return mask;
1084 }
1085 
1086 static agx_instr *
1087 agx_emit_image_load(agx_builder *b, agx_index dst, nir_intrinsic_instr *intr)
1088 {
1089    agx_index ms_index = agx_src_index(&intr->src[2]);
1090    agx_index lod = agx_src_index(&intr->src[3]);
1091    enum agx_lod_mode lod_mode = AGX_LOD_MODE_LOD_MIN;
1092 
1093    agx_index bindless = agx_immediate(0), texture;
1094    if (intr->intrinsic == nir_intrinsic_bindless_image_load)
1095       texture = agx_translate_bindless_handle(b, &intr->src[0], &bindless);
1096    else if (nir_src_is_const(intr->src[0]) &&
1097             nir_src_as_uint(intr->src[0]) < 0x100)
1098       texture = agx_immediate(nir_src_as_uint(intr->src[0]));
1099    else
1100       texture = agx_src_index(&intr->src[0]);
1101 
1102    assert(nir_src_num_components(intr->src[1]) == 4);
1103    agx_index coord[4] = {
1104       agx_extract_nir_src(b, intr->src[1], 0),
1105       agx_extract_nir_src(b, intr->src[1], 1),
1106       agx_extract_nir_src(b, intr->src[1], 2),
1107       agx_extract_nir_src(b, intr->src[1], 3),
1108    };
1109 
1110    /* Get the image dimension. Cubes are lowered to 2D, since they are logically
1111     * equivalent for imageLoad, but out-of-bounds behaviour for cubes on G13
1112     * is wrong according to Piglit's arb_shader_image_load_store-invalid.
1113     *
1114     * This requires a matching transform in the driver.
1115     */
1116    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr);
1117    bool is_array = nir_intrinsic_image_array(intr);
1118 
1119    if (dim == GLSL_SAMPLER_DIM_CUBE) {
1120       dim = GLSL_SAMPLER_DIM_2D;
1121       is_array = true;
1122    }
1123 
1124    bool is_ms = dim == GLSL_SAMPLER_DIM_MS;
1125    unsigned coord_comps = glsl_get_sampler_dim_coordinate_components(dim);
1126    if (is_array && is_ms) {
1127       agx_index layer = agx_temp(b->shader, AGX_SIZE_16);
1128       agx_subdivide_to(b, layer, coord[coord_comps], 0);
1129 
1130       assert(ms_index.size == AGX_SIZE_16);
1131       agx_index tmp = agx_temp(b->shader, AGX_SIZE_32);
1132       agx_emit_collect_to(b, tmp, 2, (agx_index[]){ms_index, layer});
1133       coord[coord_comps++] = tmp;
1134    } else if (is_ms) {
1135       agx_index tmp = agx_temp(b->shader, AGX_SIZE_32);
1136       agx_mov_to(b, tmp, ms_index);
1137       coord[coord_comps++] = tmp;
1138    } else if (is_array) {
1139       coord_comps++;
1140    }
1141 
1142    /* Multisampled images do not support mipmapping */
1143    if (is_ms) {
1144       lod_mode = AGX_LOD_MODE_AUTO_LOD;
1145       lod = agx_zero();
1146    }
1147 
1148    agx_index coords = agx_emit_collect(b, coord_comps, coord);
1149    agx_index tmp = agx_vec_temp(b->shader, dst.size, 4);
1150 
1151    agx_instr *I = agx_image_load_to(
1152       b, tmp, coords, lod, bindless, texture, agx_immediate(0), agx_null(),
1153       agx_tex_dim(dim, is_array), lod_mode, 0, false, nir_is_coherent(intr));
1154    I->mask = agx_expand_tex_to(b, &intr->def, tmp, true);
1155 
1156    b->shader->out->uses_txf = true;
1157    return NULL;
1158 }
1159 
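/*
 * Emit exports for a NIR source at the end of the exit block rather than at
 * the current cursor, so the exported channels are written after the rest of
 * the shader has executed. Channel c is exported at register base + c times
 * the channel's size in 16-bit words.
 */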
1160 static agx_instr *
1161 agx_emit_export(agx_builder *b, unsigned base, nir_src src)
1162 {
1163    agx_builder b_ = *b;
1164    agx_cursor after_cursor = agx_after_block(agx_exit_block(b->shader));
1165    b_.cursor = after_cursor;
1166 
1167    for (unsigned c = 0; c < nir_src_num_components(src); ++c) {
1168       agx_index chan = agx_extract_nir_src(b, src, c);
1169       unsigned stride = agx_size_align_16(chan.size);
1170 
1171       agx_export(&b_, chan, base + (c * stride));
1172    }
1173 
1174    if (agx_cursors_equal(b->cursor, after_cursor)) {
1175       b->cursor = agx_after_block_logical(b->cursor.block);
1176    }
1177 
1178    return NULL;
1179 }
1180 
1181 static agx_instr *
1182 agx_load_exported_to(agx_builder *b, agx_index dst, unsigned base, unsigned nr)
1183 {
1184    agx_index chans[4] = {0};
1185    unsigned stride = agx_size_align_16(dst.size);
1186 
1187    for (unsigned c = 0; c < nr; ++c) {
1188       chans[c] = agx_cached_preload(b->shader, base + c * stride, dst.size);
1189    }
1190 
1191    return agx_emit_collect_to(b, dst, nr, chans);
1192 }
1193 
1194 static agx_instr *
1195 agx_emit_image_store(agx_builder *b, nir_intrinsic_instr *instr)
1196 {
1197    /* See remarks in agx_emit_image_load */
1198    enum glsl_sampler_dim glsl_dim = nir_intrinsic_image_dim(instr);
1199    bool is_array = nir_intrinsic_image_array(instr);
1200 
1201    if (glsl_dim == GLSL_SAMPLER_DIM_CUBE) {
1202       glsl_dim = GLSL_SAMPLER_DIM_2D;
1203       is_array = true;
1204    }
1205 
1206    enum agx_dim dim = agx_tex_dim(glsl_dim, is_array);
1207    assert(glsl_dim != GLSL_SAMPLER_DIM_MS && "needs to be lowered");
1208 
1209    agx_index base, index;
1210    if (instr->intrinsic == nir_intrinsic_bindless_image_store) {
1211       index = agx_translate_bindless_handle(b, &instr->src[0], &base);
1212 
1213       assert(base.size == AGX_SIZE_64);
1214       assert(index.size == AGX_SIZE_32);
1215    } else {
1216       base = agx_zero();
1217       index = agx_src_index(&instr->src[0]);
1218 
1219       assert(index.size == AGX_SIZE_16);
1220    }
1221 
1222    agx_index coords4 = agx_src_index(&instr->src[1]);
1223    agx_index lod = agx_src_index(&instr->src[4]);
1224    assert(lod.size == AGX_SIZE_16);
1225 
1226    int coord_components = glsl_get_sampler_dim_coordinate_components(glsl_dim);
1227    if (is_array)
1228       coord_components++;
1229 
1230    agx_index coord_comps[4] = {};
1231    for (unsigned i = 0; i < coord_components; ++i)
1232       coord_comps[i] = agx_emit_extract(b, coords4, i);
1233 
1234    agx_index coords = agx_emit_collect(b, coord_components, coord_comps);
1235    agx_index data = agx_src_index(&instr->src[3]);
1236 
1237    /* If the image format has less than 4 components, nir_opt_shrink_stores can
1238     * shrink the store. But the IR still expects 4 components: pad with undef.
1239     */
1240    if (nir_src_num_components(instr->src[3]) < 4) {
1241       agx_index chan[4] = {agx_null()};
1242 
1243       for (unsigned i = 0; i < 4; ++i) {
1244          if (i < nir_src_num_components(instr->src[3]))
1245             chan[i] = agx_extract_nir_src(b, instr->src[3], i);
1246          else
1247             chan[i] = agx_undef(data.size);
1248       }
1249 
1250       data = agx_emit_collect(b, 4, chan);
1251    }
1252 
1253    /* Image stores act like tilebuffer stores when used for tib spilling */
1254    b->shader->out->tag_write_disable = false;
1255 
1256    return agx_image_write(b, data, coords, lod, base, index, dim,
1257                           nir_is_coherent(instr));
1258 }
1259 
1260 static enum agx_simd_op
1261 translate_simd_op(nir_op op)
1262 {
1263 #define CASE(agx_, nir_)                                                       \
1264    case nir_op_##nir_:                                                         \
1265       return AGX_SIMD_OP_##agx_;
1266 
1267    switch (op) {
1268       CASE(AND, iand)
1269       CASE(FADD, fadd)
1270       CASE(OR, ior)
1271       CASE(FMUL, fmul)
1272       CASE(XOR, ixor)
1273       CASE(FMIN, fmin)
1274       CASE(FMAX, fmax)
1275       CASE(IADD, iadd)
1276       CASE(SMIN, imin)
1277       CASE(SMAX, imax)
1278       CASE(UMIN, umin)
1279       CASE(UMAX, umax)
1280    default:
1281       unreachable("unknown simd op");
1282    }
1283 #undef CASE
1284 }
1285 
1286 static agx_instr *
1287 agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
1288 {
1289    agx_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest
1290                       ? agx_def_index(&instr->def)
1291                       : agx_null();
1292    gl_shader_stage stage = b->shader->stage;
1293 
1294    switch (instr->intrinsic) {
1295    case nir_intrinsic_load_barycentric_pixel:
1296    case nir_intrinsic_load_barycentric_centroid:
1297    case nir_intrinsic_load_barycentric_at_sample:
1298    case nir_intrinsic_load_barycentric_at_offset:
1299       /* handled later via load_vary */
1300       return NULL;
1301    case nir_intrinsic_load_interpolated_input:
1302       assert(stage == MESA_SHADER_FRAGMENT);
1303       agx_emit_load_vary(b, dst, instr);
1304       return NULL;
1305 
1306    case nir_intrinsic_load_coefficients_agx:
1307       assert(stage == MESA_SHADER_FRAGMENT);
1308       agx_ldcf_to(b, dst, cf_for_intrinsic(b, instr), 1);
1309       agx_emit_cached_split(b, dst, 3);
1310       return NULL;
1311 
1312    case nir_intrinsic_load_agx:
1313    case nir_intrinsic_load_constant_agx:
1314       agx_emit_load(b, dst, instr);
1315       return NULL;
1316 
1317    case nir_intrinsic_store_uvs_agx:
1318       assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL);
1319       return agx_st_vary(b, agx_src_index(&instr->src[1]),
1320                          agx_src_index(&instr->src[0]));
1321 
1322    case nir_intrinsic_store_agx:
1323       agx_emit_store(b, instr);
1324       return NULL;
1325 
1326    case nir_intrinsic_store_shared:
1327       agx_emit_local_store(b, instr);
1328       return NULL;
1329 
1330    case nir_intrinsic_load_shared:
1331       agx_emit_local_load(b, dst, instr);
1332       return NULL;
1333 
1334    case nir_intrinsic_global_atomic_agx:
1335    case nir_intrinsic_global_atomic_swap_agx:
1336       agx_emit_atomic(b, dst, instr, false);
1337       return NULL;
1338 
1339    case nir_intrinsic_shared_atomic:
1340    case nir_intrinsic_shared_atomic_swap:
1341       agx_emit_atomic(b, dst, instr, true);
1342       return NULL;
1343 
1344    case nir_intrinsic_store_zs_agx:
1345       assert(stage == MESA_SHADER_FRAGMENT);
1346       return agx_emit_store_zs(b, instr);
1347 
1348    case nir_intrinsic_store_local_pixel_agx:
1349       return agx_emit_local_store_pixel(b, instr);
1350 
1351    case nir_intrinsic_load_local_pixel_agx:
1352       assert(stage == MESA_SHADER_FRAGMENT);
1353       agx_emit_local_load_pixel(b, dst, instr);
1354       return NULL;
1355 
1356    case nir_intrinsic_load_pixel_coord:
1357       return agx_emit_collect_to(
1358          b, dst, 2,
1359          (agx_index[2]){
1360             agx_get_sr(b, 16, AGX_SR_THREAD_POSITION_IN_GRID_X),
1361             agx_get_sr(b, 16, AGX_SR_THREAD_POSITION_IN_GRID_Y),
1362          });
1363 
1364    case nir_intrinsic_load_frag_coord_zw: {
1365       agx_index cf = agx_get_cf(b->shader, VARYING_SLOT_POS,
1366                                 nir_intrinsic_component(instr));
1367 
1368       return agx_iter_to(b, dst, cf, agx_zero(), 1, AGX_INTERPOLATION_CENTER);
1369    }
1370 
1371    case nir_intrinsic_sample_mask_agx: {
1372       assert(stage == MESA_SHADER_FRAGMENT);
1373       b->shader->out->writes_sample_mask = true;
1374 
1375       /* We need to wait_pix before running Z/S tests, but we don't need to
1376        * wait_pix before merely discarding. Omit the wait_pix when the affected
1377        * samples are unconditionally killed.
1378        */
1379       bool no_tests =
1380          nir_src_is_const(instr->src[1]) && nir_src_as_uint(instr->src[1]) == 0;
1381 
1382       if (!no_tests)
1383          agx_wait_pixel_mask(b, 0x1);
1384 
1385       return agx_sample_mask(b, agx_src_index(&instr->src[0]),
1386                              agx_src_index(&instr->src[1]));
1387    }
1388 
1389    case nir_intrinsic_load_back_face_agx:
1390       return agx_get_sr_to(b, dst, AGX_SR_BACKFACING);
1391 
1392    case nir_intrinsic_load_samples_log2_agx:
1393       return agx_get_sr_to(b, dst, AGX_SR_SAMPLES_LOG2);
1394 
1395    case nir_intrinsic_load_sample_mask_in:
1396       return agx_get_sr_to(b, dst, AGX_SR_INPUT_SAMPLE_MASK);
1397 
1398    case nir_intrinsic_load_sample_mask:
1399       return agx_get_sr_coverage_to(b, dst, AGX_SR_COVERAGE_MASK);
1400 
1401    case nir_intrinsic_load_helper_invocation:
1402       /* Compare special register to zero. We could lower this in NIR (letting
1403        * us fold in an inot) but meh?
1404        */
1405       return agx_icmp_to(b, dst,
1406                          agx_get_sr_coverage(b, 32, AGX_SR_IS_ACTIVE_THREAD),
1407                          agx_zero(), AGX_ICOND_UEQ, false);
1408 
1409    case nir_intrinsic_load_vertex_id:
1410       /* We don't assert the HW stage since we use this same ABI with SW VS */
1411       return agx_mov_to(b, dst, agx_abs(agx_vertex_id(b)));
1412 
1413    case nir_intrinsic_load_primitive_id:
1414       assert(stage == MESA_SHADER_TESS_EVAL);
1415       return agx_mov_to(b, dst, agx_abs(agx_vertex_id(b)));
1416 
1417    case nir_intrinsic_load_instance_id:
1418       return agx_mov_to(b, dst, agx_abs(agx_instance_id(b)));
1419 
1420    case nir_intrinsic_load_tess_coord_xy: {
1421       assert(stage == MESA_SHADER_TESS_EVAL);
1422 
1423       agx_index coords[] = {agx_tess_coord_x(b), agx_tess_coord_y(b)};
1424       return agx_emit_collect_to(b, dst, 2, coords);
1425    }
1426 
1427    case nir_intrinsic_load_preamble:
1428       return agx_emit_load_preamble(b, dst, instr);
1429 
1430    case nir_intrinsic_store_preamble:
1431       return agx_emit_store_preamble(b, instr);
1432 
1433    case nir_intrinsic_image_load:
1434    case nir_intrinsic_bindless_image_load:
1435       return agx_emit_image_load(b, dst, instr);
1436 
1437    case nir_intrinsic_image_store:
1438    case nir_intrinsic_bindless_image_store:
1439       return agx_emit_image_store(b, instr);
1440 
1441    case nir_intrinsic_image_store_block_agx:
1442    case nir_intrinsic_bindless_image_store_block_agx:
1443       return agx_emit_block_image_store(b, instr);
1444 
1445    case nir_intrinsic_load_workgroup_id:
1446       return agx_load_compute_dimension(b, dst, instr,
1447                                         AGX_SR_THREADGROUP_POSITION_IN_GRID_X);
1448 
1449    case nir_intrinsic_load_workgroup_size:
1450       return agx_load_compute_dimension(b, dst, instr,
1451                                         AGX_SR_THREADS_PER_THREADGROUP_X);
1452 
1453    case nir_intrinsic_load_global_invocation_id:
1454       return agx_load_compute_dimension(b, dst, instr,
1455                                         AGX_SR_THREAD_POSITION_IN_GRID_X);
1456 
1457    case nir_intrinsic_load_local_invocation_id:
1458       return agx_load_compute_dimension(
1459          b, dst, instr, AGX_SR_THREAD_POSITION_IN_THREADGROUP_X);
1460 
1461    case nir_intrinsic_load_local_invocation_index:
1462       return agx_get_sr_to(b, dst, AGX_SR_THREAD_INDEX_IN_THREADGROUP);
1463 
1464    case nir_intrinsic_load_layer_id:
1465       return agx_get_sr_to(b, dst, AGX_SR_THREADGROUP_POSITION_IN_GRID_Z);
1466 
1467    case nir_intrinsic_barrier: {
1468       assert(!b->shader->is_preamble && "invalid");
1469 
1470       bool needs_image_barriers = false;
1471 
1472       if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE) {
1473          nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
1474 
1475          if (modes & (nir_var_mem_global | nir_var_image)) {
1476             agx_memory_barrier(b);
1477 
1478             /* Pull out all the big hammers to make cross-workgroup memory
1479              * barriers work.
1480              */
1481             if (nir_intrinsic_memory_scope(instr) >= SCOPE_QUEUE_FAMILY) {
1482                agx_unknown_barrier_1(b);
1483                agx_memory_barrier_2(b);
1484 
1485                /* These are observed on G13D. At some point we should figure out
1486                 * what the individual opcodes do.
1487                 */
1488                agx_device_barrier_2(b);
1489                agx_unknown_barrier_2(b);
1490                agx_memory_barrier_3(b);
1491                agx_device_barrier_1(b);
1492             }
1493          }
1494 
1495          if (modes & nir_var_image) {
1496             agx_image_barrier_1(b);
1497             agx_image_barrier_2(b);
1498             needs_image_barriers = true;
1499          }
1500       }
1501 
1502       /* Nothing to do for subgroup barriers */
1503       if (nir_intrinsic_execution_scope(instr) >= SCOPE_WORKGROUP) {
1504          assert(gl_shader_stage_is_compute(b->shader->nir->info.stage));
1505 
1506          agx_threadgroup_barrier(b);
1507       }
1508 
1509       if (needs_image_barriers) {
1510          agx_image_barrier_3(b);
1511          agx_image_barrier_4(b);
1512       }
1513 
1514       return NULL;
1515    }
1516 
1517    case nir_intrinsic_fence_pbe_to_tex_agx: {
1518       agx_image_barrier_1(b);
1519       agx_image_barrier_2(b);
1520       agx_image_barrier_3(b);
1521       agx_image_barrier_4(b);
1522       return NULL;
1523    }
1524 
1525    case nir_intrinsic_fence_mem_to_tex_agx: {
1526       /* Flush out the atomic to main memory... Found experimentally... */
1527       agx_memory_barrier(b);
1528       agx_memory_barrier_2(b);
1529 
1530       /* TODO: Which ones do we actually need? */
1531       agx_image_barrier_1(b);
1532       agx_image_barrier_2(b);
1533       agx_image_barrier_3(b);
1534       agx_image_barrier_4(b);
1535 
1536       /* Flush out the texture cache */
1537       agx_flush_memory_to_texture(b);
1538       return NULL;
1539    }
1540 
1541    case nir_intrinsic_fence_pbe_to_tex_pixel_agx: {
1542       agx_image_barrier_1(b);
1543       agx_image_barrier_2(b);
1544       agx_flush_memory_to_texture(b);
1545       agx_image_barrier_3(b);
1546       return NULL;
1547    }
1548 
1549    case nir_intrinsic_fence_helper_exit_agx: {
1550       assert(b->shader->key->is_helper);
1551       agx_memory_barrier(b);
1552       agx_unknown_barrier_1(b);
1553       agx_memory_barrier_2(b);
1554       agx_unknown_barrier_2(b);
1555       agx_memory_barrier_3(b);
1556       return NULL;
1557    }
1558 
1559    case nir_intrinsic_begin_invocation_interlock: {
1560       agx_wait_pixel_mask(b, 0xC);
1561       return NULL;
1562    }
1563 
1564    case nir_intrinsic_ddx:
1565    case nir_intrinsic_ddx_coarse:
1566    case nir_intrinsic_ddx_fine:
1567       return agx_dfdx_to(b, dst, agx_src_index(&instr->src[0]));
1568 
1569    case nir_intrinsic_ddy:
1570    case nir_intrinsic_ddy_coarse:
1571    case nir_intrinsic_ddy_fine:
1572       return agx_dfdy_to(b, dst, agx_src_index(&instr->src[0]));
1573 
1574    case nir_intrinsic_load_subgroup_invocation:
1575       return agx_get_sr_to(b, dst, AGX_SR_THREAD_INDEX_IN_SUBGROUP);
1576 
1577    case nir_intrinsic_load_subgroup_id:
1578       return agx_get_sr_to(b, dst, AGX_SR_SUBGROUP_INDEX_IN_THREADGROUP);
1579 
1580    case nir_intrinsic_load_active_subgroup_invocation_agx:
1581       return agx_get_sr_coverage_to(b, dst,
1582                                     AGX_SR_ACTIVE_THREAD_INDEX_IN_SUBGROUP);
1583 
1584    case nir_intrinsic_load_active_subgroup_count_agx:
1585       return agx_get_sr_coverage_to(b, dst,
1586                                     AGX_SR_TOTAL_ACTIVE_THREADS_IN_SUBGROUP);
1587 
1588    case nir_intrinsic_reduce: {
1589       assert((instr->def.bit_size == 1 || instr->def.bit_size == 16 ||
1590               instr->def.bit_size == 32) &&
1591              "should've been lowered");
1592 
1593       unsigned cluster_size = nir_intrinsic_cluster_size(instr);
1594       assert(cluster_size == 0 || cluster_size == 4 || cluster_size >= 32);
1595 
1596       enum agx_simd_op op =
1597          translate_simd_op(nir_intrinsic_reduction_op(instr));
1598 
1599       agx_index src0 = agx_src_index(&instr->src[0]);
1600 
1601       if (cluster_size == 4)
1602          return agx_quad_reduce_to(b, dst, src0, op);
1603       else
1604          return agx_simd_reduce_to(b, dst, src0, op);
1605    }
1606 
1607    case nir_intrinsic_exclusive_scan: {
1608       assert((instr->def.bit_size == 1 || instr->def.bit_size == 16 ||
1609               instr->def.bit_size == 32) &&
1610              "should've been lowered");
1611 
1612       return agx_simd_prefix_to(
1613          b, dst, agx_src_index(&instr->src[0]),
1614          translate_simd_op(nir_intrinsic_reduction_op(instr)));
1615    }
1616 
1617    case nir_intrinsic_read_invocation: {
1618       /* TODO: Check if we're actually inside divergent control flow */
1619       b->shader->any_quad_divergent_shuffle |= b->shader->any_cf;
1620 
1621       /* Lane ID guaranteed to be uniform */
1622       return agx_shuffle_to(b, dst, agx_src_index(&instr->src[0]),
1623                             agx_src_index(&instr->src[1]));
1624    }
1625 
1626    case nir_intrinsic_quad_broadcast: {
1627       /* TODO: Check if we're actually inside divergent control flow */
1628       b->shader->any_quad_divergent_shuffle |= b->shader->any_cf;
1629 
1630       /* Lane ID guaranteed to be uniform */
1631       return agx_quad_shuffle_to(b, dst, agx_src_index(&instr->src[0]),
1632                                  agx_src_index(&instr->src[1]));
1633    }
1634 
1635    case nir_intrinsic_quad_swap_horizontal: {
1636       return agx_quad_shuffle_xor_to(b, dst, agx_src_index(&instr->src[0]),
1637                                      agx_immediate(1));
1638    }
1639 
1640    case nir_intrinsic_quad_swap_vertical: {
1641       return agx_quad_shuffle_xor_to(b, dst, agx_src_index(&instr->src[0]),
1642                                      agx_immediate(2));
1643    }
1644 
1645    case nir_intrinsic_quad_swap_diagonal: {
1646       return agx_quad_shuffle_xor_to(b, dst, agx_src_index(&instr->src[0]),
1647                                      agx_immediate(3));
1648    }
1649 
1650    case nir_intrinsic_ballot: {
1651       return agx_ballot_to(b, dst, agx_src_index(&instr->src[0]));
1652    }
1653 
1654    case nir_intrinsic_quad_ballot_agx: {
1655       return agx_quad_ballot_to(b, dst, agx_src_index(&instr->src[0]));
1656    }
1657 
1658    case nir_intrinsic_doorbell_agx: {
1659       return agx_doorbell(b, nir_src_as_uint(instr->src[0]));
1660    }
1661 
1662    case nir_intrinsic_stack_map_agx: {
1663       return agx_stack_map(b, agx_src_index(&instr->src[1]),
1664                            nir_src_as_uint(instr->src[0]));
1665    }
1666 
1667    case nir_intrinsic_stack_unmap_agx: {
1668       return agx_stack_unmap_to(b, dst, nir_src_as_uint(instr->src[0]));
1669    }
1670 
1671    case nir_intrinsic_load_scratch:
1672       agx_emit_load_scratch(b, dst, instr);
1673       return NULL;
1674 
1675    case nir_intrinsic_store_scratch:
1676       agx_emit_store_scratch(b, instr);
1677       return NULL;
1678 
1679    case nir_intrinsic_load_core_id_agx:
1680       return agx_get_sr_to(b, dst, AGX_SR_CORE_ID);
1681 
1682    case nir_intrinsic_load_helper_op_id_agx:
1683       assert(b->shader->key->is_helper);
1684       return agx_get_sr_barrier_to(b, dst, AGX_SR_HELPER_OP);
1685 
1686    case nir_intrinsic_load_helper_arg_lo_agx:
1687       assert(b->shader->key->is_helper);
1688       return agx_get_sr_barrier_to(b, dst, AGX_SR_HELPER_ARG_L);
1689 
1690    case nir_intrinsic_load_helper_arg_hi_agx:
1691       assert(b->shader->key->is_helper);
1692       return agx_get_sr_barrier_to(b, dst, AGX_SR_HELPER_ARG_H);
1693 
1694    case nir_intrinsic_load_exported_agx:
1695       return agx_load_exported_to(b, dst, nir_intrinsic_base(instr),
1696                                   instr->def.num_components);
1697 
1698    case nir_intrinsic_export_agx:
1699       return agx_emit_export(b, nir_intrinsic_base(instr), instr->src[0]);
1700 
1701    case nir_intrinsic_load_barycentric_sample:
1702    case nir_intrinsic_load_sample_id:
1703    case nir_intrinsic_load_sample_pos:
1704       unreachable("Sample shading should have been lowered");
1705 
1706    default:
1707       fprintf(stderr, "Unhandled intrinsic %s\n",
1708               nir_intrinsic_infos[instr->intrinsic].name);
1709       unreachable("Unhandled intrinsic");
1710    }
1711 }
1712 
1713 static agx_index
1714 agx_alu_src_index(agx_builder *b, nir_alu_src src)
1715 {
1716    /* Check well-formedness of the input NIR */
1717    ASSERTED unsigned bitsize = nir_src_bit_size(src.src);
1718    unsigned comps = nir_src_num_components(src.src);
1719    unsigned channel = src.swizzle[0];
1720 
1721    assert(bitsize == 1 || bitsize == 8 || bitsize == 16 || bitsize == 32 ||
1722           bitsize == 64);
1723    assert(channel < comps);
1724 
1725    return agx_extract_nir_src(b, src.src, channel);
1726 }
1727 
1728 /*
1729  * Emit an instruction translating (s0 * s1) + (s2 << s3). Assuming s3 is
1730  * constant, this is an imad instruction. If s1 == 1, then this is optimized to
1731  * an iadd instruction, which is faster.
1732  */
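/*
 * Illustrative mapping (derived from the lowering below, not exhaustive):
 * with a constant shift of 2, (a * b) + (c << 2) becomes a single imad with
 * shift 2, while (a * 1) + (c << 2) becomes an iadd with shift 2. A
 * non-constant shift first materializes (c << s) via bfi and then uses a
 * shift of zero.
 */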
1733 static agx_instr *
1734 agx_emit_imadshl_agx(agx_builder *b, nir_alu_instr *alu, agx_index dst,
1735                      agx_index s0, agx_index s1, agx_index s2, agx_index s3)
1736 {
1737    /* If the shift is not constant, use a variable shift. This should never
1738     * happen in practice but we don't want to constrain the NIR.
1739     */
1740    unsigned shift;
1741    if (!nir_src_is_const(alu->src[3].src)) {
1742       s2 = agx_bfi(b, agx_immediate(0), s2, s3, 0);
1743       shift = 0;
1744    } else {
1745       shift = nir_alu_src_as_uint(alu->src[3]);
1746    }
1747 
1748    assert(shift <= 4 && "domain restriction on the input NIR");
1749 
1750    /* Emit iadd if possible, else imad */
1751    if (nir_src_is_const(alu->src[1].src) &&
1752        nir_alu_src_as_uint(alu->src[1]) == 1) {
1753 
1754       return agx_iadd_to(b, dst, s0, s2, shift);
1755    } else {
1756       return agx_imad_to(b, dst, s0, s1, s2, shift);
1757    }
1758 }
1759 
1760 static bool
1761 is_conversion_to_8bit(nir_op op)
1762 {
1763    switch (op) {
1764    case nir_op_i2i8:
1765    case nir_op_u2u8:
1766    case nir_op_f2i8:
1767    case nir_op_f2u8:
1768    case nir_op_b2i8:
1769       return true;
1770    default:
1771       return false;
1772    }
1773 }
1774 
1775 static agx_instr *
1776 agx_fminmax_to(agx_builder *b, agx_index dst, agx_index s0, agx_index s1,
1777                nir_alu_instr *alu)
1778 {
1779    /* The hardware gtn/ltn modes are unfortunately incorrect for signed zeros */
1780    assert(!nir_alu_instr_is_signed_zero_preserve(alu) &&
1781           "should've been lowered");
1782 
1783    assert((alu->def.bit_size == 16) ==
1784              (alu->op == nir_op_fmin || alu->op == nir_op_fmax) &&
1785           "fp32 should be lowered");
1786 
1787    bool fmax = alu->op == nir_op_fmax || alu->op == nir_op_fmax_agx;
1788    enum agx_fcond fcond = fmax ? AGX_FCOND_GTN : AGX_FCOND_LTN;
1789 
1790    /* Calculate min/max with the appropriate hardware instruction. This will not
1791     * handle denorms, but the input NIR was already lowered for that.
1792     */
1793    return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, fcond);
1794 }
1795 
1796 static agx_instr *
1797 agx_emit_alu(agx_builder *b, nir_alu_instr *instr)
1798 {
1799    unsigned srcs = nir_op_infos[instr->op].num_inputs;
1800    unsigned sz = instr->def.bit_size;
1801    unsigned src_sz = srcs ? nir_src_bit_size(instr->src[0].src) : 0;
1802    ASSERTED unsigned comps = instr->def.num_components;
1803 
1804    assert(comps == 1 || nir_op_is_vec_or_mov(instr->op));
1805    assert(sz == 1 ||
1806           ((nir_op_is_vec_or_mov(instr->op) ||
1807             is_conversion_to_8bit(instr->op) || instr->op == nir_op_bcsel) &&
1808            sz == 8) ||
1809           sz == 16 || sz == 32 || sz == 64);
1810 
1811    agx_index dst = agx_def_index(&instr->def);
1812    agx_index s0 = srcs > 0 ? agx_alu_src_index(b, instr->src[0]) : agx_null();
1813    agx_index s1 = srcs > 1 ? agx_alu_src_index(b, instr->src[1]) : agx_null();
1814    agx_index s2 = srcs > 2 ? agx_alu_src_index(b, instr->src[2]) : agx_null();
1815    agx_index s3 = srcs > 3 ? agx_alu_src_index(b, instr->src[3]) : agx_null();
1816 
1817    agx_index i0 = agx_immediate(0);
1818    agx_index i1 = agx_immediate(1);
1819 
1820 #define UNOP(nop, aop)                                                         \
1821    case nir_op_##nop:                                                          \
1822       return agx_##aop##_to(b, dst, s0);
1823 #define BINOP(nop, aop)                                                        \
1824    case nir_op_##nop:                                                          \
1825       return agx_##aop##_to(b, dst, s0, s1);
1826 
1827    switch (instr->op) {
1828       UNOP(f2f16, fmov);
1829       UNOP(f2f16_rtne, fmov);
1830       UNOP(f2f32, fmov);
1831       UNOP(fround_even, roundeven);
1832       UNOP(ftrunc, trunc);
1833       UNOP(ffloor, floor);
1834       UNOP(fceil, ceil);
1835       UNOP(frcp, rcp);
1836       UNOP(frsq, rsqrt);
1837       UNOP(flog2, log2);
1838       UNOP(fexp2, exp2);
1839 
1840       UNOP(mov, mov);
1841       UNOP(u2u32, mov);
1842       UNOP(bitfield_reverse, bitrev);
1843       UNOP(bit_count, popcount);
1844       UNOP(ufind_msb, ffs);
1845       BINOP(iand, and);
1846       BINOP(ior, or);
1847       BINOP(ixor, xor);
1848       BINOP(interleave_agx, intl);
1849 
1850    case nir_op_fadd:
1851       if (instr->def.bit_size == 16)
1852          return agx_hadd_to(b, dst, s0, s1);
1853       else
1854          return agx_fadd_to(b, dst, s0, s1);
1855 
1856    case nir_op_fmul:
1857       if (instr->def.bit_size == 16)
1858          return agx_hmul_to(b, dst, s0, s1);
1859       else
1860          return agx_fmul_to(b, dst, s0, s1);
1861 
1862    case nir_op_ffma:
1863       if (instr->def.bit_size == 16)
1864          return agx_hfma_to(b, dst, s0, s1, s2);
1865       else
1866          return agx_ffma_to(b, dst, s0, s1, s2);
1867 
1868    case nir_op_fsat: {
1869       agx_instr *I = agx_fadd_to(b, dst, s0, agx_negzero());
1870       if (instr->def.bit_size == 16)
1871          I->op = AGX_OPCODE_HADD;
1872 
1873       I->saturate = true;
1874       return I;
1875    }
1876 
1877    case nir_op_feq:
1878       return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_EQ, false);
1879    case nir_op_flt:
1880       return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_LT, false);
1881    case nir_op_fge:
1882       return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_GE, false);
1883    case nir_op_fneu:
1884       return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_EQ, true);
1885 
1886    case nir_op_ieq:
1887       return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_UEQ, false);
1888    case nir_op_ine:
1889       return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_UEQ, true);
1890    case nir_op_ilt:
1891       return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_SLT, false);
1892    case nir_op_ige:
1893       return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_SLT, true);
1894    case nir_op_ult:
1895       return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_ULT, false);
1896    case nir_op_uge:
1897       return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_ULT, true);
1898 
1899    case nir_op_inot:
1900       if (sz == 1)
1901          return agx_xor_to(b, dst, s0, i1);
1902       else
1903          return agx_not_to(b, dst, s0);
1904 
1905    case nir_op_b2b1:
1906       return agx_icmp_to(b, dst, s0, i0, AGX_ICOND_UEQ, true);
1907 
1908    case nir_op_fsqrt:
1909       return agx_fmul_to(b, dst, s0, agx_srsqrt(b, s0));
1910    case nir_op_fabs:
1911       return agx_fmov_to(b, dst, agx_abs(s0));
1912    case nir_op_fneg:
1913       return agx_fmov_to(b, dst, agx_neg(s0));
1914 
1915    case nir_op_fmin:
1916    case nir_op_fmax:
1917    case nir_op_fmin_agx:
1918    case nir_op_fmax_agx:
1919       return agx_fminmax_to(b, dst, s0, s1, instr);
1920 
1921    case nir_op_imin:
1922       return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SLT);
1923    case nir_op_imax:
1924       return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SGT);
1925    case nir_op_umin:
1926       return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_ULT);
1927    case nir_op_umax:
1928       return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_UGT);
1929    case nir_op_bounds_agx:
1930       /* end offset > bound ? 0 : data */
1931       return agx_icmpsel_to(b, dst, s1, s2, agx_zero(), s0, AGX_ICOND_UGT);
1932 
1933    case nir_op_iadd:
1934       return agx_iadd_to(b, dst, s0, s1, 0);
1935    case nir_op_imadshl_agx:
1936       return agx_emit_imadshl_agx(b, instr, dst, s0, s1, s2, s3);
1937    case nir_op_imsubshl_agx:
1938       return agx_emit_imadshl_agx(b, instr, dst, s0, s1, agx_neg(s2), s3);
1939    case nir_op_isub:
1940       return agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
1941    case nir_op_ineg:
1942       return agx_iadd_to(b, dst, i0, agx_neg(s0), 0);
1943    case nir_op_imul:
1944       return agx_imad_to(b, dst, s0, s1, i0, 0);
1945    case nir_op_umul_2x32_64:
1946       return agx_imad_to(b, dst, agx_abs(s0), agx_abs(s1), i0, 0);
1947    case nir_op_imul_2x32_64:
1948       return agx_imad_to(b, dst, s0, s1, i0, 0);
1949    case nir_op_umul_high:
1950       return agx_mul_high_to(b, dst, s0, s1, false);
1951    case nir_op_imul_high:
1952       return agx_mul_high_to(b, dst, s0, s1, true);
1953 
1954    case nir_op_ishl:
1955       return agx_bfi_to(b, dst, i0, s0, s1, 0);
1956    case nir_op_ushr:
1957       return agx_ushr_to(b, dst, s0, s1);
1958    case nir_op_ishr:
1959       return agx_asr_to(b, dst, s0, s1);
1960 
1961    case nir_op_extr_agx:
1962       return agx_extr_to(b, dst, s0, s1, s2,
1963                          nir_alu_src_as_uint(instr->src[3]));
1964 
1965    case nir_op_ubitfield_extract: {
1966       unsigned m = nir_alu_src_as_uint(instr->src[2]);
1967       assert(m != 0 && "should've been optimized");
1968 
1969       /* Disable masking if the whole thing is used */
1970       if (m >= 32)
1971          m = 0;
1972 
1973       return agx_bfeil_to(b, dst, i0, s0, s1, m);
1974    }
1975 
1976    case nir_op_bcsel:
1977       return agx_icmpsel_to(b, dst, s0, i0, s2, s1, AGX_ICOND_UEQ);
1978 
1979    case nir_op_i2i32: {
1980       if (src_sz == 8) {
1981          /* Sign extend in software, NIR likes 8-bit conversions */
1982          agx_index ishl16 = agx_bfi(b, i0, s0, agx_immediate(8), 0);
1983          return agx_asr_to(b, dst, ishl16, agx_immediate(8));
1984       } else {
1985          assert(s0.size == AGX_SIZE_16 && "other conversions lowered");
1986          return agx_signext_to(b, dst, s0);
1987       }
1988    }
1989 
1990    case nir_op_i2i16: {
1991       if (src_sz == 8) {
1992          /* Sign extend in software, NIR likes 8-bit conversions */
1993          agx_index ishl16 = agx_bfi(b, i0, s0, agx_immediate(8), 0);
1994          return agx_asr_to(b, dst, ishl16, agx_immediate(8));
1995       } else {
1996          assert(s0.size == AGX_SIZE_32 && "other conversions lowered");
1997          return agx_subdivide_to(b, dst, s0, 0);
1998       }
1999    }
2000 
2001    case nir_op_u2u16: {
2002       if (s0.size == AGX_SIZE_32)
2003          return agx_subdivide_to(b, dst, s0, 0);
2004       else
2005          return agx_mov_to(b, dst, s0);
2006    }
2007 
2008    /* It will be put into a 16-bit register, but zero out the garbage. We could
2009     * optimize this in the future but it ensures correctness for u2u16(u2u8(x))
2010     * sequences.
2011     */
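   /* Illustrative example (assuming a 16-bit source): u2u8(0x1234) must read
    * back as 0x34, so the mask guarantees that a following u2u16 of the
    * result observes 0x0034 rather than stale bits in the upper byte.
    */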
2012    case nir_op_u2u8:
2013    case nir_op_i2i8:
2014       return agx_and_to(b, dst, s0, agx_immediate(0xFF));
2015 
2016    case nir_op_iadd_sat: {
2017       agx_instr *I = agx_iadd_to(b, dst, s0, s1, 0);
2018       I->saturate = true;
2019       return I;
2020    }
2021 
2022    case nir_op_isub_sat: {
2023       agx_instr *I = agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
2024       I->saturate = true;
2025       return I;
2026    }
2027 
2028    case nir_op_uadd_sat: {
2029       agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_abs(s1), 0);
2030       I->saturate = true;
2031       return I;
2032    }
2033 
2034    case nir_op_usub_sat: {
2035       agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_neg(agx_abs(s1)), 0);
2036       I->saturate = true;
2037       return I;
2038    }
2039 
2040    case nir_op_fsin_agx: {
2041       agx_index fixup = agx_sin_pt_1(b, s0);
2042       agx_index sinc = agx_sin_pt_2(b, fixup);
2043       return agx_fmul_to(b, dst, sinc, fixup);
2044    }
2045 
2046    case nir_op_f2i16:
2047       return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_S16), s0,
2048                             AGX_ROUND_RTZ);
2049 
2050    case nir_op_f2i32:
2051       return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_S32), s0,
2052                             AGX_ROUND_RTZ);
2053 
2054    case nir_op_f2u16:
2055       return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_U16), s0,
2056                             AGX_ROUND_RTZ);
2057 
2058    case nir_op_f2u32:
2059       return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_U32), s0,
2060                             AGX_ROUND_RTZ);
2061 
2062    case nir_op_u2f16:
2063    case nir_op_u2f32: {
2064       if (src_sz == 64)
2065          unreachable("64-bit conversions unimplemented");
2066 
2067       enum agx_convert mode = (src_sz == 32)   ? AGX_CONVERT_U32_TO_F
2068                               : (src_sz == 16) ? AGX_CONVERT_U16_TO_F
2069                                                : AGX_CONVERT_U8_TO_F;
2070 
2071       return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
2072    }
2073 
2074    case nir_op_i2f16:
2075    case nir_op_i2f32: {
2076       if (src_sz == 64)
2077          unreachable("64-bit conversions unimplemented");
2078 
2079       enum agx_convert mode = (src_sz == 32)   ? AGX_CONVERT_S32_TO_F
2080                               : (src_sz == 16) ? AGX_CONVERT_S16_TO_F
2081                                                : AGX_CONVERT_S8_TO_F;
2082 
2083       return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
2084    }
2085 
2086    case nir_op_pack_32_2x16_split:
2087    case nir_op_pack_64_2x32_split: {
2088       agx_index idx[] = {s0, s1};
2089       return agx_emit_collect_to(b, dst, 2, idx);
2090    }
2091 
2092    case nir_op_unpack_64_2x32_split_x:
2093    case nir_op_unpack_32_2x16_split_x:
2094       return agx_subdivide_to(b, dst, s0, 0);
2095 
2096    case nir_op_unpack_64_2x32_split_y:
2097    case nir_op_unpack_32_2x16_split_y:
2098       return agx_subdivide_to(b, dst, s0, 1);
2099 
2100    case nir_op_vec2:
2101    case nir_op_vec3:
2102    case nir_op_vec4: {
2103       agx_index idx[] = {s0, s1, s2, s3};
2104       return agx_emit_collect_to(b, dst, srcs, idx);
2105    }
2106 
2107    case nir_op_vec8:
2108    case nir_op_vec16:
2109       unreachable("should've been lowered");
2110 
2111    default:
2112       fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
2113       unreachable("Unhandled ALU instruction");
2114    }
2115 }
2116 
2117 static enum agx_lod_mode
2118 agx_lod_mode_for_nir(nir_texop op, bool biased, bool min_lod, bool lod_is_zero)
2119 {
2120    switch (op) {
2121    case nir_texop_tex:
2122    case nir_texop_tg4:
2123       /* We could support this for tex, but it's never actually seen because tex
2124        * is always turned into txb to implement sampler LOD bias in Vulkan.
2125        */
2126       assert(!min_lod && "unimplemented");
2127 
2128       return AGX_LOD_MODE_AUTO_LOD;
2129    case nir_texop_txb:
2130       return min_lod ? AGX_LOD_MODE_AUTO_LOD_BIAS_MIN
2131                      : AGX_LOD_MODE_AUTO_LOD_BIAS;
2132    case nir_texop_lod:
2133       assert(!min_lod);
2134       return biased ? AGX_LOD_MODE_AUTO_LOD_BIAS : AGX_LOD_MODE_AUTO_LOD;
2135    case nir_texop_txd:
2136       return min_lod ? AGX_LOD_MODE_LOD_GRAD_MIN : AGX_LOD_MODE_LOD_GRAD;
2137    case nir_texop_txl:
2138       assert(!min_lod);
2139       return AGX_LOD_MODE_LOD_MIN;
2140    case nir_texop_txf:
2141       assert(!min_lod);
2142       return lod_is_zero ? AGX_LOD_MODE_AUTO_LOD : AGX_LOD_MODE_LOD_MIN;
2143    case nir_texop_txf_ms:
2144       assert(!min_lod);
2145       assert(lod_is_zero && "no mipmapping");
2146       return AGX_LOD_MODE_AUTO_LOD;
2147    default:
2148       unreachable("Unhandled texture op");
2149    }
2150 }
2151 
2152 static enum agx_gather
2153 agx_gather_for_nir(nir_tex_instr *tex)
2154 {
2155    if (tex->op == nir_texop_tg4) {
2156       enum agx_gather components[] = {
2157          AGX_GATHER_R,
2158          AGX_GATHER_G,
2159          AGX_GATHER_B,
2160          AGX_GATHER_A,
2161       };
2162 
2163       assert(tex->component < ARRAY_SIZE(components));
2164       return components[tex->component];
2165    } else {
2166       return AGX_GATHER_NONE;
2167    }
2168 }
2169 
2170 static void
2171 agx_emit_tex(agx_builder *b, nir_tex_instr *instr)
2172 {
2173    agx_index coords = agx_null(), bindless = agx_immediate(0),
2174              texture = agx_immediate(instr->texture_index),
2175              sampler = agx_immediate(0), lod = agx_immediate(0),
2176              compare = agx_null(), packed_offset = agx_null(),
2177              min_lod = agx_null();
2178 
2179    bool lod_is_zero = true;
2180 
2181    for (unsigned i = 0; i < instr->num_srcs; ++i) {
2182       agx_index index = agx_src_index(&instr->src[i].src);
2183 
2184       switch (instr->src[i].src_type) {
2185       case nir_tex_src_backend1:
2186          coords = index;
2187          break;
2188 
2189       case nir_tex_src_backend2:
2190          packed_offset = index;
2191          break;
2192 
2193       case nir_tex_src_lod:
2194       case nir_tex_src_bias:
2195          lod = index;
2196          lod_is_zero = nir_src_is_const(instr->src[i].src) &&
2197                        nir_src_as_uint(instr->src[i].src) == 0;
2198          break;
2199 
2200       case nir_tex_src_min_lod:
2201          assert(index.size == AGX_SIZE_16);
2202          min_lod = index;
2203          break;
2204 
2205       case nir_tex_src_comparator:
2206          assert(index.size == AGX_SIZE_32);
2207          compare = index;
2208          break;
2209 
2210       case nir_tex_src_texture_offset:
2211          texture = index;
2212          break;
2213       case nir_tex_src_sampler_handle:
2214          sampler = index;
2215          break;
2216 
2217       case nir_tex_src_texture_handle:
2218          texture =
2219             agx_translate_bindless_handle(b, &instr->src[i].src, &bindless);
2220          break;
2221 
2222       case nir_tex_src_ddx: {
2223          int y_idx = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
2224          assert(y_idx >= 0 && "we only handle gradients");
2225 
2226          int min_idx = nir_tex_instr_src_index(instr, nir_tex_src_min_lod);
2227          bool has_min = min_idx >= 0;
2228          agx_index min;
2229 
2230          unsigned n = nir_tex_instr_src_size(instr, y_idx);
2231          assert((n == 2 || n == 3) && "other sizes not supported");
2232 
2233          agx_index index2 = agx_src_index(&instr->src[y_idx].src);
2234 
2235          if (has_min) {
2236             min = agx_src_index(&instr->src[min_idx].src);
2237 
2238             /* Undef extend to 32-bit since our IR is iffy */
2239             min = agx_pad_to_32(b, min);
2240          }
2241 
2242          /* We explicitly don't care about the split cache for this */
2243          unsigned chans = (2 * n) + (has_min ? 1 : 0);
2244          lod = agx_vec_temp(b->shader, AGX_SIZE_32, chans);
2245          agx_instr *I = agx_collect_to(b, lod, chans);
2246 
2247          for (unsigned i = 0; i < n; ++i) {
2248             I->src[(2 * i) + 0] = agx_emit_extract(b, index, i);
2249             I->src[(2 * i) + 1] = agx_emit_extract(b, index2, i);
2250          }
2251 
2252          if (has_min)
2253             I->src[2 * n] = min;
2254 
2255          break;
2256       }
2257 
2258       case nir_tex_src_ddy:
2259          /* handled above */
2260          break;
2261 
2262       default:
2263          unreachable("Unexpected texture source");
2264       }
2265    }
2266 
2267    enum agx_lod_mode lod_mode = agx_lod_mode_for_nir(
2268       instr->op, nir_tex_instr_src_index(instr, nir_tex_src_bias) >= 0,
2269       nir_tex_instr_src_index(instr, nir_tex_src_min_lod) >= 0, lod_is_zero);
2270 
2271    if (lod_mode == AGX_LOD_MODE_AUTO_LOD) {
2272       /* Ignored logically but asserted 0 */
2273       lod = agx_immediate(0);
2274    } else if (lod_mode == AGX_LOD_MODE_AUTO_LOD_BIAS_MIN) {
2275       /* Combine min with lod */
2276       lod = agx_vec2(b, lod, min_lod);
2277    }
2278 
2279    agx_index dst = agx_def_index(&instr->def);
2280 
2281    /* Pack shadow reference value (compare) and packed offset together */
2282    agx_index compare_offset = agx_null();
2283 
2284    if (!agx_is_null(compare) && !agx_is_null(packed_offset))
2285       compare_offset = agx_vec2(b, compare, packed_offset);
2286    else if (!agx_is_null(packed_offset))
2287       compare_offset = packed_offset;
2288    else if (!agx_is_null(compare))
2289       compare_offset = compare;
2290 
2291    agx_index tmp = agx_vec_temp(b->shader, dst.size, 4);
2292    agx_instr *I = agx_texture_sample_to(
2293       b, tmp, coords, lod, bindless, texture, sampler, compare_offset,
2294       agx_tex_dim(instr->sampler_dim, instr->is_array), lod_mode, 0,
2295       !agx_is_null(packed_offset), !agx_is_null(compare),
2296       instr->op == nir_texop_lod, agx_gather_for_nir(instr));
2297 
2298    if (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms) {
2299       I->op = AGX_OPCODE_TEXTURE_LOAD;
2300       b->shader->out->uses_txf = true;
2301    }
2302 
2303    /* Destination masking doesn't seem to work properly for gathers (because
2304     * it's mostly pointless), but it does show up in the lowering of
2305     * textureGatherOffsets. Don't try to mask the destination for gathers.
2306     */
2307    bool masked = (instr->op != nir_texop_tg4);
2308    I->mask = agx_expand_tex_to(b, &instr->def, tmp, masked);
2309 }
2310 
2311 /*
2312  * Determine if a NIR loop (CF list) uses a continue jump, including within
2313  * if-else statements but not including nested loops.
2314  */
2315 static bool
2316 cf_list_uses_continue(struct exec_list *list)
2317 {
2318    foreach_list_typed(nir_cf_node, node, node, list) {
2319       if (node->type == nir_cf_node_block) {
2320          nir_block *block = nir_cf_node_as_block(node);
2321 
2322          nir_foreach_instr(instr, block) {
2323             if (instr->type == nir_instr_type_jump &&
2324                 nir_instr_as_jump(instr)->type == nir_jump_continue)
2325                return true;
2326          }
2327       } else if (node->type == nir_cf_node_if) {
2328          nir_if *nif = nir_cf_node_as_if(node);
2329 
2330          if (cf_list_uses_continue(&nif->then_list) ||
2331              cf_list_uses_continue(&nif->else_list))
2332             return true;
2333       } else {
2334          assert(node->type == nir_cf_node_loop && "don't care about nesting");
2335       }
2336    }
2337 
2338    return false;
2339 }
2340 
2341 static bool
2342 loop_uses_continue(nir_loop *loop)
2343 {
2344    return cf_list_uses_continue(&loop->body);
2345 }
2346 
2347 /*
2348  * NIR loops are treated as a pair of AGX loops:
2349  *
2350  *    do {
2351  *       do {
2352  *          ...
2353  *       } while (0);
2354  *    } while (cond);
2355  *
2356  * By manipulating the nesting counter, we may break out of nested loops, so
2357  * under the model, both break and continue may be implemented as breaks, where
2358  * break breaks out of the outer loop (2 layers) and continue breaks out of the
2359  * inner loop (1 layer).
2360  *
2361  * After manipulating the nesting counter directly, pop_exec #0 must be used to
2362  * flush the update to the execution mask.
2363  */
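/*
 * Concretely (see agx_emit_jump below): continue breaks out of 1 extra
 * nesting level, while break breaks out of 2 extra levels when the loop uses
 * continue (so the inner loop exists) and only 1 otherwise.
 */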
2364 static void
2365 agx_emit_jump(agx_builder *b, nir_jump_instr *instr)
2366 {
2367    agx_context *ctx = b->shader;
2368 
2369    if (instr->type == nir_jump_halt) {
2370       agx_stop(b);
2371       ctx->current_block->unconditional_jumps = true;
2372       return;
2373    }
2374 
2375    assert(instr->type == nir_jump_break || instr->type == nir_jump_continue);
2376 
2377    /* Break out of either one or two loops */
2378    unsigned nestings = b->shader->loop_nesting;
2379 
2380    if (instr->type == nir_jump_continue) {
2381       nestings += 1;
2382       agx_block_add_successor(ctx->current_block, ctx->continue_block);
2383    } else if (instr->type == nir_jump_break) {
2384       nestings += ctx->loop_continues ? 2 : 1;
2385       agx_block_add_successor(ctx->current_block, ctx->break_block);
2386    }
2387 
2388    agx_break(b, nestings, ctx->break_block);
2389    ctx->current_block->unconditional_jumps = true;
2390 }
2391 
2392 static void
2393 agx_emit_phi(agx_builder *b, nir_phi_instr *instr)
2394 {
2395    agx_instr *I =
2396       agx_phi_to(b, agx_def_index(&instr->def), exec_list_length(&instr->srcs));
2397 
2398    /* Deferred */
2399    I->phi = instr;
2400 }
2401 
2402 /* Look up the AGX block corresponding to a given NIR block. Used when
2403  * translating phi nodes after emitting all blocks.
2404  */
2405 static agx_block *
2406 agx_from_nir_block(agx_context *ctx, nir_block *block)
2407 {
2408    return ctx->indexed_nir_blocks[block->index];
2409 }
2410 
2411 static void
2412 agx_emit_phi_deferred(agx_context *ctx, agx_block *block, agx_instr *I)
2413 {
2414    nir_phi_instr *phi = I->phi;
2415    I->phi = NULL;
2416 
2417    /* Guaranteed by lower_phis_to_scalar */
2418    assert(phi->def.num_components == 1);
2419 
2420    nir_foreach_phi_src(src, phi) {
2421       agx_block *pred = agx_from_nir_block(ctx, src->pred);
2422       unsigned i = agx_predecessor_index(block, pred);
2423       assert(i < I->nr_srcs);
2424 
2425       I->src[i] = agx_src_index(&src->src);
2426    }
2427 }
2428 
2429 static void
2430 agx_emit_phis_deferred(agx_context *ctx)
2431 {
2432    agx_foreach_block(ctx, block) {
2433       agx_foreach_phi_in_block(block, I)
2434          agx_emit_phi_deferred(ctx, block, I);
2435    }
2436 }
2437 
2438 static void
2439 agx_emit_undef(agx_builder *b, nir_undef_instr *instr)
2440 {
2441    /* For now, just lower undefs to zero. This doesn't matter too much, since
2442     * the lowering happens in NIR and this just allows for late lowering passes
2443     * to result in undefs.
2444     */
2445    if (instr->def.num_components > 1) {
2446       assert(instr->def.num_components <= 4);
2447       agx_index zero = agx_mov_imm(b, instr->def.bit_size, 0);
2448 
2449       agx_emit_collect_to(b, agx_def_index(&instr->def),
2450                           instr->def.num_components,
2451                           (agx_index[4]){zero, zero, zero, zero});
2452    } else {
2453       agx_mov_imm_to(b, agx_def_index(&instr->def), 0);
2454    }
2455 }
2456 
2457 static void
2458 agx_emit_instr(agx_builder *b, struct nir_instr *instr)
2459 {
2460    switch (instr->type) {
2461    case nir_instr_type_load_const:
2462       agx_emit_load_const(b, nir_instr_as_load_const(instr));
2463       break;
2464 
2465    case nir_instr_type_intrinsic:
2466       agx_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
2467       break;
2468 
2469    case nir_instr_type_alu:
2470       agx_emit_alu(b, nir_instr_as_alu(instr));
2471       break;
2472 
2473    case nir_instr_type_tex:
2474       agx_emit_tex(b, nir_instr_as_tex(instr));
2475       break;
2476 
2477    case nir_instr_type_jump:
2478       agx_emit_jump(b, nir_instr_as_jump(instr));
2479       break;
2480 
2481    case nir_instr_type_phi:
2482       agx_emit_phi(b, nir_instr_as_phi(instr));
2483       break;
2484 
2485    case nir_instr_type_undef:
2486       agx_emit_undef(b, nir_instr_as_undef(instr));
2487       break;
2488 
2489    default:
2490       unreachable("should've been lowered");
2491    }
2492 }
2493 
2494 static agx_block *
2495 agx_create_block(agx_context *ctx)
2496 {
2497    agx_block *blk = rzalloc(ctx, agx_block);
2498 
2499    util_dynarray_init(&blk->predecessors, blk);
2500 
2501    return blk;
2502 }
2503 
2504 static agx_block *
2505 emit_block(agx_context *ctx, nir_block *block)
2506 {
2507    if (ctx->after_block) {
2508       ctx->current_block = ctx->after_block;
2509       ctx->after_block = NULL;
2510    } else {
2511       ctx->current_block = agx_create_block(ctx);
2512    }
2513 
2514    agx_block *blk = ctx->current_block;
2515    list_addtail(&blk->link, &ctx->blocks);
2516    list_inithead(&blk->instructions);
2517 
2518    ctx->indexed_nir_blocks[block->index] = blk;
2519 
2520    agx_builder _b = agx_init_builder(ctx, agx_after_block(blk));
2521 
2522    nir_foreach_instr(instr, block) {
2523       agx_emit_instr(&_b, instr);
2524    }
2525 
2526    return blk;
2527 }
2528 
2529 static agx_block *emit_cf_list(agx_context *ctx, struct exec_list *list);
2530 
2531 /* Emit if-else as
2532  *
2533  *    if_icmp cond != 0
2534  *       ...
2535  *    else_icmp cond == 0
2536  *       ...
2537  *    pop_exec
2538  *
2539  * If the else is empty, we can omit the else_icmp. This happens elsewhere, as
2540  * an empty else block can become nonempty after RA due to phi lowering. This is
2541  * not usually optimal, but it's a start.
2542  */
2543 
2544 static void
2545 emit_if(agx_context *ctx, nir_if *nif)
2546 {
2547    agx_block *first_block = ctx->current_block;
2548    agx_builder _b = agx_init_builder(ctx, agx_after_block(first_block));
2549    agx_index cond = agx_src_index(&nif->condition);
2550 
2551    agx_instr *if_ = agx_if_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, true,
2552                                 NULL /* filled in later */);
2553    ctx->loop_nesting++;
2554    ctx->total_nesting++;
2555 
2556    /* Emit the two subblocks. */
2557    agx_block *if_block = emit_cf_list(ctx, &nif->then_list);
2558    agx_block *end_then = ctx->current_block;
2559 
2560    _b.cursor = agx_after_block(ctx->current_block);
2561 
2562    agx_block *else_block = emit_cf_list(ctx, &nif->else_list);
2563    agx_block *end_else = ctx->current_block;
2564 
2565    /* If the "if" fails, we fallthrough to the else */
2566    if_->target = else_block;
2567 
2568    /* Insert an else instruction at the beginning of the else block. We use
2569     * "else_fcmp 0.0, 0.0, eq" as unconditional else, matching the blob.
2570     *
2571     * If it fails, we fall through to the logical end of the last else block.
2572     */
2573    _b.cursor = agx_before_block(else_block);
2574    agx_else_fcmp(&_b, agx_zero(), agx_zero(), 1, AGX_FCOND_EQ, false, end_else);
2575 
2576    ctx->after_block = agx_create_block(ctx);
2577 
2578    agx_block_add_successor(first_block, if_block);
2579    agx_block_add_successor(first_block, else_block);
2580    agx_block_add_successor(end_then, ctx->after_block);
2581    agx_block_add_successor(end_else, ctx->after_block);
2582 
2583    _b.cursor = agx_after_block(ctx->current_block);
2584    agx_pop_exec(&_b, 1);
2585    ctx->loop_nesting--;
2586    ctx->total_nesting--;
2587 }
2588 
2589 static void
2590 emit_loop(agx_context *ctx, nir_loop *nloop)
2591 {
2592    assert(!nir_loop_has_continue_construct(nloop));
2593    /* We only track nesting within the innermost loop, so push and reset */
2594    unsigned pushed_nesting = ctx->loop_nesting;
2595    ctx->loop_nesting = 0;
2596    ctx->total_nesting++;
2597 
2598    bool old_continues = ctx->loop_continues;
2599    ctx->loop_continues = loop_uses_continue(nloop);
2600 
2601    agx_block *popped_break = ctx->break_block;
2602    agx_block *popped_continue = ctx->continue_block;
2603 
2604    ctx->break_block = agx_create_block(ctx);
2605    ctx->continue_block = agx_create_block(ctx);
2606 
2607    /* If we are emitting a loop inside other control flow, there might be
2608     * threads masked off (TODO: divergence analysis), so push_exec them so
2609     * we get the lower nesting count values to ourselves.
2610     */
2611    agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
2612    if (ctx->total_nesting > 1)
2613       agx_push_exec(&_b, ctx->loop_continues ? 2 : 1);
2614 
2615    /* Fallthrough to body */
2616    agx_block_add_successor(ctx->current_block, ctx->continue_block);
2617 
2618    /* Emit the body */
2619    ctx->after_block = ctx->continue_block;
2620    ctx->after_block->loop_header = true;
2621    agx_block *start_block = emit_cf_list(ctx, &nloop->body);
2622 
2623    /* If we used any continue jumps, we need to reactivate the continued
2624     * threads. We do this with an always true while_icmp, which behaves like:
2625     *
2626     *    if (r0l == 1) {
2627     *       r0l = 0;
2628     *    }
2629     *    update_exec
2630     *
2631     * If we did not use continue, this would be a no-op so it is omitted.
2632     */
2633    _b.cursor = agx_after_block(ctx->current_block);
2634 
2635    if (ctx->loop_continues) {
2636       agx_while_icmp(
2637          &_b, agx_zero(), agx_zero(), 2, AGX_ICOND_UEQ, false,
2638          NULL /* no semantic target, used purely for side effects */);
2639    }
2640 
2641    agx_jmp_exec_any(&_b, start_block);
2642    agx_pop_exec(&_b, ctx->loop_continues ? 2 : 1);
2643    agx_block_add_successor(ctx->current_block, ctx->continue_block);
2644 
2645    /* Pop off */
2646    ctx->after_block = ctx->break_block;
2647    ctx->break_block = popped_break;
2648    ctx->continue_block = popped_continue;
2649 
2650    /* Update shader-db stats */
2651    ++ctx->loop_count;
2652 
2653    /* All nested control flow must have finished */
2654    assert(ctx->loop_nesting == 0);
2655 
2656    /* Restore loop nesting (we might be inside an if inside an outer loop) */
2657    ctx->loop_nesting = pushed_nesting;
2658    ctx->total_nesting--;
2659    ctx->loop_continues = old_continues;
2660 }
2661 
2662 /* Before the first control flow structure, the nesting counter needs to be
2663  * zeroed for correct operation. This only happens at most once, since by
2664  * definition this occurs at the end of the first block, which dominates the
2665  * rest of the program. */
2666 
2667 static void
2668 emit_first_cf(agx_context *ctx)
2669 {
2670    if (ctx->any_cf)
2671       return;
2672 
2673    agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
2674    agx_begin_cf(&_b);
2675    ctx->any_cf = true;
2676 }
2677 
2678 static agx_block *
2679 emit_cf_list(agx_context *ctx, struct exec_list *list)
2680 {
2681    agx_block *start_block = NULL;
2682 
2683    foreach_list_typed(nir_cf_node, node, node, list) {
2684       switch (node->type) {
2685       case nir_cf_node_block: {
2686          agx_block *block = emit_block(ctx, nir_cf_node_as_block(node));
2687 
2688          if (!start_block)
2689             start_block = block;
2690 
2691          break;
2692       }
2693 
2694       case nir_cf_node_if:
2695          emit_first_cf(ctx);
2696          emit_if(ctx, nir_cf_node_as_if(node));
2697          break;
2698 
2699       case nir_cf_node_loop:
2700          emit_first_cf(ctx);
2701          emit_loop(ctx, nir_cf_node_as_loop(node));
2702          break;
2703 
2704       default:
2705          unreachable("Unknown control flow");
2706       }
2707    }
2708 
2709    return start_block;
2710 }
2711 
2712 static void
2713 agx_set_st_vary_final(agx_context *ctx)
2714 {
2715    agx_foreach_instr_global_rev(ctx, I) {
2716       if (I->op == AGX_OPCODE_ST_VARY) {
2717          I->last = true;
2718          return;
2719       }
2720    }
2721 
2722    /* If we got here, there was no varying written. We need to mark that. */
2723    agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
2724    agx_builder _b = agx_init_builder(ctx, agx_after_block_logical(last_block));
2725    agx_no_varyings(&_b);
2726 }
2727 
2728 static int
2729 agx_dump_stats(agx_context *ctx, unsigned size, char **out)
2730 {
2731    unsigned nr_ins = 0, spills = 0, fills = 0;
2732 
2733    /* Count instructions */
2734    agx_foreach_instr_global(ctx, I) {
2735       nr_ins++;
2736 
2737       if (I->op == AGX_OPCODE_STACK_STORE)
2738          spills++;
2739       else if (I->op == AGX_OPCODE_STACK_LOAD)
2740          fills++;
2741    }
2742 
2743    struct agx_cycle_estimate cycles = agx_estimate_cycles(ctx);
2744 
2745    unsigned nr_threads =
2746       agx_occupancy_for_register_count(ctx->max_reg).max_threads;
2747 
2748    return asprintf(
2749       out,
2750       "%s shader: %u inst, %u alu, %u fscib, %u ic, %u bytes, %u regs, "
2751       "%u uniforms, %u scratch, %u threads, %u loops, "
2752       "%u:%u spills:fills",
2753       gl_shader_stage_name(ctx->stage), nr_ins, cycles.alu, cycles.f_scib,
2754       cycles.ic, size, ctx->max_reg, ctx->out->push_count, ctx->scratch_size_B,
2755       nr_threads, ctx->loop_count, spills, fills);
2756 }
2757 
2758 static bool
2759 agx_lower_sincos_filter(const nir_instr *instr, UNUSED const void *_)
2760 {
2761    if (instr->type != nir_instr_type_alu)
2762       return false;
2763 
2764    nir_alu_instr *alu = nir_instr_as_alu(instr);
2765    return alu->op == nir_op_fsin || alu->op == nir_op_fcos;
2766 }
2767 
2768 /* Sine and cosine are implemented via the sin_pt_1 and sin_pt_2 opcodes for
2769  * heavy lifting. sin_pt_2 implements sinc in the first quadrant, expressed in
2770  * turns (sin (tau x) / x), while sin_pt_1 implements a piecewise sign/offset
2771  * fixup to transform a quadrant angle [0, 4] to [-1, 1]. The NIR opcode
2772  * fsin_agx models the fixup, sinc, and multiply to obtain sine, so we just
2773  * need to change units from radians to quadrants modulo turns. Cosine is
2774  * implemented by shifting by one quadrant: cos(x) = sin(x + tau/4).
2775  */
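/*
 * Worked example: for x = pi/2, turns = x / tau = 0.25, so quadrants =
 * 4 * fract(0.25) = 1.0 and fsin_agx(1.0) evaluates sin(pi/2) = 1. For fcos
 * the extra quarter turn maps the same input to quadrant 2, giving
 * cos(pi/2) = sin(pi) = 0.
 */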
2776 
2777 static nir_def *
2778 agx_lower_sincos_impl(struct nir_builder *b, nir_instr *instr, UNUSED void *_)
2779 {
2780    nir_alu_instr *alu = nir_instr_as_alu(instr);
2781    nir_def *x = nir_mov_alu(b, alu->src[0], 1);
2782    nir_def *turns = nir_fmul_imm(b, x, M_1_PI * 0.5f);
2783 
2784    if (alu->op == nir_op_fcos)
2785       turns = nir_fadd_imm(b, turns, 0.25f);
2786 
2787    nir_def *quadrants = nir_fmul_imm(b, nir_ffract(b, turns), 4.0);
2788    return nir_fsin_agx(b, quadrants);
2789 }
2790 
2791 static bool
2792 agx_lower_sincos(nir_shader *shader)
2793 {
2794    return nir_shader_lower_instructions(shader, agx_lower_sincos_filter,
2795                                         agx_lower_sincos_impl, NULL);
2796 }
2797 
2798 static bool
2799 agx_lower_front_face(struct nir_builder *b, nir_intrinsic_instr *intr,
2800                      UNUSED void *data)
2801 {
2802    if (intr->intrinsic != nir_intrinsic_load_front_face)
2803       return false;
2804 
2805    nir_def *def = &intr->def;
2806    assert(def->bit_size == 1);
2807 
2808    b->cursor = nir_before_instr(&intr->instr);
2809    nir_def_rewrite_uses(def, nir_inot(b, nir_load_back_face_agx(b, 1)));
2810    return true;
2811 }
2812 
2813 /*
2814  * Standard NIR optimization loop. This is run in agx_preprocess_nir, then once
2815  * again at shader variant compile time. Unless there was a complex shader key,
2816  * the latter run should be almost a no-op.
2817  */
2818 static void
2819 agx_optimize_loop_nir(nir_shader *nir)
2820 {
2821    bool progress;
2822 
2823    do {
2824       progress = false;
2825 
2826       NIR_PASS(progress, nir, nir_copy_prop);
2827       NIR_PASS(progress, nir, nir_opt_remove_phis);
2828       NIR_PASS(progress, nir, nir_opt_dce);
2829       NIR_PASS(progress, nir, nir_opt_dead_cf);
2830       NIR_PASS(progress, nir, nir_opt_cse);
2831       NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
2832       NIR_PASS(progress, nir, nir_opt_phi_precision);
2833       NIR_PASS(progress, nir, nir_opt_algebraic);
2834       NIR_PASS(progress, nir, nir_opt_constant_folding);
2835       NIR_PASS(progress, nir, nir_opt_undef);
2836       NIR_PASS(progress, nir, nir_opt_loop_unroll);
2837    } while (progress);
2838 }
2839 
2840 bool
2841 agx_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
2842                      unsigned bit_size, unsigned num_components,
2843                      int64_t hole_size, nir_intrinsic_instr *low,
2844                      nir_intrinsic_instr *high, void *data)
2845 {
2846    if (hole_size > 0)
2847       return false;
2848 
2849    /* Must be aligned to the size of the load */
2850    unsigned align = nir_combined_align(align_mul, align_offset);
2851    if ((bit_size / 8) > align)
2852       return false;
2853 
2854    if (num_components > 4)
2855       return false;
2856 
2857    if (bit_size > 32)
2858       return false;
2859 
2860    return true;
2861 }
2862 
2863 static bool
2864 set_speculate(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *_)
2865 {
2866    if (!nir_intrinsic_has_access(intr))
2867       return false;
2868 
2869    nir_intrinsic_set_access(intr,
2870                             ACCESS_CAN_SPECULATE | nir_intrinsic_access(intr));
2871    return true;
2872 }
2873 
2874 static bool
2875 optimize_bounds(nir_builder *b, nir_intrinsic_instr *intr, void *data)
2876 {
2877    if (intr->intrinsic != nir_intrinsic_load_constant_agx)
2878       return false;
2879 
2880    assert(intr->def.bit_size > 1 && "no if-uses");
2881    nir_scalar srcs[2] = {{NULL}};
2882    unsigned use_count = 0;
2883 
2884    nir_alu_instr *first_use = NULL;
2885 
2886    nir_foreach_use(use, &intr->def) {
2887       /* All uses need to be bounds_agx */
2888       nir_instr *parent = nir_src_parent_instr(use);
2889       if (parent->type != nir_instr_type_alu)
2890          return false;
2891 
2892       nir_alu_instr *alu = nir_instr_as_alu(parent);
2893       if ((alu->op != nir_op_bounds_agx) || (alu->src[0].src.ssa != &intr->def))
2894          return false;
2895 
2896       assert(alu->def.num_components == 1 && alu->def.bit_size == 32);
2897 
2898       /* All bounds checks need a common offset and bounds */
2899       for (unsigned s = 0; s < 2; ++s) {
2900          nir_scalar this = nir_scalar_resolved(alu->src[1 + s].src.ssa,
2901                                                alu->src[1 + s].swizzle[0]);
2902 
2903          if (srcs[s].def == NULL)
2904             srcs[s] = this;
2905          else if (!nir_scalar_equal(srcs[s], this))
2906             return false;
2907 
2908          /* To avoid dominance problems, we must sink loads. */
2909          if (this.def->parent_instr->block != intr->instr.block) {
2910             return false;
2911          }
2912       }
2913 
2914       if (!first_use || first_use->def.index > alu->def.index) {
2915          first_use = alu;
2916       }
2917 
2918       ++use_count;
2919    }
2920 
2921    /* We've matched. Freeze the set of uses before changing things. */
2922    nir_alu_instr **uses = alloca(sizeof(nir_alu_instr *) * use_count);
2923 
2924    unsigned i = 0;
2925    nir_foreach_use(use, &intr->def) {
2926       nir_instr *parent = nir_src_parent_instr(use);
2927       uses[i++] = nir_instr_as_alu(parent);
2928    }
2929    assert(i == use_count && "should not have changed");
2930 
2931    /* Sink the load */
2932    nir_instr_remove(&intr->instr);
2933    b->cursor = nir_before_instr(&first_use->instr);
2934    nir_builder_instr_insert(b, &intr->instr);
2935 
2936    /* Now start rewriting. Grab some common variables */
2937    b->cursor = nir_before_instr(&intr->instr);
2938    nir_def *offset = nir_channel(b, srcs[0].def, srcs[0].comp);
2939    nir_def *bounds = nir_channel(b, srcs[1].def, srcs[1].comp);
2940 
2941    nir_def *in_bounds = nir_uge(b, bounds, offset);
2942    nir_def *zero = nir_imm_int(b, 0);
2943 
2944    nir_src *base_src = &intr->src[0];
2945    nir_src *offs_src = &intr->src[1];
2946 
2947    nir_def *base_lo = nir_unpack_64_2x32_split_x(b, base_src->ssa);
2948    nir_def *base_hi = nir_unpack_64_2x32_split_y(b, base_src->ssa);
2949 
2950    /* Bounds check the base/offset instead. We currently reserve the bottom
2951     * 2^36 of VA (this is driver/compiler ABI). With soft fault enabled, that
2952     * means any read of the lower region will return zero as required.
2953     *
2954     * Therefore, when out-of-bounds, we clamp the index to zero and the high
2955     * half of the address to zero. We don't need to clamp the low half of the
2956     * address. The resulting sum is thus:
2957     *
2958     *    0*(2^32) + lo + (index << shift)
2959     *
2960     * ...which will be in the unmapped zero region provided shift < 4.
2961     */
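   /* Rough sketch of the out-of-bounds case: with lo < 2^32 and
    * (index << shift) < 2^35 (shift < 4, unsigned), the clamped sum stays
    * below 2^36 and therefore inside the reserved zero region, so the read
    * returns zero as if it were robustly out-of-bounds.
    */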
2962    base_hi = nir_bcsel(b, in_bounds, base_hi, zero);
2963 
2964    /* Clamp index if the shift is too large or sign-extension used */
2965    if (nir_intrinsic_base(intr) >= 2 || nir_intrinsic_sign_extend(intr)) {
2966       nir_src_rewrite(offs_src, nir_bcsel(b, in_bounds, offs_src->ssa, zero));
2967    }
2968 
2969    nir_src_rewrite(base_src, nir_pack_64_2x32_split(b, base_lo, base_hi));
2970 
2971    /* Now that the load itself is bounds checked, all that's left is removing
2972     * the bounds checks on the output. This requires a little care to avoid an
2973     * infinite loop.
2974     *
2975     * Also note we cannot remove the uses here, because it would invalidate the
2976     * iterator inside intrinsics_pass. I hate C, don't you?
2977     */
2978    for (unsigned i = 0; i < use_count; ++i) {
2979       b->cursor = nir_after_instr(&uses[i]->instr);
2980       nir_def *chan = nir_channel(b, &intr->def, uses[i]->src[0].swizzle[0]);
2981       nir_def_rewrite_uses(&uses[i]->def, chan);
2982    }
2983 
2984    return true;
2985 }
2986 
2987 static void
2988 agx_optimize_nir(nir_shader *nir, bool soft_fault, uint16_t *preamble_size)
2989 {
2990    /* This runs only once up front since other optimizations don't affect it */
2991    NIR_PASS(_, nir, nir_opt_shrink_stores, true);
2992 
2993    agx_optimize_loop_nir(nir);
2994 
2995    /* If soft fault is enabled, we can freely speculate everything. That lets us
2996     * peephole select and form preambles more aggressively.
2997     */
2998    if (soft_fault) {
2999       NIR_PASS(_, nir, nir_shader_intrinsics_pass, set_speculate,
3000                nir_metadata_control_flow, NULL);
3001    }
3002 
3003    /* Peephole select again after setting the speculate flag but before
3004     * vectorizing. This cleans up short-circuit loads in unrolled loops.
3005     *
3006     * XXX: Set indirect_load_ok once we can investigate CTS flakes.
3007     */
3008    NIR_PASS(_, nir, nir_opt_peephole_select, 64, false, true);
3009 
3010    NIR_PASS(_, nir, nir_opt_load_store_vectorize,
3011             &(const nir_load_store_vectorize_options){
3012                .modes = nir_var_mem_global | nir_var_mem_constant |
3013                         nir_var_shader_temp,
3014                .callback = agx_mem_vectorize_cb,
3015             });
3016    NIR_PASS(_, nir, nir_lower_pack);
3017    NIR_PASS(_, nir, nir_opt_algebraic);
3018 
3019    /* Lower addressing modes. The sooner we do this, the sooner we get rid of
3020     * amul/aadd instructions and can let nir_opt_algebraic do its job. But we
3021     * want to vectorize first since nir_opt_load_store_vectorize doesn't know
3022     * how to handle our loads.
3023     */
3024    NIR_PASS(_, nir, agx_nir_lower_address);
3025 
3026    NIR_PASS_V(nir, nir_divergence_analysis);
3027    bool progress = false;
3028 
3029    static const nir_lower_subgroups_options subgroups_options = {
3030       .ballot_bit_size = 32,
3031       .ballot_components = 1,
3032       .lower_elect = true,
3033       .lower_subgroup_masks = true,
3034    };
3035 
3036    NIR_PASS(progress, nir, nir_opt_uniform_atomics, true);
3037    NIR_PASS(progress, nir, nir_opt_uniform_subgroup, &subgroups_options);
3038    if (progress) {
3039       NIR_PASS(_, nir, agx_nir_lower_subgroups);
3040    }
3041 
3042    /* The above create operations that need lowering/optimizing */
3043    do {
3044       progress = false;
3045 
3046       NIR_PASS(progress, nir, nir_opt_algebraic);
3047       NIR_PASS(progress, nir, nir_opt_constant_folding);
3048       NIR_PASS(progress, nir, nir_opt_dce);
3049    } while (progress);
3050 
3051    progress = false;
3052 
3053    /* If address lowering made progress, clean up before forming preambles.
3054     * Otherwise the optimized preambles might just be constants! Do it before
3055     * lowering int64 too, to avoid lowering constant int64 arithmetic.
3056     */
3057    if (progress) {
3058       NIR_PASS(_, nir, nir_opt_constant_folding);
3059       NIR_PASS(_, nir, nir_opt_dce);
3060    }
3061 
3062    /* Only lower int64 after optimizing address arithmetic, so that u2u64/i2i64
3063     * conversions remain.
3064     */
3065    progress = false;
3066    NIR_PASS(progress, nir, nir_lower_int64);
3067 
3068    /* If we lowered actual int64 arithmetic (not folded into the address
3069     * calculations), then clean up after the lowering.
3070     */
3071    if (progress) {
3072       do {
3073          progress = false;
3074 
3075          NIR_PASS(progress, nir, nir_opt_algebraic);
3076          NIR_PASS(progress, nir, nir_opt_constant_folding);
3077          NIR_PASS(progress, nir, nir_opt_dce);
3078       } while (progress);
3079    }
3080 
3081    /* Lower fmin/fmax before optimizing preambles so we can see across uniform
3082     * expressions.
3083     */
3084    NIR_PASS(_, nir, agx_nir_lower_fminmax);
3085 
3086    if (preamble_size && (!(agx_compiler_debug & AGX_DBG_NOPREAMBLE))) {
3087       unsigned temp = *preamble_size;
3088       NIR_PASS(_, nir, agx_nir_opt_preamble, &temp);
3089       *preamble_size = temp;
3090    }
3091 
3092    /* Forming preambles may dramatically reduce the instruction count
3093     * in certain blocks, causing some if-else statements to become
3094     * trivial. We want to peephole select those, given that control flow
3095     * prediction instructions are costly.
3096     *
3097     * We need to lower int64 again to deal with the resulting 64-bit csels.
3098     */
3099    NIR_PASS(_, nir, nir_opt_peephole_select, 64, false, true);
3100    NIR_PASS(_, nir, nir_lower_int64);
3101 
3102    /* We need to lower fmin/fmax again after nir_opt_algebraic_late due to f2fmp
3103     * wackiness. This is usually a no-op but is required for correctness in
3104     * GLES.
3105     */
3106    NIR_PASS(_, nir, nir_opt_algebraic_late);
3107    NIR_PASS(_, nir, agx_nir_lower_fminmax);
3108 
3109    /* Fuse add/sub/multiplies/shifts after running opt_algebraic_late to fuse
3110     * isub but before shifts are lowered.
3111     */
3112    do {
3113       progress = false;
3114 
3115       NIR_PASS(progress, nir, nir_opt_dce);
3116       NIR_PASS(progress, nir, nir_opt_cse);
3117       NIR_PASS(progress, nir, agx_nir_fuse_algebraic_late);
3118    } while (progress);
3119 
3120    /* Before optimizing bounds checks, we need to clean up and index the SSA
3121     * defs so optimize_bounds does the right thing.
3122     */
3123    NIR_PASS(_, nir, nir_copy_prop);
3124    NIR_PASS(_, nir, nir_opt_dce);
3125 
3126    nir_foreach_function_impl(impl, nir) {
3127       nir_index_ssa_defs(impl);
3128    }
3129 
3130    /* TODO: Reenable this pass. It's breaking Fallout 4 in ways I don't
3131     * understand yet.
3132     */
3133    if (soft_fault && 0) {
3134       NIR_PASS(_, nir, nir_shader_intrinsics_pass, optimize_bounds,
3135                nir_metadata_control_flow, NULL);
3136    }
3137 
3138    /* Do the remaining lowering late: it inserts ANDs for shifts, so we want it
3139     * to run after fusing constant shifts. Constant folding will clean up.
3140     */
3141    NIR_PASS(_, nir, agx_nir_lower_algebraic_late);
3142    NIR_PASS(_, nir, agx_nir_fuse_selects);
3143    NIR_PASS(_, nir, nir_opt_constant_folding);
3144    NIR_PASS(_, nir, nir_opt_combine_barriers, NULL, NULL);
3145    NIR_PASS(_, nir, nir_copy_prop);
3146    NIR_PASS(_, nir, nir_opt_dce);
3147    NIR_PASS(_, nir, nir_opt_cse);
3148    NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
3149    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
3150 
3151    /* Cleanup optimizations */
3152    nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
3153                                nir_move_load_input | nir_move_comparisons |
3154                                nir_move_copies | nir_move_load_ssbo |
3155                                nir_move_alu;
3156 
3157    NIR_PASS(_, nir, nir_opt_sink, move_all);
3158    NIR_PASS(_, nir, nir_opt_move, move_all);
3159    NIR_PASS(_, nir, nir_lower_phis_to_scalar, true);
3160 }
3161 
3162 /*
3163  * Varyings that are used as texture coordinates should be kept at fp32, because
3164  * fp16 does not have enough precision for large textures. It's technically
3165  * conformant not to, but every app gets this wrong.
3166  */
3167 static bool
3168 gather_texcoords(nir_builder *b, nir_instr *instr, void *data)
3169 {
3170    uint64_t *mask = data;
3171 
3172    if (instr->type != nir_instr_type_tex)
3173       return false;
3174 
3175    nir_tex_instr *tex = nir_instr_as_tex(instr);
3176 
3177    int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
3178    if (coord_idx < 0)
3179       return false;
3180 
3181    nir_src src = tex->src[coord_idx].src;
3182    nir_scalar x = nir_scalar_resolved(src.ssa, 0);
3183    nir_scalar y = nir_scalar_resolved(src.ssa, 1);
3184 
3185    if (x.def != y.def)
3186       return false;
3187 
3188    nir_instr *parent = x.def->parent_instr;
3189 
3190    if (parent->type != nir_instr_type_intrinsic)
3191       return false;
3192 
3193    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
3194 
3195    if (intr->intrinsic != nir_intrinsic_load_interpolated_input)
3196       return false;
3197 
3198    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
3199    *mask |= BITFIELD64_BIT(sem.location);
3200    return false;
3201 }
3202 
3203 static bool
3204 gather_interp(nir_builder *b, nir_intrinsic_instr *intr, void *data)
3205 {
3206    struct agx_interp_info *masks = data;
3207 
3208    if (intr->intrinsic == nir_intrinsic_load_input) {
3209       nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
3210       masks->flat |= BITFIELD64_RANGE(sem.location, sem.num_slots);
3211    } else if (intr->intrinsic == nir_intrinsic_load_interpolated_input &&
3212               nir_intrinsic_interp_mode(nir_src_as_intrinsic(intr->src[0])) ==
3213                  INTERP_MODE_NOPERSPECTIVE) {
3214       nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
3215       masks->linear |= BITFIELD64_RANGE(sem.location, sem.num_slots);
3216    }
3217 
3218    return false;
3219 }
3220 
3221 /*
3222  * Build a bit mask of varyings (by location) that are flatshaded and linear
3223  * shaded. This information is needed by the driver.
3224  */
3225 struct agx_interp_info
3226 agx_gather_interp_info(nir_shader *nir)
3227 {
3228    assert(nir->info.stage == MESA_SHADER_FRAGMENT);
3229 
3230    struct agx_interp_info masks = {0};
3231    nir_shader_intrinsics_pass(nir, gather_interp, nir_metadata_all, &masks);
3232    return masks;
3233 }
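
/* Illustrative driver-side use (a sketch; the slot tested here is only an
 * example, not something this compiler prescribes):
 *
 *    struct agx_interp_info interp = agx_gather_interp_info(fs);
 *    if (interp.flat & BITFIELD64_BIT(VARYING_SLOT_COL0))
 *       ...program flat shading for the first color varying...
 */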
3234 
3235 /*
3236  * Build a bit mask of varyings (by location) that are used as texture
3237  * coordinates. This information is needed by lower_mediump_io.
3238  */
3239 uint64_t
3240 agx_gather_texcoords(nir_shader *nir)
3241 {
3242    assert(nir->info.stage == MESA_SHADER_FRAGMENT);
3243 
3244    uint64_t mask = 0;
3245    nir_shader_instructions_pass(nir, gather_texcoords, nir_metadata_all, &mask);
3246    return mask;
3247 }
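
/* Illustrative driver-side use (a sketch; exactly how the mask feeds into the
 * mediump lowering is up to the driver):
 *
 *    uint64_t texcoords = agx_gather_texcoords(fs);
 *    ...keep varyings whose location bit is set in texcoords at fp32 when
 *    lowering I/O to mediump...
 */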
3248 
3249 static nir_mem_access_size_align
3250 mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
3251                          uint8_t bit_size, uint32_t align,
3252                          uint32_t align_offset, bool offset_is_const,
3253                          enum gl_access_qualifier access, const void *cb_data)
3254 {
3255    align = nir_combined_align(align, align_offset);
3256 
3257    assert(util_is_power_of_two_nonzero(align));
3258 
3259    if ((bytes & 1) || (align == 1))
3260       bit_size = 8;
3261    else if ((bytes & 2) || (align == 2))
3262       bit_size = 16;
3263    else if (bit_size >= 32)
3264       bit_size = 32;
3265 
3266    return (nir_mem_access_size_align){
3267       .num_components = MIN2(bytes / (bit_size / 8), 4),
3268       .bit_size = bit_size,
3269       .align = bit_size / 8,
3270       .shift = nir_mem_access_shift_method_scalar,
3271    };
3272 }
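
/* For example (illustrative): a 12-byte load with 4-byte combined alignment and
 * an original bit size of 32 comes back as bit_size = 32, num_components = 3,
 * while a 3-byte load with byte alignment is split to bit_size = 8,
 * num_components = 3.
 */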
3273 
3274 static unsigned
3275 lower_bit_size_callback(const nir_instr *instr, UNUSED void *_)
3276 {
3277    if (instr->type == nir_instr_type_intrinsic) {
3278       /* Handle small subgroup ops */
3279       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
3280 
3281       switch (intr->intrinsic) {
3282       case nir_intrinsic_reduce:
3283       case nir_intrinsic_exclusive_scan:
3284       case nir_intrinsic_inclusive_scan:
3285          /* The identity for iand doesn't work for lowered 1-bit booleans, so
3286           * lower that explicitly.
3287           */
3288          if (nir_intrinsic_reduction_op(intr) == nir_op_iand &&
3289              intr->def.bit_size == 1)
3290             return 16;
3291 
3292          /* In general, we have 16-bit ops instead of 8-bit, so lower those. */
3293          return intr->def.bit_size == 8 ? 16 : 0;
3294       default:
3295          return 0;
3296       }
3297    } else if (instr->type == nir_instr_type_alu) {
3298       /* Lower 8-bit ALU to 16-bit. We check the destination, as we do not want
3299        * to lower conversions from 8-bit to larger types. Those conversions get
3300        * implemented natively.
3301        */
3302       nir_alu_instr *alu = nir_instr_as_alu(instr);
3303       if (alu->def.bit_size == 8 && !is_conversion_to_8bit(alu->op))
3304          return 16;
3305       else if (alu->def.bit_size == 1 && alu->src[0].src.ssa->bit_size == 8)
3306          return 16 /* comparisons */;
3307    }
3308 
3309    return 0;
3310 }
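
/* For example (illustrative): an 8-bit nir_op_iadd reports 16 and is widened, a
 * u2u32 of an 8-bit source reports 0 and stays native, and an 8-bit comparison
 * producing a 1-bit boolean reports 16 as well.
 */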
3311 
3312 static bool
3313 lower_load_from_texture_handle(nir_builder *b, nir_intrinsic_instr *intr,
3314                                void *data)
3315 {
3316    if (intr->intrinsic != nir_intrinsic_load_from_texture_handle_agx)
3317       return false;
3318 
3319    /* Bindless handles are a vec2, where the first component is the (constant)
3320     * uniform register number and the second component is the byte offset.
3321     */
3322    nir_scalar uniform = nir_scalar_resolved(intr->src[0].ssa, 0);
3323    unsigned uniform_idx = nir_scalar_as_uint(uniform);
3324 
3325    b->cursor = nir_instr_remove(&intr->instr);
3326    nir_def *base = nir_load_preamble(b, 1, 64, uniform_idx);
3327    nir_def *offset = nir_u2u64(b, nir_channel(b, intr->src[0].ssa, 1));
3328 
3329    nir_def_rewrite_uses(&intr->def, nir_iadd(b, base, offset));
3330    return true;
3331 }
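
/* For example (illustrative): a handle of vec2(5, 32) loads the 64-bit base
 * from preamble uniform 5 and adds a 32-byte offset to form the final address.
 */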
3332 
3333 static void
3334 agx_remove_unreachable_block(agx_block *block)
3335 {
3336    /* Delete the edges */
3337    agx_foreach_successor(block, succ) {
3338       unsigned block_idx = agx_predecessor_index(succ, block);
3339 
3340       /* Remove the corresponding predecessor from the successor */
3341       struct util_dynarray *blocks = &succ->predecessors;
3342       int remaining = agx_num_predecessors(succ) - (block_idx + 1);
3343       assert(remaining >= 0);
3344 
3345       memmove(util_dynarray_element(blocks, agx_block *, block_idx),
3346               util_dynarray_element(blocks, agx_block *, block_idx + 1),
3347               remaining * sizeof(agx_block *));
3348       blocks->size -= sizeof(agx_block *);
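      /* Illustrative example: if succ had predecessors [A, B, C] and B (at
       * block_idx 1) is the unreachable block, the array becomes [A, C] and
       * each phi in succ drops src[1] below.
       */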
3349 
3350       /* Remove the corresponding source from the phis */
3351       agx_foreach_phi_in_block(succ, phi) {
3352          assert(block_idx + 1 <= phi->nr_srcs);
3353 
3354          memmove(phi->src + block_idx, phi->src + block_idx + 1,
3355                  (phi->nr_srcs - (block_idx + 1)) * sizeof(phi->src[0]));
3356 
3357          phi->nr_srcs--;
3358 
3359          /* This might cause phis to become trivial. Lower 1-source phis to
3360           * moves and let copyprop take it from here.
3361           */
3362          if (phi->nr_srcs == 1) {
3363             phi->op = AGX_OPCODE_MOV;
3364          }
3365       }
3366    }
3367 
3368    /* Remove the successor from the predecessor. */
3369    block->successors[0] = NULL;
3370    block->successors[1] = NULL;
3371 
3372    /* Note: we do not remove the block itself, although it is now fully orphaned
3373     * in the control flow graph. We still need it in source order if it has any
3374     * pop_exec instructions, for a loop continue block.
3375     *
3376     * TODO: Is there a better way to handle this?
3377     *
3378     * Affects: dEQP-VK.graphicsfuzz.cov-matching-if-always-true-inside-loop
3379     */
3380 }
3381 
3382 /*
3383  * NIR sometimes contains unreachable blocks (e.g. due to infinite loops). These
3384  * blocks have no predecessors, but do have successors and can contribute to
3385  * phis. They are dead and do not need to be here. Further, they violate the IR
3386  * invariant:
3387  *
3388  *    Live-in sources are live-out in all predecessors.
3389  *
3390  * ...which RA depends on when handling live range splits. The simplest solution
3391  * is to simply delete these dead blocks. Fortunately, because they are
3392  * unreachable, this does not have any ill effects. Notably, this cannot
3393  * introduce critical edges.
3394  *
3395  * Deleting a block may cause a successor to become unreachable, so we use a
3396  * fixed-point algorithm to converge.
3397  */
3398 static void
3399 agx_remove_unreachable_blocks(agx_context *ctx)
3400 {
3401    agx_block *start = agx_start_block(ctx);
3402    bool progress;
3403 
3404    do {
3405       progress = false;
3406 
3407       agx_foreach_block_safe(ctx, pred) {
3408          if (pred != start && agx_num_predecessors(pred) == 0 &&
3409              agx_num_successors(pred) > 0) {
3410 
3411             agx_remove_unreachable_block(pred);
3412             progress = true;
3413          }
3414       }
3415    } while (progress);
3416 }
3417 
3418 static bool
3419 agx_should_dump(nir_shader *nir, unsigned agx_dbg_bit)
3420 {
3421    return (agx_compiler_debug & agx_dbg_bit) &&
3422           !(nir->info.internal && !(agx_compiler_debug & AGX_DBG_INTERNAL));
3423 }
3424 
3425 #define AGX_PASS(shader, pass, ...)                                            \
3426    do {                                                                        \
3427       pass(shader, ##__VA_ARGS__);                                             \
3428       agx_validate(ctx, #pass);                                                \
3429    } while (0)
3430 
3431 static unsigned
3432 agx_compile_function_nir(nir_shader *nir, nir_function_impl *impl,
3433                          struct agx_shader_key *key,
3434                          struct util_debug_callback *debug,
3435                          struct util_dynarray *binary,
3436                          struct agx_shader_info *out)
3437 {
3438    nir_index_blocks(impl);
3439    nir_index_ssa_defs(impl);
3440 
3441    agx_context *ctx = rzalloc(NULL, agx_context);
3442    ctx->nir = nir;
3443    ctx->is_preamble = impl->function->is_preamble;
3444    ctx->out = out;
3445    ctx->key = key;
3446    ctx->stage = nir->info.stage;
3447    ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
3448    ctx->indexed_nir_blocks = rzalloc_array(ctx, agx_block *, impl->num_blocks);
3449    list_inithead(&ctx->blocks);
3450 
3451    if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_preamble) {
3452       ctx->any_cf = key->fs.inside_sample_loop;
3453    }
3454 
3455    ctx->alloc = impl->ssa_alloc;
3456    emit_cf_list(ctx, &impl->body);
3457    agx_emit_phis_deferred(ctx);
3458 
3459    /* Index blocks now that we're done emitting so the order is consistent. Do
3460     * this before agx_remove_unreachable_blocks so we match NIR indexing. This
3461     * makes for easier debugging.
3462     */
3463    agx_foreach_block(ctx, block) {
3464       block->index = ctx->num_blocks++;
3465    }
3466 
3467    agx_remove_unreachable_blocks(ctx);
3468 
3469    /* Only allocate scratch if it's statically used, regardless of whether the
3470     * NIR info claims otherwise.
3471     */
3472    if (ctx->any_scratch) {
3473       assert(!ctx->is_preamble && "preambles don't use scratch");
3474       ctx->scratch_size_B = ALIGN(nir->scratch_size, 16);
3475    }
3476 
3477    /* Stop the main shader or preamble shader after the exit block. For real
3478     * functions, we would return here.
3479     */
3480    if (!ctx->key->no_stop || ctx->is_preamble) {
3481       agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
3482       agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block));
3483       agx_stop(&_b);
3484    }
3485 
3486    agx_validate(ctx, "IR translation");
3487 
3488    if (likely(!(agx_compiler_debug & AGX_DBG_NOOPT))) {
3489       /* Eliminate dead instructions before CSE to avoid silly scheduling */
3490       AGX_PASS(ctx, agx_dce, false);
3491 
3492       /* CSE before eliminating dead destinations so that subdivision is
3493        * optimized properly.
3494        */
3495       AGX_PASS(ctx, agx_opt_cse);
3496 
3497       /* After DCE, use counts are right so we can run the optimizer. */
3498       AGX_PASS(ctx, agx_optimizer_backward);
3499       AGX_PASS(ctx, agx_optimizer_forward);
3500       AGX_PASS(ctx, agx_opt_compact_constants);
3501 
3502       /* After inlining constants, promote what's left */
3503       if (key->promote_constants && !key->secondary &&
3504           !(agx_compiler_debug & AGX_DBG_NOPROMOTE)) {
3505          AGX_PASS(ctx, agx_opt_promote_constants);
3506       }
3507    }
3508 
3509    /* Lower uniform sources after copyprop for correctness, as copyprop
3510     * creates uniform sources. To keep register pressure in
3511     * check, lower after CSE, since moves are cheaper than registers.
3512     */
3513    AGX_PASS(ctx, agx_lower_uniform_sources);
3514 
3515    /* RA correctness depends on DCE */
3516    AGX_PASS(ctx, agx_dce, true);
3517 
3518    if (agx_should_dump(nir, AGX_DBG_SHADERS))
3519       agx_print_shader(ctx, stdout);
3520 
3521    if (likely(!(agx_compiler_debug & AGX_DBG_NOSCHED))) {
3522       AGX_PASS(ctx, agx_pressure_schedule);
3523    }
3524 
3525    if (agx_should_dump(nir, AGX_DBG_SHADERS))
3526       agx_print_shader(ctx, stdout);
3527 
3528    AGX_PASS(ctx, agx_ra);
3529    agx_lower_64bit_postra(ctx);
3530 
3531    if (ctx->scratch_size_B > 0) {
3532       /* Apple always allocates 40 more bytes in the entrypoint and aligns to 4. */
3533       uint64_t stack_size = ALIGN(DIV_ROUND_UP(ctx->scratch_size_B, 4) + 10, 4);
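      /* Worked example (illustrative): scratch_size_B = 100 gives
       * DIV_ROUND_UP(100, 4) = 25 words, plus 10 words = 35, aligned up to 36.
       */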
3534 
3535       assert(stack_size < INT16_MAX);
3536 
3537       agx_block *start_block = agx_start_block(ctx);
3538       agx_builder _b = agx_init_builder(ctx, agx_before_block(start_block));
3539       agx_stack_adjust(&_b, stack_size);
3540 
3541       /* If we're going to execute multiple times, make sure we clean up after
3542        * ourselves, else the hardware faults.
3543        */
3544       if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_preamble &&
3545           ctx->key->fs.inside_sample_loop) {
3546 
3547          _b = agx_init_builder(ctx, agx_after_block(agx_end_block(ctx)));
3548          agx_stack_adjust(&_b, -stack_size);
3549       }
3550 
3551       if (ctx->is_preamble)
3552          out->preamble_scratch_size = stack_size;
3553       else
3554          out->scratch_size = stack_size;
3555    }
3556 
3557    if ((ctx->stage == MESA_SHADER_VERTEX ||
3558         ctx->stage == MESA_SHADER_TESS_EVAL) &&
3559        !impl->function->is_preamble && !ctx->key->secondary)
3560       agx_set_st_vary_final(ctx);
3561 
3562    agx_insert_waits(ctx);
3563    agx_opt_empty_else(ctx);
3564    agx_opt_break_if(ctx);
3565    agx_opt_jmp_none(ctx);
3566 
3567    if (ctx->any_quad_divergent_shuffle)
3568       agx_lower_divergent_shuffle(ctx);
3569 
3570    agx_lower_pseudo(ctx);
3571 
3572    if (agx_should_dump(nir, AGX_DBG_SHADERS))
3573       agx_print_shader(ctx, stdout);
3574 
3575    /* Upload constants before the binary instead of after, to reduce the chance
3576     * they get prefetched into the i-cache when we want them only in the d-cache.
3577     * This also helps fill the padding space for small preambles.
3578     */
3579    if (ctx->out->rodata.size_16 && !impl->function->is_preamble) {
3580       ctx->out->rodata.offset = agx_pad_binary(binary, 4);
3581       unsigned size_16 = ctx->out->rodata.size_16;
3582 
3583       uint16_t *ro = util_dynarray_grow(binary, uint16_t, size_16);
3584       memcpy(ro, ctx->rodata, size_16 * 2);
3585    }
3586 
3587    unsigned offset = agx_pad_binary(binary, AGX_CODE_ALIGN);
3588    agx_pack_binary(ctx, binary);
3589 
3590    unsigned nr_gprs = ctx->max_reg + 1;
3591 
3592    /* If the preamble uses scratch (due to spilling), we need to set maximal
3593     * GPRs. Do it here so the driver doesn't have to worry about it.
3594     */
3595    if (impl->function->is_preamble)
3596       out->nr_preamble_gprs = ctx->scratch_size_B ? 256 : nr_gprs;
3597    else
3598       out->nr_gprs = nr_gprs;
3599 
3600    /* Don't dump statistics for preambles, since they're not worth optimizing */
3601    if (!impl->function->is_preamble) {
3602       char *stats;
3603       int ret = agx_dump_stats(ctx, binary->size, &stats);
3604 
3605       if (ret >= 0) {
3606          if (agx_should_dump(nir, AGX_DBG_SHADERDB)) {
3607             fprintf(stderr, "SHADER-DB: %s - %s\n", nir->info.label ?: "",
3608                     stats);
3609          }
3610 
3611          if (debug)
3612             util_debug_message(debug, SHADER_INFO, "%s", stats);
3613 
3614          free(stats);
3615       }
3616    }
3617 
3618    ralloc_free(ctx);
3619 
3620    return offset;
3621 }
3622 
3623 void
3624 agx_link_libagx(nir_shader *nir, const nir_shader *libagx)
3625 {
3626    nir_link_shader_functions(nir, libagx);
3627    NIR_PASS(_, nir, nir_inline_functions);
3628    nir_remove_non_entrypoints(nir);
3629    NIR_PASS(_, nir, nir_opt_deref);
3630    NIR_PASS(_, nir, nir_lower_vars_to_ssa);
3631    NIR_PASS(_, nir, nir_remove_dead_derefs);
3632    NIR_PASS(_, nir, nir_remove_dead_variables,
3633             nir_var_function_temp | nir_var_shader_temp, NULL);
3634    NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
3635             nir_var_shader_temp | nir_var_function_temp,
3636             glsl_get_cl_type_size_align);
3637 }
3638 
3639 /*
3640  * The hardware frcp instruction is sometimes off by 1 ULP. For correctly
3641  * rounded frcp, a refinement step is required. This routine has been
3642  * exhaustively tested with a modified math_bruteforce.
3643  *
3644  * While Khronos APIs allow 2.5 ULP error for divides, nir_lower_idiv relies on
3645  * correctly rounded frcp. This is therefore load bearing for integer division
3646  * on all APIs.
3647  */
3648 static nir_def *
3649 libagx_frcp(nir_builder *b, nir_def *x)
3650 {
3651    nir_def *u = nir_frcp(b, x);
3652 
3653    /* Do 1 Newton-Raphson refinement step.
3654     *
3655     * Define f(u) = xu - 1. Then f(u) = 0 iff u = 1/x. Newton's method gives:
3656     *
3657     * u_2 = u - f(u) / f'(u) = u - (xu - 1) / x
3658     *
3659     * Our original guess is close, so we approximate (1 / x) by u:
3660     *
3661     * u_2 = u - u(xu - 1) = u + u(1 - xu)
3662     *     = fma(fma(-x, u, 1), u, u)
3663     */
3664    nir_def *one = nir_imm_float(b, 1.0);
3665    nir_def *u_2 = nir_ffma(b, nir_ffma(b, nir_fneg(b, x), u, one), u, u);
3666 
3667    /* If the original value was infinite, frcp will generate the correct zero.
3668     * However, the Newton-Raphson step would multiply 0 * Inf and get a NaN. So
3669     * skip the refinement step for infinite inputs. We do this backwards,
3670     * checking whether the refined result is NaN, since we can implement this
3671     * check in a single fcmpsel instruction. The other case where the refinement
3672     * is NaN is a NaN input, in which skipping refinement is acceptable.
3673     */
3674    return nir_bcsel(b, nir_fisnan(b, u_2), u, u_2);
3675 }
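
/* Worked example (illustrative, assuming the hardware comes back 1 ULP low):
 * for x = 3.0f, frcp might return 0x1.555554p-2f rather than the correctly
 * rounded 0x1.555556p-2f; the refinement above computes
 * fma(fma(-3, u, 1), u, u), which nudges the result up to the correctly
 * rounded value.
 */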
3676 
3677 static bool
3678 agx_nir_lower_fdiv(nir_builder *b, nir_alu_instr *alu, void *_)
3679 {
3680    if (alu->op != nir_op_frcp || !alu->exact || alu->def.bit_size != 32)
3681       return false;
3682 
3683    b->cursor = nir_before_instr(&alu->instr);
3684    nir_def_replace(&alu->def, libagx_frcp(b, nir_ssa_for_alu_src(b, alu, 0)));
3685    return true;
3686 }
3687 
3688 /* Preprocess NIR independent of shader state */
3689 void
3690 agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx)
3691 {
3692    NIR_PASS(_, nir, nir_lower_vars_to_ssa);
3693 
3694    /* Lower large arrays to scratch and small arrays to csel */
3695    NIR_PASS(_, nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
3696             glsl_get_natural_size_align_bytes, glsl_get_word_size_align_bytes);
3697    NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
3698    NIR_PASS(_, nir, nir_split_var_copies);
3699    NIR_PASS(_, nir, nir_lower_global_vars_to_local);
3700    NIR_PASS(_, nir, nir_lower_var_copies);
3701 
3702    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
3703       NIR_PASS(_, nir, agx_nir_lower_frag_sidefx);
3704    }
3705 
3706    /* Clean up deref gunk after lowering I/O */
3707    NIR_PASS(_, nir, nir_opt_dce);
3708 
3709    agx_link_libagx(nir, libagx);
3710 
3711    /* This must run before we lower away idiv in order to work at all, but after
3712     * lowering textures, since the cube map array lowering generates division by 6.
3713     */
3714    NIR_PASS(_, nir, nir_opt_idiv_const, 16);
3715 
3716    nir_lower_idiv_options idiv_options = {
3717       .allow_fp16 = true,
3718    };
3719 
3720    NIR_PASS(_, nir, nir_lower_idiv, &idiv_options);
3721    NIR_PASS(_, nir, nir_lower_frexp);
3722    NIR_PASS(_, nir, nir_lower_alu);
3723    NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
3724    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
3725    NIR_PASS(_, nir, nir_lower_flrp, 16 | 32 | 64, false);
3726    NIR_PASS(_, nir, agx_lower_sincos);
3727    NIR_PASS(_, nir, nir_shader_intrinsics_pass, agx_lower_front_face,
3728             nir_metadata_control_flow, NULL);
3729    NIR_PASS(_, nir, agx_nir_lower_subgroups);
3730    NIR_PASS(_, nir, nir_lower_phis_to_scalar, true);
3731    NIR_PASS(_, nir, nir_shader_alu_pass, agx_nir_lower_fdiv,
3732             nir_metadata_control_flow, NULL);
3733 
3734    /* After lowering, run through the standard suite of NIR optimizations. We
3735     * will run through the loop later, once we have the shader key, but if we
3736     * run now, that run will ideally be almost a no-op.
3737     */
3738    agx_optimize_loop_nir(nir);
3739 
3740    NIR_PASS(_, nir, nir_opt_deref);
3741    NIR_PASS(_, nir, nir_lower_vars_to_ssa);
3742 
3743    /* We've lowered away all variables. Remove them all for smaller shaders. */
3744    NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_all, NULL);
3745    nir->info.io_lowered = true;
3746 
3747    /* Move before lowering */
3748    nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
3749                                nir_move_load_input | nir_move_comparisons |
3750                                nir_move_copies | nir_move_load_ssbo |
3751                                nir_move_alu;
3752 
3753    NIR_PASS(_, nir, nir_opt_sink, move_all);
3754    NIR_PASS(_, nir, nir_opt_move, move_all);
3755    NIR_PASS(_, nir, agx_nir_lower_shared_bitsize);
3756 }
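
/* Illustrative compilation flow (a sketch; the driver-side variables are
 * hypothetical):
 *
 *    agx_preprocess_nir(nir, libagx);              // once, state-independent
 *    ...specialize the NIR for the current shader key...
 *    agx_compile_shader_nir(nir, &key, &debug_cb, &part);
 */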
3757 
3758 void
3759 agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
3760                        struct util_debug_callback *debug,
3761                        struct agx_shader_part *out)
3762 {
3763    agx_compiler_debug = agx_get_compiler_debug();
3764    struct agx_shader_info *info = &out->info;
3765 
3766    struct util_dynarray binary;
3767    util_dynarray_init(&binary, NULL);
3768 
3769    memset(out, 0, sizeof *out);
3770 
3771    assert(nir->info.io_lowered &&
3772           "agx_preprocess_nir is called first, then the shader is specalized,"
3773           "then the specialized shader is compiled");
3774 
3775    /* If required, tag writes will be enabled by instruction selection */
3776    if (nir->info.stage == MESA_SHADER_FRAGMENT)
3777       info->tag_write_disable = !nir->info.writes_memory;
3778 
3779    NIR_PASS(_, nir, nir_lower_printf_buffer, LIBAGX_PRINTF_BUFFER_ADDRESS,
3780             LIBAGX_PRINTF_BUFFER_SIZE - 8);
3781 
3782    bool needs_libagx = true /* TODO: Optimize */;
3783 
3784    NIR_PASS(_, nir, nir_lower_frag_coord_to_pixel_coord);
3785    NIR_PASS(_, nir, nir_lower_vars_to_ssa);
3786 
3787    if (needs_libagx) {
3788       agx_link_libagx(nir, key->libagx);
3789 
3790       NIR_PASS(_, nir, nir_opt_deref);
3791       NIR_PASS(_, nir, nir_lower_vars_to_ssa);
3792       NIR_PASS(_, nir, nir_lower_explicit_io,
3793                nir_var_shader_temp | nir_var_function_temp |
3794                   nir_var_mem_shared | nir_var_mem_global,
3795                nir_address_format_62bit_generic);
3796    }
3797 
3798    /* Late sysval lowering creates large loads. Load lowering creates unpacks */
3799    nir_lower_mem_access_bit_sizes_options lower_mem_access_options = {
3800       .modes = nir_var_mem_ssbo | nir_var_mem_constant |
3801                nir_var_mem_task_payload | nir_var_shader_temp |
3802                nir_var_function_temp | nir_var_mem_global | nir_var_mem_shared,
3803       .callback = mem_access_size_align_cb,
3804    };
3805    NIR_PASS(_, nir, nir_lower_mem_access_bit_sizes, &lower_mem_access_options);
3806 
3807    /* Optimize scratch access */
3808    NIR_PASS(_, nir, nir_lower_scratch_to_var);
3809    NIR_PASS(_, nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
3810             glsl_get_natural_size_align_bytes,
3811             glsl_get_natural_size_align_bytes);
3812    NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
3813 
3814    /* Clean up 8-bit math before lowering */
3815    bool progress;
3816    do {
3817       progress = false;
3818 
3819       NIR_PASS(progress, nir, nir_opt_algebraic);
3820       NIR_PASS(progress, nir, nir_opt_constant_folding);
3821       NIR_PASS(progress, nir, nir_opt_dce);
3822    } while (progress);
3823 
3824    NIR_PASS(_, nir, nir_lower_bit_size, lower_bit_size_callback, NULL);
3825 
3826    /* Late blend lowering creates vectors */
3827    NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
3828    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
3829 
3830    /* Late VBO lowering creates constant udiv instructions */
3831    NIR_PASS(_, nir, nir_opt_idiv_const, 16);
3832 
3833    NIR_PASS(_, nir, nir_opt_constant_folding);
3834    NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_load_from_texture_handle,
3835             nir_metadata_control_flow, NULL);
3836 
3837    info->push_count = key->reserved_preamble;
3838    agx_optimize_nir(nir, key->dev.soft_fault,
3839                     key->secondary ? NULL : &info->push_count);
3840 
3841    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
3842       info->varyings.fs.nr_cf = key->fs.cf_base;
3843       assign_coefficient_regs(nir, &info->varyings.fs);
3844    }
3845 
3846    if (agx_should_dump(nir, AGX_DBG_SHADERS))
3847       nir_print_shader(nir, stdout);
3848 
3849    info->local_size = nir->info.shared_size;
3850 
3851    nir_foreach_function_with_impl(func, impl, nir) {
3852       unsigned offset =
3853          agx_compile_function_nir(nir, impl, key, debug, &binary, &out->info);
3854 
3855       if (func->is_preamble) {
3856          info->preamble_offset = offset;
3857          info->has_preamble = true;
3858       } else if (func->is_entrypoint) {
3859          info->main_offset = offset;
3860          info->main_size = binary.size - offset;
3861       } else {
3862          unreachable("General functions not yet supported");
3863       }
3864    }
3865 
3866    info->stage = nir->info.stage;
3867 
3868    /* Check these outside the stage check since nir->info.stage is the hardware
3869     * stage and these are read in the vertex *software* stage.
3870     */
3871    info->uses_draw_id =
3872       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
3873 
3874    info->uses_base_param =
3875       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX) ||
3876       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_VERTEX) ||
3877       BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE);
3878 
3879    if (nir->info.stage == MESA_SHADER_VERTEX ||
3880        nir->info.stage == MESA_SHADER_TESS_EVAL) {
3881       info->nonzero_viewport = nir->info.outputs_written & VARYING_BIT_VIEWPORT;
3882 
3883       info->writes_layer_viewport =
3884          nir->info.outputs_written & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
3885 
3886    } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
3887       info->disable_tri_merging = nir->info.uses_wide_subgroup_intrinsics ||
3888                                   nir->info.fs.needs_quad_helper_invocations ||
3889                                   nir->info.writes_memory;
3890 
3891       /* Writing the sample mask requires tag writes */
3892       info->tag_write_disable &= !info->writes_sample_mask;
3893 
3894       /* Report a canonical depth layout. This happens at the end because the
3895        * sample mask lowering affects it.
3896        */
3897       enum gl_frag_depth_layout layout = nir->info.fs.depth_layout;
3898 
3899       if (!(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)))
3900          info->depth_layout = FRAG_DEPTH_LAYOUT_UNCHANGED;
3901       else if (layout == FRAG_DEPTH_LAYOUT_NONE)
3902          info->depth_layout = FRAG_DEPTH_LAYOUT_ANY;
3903       else
3904          info->depth_layout = layout;
3905 
3906       info->reads_tib = nir->info.fs.uses_fbfetch_output;
3907       info->early_fragment_tests = nir->info.fs.early_fragment_tests;
3908    } else if (nir->info.stage == MESA_SHADER_COMPUTE) {
3909       info->imageblock_stride = nir->info.cs.image_block_size_per_thread_agx;
3910 
3911       for (unsigned i = 0; i < 3; ++i) {
3912          info->workgroup_size[i] = nir->info.workgroup_size[i];
3913       }
3914    }
3915 
3916    out->binary = binary.data;
3917    info->binary_size = binary.size;
3918 }
3919