1 /*
2  * Copyright 2021 Alyssa Rosenzweig
3  * Copyright 2020 Collabora Ltd.
4  * Copyright 2016 Broadcom
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #include "agx_compile.h"
9 #include "compiler/nir/nir_builder.h"
10 #include "util/glheader.h"
11 #include "util/macros.h"
12 #include "util/u_debug.h"
13 #include "agx_builder.h"
14 #include "agx_compiler.h"
15 #include "agx_debug.h"
16 #include "agx_internal_formats.h"
17 #include "agx_nir.h"
18 #include "glsl_types.h"
19 #include "nir.h"
20 #include "nir_intrinsics.h"
21 #include "nir_intrinsics_indices.h"
22 #include "shader_enums.h"
23 
24 /* Alignment for shader programs. I'm not sure what the optimal value is. */
25 #define AGX_CODE_ALIGN 0x100
26 
27 /* clang-format off */
28 static const struct debug_named_value agx_debug_options[] = {
29    {"shaders",   AGX_DBG_SHADERS,	"Dump shaders in NIR and AIR"},
30    {"shaderdb",  AGX_DBG_SHADERDB,	"Print statistics"},
31    {"verbose",   AGX_DBG_VERBOSE,	"Disassemble verbosely"},
32    {"internal",  AGX_DBG_INTERNAL,	"Dump even internal shaders"},
33    {"novalidate",AGX_DBG_NOVALIDATE,"Skip IR validation in debug builds"},
34    {"noopt",     AGX_DBG_NOOPT,     "Disable backend optimizations"},
35    {"wait",      AGX_DBG_WAIT,      "Wait after all async instructions"},
36    {"nopreamble",AGX_DBG_NOPREAMBLE,"Do not use shader preambles"},
37    {"demand",    AGX_DBG_DEMAND,    "Bound tightly to register demand"},
38    {"nosched",   AGX_DBG_NOSCHED,   "Do not schedule the shader"},
39    {"spill",     AGX_DBG_SPILL,     "Spill (almost) everything"},
40    DEBUG_NAMED_VALUE_END
41 };
42 /* clang-format on */
43 
44 DEBUG_GET_ONCE_FLAGS_OPTION(agx_compiler_debug, "AGX_MESA_DEBUG",
45                             agx_debug_options, 0)
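/* Usage sketch: AGX_MESA_DEBUG takes a comma-separated list of the flags
 * above, e.g. AGX_MESA_DEBUG=shaders,shaderdb to dump NIR/AIR and print
 * statistics.
 */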
46 
47 int agx_compiler_debug = 0;
48 
49 uint64_t
50 agx_get_compiler_debug(void)
51 {
52    return debug_get_option_agx_compiler_debug();
53 }
54 
55 static agx_index
56 agx_cached_preload(agx_context *ctx, agx_index *cache, unsigned base,
57                    enum agx_size size)
58 {
59    if (agx_is_null(*cache)) {
60       agx_block *block = agx_start_block(ctx);
61       agx_builder b = agx_init_builder(ctx, agx_before_block(block));
62       *cache = agx_preload(&b, agx_register(base, size));
63    }
64 
65    return *cache;
66 }
67 
68 static agx_index
69 agx_vertex_id(agx_builder *b)
70 {
71    return agx_cached_preload(b->shader, &b->shader->vertex_id, 10, AGX_SIZE_32);
72 }
73 
74 static agx_index
75 agx_instance_id(agx_builder *b)
76 {
77    return agx_cached_preload(b->shader, &b->shader->instance_id, 12,
78                              AGX_SIZE_32);
79 }
80 
81 static agx_index
82 agx_get_cf(agx_context *ctx, bool smooth, bool perspective,
83            gl_varying_slot slot, unsigned offset, unsigned count)
84 {
85    struct agx_varyings_fs *varyings = &ctx->out->varyings.fs;
86    unsigned cf_base = varyings->nr_cf;
87 
88    if (slot == VARYING_SLOT_POS) {
89       assert(offset == 2 || offset == 3);
90       varyings->reads_z |= (offset == 2);
91    }
92 
93    /* Forcibly vectorize pointcoord reads, since there's no (known) way to index
94     * Y alone.
95     */
96    bool is_pntc = (slot == VARYING_SLOT_PNTC);
97    bool is_tex = slot >= VARYING_SLOT_TEX0 && slot <= VARYING_SLOT_TEX7;
98    unsigned cf_offset = 0;
99 
100    if (is_pntc || is_tex) {
101       cf_offset = offset;
102       offset = 0;
103       count = is_tex ? 4 : MAX2(2, count + offset);
104    }
105 
106    /* First, search for an appropriate binding. This is O(n) to the number of
107     * bindings, which isn't great, but n should be small in practice.
108     */
109    for (unsigned b = 0; b < varyings->nr_bindings; ++b) {
110       if ((varyings->bindings[b].slot == slot) &&
111           (varyings->bindings[b].offset == offset) &&
112           (varyings->bindings[b].count == count) &&
113           (varyings->bindings[b].smooth == smooth) &&
114           (varyings->bindings[b].perspective == perspective)) {
115 
116          return agx_immediate(varyings->bindings[b].cf_base + cf_offset);
117       }
118    }
119 
120    /* If we didn't find one, make one */
121    unsigned b = varyings->nr_bindings++;
122    varyings->bindings[b].cf_base = varyings->nr_cf;
123    varyings->bindings[b].slot = slot;
124    varyings->bindings[b].offset = offset;
125    varyings->bindings[b].count = count;
126    varyings->bindings[b].smooth = smooth;
127    varyings->bindings[b].perspective = perspective;
128    varyings->nr_cf += count;
129 
130    return agx_immediate(cf_base + cf_offset);
131 }
132 
133 /* Builds a 64-bit hash table key for an index */
134 static uint64_t
135 agx_index_to_key(agx_index idx)
136 {
137    STATIC_ASSERT(sizeof(idx) <= sizeof(uint64_t));
138 
139    uint64_t key = 0;
140    memcpy(&key, &idx, sizeof(idx));
141    return key;
142 }
143 
144 /*
145  * Extract a single channel out of a vector source. We split vectors with
146  * p_split so we can use the split components directly, without emitting a
147  * machine instruction. This has advantages for RA, as the split can usually be
148  * optimized away.
149  */
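/* For example, extracting .y of a vector that was previously collected or
 * split just returns the cached scalar for channel 1; no instruction is
 * emitted at this point.
 */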
150 static agx_index
151 agx_emit_extract(agx_builder *b, agx_index vec, unsigned channel)
152 {
153    agx_index *components = _mesa_hash_table_u64_search(b->shader->allocated_vec,
154                                                        agx_index_to_key(vec));
155 
156    assert(components != NULL && "missing agx_emit_collect_to");
157 
158    return components[channel];
159 }
160 
161 static agx_index
162 agx_extract_nir_src(agx_builder *b, nir_src src, unsigned channel)
163 {
164    agx_index idx = agx_src_index(&src);
165 
166    /* We only deal with scalars, extract a single scalar if needed */
167    if (nir_src_num_components(src) > 1)
168       return agx_emit_extract(b, idx, channel);
169    else
170       return idx;
171 }
172 
173 static void
174 agx_cache_collect(agx_builder *b, agx_index dst, unsigned nr_srcs,
175                   agx_index *srcs)
176 {
177    /* Lifetime of a hash table entry has to be at least as long as the table */
178    agx_index *channels = ralloc_array(b->shader, agx_index, nr_srcs);
179 
180    for (unsigned i = 0; i < nr_srcs; ++i)
181       channels[i] = srcs[i];
182 
183    _mesa_hash_table_u64_insert(b->shader->allocated_vec, agx_index_to_key(dst),
184                                channels);
185 }
186 
187 /*
188  * Combine multiple scalars into a vector destination. This corresponds to
189  * collect, lowered to moves (a shuffle in general) after register allocation.
190  *
191  * To optimize vector extractions, we record the individual channels
192  */
193 static agx_instr *
194 agx_emit_collect_to(agx_builder *b, agx_index dst, unsigned nr_srcs,
195                     agx_index *srcs)
196 {
197    agx_cache_collect(b, dst, nr_srcs, srcs);
198 
199    if (nr_srcs == 1)
200       return agx_mov_to(b, dst, srcs[0]);
201 
202    agx_instr *I = agx_collect_to(b, dst, nr_srcs);
203 
204    agx_foreach_src(I, s)
205       I->src[s] = srcs[s];
206 
207    return I;
208 }
209 
210 static agx_index
211 agx_emit_collect(agx_builder *b, unsigned nr_srcs, agx_index *srcs)
212 {
213    agx_index dst = agx_vec_temp(b->shader, srcs[0].size, nr_srcs);
214    agx_emit_collect_to(b, dst, nr_srcs, srcs);
215    return dst;
216 }
217 
218 static agx_index
219 agx_vec2(agx_builder *b, agx_index s0, agx_index s1)
220 {
221    return agx_emit_collect(b, 2, (agx_index[]){s0, s1});
222 }
223 
224 static agx_index
225 agx_recollect_vector(agx_builder *b, nir_src vec)
226 {
227    agx_index comps[4];
228    unsigned nr = nir_src_num_components(vec);
229 
230    for (unsigned i = 0; i < nr; ++i)
231       comps[i] = agx_extract_nir_src(b, vec, i);
232 
233    return agx_emit_collect(b, nr, comps);
234 }
235 
236 /*
237  * Extract the lower or upper N-bits from a (2*N)-bit quantity. We use a split
238  * without null destinations to let us CSE (and coalesce) the splits when both x
239  * and y are split.
240  */
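/* For instance, subdividing the 32-bit value 0x12345678 into 16-bit halves
 * yields 0x5678 for comp = 0 and 0x1234 for comp = 1.
 */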
241 static agx_instr *
242 agx_subdivide_to(agx_builder *b, agx_index dst, agx_index s0, unsigned comp)
243 {
244    assert((s0.size == (dst.size + 1)) && "only 2x subdivide handled");
245    assert((comp == 0 || comp == 1) && "too many components");
246 
247    /* Handle immediates specially so we don't have to constant fold splits. */
248    if (s0.type == AGX_INDEX_IMMEDIATE) {
249       unsigned bits = 16 * agx_size_align_16(dst.size);
250       return agx_mov_imm_to(b, dst, (s0.value >> (bits * comp)) & BITFIELD64_MASK(bits));
251    }
252 
253    agx_instr *split = agx_split(b, 2, s0);
254    split->dest[comp] = dst;
255    split->dest[1 - comp] = agx_temp(b->shader, dst.size);
256    return split;
257 }
258 
259 static void
260 agx_block_add_successor(agx_block *block, agx_block *successor)
261 {
262    assert(block != NULL && successor != NULL);
263 
264    /* Cull impossible edges */
265    if (block->unconditional_jumps)
266       return;
267 
268    for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
269       if (block->successors[i]) {
270          if (block->successors[i] == successor)
271             return;
272          else
273             continue;
274       }
275 
276       block->successors[i] = successor;
277       util_dynarray_append(&successor->predecessors, agx_block *, block);
278       return;
279    }
280 
281    unreachable("Too many successors");
282 }
283 
284 /*
285  * Splits an n-component vector (vec) into n scalar destinations (dests) using a
286  * split pseudo-instruction.
287  *
288  * Pre-condition: dests is filled with agx_null().
289  */
290 static void
291 agx_emit_split(agx_builder *b, agx_index *dests, agx_index vec, unsigned n)
292 {
293    agx_instr *I = agx_split(b, n, vec);
294 
295    agx_foreach_dest(I, d) {
296       dests[d] = agx_temp(b->shader, vec.size);
297       I->dest[d] = dests[d];
298    }
299 }
300 
301 static void
302 agx_emit_cached_split(agx_builder *b, agx_index vec, unsigned n)
303 {
304    agx_index dests[4] = {agx_null(), agx_null(), agx_null(), agx_null()};
305    agx_emit_split(b, dests, vec, n);
306    agx_cache_collect(b, vec, n, dests);
307 }
308 
309 static void
310 agx_emit_load_const(agx_builder *b, nir_load_const_instr *instr)
311 {
312    /* Ensure we've been scalarized and bit size lowered */
313    unsigned bit_size = instr->def.bit_size;
314    assert(instr->def.num_components == 1);
315 
316    /* Emit move, later passes can inline/push if useful */
317    agx_mov_imm_to(b, agx_def_index(&instr->def),
318                   nir_const_value_as_uint(instr->value[0], bit_size));
319 }
320 
321 /*
322  * Implement mul_high of 32-bit sources by doing a 32x32->64-bit multiply and
323  * extracting only the high word.
324  */
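/* Worked example with 16-bit operands: 0xffff * 0xffff = 0xfffe0001 as a
 * 32-bit product, so mul_high returns the top half, 0xfffe. The 32-bit case
 * is analogous with a 64-bit intermediate product.
 */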
325 static agx_instr *
326 agx_mul_high_to(agx_builder *b, agx_index dst, agx_index P, agx_index Q,
327                 bool is_signed)
328 {
329    assert(P.size == Q.size && "source sizes must match");
330    assert(P.size == dst.size && "dest size must match");
331    assert(P.size != AGX_SIZE_64 && "64x64 multiply should have been lowered");
332 
333    static_assert(AGX_SIZE_64 == (AGX_SIZE_32 + 1), "enum wrong");
334    static_assert(AGX_SIZE_32 == (AGX_SIZE_16 + 1), "enum wrong");
335 
336    if (!is_signed) {
337       P = agx_abs(P);
338       Q = agx_abs(Q);
339    }
340 
341    agx_index product = agx_temp(b->shader, P.size + 1);
342    agx_imad_to(b, product, P, Q, agx_zero(), 0);
343 
344    return agx_subdivide_to(b, dst, product, 1);
345 }
346 
347 static enum agx_format
348 agx_format_for_pipe(enum pipe_format format)
349 {
350 #define CASE(x)                                                                \
351    if (format == (enum pipe_format)AGX_INTERNAL_FORMAT_##x)                    \
352       return AGX_FORMAT_##x;
353 
354    CASE(I8);
355    CASE(I16);
356    CASE(I32);
357    CASE(F16);
358    CASE(U8NORM);
359    CASE(S8NORM);
360    CASE(U16NORM);
361    CASE(S16NORM);
362    CASE(RGB10A2);
363    CASE(SRGBA8);
364    CASE(RG11B10F);
365    CASE(RGB9E5);
366 
367 #undef CASE
368    unreachable("Invalid format");
369 }
370 
371 static void
372 agx_emit_load_coefficients(agx_builder *b, agx_index dest,
373                            nir_intrinsic_instr *instr)
374 {
375    enum glsl_interp_mode mode = nir_intrinsic_interp_mode(instr);
376    bool smooth = (mode != INTERP_MODE_FLAT);
377    bool perspective = smooth && (mode != INTERP_MODE_NOPERSPECTIVE);
378 
379    agx_index cf = agx_get_cf(b->shader, smooth, perspective,
380                              nir_intrinsic_io_semantics(instr).location,
381                              nir_intrinsic_component(instr), 1);
382 
383    agx_ldcf_to(b, dest, cf, 1);
384    agx_emit_cached_split(b, dest, 3);
385 }
386 
387 static enum agx_interpolation
388 agx_interp_for_bary(nir_intrinsic_instr *bary, agx_index *sample_index)
389 {
390    switch (bary->intrinsic) {
391    case nir_intrinsic_load_barycentric_pixel:
392       return AGX_INTERPOLATION_CENTER;
393 
394    case nir_intrinsic_load_barycentric_centroid:
395       return AGX_INTERPOLATION_CENTROID;
396 
397    case nir_intrinsic_load_barycentric_at_sample:
398       *sample_index = agx_src_index(&bary->src[0]);
399       return AGX_INTERPOLATION_SAMPLE;
400 
401    default:
402       unreachable("should have been lowered");
403    }
404 }
405 
406 static void
407 agx_emit_load_vary(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
408 {
409    ASSERTED unsigned components = instr->num_components;
410    nir_intrinsic_instr *bary = nir_src_as_intrinsic(instr->src[0]);
411 
412    assert(components >= 1 && components <= 4);
413 
414    agx_index sample_index = agx_zero();
415    enum agx_interpolation interp = agx_interp_for_bary(bary, &sample_index);
416 
417    bool perspective =
418       nir_intrinsic_interp_mode(bary) != INTERP_MODE_NOPERSPECTIVE;
419 
420    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
421    nir_src *offset = nir_get_io_offset_src(instr);
422    assert(nir_src_is_const(*offset) && "no indirects");
423 
424    assert(nir_def_components_read(&instr->def) ==
425              nir_component_mask(components) &&
426           "iter does not handle write-after-write hazards");
427 
428    agx_index I = agx_get_cf(b->shader, true, perspective,
429                             sem.location + nir_src_as_uint(*offset),
430                             nir_intrinsic_component(instr), components);
431 
432    /* For perspective interpolation, we project (multiply by 1/W) */
433    if (perspective) {
434       agx_index J = agx_get_cf(b->shader, true, false, VARYING_SLOT_POS, 3, 1);
435       agx_iterproj_to(b, dest, I, J, sample_index, components, interp);
436    } else {
437       agx_iter_to(b, dest, I, sample_index, components, interp);
438    }
439 
440    agx_emit_cached_split(b, dest, components);
441 }
442 
443 static agx_instr *
444 agx_emit_store_vary(agx_builder *b, nir_intrinsic_instr *instr)
445 {
446    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
447    nir_src *offset = nir_get_io_offset_src(instr);
448    assert(nir_src_is_const(*offset) && "todo: indirects");
449 
450    unsigned imm_index = b->shader->out->varyings.vs.slots[sem.location];
451 
452    if (sem.location == VARYING_SLOT_LAYER ||
453        sem.location == VARYING_SLOT_CLIP_DIST0) {
454       /* Separate slots used for the sysval vs the varying. The default slot
455        * above is for the varying. Change for the sysval.
456        */
457       assert(sem.no_sysval_output || sem.no_varying);
458 
459       if (sem.no_varying) {
460          imm_index = sem.location == VARYING_SLOT_LAYER
461                         ? b->shader->out->varyings.vs.layer_viewport_slot
462                         : b->shader->out->varyings.vs.clip_dist_slot;
463       }
464    }
465 
466    assert(imm_index < ~0);
467    imm_index += (nir_src_as_uint(*offset) * 4) + nir_intrinsic_component(instr);
468 
469    /* nir_lower_io_to_scalar */
470    assert(nir_intrinsic_write_mask(instr) == 0x1);
471 
472    return agx_st_vary(b, agx_immediate(imm_index),
473                       agx_src_index(&instr->src[0]));
474 }
475 
476 static agx_instr *
477 agx_emit_local_store_pixel(agx_builder *b, nir_intrinsic_instr *instr)
478 {
479    /* TODO: Reverse-engineer interactions with MRT */
480    if (b->shader->key->fs.ignore_tib_dependencies) {
481       assert(b->shader->nir->info.internal && "only for clear shaders");
482    } else if (b->shader->did_writeout) {
483       agx_wait_pix(b, 0x0004);
484    } else {
485       agx_wait_pix(b, 0x000C);
486    }
487 
488    /* Compact the registers according to the mask */
489    agx_index compacted[4] = {agx_null()};
490 
491    unsigned compact_count = 0;
492    u_foreach_bit(i, nir_intrinsic_write_mask(instr)) {
493       compacted[compact_count++] = agx_extract_nir_src(b, instr->src[0], i);
494    }
495 
496    agx_index collected = agx_emit_collect(b, compact_count, compacted);
497 
498    b->shader->did_writeout = true;
499    b->shader->out->tag_write_disable = false;
500    return agx_st_tile(b, collected, agx_src_index(&instr->src[1]),
501                       agx_format_for_pipe(nir_intrinsic_format(instr)),
502                       nir_intrinsic_write_mask(instr),
503                       nir_intrinsic_base(instr));
504 }
505 
506 static agx_instr *
507 agx_emit_store_zs(agx_builder *b, nir_intrinsic_instr *instr)
508 {
509    unsigned base = nir_intrinsic_base(instr);
510    bool write_z = base & 1;
511    bool write_s = base & 2;
512 
513    /* TODO: Handle better */
514    assert(!b->shader->key->fs.ignore_tib_dependencies && "not used");
515    agx_wait_pix(b, 0x0001);
516 
517    agx_index z = agx_src_index(&instr->src[1]);
518    agx_index s = agx_src_index(&instr->src[2]);
519 
520    assert(!write_z || z.size == AGX_SIZE_32);
521    assert(!write_s || s.size == AGX_SIZE_16);
522 
523    if (write_z && write_s) {
524       agx_index u2u32 = agx_temp(b->shader, AGX_SIZE_32);
525       agx_mov_to(b, u2u32, s);
526       s = u2u32;
527    }
528 
529    agx_index zs = (write_z && write_s) ? agx_vec2(b, z, s) : write_z ? z : s;
530 
531    /* Not necessarily a sample mask but overlapping hw mechanism... Should
532     * maybe rename this flag to something more general.
533     */
534    b->shader->out->writes_sample_mask = true;
535 
536    return agx_zs_emit(b, agx_src_index(&instr->src[0]), zs, base);
537 }
538 
539 static void
540 agx_emit_local_load_pixel(agx_builder *b, agx_index dest,
541                           nir_intrinsic_instr *instr)
542 {
543    /* TODO: Reverse-engineer interactions with MRT */
544    assert(!b->shader->key->fs.ignore_tib_dependencies && "invalid usage");
545    agx_wait_pix(b, 0x0008);
546    b->shader->did_writeout = true;
547    b->shader->out->reads_tib = true;
548 
549    unsigned nr_comps = instr->def.num_components;
550    agx_ld_tile_to(b, dest, agx_src_index(&instr->src[0]),
551                   agx_format_for_pipe(nir_intrinsic_format(instr)),
552                   BITFIELD_MASK(nr_comps), nir_intrinsic_base(instr));
553    agx_emit_cached_split(b, dest, nr_comps);
554 }
555 
556 static void
557 agx_emit_load(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
558 {
559    agx_index addr = agx_src_index(&instr->src[0]);
560    agx_index offset = agx_src_index(&instr->src[1]);
561    enum agx_format fmt = agx_format_for_pipe(nir_intrinsic_format(instr));
562    unsigned shift = nir_intrinsic_base(instr);
563 
564    /* Zero-extend offset if we're not sign-extending */
565    if (!nir_intrinsic_sign_extend(instr))
566       offset = agx_abs(offset);
567 
568    agx_device_load_to(b, dest, addr, offset, fmt,
569                       BITFIELD_MASK(instr->def.num_components), shift);
570    agx_emit_cached_split(b, dest, instr->def.num_components);
571 }
572 
573 static void
574 agx_emit_store(agx_builder *b, nir_intrinsic_instr *instr)
575 {
576    agx_index addr = agx_src_index(&instr->src[1]);
577    agx_index offset = agx_src_index(&instr->src[2]);
578    enum agx_format fmt = agx_format_for_pipe(nir_intrinsic_format(instr));
579    unsigned shift = nir_intrinsic_base(instr);
580 
581    /* Zero-extend offset if we're not sign-extending */
582    if (!nir_intrinsic_sign_extend(instr))
583       offset = agx_abs(offset);
584 
585    agx_device_store(b, agx_recollect_vector(b, instr->src[0]), addr, offset,
586                     fmt, BITFIELD_MASK(nir_src_num_components(instr->src[0])),
587                     shift);
588 }
589 
590 /* Preambles write directly to uniform registers, so move from uniform to GPR */
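/* As an illustration (numbering hypothetical): a 2-component 32-bit
 * load_preamble with base 4 collects uniforms 4..5 and 6..7 into one GPR
 * vector, since each 32-bit channel spans two 16-bit uniform slots.
 */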
591 static agx_instr *
592 agx_emit_load_preamble(agx_builder *b, agx_index dst,
593                        nir_intrinsic_instr *instr)
594 {
595    agx_index srcs[4] = {agx_null()};
596    unsigned dim = instr->def.num_components;
597    assert(dim <= ARRAY_SIZE(srcs) && "shouldn't see larger vectors");
598 
599    unsigned base = nir_intrinsic_base(instr);
600    unsigned stride = agx_size_align_16(dst.size);
601 
602    for (unsigned i = 0; i < dim; ++i)
603       srcs[i] = agx_uniform(base + i * stride, dst.size);
604 
605    return agx_emit_collect_to(b, dst, dim, srcs);
606 }
607 
608 static agx_instr *
609 agx_emit_store_preamble(agx_builder *b, nir_intrinsic_instr *instr)
610 {
611    agx_index vec = agx_src_index(&instr->src[0]);
612    unsigned base = nir_intrinsic_base(instr);
613    unsigned stride = agx_size_align_16(vec.size);
614 
615    for (unsigned i = 0; i < nir_src_num_components(instr->src[0]); ++i) {
616       agx_uniform_store(b, agx_extract_nir_src(b, instr->src[0], i),
617                         agx_immediate(base + i * stride));
618    }
619 
620    return NULL;
621 }
622 
623 static enum agx_dim
624 agx_tex_dim(enum glsl_sampler_dim dim, bool array)
625 {
626    switch (dim) {
627    case GLSL_SAMPLER_DIM_1D:
628       return array ? AGX_DIM_1D_ARRAY : AGX_DIM_1D;
629 
630    case GLSL_SAMPLER_DIM_2D:
631    case GLSL_SAMPLER_DIM_RECT:
632    case GLSL_SAMPLER_DIM_EXTERNAL:
633       return array ? AGX_DIM_2D_ARRAY : AGX_DIM_2D;
634 
635    case GLSL_SAMPLER_DIM_MS:
636       return array ? AGX_DIM_2D_MS_ARRAY : AGX_DIM_2D_MS;
637 
638    case GLSL_SAMPLER_DIM_3D:
639       assert(!array && "3D arrays unsupported");
640       return AGX_DIM_3D;
641 
642    case GLSL_SAMPLER_DIM_CUBE:
643       return array ? AGX_DIM_CUBE_ARRAY : AGX_DIM_CUBE;
644 
645    case GLSL_SAMPLER_DIM_BUF:
646       unreachable("Buffer textures should have been lowered");
647 
648    default:
649       unreachable("Invalid sampler dim\n");
650    }
651 }
652 
653 static agx_instr *
654 agx_emit_block_image_store(agx_builder *b, nir_intrinsic_instr *instr)
655 {
656    unsigned image = nir_src_as_uint(instr->src[0]);
657    agx_index offset = agx_src_index(&instr->src[1]);
658    agx_index layer = agx_src_index(&instr->src[2]);
659    enum agx_format format = agx_format_for_pipe(nir_intrinsic_format(instr));
660 
661    bool ms = nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_MS;
662    bool array = nir_intrinsic_image_array(instr);
663    enum agx_dim dim = agx_tex_dim(nir_intrinsic_image_dim(instr), array);
664 
665    /* 32-bit source physically, 16-bit in NIR, top half ignored but needed
666     * logically to ensure alignment.
667     */
668    offset = agx_vec2(b, offset, agx_undef(AGX_SIZE_16));
669    offset.channels_m1--;
670    offset.size = AGX_SIZE_32;
671 
672    /* Modified coordinate descriptor */
673    agx_index coords;
674    if (array) {
675       coords = agx_temp(b->shader, AGX_SIZE_32);
676       agx_emit_collect_to(b, coords, 2,
677                           (agx_index[]){
678                              ms ? agx_mov_imm(b, 16, 0) : layer,
679                              ms ? layer : agx_undef(AGX_SIZE_16),
680                           });
681    } else {
682       coords = agx_null();
683    }
684 
685    // XXX: how does this possibly work
686    if (format == AGX_FORMAT_F16)
687       format = AGX_FORMAT_I16;
688 
689    return agx_block_image_store(b, agx_immediate(image), offset, coords, format,
690                                 dim);
691 }
692 
693 static agx_instr *
694 agx_load_compute_dimension(agx_builder *b, agx_index dst,
695                            nir_intrinsic_instr *instr, enum agx_sr base)
696 {
697    unsigned dim = instr->def.num_components;
698    unsigned size = instr->def.bit_size;
699    assert(size == 16 || size == 32);
700 
701    agx_index srcs[] = {
702       agx_get_sr(b, size, base + 0),
703       agx_get_sr(b, size, base + 1),
704       agx_get_sr(b, size, base + 2),
705    };
706 
707    return agx_emit_collect_to(b, dst, dim, srcs);
708 }
709 
710 static enum agx_atomic_opc
711 translate_atomic_opcode(nir_atomic_op op)
712 {
713    /* clang-format off */
714    switch (op) {
715    case nir_atomic_op_iadd:    return AGX_ATOMIC_OPC_ADD;
716    case nir_atomic_op_imin:    return AGX_ATOMIC_OPC_IMIN;
717    case nir_atomic_op_umin:    return AGX_ATOMIC_OPC_UMIN;
718    case nir_atomic_op_imax:    return AGX_ATOMIC_OPC_IMAX;
719    case nir_atomic_op_umax:    return AGX_ATOMIC_OPC_UMAX;
720    case nir_atomic_op_iand:    return AGX_ATOMIC_OPC_AND;
721    case nir_atomic_op_ior:     return AGX_ATOMIC_OPC_OR;
722    case nir_atomic_op_ixor:    return AGX_ATOMIC_OPC_XOR;
723    case nir_atomic_op_xchg:    return AGX_ATOMIC_OPC_XCHG;
724    case nir_atomic_op_cmpxchg: return AGX_ATOMIC_OPC_CMPXCHG;
725    default: unreachable("unknown atomic opcode");
726    }
727    /* clang-format on */
728 }
729 
730 /*
731  * The "base" of a local load/store/atomic can be zero but no other immediates.
732  * This would be a little silly to handle when inlining immediates, so we
733  * instead exclude these ops from immediate inlining and just handle 0 specially
734  * when translating.
735  */
736 static agx_index
737 agx_local_base(nir_src src)
738 {
739    if (nir_src_is_const(src) && nir_src_as_uint(src) == 0)
740       return agx_zero();
741    else
742       return agx_src_index(&src);
743 }
744 
745 static void
746 agx_emit_atomic(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr,
747                 bool local)
748 {
749    enum agx_atomic_opc op =
750       translate_atomic_opcode(nir_intrinsic_atomic_op(instr));
751    agx_index base =
752       local ? agx_local_base(instr->src[0]) : agx_src_index(&instr->src[0]);
753    agx_index value = agx_src_index(&instr->src[local ? 1 : 2]);
754    agx_index index = local ? agx_zero() : agx_src_index(&instr->src[1]);
755 
756    /* cmpxchg (only) takes 2 sources, passed in consecutive registers */
757    if (op == AGX_ATOMIC_OPC_CMPXCHG) {
758       agx_index value2 = agx_src_index(&instr->src[local ? 2 : 3]);
759       value = agx_vec2(b, value2, value);
760    }
761 
762    if (local) {
763       assert(base.size == AGX_SIZE_16);
764       agx_local_atomic_to(b, dst, value, base, index, op);
765    } else {
766       assert(base.size == AGX_SIZE_64);
767       agx_atomic_to(b, dst, value, base, index, op);
768    }
769 }
770 
771 static enum agx_format
772 format_for_bitsize(unsigned bitsize)
773 {
774    switch (bitsize) {
775    case 8:
776       return AGX_FORMAT_I8;
777    case 16:
778       return AGX_FORMAT_I16;
779    case 32:
780       return AGX_FORMAT_I32;
781    default:
782       unreachable("should've been lowered");
783    }
784 }
785 
786 static void
787 agx_emit_local_load(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr)
788 {
789    agx_index base = agx_local_base(instr->src[0]);
790    agx_index index = agx_zero(); /* TODO: optimize address arithmetic */
791    assert(base.size == AGX_SIZE_16);
792 
793    enum agx_format format = format_for_bitsize(instr->def.bit_size);
794    unsigned nr = instr->def.num_components;
795    unsigned mask = BITFIELD_MASK(nr);
796 
797    agx_local_load_to(b, dst, base, index, format, mask);
798    agx_emit_cached_split(b, dst, nr);
799 }
800 
801 static void
802 agx_emit_local_store(agx_builder *b, nir_intrinsic_instr *instr)
803 {
804    agx_index value = agx_src_index(&instr->src[0]);
805    agx_index base = agx_local_base(instr->src[1]);
806    agx_index index = agx_zero(); /* TODO: optimize address arithmetic */
807    assert(base.size == AGX_SIZE_16);
808 
809    enum agx_format format = format_for_bitsize(nir_src_bit_size(instr->src[0]));
810    unsigned mask = BITFIELD_MASK(
811       nir_src_num_components(instr->src[0])); /* XXX: there's a write mask */
812 
813    agx_local_store(b, value, base, index, format, mask);
814 }
815 
816 static void
817 agx_emit_load_scratch(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr)
818 {
819    agx_index offset = agx_src_index(&instr->src[0]);
820    enum agx_format format = format_for_bitsize(instr->def.bit_size);
821    unsigned nr = instr->def.num_components;
822    unsigned mask = BITFIELD_MASK(nr);
823 
824    agx_stack_load_to(b, dst, offset, format, mask);
825    agx_emit_cached_split(b, dst, nr);
826    b->shader->any_scratch = true;
827 }
828 
829 static void
830 agx_emit_store_scratch(agx_builder *b, nir_intrinsic_instr *instr)
831 {
832    agx_index value = agx_recollect_vector(b, instr->src[0]);
833    agx_index offset = agx_src_index(&instr->src[1]);
834    enum agx_format format = format_for_bitsize(nir_src_bit_size(instr->src[0]));
835    unsigned mask = BITFIELD_MASK(nir_src_num_components(instr->src[0]));
836 
837    agx_stack_store(b, value, offset, format, mask);
838    b->shader->any_scratch = true;
839 }
840 
841 /*
842  * In the hardware, bindless texture sources are specified as a 64-bit uniform
843  * base address summed with a 32-bit register index. In NIR, we model this as a
844  * vec2, where the first source is the (constant) uniform register number and
845  * the second source is the (dynamic) byte offset.
846  */
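/* A sketch of the NIR-side encoding this expects (names illustrative):
 *
 *    handle = nir_vec2(b, nir_imm_int(b, table_base_uniform),
 *                         nir_imul_imm(b, index, descriptor_stride));
 *
 * Component 0 must resolve to a constant so it can become the 64-bit
 * uniform base below; only component 1 may remain dynamic.
 */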
847 static agx_index
848 agx_translate_bindless_handle(agx_builder *b, nir_src *handle, agx_index *base)
849 {
850    nir_scalar base_scalar = nir_scalar_resolved(handle->ssa, 0);
851    assert(nir_scalar_is_const(base_scalar) && "base must be constant");
852 
853    unsigned base_uint = nir_scalar_as_uint(base_scalar);
854    *base = agx_uniform(base_uint, AGX_SIZE_64);
855 
856    return agx_emit_extract(b, agx_src_index(handle), 1);
857 }
858 
859 /*
860  * Unlike NIR, the hardware requires a special sampler for txf. The sampler
861  * cannot be arbitrary, since the hardware honours its clamps, so a particular
862  * configuration is required for correct out-of-bounds behaviour with txf. This
863  * helper gets the shader's txf sampler, allocating one if needed.
864  */
865 static agx_index
866 agx_txf_sampler(agx_context *ctx)
867 {
868    if (!ctx->out->uses_txf) {
869       ctx->out->txf_sampler = BITSET_LAST_BIT(ctx->nir->info.samplers_used);
870       ctx->out->uses_txf = true;
871    }
872 
873    return agx_immediate(ctx->out->txf_sampler);
874 }
875 
876 static unsigned
877 agx_expand_tex_to(agx_builder *b, nir_def *def, agx_index src, bool masked)
878 {
879    unsigned nr_channels = def->num_components;
880    nir_component_mask_t mask = nir_def_components_read(def);
881 
882    if (!masked)
883       mask = (nir_component_mask_t)BITFIELD_MASK(nr_channels);
884 
885    agx_index packed_channels[4] = {agx_null()};
886    agx_index unpacked_channels[4] = {agx_null()};
887 
888    /* Hardware writes the masked components contiguously, expand out for NIR */
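   /* e.g. with mask = 0b1010, the hardware result is packed as {y, w} and is
    * expanded below to {undef, y, undef, w} for NIR.
    */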
889    agx_emit_split(b, packed_channels, src, 4 /* XXX: why not nr_channels */);
890 
891    for (unsigned i = 0; i < nr_channels; ++i) {
892       unpacked_channels[i] =
893          (mask & BITFIELD_BIT(i))
894             ? packed_channels[util_bitcount(mask & BITFIELD_MASK(i))]
895             : agx_undef(src.size);
896    }
897 
898    agx_emit_collect_to(b, agx_def_index(def), nr_channels, unpacked_channels);
899    return mask;
900 }
901 
902 static agx_instr *
903 agx_emit_image_load(agx_builder *b, agx_index dst, nir_intrinsic_instr *intr)
904 {
905    agx_index ms_index = agx_src_index(&intr->src[2]);
906    agx_index lod = agx_src_index(&intr->src[3]);
907    enum agx_lod_mode lod_mode = AGX_LOD_MODE_LOD_MIN;
908 
909    agx_index bindless = agx_immediate(0), texture;
910    if (intr->intrinsic == nir_intrinsic_bindless_image_load)
911       texture = agx_translate_bindless_handle(b, &intr->src[0], &bindless);
912    else if (nir_src_is_const(intr->src[0]) &&
913             nir_src_as_uint(intr->src[0]) < 0x100)
914       texture = agx_immediate(nir_src_as_uint(intr->src[0]));
915    else
916       texture = agx_src_index(&intr->src[0]);
917 
918    assert(nir_src_num_components(intr->src[1]) == 4);
919    agx_index coord[4] = {
920       agx_extract_nir_src(b, intr->src[1], 0),
921       agx_extract_nir_src(b, intr->src[1], 1),
922       agx_extract_nir_src(b, intr->src[1], 2),
923       agx_extract_nir_src(b, intr->src[1], 3),
924    };
925 
926    /* Get the image dimension. Cubes are lowered to 2D, since they are logically
927     * equivalent for imageLoad, but out-of-bounds behaviour for cubes on G13
928     * is wrong according to Piglit's arb_shader_image_load_store-invalid.
929     *
930     * This requires a matching transform in the driver.
931     */
932    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intr);
933    bool is_array = nir_intrinsic_image_array(intr);
934 
935    if (dim == GLSL_SAMPLER_DIM_CUBE) {
936       dim = GLSL_SAMPLER_DIM_2D;
937       is_array = true;
938    }
939 
940    bool is_ms = dim == GLSL_SAMPLER_DIM_MS;
941    unsigned coord_comps = glsl_get_sampler_dim_coordinate_components(dim);
942    if (is_array && is_ms) {
943       agx_index layer = agx_temp(b->shader, AGX_SIZE_16);
944       agx_subdivide_to(b, layer, coord[coord_comps], 0);
945 
946       assert(ms_index.size == AGX_SIZE_16);
947       agx_index vec = agx_vec2(b, ms_index, layer);
948       vec.size = AGX_SIZE_32;
949       vec.channels_m1 = 1 - 1;
950       coord[coord_comps++] = vec;
951    } else if (is_ms) {
952       agx_index tmp = agx_temp(b->shader, AGX_SIZE_32);
953       agx_mov_to(b, tmp, ms_index);
954       coord[coord_comps++] = tmp;
955    } else if (is_array) {
956       coord_comps++;
957    }
958 
959    /* Multisampled images do not support mipmapping */
960    if (is_ms) {
961       lod_mode = AGX_LOD_MODE_AUTO_LOD;
962       lod = agx_zero();
963    }
964 
965    agx_index coords = agx_emit_collect(b, coord_comps, coord);
966    agx_index tmp = agx_vec_temp(b->shader, dst.size, 4);
967 
968    agx_instr *I = agx_image_load_to(
969       b, tmp, coords, lod, bindless, texture, agx_txf_sampler(b->shader),
970       agx_null(), agx_tex_dim(dim, is_array), lod_mode, 0, false);
971    I->mask = agx_expand_tex_to(b, &intr->def, tmp, true);
972    return NULL;
973 }
974 
975 static agx_instr *
976 agx_emit_image_store(agx_builder *b, nir_intrinsic_instr *instr)
977 {
978    /* See remarks in agx_emit_image_load */
979    enum glsl_sampler_dim glsl_dim = nir_intrinsic_image_dim(instr);
980    bool is_array = nir_intrinsic_image_array(instr);
981 
982    if (glsl_dim == GLSL_SAMPLER_DIM_CUBE) {
983       glsl_dim = GLSL_SAMPLER_DIM_2D;
984       is_array = true;
985    }
986 
987    enum agx_dim dim = agx_tex_dim(glsl_dim, is_array);
988    assert(glsl_dim != GLSL_SAMPLER_DIM_MS && "needs to be lowered");
989 
990    agx_index base, index;
991    if (instr->intrinsic == nir_intrinsic_bindless_image_store) {
992       index = agx_translate_bindless_handle(b, &instr->src[0], &base);
993 
994       assert(base.size == AGX_SIZE_64);
995       assert(index.size == AGX_SIZE_32);
996    } else {
997       base = agx_zero();
998       index = agx_src_index(&instr->src[0]);
999 
1000       assert(index.size == AGX_SIZE_16);
1001    }
1002 
1003    agx_index coords4 = agx_src_index(&instr->src[1]);
1004    agx_index lod = agx_src_index(&instr->src[4]);
1005    assert(lod.size == AGX_SIZE_16);
1006 
1007    int coord_components = glsl_get_sampler_dim_coordinate_components(glsl_dim);
1008    if (is_array)
1009       coord_components++;
1010 
1011    agx_index coord_comps[4] = {};
1012    for (unsigned i = 0; i < coord_components; ++i)
1013       coord_comps[i] = agx_emit_extract(b, coords4, i);
1014 
1015    agx_index coords = agx_emit_collect(b, coord_components, coord_comps);
1016    agx_index data = agx_src_index(&instr->src[3]);
1017 
1018    /* If the image format has less than 4 components, nir_opt_shrink_stores can
1019     * shrink the store. But the IR still expects 4 components: pad with undef.
1020     */
1021    if (nir_src_num_components(instr->src[3]) < 4) {
1022       agx_index chan[4] = {agx_null()};
1023 
1024       for (unsigned i = 0; i < 4; ++i) {
1025          if (i < nir_src_num_components(instr->src[3]))
1026             chan[i] = agx_extract_nir_src(b, instr->src[3], i);
1027          else
1028             chan[i] = agx_undef(data.size);
1029       }
1030 
1031       data = agx_emit_collect(b, 4, chan);
1032    }
1033 
1034    return agx_image_write(b, data, coords, lod, base, index, dim);
1035 }
1036 
1037 static agx_instr *
1038 agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
1039 {
1040    agx_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest
1041                       ? agx_def_index(&instr->def)
1042                       : agx_null();
1043    gl_shader_stage stage = b->shader->stage;
1044 
1045    switch (instr->intrinsic) {
1046    case nir_intrinsic_load_barycentric_pixel:
1047    case nir_intrinsic_load_barycentric_centroid:
1048    case nir_intrinsic_load_barycentric_at_sample:
1049    case nir_intrinsic_load_barycentric_at_offset:
1050       /* handled later via load_vary */
1051       return NULL;
1052    case nir_intrinsic_load_interpolated_input:
1053       assert(stage == MESA_SHADER_FRAGMENT);
1054       agx_emit_load_vary(b, dst, instr);
1055       return NULL;
1056 
1057    case nir_intrinsic_load_coefficients_agx:
1058       assert(stage == MESA_SHADER_FRAGMENT);
1059       agx_emit_load_coefficients(b, dst, instr);
1060       return NULL;
1061 
1062    case nir_intrinsic_load_agx:
1063    case nir_intrinsic_load_constant_agx:
1064       agx_emit_load(b, dst, instr);
1065       return NULL;
1066 
1067    case nir_intrinsic_store_output:
1068       assert(stage == MESA_SHADER_VERTEX);
1069       return agx_emit_store_vary(b, instr);
1070 
1071    case nir_intrinsic_store_agx:
1072       agx_emit_store(b, instr);
1073       return NULL;
1074 
1075    case nir_intrinsic_store_shared:
1076       agx_emit_local_store(b, instr);
1077       return NULL;
1078 
1079    case nir_intrinsic_load_shared:
1080       agx_emit_local_load(b, dst, instr);
1081       return NULL;
1082 
1083    case nir_intrinsic_global_atomic_agx:
1084    case nir_intrinsic_global_atomic_swap_agx:
1085       agx_emit_atomic(b, dst, instr, false);
1086       return NULL;
1087 
1088    case nir_intrinsic_shared_atomic:
1089    case nir_intrinsic_shared_atomic_swap:
1090       agx_emit_atomic(b, dst, instr, true);
1091       return NULL;
1092 
1093    case nir_intrinsic_store_zs_agx:
1094       assert(stage == MESA_SHADER_FRAGMENT);
1095       return agx_emit_store_zs(b, instr);
1096 
1097    case nir_intrinsic_store_local_pixel_agx:
1098       assert(stage == MESA_SHADER_FRAGMENT);
1099       return agx_emit_local_store_pixel(b, instr);
1100 
1101    case nir_intrinsic_load_local_pixel_agx:
1102       assert(stage == MESA_SHADER_FRAGMENT);
1103       agx_emit_local_load_pixel(b, dst, instr);
1104       return NULL;
1105 
1106    case nir_intrinsic_load_pixel_coord:
1107       return agx_emit_collect_to(
1108          b, dst, 2,
1109          (agx_index[2]){
1110             agx_get_sr(b, 16, AGX_SR_THREAD_POSITION_IN_GRID_X),
1111             agx_get_sr(b, 16, AGX_SR_THREAD_POSITION_IN_GRID_Y),
1112          });
1113 
1114    case nir_intrinsic_load_frag_coord_zw: {
1115       agx_index cf = agx_get_cf(b->shader, true, false, VARYING_SLOT_POS,
1116                                 nir_intrinsic_component(instr), 1);
1117 
1118       return agx_iter_to(b, dst, cf, agx_zero(), 1, AGX_INTERPOLATION_CENTER);
1119    }
1120 
1121    case nir_intrinsic_sample_mask_agx: {
1122       assert(stage == MESA_SHADER_FRAGMENT);
1123       b->shader->out->writes_sample_mask = true;
1124 
1125       agx_wait_pix(b, 0x0001);
1126       return agx_sample_mask(b, agx_src_index(&instr->src[0]),
1127                              agx_src_index(&instr->src[1]));
1128    }
1129 
1130    case nir_intrinsic_load_back_face_agx:
1131       return agx_get_sr_to(b, dst, AGX_SR_BACKFACING);
1132 
1133    case nir_intrinsic_load_sample_mask_in:
1134       return agx_get_sr_to(b, dst, AGX_SR_INPUT_SAMPLE_MASK);
1135 
1136    case nir_intrinsic_load_sample_mask:
1137       return agx_get_sr_coverage_to(b, dst, AGX_SR_COVERAGE_MASK);
1138 
1139    case nir_intrinsic_load_helper_invocation:
1140       /* Compare special register to zero. We could lower this in NIR (letting
1141        * us fold in an inot) but meh?
1142        */
1143       return agx_icmp_to(b, dst,
1144                          agx_get_sr_coverage(b, 32, AGX_SR_IS_ACTIVE_THREAD),
1145                          agx_zero(), AGX_ICOND_UEQ, false);
1146 
1147    case nir_intrinsic_load_vertex_id:
1148       assert(b->shader->stage == MESA_SHADER_VERTEX);
1149       return agx_mov_to(b, dst, agx_abs(agx_vertex_id(b)));
1150 
1151    case nir_intrinsic_load_instance_id:
1152       assert(b->shader->stage == MESA_SHADER_VERTEX);
1153       return agx_mov_to(b, dst, agx_abs(agx_instance_id(b)));
1154 
1155    case nir_intrinsic_load_preamble:
1156       return agx_emit_load_preamble(b, dst, instr);
1157 
1158    case nir_intrinsic_store_preamble:
1159       return agx_emit_store_preamble(b, instr);
1160 
1161    case nir_intrinsic_image_load:
1162    case nir_intrinsic_bindless_image_load:
1163       return agx_emit_image_load(b, dst, instr);
1164 
1165    case nir_intrinsic_image_store:
1166    case nir_intrinsic_bindless_image_store:
1167       return agx_emit_image_store(b, instr);
1168 
1169    case nir_intrinsic_block_image_store_agx:
1170       return agx_emit_block_image_store(b, instr);
1171 
1172    case nir_intrinsic_load_workgroup_id:
1173       return agx_load_compute_dimension(b, dst, instr,
1174                                         AGX_SR_THREADGROUP_POSITION_IN_GRID_X);
1175 
1176    case nir_intrinsic_load_workgroup_size:
1177       return agx_load_compute_dimension(b, dst, instr,
1178                                         AGX_SR_THREADS_PER_THREADGROUP_X);
1179 
1180    case nir_intrinsic_load_global_invocation_id:
1181    case nir_intrinsic_load_global_invocation_id_zero_base:
1182       return agx_load_compute_dimension(b, dst, instr,
1183                                         AGX_SR_THREAD_POSITION_IN_GRID_X);
1184 
1185    case nir_intrinsic_load_local_invocation_id:
1186       return agx_load_compute_dimension(
1187          b, dst, instr, AGX_SR_THREAD_POSITION_IN_THREADGROUP_X);
1188 
1189    case nir_intrinsic_load_local_invocation_index:
1190       return agx_get_sr_to(b, dst, AGX_SR_THREAD_INDEX_IN_THREADGROUP);
1191 
1192    case nir_intrinsic_barrier: {
1193       assert(!b->shader->is_preamble && "invalid");
1194 
1195       bool needs_image_barriers = false;
1196 
1197       if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE) {
1198          nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
1199 
1200          if (modes & (nir_var_mem_global | nir_var_image))
1201             agx_memory_barrier(b);
1202 
1203          if (modes & nir_var_image) {
1204             agx_image_barrier_1(b);
1205             agx_image_barrier_2(b);
1206             needs_image_barriers = true;
1207          }
1208       }
1209 
1210       if (nir_intrinsic_execution_scope(instr) != SCOPE_NONE) {
1211          assert(nir_intrinsic_execution_scope(instr) > SCOPE_SUBGROUP &&
1212                 "todo: subgroup barriers");
1213          assert(gl_shader_stage_is_compute(b->shader->nir->info.stage));
1214 
1215          agx_threadgroup_barrier(b);
1216       }
1217 
1218       if (needs_image_barriers) {
1219          agx_image_barrier_3(b);
1220          agx_image_barrier_4(b);
1221       }
1222 
1223       return NULL;
1224    }
1225 
1226    case nir_intrinsic_fence_pbe_to_tex_agx: {
1227       agx_image_barrier_1(b);
1228       agx_image_barrier_2(b);
1229       agx_image_barrier_3(b);
1230       agx_image_barrier_4(b);
1231       return NULL;
1232    }
1233 
1234    case nir_intrinsic_fence_mem_to_tex_agx: {
1235       /* Flush out the atomic to main memory... Found experimentally... */
1236       agx_memory_barrier(b);
1237       agx_memory_barrier_2(b);
1238 
1239       /* TODO: Which ones do we actually need? */
1240       agx_image_barrier_1(b);
1241       agx_image_barrier_2(b);
1242       agx_image_barrier_3(b);
1243       agx_image_barrier_4(b);
1244 
1245       /* Flush out the texture cache */
1246       agx_flush_memory_to_texture(b);
1247       return NULL;
1248    }
1249 
1250    case nir_intrinsic_fence_pbe_to_tex_pixel_agx: {
1251       agx_image_barrier_1(b);
1252       agx_image_barrier_2(b);
1253       agx_flush_memory_to_texture(b);
1254       agx_image_barrier_3(b);
1255       return NULL;
1256    }
1257 
1258    case nir_intrinsic_fence_helper_exit_agx: {
1259       assert(b->shader->key->is_helper);
1260       agx_memory_barrier(b);
1261       agx_unknown_barrier_1(b);
1262       agx_memory_barrier_2(b);
1263       agx_unknown_barrier_2(b);
1264       agx_memory_barrier_3(b);
1265       return NULL;
1266    }
1267 
1268    case nir_intrinsic_begin_invocation_interlock: {
1269       if (!b->shader->did_writeout &&
1270           !b->shader->key->fs.ignore_tib_dependencies)
1271          agx_wait_pix(b, 0x000C);
1272 
1273       b->shader->did_writeout = true;
1274       return NULL;
1275    }
1276 
1277    case nir_intrinsic_load_subgroup_invocation:
1278       return agx_get_sr_to(b, dst, AGX_SR_THREAD_INDEX_IN_SUBGROUP);
1279 
1280    case nir_intrinsic_load_active_subgroup_invocation_agx:
1281       return agx_get_sr_coverage_to(b, dst,
1282                                     AGX_SR_ACTIVE_THREAD_INDEX_IN_SUBGROUP);
1283 
1284    case nir_intrinsic_reduce: {
1285       assert(nir_intrinsic_reduction_op(instr) == nir_op_iadd &&
1286              "other reductions todo");
1287 
1288       return agx_simd_iadd_to(b, dst, agx_src_index(&instr->src[0]));
1289    }
1290 
1291    case nir_intrinsic_exclusive_scan: {
1292       assert(nir_intrinsic_reduction_op(instr) == nir_op_iadd &&
1293              "other reductions todo");
1294 
1295       return agx_simd_prefix_iadd_to(b, dst, agx_src_index(&instr->src[0]));
1296    }
1297 
1298    case nir_intrinsic_read_invocation: {
1299       /* Lane ID guaranteed to be uniform */
1300       return agx_simd_shuffle_to(b, dst, agx_src_index(&instr->src[0]),
1301                                  agx_src_index(&instr->src[1]));
1302    }
1303 
1304    case nir_intrinsic_ballot: {
1305       return agx_ballot_to(b, dst, agx_src_index(&instr->src[0]));
1306    }
1307 
1308    case nir_intrinsic_doorbell_agx: {
1309       return agx_doorbell(b, nir_src_as_uint(instr->src[0]));
1310    }
1311 
1312    case nir_intrinsic_stack_map_agx: {
1313       return agx_stack_map(b, agx_src_index(&instr->src[1]),
1314                            nir_src_as_uint(instr->src[0]));
1315    }
1316 
1317    case nir_intrinsic_stack_unmap_agx: {
1318       return agx_stack_unmap_to(b, dst, nir_src_as_uint(instr->src[0]));
1319    }
1320 
1321    case nir_intrinsic_load_scratch:
1322       agx_emit_load_scratch(b, dst, instr);
1323       return NULL;
1324 
1325    case nir_intrinsic_store_scratch:
1326       agx_emit_store_scratch(b, instr);
1327       return NULL;
1328 
1329    case nir_intrinsic_load_core_id_agx:
1330       return agx_get_sr_to(b, dst, AGX_SR_CORE_ID);
1331 
1332    case nir_intrinsic_load_helper_op_id_agx:
1333       assert(b->shader->key->is_helper);
1334       return agx_get_sr_barrier_to(b, dst, AGX_SR_HELPER_OP);
1335 
1336    case nir_intrinsic_load_helper_arg_lo_agx:
1337       assert(b->shader->key->is_helper);
1338       return agx_get_sr_barrier_to(b, dst, AGX_SR_HELPER_ARG_L);
1339 
1340    case nir_intrinsic_load_helper_arg_hi_agx:
1341       assert(b->shader->key->is_helper);
1342       return agx_get_sr_barrier_to(b, dst, AGX_SR_HELPER_ARG_H);
1343 
1344    case nir_intrinsic_load_barycentric_sample:
1345    case nir_intrinsic_load_sample_id:
1346    case nir_intrinsic_load_sample_pos:
1347       unreachable("Sample shading should have been lowered");
1348 
1349    default:
1350       fprintf(stderr, "Unhandled intrinsic %s\n",
1351               nir_intrinsic_infos[instr->intrinsic].name);
1352       unreachable("Unhandled intrinsic");
1353    }
1354 }
1355 
1356 static agx_index
1357 agx_alu_src_index(agx_builder *b, nir_alu_src src)
1358 {
1359    /* Check well-formedness of the input NIR */
1360    ASSERTED unsigned bitsize = nir_src_bit_size(src.src);
1361    unsigned comps = nir_src_num_components(src.src);
1362    unsigned channel = src.swizzle[0];
1363 
1364    assert(bitsize == 1 || bitsize == 8 || bitsize == 16 || bitsize == 32 ||
1365           bitsize == 64);
1366    assert(channel < comps);
1367 
1368    return agx_extract_nir_src(b, src.src, channel);
1369 }
1370 
1371 /*
1372  * Emit an instruction translating (s0 * s1) + (s2 << s3). Assuming s3 is
1373  * constant, this is an imad instruction. If s1 == 1, then this is optimized to
1374  * an iadd instruction, which is faster.
1375  */
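/* For example, (a * b) + (c << 2) maps onto a single imad, while
 * (a * 1) + (c << 2) becomes the cheaper iadd with the same shift:
 * 3 * 1 + (5 << 2) = 23.
 */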
1376 static agx_instr *
1377 agx_emit_imadshl_agx(agx_builder *b, nir_alu_instr *alu, agx_index dst,
1378                      agx_index s0, agx_index s1, agx_index s2, agx_index s3)
1379 {
1380    /* If the shift is not constant, use a variable shift. This should never
1381     * happen in practice but we don't want to constrain the NIR.
1382     */
1383    unsigned shift;
1384    if (!nir_src_is_const(alu->src[3].src)) {
1385       s2 = agx_bfi(b, agx_immediate(0), s2, s3, 0);
1386       shift = 0;
1387    } else {
1388       shift = nir_alu_src_as_uint(alu->src[3]);
1389    }
1390 
1391    assert(shift <= 4 && "domain restriction on the input NIR");
1392 
1393    /* Emit iadd if possible, else imad */
1394    if (nir_src_is_const(alu->src[1].src) &&
1395        nir_alu_src_as_uint(alu->src[1]) == 1) {
1396 
1397       return agx_iadd_to(b, dst, s0, s2, shift);
1398    } else {
1399       return agx_imad_to(b, dst, s0, s1, s2, shift);
1400    }
1401 }
1402 
1403 static bool
1404 is_conversion_to_8bit(nir_op op)
1405 {
1406    switch (op) {
1407    case nir_op_i2i8:
1408    case nir_op_u2u8:
1409    case nir_op_f2i8:
1410    case nir_op_f2u8:
1411    case nir_op_b2i8:
1412       return true;
1413    default:
1414       return false;
1415    }
1416 }
1417 
1418 static agx_instr *
1419 agx_emit_alu(agx_builder *b, nir_alu_instr *instr)
1420 {
1421    unsigned srcs = nir_op_infos[instr->op].num_inputs;
1422    unsigned sz = instr->def.bit_size;
1423    unsigned src_sz = srcs ? nir_src_bit_size(instr->src[0].src) : 0;
1424    ASSERTED unsigned comps = instr->def.num_components;
1425 
1426    assert(comps == 1 || nir_op_is_vec_or_mov(instr->op));
1427    assert(
1428       sz == 1 ||
1429       ((nir_op_is_vec_or_mov(instr->op) || is_conversion_to_8bit(instr->op)) &&
1430        sz == 8) ||
1431       sz == 16 || sz == 32 || sz == 64);
1432 
1433    agx_index dst = agx_def_index(&instr->def);
1434    agx_index s0 = srcs > 0 ? agx_alu_src_index(b, instr->src[0]) : agx_null();
1435    agx_index s1 = srcs > 1 ? agx_alu_src_index(b, instr->src[1]) : agx_null();
1436    agx_index s2 = srcs > 2 ? agx_alu_src_index(b, instr->src[2]) : agx_null();
1437    agx_index s3 = srcs > 3 ? agx_alu_src_index(b, instr->src[3]) : agx_null();
1438 
1439    agx_index i0 = agx_immediate(0);
1440    agx_index i1 = agx_immediate(1);
1441 
1442 #define UNOP(nop, aop)                                                         \
1443    case nir_op_##nop:                                                          \
1444       return agx_##aop##_to(b, dst, s0);
1445 #define BINOP(nop, aop)                                                        \
1446    case nir_op_##nop:                                                          \
1447       return agx_##aop##_to(b, dst, s0, s1);
1448 #define TRIOP(nop, aop)                                                        \
1449    case nir_op_##nop:                                                          \
1450       return agx_##aop##_to(b, dst, s0, s1, s2);
1451 
1452    switch (instr->op) {
1453       BINOP(fadd, fadd);
1454       BINOP(fmul, fmul);
1455       TRIOP(ffma, fma);
1456 
1457       UNOP(f2f16, fmov);
1458       UNOP(f2f16_rtne, fmov);
1459       UNOP(f2f32, fmov);
1460       UNOP(fround_even, roundeven);
1461       UNOP(ftrunc, trunc);
1462       UNOP(ffloor, floor);
1463       UNOP(fceil, ceil);
1464       UNOP(frcp, rcp);
1465       UNOP(frsq, rsqrt);
1466       UNOP(flog2, log2);
1467       UNOP(fexp2, exp2);
1468 
1469       UNOP(fddx, dfdx);
1470       UNOP(fddx_coarse, dfdx);
1471       UNOP(fddx_fine, dfdx);
1472 
1473       UNOP(fddy, dfdy);
1474       UNOP(fddy_coarse, dfdy);
1475       UNOP(fddy_fine, dfdy);
1476 
1477       UNOP(mov, mov);
1478       UNOP(u2u32, mov);
1479       UNOP(bitfield_reverse, bitrev);
1480       UNOP(bit_count, popcount);
1481       UNOP(ufind_msb, ffs);
1482       BINOP(iand, and);
1483       BINOP(ior, or);
1484       BINOP(ixor, xor);
1485       BINOP(interleave_agx, intl);
1486 
1487    case nir_op_feq:
1488       return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_EQ, false);
1489    case nir_op_flt:
1490       return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_LT, false);
1491    case nir_op_fge:
1492       return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_GE, false);
1493    case nir_op_fneu:
1494       return agx_fcmp_to(b, dst, s0, s1, AGX_FCOND_EQ, true);
1495 
1496    case nir_op_ieq:
1497       return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_UEQ, false);
1498    case nir_op_ine:
1499       return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_UEQ, true);
1500    case nir_op_ilt:
1501       return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_SLT, false);
1502    case nir_op_ige:
1503       return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_SLT, true);
1504    case nir_op_ult:
1505       return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_ULT, false);
1506    case nir_op_uge:
1507       return agx_icmp_to(b, dst, s0, s1, AGX_ICOND_ULT, true);
1508 
1509    case nir_op_inot:
1510       if (sz == 1)
1511          return agx_xor_to(b, dst, s0, i1);
1512       else
1513          return agx_not_to(b, dst, s0);
1514 
1515    case nir_op_b2b1:
1516       return agx_icmp_to(b, dst, s0, i0, AGX_ICOND_UEQ, true);
1517 
1518    case nir_op_fsqrt:
1519       return agx_fmul_to(b, dst, s0, agx_srsqrt(b, s0));
1520    case nir_op_fabs:
1521       return agx_fmov_to(b, dst, agx_abs(s0));
1522    case nir_op_fneg:
1523       return agx_fmov_to(b, dst, agx_neg(s0));
1524 
1525    case nir_op_fmin: {
1526       agx_index tmp = agx_fcmpsel(b, s0, s1, s0, s1, AGX_FCOND_LTN);
1527       /* flush denorms */
1528       return agx_fadd_to(b, dst, tmp, agx_negzero());
1529    }
1530    case nir_op_fmax: {
1531       agx_index tmp = agx_fcmpsel(b, s0, s1, s0, s1, AGX_FCOND_GTN);
1532       /* flush denorms */
1533       return agx_fadd_to(b, dst, tmp, agx_negzero());
1534    }
1535    case nir_op_imin:
1536       return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SLT);
1537    case nir_op_imax:
1538       return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SGT);
1539    case nir_op_umin:
1540       return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_ULT);
1541    case nir_op_umax:
1542       return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_UGT);
1543 
1544    case nir_op_iadd:
1545       return agx_iadd_to(b, dst, s0, s1, 0);
1546    case nir_op_imadshl_agx:
1547       return agx_emit_imadshl_agx(b, instr, dst, s0, s1, s2, s3);
1548    case nir_op_imsubshl_agx:
1549       return agx_emit_imadshl_agx(b, instr, dst, s0, s1, agx_neg(s2), s3);
1550    case nir_op_isub:
1551       return agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
1552    case nir_op_ineg:
1553       return agx_iadd_to(b, dst, i0, agx_neg(s0), 0);
1554    case nir_op_imul:
1555       return agx_imad_to(b, dst, s0, s1, i0, 0);
1556    case nir_op_umul_2x32_64:
1557       return agx_imad_to(b, dst, agx_abs(s0), agx_abs(s1), i0, 0);
1558    case nir_op_imul_2x32_64:
1559       return agx_imad_to(b, dst, s0, s1, i0, 0);
1560    case nir_op_umul_high:
1561       return agx_mul_high_to(b, dst, s0, s1, false);
1562    case nir_op_imul_high:
1563       return agx_mul_high_to(b, dst, s0, s1, true);
1564 
1565    case nir_op_ishl:
1566       return agx_bfi_to(b, dst, i0, s0, s1, 0);
1567    case nir_op_ushr:
1568       return agx_ushr_to(b, dst, s0, s1);
1569    case nir_op_ishr:
1570       return agx_asr_to(b, dst, s0, s1);
1571 
1572    case nir_op_extr_agx:
1573       return agx_extr_to(b, dst, s0, s1, s2,
1574                          nir_alu_src_as_uint(instr->src[3]));
1575 
1576    case nir_op_ubitfield_extract: {
1577       unsigned m = nir_alu_src_as_uint(instr->src[2]);
1578       assert(m != 0 && "should've been optimized");
1579 
1580       /* Disable masking if the whole thing is used */
1581       if (m >= 32)
1582          m = 0;
1583 
1584       return agx_bfeil_to(b, dst, i0, s0, s1, m);
1585    }
1586 
1587    case nir_op_bcsel:
1588       return agx_icmpsel_to(b, dst, s0, i0, s2, s1, AGX_ICOND_UEQ);
1589 
1590    case nir_op_i2i32: {
1591       if (src_sz == 8) {
1592          /* Sign extend in software, since NIR likes 8-bit conversions */
1593          agx_index ishl16 = agx_bfi(b, i0, s0, agx_immediate(8), 0);
1594          return agx_asr_to(b, dst, ishl16, agx_immediate(8));
1595       } else {
1596          assert(s0.size == AGX_SIZE_16 && "other conversions lowered");
1597          return agx_iadd_to(b, dst, s0, i0, 0);
1598       }
1599    }
1600 
1601    case nir_op_i2i16: {
1602       if (src_sz == 8) {
1603          /* Sign extend in software, since NIR likes 8-bit conversions */
1604          agx_index ishl16 = agx_bfi(b, i0, s0, agx_immediate(8), 0);
1605          return agx_asr_to(b, dst, ishl16, agx_immediate(8));
1606       } else {
1607          assert(s0.size == AGX_SIZE_32 && "other conversions lowered");
1608          return agx_subdivide_to(b, dst, s0, 0);
1609       }
1610    }
1611 
1612    case nir_op_u2u16: {
1613       if (s0.size == AGX_SIZE_32)
1614          return agx_subdivide_to(b, dst, s0, 0);
1615       else
1616          return agx_mov_to(b, dst, s0);
1617    }
1618 
1619    /* The result will be put into a 16-bit register, so zero out the garbage
1620     * upper bits. We could optimize this in the future, but it ensures
1621     * correctness for u2u16(u2u8(x)) sequences.
1622     */
1623    case nir_op_u2u8:
1624    case nir_op_i2i8:
1625       return agx_and_to(b, dst, s0, agx_immediate(0xFF));
1626 
1627    case nir_op_iadd_sat: {
1628       agx_instr *I = agx_iadd_to(b, dst, s0, s1, 0);
1629       I->saturate = true;
1630       return I;
1631    }
1632 
1633    case nir_op_isub_sat: {
1634       agx_instr *I = agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
1635       I->saturate = true;
1636       return I;
1637    }
1638 
1639    case nir_op_uadd_sat: {
1640       agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_abs(s1), 0);
1641       I->saturate = true;
1642       return I;
1643    }
1644 
1645    case nir_op_usub_sat: {
1646       agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_neg(agx_abs(s1)), 0);
1647       I->saturate = true;
1648       return I;
1649    }
1650 
1651    case nir_op_fsat: {
1652       agx_instr *I = agx_fadd_to(b, dst, s0, agx_negzero());
1653       I->saturate = true;
1654       return I;
1655    }
1656 
1657    case nir_op_fsin_agx: {
1658       agx_index fixup = agx_sin_pt_1(b, s0);
1659       agx_index sinc = agx_sin_pt_2(b, fixup);
1660       return agx_fmul_to(b, dst, sinc, fixup);
1661    }
1662 
1663    case nir_op_f2i16:
1664       return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_S16), s0,
1665                             AGX_ROUND_RTZ);
1666 
1667    case nir_op_f2i32:
1668       return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_S32), s0,
1669                             AGX_ROUND_RTZ);
1670 
1671    case nir_op_f2u16:
1672       return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_U16), s0,
1673                             AGX_ROUND_RTZ);
1674 
1675    case nir_op_f2u32:
1676       return agx_convert_to(b, dst, agx_immediate(AGX_CONVERT_F_TO_U32), s0,
1677                             AGX_ROUND_RTZ);
1678 
1679    case nir_op_u2f16:
1680    case nir_op_u2f32: {
1681       if (src_sz == 64)
1682          unreachable("64-bit conversions unimplemented");
1683 
1684       enum agx_convert mode = (src_sz == 32)   ? AGX_CONVERT_U32_TO_F
1685                               : (src_sz == 16) ? AGX_CONVERT_U16_TO_F
1686                                                : AGX_CONVERT_U8_TO_F;
1687 
1688       return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
1689    }
1690 
1691    case nir_op_i2f16:
1692    case nir_op_i2f32: {
1693       if (src_sz == 64)
1694          unreachable("64-bit conversions unimplemented");
1695 
1696       enum agx_convert mode = (src_sz == 32)   ? AGX_CONVERT_S32_TO_F
1697                               : (src_sz == 16) ? AGX_CONVERT_S16_TO_F
1698                                                : AGX_CONVERT_S8_TO_F;
1699 
1700       return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
1701    }
1702 
1703    case nir_op_pack_32_2x16_split:
1704    case nir_op_pack_64_2x32_split: {
1705       agx_index idx[] = {s0, s1};
1706       return agx_emit_collect_to(b, dst, 2, idx);
1707    }
1708 
1709    case nir_op_unpack_64_2x32_split_x:
1710    case nir_op_unpack_32_2x16_split_x:
1711       return agx_subdivide_to(b, dst, s0, 0);
1712 
1713    case nir_op_unpack_64_2x32_split_y:
1714    case nir_op_unpack_32_2x16_split_y:
1715       return agx_subdivide_to(b, dst, s0, 1);
1716 
1717    case nir_op_vec2:
1718    case nir_op_vec3:
1719    case nir_op_vec4: {
1720       agx_index idx[] = {s0, s1, s2, s3};
1721       return agx_emit_collect_to(b, dst, srcs, idx);
1722    }
1723 
1724    case nir_op_vec8:
1725    case nir_op_vec16:
1726       unreachable("should've been lowered");
1727 
1728    default:
1729       fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
1730       unreachable("Unhandled ALU instruction");
1731    }
1732 }
1733 
1734 static enum agx_lod_mode
1735 agx_lod_mode_for_nir(nir_texop op, bool biased)
1736 {
1737    switch (op) {
1738    case nir_texop_tex:
1739    case nir_texop_tg4:
1740       return AGX_LOD_MODE_AUTO_LOD;
1741    case nir_texop_txb:
1742       return AGX_LOD_MODE_AUTO_LOD_BIAS;
1743    case nir_texop_lod:
1744       return biased ? AGX_LOD_MODE_AUTO_LOD_BIAS : AGX_LOD_MODE_AUTO_LOD;
1745    case nir_texop_txd:
1746       return AGX_LOD_MODE_LOD_GRAD;
1747    case nir_texop_txl:
1748       return AGX_LOD_MODE_LOD_MIN;
1749    case nir_texop_txf:
1750       return AGX_LOD_MODE_LOD_MIN;
1751    case nir_texop_txf_ms:
1752       return AGX_LOD_MODE_AUTO_LOD; /* no mipmapping */
1753    default:
1754       unreachable("Unhandled texture op");
1755    }
1756 }
1757 
1758 static enum agx_gather
1759 agx_gather_for_nir(nir_tex_instr *tex)
1760 {
1761    if (tex->op == nir_texop_tg4) {
1762       enum agx_gather components[] = {
1763          AGX_GATHER_R,
1764          AGX_GATHER_G,
1765          AGX_GATHER_B,
1766          AGX_GATHER_A,
1767       };
1768 
1769       assert(tex->component < ARRAY_SIZE(components));
1770       return components[tex->component];
1771    } else {
1772       return AGX_GATHER_NONE;
1773    }
1774 }
1775 
1776 static void
1777 agx_emit_tex(agx_builder *b, nir_tex_instr *instr)
1778 {
1779    agx_index coords = agx_null(), bindless = agx_immediate(0),
1780              texture = agx_immediate(instr->texture_index),
1781              sampler = agx_immediate(instr->sampler_index),
1782              lod = agx_immediate(0), compare = agx_null(),
1783              packed_offset = agx_null();
1784 
1785    bool txf = (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms);
1786 
1787    if (txf)
1788       sampler = agx_txf_sampler(b->shader);
1789 
1790    for (unsigned i = 0; i < instr->num_srcs; ++i) {
1791       agx_index index = agx_src_index(&instr->src[i].src);
1792 
1793       switch (instr->src[i].src_type) {
1794       case nir_tex_src_backend1:
1795          coords = index;
1796          break;
1797 
1798       case nir_tex_src_backend2:
1799          packed_offset = index;
1800          break;
1801 
1802       case nir_tex_src_lod:
1803       case nir_tex_src_bias:
1804          lod = index;
1805          break;
1806 
1807       case nir_tex_src_comparator:
1808          assert(index.size == AGX_SIZE_32);
1809          compare = index;
1810          break;
1811 
1812       case nir_tex_src_texture_offset:
1813          texture = index;
1814          break;
1815       case nir_tex_src_sampler_offset:
1816       case nir_tex_src_sampler_handle:
1817          sampler = index;
1818          break;
1819 
1820       case nir_tex_src_texture_handle:
1821          texture =
1822             agx_translate_bindless_handle(b, &instr->src[i].src, &bindless);
1823          break;
1824 
1825       case nir_tex_src_ddx: {
1826          int y_idx = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
1827          assert(y_idx >= 0 && "we only handle gradients");
1828 
1829          unsigned n = nir_tex_instr_src_size(instr, y_idx);
1830          assert((n == 2 || n == 3) && "other sizes not supported");
1831 
1832          agx_index index2 = agx_src_index(&instr->src[y_idx].src);
1833 
1834          /* We explicitly don't care about the split cache for this */
1835          lod = agx_vec_temp(b->shader, AGX_SIZE_32, 2 * n);
1836          agx_instr *I = agx_collect_to(b, lod, 2 * n);
1837 
1838          for (unsigned i = 0; i < n; ++i) {
1839             I->src[(2 * i) + 0] = agx_emit_extract(b, index, i);
1840             I->src[(2 * i) + 1] = agx_emit_extract(b, index2, i);
1841          }
1842 
1843          break;
1844       }
1845 
1846       case nir_tex_src_ddy:
1847          /* handled above */
1848          break;
1849 
1850       default:
1851          unreachable("Unexpected texture source");
1852       }
1853    }
1854 
1855    agx_index dst = agx_def_index(&instr->def);
1856 
1857    /* Pack shadow reference value (compare) and packed offset together */
1858    agx_index compare_offset = agx_null();
1859 
1860    if (!agx_is_null(compare) && !agx_is_null(packed_offset))
1861       compare_offset = agx_vec2(b, compare, packed_offset);
1862    else if (!agx_is_null(packed_offset))
1863       compare_offset = packed_offset;
1864    else if (!agx_is_null(compare))
1865       compare_offset = compare;
1866 
1867    agx_index tmp = agx_vec_temp(b->shader, dst.size, 4);
1868    agx_instr *I = agx_texture_sample_to(
1869       b, tmp, coords, lod, bindless, texture, sampler, compare_offset,
1870       agx_tex_dim(instr->sampler_dim, instr->is_array),
1871       agx_lod_mode_for_nir(
1872          instr->op, nir_tex_instr_src_index(instr, nir_tex_src_bias) >= 0),
1873       0, !agx_is_null(packed_offset), !agx_is_null(compare),
1874       instr->op == nir_texop_lod, agx_gather_for_nir(instr));
1875 
1876    if (txf)
1877       I->op = AGX_OPCODE_TEXTURE_LOAD;
1878 
1879    /* Destination masking doesn't seem to work properly for gathers (because
1880     * it's mostly pointless), but it does show up in the lowering of
1881     * textureGatherOffsets. Don't try to mask the destination for gathers.
1882     */
1883    bool masked = (instr->op != nir_texop_tg4);
1884    I->mask = agx_expand_tex_to(b, &instr->def, tmp, masked);
1885 }
1886 
1887 /*
1888  * Determine if a NIR loop (CF list) uses a continue jump, including within
1889  * if-else statements but not including nested loops.
1890  */
1891 static bool
1892 cf_list_uses_continue(struct exec_list *list)
1893 {
1894    foreach_list_typed(nir_cf_node, node, node, list) {
1895       if (node->type == nir_cf_node_block) {
1896          nir_block *block = nir_cf_node_as_block(node);
1897 
1898          nir_foreach_instr(instr, block) {
1899             if (instr->type == nir_instr_type_jump &&
1900                 nir_instr_as_jump(instr)->type == nir_jump_continue)
1901                return true;
1902          }
1903       } else if (node->type == nir_cf_node_if) {
1904          nir_if *nif = nir_cf_node_as_if(node);
1905 
1906          if (cf_list_uses_continue(&nif->then_list) ||
1907              cf_list_uses_continue(&nif->else_list))
1908             return true;
1909       } else {
1910          assert(node->type == nir_cf_node_loop && "don't care about nesting");
1911       }
1912    }
1913 
1914    return false;
1915 }
1916 
1917 static bool
1918 loop_uses_continue(nir_loop *loop)
1919 {
1920    return cf_list_uses_continue(&loop->body);
1921 }
1922 
1923 /*
1924  * NIR loops are treated as a pair of AGX loops:
1925  *
1926  *    do {
1927  *       do {
1928  *          ...
1929  *       } while (0);
1930  *    } while (cond);
1931  *
1932  * By manipulating the nesting counter, we may break out of nested loops, so
1933  * under the model, both break and continue may be implemented as breaks, where
1934  * break breaks out of the outer loop (2 layers) and continue breaks out of the
1935  * inner loop (1 layer).
1936  *
1937  * After manipulating the nesting counter directly, pop_exec #0 must be used to
1938  * flush the update to the execution mask.
1939  */
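
/*
 * Concretely (illustrative sketch derived from agx_emit_jump below, not an
 * upstream comment): inside a loop that uses continue, a NIR continue emits a
 * break with nestings = loop_nesting + 1, while a NIR break uses
 * loop_nesting + 2; if the loop has no continue, break only needs
 * loop_nesting + 1.
 */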
1940 static void
1941 agx_emit_jump(agx_builder *b, nir_jump_instr *instr)
1942 {
1943    agx_context *ctx = b->shader;
1944    assert(instr->type == nir_jump_break || instr->type == nir_jump_continue);
1945 
1946    /* Break out of either one or two loops */
1947    unsigned nestings = b->shader->loop_nesting;
1948 
1949    if (instr->type == nir_jump_continue) {
1950       nestings += 1;
1951       agx_block_add_successor(ctx->current_block, ctx->continue_block);
1952    } else if (instr->type == nir_jump_break) {
1953       nestings += ctx->loop_continues ? 2 : 1;
1954       agx_block_add_successor(ctx->current_block, ctx->break_block);
1955    }
1956 
1957    agx_break(b, nestings, ctx->break_block);
1958    ctx->current_block->unconditional_jumps = true;
1959 }
1960 
1961 static void
1962 agx_emit_phi(agx_builder *b, nir_phi_instr *instr)
1963 {
1964    agx_instr *I =
1965       agx_phi_to(b, agx_def_index(&instr->def), exec_list_length(&instr->srcs));
1966 
1967    /* Deferred */
1968    I->phi = instr;
1969 }
1970 
1971 /* Look up the AGX block corresponding to a given NIR block. Used when
1972  * translating phi nodes after emitting all blocks.
1973  */
1974 static agx_block *
1975 agx_from_nir_block(agx_context *ctx, nir_block *block)
1976 {
1977    return ctx->indexed_nir_blocks[block->index];
1978 }
1979 
1980 static void
1981 agx_emit_phi_deferred(agx_context *ctx, agx_block *block, agx_instr *I)
1982 {
1983    nir_phi_instr *phi = I->phi;
1984 
1985    /* Guaranteed by lower_phis_to_scalar */
1986    assert(phi->def.num_components == 1);
1987 
1988    nir_foreach_phi_src(src, phi) {
1989       agx_block *pred = agx_from_nir_block(ctx, src->pred);
1990       unsigned i = agx_predecessor_index(block, pred);
1991       assert(i < I->nr_srcs);
1992 
1993       I->src[i] = agx_src_index(&src->src);
1994    }
1995 }
1996 
1997 static void
1998 agx_emit_phis_deferred(agx_context *ctx)
1999 {
2000    agx_foreach_block(ctx, block) {
2001       agx_foreach_phi_in_block(block, I)
2002          agx_emit_phi_deferred(ctx, block, I);
2003    }
2004 }
2005 
2006 static void
2007 agx_emit_undef(agx_builder *b, nir_undef_instr *instr)
2008 {
2009    /* For now, just lower undefs to zero. This doesn't matter too much, since
2010     * the lowering happens in NIR and this just allows for late lowering passes
2011     * to result in undefs.
2012     */
2013    if (instr->def.num_components > 1) {
2014       assert(instr->def.num_components <= 4);
2015       agx_index zero = agx_mov_imm(b, instr->def.bit_size, 0);
2016 
2017       agx_emit_collect_to(b, agx_def_index(&instr->def),
2018                           instr->def.num_components,
2019                           (agx_index[4]){zero, zero, zero, zero});
2020    } else {
2021       agx_mov_imm_to(b, agx_def_index(&instr->def), 0);
2022    }
2023 }
2024 
2025 static void
2026 agx_emit_instr(agx_builder *b, struct nir_instr *instr)
2027 {
2028    switch (instr->type) {
2029    case nir_instr_type_load_const:
2030       agx_emit_load_const(b, nir_instr_as_load_const(instr));
2031       break;
2032 
2033    case nir_instr_type_intrinsic:
2034       agx_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
2035       break;
2036 
2037    case nir_instr_type_alu:
2038       agx_emit_alu(b, nir_instr_as_alu(instr));
2039       break;
2040 
2041    case nir_instr_type_tex:
2042       agx_emit_tex(b, nir_instr_as_tex(instr));
2043       break;
2044 
2045    case nir_instr_type_jump:
2046       agx_emit_jump(b, nir_instr_as_jump(instr));
2047       break;
2048 
2049    case nir_instr_type_phi:
2050       agx_emit_phi(b, nir_instr_as_phi(instr));
2051       break;
2052 
2053    case nir_instr_type_undef:
2054       agx_emit_undef(b, nir_instr_as_undef(instr));
2055       break;
2056 
2057    default:
2058       unreachable("should've been lowered");
2059    }
2060 }
2061 
2062 static agx_block *
2063 agx_create_block(agx_context *ctx)
2064 {
2065    agx_block *blk = rzalloc(ctx, agx_block);
2066 
2067    util_dynarray_init(&blk->predecessors, blk);
2068 
2069    return blk;
2070 }
2071 
2072 static agx_block *
2073 emit_block(agx_context *ctx, nir_block *block)
2074 {
2075    if (ctx->after_block) {
2076       ctx->current_block = ctx->after_block;
2077       ctx->after_block = NULL;
2078    } else {
2079       ctx->current_block = agx_create_block(ctx);
2080    }
2081 
2082    agx_block *blk = ctx->current_block;
2083    list_addtail(&blk->link, &ctx->blocks);
2084    list_inithead(&blk->instructions);
2085 
2086    ctx->indexed_nir_blocks[block->index] = blk;
2087 
2088    agx_builder _b = agx_init_builder(ctx, agx_after_block(blk));
2089 
2090    nir_foreach_instr(instr, block) {
2091       agx_emit_instr(&_b, instr);
2092    }
2093 
2094    return blk;
2095 }
2096 
2097 static agx_block *emit_cf_list(agx_context *ctx, struct exec_list *list);
2098 
2099 /* Emit if-else as
2100  *
2101  *    if_icmp cond != 0
2102  *       ...
2103  *    else_icmp cond == 0
2104  *       ...
2105  *    pop_exec
2106  *
2107  * If the else is empty, we can omit the else_icmp. That omission happens
2108  * elsewhere, since an empty else block can become nonempty after RA due to phi
2109  * lowering. This is not usually optimal, but it's a start.
2110  */
2111 
2112 static void
2113 emit_if(agx_context *ctx, nir_if *nif)
2114 {
2115    agx_block *first_block = ctx->current_block;
2116    agx_builder _b = agx_init_builder(ctx, agx_after_block(first_block));
2117    agx_index cond = agx_src_index(&nif->condition);
2118 
2119    agx_instr *if_ = agx_if_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, true,
2120                                 NULL /* filled in later */);
2121    ctx->loop_nesting++;
2122    ctx->total_nesting++;
2123 
2124    /* Emit the two subblocks. */
2125    agx_block *if_block = emit_cf_list(ctx, &nif->then_list);
2126    agx_block *end_then = ctx->current_block;
2127 
2128    _b.cursor = agx_after_block(ctx->current_block);
2129 
2130    agx_block *else_block = emit_cf_list(ctx, &nif->else_list);
2131    agx_block *end_else = ctx->current_block;
2132 
2133    /* If the "if" fails, we fallthrough to the else */
2134    if_->target = else_block;
2135 
2136    /* Insert an else instruction at the beginning of the else block. We use
2137     * "else_fcmp 0.0, 0.0, eq" as unconditional else, matching the blob.
2138     *
2139     * If it fails, we fall through to the logical end of the last else block.
2140     */
2141    _b.cursor = agx_before_block(else_block);
2142    agx_else_fcmp(&_b, agx_zero(), agx_zero(), 1, AGX_FCOND_EQ, false, end_else);
2143 
2144    ctx->after_block = agx_create_block(ctx);
2145 
2146    agx_block_add_successor(first_block, if_block);
2147    agx_block_add_successor(first_block, else_block);
2148    agx_block_add_successor(end_then, ctx->after_block);
2149    agx_block_add_successor(end_else, ctx->after_block);
2150 
2151    _b.cursor = agx_after_block(ctx->current_block);
2152    agx_pop_exec(&_b, 1);
2153    ctx->loop_nesting--;
2154    ctx->total_nesting--;
2155 }
2156 
2157 static void
2158 emit_loop(agx_context *ctx, nir_loop *nloop)
2159 {
2160    assert(!nir_loop_has_continue_construct(nloop));
2161    /* We only track nesting within the innermost loop, so push and reset */
2162    unsigned pushed_nesting = ctx->loop_nesting;
2163    ctx->loop_nesting = 0;
2164    ctx->total_nesting++;
2165 
2166    bool old_continues = ctx->loop_continues;
2167    ctx->loop_continues = loop_uses_continue(nloop);
2168 
2169    agx_block *popped_break = ctx->break_block;
2170    agx_block *popped_continue = ctx->continue_block;
2171 
2172    ctx->break_block = agx_create_block(ctx);
2173    ctx->continue_block = agx_create_block(ctx);
2174 
2175    /* If we are emitting a loop inside other control flow, there might be
2176     * threads masked off (TODO: divergence analysis), so push_exec them so
2177     * we get the lower nesting count values to ourselves.
2178     */
2179    agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
2180    if (ctx->total_nesting > 1)
2181       agx_push_exec(&_b, ctx->loop_continues ? 2 : 1);
2182 
2183    /* Fallthrough to body */
2184    agx_block_add_successor(ctx->current_block, ctx->continue_block);
2185 
2186    /* Emit the body */
2187    ctx->after_block = ctx->continue_block;
2188    ctx->after_block->loop_header = true;
2189    agx_block *start_block = emit_cf_list(ctx, &nloop->body);
2190 
2191    /* If we used any continue jumps, we need to reactivate the continued
2192     * threads. We do this with an always true while_icmp, which behaves like:
2193     *
2194     *    if (r0l == 1) {
2195     *       r0l = 0;
2196     *    }
2197     *    update_exec
2198     *
2199     * If we did not use continue, this would be a no-op so it is omitted.
2200     */
2201    _b.cursor = agx_after_block(ctx->current_block);
2202 
2203    if (ctx->loop_continues) {
2204       agx_while_icmp(
2205          &_b, agx_zero(), agx_zero(), 2, AGX_ICOND_UEQ, false,
2206          NULL /* no semantic target, used purely for side effects */);
2207    }
2208 
2209    agx_jmp_exec_any(&_b, start_block);
2210    agx_pop_exec(&_b, ctx->loop_continues ? 2 : 1);
2211    agx_block_add_successor(ctx->current_block, ctx->continue_block);
2212 
2213    /* Pop off */
2214    ctx->after_block = ctx->break_block;
2215    ctx->break_block = popped_break;
2216    ctx->continue_block = popped_continue;
2217 
2218    /* Update shader-db stats */
2219    ++ctx->loop_count;
2220 
2221    /* All nested control flow must have finished */
2222    assert(ctx->loop_nesting == 0);
2223 
2224    /* Restore loop nesting (we might be inside an if inside an outer loop) */
2225    ctx->loop_nesting = pushed_nesting;
2226    ctx->total_nesting--;
2227    ctx->loop_continues = old_continues;
2228 }
2229 
2230 /* Before the first control flow structure, the nesting counter needs to be
2231  * zeroed for correct operation. This only happens at most once, since by
2232  * definition this occurs at the end of the first block, which dominates the
2233  * rest of the program. */
2234 
2235 static void
2236 emit_first_cf(agx_context *ctx)
2237 {
2238    if (ctx->any_cf)
2239       return;
2240 
2241    agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
2242    agx_begin_cf(&_b);
2243    ctx->any_cf = true;
2244 }
2245 
2246 static agx_block *
2247 emit_cf_list(agx_context *ctx, struct exec_list *list)
2248 {
2249    agx_block *start_block = NULL;
2250 
2251    foreach_list_typed(nir_cf_node, node, node, list) {
2252       switch (node->type) {
2253       case nir_cf_node_block: {
2254          agx_block *block = emit_block(ctx, nir_cf_node_as_block(node));
2255 
2256          if (!start_block)
2257             start_block = block;
2258 
2259          break;
2260       }
2261 
2262       case nir_cf_node_if:
2263          emit_first_cf(ctx);
2264          emit_if(ctx, nir_cf_node_as_if(node));
2265          break;
2266 
2267       case nir_cf_node_loop:
2268          emit_first_cf(ctx);
2269          emit_loop(ctx, nir_cf_node_as_loop(node));
2270          break;
2271 
2272       default:
2273          unreachable("Unknown control flow");
2274       }
2275    }
2276 
2277    return start_block;
2278 }
2279 
2280 static void
2281 agx_set_st_vary_final(agx_context *ctx)
2282 {
2283    agx_foreach_instr_global_rev(ctx, I) {
2284       if (I->op == AGX_OPCODE_ST_VARY) {
2285          I->last = true;
2286          return;
2287       }
2288    }
2289 
2290    /* If we got here, there was no varying written. We need to mark that. */
2291    agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
2292    agx_builder _b = agx_init_builder(ctx, agx_after_block_logical(last_block));
2293    agx_no_varyings(&_b);
2294 }
2295 
2296 static int
2297 agx_dump_stats(agx_context *ctx, unsigned size, char **out)
2298 {
2299    unsigned nr_ins = 0;
2300 
2301    /* Count instructions */
2302    agx_foreach_instr_global(ctx, I)
2303       nr_ins++;
2304 
2305    unsigned nr_threads =
2306       agx_occupancy_for_register_count(ctx->max_reg).max_threads;
2307 
2308    return asprintf(out,
2309                    "%s shader: %u inst, %u bytes, %u halfregs, %u threads, "
2310                    "%u loops, %u:%u spills:fills",
2311                    gl_shader_stage_name(ctx->stage), nr_ins, size, ctx->max_reg,
2312                    nr_threads, ctx->loop_count, ctx->spills, ctx->fills);
2313 }
2314 
2315 static int
2316 glsl_type_size(const struct glsl_type *type, bool bindless)
2317 {
2318    return glsl_count_attribute_slots(type, false);
2319 }
2320 
2321 static bool
2322 agx_lower_sincos_filter(const nir_instr *instr, UNUSED const void *_)
2323 {
2324    if (instr->type != nir_instr_type_alu)
2325       return false;
2326 
2327    nir_alu_instr *alu = nir_instr_as_alu(instr);
2328    return alu->op == nir_op_fsin || alu->op == nir_op_fcos;
2329 }
2330 
2331 /* Sine and cosine are implemented via the sin_pt_1 and sin_pt_2 opcodes for
2332  * heavy lifting. sin_pt_2 implements sinc in the first quadrant, expressed in
2333  * turns (sin (tau x) / x), while sin_pt_1 implements a piecewise sign/offset
2334  * fixup to transform a quadrant angle [0, 4] to [-1, 1]. The NIR opcode
2335  * fsin_agx models the fixup, sinc, and multiply to obtain sine, so we just
2336  * need to change units from radians to quadrants modulo turns. Cosine is
2337  * implemented by shifting by one quadrant: cos(x) = sin(x + tau/4).
2338  */
2339 
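/*
 * Worked example of the conversion above (illustrative only, not used by the
 * compiler): for cos(pi), turns = pi * (1 / (2 pi)) + 0.25 = 0.75, so
 * quadrants = fract(0.75) * 4 = 3, and fsin_agx(3) = sin(3 tau/4) = -1,
 * matching cos(pi) = -1.
 */
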
2340 static nir_def *
2341 agx_lower_sincos_impl(struct nir_builder *b, nir_instr *instr, UNUSED void *_)
2342 {
2343    nir_alu_instr *alu = nir_instr_as_alu(instr);
2344    nir_def *x = nir_mov_alu(b, alu->src[0], 1);
2345    nir_def *turns = nir_fmul_imm(b, x, M_1_PI * 0.5f);
2346 
2347    if (alu->op == nir_op_fcos)
2348       turns = nir_fadd_imm(b, turns, 0.25f);
2349 
2350    nir_def *quadrants = nir_fmul_imm(b, nir_ffract(b, turns), 4.0);
2351    return nir_fsin_agx(b, quadrants);
2352 }
2353 
2354 static bool
2355 agx_lower_sincos(nir_shader *shader)
2356 {
2357    return nir_shader_lower_instructions(shader, agx_lower_sincos_filter,
2358                                         agx_lower_sincos_impl, NULL);
2359 }
2360 
2361 static bool
2362 agx_lower_front_face(struct nir_builder *b, nir_intrinsic_instr *intr,
2363                      UNUSED void *data)
2364 {
2365    if (intr->intrinsic != nir_intrinsic_load_front_face)
2366       return false;
2367 
2368    nir_def *def = &intr->def;
2369    assert(def->bit_size == 1);
2370 
2371    b->cursor = nir_before_instr(&intr->instr);
2372    nir_def_rewrite_uses(def, nir_inot(b, nir_load_back_face_agx(b, 1)));
2373    return true;
2374 }
2375 
2376 /*
2377  * Standard NIR optimization loop. This is run in agx_preprocess_nir, then once
2378  * again at shader variant compile time. Unless there was a complex shader key,
2379  * the latter run should be almost a no-op.
2380  */
2381 static void
2382 agx_optimize_loop_nir(nir_shader *nir)
2383 {
2384    bool progress;
2385 
2386    do {
2387       progress = false;
2388 
2389       NIR_PASS(progress, nir, nir_copy_prop);
2390       NIR_PASS(progress, nir, nir_opt_remove_phis);
2391       NIR_PASS(progress, nir, nir_opt_dce);
2392       NIR_PASS(progress, nir, nir_opt_dead_cf);
2393       NIR_PASS(progress, nir, nir_opt_cse);
2394       NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
2395       NIR_PASS(progress, nir, nir_opt_phi_precision);
2396       NIR_PASS(progress, nir, nir_opt_algebraic);
2397       NIR_PASS(progress, nir, nir_opt_constant_folding);
2398       NIR_PASS(progress, nir, nir_opt_undef);
2399       NIR_PASS(progress, nir, nir_opt_shrink_vectors);
2400       NIR_PASS(progress, nir, nir_opt_loop_unroll);
2401    } while (progress);
2402 }
2403 
2404 static bool
2405 mem_vectorize_cb(unsigned align_mul, unsigned align_offset, unsigned bit_size,
2406                  unsigned num_components, nir_intrinsic_instr *low,
2407                  nir_intrinsic_instr *high, void *data)
2408 {
2409    /* Must be aligned to the size of the load */
2410    unsigned align = nir_combined_align(align_mul, align_offset);
2411    if ((bit_size / 8) > align)
2412       return false;
2413 
2414    if (num_components > 4)
2415       return false;
2416 
2417    if (bit_size > 32)
2418       return false;
2419 
2420    return true;
2421 }
2422 
2423 static void
2424 agx_optimize_nir(nir_shader *nir, unsigned *preamble_size)
2425 {
2426    /* This runs only once up front since other optimizations don't affect it */
2427    NIR_PASS(_, nir, nir_opt_shrink_stores, true);
2428 
2429    agx_optimize_loop_nir(nir);
2430 
2431    NIR_PASS(_, nir, nir_opt_load_store_vectorize,
2432             &(const nir_load_store_vectorize_options){
2433                .modes = nir_var_mem_global | nir_var_mem_constant,
2434                .callback = mem_vectorize_cb,
2435             });
2436    NIR_PASS(_, nir, nir_lower_pack);
2437 
2438    bool progress = false;
2439    NIR_PASS(progress, nir, agx_nir_lower_address);
2440 
2441    /* If address lowering made progress, clean up before forming preambles.
2442     * Otherwise the optimized preambles might just be constants! Do it before
2443     * lowering int64 too, to avoid lowering constant int64 arithmetic.
2444     */
2445    if (progress) {
2446       NIR_PASS(_, nir, nir_opt_constant_folding);
2447       NIR_PASS(_, nir, nir_opt_dce);
2448    }
2449 
2450    /* Only lower int64 after optimizing address arithmetic, so that u2u64/i2i64
2451     * conversions remain.
2452     */
2453    progress = false;
2454    NIR_PASS(progress, nir, nir_lower_int64);
2455 
2456    /* If we lowered actual int64 arithmetic (not folded into the address
2457     * calculations), then clean up after the lowering.
2458     */
2459    if (progress) {
2460       do {
2461          progress = false;
2462 
2463          NIR_PASS(progress, nir, nir_opt_algebraic);
2464          NIR_PASS(progress, nir, nir_opt_constant_folding);
2465          NIR_PASS(progress, nir, nir_opt_dce);
2466       } while (progress);
2467    }
2468 
2469    if (likely(!(agx_compiler_debug & AGX_DBG_NOPREAMBLE)))
2470       NIR_PASS(_, nir, agx_nir_opt_preamble, preamble_size);
2471 
2472    /* Forming preambles may dramatically reduce the instruction count
2473     * in certain blocks, causing some if-else statements to become
2474     * trivial. We want to peephole select those, given that control flow
2475     * prediction instructions are costly.
2476     */
2477    NIR_PASS(_, nir, nir_opt_peephole_select, 64, false, true);
2478 
2479    NIR_PASS(_, nir, nir_opt_algebraic_late);
2480 
2481    /* Fuse add/sub/multiplies/shifts after running opt_algebraic_late to fuse
2482     * isub but before shifts are lowered.
2483     */
2484    do {
2485       progress = false;
2486 
2487       NIR_PASS(progress, nir, nir_opt_dce);
2488       NIR_PASS(progress, nir, nir_opt_cse);
2489       NIR_PASS(progress, nir, agx_nir_fuse_algebraic_late);
2490    } while (progress);
2491 
2492    /* Do the remaining lowering late, since it inserts &s for shifts and we want
2493     * that to happen after fusing constant shifts. Constant folding will clean up.
2494     */
2495    NIR_PASS(_, nir, agx_nir_lower_algebraic_late);
2496    NIR_PASS(_, nir, nir_opt_constant_folding);
2497    NIR_PASS(_, nir, nir_opt_combine_barriers, NULL, NULL);
2498 
2499    /* Must run after uses are fixed but before a last round of copyprop + DCE */
2500    if (nir->info.stage == MESA_SHADER_FRAGMENT)
2501       NIR_PASS(_, nir, agx_nir_lower_load_mask);
2502 
2503    NIR_PASS(_, nir, nir_copy_prop);
2504    NIR_PASS(_, nir, nir_opt_dce);
2505    NIR_PASS(_, nir, nir_opt_cse);
2506    NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
2507    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
2508 
2509    /* Cleanup optimizations */
2510    nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
2511                                nir_move_load_input | nir_move_comparisons |
2512                                nir_move_copies | nir_move_load_ssbo |
2513                                nir_move_alu;
2514 
2515    NIR_PASS(_, nir, nir_opt_sink, move_all);
2516    NIR_PASS(_, nir, nir_opt_move, move_all);
2517    NIR_PASS(_, nir, nir_lower_phis_to_scalar, true);
2518 }
2519 
2520 /* ABI: position first, then user, then psiz */
2521 static void
2522 agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings_vs *varyings,
2523                       struct agx_shader_key *key)
2524 {
2525    unsigned base = 0;
2526 
2527    /* Initialize to "nothing is written" */
2528    for (unsigned i = 0; i < ARRAY_SIZE(varyings->slots); ++i)
2529       varyings->slots[i] = ~0;
2530 
2531    /* gl_Position is implicitly written, although it may validly be absent in
2532     * vertex programs run only for transform feedback. Those ignore their
2533     * varyings so it doesn't matter what we do here as long as we don't fail.
2534     */
2535    varyings->slots[VARYING_SLOT_POS] = base;
2536    base += 4;
2537 
2538    /* These are always flat-shaded from the FS perspective */
2539    key->vs.outputs_flat_shaded |= VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;
2540 
2541    /* The internal cull distance slots are always linearly-interpolated */
2542    key->vs.outputs_linear_shaded |=
2543       BITFIELD64_RANGE(VARYING_SLOT_CULL_PRIMITIVE, 2);
2544 
2545    assert(!(key->vs.outputs_flat_shaded & key->vs.outputs_linear_shaded));
2546 
2547    /* Smooth 32-bit user bindings go next */
2548    u_foreach_bit64(loc, nir->info.outputs_written &
2549                            ~key->vs.outputs_flat_shaded &
2550                            ~key->vs.outputs_linear_shaded) {
2551       if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ)
2552          continue;
2553 
2554       assert(loc < ARRAY_SIZE(varyings->slots));
2555       varyings->slots[loc] = base;
2556       base += 4;
2557       varyings->num_32_smooth += 4;
2558    }
2559 
2560    /* Flat 32-bit user bindings go next */
2561    u_foreach_bit64(loc,
2562                    nir->info.outputs_written & key->vs.outputs_flat_shaded) {
2563       if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ)
2564          continue;
2565 
2566       assert(loc < ARRAY_SIZE(varyings->slots));
2567       varyings->slots[loc] = base;
2568       base += 4;
2569       varyings->num_32_flat += 4;
2570    }
2571 
2572    /* Linear 32-bit user bindings go next */
2573    u_foreach_bit64(loc,
2574                    nir->info.outputs_written & key->vs.outputs_linear_shaded) {
2575       if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ)
2576          continue;
2577 
2578       assert(loc < ARRAY_SIZE(varyings->slots));
2579       varyings->slots[loc] = base;
2580       base += 4;
2581       varyings->num_32_linear += 4;
2582    }
2583 
2584    /* TODO: Link FP16 varyings */
2585    varyings->base_index_fp16 = base;
2586    varyings->num_16_smooth = 0;
2587    varyings->num_16_flat = 0;
2588    varyings->num_16_linear = 0;
2589 
2590    if (nir->info.outputs_written & VARYING_BIT_PSIZ) {
2591       varyings->slots[VARYING_SLOT_PSIZ] = base;
2592       base += 1;
2593    }
2594 
2595    if (nir->info.outputs_written & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT)) {
2596       varyings->layer_viewport_slot = base;
2597       base += 1;
2598    }
2599 
2600    if (nir->info.outputs_written & VARYING_BIT_CLIP_DIST0) {
2601       varyings->clip_dist_slot = base;
2602       varyings->nr_clip_dists = nir->info.clip_distance_array_size;
2603       base += varyings->nr_clip_dists;
2604    }
2605 
2606    /* All varyings linked now */
2607    varyings->nr_index = base;
2608 }
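
/*
 * Worked example of the ABI above (illustrative, derived from the code): a
 * vertex shader writing only gl_Position, one smooth 32-bit user varying
 * (VAR0), and gl_PointSize ends up with slots[POS] = 0, slots[VAR0] = 4,
 * base_index_fp16 = 8, slots[PSIZ] = 8, and nr_index = 9.
 */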
2609 
2610 /*
2611  * Varyings that are used as texture coordinates should be kept at fp32, because
2612  * fp16 does not have enough precision for large textures. It's technically
2613  * conformant not to, but every app gets this wrong.
2614  */
2615 static bool
2616 agx_gather_texcoords(nir_builder *b, nir_instr *instr, void *data)
2617 {
2618    uint64_t *mask = data;
2619 
2620    if (instr->type != nir_instr_type_tex)
2621       return false;
2622 
2623    nir_tex_instr *tex = nir_instr_as_tex(instr);
2624 
2625    int coord_idx = nir_tex_instr_src_index(tex, nir_tex_src_coord);
2626    if (coord_idx < 0)
2627       return false;
2628 
2629    nir_src src = tex->src[coord_idx].src;
2630    nir_scalar x = nir_scalar_resolved(src.ssa, 0);
2631    nir_scalar y = nir_scalar_resolved(src.ssa, 1);
2632 
2633    if (x.def != y.def)
2634       return false;
2635 
2636    nir_instr *parent = x.def->parent_instr;
2637 
2638    if (parent->type != nir_instr_type_intrinsic)
2639       return false;
2640 
2641    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
2642 
2643    if (intr->intrinsic != nir_intrinsic_load_interpolated_input)
2644       return false;
2645 
2646    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
2647    *mask |= BITFIELD64_BIT(sem.location);
2648    return false;
2649 }
2650 
2651 struct interp_masks {
2652    uint64_t flat;
2653    uint64_t linear;
2654 };
2655 
2656 static bool
2657 agx_gather_interp(nir_builder *b, nir_intrinsic_instr *intr, void *data)
2658 {
2659    struct interp_masks *masks = data;
2660 
2661    if (intr->intrinsic == nir_intrinsic_load_input) {
2662       nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
2663       masks->flat |= BITFIELD64_RANGE(sem.location, sem.num_slots);
2664    } else if (intr->intrinsic == nir_intrinsic_load_interpolated_input &&
2665               nir_intrinsic_interp_mode(nir_src_as_intrinsic(intr->src[0])) ==
2666                  INTERP_MODE_NOPERSPECTIVE) {
2667       nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
2668       masks->linear |= BITFIELD64_RANGE(sem.location, sem.num_slots);
2669    }
2670 
2671    return false;
2672 }
2673 
2674 /*
2675  * Build a bit mask of varyings (by location) that are flatshaded and linear
2676  * shaded. This information is needed by lower_mediump_io and
2677  * agx_uncompiled_shader_info.
2678  */
2679 static struct interp_masks
2680 agx_interp_masks(nir_shader *nir)
2681 {
2682    assert(nir->info.stage == MESA_SHADER_FRAGMENT);
2683 
2684    struct interp_masks masks = {0};
2685    nir_shader_intrinsics_pass(nir, agx_gather_interp, nir_metadata_all, &masks);
2686    return masks;
2687 }
2688 
2689 /*
2690  * Build a bit mask of varyings (by location) that are used as texture
2691  * coordinates. This information is needed by lower_mediump_io.
2692  */
2693 static uint64_t
2694 agx_texcoord_mask(nir_shader *nir)
2695 {
2696    assert(nir->info.stage == MESA_SHADER_FRAGMENT);
2697 
2698    uint64_t mask = 0;
2699    nir_shader_instructions_pass(nir, agx_gather_texcoords, nir_metadata_all,
2700                                 &mask);
2701    return mask;
2702 }
2703 
2704 static nir_mem_access_size_align
2705 mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
2706                          uint8_t bit_size, uint32_t align,
2707                          uint32_t align_offset, bool offset_is_const,
2708                          const void *cb_data)
2709 {
2710    align = nir_combined_align(align, align_offset);
2711 
2712    assert(util_is_power_of_two_nonzero(align));
2713 
2714    if ((bytes & 1) || (align == 1))
2715       bit_size = 8;
2716    else if ((bytes & 2) || (align == 2))
2717       bit_size = 16;
2718    else if (bit_size >= 32)
2719       bit_size = 32;
2720 
2721    return (nir_mem_access_size_align){
2722       .num_components = MIN2(bytes / (bit_size / 8), 4),
2723       .bit_size = bit_size,
2724       .align = bit_size / 8,
2725    };
2726 }
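
/*
 * Illustrative example of the callback above (not upstream documentation): a
 * 12-byte access with combined alignment 4 and a source bit size of 32 or more
 * is returned as a 3-component 32-bit access aligned to 4 bytes.
 */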
2727 
2728 static unsigned
2729 lower_bit_size_callback(const nir_instr *instr, UNUSED void *_)
2730 {
2731    if (instr->type != nir_instr_type_alu)
2732       return 0;
2733 
2734    /* Lower 8-bit ALU to 16-bit. We check the destination, as we do not want to
2735     * lower conversions from 8-bit to larger types. Those conversions get
2736     * implemented natively.
2737     */
2738    nir_alu_instr *alu = nir_instr_as_alu(instr);
2739    if (alu->def.bit_size == 8 && !is_conversion_to_8bit(alu->op))
2740       return 16;
2741    else if (alu->def.bit_size == 1 && alu->src[0].src.ssa->bit_size == 8)
2742       return 16 /* comparisons */;
2743    else
2744       return 0;
2745 }
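
/*
 * Illustrative examples of the rule above (not upstream documentation): an
 * 8-bit nir_op_iadd is widened to 16 bits, while nir_op_i2i8 (a conversion to
 * 8-bit) and ordinary 16/32-bit ALU are left untouched.
 */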
2746 
2747 static bool
2748 lower_load_from_texture_handle(nir_builder *b, nir_intrinsic_instr *intr,
2749                                void *data)
2750 {
2751    if (intr->intrinsic != nir_intrinsic_load_from_texture_handle_agx)
2752       return false;
2753 
2754    /* Bindless handles are a vec2, where the first component is the (constant)
2755     * uniform register number and the second component is the byte offset.
2756     */
2757    nir_scalar uniform = nir_scalar_resolved(intr->src[0].ssa, 0);
2758    unsigned uniform_idx = nir_scalar_as_uint(uniform);
2759 
2760    b->cursor = nir_instr_remove(&intr->instr);
2761    nir_def *base = nir_load_preamble(b, 1, 64, uniform_idx);
2762    nir_def *offset = nir_u2u64(b, nir_channel(b, intr->src[0].ssa, 1));
2763 
2764    nir_def_rewrite_uses(&intr->def, nir_iadd(b, base, offset));
2765    return true;
2766 }
2767 
2768 static bool
2769 agx_should_dump(nir_shader *nir, unsigned agx_dbg_bit)
2770 {
2771    return (agx_compiler_debug & agx_dbg_bit) &&
2772           !(nir->info.internal && !(agx_compiler_debug & AGX_DBG_INTERNAL));
2773 }
2774 
2775 static unsigned
2776 agx_compile_function_nir(nir_shader *nir, nir_function_impl *impl,
2777                          struct agx_shader_key *key,
2778                          struct util_debug_callback *debug,
2779                          struct util_dynarray *binary,
2780                          struct agx_shader_info *out)
2781 {
2782    nir_index_blocks(impl);
2783    nir_index_ssa_defs(impl);
2784 
2785    agx_context *ctx = rzalloc(NULL, agx_context);
2786    ctx->nir = nir;
2787    ctx->is_preamble = impl->function->is_preamble;
2788    ctx->out = out;
2789    ctx->key = key;
2790    ctx->stage = nir->info.stage;
2791    ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
2792    ctx->indexed_nir_blocks = rzalloc_array(ctx, agx_block *, impl->num_blocks);
2793    list_inithead(&ctx->blocks);
2794 
2795    ctx->alloc = impl->ssa_alloc;
2796    emit_cf_list(ctx, &impl->body);
2797    agx_emit_phis_deferred(ctx);
2798 
2799    /* Only allocate scratch if it's statically used, regardless of whether the
2800     * NIR info claims otherwise.
2801     */
2802    if (ctx->any_scratch) {
2803       assert(!ctx->is_preamble && "preambles don't use scratch");
2804       ctx->scratch_size = ALIGN(nir->scratch_size, 16);
2805    }
2806 
2807    /* Stop the main shader or preamble shader after the exit block. For real
2808     * functions, we would return here.
2809     */
2810    agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
2811    agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block));
2812    agx_stop(&_b);
2813 
2814    /* Index blocks now that we're done emitting so the order is consistent */
2815    agx_foreach_block(ctx, block)
2816       block->index = ctx->num_blocks++;
2817 
2818    agx_validate(ctx, "IR translation");
2819 
2820    if (likely(!(agx_compiler_debug & AGX_DBG_NOOPT))) {
2821       /* Eliminate dead instructions before CSE to avoid silly scheduling */
2822       agx_dce(ctx, false);
2823 
2824       /* CSE before eliminating dead destinations so that subdivision is
2825        * optimized properly.
2826        */
2827       agx_opt_cse(ctx);
2828 
2829       /* After DCE, use counts are right so we can run the optimizer. */
2830       agx_optimizer(ctx);
2831    }
2832 
2833    /* Lower uniform sources after copyprop for correctness, since copyprop
2834     * creates uniform sources. To keep register pressure in check, lower after
2835     * CSE, since moves are cheaper than registers.
2836     */
2837    agx_lower_uniform_sources(ctx);
2838 
2839    /* RA correctness depends on DCE */
2840    agx_dce(ctx, true);
2841    agx_validate(ctx, "Pre-RA passes");
2842 
2843    if (agx_should_dump(nir, AGX_DBG_SHADERS))
2844       agx_print_shader(ctx, stdout);
2845 
2846    if (likely(!(agx_compiler_debug & AGX_DBG_NOSCHED))) {
2847       agx_pressure_schedule(ctx);
2848       agx_validate(ctx, "Pre-RA scheduler");
2849    }
2850 
2851    if (agx_should_dump(nir, AGX_DBG_SHADERS))
2852       agx_print_shader(ctx, stdout);
2853 
2854    agx_ra(ctx);
2855    agx_validate(ctx, "RA");
2856    agx_lower_64bit_postra(ctx);
2857 
2858    if (ctx->scratch_size > 0) {
2859       /* Apple always allocates 40 more bytes in the entrypoint and aligns to 4. */
2860       uint64_t stack_size = ALIGN(DIV_ROUND_UP(ctx->scratch_size, 4) + 10, 4);
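      /* Illustrative example (assuming stack_size is counted in 4-byte units,
       * which is consistent with the "+ 10" covering the 40 extra bytes):
       * scratch_size = 48 bytes gives DIV_ROUND_UP(48, 4) + 10 = 22, aligned
       * up to 24.
       */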
2861 
2862       assert(stack_size < INT16_MAX);
2863 
2864       agx_block *start_block = agx_start_block(ctx);
2865       agx_builder _b = agx_init_builder(ctx, agx_before_block(start_block));
2866       agx_stack_adjust(&_b, stack_size);
2867 
2868       if (ctx->is_preamble)
2869          out->preamble_scratch_size = stack_size;
2870       else
2871          out->scratch_size = stack_size;
2872    }
2873 
2874    if (ctx->stage == MESA_SHADER_VERTEX && !impl->function->is_preamble)
2875       agx_set_st_vary_final(ctx);
2876 
2877    agx_insert_waits(ctx);
2878    agx_opt_empty_else(ctx);
2879    agx_opt_break_if(ctx);
2880    agx_opt_jmp_none(ctx);
2881    agx_lower_pseudo(ctx);
2882 
2883    if (agx_should_dump(nir, AGX_DBG_SHADERS))
2884       agx_print_shader(ctx, stdout);
2885 
2886    /* Pad binary */
2887    if (binary->size % AGX_CODE_ALIGN) {
2888       unsigned ngrow = AGX_CODE_ALIGN - (binary->size % AGX_CODE_ALIGN);
2889       memset(util_dynarray_grow_bytes(binary, ngrow, 1), 0, ngrow);
2890    }
2891 
2892    unsigned offset = binary->size;
2893    assert((offset % AGX_CODE_ALIGN) == 0);
2894 
2895    agx_pack_binary(ctx, binary);
2896 
2897    unsigned nr_gprs = ctx->max_reg + 1;
2898 
2899    /* If the preamble uses scratch (due to spilling), we need to set maximal
2900     * GPRs. Do it here so the driver doesn't have to worry about it.
2901     */
2902    if (impl->function->is_preamble)
2903       out->nr_preamble_gprs = ctx->scratch_size ? 256 : nr_gprs;
2904    else
2905       out->nr_gprs = nr_gprs;
2906 
2907    /* Don't dump statistics for preambles, since they're not worth optimizing */
2908    if (!impl->function->is_preamble) {
2909       char *stats;
2910       int ret = agx_dump_stats(ctx, binary->size, &stats);
2911 
2912       if (ret >= 0) {
2913          if (agx_should_dump(nir, AGX_DBG_SHADERDB)) {
2914             fprintf(stderr, "SHADER-DB: %s - %s\n", nir->info.label ?: "",
2915                     stats);
2916          }
2917 
2918          if (debug)
2919             util_debug_message(debug, SHADER_INFO, "%s", stats);
2920 
2921          free(stats);
2922       }
2923    }
2924 
2925    ralloc_free(ctx);
2926 
2927    return offset;
2928 }
2929 
2930 static void
2931 link_libagx(nir_shader *nir, const nir_shader *libagx)
2932 {
2933    nir_link_shader_functions(nir, libagx);
2934    NIR_PASS(_, nir, nir_inline_functions);
2935    nir_remove_non_entrypoints(nir);
2936    NIR_PASS(_, nir, nir_opt_deref);
2937    NIR_PASS(_, nir, nir_lower_vars_to_ssa);
2938    NIR_PASS(_, nir, nir_remove_dead_derefs);
2939    NIR_PASS(_, nir, nir_remove_dead_variables,
2940             nir_var_function_temp | nir_var_shader_temp, NULL);
2941    NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
2942             nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
2943                nir_var_mem_global,
2944             glsl_get_cl_type_size_align);
2945 }
2946 
2947 /*
2948  * Preprocess NIR. In particular, this lowers I/O. Drivers should call this
2949  * as soon as they don't need unlowered I/O.
2950  *
2951  * This also lowers as much as possible. After preprocessing NIR, the following
2952  * NIR passes are called by the GL driver:
2953  *
2954  *    - nir_lower_blend
2955  *    - nir_lower_texcoord_replace_late
2956  *    - agx_nir_lower_vbo
2957  *    - agx_nir_lower_tilebuffer
2958  *
2959  * Unless an instruction is constructed by one of the above passes, it should be
2960  * lowered here to avoid duplicate work with shader variants.
2961  */
2962 void
2963 agx_preprocess_nir(nir_shader *nir, const nir_shader *libagx,
2964                    bool allow_mediump, struct agx_uncompiled_shader_info *out)
2965 {
2966    if (out) {
2967       memset(out, 0, sizeof(*out));
2968 
2969       out->nr_bindful_textures = BITSET_LAST_BIT(nir->info.textures_used);
2970       out->nr_bindful_images = BITSET_LAST_BIT(nir->info.images_used);
2971    }
2972 
2973    NIR_PASS(_, nir, nir_lower_vars_to_ssa);
2974 
2975    /* Lower large arrays to scratch and small arrays to csel */
2976    NIR_PASS(_, nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
2977             glsl_get_natural_size_align_bytes);
2978    NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);
2979    NIR_PASS(_, nir, nir_split_var_copies);
2980    NIR_PASS(_, nir, nir_lower_global_vars_to_local);
2981    NIR_PASS(_, nir, nir_lower_var_copies);
2982    NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
2983             glsl_type_size, nir_lower_io_lower_64bit_to_32);
2984    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
2985       struct interp_masks masks = agx_interp_masks(nir);
2986 
2987       NIR_PASS(_, nir, agx_nir_lower_frag_sidefx);
2988 
2989       /* Interpolate varyings at fp16 and write to the tilebuffer at fp16. As an
2990        * exception, interpolate flat shaded varyings at fp32. This works around a
2991        * hardware limitation. The resulting code (with an extra f2f16 at the end
2992        * if needed) matches what Metal produces.
2993        */
2994       if (likely(allow_mediump)) {
2995          uint64_t texcoord = agx_texcoord_mask(nir);
2996 
2997          NIR_PASS(_, nir, nir_lower_mediump_io,
2998                   nir_var_shader_in | nir_var_shader_out,
2999                   ~(masks.flat | texcoord), false);
3000       }
3001 
3002       if (out) {
3003          out->inputs_flat_shaded = masks.flat;
3004          out->inputs_linear_shaded = masks.linear;
3005       }
3006    } else if (nir->info.stage == MESA_SHADER_VERTEX ||
3007               nir->info.stage == MESA_SHADER_TESS_EVAL) {
3008       out->has_edgeflags = nir->info.outputs_written & VARYING_BIT_EDGE;
3009       out->cull_distance_size = nir->info.cull_distance_array_size;
3010 
3011       if (out->cull_distance_size)
3012          NIR_PASS(_, nir, agx_nir_lower_cull_distance_vs);
3013    }
3014 
3015    /* Clean up deref gunk after lowering I/O */
3016    NIR_PASS(_, nir, nir_opt_dce);
3017 
3018    link_libagx(nir, libagx);
3019 
3020    /* This must run before we lower away idiv to have any effect, but after
3021     * lowering textures, since the cube map array lowering generates division by 6.
3022     */
3023    NIR_PASS(_, nir, nir_opt_idiv_const, 16);
3024 
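   /* Lower the remaining integer divisions to ALU sequences. allow_fp16 lets
    * the pass use fp16 arithmetic where it judges the precision sufficient
    * (the exact heuristic lives inside nir_lower_idiv).
    */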
3025    nir_lower_idiv_options idiv_options = {
3026       .allow_fp16 = true,
3027    };
3028 
3029    NIR_PASS(_, nir, nir_lower_idiv, &idiv_options);
3030    NIR_PASS(_, nir, nir_lower_frexp);
3031    NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
3032    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
3033    NIR_PASS(_, nir, nir_lower_flrp, 16 | 32 | 64, false);
3034    NIR_PASS(_, nir, agx_lower_sincos);
3035    NIR_PASS(_, nir, nir_shader_intrinsics_pass, agx_lower_front_face,
3036             nir_metadata_block_index | nir_metadata_dominance, NULL);
3037    NIR_PASS(_, nir, nir_lower_frag_coord_to_pixel_coord);
3038    NIR_PASS(_, nir, agx_nir_lower_subgroups);
3039    NIR_PASS(_, nir, nir_lower_phis_to_scalar, true);
3040 
3041    /* After lowering, run through the standard suite of NIR optimizations. We
3042     * will run the optimization loop again later, once we have the shader key,
3043     * but running it now means that later run is ideally almost a no-op.
3044     */
3045    agx_optimize_loop_nir(nir);
3046 
3047    NIR_PASS(_, nir, nir_opt_deref);
3048    NIR_PASS(_, nir, nir_lower_vars_to_ssa);
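   /* Lower the remaining deref-based memory access to explicit addressing. The
    * 62-bit generic format is (roughly) a 64-bit pointer whose top two bits
    * select the memory mode (global/shared/scratch), so one code path can
    * handle all of them.
    */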
3049    NIR_PASS(_, nir, nir_lower_explicit_io,
3050             nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
3051                nir_var_mem_global,
3052             nir_address_format_62bit_generic);
3053 
3054    /* We've lowered away all variables. Remove them all for smaller shaders. */
3055    NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_all, NULL);
3056    nir->info.io_lowered = true;
3057 
3058    /* Move before lowering */
3059    nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
3060                                nir_move_load_input | nir_move_comparisons |
3061                                nir_move_copies | nir_move_load_ssbo;
3062 
3063    NIR_PASS(_, nir, nir_opt_sink, move_all);
3064    NIR_PASS(_, nir, nir_opt_move, move_all);
3065    NIR_PASS(_, nir, agx_nir_lower_shared_bitsize);
3066 }
3067 
3068 void
3069 agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
3070                        struct util_debug_callback *debug,
3071                        struct util_dynarray *binary,
3072                        struct agx_shader_info *out)
3073 {
3074    agx_compiler_debug = agx_get_compiler_debug();
3075 
3076    memset(out, 0, sizeof *out);
3077 
3078    assert(nir->info.io_lowered &&
3079           "agx_preprocess_nir is called first, then the shader is specialized, "
3080           "then the specialized shader is compiled");
3081 
3082    /* If required, tag writes will be enabled by instruction selection */
3083    if (nir->info.stage == MESA_SHADER_FRAGMENT)
3084       out->tag_write_disable = !nir->info.writes_memory;
3085 
3086    if (nir->info.stage == MESA_SHADER_VERTEX &&
3087        (nir->info.outputs_written & VARYING_BIT_CLIP_DIST0))
3088       NIR_PASS(_, nir, agx_nir_lower_clip_distance);
3089 
3090    bool needs_libagx = true /* TODO: Optimize */;
3091 
3092    if (nir->info.stage == MESA_SHADER_FRAGMENT)
3093       NIR_PASS(_, nir, agx_nir_lower_interpolation);
3094 
3095    NIR_PASS(_, nir, nir_lower_vars_to_ssa);
3096 
3097    if (needs_libagx) {
3098       link_libagx(nir, key->libagx);
3099 
3100       NIR_PASS(_, nir, nir_opt_deref);
3101       NIR_PASS(_, nir, nir_lower_vars_to_ssa);
3102       NIR_PASS(_, nir, nir_lower_explicit_io,
3103                nir_var_shader_temp | nir_var_function_temp |
3104                   nir_var_mem_shared | nir_var_mem_global,
3105                nir_address_format_62bit_generic);
3106    }
3107 
3108    /* Late sysval lowering creates large loads. Load lowering creates unpacks */
3109    nir_lower_mem_access_bit_sizes_options lower_mem_access_options = {
3110       .modes = nir_var_mem_ssbo | nir_var_mem_constant |
3111                nir_var_mem_task_payload | nir_var_shader_temp |
3112                nir_var_function_temp | nir_var_mem_global | nir_var_mem_shared,
3113       .callback = mem_access_size_align_cb,
3114    };
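   /* Split or widen memory accesses so their sizes and alignments match what
    * mem_access_size_align_cb allows; the unpacks this generates are cleaned
    * up by the optimization loop below.
    */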
3115    NIR_PASS(_, nir, nir_lower_mem_access_bit_sizes, &lower_mem_access_options);
3116 
3117    /* Clean up 8-bit math before lowering */
3118    bool progress;
3119    do {
3120       progress = false;
3121 
3122       NIR_PASS(progress, nir, nir_opt_algebraic);
3123       NIR_PASS(progress, nir, nir_opt_constant_folding);
3124       NIR_PASS(progress, nir, nir_opt_dce);
3125    } while (progress);
3126 
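   /* Convert ALU operations to bit sizes the hardware supports, as decided by
    * the backend's lower_bit_size_callback.
    */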
3127    NIR_PASS(_, nir, nir_lower_bit_size, lower_bit_size_callback, NULL);
3128 
3129    /* Late blend lowering creates vectors */
3130    NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
3131    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
3132 
3133    /* Late VBO lowering creates constant udiv instructions */
3134    NIR_PASS(_, nir, nir_opt_idiv_const, 16);
3135 
3136    /* Varying output is scalar, other I/O is vector. Lowered late because
3137     * transform feedback programs will use vector output.
3138     */
3139    if (nir->info.stage == MESA_SHADER_VERTEX) {
3140       NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
3141 
3142       if (nir->info.outputs_written &
3143           (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT)) {
3144 
3145          NIR_PASS(_, nir, agx_nir_lower_layer);
3146       }
3147    }
3148 
3149    NIR_PASS(_, nir, nir_opt_constant_folding);
3150    NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_load_from_texture_handle,
3151             nir_metadata_block_index | nir_metadata_dominance, NULL);
3152 
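   /* Start from the uniform space the driver reserved for the preamble;
    * agx_optimize_nir takes push_count by pointer so later optimization can
    * allocate additional push (uniform) slots on top of it.
    */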
3153    out->push_count = key->reserved_preamble;
3154    agx_optimize_nir(nir, &out->push_count);
3155 
3156    /* Must be last since NIR passes can remap driver_location freely */
3157    if (nir->info.stage == MESA_SHADER_VERTEX)
3158       agx_remap_varyings_vs(nir, &out->varyings.vs, key);
3159 
3160    if (agx_should_dump(nir, AGX_DBG_SHADERS))
3161       nir_print_shader(nir, stdout);
3162 
3163    out->local_size = nir->info.shared_size;
3164 
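   /* Compile each NIR function and record where its code landed: the preamble
    * (if present) and the entrypoint each get their own offset into the binary.
    */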
3165    nir_foreach_function_with_impl(func, impl, nir) {
3166       unsigned offset =
3167          agx_compile_function_nir(nir, impl, key, debug, binary, out);
3168 
3169       if (func->is_preamble) {
3170          out->preamble_offset = offset;
3171          out->has_preamble = true;
3172       } else if (func->is_entrypoint) {
3173          out->main_offset = offset;
3174       } else {
3175          unreachable("General functions not yet supported");
3176       }
3177    }
3178 
3179    if (nir->info.stage == MESA_SHADER_VERTEX) {
3180       out->writes_psiz =
3181          nir->info.outputs_written & BITFIELD_BIT(VARYING_SLOT_PSIZ);
3182 
3183       out->nonzero_viewport = nir->info.outputs_written & VARYING_BIT_VIEWPORT;
3184 
3185       out->writes_layer_viewport =
3186          nir->info.outputs_written & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
3187 
3188       out->uses_draw_id =
3189          BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID);
3190 
3191       out->uses_base_param =
3192          BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_VERTEX) ||
3193          BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE);
3194    } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
3195       out->disable_tri_merging = nir->info.uses_wide_subgroup_intrinsics ||
3196                                  nir->info.fs.needs_quad_helper_invocations ||
3197                                  nir->info.writes_memory;
3198 
3199       /* Writing the sample mask requires tag writes */
3200       out->tag_write_disable &= !out->writes_sample_mask;
3201 
3202       /* Report a canonical depth layout. This happens at the end because the
3203        * sample mask lowering affects it.
3204        */
3205       enum gl_frag_depth_layout layout = nir->info.fs.depth_layout;
3206 
3207       if (!(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)))
3208          out->depth_layout = FRAG_DEPTH_LAYOUT_UNCHANGED;
3209       else if (layout == FRAG_DEPTH_LAYOUT_NONE)
3210          out->depth_layout = FRAG_DEPTH_LAYOUT_ANY;
3211       else
3212          out->depth_layout = layout;
3213    }
3214 }
3215