1 /*
2  * Copyright (C) 2021 Alyssa Rosenzweig <alyssa@rosenzweig.io>
3  * Copyright (C) 2020 Collabora Ltd.
4  * Copyright © 2016 Broadcom
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice (including the next
14  * paragraph) shall be included in all copies or substantial portions of the
15  * Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23  * SOFTWARE.
24  */
25 
26 #include "main/glheader.h"
27 #include "compiler/nir_types.h"
28 #include "compiler/nir/nir_builder.h"
29 #include "util/u_debug.h"
30 #include "util/fast_idiv_by_const.h"
31 #include "agx_compile.h"
32 #include "agx_compiler.h"
33 #include "agx_builder.h"
34 
35 static const struct debug_named_value agx_debug_options[] = {
36    {"msgs",      AGX_DBG_MSGS,		"Print debug messages"},
37    {"shaders",   AGX_DBG_SHADERS,	"Dump shaders in NIR and AIR"},
38    {"shaderdb",  AGX_DBG_SHADERDB,	"Print statistics"},
39    {"verbose",   AGX_DBG_VERBOSE,	"Disassemble verbosely"},
40    {"internal",  AGX_DBG_INTERNAL,	"Dump even internal shaders"},
41    {"novalidate",AGX_DBG_NOVALIDATE,"Skip IR validation in debug builds"},
42    DEBUG_NAMED_VALUE_END
43 };
44 
45 DEBUG_GET_ONCE_FLAGS_OPTION(agx_debug, "AGX_MESA_DEBUG", agx_debug_options, 0)
46 
47 int agx_debug = 0;
48 
49 #define DBG(fmt, ...) \
50    do { if (agx_debug & AGX_DBG_MSGS) \
51       fprintf(stderr, "%s:%d: "fmt, \
52             __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)
53 
54 /* Builds a 64-bit hash table key for an index */
55 static uint64_t
56 agx_index_to_key(agx_index idx)
57 {
58    STATIC_ASSERT(sizeof(idx) <= sizeof(uint64_t));
59 
60    uint64_t key = 0;
61    memcpy(&key, &idx, sizeof(idx));
62    return key;
63 }
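
/*
 * [Editorial illustration, not part of the original file; kept under #if 0 so
 * it does not affect the build.] The same bit-packing trick applied to a
 * hypothetical padding-free struct: as long as the struct fits in 64 bits,
 * memcpy yields a key that is unique per bit pattern, so it can be used
 * directly with the u64 hash table.
 */
#if 0
struct toy_index { uint32_t value; uint32_t flags; };

static uint64_t
toy_index_to_key(struct toy_index idx)
{
   STATIC_ASSERT(sizeof(idx) <= sizeof(uint64_t));

   uint64_t key = 0;
   memcpy(&key, &idx, sizeof(idx));
   return key;
}
#endif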
64 
65 /*
66  * Extract a single channel out of a vector source. We split vectors with
67  * p_split so we can use the split components directly, without emitting a
68  * machine instruction. This has advantages for RA, as the split can usually be
69  * optimized away.
70  */
71 static agx_index
72 agx_emit_extract(agx_builder *b, agx_index vec, unsigned channel)
73 {
74    agx_index *components = _mesa_hash_table_u64_search(b->shader->allocated_vec,
75                                                        agx_index_to_key(vec));
76 
77    assert(components != NULL && "missing agx_emit_combine_to");
78 
79    return components[channel];
80 }
81 
82 static void
83 agx_cache_combine(agx_builder *b, agx_index dst,
84                   agx_index s0, agx_index s1, agx_index s2, agx_index s3)
85 {
86    /* Lifetime of a hash table entry has to be at least as long as the table */
87    agx_index *channels = ralloc_array(b->shader, agx_index, 4);
88 
89    channels[0] = s0;
90    channels[1] = s1;
91    channels[2] = s2;
92    channels[3] = s3;
93 
94    _mesa_hash_table_u64_insert(b->shader->allocated_vec, agx_index_to_key(dst),
95                                channels);
96 }
97 
98 /*
99  * Combine multiple scalars into a vector destination. This corresponds to
100  * p_combine, lowered to moves (a shuffle in general) after register allocation.
101  *
102  * To optimize vector extractions, we record the individual channels
103  */
104 static agx_instr *
105 agx_emit_combine_to(agx_builder *b, agx_index dst,
106                     agx_index s0, agx_index s1, agx_index s2, agx_index s3)
107 {
108    agx_cache_combine(b, dst, s0, s1, s2, s3);
109    return agx_p_combine_to(b, dst, s0, s1, s2, s3);
110 }
111 
112 static void
113 agx_block_add_successor(agx_block *block, agx_block *successor)
114 {
115    assert(block != NULL && successor != NULL);
116 
117    /* Cull impossible edges */
118    if (block->unconditional_jumps)
119       return;
120 
121    for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
122       if (block->successors[i]) {
123          if (block->successors[i] == successor)
124             return;
125          else
126             continue;
127       }
128 
129       block->successors[i] = successor;
130       util_dynarray_append(&successor->predecessors, agx_block *, block);
131       return;
132    }
133 
134    unreachable("Too many successors");
135 }
136 
137 /*
138  * Splits an n-component vector (vec) into n scalar destinations (dests) using a
139  * split pseudo-instruction.
140  *
141  * Pre-condition: dests is filled with agx_null().
142  */
143 static void
144 agx_emit_split(agx_builder *b, agx_index *dests, agx_index vec, unsigned n)
145 {
146    /* Setup the destinations */
147    for (unsigned i = 0; i < n; ++i) {
148       dests[i] = agx_temp(b->shader, vec.size);
149    }
150 
151    /* Emit the split */
152    agx_p_split_to(b, dests[0], dests[1], dests[2], dests[3], vec);
153 }
154 
155 static void
156 agx_emit_cached_split(agx_builder *b, agx_index vec, unsigned n)
157 {
158    agx_index dests[4] = { agx_null(), agx_null(), agx_null(), agx_null() };
159    agx_emit_split(b, dests, vec, n);
160    agx_cache_combine(b, vec, dests[0], dests[1], dests[2], dests[3]);
161 }
162 
163 static void
164 agx_emit_load_const(agx_builder *b, nir_load_const_instr *instr)
165 {
166    /* Ensure we've been scalarized and bit size lowered */
167    unsigned bit_size = instr->def.bit_size;
168    assert(instr->def.num_components == 1);
169    assert(bit_size == 1 || bit_size == 16 || bit_size == 32);
170 
171    /* Emit move, later passes can inline/push if useful */
172    agx_mov_imm_to(b,
173                   agx_get_index(instr->def.index, agx_size_for_bits(bit_size)),
174                   nir_const_value_as_uint(instr->value[0], bit_size));
175 }
176 
177 /* Emit code dividing P by Q */
178 static agx_index
179 agx_udiv_const(agx_builder *b, agx_index P, uint32_t Q)
180 {
181    /* P / 1 = P */
182    if (Q == 1) {
183       return P;
184    }
185 
186    /* P / UINT32_MAX = 0, unless P = UINT32_MAX when it's one */
187    if (Q == UINT32_MAX) {
188       agx_index max = agx_mov_imm(b, 32, UINT32_MAX);
189       agx_index one = agx_mov_imm(b, 32, 1);
190       return agx_icmpsel(b, P, max, one, agx_zero(), AGX_ICOND_UEQ);
191    }
192 
193    /* P / 2^N = P >> N */
194    if (util_is_power_of_two_or_zero(Q)) {
195       return agx_ushr(b, P, agx_mov_imm(b, 32, util_logbase2(Q)));
196    }
197 
198    /* Fall back on multiplication by a magic number */
199    struct util_fast_udiv_info info = util_compute_fast_udiv_info(Q, 32, 32);
200    agx_index preshift = agx_mov_imm(b, 32, info.pre_shift);
201    agx_index increment = agx_mov_imm(b, 32, info.increment);
202    agx_index postshift = agx_mov_imm(b, 32, info.post_shift);
203    agx_index multiplier = agx_mov_imm(b, 32, info.multiplier);
204    agx_index multiplied = agx_temp(b->shader, AGX_SIZE_64);
205    agx_index n = P;
206 
207    if (info.pre_shift != 0) n = agx_ushr(b, n, preshift);
208    if (info.increment != 0) n = agx_iadd(b, n, increment, 0);
209 
210    /* 64-bit multiplication, zero extending 32-bit x 32-bit, get the top word */
211    agx_imad_to(b, multiplied, agx_abs(n), agx_abs(multiplier), agx_zero(), 0);
212    n = agx_temp(b->shader, AGX_SIZE_32);
213    agx_p_extract_to(b, n, multiplied, 1);
214 
215    if (info.post_shift != 0) n = agx_ushr(b, n, postshift);
216 
217    return n;
218 }
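
/*
 * [Editorial sketch, not part of the original file; kept under #if 0.] Scalar
 * model of the magic-multiply path emitted above, mirroring the instruction
 * sequence (pre-shift, optional increment, 32 x 32 -> 64 multiply keeping the
 * high word, post-shift). It assumes the parameters come from
 * util_compute_fast_udiv_info(Q, 32, 32) exactly as in agx_udiv_const.
 */
#if 0
static uint32_t
toy_fast_udiv(uint32_t P, struct util_fast_udiv_info info)
{
   uint64_t n = P >> info.pre_shift;
   n += info.increment;
   n = (n * info.multiplier) >> 32; /* high word of the 64-bit product */
   return (uint32_t)(n >> info.post_shift);
}
#endif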
219 
220 /* AGX appears to lack support for vertex attributes. Lower to global loads. */
221 static void
222 agx_emit_load_attr(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
223 {
224    nir_src *offset_src = nir_get_io_offset_src(instr);
225    assert(nir_src_is_const(*offset_src) && "no attribute indirects");
226    unsigned index = nir_intrinsic_base(instr) +
227                     nir_src_as_uint(*offset_src);
228 
229    struct agx_shader_key *key = b->shader->key;
230    struct agx_attribute attrib = key->vs.attributes[index];
231 
232    /* address = base + (stride * vertex_id) + src_offset */
233    unsigned buf = attrib.buf;
234    unsigned stride = key->vs.vbuf_strides[buf];
235    unsigned shift = agx_format_shift(attrib.format);
236 
237    agx_index shifted_stride = agx_mov_imm(b, 32, stride >> shift);
238    agx_index src_offset = agx_mov_imm(b, 32, attrib.src_offset);
239 
240    agx_index vertex_id = agx_register(10, AGX_SIZE_32);
241    agx_index instance_id = agx_register(12, AGX_SIZE_32);
242 
243    /* A nonzero divisor requires dividing the instance ID. A zero divisor
244     * specifies per-vertex data. */
245    agx_index element_id = (attrib.divisor == 0) ? vertex_id :
246                           agx_udiv_const(b, instance_id, attrib.divisor);
247 
248    agx_index offset = agx_imad(b, element_id, shifted_stride, src_offset, 0);
249 
250    /* Each VBO has a 64-bit address (4 x 16-bit units); look up the base address as a sysval */
251    unsigned num_vbos = key->vs.num_vbufs;
252    unsigned base_length = (num_vbos * 4);
253    agx_index base = agx_indexed_sysval(b->shader,
254                                        AGX_PUSH_VBO_BASES, AGX_SIZE_64, buf * 4, base_length);
255 
256    /* Load the data */
257    assert(instr->num_components <= 4);
258 
259    unsigned actual_comps = (attrib.nr_comps_minus_1 + 1);
260    agx_index vec = agx_vec_for_dest(b->shader, &instr->dest);
261    agx_device_load_to(b, vec, base, offset, attrib.format,
262                       BITFIELD_MASK(attrib.nr_comps_minus_1 + 1), 0);
263    agx_wait(b, 0);
264 
265    agx_emit_split(b, dests, vec, actual_comps);
266 
267    agx_index one = agx_mov_imm(b, 32, fui(1.0));
268    agx_index zero = agx_mov_imm(b, 32, 0);
269    agx_index default_value[4] = { zero, zero, zero, one };
270 
271    for (unsigned i = actual_comps; i < instr->num_components; ++i)
272       dests[i] = default_value[i];
273 }
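
/*
 * [Editorial worked example, not part of the original file.] Assuming
 * agx_format_shift() returns log2 of the element size so the device load
 * measures its offset in elements: with a 32-bit format (shift = 2), a
 * 12-byte stride and src_offset = 0, shifted_stride = 12 >> 2 = 3, and for
 * vertex_id = 5 the imad above computes offset = 5 * 3 + 0 = 15 elements,
 * i.e. byte 60 = 5 * 12, the start of the fifth vertex's record.
 */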
274 
275 static void
276 agx_emit_load_vary_flat(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
277 {
278    unsigned components = instr->num_components;
279    assert(components >= 1 && components <= 4);
280 
281    nir_src *offset = nir_get_io_offset_src(instr);
282    assert(nir_src_is_const(*offset) && "no indirects");
283    unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
284    imm_index += nir_src_as_uint(*offset);
285 
286    assert(nir_dest_bit_size(instr->dest) == 32 && "no 16-bit flat shading");
287 
288    for (unsigned i = 0; i < components; ++i) {
289       /* vec3 for each vertex, unknown what first 2 channels are for */
290       agx_index values = agx_ld_vary_flat(b, agx_immediate(imm_index + i), 1);
291       dests[i] = agx_p_extract(b, values, 2);
292    }
293 }
294 
295 static void
296 agx_emit_load_vary(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
297 {
298    ASSERTED unsigned components = instr->num_components;
299    ASSERTED nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
300 
301    assert(components >= 1 && components <= 4);
302    assert(parent);
303 
304    /* TODO: Interpolation modes */
305    assert(parent->intrinsic == nir_intrinsic_load_barycentric_pixel);
306 
307    nir_src *offset = nir_get_io_offset_src(instr);
308    assert(nir_src_is_const(*offset) && "no indirects");
309    unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
310    imm_index += nir_src_as_uint(*offset) * 4;
311 
312    agx_index vec = agx_vec_for_intr(b->shader, instr);
313    agx_ld_vary_to(b, vec, agx_immediate(imm_index), components, true);
314    agx_emit_split(b, dests, vec, components);
315 }
316 
317 static agx_instr *
318 agx_emit_store_vary(agx_builder *b, nir_intrinsic_instr *instr)
319 {
320    nir_src *offset = nir_get_io_offset_src(instr);
321    assert(nir_src_is_const(*offset) && "todo: indirects");
322    unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
323    imm_index += nir_intrinsic_component(instr);
324    imm_index += nir_src_as_uint(*offset);
325 
326    /* nir_lower_io_to_scalar */
327    assert(nir_intrinsic_write_mask(instr) == 0x1);
328 
329    return agx_st_vary(b,
330                agx_immediate(imm_index),
331                agx_src_index(&instr->src[0]));
332 }
333 
334 static agx_instr *
335 agx_emit_fragment_out(agx_builder *b, nir_intrinsic_instr *instr)
336 {
337    const nir_variable *var =
338       nir_find_variable_with_driver_location(b->shader->nir,
339             nir_var_shader_out, nir_intrinsic_base(instr));
340    assert(var);
341 
342    unsigned loc = var->data.location;
343    assert(var->data.index == 0 && "todo: dual-source blending");
344    assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");
345    unsigned rt = (loc - FRAG_RESULT_DATA0);
346 
347    /* TODO: Reverse-engineer interactions with MRT */
348    if (b->shader->nir->info.internal) {
349       /* clear */
350    } else if (b->shader->did_writeout) {
351 	   agx_writeout(b, 0x0004);
352    } else {
353 	   agx_writeout(b, 0xC200);
354 	   agx_writeout(b, 0x000C);
355    }
356 
357    if (b->shader->nir->info.fs.uses_discard) {
358       /* If the shader uses discard, the sample mask must be written by the
359        * shader on all execution paths. If we've reached the end of the shader,
360        * we are therefore still active and need to write a full sample mask.
361        * TODO: interactions with MSAA and gl_SampleMask writes
362        */
363       agx_sample_mask(b, agx_immediate(1));
364    }
365 
366    b->shader->did_writeout = true;
367    return agx_st_tile(b, agx_src_index(&instr->src[0]),
368              b->shader->key->fs.tib_formats[rt]);
369 }
370 
371 static void
372 agx_emit_load_tile(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
373 {
374    const nir_variable *var =
375       nir_find_variable_with_driver_location(b->shader->nir,
376             nir_var_shader_out, nir_intrinsic_base(instr));
377    assert(var);
378 
379    unsigned loc = var->data.location;
380    assert(var->data.index == 0 && "todo: dual-source blending");
381    assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");
382    unsigned rt = (loc - FRAG_RESULT_DATA0);
383 
384    /* TODO: Reverse-engineer interactions with MRT */
385    agx_writeout(b, 0xC200);
386    agx_writeout(b, 0x0008);
387    b->shader->did_writeout = true;
388    b->shader->out->reads_tib = true;
389 
390    agx_index vec = agx_vec_for_dest(b->shader, &instr->dest);
391    agx_ld_tile_to(b, vec, b->shader->key->fs.tib_formats[rt]);
392    agx_emit_split(b, dests, vec, 4);
393 }
394 
395 static enum agx_format
396 agx_format_for_bits(unsigned bits)
397 {
398    switch (bits) {
399    case 8: return AGX_FORMAT_I8;
400    case 16: return AGX_FORMAT_I16;
401    case 32: return AGX_FORMAT_I32;
402    default: unreachable("Invalid bit size for load/store");
403    }
404 }
405 
406 static agx_instr *
407 agx_emit_load_ubo(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr)
408 {
409    bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input);
410    nir_src *offset = nir_get_io_offset_src(instr);
411 
412    if (!kernel_input && !nir_src_is_const(instr->src[0]))
413       unreachable("todo: indirect UBO access");
414 
415    /* The UBO block index must be constant (kernel inputs always use block 0) */
416    uint32_t block = kernel_input ? 0 : nir_src_as_uint(instr->src[0]);
417 
418    /* Each UBO has a 64-bit address (4 x 16-bit units) */
419    unsigned num_ubos = b->shader->nir->info.num_ubos;
420    unsigned base_length = (num_ubos * 4);
421    unsigned index = block * 4; /* 16 bit units */
422 
423    /* Lookup the base address (TODO: indirection) */
424    agx_index base = agx_indexed_sysval(b->shader,
425                                        AGX_PUSH_UBO_BASES, AGX_SIZE_64,
426                                        index, base_length);
427 
428    /* Load the data */
429    assert(instr->num_components <= 4);
430 
431    agx_device_load_to(b, dst, base, agx_src_index(offset),
432                       agx_format_for_bits(nir_dest_bit_size(instr->dest)),
433                       BITFIELD_MASK(instr->num_components), 0);
434    agx_wait(b, 0);
435    agx_emit_cached_split(b, dst, instr->num_components);
436 
437    return NULL;
438 }
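
/*
 * [Editorial worked example, not part of the original file.] With the 16-bit
 * indexing used above, loading from UBO block 2 out of num_ubos = 3 gives
 * index = 2 * 4 = 8 and base_length = 3 * 4 = 12, i.e. the third 64-bit base
 * address inside a 12 x 16-bit push region of UBO base pointers.
 */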
439 
440 static void
441 agx_emit_load_frag_coord(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
442 {
443    /* xy */
444    for (unsigned i = 0; i < 2; ++i) {
445       dests[i] = agx_fadd(b, agx_convert(b, agx_immediate(AGX_CONVERT_U32_TO_F),
446                agx_get_sr(b, 32, AGX_SR_THREAD_POSITION_IN_GRID_X + i),
447                AGX_ROUND_RTE), agx_immediate_f(0.5f));
448    }
449 
450    dests[2] = agx_ld_vary(b, agx_immediate(1), 1, false); /* z */
451    dests[3] = agx_ld_vary(b, agx_immediate(0), 1, false); /* w */
452 }
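
/*
 * [Editorial worked example, not part of the original file.] The convert +
 * fadd above turn the integer thread position into a pixel-center coordinate:
 * a fragment at grid position (10, 3) yields gl_FragCoord.xy = (10.5, 3.5),
 * while .z and .w are read from the two dedicated varying slots.
 */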
453 
454 static agx_instr *
455 agx_blend_const(agx_builder *b, agx_index dst, unsigned comp)
456 {
457      agx_index val = agx_indexed_sysval(b->shader,
458            AGX_PUSH_BLEND_CONST, AGX_SIZE_32, comp * 2, 4 * 2);
459 
460      return agx_mov_to(b, dst, val);
461 }
462 
463 /*
464  * Demoting a helper invocation is logically equivalent to zeroing the sample
465  * mask. Metal implements discard as such.
466  *
467  * XXX: Actually, Metal's "discard" is a demote, and what is implemented here
468  * is a demote. There might be a better way to implement this to get correct
469  * helper invocation semantics. For now, I'm kicking the can down the road.
470  */
471 static agx_instr *
472 agx_emit_discard(agx_builder *b, nir_intrinsic_instr *instr)
473 {
474    agx_writeout(b, 0xC200);
475    agx_writeout(b, 0x0001);
476    b->shader->did_writeout = true;
477 
478    b->shader->out->writes_sample_mask = true;
479    return agx_sample_mask(b, agx_immediate(0));
480 }
481 
482 static agx_instr *
483 agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
484 {
485   agx_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?
486      agx_dest_index(&instr->dest) : agx_null();
487   gl_shader_stage stage = b->shader->stage;
488   agx_index dests[4] = { agx_null() };
489 
490   switch (instr->intrinsic) {
491   case nir_intrinsic_load_barycentric_pixel:
492   case nir_intrinsic_load_barycentric_centroid:
493   case nir_intrinsic_load_barycentric_sample:
494   case nir_intrinsic_load_barycentric_at_sample:
495   case nir_intrinsic_load_barycentric_at_offset:
496      /* handled later via load_vary */
497      return NULL;
498   case nir_intrinsic_load_interpolated_input:
499      assert(stage == MESA_SHADER_FRAGMENT);
500      agx_emit_load_vary(b, dests, instr);
501      break;
502 
503   case nir_intrinsic_load_input:
504      if (stage == MESA_SHADER_FRAGMENT)
505         agx_emit_load_vary_flat(b, dests, instr);
506      else if (stage == MESA_SHADER_VERTEX)
507         agx_emit_load_attr(b, dests, instr);
508      else
509         unreachable("Unsupported shader stage");
510 
511      break;
512 
513   case nir_intrinsic_store_output:
514      if (stage == MESA_SHADER_FRAGMENT)
515         return agx_emit_fragment_out(b, instr);
516      else if (stage == MESA_SHADER_VERTEX)
517         return agx_emit_store_vary(b, instr);
518      else
519         unreachable("Unsupported shader stage");
520 
521   case nir_intrinsic_load_output:
522      assert(stage == MESA_SHADER_FRAGMENT);
523      agx_emit_load_tile(b, dests, instr);
524      break;
525 
526   case nir_intrinsic_load_ubo:
527   case nir_intrinsic_load_kernel_input:
528      return agx_emit_load_ubo(b, dst, instr);
529 
530   case nir_intrinsic_load_frag_coord:
531      agx_emit_load_frag_coord(b, dests, instr);
532      break;
533 
534   case nir_intrinsic_discard:
535      return agx_emit_discard(b, instr);
536 
537   case nir_intrinsic_load_back_face_agx:
538      return agx_get_sr_to(b, dst, AGX_SR_BACKFACING);
539 
540   case nir_intrinsic_load_vertex_id:
541      return agx_mov_to(b, dst, agx_abs(agx_register(10, AGX_SIZE_32)));
542 
543   case nir_intrinsic_load_instance_id:
544      return agx_mov_to(b, dst, agx_abs(agx_register(12, AGX_SIZE_32)));
545 
546   case nir_intrinsic_load_blend_const_color_r_float: return agx_blend_const(b, dst, 0);
547   case nir_intrinsic_load_blend_const_color_g_float: return agx_blend_const(b, dst, 1);
548   case nir_intrinsic_load_blend_const_color_b_float: return agx_blend_const(b, dst, 2);
549   case nir_intrinsic_load_blend_const_color_a_float: return agx_blend_const(b, dst, 3);
550 
551   default:
552        fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
553        unreachable("Unhandled intrinsic");
554   }
555 
556   /* If we got here, there is a vector destination for the intrinsic composed
557    * of separate scalars. Its components are specified separately in the dests
558    * array. We need to combine them so the vector destination itself is valid.
559    * If only individual components are accessed, this combine will be dead code
560    * eliminated.
561    */
562   return agx_emit_combine_to(b, dst, dests[0], dests[1], dests[2], dests[3]);
563 }
564 
565 static agx_index
566 agx_alu_src_index(agx_builder *b, nir_alu_src src)
567 {
568    /* Check well-formedness of the input NIR */
569    ASSERTED unsigned bitsize = nir_src_bit_size(src.src);
570    unsigned comps = nir_src_num_components(src.src);
571    unsigned channel = src.swizzle[0];
572 
573    assert(bitsize == 1 || bitsize == 16 || bitsize == 32 || bitsize == 64);
574    assert(!(src.negate || src.abs));
575    assert(channel < comps);
576 
577    agx_index idx = agx_src_index(&src.src);
578 
579    /* We only deal with scalars, extract a single scalar if needed */
580    if (comps > 1)
581       return agx_emit_extract(b, idx, channel);
582    else
583       return idx;
584 }
585 
586 static agx_instr *
587 agx_emit_alu_bool(agx_builder *b, nir_op op,
588       agx_index dst, agx_index s0, agx_index s1, agx_index s2)
589 {
590    /* Handle 1-bit bools as zero/nonzero rather than specifically 0/1 or 0/~0.
591     * This will give the optimizer flexibility. */
592    agx_index f = agx_immediate(0);
593    agx_index t = agx_immediate(0x1);
594 
595    switch (op) {
596    case nir_op_feq: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_EQ);
597    case nir_op_flt: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_LT);
598    case nir_op_fge: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_GE);
599    case nir_op_fneu: return agx_fcmpsel_to(b, dst, s0, s1, f, t, AGX_FCOND_EQ);
600 
601    case nir_op_ieq: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_UEQ);
602    case nir_op_ine: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_UEQ);
603    case nir_op_ilt: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_SLT);
604    case nir_op_ige: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_SLT);
605    case nir_op_ult: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_ULT);
606    case nir_op_uge: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_ULT);
607 
608    case nir_op_mov: return agx_mov_to(b, dst, s0);
609    case nir_op_iand: return agx_and_to(b, dst, s0, s1);
610    case nir_op_ior: return agx_or_to(b, dst, s0, s1);
611    case nir_op_ixor: return agx_xor_to(b, dst, s0, s1);
612    case nir_op_inot: return agx_xor_to(b, dst, s0, t);
613 
614    case nir_op_f2b1: return agx_fcmpsel_to(b, dst, s0, f, f, t, AGX_FCOND_EQ);
615    case nir_op_i2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ);
616    case nir_op_b2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ);
617 
618    case nir_op_bcsel:
619       return agx_icmpsel_to(b, dst, s0, f, s2, s1, AGX_ICOND_UEQ);
620 
621    default:
622       fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[op].name);
623       unreachable("Unhandled boolean ALU instruction");
624    }
625 }
626 
627 static agx_instr *
628 agx_emit_alu(agx_builder *b, nir_alu_instr *instr)
629 {
630    unsigned srcs = nir_op_infos[instr->op].num_inputs;
631    unsigned sz = nir_dest_bit_size(instr->dest.dest);
632    unsigned src_sz = srcs ? nir_src_bit_size(instr->src[0].src) : 0;
633    ASSERTED unsigned comps = nir_dest_num_components(instr->dest.dest);
634 
635    assert(comps == 1 || nir_op_is_vec(instr->op));
636    assert(sz == 1 || sz == 16 || sz == 32 || sz == 64);
637 
638    agx_index dst = agx_dest_index(&instr->dest.dest);
639    agx_index s0 = srcs > 0 ? agx_alu_src_index(b, instr->src[0]) : agx_null();
640    agx_index s1 = srcs > 1 ? agx_alu_src_index(b, instr->src[1]) : agx_null();
641    agx_index s2 = srcs > 2 ? agx_alu_src_index(b, instr->src[2]) : agx_null();
642    agx_index s3 = srcs > 3 ? agx_alu_src_index(b, instr->src[3]) : agx_null();
643 
644    /* 1-bit bools are a bit special, only handle with select ops */
645    if (sz == 1)
646       return agx_emit_alu_bool(b, instr->op, dst, s0, s1, s2);
647 
648 #define UNOP(nop, aop) \
649    case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0);
650 #define BINOP(nop, aop) \
651    case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1);
652 #define TRIOP(nop, aop) \
653    case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1, s2);
654 
655    switch (instr->op) {
656    BINOP(fadd, fadd);
657    BINOP(fmul, fmul);
658    TRIOP(ffma, fma);
659 
660    UNOP(f2f16, fmov);
661    UNOP(f2f32, fmov);
662    UNOP(fround_even, roundeven);
663    UNOP(ftrunc, trunc);
664    UNOP(ffloor, floor);
665    UNOP(fceil, ceil);
666    UNOP(frcp, rcp);
667    UNOP(frsq, rsqrt);
668    UNOP(flog2, log2);
669    UNOP(fexp2, exp2);
670 
671    UNOP(fddx, dfdx);
672    UNOP(fddx_coarse, dfdx);
673    UNOP(fddx_fine, dfdx);
674 
675    UNOP(fddy, dfdy);
676    UNOP(fddy_coarse, dfdy);
677    UNOP(fddy_fine, dfdy);
678 
679    UNOP(mov, mov);
680    UNOP(u2u16, mov);
681    UNOP(u2u32, mov);
682    UNOP(inot, not);
683    BINOP(iand, and);
684    BINOP(ior, or);
685    BINOP(ixor, xor);
686 
687    case nir_op_fsqrt: return agx_fmul_to(b, dst, s0, agx_srsqrt(b, s0));
688    case nir_op_fsub: return agx_fadd_to(b, dst, s0, agx_neg(s1));
689    case nir_op_fabs: return agx_fmov_to(b, dst, agx_abs(s0));
690    case nir_op_fneg: return agx_fmov_to(b, dst, agx_neg(s0));
691 
692    case nir_op_fmin: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_LTN);
693    case nir_op_fmax: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_GTN);
694    case nir_op_imin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SLT);
695    case nir_op_imax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SGT);
696    case nir_op_umin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_ULT);
697    case nir_op_umax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_UGT);
698 
699    case nir_op_iadd: return agx_iadd_to(b, dst, s0, s1, 0);
700    case nir_op_isub: return agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
701    case nir_op_ineg: return agx_iadd_to(b, dst, agx_zero(), agx_neg(s0), 0);
702    case nir_op_imul: return agx_imad_to(b, dst, s0, s1, agx_zero(), 0);
703 
704    case nir_op_ishl: return agx_bfi_to(b, dst, agx_zero(), s0, s1, 0);
705    case nir_op_ushr: return agx_ushr_to(b, dst, s0, s1);
706    case nir_op_ishr: return agx_asr_to(b, dst, s0, s1);
707 
708    case nir_op_bcsel:
709       return agx_icmpsel_to(b, dst, s0, agx_zero(), s2, s1, AGX_ICOND_UEQ);
710 
711    case nir_op_b2i32:
712    case nir_op_b2i16:
713       return agx_icmpsel_to(b, dst, s0, agx_zero(), agx_zero(), agx_immediate(1), AGX_ICOND_UEQ);
714 
715    case nir_op_b2f16:
716    case nir_op_b2f32:
717    {
718       /* At this point, boolean is just zero/nonzero, so compare with zero */
719       agx_index one = (sz == 16) ?
720          agx_mov_imm(b, 16, _mesa_float_to_half(1.0)) :
721          agx_mov_imm(b, 32, fui(1.0));
722 
723       agx_index zero = agx_zero();
724 
725       return agx_fcmpsel_to(b, dst, s0, zero, zero, one, AGX_FCOND_EQ);
726    }
727 
728    case nir_op_i2i32:
729    {
730       if (s0.size != AGX_SIZE_16)
731          unreachable("todo: more conversions");
732 
733       return agx_iadd_to(b, dst, s0, agx_zero(), 0);
734    }
735 
736    case nir_op_i2i16:
737    {
738       if (s0.size != AGX_SIZE_32)
739          unreachable("todo: more conversions");
740 
741       return agx_iadd_to(b, dst, s0, agx_zero(), 0);
742    }
743 
744    case nir_op_iadd_sat:
745    {
746       agx_instr *I = agx_iadd_to(b, dst, s0, s1, 0);
747       I->saturate = true;
748       return I;
749    }
750 
751    case nir_op_isub_sat:
752    {
753       agx_instr *I = agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
754       I->saturate = true;
755       return I;
756    }
757 
758    case nir_op_uadd_sat:
759    {
760       agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_abs(s1), 0);
761       I->saturate = true;
762       return I;
763    }
764 
765    case nir_op_usub_sat:
766    {
767       agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_neg(agx_abs(s1)), 0);
768       I->saturate = true;
769       return I;
770    }
771 
772    case nir_op_fsat:
773    {
774       agx_instr *I = agx_fadd_to(b, dst, s0, agx_negzero());
775       I->saturate = true;
776       return I;
777    }
778 
779    case nir_op_fsin_agx:
780    {
781       agx_index fixup = agx_sin_pt_1(b, s0);
782       agx_index sinc = agx_sin_pt_2(b, fixup);
783       return agx_fmul_to(b, dst, sinc, fixup);
784    }
785 
786    case nir_op_f2i16:
787       return agx_convert_to(b, dst,
788             agx_immediate(AGX_CONVERT_F_TO_S16), s0, AGX_ROUND_RTZ);
789 
790    case nir_op_f2i32:
791       return agx_convert_to(b, dst,
792             agx_immediate(AGX_CONVERT_F_TO_S32), s0, AGX_ROUND_RTZ);
793 
794    case nir_op_f2u16:
795       return agx_convert_to(b, dst,
796             agx_immediate(AGX_CONVERT_F_TO_U16), s0, AGX_ROUND_RTZ);
797 
798    case nir_op_f2u32:
799       return agx_convert_to(b, dst,
800             agx_immediate(AGX_CONVERT_F_TO_U32), s0, AGX_ROUND_RTZ);
801 
802    case nir_op_u2f16:
803    case nir_op_u2f32:
804    {
805       if (src_sz == 64)
806          unreachable("64-bit conversions unimplemented");
807 
808       enum agx_convert mode =
809          (src_sz == 32) ? AGX_CONVERT_U32_TO_F :
810          (src_sz == 16) ? AGX_CONVERT_U16_TO_F :
811                           AGX_CONVERT_U8_TO_F;
812 
813       return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
814    }
815 
816    case nir_op_i2f16:
817    case nir_op_i2f32:
818    {
819       if (src_sz == 64)
820          unreachable("64-bit conversions unimplemented");
821 
822       enum agx_convert mode =
823          (src_sz == 32) ? AGX_CONVERT_S32_TO_F :
824          (src_sz == 16) ? AGX_CONVERT_S16_TO_F :
825                           AGX_CONVERT_S8_TO_F;
826 
827       return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
828    }
829 
830    case nir_op_vec2:
831    case nir_op_vec3:
832    case nir_op_vec4:
833       return agx_emit_combine_to(b, dst, s0, s1, s2, s3);
834 
835    case nir_op_vec8:
836    case nir_op_vec16:
837       unreachable("should've been lowered");
838 
839    default:
840       fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
841       unreachable("Unhandled ALU instruction");
842    }
843 }
844 
845 static enum agx_dim
846 agx_tex_dim(enum glsl_sampler_dim dim, bool array)
847 {
848    switch (dim) {
849    case GLSL_SAMPLER_DIM_1D:
850    case GLSL_SAMPLER_DIM_BUF:
851       return array ? AGX_DIM_TEX_1D_ARRAY : AGX_DIM_TEX_1D;
852 
853    case GLSL_SAMPLER_DIM_2D:
854    case GLSL_SAMPLER_DIM_RECT:
855    case GLSL_SAMPLER_DIM_EXTERNAL:
856       return array ? AGX_DIM_TEX_2D_ARRAY : AGX_DIM_TEX_2D;
857 
858    case GLSL_SAMPLER_DIM_MS:
859       assert(!array && "multisampled arrays unsupported");
860       return AGX_DIM_TEX_2D_MS;
861 
862    case GLSL_SAMPLER_DIM_3D:
863       assert(!array && "3D arrays unsupported");
864       return AGX_DIM_TEX_3D;
865 
866    case GLSL_SAMPLER_DIM_CUBE:
867       return array ? AGX_DIM_TEX_CUBE_ARRAY : AGX_DIM_TEX_CUBE;
868 
869    default:
870       unreachable("Invalid sampler dim\n");
871    }
872 }
873 
874 static enum agx_lod_mode
875 agx_lod_mode_for_nir(nir_texop op)
876 {
877    switch (op) {
878    case nir_texop_tex: return AGX_LOD_MODE_AUTO_LOD;
879    case nir_texop_txb: return AGX_LOD_MODE_AUTO_LOD_BIAS;
880    case nir_texop_txl: return AGX_LOD_MODE_LOD_MIN;
881    default: unreachable("Unhandled texture op");
882    }
883 }
884 
885 static void
886 agx_emit_tex(agx_builder *b, nir_tex_instr *instr)
887 {
888    switch (instr->op) {
889    case nir_texop_tex:
890    case nir_texop_txl:
891    case nir_texop_txb:
892       break;
893    default:
894       unreachable("Unhandled texture op");
895    }
896 
897    agx_index coords = agx_null(),
898              texture = agx_immediate(instr->texture_index),
899              sampler = agx_immediate(instr->sampler_index),
900              lod = agx_immediate(0),
901              offset = agx_null();
902 
903    for (unsigned i = 0; i < instr->num_srcs; ++i) {
904       agx_index index = agx_src_index(&instr->src[i].src);
905 
906       switch (instr->src[i].src_type) {
907       case nir_tex_src_coord:
908          coords = index;
909 
910          /* Array textures are indexed by a floating-point in NIR, but by an
911           * integer in AGX. Convert the array index from float-to-int for array
912           * textures. The array index is the last source in NIR. The conversion
913           * is according to the rule from 8.9 ("Texture Functions") of the GLSL
914           * ES 3.20 specification:
915           *
916           *     max(0, min(d - 1, floor(layer + 0.5))) =
917           *     max(0, min(d - 1, f32_to_u32(layer + 0.5))) =
918           *     min(d - 1, f32_to_u32(layer + 0.5))
919           */
920          if (instr->is_array) {
921             unsigned nr = nir_src_num_components(instr->src[i].src);
922             agx_index channels[4] = {};
923 
924             for (unsigned i = 0; i < nr; ++i)
925                channels[i] = agx_emit_extract(b, index, i);
926 
927             agx_index layer = agx_fadd(b, channels[nr - 1],
928                                           agx_immediate_f(0.5f));
929 
930             agx_index d1 = agx_indexed_sysval(b->shader,
931                   AGX_PUSH_ARRAY_SIZE_MINUS_1, AGX_SIZE_16,
932                   instr->texture_index, 1);
933 
934             layer = agx_convert(b, agx_immediate(AGX_CONVERT_F_TO_U32), layer,
935                                    AGX_ROUND_RTZ);
936 
937             agx_index layer16 = agx_temp(b->shader, AGX_SIZE_16);
938             agx_mov_to(b, layer16, layer);
939 
940             layer = agx_icmpsel(b, layer16, d1, layer16, d1, AGX_ICOND_ULT);
941 
942             agx_index layer32 = agx_temp(b->shader, AGX_SIZE_32);
943             agx_mov_to(b, layer32, layer);
944 
945             channels[nr - 1] = layer32;
946             coords = agx_p_combine(b, channels[0], channels[1], channels[2], channels[3]);
947          } else {
948             coords = index;
949          }
950 
951          break;
952 
953       case nir_tex_src_lod:
954       case nir_tex_src_bias:
955          lod = index;
956          break;
957 
958       case nir_tex_src_ms_index:
959       case nir_tex_src_offset:
960       case nir_tex_src_comparator:
961       case nir_tex_src_texture_offset:
962       case nir_tex_src_sampler_offset:
963       default:
964          unreachable("todo");
965       }
966    }
967 
968    agx_index dst = agx_dest_index(&instr->dest);
969    agx_texture_sample_to(b, dst, coords, lod, texture, sampler, offset,
970          agx_tex_dim(instr->sampler_dim, instr->is_array),
971          agx_lod_mode_for_nir(instr->op),
972          0xF, /* TODO: wrmask */
973          0);
974 
975    agx_wait(b, 0);
976    agx_emit_cached_split(b, dst, 4);
977 }
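
/*
 * [Editorial sketch, not part of the original file; kept under #if 0.] Scalar
 * model of the array-layer clamp emitted above, assuming the hardware
 * f32 -> u32 convert clamps negative inputs to zero, which is what lets
 * max(0, min(d - 1, floor(layer + 0.5))) collapse to
 * min(d - 1, f32_to_u32(layer + 0.5)).
 */
#if 0
static uint32_t
toy_clamp_layer(float layer, uint32_t d)
{
   float biased = layer + 0.5f;
   uint32_t floored = (biased <= 0.0f) ? 0 : (uint32_t)biased; /* trunc == floor for >= 0 */
   return MIN2(floored, d - 1);
}
#endif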
978 
979 /*
980  * Mark the logical end of the current block by emitting a p_logical_end marker.
981  * Note if an unconditional jump is emitted (for instance, to break out of a
982  * loop from inside an if), the block has already reached its logical end so we
983  * don't re-emit p_logical_end. The validator checks this, and correct register
984  * allocation depends on it.
985  */
986 static void
987 agx_emit_logical_end(agx_builder *b)
988 {
989    if (!b->shader->current_block->unconditional_jumps)
990       agx_p_logical_end(b);
991 }
992 
993 /* NIR loops are treated as a pair of AGX loops:
994  *
995  *    do {
996  *       do {
997  *          ...
998  *       } while (0);
999  *    } while (cond);
1000  *
1001  * By manipulating the nesting counter (r0l), we may break out of nested loops,
1002  * so under the model, both break and continue may be implemented as breaks,
1003  * where break breaks out of the outer loop (2 layers) and continue breaks out
1004  * of the inner loop (1 layer).
1005  *
1006  * After manipulating the nesting counter directly, pop_exec #0 must be used to
1007  * flush the update to the execution mask.
1008  */
1009 
1010 static void
1011 agx_emit_jump(agx_builder *b, nir_jump_instr *instr)
1012 {
1013    agx_context *ctx = b->shader;
1014    assert (instr->type == nir_jump_break || instr->type == nir_jump_continue);
1015 
1016    /* Break out of either one or two loops */
1017    unsigned nestings = b->shader->loop_nesting;
1018 
1019    if (instr->type == nir_jump_continue) {
1020       nestings += 1;
1021       agx_block_add_successor(ctx->current_block, ctx->continue_block);
1022    } else if (instr->type == nir_jump_break) {
1023       nestings += 2;
1024       agx_block_add_successor(ctx->current_block, ctx->break_block);
1025    }
1026 
1027    /* Update the counter and flush */
1028    agx_index r0l = agx_register(0, false);
1029    agx_mov_to(b, r0l, agx_immediate(nestings));
1030 
1031    /* Jumps must come at the end of a block */
1032    agx_emit_logical_end(b);
1033    agx_pop_exec(b, 0);
1034 
1035    ctx->current_block->unconditional_jumps = true;
1036 }
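
/*
 * [Editorial worked example, not part of the original file.] In a loop body
 * with no enclosing if, loop_nesting is 0, so the code above writes r0l = 1
 * for a continue (break out of the inner single-iteration loop only) and
 * r0l = 2 for a break (break out of both layers of the double loop described
 * in the comment above).
 */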
1037 
1038 static void
1039 agx_emit_phi(agx_builder *b, nir_phi_instr *instr)
1040 {
1041    agx_instr *I = agx_phi_to(b, agx_dest_index(&instr->dest));
1042 
1043    /* Deferred */
1044    I->phi = instr;
1045 }
1046 
1047 /* Look up the AGX block corresponding to a given NIR block. Used when
1048  * translating phi nodes after emitting all blocks.
1049  */
1050 static agx_block *
1051 agx_from_nir_block(agx_context *ctx, nir_block *block)
1052 {
1053    return ctx->indexed_nir_blocks[block->index];
1054 }
1055 
1056 static void
1057 agx_emit_phi_deferred(agx_context *ctx, agx_block *block, agx_instr *I)
1058 {
1059    nir_phi_instr *phi = I->phi;
1060 
1061    /* Guaranteed by lower_phis_to_scalar */
1062    assert(phi->dest.ssa.num_components == 1);
1063 
1064    I->nr_srcs = exec_list_length(&phi->srcs);
1065    I->src = rzalloc_array(I, agx_index, I->nr_srcs);
1066 
1067    nir_foreach_phi_src(src, phi) {
1068       agx_block *pred = agx_from_nir_block(ctx, src->pred);
1069       unsigned i = agx_predecessor_index(block, pred);
1070       assert(i < I->nr_srcs);
1071 
1072       I->src[i] = agx_src_index(&src->src);
1073    }
1074 }
1075 
1076 static void
1077 agx_emit_phis_deferred(agx_context *ctx)
1078 {
1079    agx_foreach_block(ctx, block) {
1080       agx_foreach_instr_in_block(block, I) {
1081          if (I->op == AGX_OPCODE_PHI)
1082             agx_emit_phi_deferred(ctx, block, I);
1083       }
1084    }
1085 }
1086 
1087 static void
1088 agx_emit_instr(agx_builder *b, struct nir_instr *instr)
1089 {
1090    switch (instr->type) {
1091    case nir_instr_type_load_const:
1092       agx_emit_load_const(b, nir_instr_as_load_const(instr));
1093       break;
1094 
1095    case nir_instr_type_intrinsic:
1096       agx_emit_intrinsic(b, nir_instr_as_intrinsic(instr));
1097       break;
1098 
1099    case nir_instr_type_alu:
1100       agx_emit_alu(b, nir_instr_as_alu(instr));
1101       break;
1102 
1103    case nir_instr_type_tex:
1104       agx_emit_tex(b, nir_instr_as_tex(instr));
1105       break;
1106 
1107    case nir_instr_type_jump:
1108       agx_emit_jump(b, nir_instr_as_jump(instr));
1109       break;
1110 
1111    case nir_instr_type_phi:
1112       agx_emit_phi(b, nir_instr_as_phi(instr));
1113       break;
1114 
1115    default:
1116       unreachable("should've been lowered");
1117    }
1118 }
1119 
1120 static agx_block *
1121 agx_create_block(agx_context *ctx)
1122 {
1123    agx_block *blk = rzalloc(ctx, agx_block);
1124 
1125    util_dynarray_init(&blk->predecessors, blk);
1126 
1127    return blk;
1128 }
1129 
1130 static agx_block *
1131 emit_block(agx_context *ctx, nir_block *block)
1132 {
1133    if (ctx->after_block) {
1134       ctx->current_block = ctx->after_block;
1135       ctx->after_block = NULL;
1136    } else {
1137       ctx->current_block = agx_create_block(ctx);
1138    }
1139 
1140    agx_block *blk = ctx->current_block;
1141    list_addtail(&blk->link, &ctx->blocks);
1142    list_inithead(&blk->instructions);
1143 
1144    ctx->indexed_nir_blocks[block->index] = blk;
1145 
1146    agx_builder _b = agx_init_builder(ctx, agx_after_block(blk));
1147 
1148    nir_foreach_instr(instr, block) {
1149       agx_emit_instr(&_b, instr);
1150    }
1151 
1152    return blk;
1153 }
1154 
1155 static agx_block *
1156 emit_cf_list(agx_context *ctx, struct exec_list *list);
1157 
1158 /* Emit if-else as
1159  *
1160  *    if_icmp cond != 0
1161  *       ...
1162  *    else_icmp cond == 0
1163  *       ...
1164  *    pop_exec
1165  *
1166  * If the else is empty, we can omit the else_icmp. This happens elsewhere, as
1167  * an empty else block can become nonempty after RA due to phi lowering. This is
1168  * not usually optimal, but it's a start.
1169  */
1170 
1171 static void
1172 emit_if(agx_context *ctx, nir_if *nif)
1173 {
1174    agx_block *first_block = ctx->current_block;
1175    agx_builder _b = agx_init_builder(ctx, agx_after_block(first_block));
1176    agx_index cond = agx_src_index(&nif->condition);
1177 
1178    agx_emit_logical_end(&_b);
1179    agx_if_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, true);
1180    ctx->loop_nesting++;
1181 
1182    /* Emit the two subblocks. */
1183    agx_block *if_block = emit_cf_list(ctx, &nif->then_list);
1184    agx_block *end_then = ctx->current_block;
1185 
1186    _b.cursor = agx_after_block(ctx->current_block);
1187    agx_emit_logical_end(&_b);
1188    agx_else_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, false);
1189 
1190    agx_block *else_block = emit_cf_list(ctx, &nif->else_list);
1191    agx_block *end_else = ctx->current_block;
1192 
1193    ctx->after_block = agx_create_block(ctx);
1194 
1195    agx_block_add_successor(first_block, if_block);
1196    agx_block_add_successor(first_block, else_block);
1197    agx_block_add_successor(end_then, ctx->after_block);
1198    agx_block_add_successor(end_else, ctx->after_block);
1199 
1200    _b.cursor = agx_after_block(ctx->current_block);
1201    agx_emit_logical_end(&_b);
1202    agx_pop_exec(&_b, 1);
1203    ctx->loop_nesting--;
1204 }
1205 
1206 static void
1207 emit_loop(agx_context *ctx, nir_loop *nloop)
1208 {
1209    /* We only track nesting within the innermost loop, so push and reset */
1210    unsigned pushed_nesting = ctx->loop_nesting;
1211    ctx->loop_nesting = 0;
1212 
1213    agx_block *popped_break = ctx->break_block;
1214    agx_block *popped_continue = ctx->continue_block;
1215 
1216    ctx->break_block = agx_create_block(ctx);
1217    ctx->continue_block = agx_create_block(ctx);
1218 
1219    /* Make room for break/continue nesting (TODO: skip if no divergent CF) */
1220    agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
1221    agx_emit_logical_end(&_b);
1222    agx_push_exec(&_b, 2);
1223 
1224    /* Fallthrough to body */
1225    agx_block_add_successor(ctx->current_block, ctx->continue_block);
1226 
1227    /* Emit the body */
1228    ctx->after_block = ctx->continue_block;
1229    agx_block *start_block = emit_cf_list(ctx, &nloop->body);
1230 
1231    /* Fix up the nesting counter via an always true while_icmp, and branch back
1232     * to start of loop if any lanes are active */
1233    _b.cursor = agx_after_block(ctx->current_block);
1234    agx_emit_logical_end(&_b);
1235    agx_while_icmp(&_b, agx_zero(), agx_zero(), 2, AGX_ICOND_UEQ, false);
1236    agx_jmp_exec_any(&_b, start_block);
1237    agx_pop_exec(&_b, 2);
1238    agx_block_add_successor(ctx->current_block, ctx->continue_block);
1239 
1240    /* Pop off */
1241    ctx->after_block = ctx->break_block;
1242    ctx->break_block = popped_break;
1243    ctx->continue_block = popped_continue;
1244 
1245    /* Update shader-db stats */
1246    ++ctx->loop_count;
1247 
1248    /* All nested control flow must have finished */
1249    assert(ctx->loop_nesting == 0);
1250 
1251    /* Restore loop nesting (we might be inside an if inside an outer loop) */
1252    ctx->loop_nesting = pushed_nesting;
1253 }
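
/*
 * [Editorial sketch, not part of the original file.] Rough shape of the IR
 * emitted by emit_loop for a loop body B:
 *
 *    p_logical_end; push_exec 2
 * start/continue block:
 *    ...B...
 *    p_logical_end; while_icmp 0 == 0 (n=2); jmp_exec_any back to start;
 *    pop_exec 2
 * break block:
 *    ...
 */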
1254 
1255 /* Before the first control flow structure, the nesting counter (r0l) needs to
1256  * be zeroed for correct operation. This only happens at most once, since by
1257  * definition this occurs at the end of the first block, which dominates the
1258  * rest of the program. */
1259 
1260 static void
1261 emit_first_cf(agx_context *ctx)
1262 {
1263    if (ctx->any_cf)
1264       return;
1265 
1266    agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block));
1267    agx_index r0l = agx_register(0, false);
1268 
1269    agx_mov_to(&_b, r0l, agx_immediate(0));
1270    ctx->any_cf = true;
1271 }
1272 
1273 static agx_block *
1274 emit_cf_list(agx_context *ctx, struct exec_list *list)
1275 {
1276    agx_block *start_block = NULL;
1277 
1278    foreach_list_typed(nir_cf_node, node, node, list) {
1279       switch (node->type) {
1280       case nir_cf_node_block: {
1281          agx_block *block = emit_block(ctx, nir_cf_node_as_block(node));
1282 
1283          if (!start_block)
1284             start_block = block;
1285 
1286          break;
1287       }
1288 
1289       case nir_cf_node_if:
1290          emit_first_cf(ctx);
1291          emit_if(ctx, nir_cf_node_as_if(node));
1292          break;
1293 
1294       case nir_cf_node_loop:
1295          emit_first_cf(ctx);
1296          emit_loop(ctx, nir_cf_node_as_loop(node));
1297          break;
1298 
1299       default:
1300          unreachable("Unknown control flow");
1301       }
1302    }
1303 
1304    return start_block;
1305 }
1306 
1307 static void
1308 agx_set_st_vary_final(agx_context *ctx)
1309 {
1310    agx_foreach_instr_global_rev(ctx, I) {
1311       if (I->op == AGX_OPCODE_ST_VARY) {
1312          I->last = true;
1313          return;
1314       }
1315    }
1316 }
1317 
1318 static void
1319 agx_print_stats(agx_context *ctx, unsigned size, FILE *fp)
1320 {
1321    unsigned nr_ins = 0, max_reg = 0;
1322 
1323    agx_foreach_instr_global(ctx, I) {
1324       /* Count instructions */
1325       nr_ins++;
1326 
1327       /* Count registers */
1328       agx_foreach_dest(I, d) {
1329          if (I->dest[d].type == AGX_INDEX_REGISTER) {
1330             max_reg = MAX2(max_reg,
1331                            I->dest[d].value + agx_write_registers(I, d) - 1);
1332          }
1333       }
1334    }
1335 
1336    /* TODO: Pipe through occupancy */
1337    unsigned nr_threads = 1;
1338 
1339    fprintf(stderr, "%s - %s shader: %u inst, %u bytes, %u halfregs, %u threads, "
1340            "%u loops, %u:%u spills:fills\n",
1341            ctx->nir->info.label ?: "",
1342            gl_shader_stage_name(ctx->stage),
1343            nr_ins, size, max_reg, nr_threads, ctx->loop_count,
1344            ctx->spills, ctx->fills);
1345 }
1346 
1347 static int
1348 glsl_type_size(const struct glsl_type *type, bool bindless)
1349 {
1350    return glsl_count_attribute_slots(type, false);
1351 }
1352 
1353 static bool
1354 agx_lower_sincos_filter(const nir_instr *instr, UNUSED const void *_)
1355 {
1356    if (instr->type != nir_instr_type_alu)
1357       return false;
1358 
1359    nir_alu_instr *alu = nir_instr_as_alu(instr);
1360    return alu->op == nir_op_fsin || alu->op == nir_op_fcos;
1361 }
1362 
1363  * Sine and cosine are implemented via the sin_pt_1 and sin_pt_2 opcodes for the
1364  * heavy lifting. sin_pt_2 implements sinc in the first quadrant, expressed in
1365  * turns (sin (tau x) / x), while sin_pt_1 implements a piecewise sign/offset
1366  * fixup to transform a quadrant angle [0, 4] to [-1, 1]. The NIR opcode
1367  * fsin_agx models the fixup, sinc, and multiply to obtain sine, so we just
1368  * need to change units from radians to quadrants modulo turns. Cosine is
1369  * implemented by shifting by one quadrant: cos(x) = sin(x + tau/4).
1370  */
1371 
1372 static nir_ssa_def *
1373 agx_lower_sincos_impl(struct nir_builder *b, nir_instr *instr, UNUSED void *_)
1374 {
1375    nir_alu_instr *alu = nir_instr_as_alu(instr);
1376    nir_ssa_def *x = nir_mov_alu(b, alu->src[0], 1);
1377    nir_ssa_def *turns = nir_fmul_imm(b, x, M_1_PI * 0.5f);
1378 
1379    if (alu->op == nir_op_fcos)
1380       turns = nir_fadd_imm(b, turns, 0.25f);
1381 
1382    nir_ssa_def *quadrants = nir_fmul_imm(b, nir_ffract(b, turns), 4.0);
1383    return nir_fsin_agx(b, quadrants);
1384 }
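
/*
 * [Editorial worked example, not part of the original file.] M_1_PI * 0.5 is
 * 1/(2*pi), so the first fmul converts radians to turns; for cosine, adding
 * 0.25 turns applies the identity cos(x) = sin(x + tau/4). For x = pi:
 * turns = 0.5, quadrants = fract(0.5) * 4 = 2, and fsin_agx(2) corresponds
 * to sin(pi) = 0.
 */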
1385 
1386 static bool
1387 agx_lower_sincos(nir_shader *shader)
1388 {
1389    return nir_shader_lower_instructions(shader,
1390          agx_lower_sincos_filter, agx_lower_sincos_impl, NULL);
1391 }
1392 
1393 static bool
1394 agx_lower_front_face(struct nir_builder *b,
1395                      nir_instr *instr, UNUSED void *data)
1396 {
1397    if (instr->type != nir_instr_type_intrinsic)
1398       return false;
1399 
1400    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1401    if (intr->intrinsic != nir_intrinsic_load_front_face)
1402       return false;
1403 
1404    assert(intr->dest.is_ssa);
1405    nir_ssa_def *def = &intr->dest.ssa;
1406    assert(def->bit_size == 1);
1407 
1408    b->cursor = nir_before_instr(&intr->instr);
1409    nir_ssa_def_rewrite_uses(def, nir_inot(b, nir_load_back_face_agx(b, 1)));
1410    return true;
1411 }
1412 
1413 static bool
1414 agx_lower_aligned_offsets(struct nir_builder *b,
1415                           nir_instr *instr, UNUSED void *data)
1416 {
1417    if (instr->type != nir_instr_type_intrinsic)
1418       return false;
1419 
1420    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1421    if (intr->intrinsic != nir_intrinsic_load_ubo)
1422       return false;
1423 
1424    b->cursor = nir_before_instr(&intr->instr);
1425 
1426    unsigned bytes = nir_dest_bit_size(intr->dest) / 8;
1427    assert(util_is_power_of_two_or_zero(bytes) && bytes != 0);
1428 
1429    nir_src *offset = &intr->src[1];
1430 
1431    unsigned shift = util_logbase2(bytes);
1432 
1433    nir_ssa_def *old = nir_ssa_for_src(b, *offset, 1);
1434    nir_ssa_def *new = nir_ishr_imm(b, old, shift);
1435 
1436    nir_instr_rewrite_src_ssa(instr, offset, new);
1437    return true;
1438 }
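
/*
 * [Editorial worked example, not part of the original file.] For a 32-bit UBO
 * load (bytes = 4, shift = 2), a byte offset of 16 becomes element offset
 * 16 >> 2 = 4, presumably because the hardware load addressed by
 * agx_emit_load_ubo scales its offset source by the format's element size.
 */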
1439 
1440 static void
1441 agx_optimize_nir(nir_shader *nir)
1442 {
1443    bool progress;
1444 
1445    nir_lower_idiv_options idiv_options = {
1446       .imprecise_32bit_lowering = true,
1447       .allow_fp16 = true,
1448    };
1449 
1450    NIR_PASS_V(nir, nir_lower_regs_to_ssa);
1451    NIR_PASS_V(nir, nir_lower_int64);
1452    NIR_PASS_V(nir, nir_lower_idiv, &idiv_options);
1453    NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
1454    NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
1455    NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false);
1456    NIR_PASS_V(nir, agx_lower_sincos);
1457    NIR_PASS_V(nir, nir_shader_instructions_pass,
1458          agx_lower_front_face,
1459          nir_metadata_block_index | nir_metadata_dominance, NULL);
1460 
   do {
      progress = false;

      NIR_PASS(progress, nir, nir_lower_var_copies);
      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);

      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_remove_phis);
      NIR_PASS(progress, nir, nir_lower_phis_to_scalar, true);
      NIR_PASS(progress, nir, nir_opt_dce);
      NIR_PASS(progress, nir, nir_opt_dead_cf);
      NIR_PASS(progress, nir, nir_opt_cse);
      NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_constant_folding);

      NIR_PASS(progress, nir, nir_opt_undef);
      NIR_PASS(progress, nir, nir_lower_undef_to_zero);

      NIR_PASS(progress, nir, nir_opt_loop_unroll);
   } while (progress);

   NIR_PASS_V(nir, nir_opt_algebraic_late);
   NIR_PASS_V(nir, nir_opt_constant_folding);
   NIR_PASS_V(nir, nir_copy_prop);
   NIR_PASS_V(nir, nir_opt_dce);
   NIR_PASS_V(nir, nir_opt_cse);
   NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);

   /* Cleanup optimizations */
   nir_move_options move_all =
      nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
      nir_move_comparisons | nir_move_copies | nir_move_load_ssbo;

   NIR_PASS_V(nir, nir_opt_sink, move_all);
   NIR_PASS_V(nir, nir_opt_move, move_all);
   NIR_PASS_V(nir, nir_lower_phis_to_scalar, true);
}

/* ABI: position first, then user, then psiz */
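/*
 * Worked example (hypothetical shader, not taken from this file): for a
 * vertex shader writing POS, one user varying and PSIZ, the remapping below
 * assigns POS -> slots 0-3, the user varying -> slots 4-7 and PSIZ -> slot 8,
 * for nr_slots = 9.
 */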
static void
agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings *varyings,
                      unsigned *remap)
{
   unsigned base = 0;

   nir_variable *pos = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_POS);
   if (pos) {
      assert(pos->data.driver_location < AGX_MAX_VARYINGS);
      remap[pos->data.driver_location] = base;
      base += 4;
   }

   nir_foreach_shader_out_variable(var, nir) {
      unsigned loc = var->data.location;

      if (loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ) {
         continue;
      }

      assert(var->data.driver_location < AGX_MAX_VARYINGS);
      remap[var->data.driver_location] = base;
      base += 4;
   }

   nir_variable *psiz = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_PSIZ);
   if (psiz) {
      assert(psiz->data.driver_location < AGX_MAX_VARYINGS);
      remap[psiz->data.driver_location] = base;
      base += 1;
   }

   varyings->nr_slots = base;
}

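/*
 * Lay out fragment shader inputs: descriptors 0 and 1 are always the fragment
 * coordinate's W and Z, and user varyings are packed after them, one
 * descriptor per attribute slot, advancing `base` by the number of components
 * read.
 */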
static void
agx_remap_varyings_fs(nir_shader *nir, struct agx_varyings *varyings,
                      unsigned *remap)
{
   struct agx_varying_packed *packed = varyings->packed;
   unsigned base = 0;

   agx_pack(packed, VARYING, cfg) {
      cfg.type = AGX_VARYING_TYPE_FRAGCOORD_W;
      cfg.components = 1;
      cfg.triangle_slot = cfg.point_slot = base;
   }

   base++;
   packed++;

   agx_pack(packed, VARYING, cfg) {
      cfg.type = AGX_VARYING_TYPE_FRAGCOORD_Z;
      cfg.components = 1;
      cfg.triangle_slot = cfg.point_slot = base;
   }

   base++;
   packed++;

   unsigned comps[MAX_VARYING] = { 0 };

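   /* First pass: record how many components each driver_location actually
    * needs, including location_frac, so the descriptors written below are
    * wide enough.
    */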
   nir_foreach_shader_in_variable(var, nir) {
      unsigned loc = var->data.driver_location;
      const struct glsl_type *column =
         glsl_without_array_or_matrix(var->type);
      unsigned chan = glsl_get_components(column);

      /* If we have a fractional location added, we need to increase the size
       * so it will fit, i.e. a vec3 in YZW requires us to allocate a vec4.
       * We could do better but this is an edge case as it is, normally
       * packed varyings will be aligned.
       */
      chan += var->data.location_frac;
      comps[loc] = MAX2(comps[loc], chan);
   }

   nir_foreach_shader_in_variable(var, nir) {
      unsigned loc = var->data.driver_location;
      unsigned sz = glsl_count_attribute_slots(var->type, FALSE);
      unsigned channels = comps[loc];

      assert(var->data.driver_location <= AGX_MAX_VARYINGS);
      remap[var->data.driver_location] = base;

      for (int c = 0; c < sz; ++c) {
         agx_pack(packed, VARYING, cfg) {
            cfg.type = (var->data.location == VARYING_SLOT_PNTC) ?
               AGX_VARYING_TYPE_POINT_COORDINATES :
               (var->data.interpolation == INTERP_MODE_FLAT) ?
                  AGX_VARYING_TYPE_FLAT_LAST :
                  AGX_VARYING_TYPE_SMOOTH;

            cfg.components = channels;
            cfg.triangle_slot = cfg.point_slot = base;
         }

         base += channels;
         packed++;
      }
   }

   varyings->nr_descs = (packed - varyings->packed);
   varyings->nr_slots = base;
}

/*
 * Build a bit mask of varyings (by location) that are flatshaded. This
 * information is needed by lower_mediump_io.
 */
static uint64_t
agx_flat_varying_mask(nir_shader *nir)
{
   uint64_t mask = 0;

   assert(nir->info.stage == MESA_SHADER_FRAGMENT);

   nir_foreach_shader_in_variable(var, nir) {
      if (var->data.interpolation == INTERP_MODE_FLAT)
         mask |= BITFIELD64_BIT(var->data.location);
   }

   return mask;
}

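/*
 * Compile a NIR shader to an AGX binary: lower and optimize the NIR, remap
 * varyings for the hardware ABI, translate to AIR, optimize and register
 * allocate, then pack the machine code into `binary`.
 */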
void
agx_compile_shader_nir(nir_shader *nir,
      struct agx_shader_key *key,
      struct util_dynarray *binary,
      struct agx_shader_info *out)
{
   agx_debug = debug_get_option_agx_debug();

   agx_context *ctx = rzalloc(NULL, agx_context);
   ctx->nir = nir;
   ctx->out = out;
   ctx->key = key;
   ctx->stage = nir->info.stage;
   list_inithead(&ctx->blocks);

   if (ctx->stage == MESA_SHADER_VERTEX) {
      out->writes_psiz = nir->info.outputs_written &
         BITFIELD_BIT(VARYING_SLOT_PSIZ);
   }

   NIR_PASS_V(nir, nir_lower_vars_to_ssa);

   /* Lower large arrays to scratch and small arrays to csel */
   NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
         glsl_get_natural_size_align_bytes);
   NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);

   if (ctx->stage == MESA_SHADER_VERTEX) {
      /* Lower from OpenGL [-1, 1] to [0, 1] if half-z is not set */
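      /* (nir_lower_clip_halfz effectively rewrites z to (z + w) / 2, mapping
       * NDC depth from [-1, 1] onto [0, 1].)
       */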
      if (!key->vs.clip_halfz)
         NIR_PASS_V(nir, nir_lower_clip_halfz);
   }

   NIR_PASS_V(nir, nir_split_var_copies);
   NIR_PASS_V(nir, nir_lower_global_vars_to_local);
   NIR_PASS_V(nir, nir_lower_var_copies);
   NIR_PASS_V(nir, nir_lower_vars_to_ssa);
   NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
         glsl_type_size, 0);
   if (ctx->stage == MESA_SHADER_FRAGMENT) {
      /* Interpolate varyings at fp16 and write to the tilebuffer at fp16. As an
       * exception, interpolate flat shaded at fp32. This works around a
       * hardware limitation. The resulting code (with an extra f2f16 at the end
       * if needed) matches what Metal produces.
       */
      NIR_PASS_V(nir, nir_lower_mediump_io,
            nir_var_shader_in | nir_var_shader_out,
            ~agx_flat_varying_mask(nir), false);
   }
   NIR_PASS_V(nir, nir_shader_instructions_pass,
         agx_lower_aligned_offsets,
         nir_metadata_block_index | nir_metadata_dominance, NULL);

   NIR_PASS_V(nir, nir_lower_ssbo);

   /* Varying output is scalar, other I/O is vector */
   if (ctx->stage == MESA_SHADER_VERTEX) {
      NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out);
   }

   nir_lower_tex_options lower_tex_options = {
      .lower_txs_lod = true,
      .lower_txp = ~0,
      .lower_invalid_implicit_lod = true,
   };

   nir_tex_src_type_constraints tex_constraints = {
      [nir_tex_src_lod] = { true, 16 },
      [nir_tex_src_bias] = { true, 16 },
   };

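   /* For reference (summarizing the NIR options above, not driver-specific
    * behaviour): lower_txs_lod computes textureSize() at a nonzero LOD from
    * the base level size, lower_txp divides coordinates through by the
    * projector, and lower_invalid_implicit_lod gives implicit-LOD sampling
    * outside fragment shaders an explicit LOD of zero. The constraints ask
    * nir_legalize_16bit_sampler_srcs to convert LOD and bias sources to
    * 16-bit.
    */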
   NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);
   NIR_PASS_V(nir, nir_legalize_16bit_sampler_srcs, tex_constraints);

   agx_optimize_nir(nir);

   /* Implement conditional discard with real control flow like Metal */
   NIR_PASS_V(nir, nir_lower_discard_if);

   /* Must be last since NIR passes can remap driver_location freely */
   if (ctx->stage == MESA_SHADER_VERTEX) {
      agx_remap_varyings_vs(nir, &out->varyings, ctx->varyings);
   } else if (ctx->stage == MESA_SHADER_FRAGMENT) {
      agx_remap_varyings_fs(nir, &out->varyings, ctx->varyings);
   }

   bool skip_internal = nir->info.internal;
   skip_internal &= !(agx_debug & AGX_DBG_INTERNAL);

   if (agx_debug & AGX_DBG_SHADERS && !skip_internal) {
      nir_print_shader(nir, stdout);
   }

   ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);

   nir_foreach_function(func, nir) {
      if (!func->impl)
         continue;

      nir_index_blocks(func->impl);

      ctx->indexed_nir_blocks =
         rzalloc_array(ctx, agx_block *, func->impl->num_blocks);

      ctx->alloc += func->impl->ssa_alloc;
      emit_cf_list(ctx, &func->impl->body);
      agx_emit_phis_deferred(ctx);
      break; /* TODO: Multi-function shaders */
   }

   /* Terminate the shader after the exit block */
   agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link);
   agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block));
   agx_stop(&_b);

   /* Also add traps to match the blob; their exact function is unclear */
   for (unsigned i = 0; i < 8; ++i)
      agx_trap(&_b);

   /* Index blocks now that we're done emitting so the order is consistent */
   agx_foreach_block(ctx, block)
      block->index = ctx->num_blocks++;

   agx_validate(ctx, "IR translation");

   if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
      agx_print_shader(ctx, stdout);

   agx_optimizer(ctx);
   agx_dce(ctx);
   agx_validate(ctx, "Optimization");

   if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
      agx_print_shader(ctx, stdout);

   agx_ra(ctx);

   if (ctx->stage == MESA_SHADER_VERTEX)
      agx_set_st_vary_final(ctx);

   if (agx_debug & AGX_DBG_SHADERS && !skip_internal)
      agx_print_shader(ctx, stdout);

   agx_lower_pseudo(ctx);

   agx_pack_binary(ctx, binary);

   if ((agx_debug & AGX_DBG_SHADERDB) && !skip_internal)
      agx_print_stats(ctx, binary->size, stderr);

   ralloc_free(ctx);
}