/*
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2024 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "gallium/include/pipe/p_defines.h"
#include "util/format/u_formats.h"
#include "agx_abi.h"
#include "agx_linker.h"
#include "agx_nir.h"
#include "agx_nir_lower_gs.h"
#include "agx_nir_lower_vbo.h"
#include "agx_pack.h"
#include "agx_tilebuffer.h"
#include "libagx.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_builder_opcodes.h"
#include "nir_lower_blend.h"
#include "shader_enums.h"

/*
 * Insert code into a fragment shader to lower polygon stipple. The stipple is
 * passed in a sideband, rather than requiring a texture binding. This is
 * simpler for drivers to integrate and might be more efficient.
 */
static bool
agx_nir_lower_poly_stipple(nir_shader *s)
{
   assert(s->info.stage == MESA_SHADER_FRAGMENT);

   /* Insert at the beginning for performance. */
   nir_builder b_ =
      nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(s)));
   nir_builder *b = &b_;

   /* The stipple coordinate is defined as the window coordinate mod 32. It's
    * reversed along the X-axis to simplify the driver, hence the NOT.
    */
   nir_def *raw = nir_u2u32(b, nir_load_pixel_coord(b));
   nir_def *coord = nir_umod_imm(
      b,
      nir_vec2(b, nir_inot(b, nir_channel(b, raw, 0)), nir_channel(b, raw, 1)),
      32);

   /* Extract the column from the packed bitfield */
   nir_def *pattern = nir_load_polygon_stipple_agx(b, nir_channel(b, coord, 1));
   nir_def *bit = nir_ubitfield_extract(b, pattern, nir_channel(b, coord, 0),
                                        nir_imm_int(b, 1));

   /* Discard fragments where the pattern is 0 */
   nir_demote_if(b, nir_ieq_imm(b, bit, 0));
   s->info.fs.uses_discard = true;

   nir_metadata_preserve(b->impl, nir_metadata_control_flow);
   return true;
}

static bool
lower_vbo(nir_shader *s, const struct agx_velem_key *key,
          const struct agx_robustness rs)
{
   struct agx_attribute out[AGX_MAX_VBUFS];

   for (unsigned i = 0; i < AGX_MAX_VBUFS; ++i) {
      out[i] = (struct agx_attribute){
         .divisor = key[i].divisor,
         .stride = key[i].stride,
         .format = key[i].format,
         .instanced = key[i].instanced,
      };
   }

   return agx_nir_lower_vbo(s, out, rs);
}

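/*
 * Map a sideband intrinsic in the VS prolog to its slot in the prolog's
 * uniform layout: per-attribute VBO bases and clamps come first, followed by
 * the first vertex, base instance, and input assembly buffer. Returns -1 if
 * the intrinsic is not handled here. For example, with nr_attribs = 2 the
 * VBO bases occupy slots 0-7, the clamps slots 8-11, first_vertex starts at
 * slot 12, base_instance at slot 14, and the input assembly buffer at
 * slot 20.
 */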
static int
map_vs_part_uniform(nir_intrinsic_instr *intr, unsigned nr_attribs)
{
   switch (intr->intrinsic) {
   case nir_intrinsic_load_vbo_base_agx:
      return 4 * nir_src_as_uint(intr->src[0]);
   case nir_intrinsic_load_attrib_clamp_agx:
      return (4 * nr_attribs) + (2 * nir_src_as_uint(intr->src[0]));
   case nir_intrinsic_load_first_vertex:
      return (6 * nr_attribs);
   case nir_intrinsic_load_base_instance:
      return (6 * nr_attribs) + 2;
   case nir_intrinsic_load_input_assembly_buffer_agx:
      return (6 * nr_attribs) + 8;
   default:
      return -1;
   }
}

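/*
 * Map a sideband intrinsic in a fragment shader part to its uniform slot.
 * Only the blend constant colour components are pushed this way; anything
 * else returns -1 and is handled elsewhere.
 */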
static int
map_fs_part_uniform(nir_intrinsic_instr *intr)
{
   switch (intr->intrinsic) {
   case nir_intrinsic_load_blend_const_color_r_float:
      return 4;
   case nir_intrinsic_load_blend_const_color_g_float:
      return 6;
   case nir_intrinsic_load_blend_const_color_b_float:
      return 8;
   case nir_intrinsic_load_blend_const_color_a_float:
      return 10;
   default:
      return -1;
   }
}

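/*
 * Lower sideband intrinsics in non-monolithic shader parts (prologs and
 * epilogs) to preamble loads at the fixed slots mapped above. Bindless
 * texture handles are also rewritten here to a (base, offset) vec2 whose
 * offset is scaled by the texture descriptor size.
 */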
static bool
lower_non_monolithic_uniforms(nir_builder *b, nir_intrinsic_instr *intr,
                              void *data)
{
   int unif;
   if (b->shader->info.stage == MESA_SHADER_VERTEX) {
      unsigned *nr_attribs = data;
      unif = map_vs_part_uniform(intr, *nr_attribs);
   } else {
      unif = map_fs_part_uniform(intr);
   }

   if (unif >= 0) {
      b->cursor = nir_instr_remove(&intr->instr);
      nir_def *load = nir_load_preamble(b, 1, intr->def.bit_size, .base = unif);
      nir_def_rewrite_uses(&intr->def, load);
      return true;
   } else if (intr->intrinsic == nir_intrinsic_load_texture_handle_agx) {
      b->cursor = nir_instr_remove(&intr->instr);
      nir_def *offs =
         nir_imul_imm(b, nir_u2u32(b, intr->src[0].ssa), AGX_TEXTURE_LENGTH);
      nir_def_rewrite_uses(&intr->def, nir_vec2(b, nir_imm_int(b, 0), offs));
      return true;
   } else {
      return false;
   }
}

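/*
 * Remap vertex IDs for software-emulated adjacency primitives: each topology
 * maps the linear ID to the source vertex an adjacency-aware pipeline would
 * fetch, and the result is then translated through the software index buffer.
 */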
static bool
lower_adjacency(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   const struct agx_vs_prolog_key *key = data;
   b->cursor = nir_before_instr(&intr->instr);

   if (intr->intrinsic != nir_intrinsic_load_vertex_id)
      return false;

   nir_def *id = nir_load_vertex_id(b);

   if (key->adjacency == MESA_PRIM_LINES_ADJACENCY) {
      id = libagx_map_to_line_adj(b, id);
   } else if (key->adjacency == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) {
      id = libagx_map_to_tri_strip_adj(b, id);
   } else if (key->adjacency == MESA_PRIM_LINE_STRIP_ADJACENCY) {
      id = libagx_map_to_line_strip_adj(b, id);
   } else if (key->adjacency == MESA_PRIM_TRIANGLES_ADJACENCY) {
      /* Sequence (0, 2, 4), (6, 8, 10), ... */
      id = nir_imul_imm(b, id, 2);
   } else {
      unreachable("unknown");
   }

   id = agx_nir_load_vertex_id(b, id, key->sw_index_size_B);

   nir_def_replace(&intr->def, id);
   return true;
}

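/*
 * Build a vertex shader prolog from its key: fetch every attribute component
 * the API shader reads and export it, along with the vertex and instance IDs,
 * in the registers the main shader expects, lowering the fetches according to
 * the bound vertex buffers, robustness mode, and index handling.
 */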
void
agx_nir_vs_prolog(nir_builder *b, const void *key_)
{
   const struct agx_vs_prolog_key *key = key_;
   b->shader->info.stage = MESA_SHADER_VERTEX;
   b->shader->info.name = "VS prolog";

   /* First, construct a passthrough shader reading each attribute and exporting
    * the value. We also need to export vertex/instance ID in their usual regs.
    */
   unsigned i = 0;
   nir_def *vec = NULL;
   unsigned vec_idx = ~0;
   BITSET_FOREACH_SET(i, key->component_mask, AGX_MAX_ATTRIBS * 4) {
      unsigned a = i / 4;
      unsigned c = i % 4;

      if (vec_idx != a) {
         vec = nir_load_input(b, 4, 32, nir_imm_int(b, 0), .base = a);
         vec_idx = a;
      }

      nir_export_agx(b, nir_channel(b, vec, c), .base = AGX_ABI_VIN_ATTRIB(i));
   }

   nir_export_agx(b, nir_load_vertex_id(b), .base = AGX_ABI_VIN_VERTEX_ID);
   nir_export_agx(b, nir_load_instance_id(b), .base = AGX_ABI_VIN_INSTANCE_ID);

   /* Now lower the resulting program using the key */
   lower_vbo(b->shader, key->attribs, key->robustness);

   /* Clean up redundant vertex ID loads */
   if (!key->hw || key->adjacency) {
      NIR_PASS(_, b->shader, nir_opt_cse);
      NIR_PASS(_, b->shader, nir_opt_dce);
   }

   if (!key->hw) {
      agx_nir_lower_sw_vs(b->shader, key->sw_index_size_B);
   } else if (key->adjacency) {
      nir_shader_intrinsics_pass(b->shader, lower_adjacency,
                                 nir_metadata_control_flow, (void *)key);
   }

   /* Finally, lower uniforms according to our ABI */
   unsigned nr = DIV_ROUND_UP(BITSET_LAST_BIT(key->component_mask), 4);
   nir_shader_intrinsics_pass(b->shader, lower_non_monolithic_uniforms,
                              nir_metadata_control_flow, &nr);
   b->shader->info.io_lowered = true;
}

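/*
 * Rewrite a vertex shader's load_input intrinsics into reads of the registers
 * exported by the VS prolog, recording which attribute components are
 * actually read so the prolog only fetches what it needs.
 */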
static bool
lower_input_to_prolog(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   if (intr->intrinsic != nir_intrinsic_load_input)
      return false;

   unsigned idx = nir_src_as_uint(intr->src[0]) + nir_intrinsic_base(intr);
   unsigned comp = nir_intrinsic_component(intr);

   assert(intr->def.bit_size == 32 && "todo: push conversions up?");
   unsigned base = 4 * idx + comp;

   b->cursor = nir_before_instr(&intr->instr);
   nir_def *val =
      nir_load_exported_agx(b, intr->def.num_components, intr->def.bit_size,
                            .base = AGX_ABI_VIN_ATTRIB(base));

   BITSET_WORD *comps_read = data;
   nir_component_mask_t mask = nir_def_components_read(&intr->def);

   u_foreach_bit(c, mask) {
      BITSET_SET(comps_read, base + c);
   }

   nir_def_replace(&intr->def, val);
   return true;
}

bool
agx_nir_lower_vs_input_to_prolog(nir_shader *s,
                                 BITSET_WORD *attrib_components_read)
{
   return nir_shader_intrinsics_pass(s, lower_input_to_prolog,
                                     nir_metadata_control_flow,
                                     attrib_components_read);
}

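/*
 * Replace load_active_samples_agx with a read of the sample mask kept live in
 * the AGX_ABI_FIN_SAMPLE_MASK register.
 */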
static bool
lower_active_samples_to_register(nir_builder *b, nir_intrinsic_instr *intr,
                                 void *data)
{
   if (intr->intrinsic != nir_intrinsic_load_active_samples_agx)
      return false;

   b->cursor = nir_before_instr(&intr->instr);
   nir_def *id =
      nir_load_exported_agx(b, 1, 16, .base = AGX_ABI_FIN_SAMPLE_MASK);

   nir_def_replace(&intr->def, id);
   return true;
}

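/*
 * Resolve the load_shader_part_tests_zs_agx sideband to a constant: 0xFF when
 * this shader part is responsible for running its own depth/stencil tests,
 * 0 otherwise. Only relevant for shaders that can discard.
 */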
static bool
lower_tests_zs_intr(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   bool *value = data;
   if (intr->intrinsic != nir_intrinsic_load_shader_part_tests_zs_agx)
      return false;

   b->cursor = nir_instr_remove(&intr->instr);
   nir_def_rewrite_uses(&intr->def, nir_imm_intN_t(b, *value ? 0xFF : 0, 16));
   return true;
}

static bool
lower_tests_zs(nir_shader *s, bool value)
{
   if (!s->info.fs.uses_discard)
      return false;

   return nir_shader_intrinsics_pass(s, lower_tests_zs_intr,
                                     nir_metadata_control_flow, &value);
}

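/* Check whether any blend factor for this render target reads the second
 * (dual-source) colour output.
 */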
static inline bool
blend_uses_2src(struct agx_blend_rt_key rt)
{
   enum pipe_blendfactor factors[] = {
      rt.rgb_src_factor,
      rt.rgb_dst_factor,
      rt.alpha_src_factor,
      rt.alpha_dst_factor,
   };

   for (unsigned i = 0; i < ARRAY_SIZE(factors); ++i) {
      switch (factors[i]) {
      case PIPE_BLENDFACTOR_SRC1_COLOR:
      case PIPE_BLENDFACTOR_SRC1_ALPHA:
      case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
      case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
         return true;
      default:
         break;
      }
   }

   return false;
}

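/*
 * Forward one colour output from the API fragment shader to a render target
 * in the epilog: read the value from the export registers written by the main
 * shader, re-insert the constant 1.0 in .w if the linker recorded that
 * optimization, and emit the store_output that blend lowering will consume.
 */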
static void
copy_colour(nir_builder *b, const struct agx_fs_epilog_key *key,
            unsigned out_rt, unsigned in_loc, bool dual_src)
{
   unsigned size = (key->link.size_32 & BITFIELD_BIT(in_loc)) ? 32 : 16;

   nir_def *value =
      nir_load_exported_agx(b, 4, size, .base = AGX_ABI_FOUT_COLOUR(in_loc));

   if (key->link.loc0_w_1 && in_loc == 0) {
      value =
         nir_vector_insert_imm(b, value, nir_imm_floatN_t(b, 1.0, size), 3);
   }

   nir_store_output(b, value, nir_imm_int(b, 0),
                    .io_semantics.location = FRAG_RESULT_DATA0 + out_rt,
                    .io_semantics.dual_source_blend_index = dual_src,
                    .src_type = nir_type_float | size);
}

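/*
 * Build a fragment shader epilog from its key: copy each exported colour to
 * its remapped render target, then lower blending, tilebuffer stores,
 * depth/stencil writes, and multisampling according to the key.
 */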
void
agx_nir_fs_epilog(nir_builder *b, const void *key_)
{
   const struct agx_fs_epilog_key *key = key_;
   b->shader->info.stage = MESA_SHADER_FRAGMENT;
   b->shader->info.name = "FS epilog";

   /* First, construct a passthrough shader reading each colour and outputting
    * the value.
    */
   for (unsigned rt = 0; rt < ARRAY_SIZE(key->remap); ++rt) {
      int location = key->remap[rt];

      /* Negative remaps indicate the attachment isn't written. */
      if (location >= 0 && key->link.loc_written & BITFIELD_BIT(location)) {
         copy_colour(b, key, rt, location, false);

         /* If this render target uses dual source blending, also copy the dual
          * source colour. While the copy_colour above is needed even for
          * missing attachments to handle alpha-to-coverage, this copy is only
          * for blending so should be suppressed for missing attachments to keep
          * the assert from blowing up on OpenGL.
          */
         if (blend_uses_2src(key->blend.rt[rt]) &&
             key->rt_formats[rt] != PIPE_FORMAT_NONE) {

            assert(location == 0);
            copy_colour(b, key, rt, 1, true);
         }
      }
   }

   /* Grab registers early, this has to happen in the first block. */
   nir_def *sample_id = NULL, *write_samples = NULL;
   if (key->link.sample_shading) {
      sample_id =
         nir_load_exported_agx(b, 1, 16, .base = AGX_ABI_FOUT_SAMPLE_MASK);
   }

   if (key->link.sample_mask_after_force_early) {
      write_samples =
         nir_load_exported_agx(b, 1, 16, .base = AGX_ABI_FOUT_WRITE_SAMPLES);
   }

   /* Now lower the resulting program using the key */
   struct agx_tilebuffer_layout tib = agx_build_tilebuffer_layout(
      key->rt_formats, ARRAY_SIZE(key->rt_formats), key->nr_samples, true);

   if (key->force_small_tile)
      tib.tile_size = (struct agx_tile_size){16, 16};

   bool force_translucent = false;
   nir_lower_blend_options opts = {
      .scalar_blend_const = true,
      .logicop_enable = key->blend.logicop_func != PIPE_LOGICOP_COPY,
      .logicop_func = key->blend.logicop_func,
   };

   static_assert(ARRAY_SIZE(opts.format) == 8, "max RTs out of sync");

   for (unsigned i = 0; i < 8; ++i) {
      opts.format[i] = key->rt_formats[i];
      opts.rt[i] = (nir_lower_blend_rt){
         .rgb.src_factor = key->blend.rt[i].rgb_src_factor,
         .rgb.dst_factor = key->blend.rt[i].rgb_dst_factor,
         .rgb.func = key->blend.rt[i].rgb_func,

         .alpha.src_factor = key->blend.rt[i].alpha_src_factor,
         .alpha.dst_factor = key->blend.rt[i].alpha_dst_factor,
         .alpha.func = key->blend.rt[i].alpha_func,

         .colormask = key->blend.rt[i].colormask,
      };
   }

   /* It's more efficient to use masked stores (with
    * agx_nir_lower_tilebuffer) than to emulate colour masking with
    * nir_lower_blend.
    */
   uint8_t colormasks[8] = {0};

   for (unsigned i = 0; i < 8; ++i) {
      if (key->rt_formats[i] == PIPE_FORMAT_NONE)
         continue;

      /* TODO: Flakes some dEQPs, seems to invoke UB. Revisit later.
       * dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.77
       * dEQP-GLES2.functional.fragment_ops.interaction.basic_shader.98
       */
      if (0 /* agx_tilebuffer_supports_mask(&tib, i) */) {
         colormasks[i] = key->blend.rt[i].colormask;
         opts.rt[i].colormask = (uint8_t)BITFIELD_MASK(4);
      } else {
         colormasks[i] = (uint8_t)BITFIELD_MASK(4);
      }

      /* If not all bound RTs are fully written to, we need to force the
       * translucent pass type. agx_nir_lower_tilebuffer will take care of
       * this for its own colormasks input.
       */
      unsigned comps = util_format_get_nr_components(key->rt_formats[i]);
      if ((opts.rt[i].colormask & BITFIELD_MASK(comps)) !=
          BITFIELD_MASK(comps)) {
         force_translucent = true;
      }
   }

   /* Alpha-to-coverage must be lowered before alpha-to-one */
   if (key->blend.alpha_to_coverage)
      NIR_PASS(_, b->shader, agx_nir_lower_alpha_to_coverage, tib.nr_samples);

   /* Depth/stencil writes must be deferred until after all discards,
    * particularly alpha-to-coverage.
    */
   if (key->link.write_z || key->link.write_s) {
      nir_store_zs_agx(
         b, nir_imm_intN_t(b, 0xFF, 16),
         nir_load_exported_agx(b, 1, 32, .base = AGX_ABI_FOUT_Z),
         nir_load_exported_agx(b, 1, 16, .base = AGX_ABI_FOUT_S),
         .base = (key->link.write_z ? 1 : 0) | (key->link.write_s ? 2 : 0));

      if (key->link.write_z)
         b->shader->info.outputs_written |= BITFIELD64_BIT(FRAG_RESULT_DEPTH);

      if (key->link.write_s)
         b->shader->info.outputs_written |= BITFIELD64_BIT(FRAG_RESULT_STENCIL);
   }

   /* Alpha-to-one must be lowered before blending */
   if (key->blend.alpha_to_one)
      NIR_PASS(_, b->shader, agx_nir_lower_alpha_to_one);

   NIR_PASS(_, b->shader, nir_lower_blend, &opts);

   unsigned rt_spill = key->link.rt_spill_base;
   NIR_PASS(_, b->shader, agx_nir_lower_tilebuffer, &tib, colormasks, &rt_spill,
            write_samples, &force_translucent);
   NIR_PASS(_, b->shader, agx_nir_lower_texture);
   NIR_PASS(_, b->shader, agx_nir_lower_multisampled_image_store);

   /* If the API shader runs once per sample, then the epilog runs once per
    * sample as well, so we need to lower our code to run for a single sample.
    *
    * If the API shader runs once per pixel, then the epilog runs once per
    * pixel. So we run through the monolithic MSAA lowering, which wraps the
    * epilog in the sample loop if needed. This localizes sample shading to
    * the epilog when the API shader does not use it but per-sample blending
    * requires it.
    */
   if (key->link.sample_shading) {
      /* Lower the resulting discards. Done in agx_nir_lower_monolithic_msaa for
       * the pixel shaded path. Must be done before agx_nir_lower_to_per_sample
       * to avoid duplicating tests.
       */
      if (key->blend.alpha_to_coverage) {
         NIR_PASS(_, b->shader, agx_nir_lower_sample_mask);
      }

      NIR_PASS(_, b->shader, agx_nir_lower_to_per_sample);
      NIR_PASS(_, b->shader, agx_nir_lower_fs_active_samples_to_register);

      /* Ensure the sample ID is preserved in register. We do this late since it
       * has to go in the last block, and the above passes might add control
       * flow when lowering.
       */
      b->cursor = nir_after_impl(b->impl);
      nir_export_agx(b, sample_id, .base = AGX_ABI_FIN_SAMPLE_MASK);
   } else {
      NIR_PASS(_, b->shader, agx_nir_lower_monolithic_msaa, key->nr_samples);
   }

   /* Finally, lower uniforms according to our ABI */
   nir_shader_intrinsics_pass(b->shader, lower_non_monolithic_uniforms,
                              nir_metadata_control_flow, NULL);

   /* There is no shader part after the epilog, so we're always responsible for
    * running our own tests, unless the fragment shader forced early tests.
    */
   NIR_PASS(_, b->shader, lower_tests_zs, !key->link.already_ran_zs);

   b->shader->info.io_lowered = true;
   b->shader->info.fs.uses_fbfetch_output |= force_translucent;
   b->shader->info.fs.uses_sample_shading = key->link.sample_shading;
}

struct lower_epilog_ctx {
   struct agx_fs_epilog_link_info *info;
   nir_variable *masked_samples;
};

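/*
 * Rewrite the outputs of an API fragment shader (colours, depth/stencil, and
 * discards under forced early tests) into exports consumed by the FS epilog,
 * recording what was written in the link info used to build the epilog key.
 */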
static bool
lower_output_to_epilog(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   struct lower_epilog_ctx *ctx = data;
   struct agx_fs_epilog_link_info *info = ctx->info;

   if (intr->intrinsic == nir_intrinsic_store_zs_agx) {
      assert(nir_src_as_uint(intr->src[0]) == 0xff && "msaa not yet lowered");
      b->cursor = nir_instr_remove(&intr->instr);

      unsigned base = nir_intrinsic_base(intr);
      info->write_z = !!(base & 1);
      info->write_s = !!(base & 2);

      if (info->write_z)
         nir_export_agx(b, intr->src[1].ssa, .base = AGX_ABI_FOUT_Z);

      if (info->write_s)
         nir_export_agx(b, intr->src[2].ssa, .base = AGX_ABI_FOUT_S);

      return true;
   }

   if (intr->intrinsic == nir_intrinsic_discard_agx &&
       b->shader->info.fs.early_fragment_tests) {

      if (!ctx->masked_samples) {
         b->cursor = nir_before_impl(nir_shader_get_entrypoint(b->shader));

         ctx->masked_samples =
            nir_local_variable_create(b->impl, glsl_uint16_t_type(), NULL);

         nir_store_var(b, ctx->masked_samples, nir_imm_intN_t(b, 0xFF, 16),
                       nir_component_mask(1));
      }

      b->cursor = nir_before_instr(&intr->instr);

      nir_def *mask = nir_load_var(b, ctx->masked_samples);
      nir_def *mask_2 =
         nir_ixor(b, intr->src[0].ssa, nir_imm_intN_t(b, 0xff, 16));

      mask = nir_iand(b, mask, mask_2);
      nir_store_var(b, ctx->masked_samples, mask, nir_component_mask(1));

      nir_instr_remove(&intr->instr);
      return true;
   }

   if (intr->intrinsic != nir_intrinsic_store_output)
      return false;

   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

   /* Fix up gl_FragColor */
   if (sem.location == FRAG_RESULT_COLOR) {
      sem.location = FRAG_RESULT_DATA0;
      info->broadcast_rt0 = true;
   }

   /* We don't use the epilog for sample mask writes */
   if (sem.location < FRAG_RESULT_DATA0)
      return false;

   /* Determine the ABI location. Dual source blending aliases a second
    * render target, so get that out of the way now.
    */
   unsigned loc = sem.location - FRAG_RESULT_DATA0;
   loc += nir_src_as_uint(intr->src[1]);

   if (sem.dual_source_blend_index) {
      assert(loc == 0);
      loc = 1;
   }

   b->cursor = nir_instr_remove(&intr->instr);
   nir_def *vec = intr->src[0].ssa;

   info->loc_written |= BITFIELD_BIT(loc);

   if (vec->bit_size == 32)
      info->size_32 |= BITFIELD_BIT(loc);
   else
      assert(vec->bit_size == 16);

   uint32_t one_f = (vec->bit_size == 32 ? fui(1.0) : _mesa_float_to_half(1.0));
   unsigned comp = nir_intrinsic_component(intr);

   u_foreach_bit(c, nir_intrinsic_write_mask(intr)) {
      nir_scalar s = nir_scalar_resolved(vec, c);
      if (loc == 0 && c == 3 && nir_scalar_is_const(s) &&
          nir_scalar_as_uint(s) == one_f) {

         info->loc0_w_1 = true;
      } else {
         unsigned stride = vec->bit_size / 16;

         nir_export_agx(b, nir_channel(b, vec, c),
                        .base = AGX_ABI_FOUT_COLOUR(loc) + (comp + c) * stride);
      }
   }

   return true;
}

bool
agx_nir_lower_fs_output_to_epilog(nir_shader *s,
                                  struct agx_fs_epilog_link_info *out)
{
   struct lower_epilog_ctx ctx = {.info = out};

   nir_shader_intrinsics_pass(s, lower_output_to_epilog,
                              nir_metadata_control_flow, &ctx);

   if (ctx.masked_samples) {
      nir_builder b =
         nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(s)));

      nir_export_agx(&b, nir_load_var(&b, ctx.masked_samples),
                     .base = AGX_ABI_FOUT_WRITE_SAMPLES);
      out->sample_mask_after_force_early = true;

      bool progress;
      do {
         progress = false;
         NIR_PASS(progress, s, nir_lower_vars_to_ssa);
         NIR_PASS(progress, s, nir_opt_dce);
      } while (progress);
   }

   out->sample_shading = s->info.fs.uses_sample_shading;
   return true;
}

bool
agx_nir_lower_fs_active_samples_to_register(nir_shader *s)
{
   return nir_shader_intrinsics_pass(s, lower_active_samples_to_register,
                                     nir_metadata_control_flow, NULL);
}

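/*
 * Bump the PS invocation pipeline statistic: count the covered samples of
 * non-helper invocations and atomically add them to the query buffer.
 */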
static bool
agx_nir_lower_stats_fs(nir_shader *s)
{
   assert(s->info.stage == MESA_SHADER_FRAGMENT);
   nir_builder b_ =
      nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(s)));
   nir_builder *b = &b_;

   nir_push_if(b, nir_inot(b, nir_load_helper_invocation(b, 1)));
   nir_def *samples = nir_bit_count(b, nir_load_sample_mask_in(b));
   unsigned query = PIPE_STAT_QUERY_PS_INVOCATIONS;

   nir_def *addr = nir_load_stat_query_address_agx(b, .base = query);
   nir_global_atomic(b, 32, addr, samples, .atomic_op = nir_atomic_op_iadd);

   nir_pop_if(b, NULL);
   nir_metadata_preserve(b->impl, nir_metadata_control_flow);
   return true;
}

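/*
 * Build a fragment shader prolog from its key: insert code for emulated
 * features (API sample mask, statistics, cull distance, polygon stipple)
 * ahead of the main shader, then lower the prolog's own sidebands and
 * depth/stencil test responsibility.
 */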
void
agx_nir_fs_prolog(nir_builder *b, const void *key_)
{
   const struct agx_fs_prolog_key *key = key_;
   b->shader->info.stage = MESA_SHADER_FRAGMENT;
   b->shader->info.name = "FS prolog";

   /* First, insert code for any emulated features */
   if (key->api_sample_mask != 0xff) {
      /* Kill samples that are NOT covered by the mask */
      nir_discard_agx(b, nir_imm_intN_t(b, key->api_sample_mask ^ 0xff, 16));
      b->shader->info.fs.uses_discard = true;
   }

   if (key->statistics) {
      NIR_PASS(_, b->shader, agx_nir_lower_stats_fs);
   }

   if (key->cull_distance_size) {
      NIR_PASS(_, b->shader, agx_nir_lower_cull_distance_fs,
               key->cull_distance_size);
   }

   if (key->polygon_stipple) {
      NIR_PASS_V(b->shader, agx_nir_lower_poly_stipple);
   }

   /* Then, lower the prolog */
   NIR_PASS(_, b->shader, agx_nir_lower_discard_zs_emit);
   NIR_PASS(_, b->shader, agx_nir_lower_sample_mask);
   NIR_PASS(_, b->shader, nir_shader_intrinsics_pass,
            lower_non_monolithic_uniforms, nir_metadata_control_flow, NULL);
   NIR_PASS(_, b->shader, lower_tests_zs, key->run_zs_tests);

   b->shader->info.io_lowered = true;
}