• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2023 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 /* This is a post-link lowering pass that lowers intrinsics to AMD-specific ones and thus breaks
8  * shader_info gathering.
9  *
10  * It lowers output stores to exports and inserts the bc_optimize conditional.
11  */
12 
13 #include "ac_nir.h"
14 #include "sid.h"
15 #include "nir_builder.h"
16 #include "nir_builtin_builder.h"
17 
/* Pass-wide state: the caller's options plus everything gathered from the
 * shader (output values, interpolation variables, emitted export intrinsics).
 */
typedef struct {
   const ac_nir_lower_ps_late_options *options;

   /* Local variables that replace barycentric loads when the bc_optimize
    * conditional is inserted. Only the *_centroid variables are ever created
    * by this pass (see create_interp_param); the others stay NULL.
    */
   nir_variable *persp_center;
   nir_variable *persp_centroid;
   nir_variable *persp_sample;
   nir_variable *linear_center;
   nir_variable *linear_centroid;
   nir_variable *linear_sample;
   /* True if any of the variables above exist, i.e. barycentric loads must
    * be rewritten to read them.
    */
   bool lower_load_barycentric;

   /* Output values gathered from store_output, indexed [MRT][channel]. */
   nir_def *color[MAX_DRAW_BUFFERS][4];
   nir_def *depth;
   nir_def *stencil;
   nir_def *sample_mask;

   /* Bitmask of MRTs with at least one channel written. */
   uint8_t colors_written;
   /* Source ALU type of each color output (from nir_intrinsic_src_type). */
   nir_alu_type color_type[MAX_DRAW_BUFFERS];
   /* A store used dual_source_blend_index == 1. */
   bool has_dual_src_blending;
   /* FRAG_RESULT_COLOR was written (broadcasts to all color buffers). */
   bool writes_all_cbufs;

   /* MAX_DRAW_BUFFERS for MRT export, 1 for MRTZ export */
   nir_intrinsic_instr *exp[MAX_DRAW_BUFFERS + 1];
   unsigned exp_num;

   /* Next compacted hw MRT slot; MRTs with SPI_SHADER_ZERO format don't
    * occupy a slot (see emit_ps_color_export).
    */
   unsigned compacted_mrt_index;
   unsigned spi_shader_col_format;
} lower_ps_state;
46 
47 static void
create_interp_param(nir_builder * b,lower_ps_state * s)48 create_interp_param(nir_builder *b, lower_ps_state *s)
49 {
50    if (s->options->bc_optimize_for_persp) {
51       s->persp_centroid =
52          nir_local_variable_create(b->impl, glsl_vec_type(2), "persp_centroid");
53    }
54 
55    if (s->options->bc_optimize_for_linear) {
56       s->linear_centroid =
57          nir_local_variable_create(b->impl, glsl_vec_type(2), "linear_centroid");
58    }
59 
60    s->lower_load_barycentric = s->persp_centroid || s->linear_centroid;
61 }
62 
63 static void
init_interp_param(nir_builder * b,lower_ps_state * s)64 init_interp_param(nir_builder *b, lower_ps_state *s)
65 {
66    b->cursor = nir_before_cf_list(&b->impl->body);
67 
68    /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
69     * The hw doesn't compute CENTROID if the whole wave only
70     * contains fully-covered quads.
71     */
72    if (s->options->bc_optimize_for_persp || s->options->bc_optimize_for_linear) {
73       nir_def *bc_optimize = nir_load_barycentric_optimize_amd(b);
74 
75       if (s->options->bc_optimize_for_persp) {
76          nir_def *center =
77             nir_load_barycentric_pixel(b, 32, .interp_mode = INTERP_MODE_SMOOTH);
78          nir_def *centroid =
79             nir_load_barycentric_centroid(b, 32, .interp_mode = INTERP_MODE_SMOOTH);
80 
81          nir_def *value = nir_bcsel(b, bc_optimize, center, centroid);
82          nir_store_var(b, s->persp_centroid, value, 0x3);
83       }
84 
85       if (s->options->bc_optimize_for_linear) {
86          nir_def *center =
87             nir_load_barycentric_pixel(b, 32, .interp_mode = INTERP_MODE_NOPERSPECTIVE);
88          nir_def *centroid =
89             nir_load_barycentric_centroid(b, 32, .interp_mode = INTERP_MODE_NOPERSPECTIVE);
90 
91          nir_def *value = nir_bcsel(b, bc_optimize, center, centroid);
92          nir_store_var(b, s->linear_centroid, value, 0x3);
93       }
94    }
95 }
96 
97 static bool
lower_ps_load_barycentric(nir_builder * b,nir_intrinsic_instr * intrin,lower_ps_state * s)98 lower_ps_load_barycentric(nir_builder *b, nir_intrinsic_instr *intrin, lower_ps_state *s)
99 {
100    enum glsl_interp_mode mode = nir_intrinsic_interp_mode(intrin);
101    nir_variable *var = NULL;
102 
103    switch (mode) {
104    case INTERP_MODE_NONE:
105    case INTERP_MODE_SMOOTH:
106       switch (intrin->intrinsic) {
107       case nir_intrinsic_load_barycentric_pixel:
108          var = s->persp_center;
109          break;
110       case nir_intrinsic_load_barycentric_centroid:
111          var = s->persp_centroid;
112          break;
113       case nir_intrinsic_load_barycentric_sample:
114          var = s->persp_sample;
115          break;
116       default:
117          break;
118       }
119       break;
120 
121    case INTERP_MODE_NOPERSPECTIVE:
122       switch (intrin->intrinsic) {
123       case nir_intrinsic_load_barycentric_pixel:
124          var = s->linear_center;
125          break;
126       case nir_intrinsic_load_barycentric_centroid:
127          var = s->linear_centroid;
128          break;
129       case nir_intrinsic_load_barycentric_sample:
130          var = s->linear_sample;
131          break;
132       default:
133          break;
134       }
135       break;
136 
137    default:
138       break;
139    }
140 
141    if (!var)
142       return false;
143 
144    b->cursor = nir_before_instr(&intrin->instr);
145 
146    nir_def *replacement = nir_load_var(b, var);
147    nir_def_replace(&intrin->def, replacement);
148    return true;
149 }
150 
/* Record the channels written by a store_output into the pass state so they
 * can be turned into exports later, then remove the store when this pass is
 * the one responsible for exporting that output.
 */
static bool
gather_ps_store_output(nir_builder *b, nir_intrinsic_instr *intrin, lower_ps_state *s)
{
   unsigned slot = nir_intrinsic_io_semantics(intrin).location;
   unsigned dual_src_blend_index = nir_intrinsic_io_semantics(intrin).dual_source_blend_index;
   unsigned write_mask = nir_intrinsic_write_mask(intrin);
   unsigned component = nir_intrinsic_component(intrin);
   /* MRT index: DATA0..DATA7 map to 0..7 (other slots use 0); the dual-source
    * src1 output lands in the following slot.
    */
   unsigned color_index = (slot >= FRAG_RESULT_DATA0 ? slot - FRAG_RESULT_DATA0 : 0) +
                          dual_src_blend_index;
   nir_def *store_val = intrin->src[0].ssa;

   b->cursor = nir_before_instr(&intrin->instr);

   /* Scatter each written channel into the per-output gather arrays. */
   u_foreach_bit (i, write_mask) {
      nir_def *chan = nir_channel(b, store_val, i);
      unsigned comp = component + i;

      switch (slot) {
      case FRAG_RESULT_DEPTH:
         assert(comp == 0);
         s->depth = chan;
         break;
      case FRAG_RESULT_STENCIL:
         assert(comp == 0);
         s->stencil = chan;
         break;
      case FRAG_RESULT_SAMPLE_MASK:
         assert(comp == 0);
         s->sample_mask = chan;
         break;
      case FRAG_RESULT_COLOR:
         s->color[color_index][comp] = chan;
         break;
      default:
         assert(slot >= FRAG_RESULT_DATA0 && slot <= FRAG_RESULT_DATA7);
         s->color[color_index][comp] = chan;
         break;
      }
   }

   /* Track which color outputs exist and their properties. */
   if ((slot == FRAG_RESULT_COLOR || (slot >= FRAG_RESULT_DATA0 && slot <= FRAG_RESULT_DATA7)) &&
       write_mask) {
      s->colors_written |= BITFIELD_BIT(color_index);
      s->color_type[color_index] = nir_intrinsic_src_type(intrin);
      s->has_dual_src_blending |= dual_src_blend_index == 1;
      s->writes_all_cbufs |= slot == FRAG_RESULT_COLOR;
   }

   /* Keep output instruction if not exported in nir. */
   if (!s->options->no_color_export && !s->options->no_depth_export) {
      /* This pass exports everything: the store is no longer needed. */
      nir_instr_remove(&intrin->instr);
   } else {
      /* Only remove stores whose category this pass still exports. */
      if (slot >= FRAG_RESULT_DATA0 && !s->options->no_color_export) {
         nir_instr_remove(&intrin->instr);
      } else if ((slot == FRAG_RESULT_DEPTH || slot == FRAG_RESULT_STENCIL ||
                  slot == FRAG_RESULT_SAMPLE_MASK) && !s->options->no_depth_export) {
         nir_instr_remove(&intrin->instr);
      }
   }

   return true;
}
213 
214 static bool
lower_ps_intrinsic(nir_builder * b,nir_instr * instr,void * state)215 lower_ps_intrinsic(nir_builder *b, nir_instr *instr, void *state)
216 {
217    lower_ps_state *s = (lower_ps_state *)state;
218 
219    if (instr->type != nir_instr_type_intrinsic)
220       return false;
221 
222    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
223 
224    switch (intrin->intrinsic) {
225    case nir_intrinsic_store_output:
226       return gather_ps_store_output(b, intrin, s);
227    case nir_intrinsic_load_barycentric_pixel:
228    case nir_intrinsic_load_barycentric_centroid:
229    case nir_intrinsic_load_barycentric_sample:
230       if (s->lower_load_barycentric)
231          return lower_ps_load_barycentric(b, intrin, s);
232       break;
233    default:
234       break;
235    }
236 
237    return false;
238 }
239 
/* Build the MRTZ export from the gathered depth/stencil/sample-mask values,
 * plus the alpha channel when alpha-to-coverage is done via MRTZ. The channel
 * layout and write mask depend on the SPI Z format chosen by
 * ac_get_spi_shader_z_format().
 */
static void
emit_ps_mrtz_export(nir_builder *b, lower_ps_state *s, nir_def *mrtz_alpha)
{
   /* skip mrtz export if no one has written to any of them */
   if (!s->depth && !s->stencil && !s->sample_mask && !mrtz_alpha)
      return;

   unsigned format =
      ac_get_spi_shader_z_format(s->depth, s->stencil, s->sample_mask,
                                 s->options->alpha_to_coverage_via_mrtz);

   nir_def *undef = nir_undef(b, 1, 32);
   nir_def *outputs[4] = {undef, undef, undef, undef};
   unsigned write_mask = 0;
   unsigned flags = 0;

   if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
      /* 16-bit format: only stencil and/or sample mask can be exported. */
      assert(!s->depth && !mrtz_alpha);

      if (s->options->gfx_level < GFX11)
         flags |= AC_EXP_FLAG_COMPRESSED;

      if (s->stencil) {
         /* Stencil occupies the high 16 bits of the first dword. */
         outputs[0] = nir_ishl_imm(b, s->stencil, 16);
         /* Pre-GFX11 compressed exports use two mask bits per dword. */
         write_mask |= s->options->gfx_level >= GFX11 ? 0x1 : 0x3;
      }

      if (s->sample_mask) {
         outputs[1] = s->sample_mask;
         write_mask |= s->options->gfx_level >= GFX11 ? 0x2 : 0xc;
      }
   } else {
      /* 32-bit formats: channel assignment depends on which values exist. */
      if (s->depth) {
         outputs[0] = s->depth;
         write_mask |= 0x1;
      }

      if (s->stencil) {
         assert(format == V_028710_SPI_SHADER_32_GR ||
                format == V_028710_SPI_SHADER_32_ABGR);
         outputs[1] = s->stencil;
         write_mask |= 0x2;
      }

      if (s->sample_mask) {
         assert(format == V_028710_SPI_SHADER_32_ABGR);
         outputs[2] = s->sample_mask;
         write_mask |= 0x4;
      }

      if (mrtz_alpha) {
         assert(format == V_028710_SPI_SHADER_32_AR ||
                format == V_028710_SPI_SHADER_32_ABGR);
         /* The 32_AR format on GFX10+ packs alpha into the G channel. */
         if (format == V_028710_SPI_SHADER_32_AR && s->options->gfx_level >= GFX10) {
            outputs[1] = mrtz_alpha;
            write_mask |= 0x2;
         } else {
            outputs[3] = mrtz_alpha;
            write_mask |= 0x8;
         }
      }
   }

   /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the
    * X writemask component.
    */
   if (s->options->gfx_level == GFX6 &&
       s->options->family != CHIP_OLAND &&
       s->options->family != CHIP_HAINAN) {
      write_mask |= 0x1;
   }

   s->exp[s->exp_num++] = nir_export_amd(b, nir_vec(b, outputs, 4),
                                         .base = V_008DFC_SQ_EXP_MRTZ,
                                         .write_mask = write_mask,
                                         .flags = flags);
}
317 
318 static unsigned
get_ps_color_export_target(lower_ps_state * s)319 get_ps_color_export_target(lower_ps_state *s)
320 {
321    unsigned target = V_008DFC_SQ_EXP_MRT + s->compacted_mrt_index;
322 
323    if (s->options->dual_src_blend_swizzle && s->compacted_mrt_index < 2)
324       target += 21;
325 
326    s->compacted_mrt_index++;
327 
328    return target;
329 }
330 
/* Build one hw color export: take the gathered channels of output
 * "output_index", convert/pack them according to the SPI color format of hw
 * MRT "mrt_index", and emit the export. Returns true if an export was built.
 */
static bool
emit_ps_color_export(nir_builder *b, lower_ps_state *s, unsigned output_index, unsigned mrt_index)
{
   assert(output_index < 8 && mrt_index < 8);

   /* Each MRT has a 4-bit format field in spi_shader_col_format. */
   unsigned spi_shader_col_format = (s->spi_shader_col_format >> (mrt_index * 4)) & 0xf;
   if (spi_shader_col_format == V_028714_SPI_SHADER_ZERO)
      return false;

   /* get target after checking spi_shader_col_format as we need to increase
    * compacted_mrt_index anyway regardless of whether the export is built
    */
   unsigned target = get_ps_color_export_target(s);

   /* no one has written to this slot */
   if (!(s->colors_written & BITFIELD_BIT(output_index)))
      return false;

   bool is_int8 = s->options->color_is_int8 & BITFIELD_BIT(mrt_index);
   bool is_int10 = s->options->color_is_int10 & BITFIELD_BIT(mrt_index);
   bool enable_mrt_output_nan_fixup =
      s->options->enable_mrt_output_nan_fixup & BITFIELD_BIT(mrt_index);

   nir_def *undef = nir_undef(b, 1, 32);
   nir_def *outputs[4] = {undef, undef, undef, undef};
   unsigned write_mask = 0;
   unsigned flags = 0;

   nir_alu_type type = s->color_type[output_index];
   nir_alu_type base_type = nir_alu_type_get_base_type(type);
   unsigned type_size = nir_alu_type_get_type_size(type);

   /* Work on a local copy so clamping/fixups don't modify the gathered state. */
   nir_def *data[4];
   memcpy(data, s->color[output_index], sizeof(data));

   /* Replace NaN by zero (for 32-bit float formats) to fix game bugs if requested. */
   if (enable_mrt_output_nan_fixup && type == nir_type_float32) {
      for (int i = 0; i < 4; i++) {
         if (data[i]) {
            nir_def *isnan = nir_fisnan(b, data[i]);
            data[i] = nir_bcsel(b, isnan, nir_imm_float(b, 0), data[i]);
         }
      }
   }

   switch (spi_shader_col_format) {
   /* 32-bit formats: one channel per dword, converted to 32 bits. */
   case V_028714_SPI_SHADER_32_R:
      if (data[0]) {
         outputs[0] = nir_convert_to_bit_size(b, data[0], base_type, 32);
         write_mask = 0x1;
      }
      break;

   case V_028714_SPI_SHADER_32_GR:
      if (data[0]) {
         outputs[0] = nir_convert_to_bit_size(b, data[0], base_type, 32);
         write_mask |= 0x1;
      }

      if (data[1]) {
         outputs[1] = nir_convert_to_bit_size(b, data[1], base_type, 32);
         write_mask |= 0x2;
      }
      break;

   case V_028714_SPI_SHADER_32_AR:
      if (data[0]) {
         outputs[0] = nir_convert_to_bit_size(b, data[0], base_type, 32);
         write_mask |= 0x1;
      }

      if (data[3]) {
         /* GFX10+ packs alpha into the G channel for the AR format. */
         unsigned index = s->options->gfx_level >= GFX10 ? 1 : 3;
         outputs[index] = nir_convert_to_bit_size(b, data[3], base_type, 32);
         write_mask |= BITFIELD_BIT(index);
      }
      break;

   case V_028714_SPI_SHADER_32_ABGR:
      for (int i = 0; i < 4; i++) {
         if (data[i]) {
            outputs[i] = nir_convert_to_bit_size(b, data[i], base_type, 32);
            write_mask |= BITFIELD_BIT(i);
         }
      }
      break;

   /* Packed 16-bit formats: two channels per dword. */
   default: {
      /* pack_32_2x16 handles already-16-bit sources; 32-bit sources pick a
       * format-specific pack op below.
       */
      nir_op pack_op = nir_op_pack_32_2x16;

      switch (spi_shader_col_format) {
      case V_028714_SPI_SHADER_FP16_ABGR:
         if (type_size == 32)
            pack_op = nir_op_pack_half_2x16_rtz_split;
         break;
      case V_028714_SPI_SHADER_UINT16_ABGR:
         if (type_size == 32) {
            pack_op = nir_op_pack_uint_2x16;
            if (is_int8 || is_int10) {
               /* clamp 32bit output for 8/10 bit color component */
               uint32_t max_rgb = is_int8 ? 255 : 1023;

               for (int i = 0; i < 4; i++) {
                  if (!data[i])
                     continue;

                  /* The 10_10_10_2 alpha channel only holds 2 bits. */
                  uint32_t max_value = i == 3 && is_int10 ? 3 : max_rgb;
                  data[i] = nir_umin(b, data[i], nir_imm_int(b, max_value));
               }
            }
         }
         break;
      case V_028714_SPI_SHADER_SINT16_ABGR:
         if (type_size == 32) {
            pack_op = nir_op_pack_sint_2x16;
            if (is_int8 || is_int10) {
               /* clamp 32bit output for 8/10 bit color component */
               uint32_t max_rgb = is_int8 ? 127 : 511;
               uint32_t min_rgb = is_int8 ? -128 : -512;

               for (int i = 0; i < 4; i++) {
                  if (!data[i])
                     continue;

                  /* Signed 2-bit alpha range for 10_10_10_2. */
                  uint32_t max_value = i == 3 && is_int10 ? 1 : max_rgb;
                  uint32_t min_value = i == 3 && is_int10 ? -2u : min_rgb;

                  data[i] = nir_imin(b, data[i], nir_imm_int(b, max_value));
                  data[i] = nir_imax(b, data[i], nir_imm_int(b, min_value));
               }
            }
         }
         break;
      case V_028714_SPI_SHADER_UNORM16_ABGR:
         pack_op = nir_op_pack_unorm_2x16;
         break;
      case V_028714_SPI_SHADER_SNORM16_ABGR:
         pack_op = nir_op_pack_snorm_2x16;
         break;
      default:
         unreachable("unsupported color export format");
         break;
      }

      /* Pack channel pairs (RG into dword 0, BA into dword 1). */
      for (int i = 0; i < 2; i++) {
         nir_def *lo = data[i * 2];
         nir_def *hi = data[i * 2 + 1];
         if (!lo && !hi)
            continue;

         lo = lo ? lo : nir_undef(b, 1, type_size);
         hi = hi ? hi : nir_undef(b, 1, type_size);

         if (nir_op_infos[pack_op].num_inputs == 2) {
            outputs[i] = nir_build_alu2(b, pack_op, lo, hi);
         } else {
            nir_def *vec = nir_vec2(b, lo, hi);
            outputs[i] = nir_build_alu1(b, pack_op, vec);
         }

         /* Pre-GFX11 compressed exports use two mask bits per dword. */
         if (s->options->gfx_level >= GFX11)
            write_mask |= BITFIELD_BIT(i);
         else
            write_mask |= 0x3 << (i * 2);
      }

      if (s->options->gfx_level < GFX11)
         flags |= AC_EXP_FLAG_COMPRESSED;
   }
   }

   s->exp[s->exp_num++] = nir_export_amd(b, nir_vec(b, outputs, 4),
                                         .base = target,
                                         .write_mask = write_mask,
                                         .flags = flags);
   return true;
}
508 
/* Rearrange the two dual-source-blend exports so that each lane exports the
 * channel layout the hw expects for the dual-src swizzle (see the lane
 * illustration below). With ACO, this is deferred to a pseudo instruction.
 */
static void
emit_ps_dual_src_blend_swizzle(nir_builder *b, lower_ps_state *s, unsigned first_color_export)
{
   assert(s->exp_num > first_color_export + 1);

   nir_intrinsic_instr *mrt0_exp = s->exp[first_color_export];
   nir_intrinsic_instr *mrt1_exp = s->exp[first_color_export + 1];

   /* There may be instructions computing mrt1_exp's argument between
    * mrt0_exp and mrt1_exp. Move mrt0_exp next to mrt1_exp,
    * so that we can swizzle their arguments.
    */
   unsigned target0 = nir_intrinsic_base(mrt0_exp);
   unsigned target1 = nir_intrinsic_base(mrt1_exp);
   if (target0 > target1) {
      /* mrt0 export is after mrt1 export, this happens when src0 is missing,
       * so we emit mrt1 first then emit an empty mrt0.
       *
       * swap the pointer
       */
      nir_intrinsic_instr *tmp = mrt0_exp;
      mrt0_exp = mrt1_exp;
      mrt1_exp = tmp;

      /* move mrt1_exp down to after mrt0_exp */
      nir_instr_move(nir_after_instr(&mrt0_exp->instr), &mrt1_exp->instr);
   } else {
      /* move mrt0_exp down to before mrt1_exp */
      nir_instr_move(nir_before_instr(&mrt1_exp->instr), &mrt0_exp->instr);
   }

   uint32_t mrt0_write_mask = nir_intrinsic_write_mask(mrt0_exp);
   uint32_t mrt1_write_mask = nir_intrinsic_write_mask(mrt1_exp);
   /* Only channels present in both exports participate in the swizzle. */
   uint32_t write_mask = mrt0_write_mask & mrt1_write_mask;

   nir_def *mrt0_arg = mrt0_exp->src[0].ssa;
   nir_def *mrt1_arg = mrt1_exp->src[0].ssa;

   /* Swizzle code is right before mrt0_exp. */
   b->cursor = nir_before_instr(&mrt0_exp->instr);

   /* ACO needs to emit the swizzle code as a pseudo instruction. */
   if (s->options->use_aco) {
      nir_export_dual_src_blend_amd(b, mrt0_arg, mrt1_arg, .write_mask = write_mask);
      nir_instr_remove(&mrt0_exp->instr);
      nir_instr_remove(&mrt1_exp->instr);
      return;
   }

   nir_def *undef = nir_undef(b, 1, 32);
   nir_def *arg0_vec[4] = {undef, undef, undef, undef};
   nir_def *arg1_vec[4] = {undef, undef, undef, undef};

   /* For illustration, originally
    *   lane0 export arg00 and arg01
    *   lane1 export arg10 and arg11.
    *
    * After the following operation
    *   lane0 export arg00 and arg10
    *   lane1 export arg01 and arg11.
    */
   u_foreach_bit (i, write_mask) {
      nir_def *arg0 = nir_channel(b, mrt0_arg, i);
      nir_def *arg1 = nir_channel(b, mrt1_arg, i);

      /* swap odd,even lanes of arg0 */
      arg0 = nir_quad_swizzle_amd(b, arg0, .swizzle_mask = 0b10110001, .fetch_inactive = true);

      /* swap even lanes between arg0 and arg1 */
      nir_def *tid = nir_load_subgroup_invocation(b);
      nir_def *is_even = nir_ieq_imm(b, nir_iand_imm(b, tid, 1), 0);

      nir_def *tmp = arg0;
      arg0 = nir_bcsel(b, is_even, arg1, arg0);
      arg1 = nir_bcsel(b, is_even, tmp, arg1);

      /* swap odd,even lanes again for arg0 */
      arg0 = nir_quad_swizzle_amd(b, arg0, .swizzle_mask = 0b10110001, .fetch_inactive = true);

      arg0_vec[i] = arg0;
      arg1_vec[i] = arg1;
   }

   nir_src_rewrite(&mrt0_exp->src[0], nir_vec(b, arg0_vec, 4));
   nir_src_rewrite(&mrt1_exp->src[0], nir_vec(b, arg1_vec, 4));

   nir_intrinsic_set_write_mask(mrt0_exp, write_mask);
   nir_intrinsic_set_write_mask(mrt1_exp, write_mask);
}
598 
/* Emit a null (or dummy MRT0) export when the shader built no exports at all,
 * on hw/configurations that still require one.
 */
static void
emit_ps_null_export(nir_builder *b, lower_ps_state *s)
{
   /* POPS: Primitive Ordered Pixel Shading (any interlock mode). */
   const bool pops = b->shader->info.fs.sample_interlock_ordered ||
                     b->shader->info.fs.sample_interlock_unordered ||
                     b->shader->info.fs.pixel_interlock_ordered ||
                     b->shader->info.fs.pixel_interlock_unordered;

   /* Gfx10+ doesn't need to export anything if we don't need to export the EXEC mask
    * for discard.
    * In Primitive Ordered Pixel Shading, however, GFX11+ explicitly uses the `done` export to exit
    * the ordered section, and before GFX11, shaders with POPS also need an export.
    */
   if (s->options->gfx_level >= GFX10 && !s->options->uses_discard && !pops)
      return;

   /* The `done` export exits the POPS ordered section on GFX11+, make sure UniformMemory and
    * ImageMemory (in SPIR-V terms) accesses from the ordered section may not be reordered below it.
    */
   if (s->options->gfx_level >= GFX11 && pops)
      nir_scoped_memory_barrier(b, SCOPE_QUEUE_FAMILY, NIR_MEMORY_RELEASE,
                                nir_var_image | nir_var_mem_ubo | nir_var_mem_ssbo |
                                nir_var_mem_global);

   /* Gfx11 doesn't support null exports, and mrt0 should be exported instead. */
   unsigned target = s->options->gfx_level >= GFX11 ?
      V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;

   nir_intrinsic_instr *intrin =
      nir_export_amd(b, nir_undef(b, 4, 32),
                     .base = target,
                     .flags = AC_EXP_FLAG_VALID_MASK | AC_EXP_FLAG_DONE);
   /* The builder defaults the write mask to 0xf; clear it for the null export. */
   nir_intrinsic_set_write_mask(intrin, 0);
}
634 
/* Emit all exports (MRTZ, color, or null) at the end of the shader from the
 * state gathered by lower_ps_intrinsic, then mark the last export as `done`.
 */
static void
export_ps_outputs(nir_builder *b, lower_ps_state *s)
{
   nir_def *mrtz_alpha = NULL;

   b->cursor = nir_after_impl(b->impl);

   /* Alpha-to-coverage should be before alpha-to-one. */
   if (!s->options->no_depth_export && s->options->alpha_to_coverage_via_mrtz)
      mrtz_alpha = s->color[0][3];

   /* alpha-to-one: force alpha of every written color output to 1. */
   u_foreach_bit (slot, s->colors_written) {
      if (s->options->alpha_to_one)
         s->color[slot][3] = nir_imm_floatN_t(b, 1, nir_alu_type_get_type_size(s->color_type[slot]));
   }

   if (!s->options->no_depth_export)
      emit_ps_mrtz_export(b, s, mrtz_alpha);

   /* When non-monolithic shader, RADV export mrtz in main part (except on
    * RDNA3 for alpha to coverage) and export color in epilog.
    */
   if (s->options->no_color_export)
      return;

   unsigned first_color_export = s->exp_num;

   /* Add exports for dual source blending manually if they are missing.
    * It will automatically generate exports with undef.
    */
   if (s->has_dual_src_blending) {
      switch (s->colors_written) {
      case BITFIELD_BIT(0):
         /* Only src0 written: synthesize src1 with the same type/format. */
         s->colors_written |= BITFIELD_BIT(1);
         s->color_type[1] = s->color_type[0];
         s->spi_shader_col_format |= (s->spi_shader_col_format & 0xf) << 4;
         break;

      case BITFIELD_BIT(1):
         /* Only src1 written: synthesize src0 with the same type/format. */
         s->colors_written |= BITFIELD_BIT(0);
         s->color_type[0] = s->color_type[1];
         s->spi_shader_col_format |= (s->spi_shader_col_format & 0xf0) >> 4;
         break;
      case BITFIELD_RANGE(0, 2):
         break;
      default:
         unreachable("unexpected number of color outputs for dual source blending");
      }
   }

   if (s->writes_all_cbufs && s->colors_written == 0x1) {
      /* This will do nothing for color buffers with SPI_SHADER_COL_FORMAT=ZERO, so always
       * iterate over all 8.
       */
      for (int cbuf = 0; cbuf < 8; cbuf++)
         emit_ps_color_export(b, s, 0, cbuf);
   } else {
      for (int cbuf = 0; cbuf < MAX_DRAW_BUFFERS; cbuf++)
         emit_ps_color_export(b, s, cbuf, cbuf);
   }

   if (s->exp_num) {
      if (s->options->dual_src_blend_swizzle) {
         emit_ps_dual_src_blend_swizzle(b, s, first_color_export);
         /* Skip last export flag setting because they have been replaced by
          * a pseudo instruction.
          */
         if (s->options->use_aco)
            return;
      }

      /* Specify that this is the last export */
      nir_intrinsic_instr *final_exp = s->exp[s->exp_num - 1];
      unsigned final_exp_flags = nir_intrinsic_flags(final_exp);
      final_exp_flags |= AC_EXP_FLAG_DONE | AC_EXP_FLAG_VALID_MASK;
      nir_intrinsic_set_flags(final_exp, final_exp_flags);

      /* The `done` export exits the POPS ordered section on GFX11+, make sure UniformMemory and
       * ImageMemory (in SPIR-V terms) accesses from the ordered section may not be reordered below
       * it.
       */
      if (s->options->gfx_level >= GFX11 &&
          (b->shader->info.fs.sample_interlock_ordered ||
           b->shader->info.fs.sample_interlock_unordered ||
           b->shader->info.fs.pixel_interlock_ordered ||
           b->shader->info.fs.pixel_interlock_unordered)) {
         b->cursor = nir_before_instr(&final_exp->instr);
         nir_scoped_memory_barrier(b, SCOPE_QUEUE_FAMILY, NIR_MEMORY_RELEASE,
                                   nir_var_image | nir_var_mem_ubo | nir_var_mem_ssbo |
                                   nir_var_mem_global);
      }
   } else {
      /* Nothing was exported: some hw still requires an export (see helper). */
      emit_ps_null_export(b, s);
   }
}
730 
731 void
ac_nir_lower_ps_late(nir_shader * nir,const ac_nir_lower_ps_late_options * options)732 ac_nir_lower_ps_late(nir_shader *nir, const ac_nir_lower_ps_late_options *options)
733 {
734    assert(nir->info.stage == MESA_SHADER_FRAGMENT);
735    nir_function_impl *impl = nir_shader_get_entrypoint(nir);
736 
737    nir_builder builder = nir_builder_create(impl);
738    nir_builder *b = &builder;
739 
740    lower_ps_state state = {
741       .options = options,
742       .has_dual_src_blending = options->dual_src_blend_swizzle,
743       .spi_shader_col_format = options->spi_shader_col_format,
744    };
745 
746    create_interp_param(b, &state);
747 
748    nir_shader_instructions_pass(nir, lower_ps_intrinsic,
749                                 nir_metadata_control_flow,
750                                 &state);
751 
752    /* Must be after lower_ps_intrinsic() to prevent it lower added intrinsic here. */
753    init_interp_param(b, &state);
754 
755    export_ps_outputs(b, &state);
756 
757    /* Cleanup nir variable, as RADV won't do this. */
758    if (state.lower_load_barycentric)
759       nir_lower_vars_to_ssa(nir);
760 }
761