• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2023 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 /* This is a post-link lowering pass that lowers intrinsics to AMD-specific ones and thus breaks
8  * shader_info gathering.
9  *
10  * It lowers output stores to exports and inserts the bc_optimize conditional.
11  */
12 
13 #include "ac_nir.h"
14 #include "sid.h"
15 #include "nir_builder.h"
16 #include "nir_builtin_builder.h"
17 
typedef struct {
   const ac_nir_lower_ps_late_options *options;

   /* Lazily-created local vec2 variables that replace centroid barycentric
    * loads when bc_optimize is enabled (see init_interp_param).
    */
   nir_variable *persp_centroid;
   nir_variable *linear_centroid;

   /* Output values gathered from store_output intrinsics. */
   nir_def *color[MAX_DRAW_BUFFERS][4];
   nir_def *depth;
   nir_def *stencil;
   nir_def *sample_mask;

   /* Bitmask of color slots that received at least one write. */
   uint8_t colors_written;
   /* Source type of each color output (float32/float16/int...). */
   nir_alu_type color_type[MAX_DRAW_BUFFERS];
   bool has_dual_src_blending;
   /* Set when FRAG_RESULT_COLOR was written (broadcast to all color buffers). */
   bool writes_all_cbufs;

   /* MAX_DRAW_BUFFERS for MRT export, 1 for MRTZ export */
   nir_intrinsic_instr *exp[MAX_DRAW_BUFFERS + 1];
   unsigned exp_num;

   /* Next hardware MRT slot to use; skipped slots are compacted away. */
   unsigned compacted_mrt_index;
   unsigned spi_shader_col_format;
} lower_ps_state;
41 
42 static nir_variable *
get_baryc_var_common(nir_builder * b,bool will_replace,nir_variable ** var,const char * var_name)43 get_baryc_var_common(nir_builder *b, bool will_replace, nir_variable **var, const char *var_name)
44 {
45    if (will_replace) {
46       if (!*var) {
47          *var = nir_local_variable_create(b->impl, glsl_vec_type(2), var_name);
48       }
49       return *var;
50    }
51    return NULL;
52 }
53 
54 static nir_variable *
get_centroid_var(nir_builder * b,enum glsl_interp_mode mode,lower_ps_state * s)55 get_centroid_var(nir_builder *b, enum glsl_interp_mode mode, lower_ps_state *s)
56 {
57    if (mode == INTERP_MODE_NOPERSPECTIVE) {
58       return get_baryc_var_common(b, s->options->bc_optimize_for_linear, &s->linear_centroid,
59                                   "linear_centroid");
60    } else {
61       return get_baryc_var_common(b, s->options->bc_optimize_for_persp, &s->persp_centroid,
62                                   "persp_centroid");
63    }
64 
65    return NULL;
66 }
67 
68 static void
init_interp_param(nir_builder * b,lower_ps_state * s)69 init_interp_param(nir_builder *b, lower_ps_state *s)
70 {
71    b->cursor = nir_before_cf_list(&b->impl->body);
72 
73    /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
74     * The hw doesn't compute CENTROID if the whole wave only
75     * contains fully-covered quads.
76     */
77    if (s->options->bc_optimize_for_persp || s->options->bc_optimize_for_linear) {
78       nir_def *bc_optimize = nir_load_barycentric_optimize_amd(b);
79 
80       if (s->options->bc_optimize_for_persp) {
81          nir_def *center =
82             nir_load_barycentric_pixel(b, 32, .interp_mode = INTERP_MODE_SMOOTH);
83          nir_def *centroid =
84             nir_load_barycentric_centroid(b, 32, .interp_mode = INTERP_MODE_SMOOTH);
85 
86          nir_def *value = nir_bcsel(b, bc_optimize, center, centroid);
87          nir_store_var(b, s->persp_centroid, value, 0x3);
88       }
89 
90       if (s->options->bc_optimize_for_linear) {
91          nir_def *center =
92             nir_load_barycentric_pixel(b, 32, .interp_mode = INTERP_MODE_NOPERSPECTIVE);
93          nir_def *centroid =
94             nir_load_barycentric_centroid(b, 32, .interp_mode = INTERP_MODE_NOPERSPECTIVE);
95 
96          nir_def *value = nir_bcsel(b, bc_optimize, center, centroid);
97          nir_store_var(b, s->linear_centroid, value, 0x3);
98       }
99    }
100 }
101 
102 static bool
lower_ps_load_barycentric_centroid(nir_builder * b,nir_intrinsic_instr * intrin,lower_ps_state * s)103 lower_ps_load_barycentric_centroid(nir_builder *b, nir_intrinsic_instr *intrin, lower_ps_state *s)
104 {
105    nir_variable *var = get_centroid_var(b, nir_intrinsic_interp_mode(intrin), s);
106    if (!var)
107       return false;
108 
109    b->cursor = nir_before_instr(&intrin->instr);
110 
111    nir_def_replace(&intrin->def, nir_load_var(b, var));
112    return true;
113 }
114 
/* Record the values written by a store_output intrinsic into the lowering
 * state (per-channel color, depth, stencil, sample mask), then remove the
 * store when this pass will emit the corresponding export itself.
 */
static bool
gather_ps_store_output(nir_builder *b, nir_intrinsic_instr *intrin, lower_ps_state *s)
{
   unsigned slot = nir_intrinsic_io_semantics(intrin).location;
   unsigned dual_src_blend_index = nir_intrinsic_io_semantics(intrin).dual_source_blend_index;
   unsigned write_mask = nir_intrinsic_write_mask(intrin);
   unsigned component = nir_intrinsic_component(intrin);
   /* FRAG_RESULT_COLOR and the dual-source second output both map to slot 0. */
   unsigned color_index = (slot >= FRAG_RESULT_DATA0 ? slot - FRAG_RESULT_DATA0 : 0) +
                          dual_src_blend_index;
   nir_def *store_val = intrin->src[0].ssa;

   b->cursor = nir_before_instr(&intrin->instr);

   u_foreach_bit (i, write_mask) {
      nir_def *chan = nir_channel(b, store_val, i);
      unsigned comp = component + i;

      switch (slot) {
      case FRAG_RESULT_DEPTH:
         assert(comp == 0);
         s->depth = chan;
         break;
      case FRAG_RESULT_STENCIL:
         assert(comp == 0);
         s->stencil = chan;
         break;
      case FRAG_RESULT_SAMPLE_MASK:
         assert(comp == 0);
         s->sample_mask = chan;
         break;
      case FRAG_RESULT_COLOR:
         s->color[color_index][comp] = chan;
         break;
      default:
         assert(slot >= FRAG_RESULT_DATA0 && slot <= FRAG_RESULT_DATA7);
         s->color[color_index][comp] = chan;
         break;
      }
   }

   /* Track which color slots were written and with what type; a write to
    * FRAG_RESULT_COLOR means the value goes to all color buffers.
    */
   if ((slot == FRAG_RESULT_COLOR || (slot >= FRAG_RESULT_DATA0 && slot <= FRAG_RESULT_DATA7)) &&
       write_mask) {
      s->colors_written |= BITFIELD_BIT(color_index);
      s->color_type[color_index] = nir_intrinsic_src_type(intrin);
      s->has_dual_src_blending |= dual_src_blend_index == 1;
      s->writes_all_cbufs |= slot == FRAG_RESULT_COLOR;
   }

   /* Keep output instruction if not exported in nir. */
   if (!s->options->no_color_export && !s->options->no_depth_export) {
      /* This pass exports everything: all stores can go. */
      nir_instr_remove(&intrin->instr);
   } else {
      /* Only remove stores whose export this pass will emit. */
      if (slot >= FRAG_RESULT_DATA0 && !s->options->no_color_export) {
         nir_instr_remove(&intrin->instr);
      } else if ((slot == FRAG_RESULT_DEPTH || slot == FRAG_RESULT_STENCIL ||
                  slot == FRAG_RESULT_SAMPLE_MASK) && !s->options->no_depth_export) {
         nir_instr_remove(&intrin->instr);
      }
   }

   return true;
}
177 
178 static bool
lower_ps_intrinsic(nir_builder * b,nir_intrinsic_instr * intrin,void * state)179 lower_ps_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin, void *state)
180 {
181    lower_ps_state *s = (lower_ps_state *)state;
182 
183    switch (intrin->intrinsic) {
184    case nir_intrinsic_store_output:
185       return gather_ps_store_output(b, intrin, s);
186    case nir_intrinsic_load_barycentric_centroid:
187       return lower_ps_load_barycentric_centroid(b, intrin, s);
188    default:
189       break;
190    }
191 
192    return false;
193 }
194 
/* Build the MRTZ export (depth / stencil / sample mask / coverage alpha).
 * Returns true if an export was emitted, false if nothing was written to
 * any MRTZ component.
 */
static bool
emit_ps_mrtz_export(nir_builder *b, lower_ps_state *s, nir_def *mrtz_alpha)
{
   /* skip mrtz export if no one has written to any of them */
   if (!s->depth && !s->stencil && !s->sample_mask && !mrtz_alpha)
      return false;

   unsigned format =
      ac_get_spi_shader_z_format(s->depth, s->stencil, s->sample_mask,
                                 s->options->alpha_to_coverage_via_mrtz);

   nir_def *undef = nir_undef(b, 1, 32);
   nir_def *outputs[4] = {undef, undef, undef, undef};
   unsigned write_mask = 0;
   unsigned flags = 0;

   if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
      /* 16-bit layout: stencil goes in the high half of chan 0, sample
       * mask in chan 1; depth and alpha can't use this format.
       */
      assert(!s->depth && !mrtz_alpha);

      /* Pre-GFX11 this is a compressed export, hence the wider write
       * masks chosen below.
       */
      if (s->options->gfx_level < GFX11)
         flags |= AC_EXP_FLAG_COMPRESSED;

      if (s->stencil) {
         outputs[0] = nir_ishl_imm(b, s->stencil, 16);
         write_mask |= s->options->gfx_level >= GFX11 ? 0x1 : 0x3;
      }

      if (s->sample_mask) {
         outputs[1] = s->sample_mask;
         write_mask |= s->options->gfx_level >= GFX11 ? 0x2 : 0xc;
      }
   } else {
      /* 32-bit layouts: depth, stencil, sample mask, alpha in chans 0..3. */
      if (s->depth) {
         outputs[0] = s->depth;
         write_mask |= 0x1;
      }

      if (s->stencil) {
         assert(format == V_028710_SPI_SHADER_32_GR ||
                format == V_028710_SPI_SHADER_32_ABGR);
         outputs[1] = s->stencil;
         write_mask |= 0x2;
      }

      if (s->sample_mask) {
         assert(format == V_028710_SPI_SHADER_32_ABGR);
         outputs[2] = s->sample_mask;
         write_mask |= 0x4;
      }

      if (mrtz_alpha) {
         assert(format == V_028710_SPI_SHADER_32_AR ||
                format == V_028710_SPI_SHADER_32_ABGR);
         /* GFX10+ packs alpha into chan 1 for the 32_AR format. */
         if (format == V_028710_SPI_SHADER_32_AR && s->options->gfx_level >= GFX10) {
            outputs[1] = mrtz_alpha;
            write_mask |= 0x2;
         } else {
            outputs[3] = mrtz_alpha;
            write_mask |= 0x8;
         }
      }
   }

   /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the
    * X writemask component.
    */
   if (s->options->gfx_level == GFX6 &&
       s->options->family != CHIP_OLAND &&
       s->options->family != CHIP_HAINAN) {
      write_mask |= 0x1;
   }

   s->exp[s->exp_num++] = nir_export_amd(b, nir_vec(b, outputs, 4),
                                         .base = V_008DFC_SQ_EXP_MRTZ,
                                         .write_mask = write_mask,
                                         .flags = flags);
   return true;
}
273 
274 static unsigned
get_ps_color_export_target(lower_ps_state * s)275 get_ps_color_export_target(lower_ps_state *s)
276 {
277    unsigned target = V_008DFC_SQ_EXP_MRT + s->compacted_mrt_index;
278 
279    if (s->options->dual_src_blend_swizzle && s->compacted_mrt_index < 2)
280       target += 21;
281 
282    s->compacted_mrt_index++;
283 
284    return target;
285 }
286 
/* Emit one color export: convert/pack the gathered channels of color slot
 * output_index according to the SPI_SHADER_COL_FORMAT of hw MRT mrt_index
 * and append the export to s->exp[].  Returns true if an export was built.
 */
static bool
emit_ps_color_export(nir_builder *b, lower_ps_state *s, unsigned output_index, unsigned mrt_index)
{
   assert(output_index < 8 && mrt_index < 8);

   /* Each MRT has a 4-bit format field. */
   unsigned spi_shader_col_format = (s->spi_shader_col_format >> (mrt_index * 4)) & 0xf;
   if (spi_shader_col_format == V_028714_SPI_SHADER_ZERO)
      return false;

   /* get target after checking spi_shader_col_format as we need to increase
    * compacted_mrt_index anyway regardless of whether the export is built
    */
   unsigned target = get_ps_color_export_target(s);

   /* no one has written to this slot */
   if (!(s->colors_written & BITFIELD_BIT(output_index)))
      return false;

   bool is_int8 = s->options->color_is_int8 & BITFIELD_BIT(mrt_index);
   bool is_int10 = s->options->color_is_int10 & BITFIELD_BIT(mrt_index);
   bool enable_mrt_output_nan_fixup =
      s->options->enable_mrt_output_nan_fixup & BITFIELD_BIT(mrt_index);

   nir_def *undef = nir_undef(b, 1, 32);
   nir_def *outputs[4] = {undef, undef, undef, undef};
   unsigned write_mask = 0;
   unsigned flags = 0;

   nir_alu_type type = s->color_type[output_index];
   nir_alu_type base_type = nir_alu_type_get_base_type(type);
   unsigned type_size = nir_alu_type_get_type_size(type);

   /* Work on a local copy so clamping/NaN-fixup doesn't alter gathered state. */
   nir_def *data[4];
   memcpy(data, s->color[output_index], sizeof(data));

   /* Replace NaN by zero (for 32-bit float formats) to fix game bugs if requested. */
   if (enable_mrt_output_nan_fixup && type == nir_type_float32) {
      for (int i = 0; i < 4; i++) {
         if (data[i]) {
            nir_def *isnan = nir_fisnan(b, data[i]);
            data[i] = nir_bcsel(b, isnan, nir_imm_float(b, 0), data[i]);
         }
      }
   }

   switch (spi_shader_col_format) {
   case V_028714_SPI_SHADER_32_R:
      if (data[0]) {
         outputs[0] = nir_convert_to_bit_size(b, data[0], base_type, 32);
         write_mask = 0x1;
      }
      break;

   case V_028714_SPI_SHADER_32_GR:
      if (data[0]) {
         outputs[0] = nir_convert_to_bit_size(b, data[0], base_type, 32);
         write_mask |= 0x1;
      }

      if (data[1]) {
         outputs[1] = nir_convert_to_bit_size(b, data[1], base_type, 32);
         write_mask |= 0x2;
      }
      break;

   case V_028714_SPI_SHADER_32_AR:
      if (data[0]) {
         outputs[0] = nir_convert_to_bit_size(b, data[0], base_type, 32);
         write_mask |= 0x1;
      }

      if (data[3]) {
         /* GFX10+ places alpha in chan 1 for the 32_AR format. */
         unsigned index = s->options->gfx_level >= GFX10 ? 1 : 3;
         outputs[index] = nir_convert_to_bit_size(b, data[3], base_type, 32);
         write_mask |= BITFIELD_BIT(index);
      }
      break;

   case V_028714_SPI_SHADER_32_ABGR:
      for (int i = 0; i < 4; i++) {
         if (data[i]) {
            outputs[i] = nir_convert_to_bit_size(b, data[i], base_type, 32);
            write_mask |= BITFIELD_BIT(i);
         }
      }
      break;

   default: {
      /* 16-bit packed formats: two channels per 32-bit export slot.
       * Default pack op handles values that are already 16-bit.
       */
      nir_op pack_op = nir_op_pack_32_2x16;

      switch (spi_shader_col_format) {
      case V_028714_SPI_SHADER_FP16_ABGR:
         if (type_size == 32)
            pack_op = nir_op_pack_half_2x16_rtz_split;
         break;
      case V_028714_SPI_SHADER_UINT16_ABGR:
         if (type_size == 32) {
            pack_op = nir_op_pack_uint_2x16;
            if (is_int8 || is_int10) {
               /* clamp 32bit output for 8/10 bit color component */
               uint32_t max_rgb = is_int8 ? 255 : 1023;

               for (int i = 0; i < 4; i++) {
                  if (!data[i])
                     continue;

                  /* int10's alpha channel only has 2 bits. */
                  uint32_t max_value = i == 3 && is_int10 ? 3 : max_rgb;
                  data[i] = nir_umin(b, data[i], nir_imm_int(b, max_value));
               }
            }
         }
         break;
      case V_028714_SPI_SHADER_SINT16_ABGR:
         if (type_size == 32) {
            pack_op = nir_op_pack_sint_2x16;
            if (is_int8 || is_int10) {
               /* clamp 32bit output for 8/10 bit color component */
               uint32_t max_rgb = is_int8 ? 127 : 511;
               uint32_t min_rgb = is_int8 ? -128 : -512;

               for (int i = 0; i < 4; i++) {
                  if (!data[i])
                     continue;

                  /* int10's signed alpha channel range is [-2, 1]. */
                  uint32_t max_value = i == 3 && is_int10 ? 1 : max_rgb;
                  uint32_t min_value = i == 3 && is_int10 ? -2u : min_rgb;

                  data[i] = nir_imin(b, data[i], nir_imm_int(b, max_value));
                  data[i] = nir_imax(b, data[i], nir_imm_int(b, min_value));
               }
            }
         }
         break;
      case V_028714_SPI_SHADER_UNORM16_ABGR:
         pack_op = nir_op_pack_unorm_2x16;
         break;
      case V_028714_SPI_SHADER_SNORM16_ABGR:
         pack_op = nir_op_pack_snorm_2x16;
         break;
      default:
         unreachable("unsupported color export format");
         break;
      }

      /* Pack channel pairs (0,1) -> slot 0 and (2,3) -> slot 1. */
      for (int i = 0; i < 2; i++) {
         nir_def *lo = data[i * 2];
         nir_def *hi = data[i * 2 + 1];
         if (!lo && !hi)
            continue;

         lo = lo ? lo : nir_undef(b, 1, type_size);
         hi = hi ? hi : nir_undef(b, 1, type_size);

         if (nir_op_infos[pack_op].num_inputs == 2) {
            outputs[i] = nir_build_alu2(b, pack_op, lo, hi);
         } else {
            nir_def *vec = nir_vec2(b, lo, hi);
            outputs[i] = nir_build_alu1(b, pack_op, vec);
         }

         /* Pre-GFX11, compressed exports use two write-mask bits per slot. */
         if (s->options->gfx_level >= GFX11)
            write_mask |= BITFIELD_BIT(i);
         else
            write_mask |= 0x3 << (i * 2);
      }

      if (s->options->gfx_level < GFX11)
         flags |= AC_EXP_FLAG_COMPRESSED;
   }
   }

   s->exp[s->exp_num++] = nir_export_amd(b, nir_vec(b, outputs, 4),
                                         .base = target,
                                         .write_mask = write_mask,
                                         .flags = flags);
   return true;
}
464 
/* Rewrite the two dual-source-blend exports (MRT0/MRT1) so that adjacent
 * lanes export swizzled arguments, as required when dual_src_blend_swizzle
 * is enabled.  first_color_export is the index into s->exp[] of the first
 * color export.
 */
static void
emit_ps_dual_src_blend_swizzle(nir_builder *b, lower_ps_state *s, unsigned first_color_export)
{
   assert(s->exp_num > first_color_export + 1);

   nir_intrinsic_instr *mrt0_exp = s->exp[first_color_export];
   nir_intrinsic_instr *mrt1_exp = s->exp[first_color_export + 1];

   /* There are some instructions which operate mrt1_exp's argument
    * between mrt0_exp and mrt1_exp. Move mrt0_exp next to mrt1_exp,
    * so that we can swizzle their arguments.
    */
   unsigned target0 = nir_intrinsic_base(mrt0_exp);
   unsigned target1 = nir_intrinsic_base(mrt1_exp);
   if (target0 > target1) {
      /* mrt0 export is after mrt1 export, this happens when src0 is missing,
       * so we emit mrt1 first then emit an empty mrt0.
       *
       * swap the pointer
       */
      nir_intrinsic_instr *tmp = mrt0_exp;
      mrt0_exp = mrt1_exp;
      mrt1_exp = tmp;

      /* move mrt1_exp down to after mrt0_exp */
      nir_instr_move(nir_after_instr(&mrt0_exp->instr), &mrt1_exp->instr);
   } else {
      /* move mrt0_exp down to before mrt1_exp */
      nir_instr_move(nir_before_instr(&mrt1_exp->instr), &mrt0_exp->instr);
   }

   /* Only channels written by both exports can be swapped between them. */
   uint32_t mrt0_write_mask = nir_intrinsic_write_mask(mrt0_exp);
   uint32_t mrt1_write_mask = nir_intrinsic_write_mask(mrt1_exp);
   uint32_t write_mask = mrt0_write_mask & mrt1_write_mask;

   nir_def *mrt0_arg = mrt0_exp->src[0].ssa;
   nir_def *mrt1_arg = mrt1_exp->src[0].ssa;

   /* Swizzle code is right before mrt0_exp. */
   b->cursor = nir_before_instr(&mrt0_exp->instr);

   /* ACO need to emit the swizzle code by a pseudo instruction. */
   if (s->options->use_aco) {
      nir_export_dual_src_blend_amd(b, mrt0_arg, mrt1_arg, .write_mask = write_mask);
      nir_instr_remove(&mrt0_exp->instr);
      nir_instr_remove(&mrt1_exp->instr);
      return;
   }

   nir_def *undef = nir_undef(b, 1, 32);
   nir_def *arg0_vec[4] = {undef, undef, undef, undef};
   nir_def *arg1_vec[4] = {undef, undef, undef, undef};

   /* For illustration, originally
    *   lane0 export arg00 and arg01
    *   lane1 export arg10 and arg11.
    *
    * After the following operation
    *   lane0 export arg00 and arg10
    *   lane1 export arg01 and arg11.
    */
   u_foreach_bit (i, write_mask) {
      nir_def *arg0 = nir_channel(b, mrt0_arg, i);
      nir_def *arg1 = nir_channel(b, mrt1_arg, i);

      /* swap odd,even lanes of arg0 */
      arg0 = nir_quad_swizzle_amd(b, arg0, .swizzle_mask = 0b10110001, .fetch_inactive = true);

      /* swap even lanes between arg0 and arg1 */
      nir_def *tid = nir_load_subgroup_invocation(b);
      nir_def *is_even = nir_ieq_imm(b, nir_iand_imm(b, tid, 1), 0);

      nir_def *tmp = arg0;
      arg0 = nir_bcsel(b, is_even, arg1, arg0);
      arg1 = nir_bcsel(b, is_even, tmp, arg1);

      /* swap odd,even lanes again for arg0 */
      arg0 = nir_quad_swizzle_amd(b, arg0, .swizzle_mask = 0b10110001, .fetch_inactive = true);

      arg0_vec[i] = arg0;
      arg1_vec[i] = arg1;
   }

   nir_src_rewrite(&mrt0_exp->src[0], nir_vec(b, arg0_vec, 4));
   nir_src_rewrite(&mrt1_exp->src[0], nir_vec(b, arg1_vec, 4));

   nir_intrinsic_set_write_mask(mrt0_exp, write_mask);
   nir_intrinsic_set_write_mask(mrt1_exp, write_mask);
}
554 
/* Emit a "null" export when the shader built no other exports, on hardware
 * or shader configurations that still require one (discard, POPS).
 */
static void
emit_ps_null_export(nir_builder *b, lower_ps_state *s)
{
   const bool pops = b->shader->info.fs.sample_interlock_ordered ||
                     b->shader->info.fs.sample_interlock_unordered ||
                     b->shader->info.fs.pixel_interlock_ordered ||
                     b->shader->info.fs.pixel_interlock_unordered;

   /* Gfx10+ doesn't need to export anything if we don't need to export the EXEC mask
    * for discard.
    * In Primitive Ordered Pixel Shading, however, GFX11+ explicitly uses the `done` export to exit
    * the ordered section, and before GFX11, shaders with POPS also need an export.
    */
   if (s->options->gfx_level >= GFX10 && !s->options->uses_discard && !pops)
      return;

   /* The `done` export exits the POPS ordered section on GFX11+, make sure UniformMemory and
    * ImageMemory (in SPIR-V terms) accesses from the ordered section may not be reordered below it.
    */
   if (s->options->gfx_level >= GFX11 && pops)
      nir_scoped_memory_barrier(b, SCOPE_QUEUE_FAMILY, NIR_MEMORY_RELEASE,
                                nir_var_image | nir_var_mem_ubo | nir_var_mem_ssbo |
                                nir_var_mem_global);

   /* Gfx11 doesn't support null exports, and mrt0 should be exported instead. */
   unsigned target = s->options->gfx_level >= GFX11 ?
      V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;

   nir_intrinsic_instr *intrin =
      nir_export_amd(b, nir_undef(b, 4, 32),
                     .base = target,
                     .flags = AC_EXP_FLAG_VALID_MASK | AC_EXP_FLAG_DONE);
   /* Clear the write mask: the builder defaults it to 0xf, but a null
    * export must not write any channel.
    */
   nir_intrinsic_set_write_mask(intrin, 0);
}
590 
/* Emit the MRTZ and color exports for everything gathered from the shader's
 * output stores, honoring the no_depth_export/no_color_export split used by
 * non-monolithic shaders.  Returns true if anything was emitted.
 */
static bool
export_ps_outputs(nir_builder *b, lower_ps_state *s)
{
   b->cursor = nir_after_impl(b->impl);

   /* Alpha-to-coverage should be before alpha-to-one. */
   nir_def *mrtz_alpha = NULL;
   if (!s->options->no_depth_export && s->options->alpha_to_coverage_via_mrtz)
      mrtz_alpha = s->color[0][3];

   bool progress = false;
   if (!s->options->no_depth_export)
      progress |= emit_ps_mrtz_export(b, s, mrtz_alpha);

   /* When non-monolithic shader, RADV export mrtz in main part (except on
    * RDNA3 for alpha to coverage) and export color in epilog.
    */
   if (s->options->no_color_export)
      return progress;

   /* alpha-to-one: force alpha of every written color output to 1.0. */
   u_foreach_bit (slot, s->colors_written) {
      if (s->options->alpha_to_one)
         s->color[slot][3] = nir_imm_floatN_t(b, 1, nir_alu_type_get_type_size(s->color_type[slot]));
   }

   unsigned first_color_export = s->exp_num;

   /* Add exports for dual source blending manually if they are missing.
    * It will automatically generate exports with undef.
    */
   if (s->has_dual_src_blending) {
      switch (s->colors_written) {
      case BITFIELD_BIT(0):
         /* Only src0 written: mirror its type/format onto src1. */
         s->colors_written |= BITFIELD_BIT(1);
         s->color_type[1] = s->color_type[0];
         s->spi_shader_col_format |= (s->spi_shader_col_format & 0xf) << 4;
         break;

      case BITFIELD_BIT(1):
         /* Only src1 written: mirror its type/format onto src0. */
         s->colors_written |= BITFIELD_BIT(0);
         s->color_type[0] = s->color_type[1];
         s->spi_shader_col_format |= (s->spi_shader_col_format & 0xf0) >> 4;
         break;
      case BITFIELD_RANGE(0, 2):
         break;
      default:
         unreachable("unexpected number of color outputs for dual source blending");
      }
   }

   if (s->writes_all_cbufs && s->colors_written == 0x1) {
      /* This will do nothing for color buffers with SPI_SHADER_COL_FORMAT=ZERO, so always
       * iterate over all 8.
       */
      for (int cbuf = 0; cbuf < 8; cbuf++)
         emit_ps_color_export(b, s, 0, cbuf);
   } else {
      for (int cbuf = 0; cbuf < MAX_DRAW_BUFFERS; cbuf++)
         emit_ps_color_export(b, s, cbuf, cbuf);
   }

   if (s->exp_num) {
      if (s->options->dual_src_blend_swizzle) {
         emit_ps_dual_src_blend_swizzle(b, s, first_color_export);
         /* Skip last export flag setting because they have been replaced by
          * a pseudo instruction.
          */
         if (s->options->use_aco)
            return true;
      }

      /* Specify that this is the last export */
      nir_intrinsic_instr *final_exp = s->exp[s->exp_num - 1];
      unsigned final_exp_flags = nir_intrinsic_flags(final_exp);
      final_exp_flags |= AC_EXP_FLAG_DONE | AC_EXP_FLAG_VALID_MASK;
      nir_intrinsic_set_flags(final_exp, final_exp_flags);

      /* The `done` export exits the POPS ordered section on GFX11+, make sure UniformMemory and
       * ImageMemory (in SPIR-V terms) accesses from the ordered section may not be reordered below
       * it.
       */
      if (s->options->gfx_level >= GFX11 &&
          (b->shader->info.fs.sample_interlock_ordered ||
           b->shader->info.fs.sample_interlock_unordered ||
           b->shader->info.fs.pixel_interlock_ordered ||
           b->shader->info.fs.pixel_interlock_unordered)) {
         b->cursor = nir_before_instr(&final_exp->instr);
         nir_scoped_memory_barrier(b, SCOPE_QUEUE_FAMILY, NIR_MEMORY_RELEASE,
                                   nir_var_image | nir_var_mem_ubo | nir_var_mem_ssbo |
                                   nir_var_mem_global);
      }
   } else {
      emit_ps_null_export(b, s);
   }

   return true;
}
688 
689 bool
ac_nir_lower_ps_late(nir_shader * nir,const ac_nir_lower_ps_late_options * options)690 ac_nir_lower_ps_late(nir_shader *nir, const ac_nir_lower_ps_late_options *options)
691 {
692    assert(nir->info.stage == MESA_SHADER_FRAGMENT);
693    nir_function_impl *impl = nir_shader_get_entrypoint(nir);
694 
695    nir_builder builder = nir_builder_create(impl);
696    nir_builder *b = &builder;
697 
698    lower_ps_state state = {
699       .options = options,
700       .has_dual_src_blending = options->dual_src_blend_swizzle,
701       .spi_shader_col_format = options->spi_shader_col_format,
702    };
703 
704    bool progress = nir_shader_intrinsics_pass(nir, lower_ps_intrinsic,
705                                               nir_metadata_control_flow, &state);
706    progress |= export_ps_outputs(b, &state);
707 
708    if (state.persp_centroid || state.linear_centroid) {
709       assert(progress);
710 
711       /* Must be after lower_ps_intrinsic() to prevent it lower added intrinsic here. */
712       init_interp_param(b, &state);
713 
714       /* Cleanup local variables, as RADV won't do this. */
715       NIR_PASS(_, nir, nir_lower_vars_to_ssa);
716    }
717 
718    return progress;
719 }
720