/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

/* This is a post-link lowering pass that lowers intrinsics to AMD-specific ones and thus breaks
 * shader_info gathering.
 *
 * It lowers output stores to exports and inserts the bc_optimize conditional.
 */

#include "ac_nir.h"
#include "sid.h"
#include "nir_builder.h"
#include "nir_builtin_builder.h"

typedef struct {
   const ac_nir_lower_ps_late_options *options;

   nir_variable *persp_center;
   nir_variable *persp_centroid;
   nir_variable *persp_sample;
   nir_variable *linear_center;
   nir_variable *linear_centroid;
   nir_variable *linear_sample;
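   /* True when bc_optimize replaces centroid barycentrics with the variables above;
    * load_barycentric_* intrinsics must then be rewritten to read from them.
    */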
   bool lower_load_barycentric;

   nir_def *color[MAX_DRAW_BUFFERS][4];
   nir_def *depth;
   nir_def *stencil;
   nir_def *sample_mask;

   uint8_t colors_written;
   nir_alu_type color_type[MAX_DRAW_BUFFERS];
   bool has_dual_src_blending;
   bool writes_all_cbufs;

   /* MAX_DRAW_BUFFERS for MRT export, 1 for MRTZ export */
   nir_intrinsic_instr *exp[MAX_DRAW_BUFFERS + 1];
   unsigned exp_num;

   unsigned compacted_mrt_index;
   unsigned spi_shader_col_format;
} lower_ps_state;

static void
create_interp_param(nir_builder *b, lower_ps_state *s)
{
   if (s->options->bc_optimize_for_persp) {
      s->persp_centroid =
         nir_local_variable_create(b->impl, glsl_vec_type(2), "persp_centroid");
   }

   if (s->options->bc_optimize_for_linear) {
      s->linear_centroid =
         nir_local_variable_create(b->impl, glsl_vec_type(2), "linear_centroid");
   }

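   /* Only the centroid variables are ever created here; the center/sample members
    * stay NULL, so only centroid loads end up being rewritten.
    */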
   s->lower_load_barycentric = s->persp_centroid || s->linear_centroid;
}

static void
init_interp_param(nir_builder *b, lower_ps_state *s)
{
   b->cursor = nir_before_cf_list(&b->impl->body);

   /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
    * The hw doesn't compute CENTROID if the whole wave only
    * contains fully-covered quads.
    */
   if (s->options->bc_optimize_for_persp || s->options->bc_optimize_for_linear) {
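      /* load_barycentric_optimize_amd exposes that PRIM_MASK bit as a boolean:
       * true means the hw didn't compute CENTROID and CENTER must be used instead.
       */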
      nir_def *bc_optimize = nir_load_barycentric_optimize_amd(b);

      if (s->options->bc_optimize_for_persp) {
         nir_def *center =
            nir_load_barycentric_pixel(b, 32, .interp_mode = INTERP_MODE_SMOOTH);
         nir_def *centroid =
            nir_load_barycentric_centroid(b, 32, .interp_mode = INTERP_MODE_SMOOTH);

         nir_def *value = nir_bcsel(b, bc_optimize, center, centroid);
         nir_store_var(b, s->persp_centroid, value, 0x3);
      }

      if (s->options->bc_optimize_for_linear) {
         nir_def *center =
            nir_load_barycentric_pixel(b, 32, .interp_mode = INTERP_MODE_NOPERSPECTIVE);
         nir_def *centroid =
            nir_load_barycentric_centroid(b, 32, .interp_mode = INTERP_MODE_NOPERSPECTIVE);

         nir_def *value = nir_bcsel(b, bc_optimize, center, centroid);
         nir_store_var(b, s->linear_centroid, value, 0x3);
      }
   }
}

static bool
lower_ps_load_barycentric(nir_builder *b, nir_intrinsic_instr *intrin, lower_ps_state *s)
{
   enum glsl_interp_mode mode = nir_intrinsic_interp_mode(intrin);
   nir_variable *var = NULL;

   switch (mode) {
   case INTERP_MODE_NONE:
   case INTERP_MODE_SMOOTH:
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_barycentric_pixel:
         var = s->persp_center;
         break;
      case nir_intrinsic_load_barycentric_centroid:
         var = s->persp_centroid;
         break;
      case nir_intrinsic_load_barycentric_sample:
         var = s->persp_sample;
         break;
      default:
         break;
      }
      break;

   case INTERP_MODE_NOPERSPECTIVE:
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_barycentric_pixel:
         var = s->linear_center;
         break;
      case nir_intrinsic_load_barycentric_centroid:
         var = s->linear_centroid;
         break;
      case nir_intrinsic_load_barycentric_sample:
         var = s->linear_sample;
         break;
      default:
         break;
      }
      break;

   default:
      break;
   }

   if (!var)
      return false;

   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *replacement = nir_load_var(b, var);
   nir_def_replace(&intrin->def, replacement);
   return true;
}

static bool
gather_ps_store_output(nir_builder *b, nir_intrinsic_instr *intrin, lower_ps_state *s)
{
   unsigned slot = nir_intrinsic_io_semantics(intrin).location;
   unsigned dual_src_blend_index = nir_intrinsic_io_semantics(intrin).dual_source_blend_index;
   unsigned write_mask = nir_intrinsic_write_mask(intrin);
   unsigned component = nir_intrinsic_component(intrin);
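   /* For dual-source blending, the store with dual_source_blend_index == 1 provides
    * the second blend source; it maps to the color slot right after the first source.
    */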
   unsigned color_index = (slot >= FRAG_RESULT_DATA0 ? slot - FRAG_RESULT_DATA0 : 0) +
                          dual_src_blend_index;
   nir_def *store_val = intrin->src[0].ssa;

   b->cursor = nir_before_instr(&intrin->instr);

   u_foreach_bit (i, write_mask) {
      nir_def *chan = nir_channel(b, store_val, i);
      unsigned comp = component + i;

      switch (slot) {
      case FRAG_RESULT_DEPTH:
         assert(comp == 0);
         s->depth = chan;
         break;
      case FRAG_RESULT_STENCIL:
         assert(comp == 0);
         s->stencil = chan;
         break;
      case FRAG_RESULT_SAMPLE_MASK:
         assert(comp == 0);
         s->sample_mask = chan;
         break;
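      /* FRAG_RESULT_COLOR (gl_FragColor) is broadcast to all color buffers;
       * the broadcast itself is handled at export time via writes_all_cbufs.
       */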
      case FRAG_RESULT_COLOR:
         s->color[color_index][comp] = chan;
         break;
      default:
         assert(slot >= FRAG_RESULT_DATA0 && slot <= FRAG_RESULT_DATA7);
         s->color[color_index][comp] = chan;
         break;
      }
   }

   if ((slot == FRAG_RESULT_COLOR || (slot >= FRAG_RESULT_DATA0 && slot <= FRAG_RESULT_DATA7)) &&
       write_mask) {
      s->colors_written |= BITFIELD_BIT(color_index);
      s->color_type[color_index] = nir_intrinsic_src_type(intrin);
      s->has_dual_src_blending |= dual_src_blend_index == 1;
      s->writes_all_cbufs |= slot == FRAG_RESULT_COLOR;
   }

   /* Remove the output store unless its export is deferred to a separate shader part
    * (no_color_export / no_depth_export), in which case the store must survive this pass.
    */
   if (!s->options->no_color_export && !s->options->no_depth_export) {
      nir_instr_remove(&intrin->instr);
   } else {
      if (slot >= FRAG_RESULT_DATA0 && !s->options->no_color_export) {
         nir_instr_remove(&intrin->instr);
      } else if ((slot == FRAG_RESULT_DEPTH || slot == FRAG_RESULT_STENCIL ||
                  slot == FRAG_RESULT_SAMPLE_MASK) && !s->options->no_depth_export) {
         nir_instr_remove(&intrin->instr);
      }
   }

   return true;
}

static bool
lower_ps_intrinsic(nir_builder *b, nir_instr *instr, void *state)
{
   lower_ps_state *s = (lower_ps_state *)state;

   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   switch (intrin->intrinsic) {
   case nir_intrinsic_store_output:
      return gather_ps_store_output(b, intrin, s);
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_centroid:
   case nir_intrinsic_load_barycentric_sample:
      if (s->lower_load_barycentric)
         return lower_ps_load_barycentric(b, intrin, s);
      break;
   default:
      break;
   }

   return false;
}

static void
emit_ps_mrtz_export(nir_builder *b, lower_ps_state *s, nir_def *mrtz_alpha)
{
   /* Skip the mrtz export if nothing was written to depth, stencil, sample mask or alpha. */
   if (!s->depth && !s->stencil && !s->sample_mask && !mrtz_alpha)
      return;

   unsigned format =
      ac_get_spi_shader_z_format(s->depth, s->stencil, s->sample_mask,
                                 s->options->alpha_to_coverage_via_mrtz);

   nir_def *undef = nir_undef(b, 1, 32);
   nir_def *outputs[4] = {undef, undef, undef, undef};
   unsigned write_mask = 0;
   unsigned flags = 0;

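   /* 16-bit Z format: stencil goes to the high half of the first 32-bit lane and the
    * coverage mask to the second lane. Before GFX11 this is a compressed export, which
    * consumes two write-mask bits per 32-bit lane.
    */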
   if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
      assert(!s->depth && !mrtz_alpha);

      if (s->options->gfx_level < GFX11)
         flags |= AC_EXP_FLAG_COMPRESSED;

      if (s->stencil) {
         outputs[0] = nir_ishl_imm(b, s->stencil, 16);
         write_mask |= s->options->gfx_level >= GFX11 ? 0x1 : 0x3;
      }

      if (s->sample_mask) {
         outputs[1] = s->sample_mask;
         write_mask |= s->options->gfx_level >= GFX11 ? 0x2 : 0xc;
      }
   } else {
      if (s->depth) {
         outputs[0] = s->depth;
         write_mask |= 0x1;
      }

      if (s->stencil) {
         assert(format == V_028710_SPI_SHADER_32_GR ||
                format == V_028710_SPI_SHADER_32_ABGR);
         outputs[1] = s->stencil;
         write_mask |= 0x2;
      }

      if (s->sample_mask) {
         assert(format == V_028710_SPI_SHADER_32_ABGR);
         outputs[2] = s->sample_mask;
         write_mask |= 0x4;
      }

      if (mrtz_alpha) {
         assert(format == V_028710_SPI_SHADER_32_AR ||
                format == V_028710_SPI_SHADER_32_ABGR);
         if (format == V_028710_SPI_SHADER_32_AR && s->options->gfx_level >= GFX10) {
            outputs[1] = mrtz_alpha;
            write_mask |= 0x2;
         } else {
            outputs[3] = mrtz_alpha;
            write_mask |= 0x8;
         }
      }
   }

   /* GFX6 (except OLAND and HAINAN) has a bug where it only looks at the
    * X write-mask component.
    */
   if (s->options->gfx_level == GFX6 &&
       s->options->family != CHIP_OLAND &&
       s->options->family != CHIP_HAINAN) {
      write_mask |= 0x1;
   }

   s->exp[s->exp_num++] = nir_export_amd(b, nir_vec(b, outputs, 4),
                                         .base = V_008DFC_SQ_EXP_MRTZ,
                                         .write_mask = write_mask,
                                         .flags = flags);
}

static unsigned
get_ps_color_export_target(lower_ps_state *s)
{
   unsigned target = V_008DFC_SQ_EXP_MRT + s->compacted_mrt_index;

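   /* With dual-source-blend swizzling (GFX11), the first two color exports go to the
    * dedicated dual-source-blend export targets (MRT + 21 and MRT + 22) instead of
    * MRT0/MRT1.
    */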
   if (s->options->dual_src_blend_swizzle && s->compacted_mrt_index < 2)
      target += 21;

   s->compacted_mrt_index++;

   return target;
}

static bool
emit_ps_color_export(nir_builder *b, lower_ps_state *s, unsigned output_index, unsigned mrt_index)
{
   assert(output_index < 8 && mrt_index < 8);

   unsigned spi_shader_col_format = (s->spi_shader_col_format >> (mrt_index * 4)) & 0xf;
   if (spi_shader_col_format == V_028714_SPI_SHADER_ZERO)
      return false;

   /* Get the target after the spi_shader_col_format check but before the colors_written
    * check: compacted_mrt_index must advance for every enabled color buffer, regardless
    * of whether the export is actually built.
    */
   unsigned target = get_ps_color_export_target(s);

   /* No one has written to this slot. */
   if (!(s->colors_written & BITFIELD_BIT(output_index)))
      return false;

   bool is_int8 = s->options->color_is_int8 & BITFIELD_BIT(mrt_index);
   bool is_int10 = s->options->color_is_int10 & BITFIELD_BIT(mrt_index);
   bool enable_mrt_output_nan_fixup =
      s->options->enable_mrt_output_nan_fixup & BITFIELD_BIT(mrt_index);

   nir_def *undef = nir_undef(b, 1, 32);
   nir_def *outputs[4] = {undef, undef, undef, undef};
   unsigned write_mask = 0;
   unsigned flags = 0;

   nir_alu_type type = s->color_type[output_index];
   nir_alu_type base_type = nir_alu_type_get_base_type(type);
   unsigned type_size = nir_alu_type_get_type_size(type);

   nir_def *data[4];
   memcpy(data, s->color[output_index], sizeof(data));

   /* Replace NaN by zero (for 32-bit float formats) to fix game bugs if requested. */
   if (enable_mrt_output_nan_fixup && type == nir_type_float32) {
      for (int i = 0; i < 4; i++) {
         if (data[i]) {
            nir_def *isnan = nir_fisnan(b, data[i]);
            data[i] = nir_bcsel(b, isnan, nir_imm_float(b, 0), data[i]);
         }
      }
   }

   switch (spi_shader_col_format) {
   case V_028714_SPI_SHADER_32_R:
      if (data[0]) {
         outputs[0] = nir_convert_to_bit_size(b, data[0], base_type, 32);
         write_mask = 0x1;
      }
      break;

   case V_028714_SPI_SHADER_32_GR:
      if (data[0]) {
         outputs[0] = nir_convert_to_bit_size(b, data[0], base_type, 32);
         write_mask |= 0x1;
      }

      if (data[1]) {
         outputs[1] = nir_convert_to_bit_size(b, data[1], base_type, 32);
         write_mask |= 0x2;
      }
      break;

   case V_028714_SPI_SHADER_32_AR:
      if (data[0]) {
         outputs[0] = nir_convert_to_bit_size(b, data[0], base_type, 32);
         write_mask |= 0x1;
      }

      if (data[3]) {
         unsigned index = s->options->gfx_level >= GFX10 ? 1 : 3;
         outputs[index] = nir_convert_to_bit_size(b, data[3], base_type, 32);
         write_mask |= BITFIELD_BIT(index);
      }
      break;

   case V_028714_SPI_SHADER_32_ABGR:
      for (int i = 0; i < 4; i++) {
         if (data[i]) {
            outputs[i] = nir_convert_to_bit_size(b, data[i], base_type, 32);
            write_mask |= BITFIELD_BIT(i);
         }
      }
      break;

   default: {
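      /* All remaining formats are 16-bit: each pair of components is packed into one
       * 32-bit export lane. 16-bit source data can use the raw pack_32_2x16; 32-bit
       * data needs one of the converting pack ops chosen below.
       */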
      nir_op pack_op = nir_op_pack_32_2x16;

      switch (spi_shader_col_format) {
      case V_028714_SPI_SHADER_FP16_ABGR:
         if (type_size == 32)
            pack_op = nir_op_pack_half_2x16_rtz_split;
         break;
      case V_028714_SPI_SHADER_UINT16_ABGR:
         if (type_size == 32) {
            pack_op = nir_op_pack_uint_2x16;
            if (is_int8 || is_int10) {
               /* clamp 32-bit output for 8/10-bit color components */
               uint32_t max_rgb = is_int8 ? 255 : 1023;

               for (int i = 0; i < 4; i++) {
                  if (!data[i])
                     continue;

                  uint32_t max_value = i == 3 && is_int10 ? 3 : max_rgb;
                  data[i] = nir_umin(b, data[i], nir_imm_int(b, max_value));
               }
            }
         }
         break;
      case V_028714_SPI_SHADER_SINT16_ABGR:
         if (type_size == 32) {
            pack_op = nir_op_pack_sint_2x16;
            if (is_int8 || is_int10) {
               /* clamp 32-bit output for 8/10-bit color components */
               uint32_t max_rgb = is_int8 ? 127 : 511;
               uint32_t min_rgb = is_int8 ? -128 : -512;

               for (int i = 0; i < 4; i++) {
                  if (!data[i])
                     continue;

                  uint32_t max_value = i == 3 && is_int10 ? 1 : max_rgb;
                  uint32_t min_value = i == 3 && is_int10 ? -2u : min_rgb;

                  data[i] = nir_imin(b, data[i], nir_imm_int(b, max_value));
                  data[i] = nir_imax(b, data[i], nir_imm_int(b, min_value));
               }
            }
         }
         break;
      case V_028714_SPI_SHADER_UNORM16_ABGR:
         pack_op = nir_op_pack_unorm_2x16;
         break;
      case V_028714_SPI_SHADER_SNORM16_ABGR:
         pack_op = nir_op_pack_snorm_2x16;
         break;
      default:
         unreachable("unsupported color export format");
         break;
      }

      for (int i = 0; i < 2; i++) {
         nir_def *lo = data[i * 2];
         nir_def *hi = data[i * 2 + 1];
         if (!lo && !hi)
            continue;

         lo = lo ? lo : nir_undef(b, 1, type_size);
         hi = hi ? hi : nir_undef(b, 1, type_size);

         if (nir_op_infos[pack_op].num_inputs == 2) {
            outputs[i] = nir_build_alu2(b, pack_op, lo, hi);
         } else {
            nir_def *vec = nir_vec2(b, lo, hi);
            outputs[i] = nir_build_alu1(b, pack_op, vec);
         }

         if (s->options->gfx_level >= GFX11)
            write_mask |= BITFIELD_BIT(i);
         else
            write_mask |= 0x3 << (i * 2);
      }

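      /* Packed 16-bit exports are "compressed" before GFX11, which is also why they
       * use two write-mask bits per packed lane above.
       */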
      if (s->options->gfx_level < GFX11)
         flags |= AC_EXP_FLAG_COMPRESSED;
   }
   }

   s->exp[s->exp_num++] = nir_export_amd(b, nir_vec(b, outputs, 4),
                                         .base = target,
                                         .write_mask = write_mask,
                                         .flags = flags);
   return true;
}

static void
emit_ps_dual_src_blend_swizzle(nir_builder *b, lower_ps_state *s, unsigned first_color_export)
{
   assert(s->exp_num > first_color_export + 1);

   nir_intrinsic_instr *mrt0_exp = s->exp[first_color_export];
   nir_intrinsic_instr *mrt1_exp = s->exp[first_color_export + 1];

   /* Instructions that compute mrt1_exp's argument may sit between mrt0_exp and
    * mrt1_exp. Move the two exports next to each other so that we can swizzle
    * their arguments.
    */
   unsigned target0 = nir_intrinsic_base(mrt0_exp);
   unsigned target1 = nir_intrinsic_base(mrt1_exp);
   if (target0 > target1) {
      /* The mrt0 export comes after the mrt1 export. This happens when src0 is
       * missing, so mrt1 is emitted first, followed by an empty mrt0.
       *
       * Swap the pointers.
       */
      nir_intrinsic_instr *tmp = mrt0_exp;
      mrt0_exp = mrt1_exp;
      mrt1_exp = tmp;

      /* Move mrt1_exp down to after mrt0_exp. */
      nir_instr_move(nir_after_instr(&mrt0_exp->instr), &mrt1_exp->instr);
   } else {
      /* Move mrt0_exp down to before mrt1_exp. */
      nir_instr_move(nir_before_instr(&mrt1_exp->instr), &mrt0_exp->instr);
   }

   uint32_t mrt0_write_mask = nir_intrinsic_write_mask(mrt0_exp);
   uint32_t mrt1_write_mask = nir_intrinsic_write_mask(mrt1_exp);
   uint32_t write_mask = mrt0_write_mask & mrt1_write_mask;

   nir_def *mrt0_arg = mrt0_exp->src[0].ssa;
   nir_def *mrt1_arg = mrt1_exp->src[0].ssa;

   /* The swizzle code goes right before mrt0_exp. */
   b->cursor = nir_before_instr(&mrt0_exp->instr);

   /* ACO needs to emit the swizzle code as a pseudo-instruction. */
   if (s->options->use_aco) {
      nir_export_dual_src_blend_amd(b, mrt0_arg, mrt1_arg, .write_mask = write_mask);
      nir_instr_remove(&mrt0_exp->instr);
      nir_instr_remove(&mrt1_exp->instr);
      return;
   }

   nir_def *undef = nir_undef(b, 1, 32);
   nir_def *arg0_vec[4] = {undef, undef, undef, undef};
   nir_def *arg1_vec[4] = {undef, undef, undef, undef};

   /* For illustration, originally
    * lane0 exports arg00 and arg01,
    * lane1 exports arg10 and arg11.
    *
    * After the following operation
    * lane0 exports arg00 and arg10,
    * lane1 exports arg01 and arg11.
    */
   u_foreach_bit (i, write_mask) {
      nir_def *arg0 = nir_channel(b, mrt0_arg, i);
      nir_def *arg1 = nir_channel(b, mrt1_arg, i);

      /* swap odd,even lanes of arg0 */
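      /* The swizzle mask encodes a quad permutation, two bits per lane:
       * 0b10110001 selects lanes (1, 0, 3, 2), i.e. it swaps adjacent lanes.
       */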
      arg0 = nir_quad_swizzle_amd(b, arg0, .swizzle_mask = 0b10110001, .fetch_inactive = true);

      /* swap even lanes between arg0 and arg1 */
      nir_def *tid = nir_load_subgroup_invocation(b);
      nir_def *is_even = nir_ieq_imm(b, nir_iand_imm(b, tid, 1), 0);

      nir_def *tmp = arg0;
      arg0 = nir_bcsel(b, is_even, arg1, arg0);
      arg1 = nir_bcsel(b, is_even, tmp, arg1);

      /* swap odd,even lanes again for arg0 */
      arg0 = nir_quad_swizzle_amd(b, arg0, .swizzle_mask = 0b10110001, .fetch_inactive = true);

      arg0_vec[i] = arg0;
      arg1_vec[i] = arg1;
   }

   nir_src_rewrite(&mrt0_exp->src[0], nir_vec(b, arg0_vec, 4));
   nir_src_rewrite(&mrt1_exp->src[0], nir_vec(b, arg1_vec, 4));

   nir_intrinsic_set_write_mask(mrt0_exp, write_mask);
   nir_intrinsic_set_write_mask(mrt1_exp, write_mask);
}

static void
emit_ps_null_export(nir_builder *b, lower_ps_state *s)
{
   const bool pops = b->shader->info.fs.sample_interlock_ordered ||
                     b->shader->info.fs.sample_interlock_unordered ||
                     b->shader->info.fs.pixel_interlock_ordered ||
                     b->shader->info.fs.pixel_interlock_unordered;

   /* Gfx10+ doesn't need to export anything if we don't need to export the EXEC mask
    * for discard.
    * In Primitive Ordered Pixel Shading, however, GFX11+ explicitly uses the `done` export to
    * exit the ordered section, and before GFX11, shaders with POPS also need an export.
    */
   if (s->options->gfx_level >= GFX10 && !s->options->uses_discard && !pops)
      return;

   /* The `done` export exits the POPS ordered section on GFX11+. Make sure UniformMemory and
    * ImageMemory (in SPIR-V terms) accesses from the ordered section are not reordered below it.
    */
   if (s->options->gfx_level >= GFX11 && pops)
      nir_scoped_memory_barrier(b, SCOPE_QUEUE_FAMILY, NIR_MEMORY_RELEASE,
                                nir_var_image | nir_var_mem_ubo | nir_var_mem_ssbo |
                                nir_var_mem_global);

   /* Gfx11 doesn't support null exports; mrt0 should be exported instead. */
   unsigned target = s->options->gfx_level >= GFX11 ?
      V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;

   nir_intrinsic_instr *intrin =
      nir_export_amd(b, nir_undef(b, 4, 32),
                     .base = target,
                     .flags = AC_EXP_FLAG_VALID_MASK | AC_EXP_FLAG_DONE);
   /* The builder would otherwise set the write mask to 0xf. */
   nir_intrinsic_set_write_mask(intrin, 0);
}

static void
export_ps_outputs(nir_builder *b, lower_ps_state *s)
{
   nir_def *mrtz_alpha = NULL;

   b->cursor = nir_after_impl(b->impl);

   /* Capture alpha for MRTZ before alpha-to-one replaces it: alpha-to-coverage
    * must see the original alpha.
    */
   if (!s->options->no_depth_export && s->options->alpha_to_coverage_via_mrtz)
      mrtz_alpha = s->color[0][3];

   u_foreach_bit (slot, s->colors_written) {
      if (s->options->alpha_to_one)
         s->color[slot][3] = nir_imm_floatN_t(b, 1, nir_alu_type_get_type_size(s->color_type[slot]));
   }

   if (!s->options->no_depth_export)
      emit_ps_mrtz_export(b, s, mrtz_alpha);

   /* With non-monolithic shaders, RADV exports mrtz in the main part (except on
    * RDNA3 for alpha-to-coverage) and exports colors in the epilog.
    */
   if (s->options->no_color_export)
      return;

   unsigned first_color_export = s->exp_num;

   /* If one of the two dual-source-blending outputs is missing, add its export
    * manually. The export is generated with undef values.
    */
   if (s->has_dual_src_blending) {
      switch (s->colors_written) {
      case BITFIELD_BIT(0):
         s->colors_written |= BITFIELD_BIT(1);
         s->color_type[1] = s->color_type[0];
         s->spi_shader_col_format |= (s->spi_shader_col_format & 0xf) << 4;
         break;

      case BITFIELD_BIT(1):
         s->colors_written |= BITFIELD_BIT(0);
         s->color_type[0] = s->color_type[1];
         s->spi_shader_col_format |= (s->spi_shader_col_format & 0xf0) >> 4;
         break;

      case BITFIELD_RANGE(0, 2):
         break;

      default:
         unreachable("unexpected number of color outputs for dual source blending");
      }
   }

   if (s->writes_all_cbufs && s->colors_written == 0x1) {
      /* This does nothing for color buffers with SPI_SHADER_COL_FORMAT=ZERO, so always
       * iterate over all 8.
       */
      for (int cbuf = 0; cbuf < 8; cbuf++)
         emit_ps_color_export(b, s, 0, cbuf);
   } else {
      for (int cbuf = 0; cbuf < MAX_DRAW_BUFFERS; cbuf++)
         emit_ps_color_export(b, s, cbuf, cbuf);
   }

   if (s->exp_num) {
      if (s->options->dual_src_blend_swizzle) {
         emit_ps_dual_src_blend_swizzle(b, s, first_color_export);
         /* Skip setting the last-export flags because the exports have been replaced
          * by a pseudo-instruction.
          */
         if (s->options->use_aco)
            return;
      }

      /* Specify that this is the last export. */
      nir_intrinsic_instr *final_exp = s->exp[s->exp_num - 1];
      unsigned final_exp_flags = nir_intrinsic_flags(final_exp);
      final_exp_flags |= AC_EXP_FLAG_DONE | AC_EXP_FLAG_VALID_MASK;
      nir_intrinsic_set_flags(final_exp, final_exp_flags);

      /* The `done` export exits the POPS ordered section on GFX11+. Make sure UniformMemory
       * and ImageMemory (in SPIR-V terms) accesses from the ordered section are not reordered
       * below it.
       */
      if (s->options->gfx_level >= GFX11 &&
          (b->shader->info.fs.sample_interlock_ordered ||
           b->shader->info.fs.sample_interlock_unordered ||
           b->shader->info.fs.pixel_interlock_ordered ||
           b->shader->info.fs.pixel_interlock_unordered)) {
         b->cursor = nir_before_instr(&final_exp->instr);
         nir_scoped_memory_barrier(b, SCOPE_QUEUE_FAMILY, NIR_MEMORY_RELEASE,
                                   nir_var_image | nir_var_mem_ubo | nir_var_mem_ssbo |
                                   nir_var_mem_global);
      }
   } else {
      emit_ps_null_export(b, s);
   }
}

void
ac_nir_lower_ps_late(nir_shader *nir, const ac_nir_lower_ps_late_options *options)
{
   assert(nir->info.stage == MESA_SHADER_FRAGMENT);
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_builder builder = nir_builder_create(impl);
   nir_builder *b = &builder;

   lower_ps_state state = {
      .options = options,
      .has_dual_src_blending = options->dual_src_blend_swizzle,
      .spi_shader_col_format = options->spi_shader_col_format,
   };

   create_interp_param(b, &state);

   nir_shader_instructions_pass(nir, lower_ps_intrinsic,
                                nir_metadata_control_flow,
                                &state);

   /* This must run after lower_ps_intrinsic() so that the barycentric intrinsics
    * added here are not lowered by it.
    */
   init_interp_param(b, &state);

   export_ps_outputs(b, &state);

   /* Clean up the NIR variables, as RADV won't do this. */
   if (state.lower_load_barycentric)
      nir_lower_vars_to_ssa(nir);
}