/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "ac_nir.h"
#include "sid.h"
#include "nir_builder.h"
#include "nir_builtin_builder.h"

typedef struct {
   const ac_nir_lower_ps_options *options;

   nir_variable *persp_center;
   nir_variable *persp_centroid;
   nir_variable *persp_sample;
   nir_variable *linear_center;
   nir_variable *linear_centroid;
   nir_variable *linear_sample;
   bool lower_load_barycentric;

   /* Add one slot for the second output of dual source blending. */
   nir_def *outputs[FRAG_RESULT_MAX + 1][4];
   nir_alu_type output_types[FRAG_RESULT_MAX + 1];

   /* MAX_DRAW_BUFFERS for MRT exports, 1 for the MRTZ export */
   nir_intrinsic_instr *exp[MAX_DRAW_BUFFERS + 1];
   unsigned exp_num;

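   /* Index of the next color export, counting only color buffers whose
    * spi_shader_col_format is not ZERO (hence "compacted").
    */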
   unsigned compacted_mrt_index;
} lower_ps_state;

#define DUAL_SRC_BLEND_SLOT FRAG_RESULT_MAX

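/* Create the local variables that will hold the barycentric coordinates
 * overriding the ones loaded by the shader when the pipeline state forces
 * a specific interpolation mode.
 */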
static void
create_interp_param(nir_builder *b, lower_ps_state *s)
{
   if (s->options->force_persp_sample_interp) {
      s->persp_center =
         nir_local_variable_create(b->impl, glsl_vec_type(2), "persp_center");
   }

   if (s->options->bc_optimize_for_persp ||
       s->options->force_persp_sample_interp ||
       s->options->force_persp_center_interp) {
      s->persp_centroid =
         nir_local_variable_create(b->impl, glsl_vec_type(2), "persp_centroid");
   }

   if (s->options->force_persp_center_interp) {
      s->persp_sample =
         nir_local_variable_create(b->impl, glsl_vec_type(2), "persp_sample");
   }

   if (s->options->force_linear_sample_interp) {
      s->linear_center =
         nir_local_variable_create(b->impl, glsl_vec_type(2), "linear_center");
   }

   if (s->options->bc_optimize_for_linear ||
       s->options->force_linear_sample_interp ||
       s->options->force_linear_center_interp) {
      s->linear_centroid =
         nir_local_variable_create(b->impl, glsl_vec_type(2), "linear_centroid");
   }

   if (s->options->force_linear_center_interp) {
      s->linear_sample =
         nir_local_variable_create(b->impl, glsl_vec_type(2), "linear_sample");
   }

   s->lower_load_barycentric =
      s->persp_center || s->persp_centroid || s->persp_sample ||
      s->linear_center || s->linear_centroid || s->linear_sample;
}

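/* At the start of the shader, compute the forced barycentric values and
 * store them into the variables created by create_interp_param().
 */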
static void
init_interp_param(nir_builder *b, lower_ps_state *s)
{
   b->cursor = nir_before_cf_list(&b->impl->body);

   /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
    * The hw doesn't compute CENTROID if the whole wave only
    * contains fully-covered quads.
    */
   if (s->options->bc_optimize_for_persp || s->options->bc_optimize_for_linear) {
      nir_def *bc_optimize = nir_load_barycentric_optimize_amd(b);

      if (s->options->bc_optimize_for_persp) {
         nir_def *center =
            nir_load_barycentric_pixel(b, 32, .interp_mode = INTERP_MODE_SMOOTH);
         nir_def *centroid =
            nir_load_barycentric_centroid(b, 32, .interp_mode = INTERP_MODE_SMOOTH);

         nir_def *value = nir_bcsel(b, bc_optimize, center, centroid);
         nir_store_var(b, s->persp_centroid, value, 0x3);
      }

      if (s->options->bc_optimize_for_linear) {
         nir_def *center =
            nir_load_barycentric_pixel(b, 32, .interp_mode = INTERP_MODE_NOPERSPECTIVE);
         nir_def *centroid =
            nir_load_barycentric_centroid(b, 32, .interp_mode = INTERP_MODE_NOPERSPECTIVE);

         nir_def *value = nir_bcsel(b, bc_optimize, center, centroid);
         nir_store_var(b, s->linear_centroid, value, 0x3);
      }
   }

   if (s->options->force_persp_sample_interp) {
      nir_def *sample =
         nir_load_barycentric_sample(b, 32, .interp_mode = INTERP_MODE_SMOOTH);
      nir_store_var(b, s->persp_center, sample, 0x3);
      nir_store_var(b, s->persp_centroid, sample, 0x3);
   }

   if (s->options->force_linear_sample_interp) {
      nir_def *sample =
         nir_load_barycentric_sample(b, 32, .interp_mode = INTERP_MODE_NOPERSPECTIVE);
      nir_store_var(b, s->linear_center, sample, 0x3);
      nir_store_var(b, s->linear_centroid, sample, 0x3);
   }

   if (s->options->force_persp_center_interp) {
      nir_def *center =
         nir_load_barycentric_pixel(b, 32, .interp_mode = INTERP_MODE_SMOOTH);
      nir_store_var(b, s->persp_sample, center, 0x3);
      nir_store_var(b, s->persp_centroid, center, 0x3);
   }

   if (s->options->force_linear_center_interp) {
      nir_def *center =
         nir_load_barycentric_pixel(b, 32, .interp_mode = INTERP_MODE_NOPERSPECTIVE);
      nir_store_var(b, s->linear_sample, center, 0x3);
      nir_store_var(b, s->linear_centroid, center, 0x3);
   }
}

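/* Redirect a load_barycentric intrinsic to the matching override variable
 * created in create_interp_param(), if any.
 */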
static bool
lower_ps_load_barycentric(nir_builder *b, nir_intrinsic_instr *intrin, lower_ps_state *s)
{
   enum glsl_interp_mode mode = nir_intrinsic_interp_mode(intrin);
   nir_variable *var = NULL;

   switch (mode) {
   case INTERP_MODE_NONE:
   case INTERP_MODE_SMOOTH:
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_barycentric_pixel:
         var = s->persp_center;
         break;
      case nir_intrinsic_load_barycentric_centroid:
         var = s->persp_centroid;
         break;
      case nir_intrinsic_load_barycentric_sample:
         var = s->persp_sample;
         break;
      default:
         break;
      }
      break;

   case INTERP_MODE_NOPERSPECTIVE:
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_barycentric_pixel:
         var = s->linear_center;
         break;
      case nir_intrinsic_load_barycentric_centroid:
         var = s->linear_centroid;
         break;
      case nir_intrinsic_load_barycentric_sample:
         var = s->linear_sample;
         break;
      default:
         break;
      }
      break;

   default:
      break;
   }

   if (!var)
      return false;

   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *replacement = nir_load_var(b, var);
   nir_def_rewrite_uses(&intrin->def, replacement);

   nir_instr_remove(&intrin->instr);
   return true;
}

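/* Record the values written by store_output into s->outputs, per slot and
 * component, for the export code below.
 */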
static bool
gather_ps_store_output(nir_builder *b, nir_intrinsic_instr *intrin, lower_ps_state *s)
{
   nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
   unsigned write_mask = nir_intrinsic_write_mask(intrin);
   unsigned component = nir_intrinsic_component(intrin);
   nir_alu_type type = nir_intrinsic_src_type(intrin);
   nir_def *store_val = intrin->src[0].ssa;

   b->cursor = nir_before_instr(&intrin->instr);

   unsigned slot = sem.dual_source_blend_index ?
      DUAL_SRC_BLEND_SLOT : sem.location;

   u_foreach_bit (i, write_mask) {
      unsigned comp = component + i;
      s->outputs[slot][comp] = nir_channel(b, store_val, i);
   }

   /* All components of a slot should have the same type. */
   assert(s->output_types[slot] == nir_type_invalid || s->output_types[slot] == type);

   s->output_types[slot] = type;

   /* Remove the store_output instruction only if this pass emits its export;
    * otherwise keep it.
    */
   if (!s->options->no_color_export && !s->options->no_depth_export) {
      nir_instr_remove(&intrin->instr);
   } else {
      if (slot >= FRAG_RESULT_DATA0 && !s->options->no_color_export) {
         nir_instr_remove(&intrin->instr);
      } else if ((slot == FRAG_RESULT_DEPTH || slot == FRAG_RESULT_STENCIL ||
                  slot == FRAG_RESULT_SAMPLE_MASK) && !s->options->no_depth_export) {
         nir_instr_remove(&intrin->instr);
      }
   }

   return true;
}

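/* Mask gl_SampleMaskIn down to the current sample when the shader runs at
 * sample rate (ps_iter_samples > 1); see the spec quote below.
 */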
static bool
lower_ps_load_sample_mask_in(nir_builder *b, nir_intrinsic_instr *intrin, lower_ps_state *s)
{
   /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
    * says:
    *
    *    "When per-sample shading is active due to the use of a fragment
    *     input qualified by sample or due to the use of the gl_SampleID
    *     or gl_SamplePosition variables, only the bit for the current
    *     sample is set in gl_SampleMaskIn. When state specifies multiple
    *     fragment shader invocations for a given fragment, the sample
    *     mask for any single fragment shader invocation may specify a
    *     subset of the covered samples for the fragment. In this case,
    *     the bit corresponding to each covered sample will be set in
    *     exactly one fragment shader invocation."
    *
    * The samplemask loaded by hardware is always the coverage of the
    * entire pixel/fragment, so mask bits out based on the sample ID.
    */

   b->cursor = nir_before_instr(&intrin->instr);

   uint32_t ps_iter_mask = ac_get_ps_iter_mask(s->options->ps_iter_samples);
   nir_def *sampleid = nir_load_sample_id(b);
   nir_def *submask = nir_ishl(b, nir_imm_int(b, ps_iter_mask), sampleid);

   nir_def *sample_mask = nir_load_sample_mask_in(b);
   nir_def *replacement = nir_iand(b, sample_mask, submask);

   nir_def_rewrite_uses(&intrin->def, replacement);

   nir_instr_remove(&intrin->instr);
   return true;
}

static bool
lower_ps_intrinsic(nir_builder *b, nir_instr *instr, void *state)
{
   lower_ps_state *s = (lower_ps_state *)state;

   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   switch (intrin->intrinsic) {
   case nir_intrinsic_store_output:
      return gather_ps_store_output(b, intrin, s);
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_centroid:
   case nir_intrinsic_load_barycentric_sample:
      if (s->lower_load_barycentric)
         return lower_ps_load_barycentric(b, intrin, s);
      break;
   case nir_intrinsic_load_sample_mask_in:
      if (s->options->ps_iter_samples > 1)
         return lower_ps_load_sample_mask_in(b, intrin, s);
      break;
   default:
      break;
   }

   return false;
}

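/* Apply color clamping, alpha-to-one and the alpha test to the gathered
 * color outputs before they are exported.
 */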
static void
emit_ps_color_clamp_and_alpha_test(nir_builder *b, lower_ps_state *s)
{
   uint32_t color_mask =
      BITFIELD_BIT(FRAG_RESULT_COLOR) |
      BITFIELD_RANGE(FRAG_RESULT_DATA0, MAX_DRAW_BUFFERS);
   uint32_t color_outputs =
      (b->shader->info.outputs_written & color_mask) |
      /* Both dual source blend outputs use the FRAG_RESULT_DATA0 slot in nir,
       * but lower_ps_state uses an extra slot number for the second output.
       */
      BITFIELD_BIT(DUAL_SRC_BLEND_SLOT);

   u_foreach_bit (slot, color_outputs) {
      if (s->options->clamp_color) {
         for (int i = 0; i < 4; i++) {
            if (s->outputs[slot][i])
               s->outputs[slot][i] = nir_fsat(b, s->outputs[slot][i]);
         }
      }

      if (s->options->alpha_to_one) {
         /* Only if something has been written to this slot. */
         if (s->output_types[slot] != nir_type_invalid) {
            unsigned bit_size = nir_alu_type_get_type_size(s->output_types[slot]);
            s->outputs[slot][3] = nir_imm_floatN_t(b, 1, bit_size);
         }
      }

      if (slot == FRAG_RESULT_COLOR || slot == FRAG_RESULT_DATA0) {
         if (s->options->alpha_func == COMPARE_FUNC_ALWAYS) {
            /* always pass, do nothing */
         } else if (s->options->alpha_func == COMPARE_FUNC_NEVER) {
            nir_discard(b);
         } else if (s->outputs[slot][3]) {
            nir_def *ref = nir_load_alpha_reference_amd(b);
            nir_def *cond =
               nir_compare_func(b, s->options->alpha_func, s->outputs[slot][3], ref);
            nir_discard_if(b, nir_inot(b, cond));
         }
      }
   }
}

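/* Export depth, stencil and sample mask (plus alpha when alpha-to-coverage
 * is done via MRTZ) through the single MRTZ export.
 */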
static void
emit_ps_mrtz_export(nir_builder *b, lower_ps_state *s)
{
   uint64_t outputs_written = b->shader->info.outputs_written;

   nir_def *mrtz_alpha = NULL;
   if (s->options->alpha_to_coverage_via_mrtz) {
      mrtz_alpha = s->outputs[FRAG_RESULT_COLOR][3] ?
         s->outputs[FRAG_RESULT_COLOR][3] :
         s->outputs[FRAG_RESULT_DATA0][3];
   }

   nir_def *depth = s->outputs[FRAG_RESULT_DEPTH][0];
   nir_def *stencil = s->outputs[FRAG_RESULT_STENCIL][0];
   nir_def *sample_mask = s->outputs[FRAG_RESULT_SAMPLE_MASK][0];

   if (s->options->kill_samplemask) {
      sample_mask = NULL;
      outputs_written &= ~BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
   }

   /* Skip the mrtz export if nothing has been written to any of its channels. */
   if (!depth && !stencil && !sample_mask && !mrtz_alpha)
      return;

   /* Determine the export format from outputs_written rather than from the
    * actual store_output instructions (which may have been optimized out),
    * because outputs_written is also what sets R_028710_SPI_SHADER_Z_FORMAT.
    */
   unsigned format =
      ac_get_spi_shader_z_format(outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH),
                                 outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL),
                                 outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK),
                                 s->options->alpha_to_coverage_via_mrtz);

   nir_def *undef = nir_undef(b, 1, 32);
   nir_def *outputs[4] = {undef, undef, undef, undef};
   unsigned write_mask = 0;
   unsigned flags = 0;

   if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
      assert(!depth && !mrtz_alpha);

      if (s->options->gfx_level < GFX11)
         flags |= AC_EXP_FLAG_COMPRESSED;

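      /* Pre-GFX11 this is a compressed export, where each writemask bit
       * appears to select a 16-bit half, so every value occupies two bits;
       * on GFX11 each value occupies a single 32-bit channel.
       */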
      if (stencil) {
         outputs[0] = nir_ishl_imm(b, stencil, 16);
         write_mask |= s->options->gfx_level >= GFX11 ? 0x1 : 0x3;
      }

      if (sample_mask) {
         outputs[1] = sample_mask;
         write_mask |= s->options->gfx_level >= GFX11 ? 0x2 : 0xc;
      }
   } else {
      if (depth) {
         outputs[0] = depth;
         write_mask |= 0x1;
      }

      if (stencil) {
         outputs[1] = stencil;
         write_mask |= 0x2;
      }

      if (sample_mask) {
         outputs[2] = sample_mask;
         write_mask |= 0x4;
      }

      if (mrtz_alpha) {
         outputs[3] = mrtz_alpha;
         write_mask |= 0x8;
      }
   }

   /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the
    * X writemask component.
    */
   if (s->options->gfx_level == GFX6 &&
       s->options->family != CHIP_OLAND &&
       s->options->family != CHIP_HAINAN) {
      write_mask |= 0x1;
   }

   s->exp[s->exp_num++] = nir_export_amd(b, nir_vec(b, outputs, 4),
                                         .base = V_008DFC_SQ_EXP_MRTZ,
                                         .write_mask = write_mask,
                                         .flags = flags);
}

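/* Return the hardware export target for the next enabled color buffer. */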
static unsigned
get_ps_color_export_target(lower_ps_state *s)
{
   unsigned target = V_008DFC_SQ_EXP_MRT + s->compacted_mrt_index;

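   /* With dual_src_blend_swizzle (GFX11), the first two exports go to
    * targets 21 and 22 instead, presumably the dedicated dual source blend
    * targets.
    */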
   if (s->options->dual_src_blend_swizzle && s->compacted_mrt_index < 2)
      target += 21;

   s->compacted_mrt_index++;

   return target;
}

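/* Convert and pack one color output according to its spi_shader_col_format
 * and emit the export. Returns false if nothing was exported.
 */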
static bool
emit_ps_color_export(nir_builder *b, lower_ps_state *s, gl_frag_result slot, unsigned cbuf)
{
   assert(cbuf < 8);

   unsigned spi_shader_col_format = (s->options->spi_shader_col_format >> (cbuf * 4)) & 0xf;
   if (spi_shader_col_format == V_028714_SPI_SHADER_ZERO)
      return false;

   /* Get the target after checking spi_shader_col_format but before the
    * output-type check below, because compacted_mrt_index must advance for
    * every enabled color buffer regardless of whether an export is built.
    */
   unsigned target = get_ps_color_export_target(s);

   nir_alu_type type = s->output_types[slot];
   /* Nothing has been written to this slot. */
   if (type == nir_type_invalid)
      return false;

   bool is_int8 = s->options->color_is_int8 & BITFIELD_BIT(cbuf);
   bool is_int10 = s->options->color_is_int10 & BITFIELD_BIT(cbuf);
   bool enable_mrt_output_nan_fixup =
      s->options->enable_mrt_output_nan_fixup & BITFIELD_BIT(cbuf);

   nir_def *undef = nir_undef(b, 1, 32);
   nir_def *outputs[4] = {undef, undef, undef, undef};
   unsigned write_mask = 0;
   unsigned flags = 0;

   nir_alu_type base_type = nir_alu_type_get_base_type(type);
   unsigned type_size = nir_alu_type_get_type_size(type);

   nir_def *data[4];
   memcpy(data, s->outputs[slot], sizeof(data));

   /* Replace NaN by zero (for 32-bit float formats) to fix game bugs if requested. */
   if (enable_mrt_output_nan_fixup && type == nir_type_float32) {
      for (int i = 0; i < 4; i++) {
         if (data[i]) {
            nir_def *isnan = nir_fisnan(b, data[i]);
            data[i] = nir_bcsel(b, isnan, nir_imm_float(b, 0), data[i]);
         }
      }
   }

   switch (spi_shader_col_format) {
   case V_028714_SPI_SHADER_32_R:
      if (!data[0])
         return false;

      outputs[0] = nir_convert_to_bit_size(b, data[0], base_type, 32);
      write_mask = 0x1;
      break;

   case V_028714_SPI_SHADER_32_GR:
      if (!data[0] && !data[1])
         return false;

      if (data[0]) {
         outputs[0] = nir_convert_to_bit_size(b, data[0], base_type, 32);
         write_mask |= 0x1;
      }

      if (data[1]) {
         outputs[1] = nir_convert_to_bit_size(b, data[1], base_type, 32);
         write_mask |= 0x2;
      }
      break;

   case V_028714_SPI_SHADER_32_AR:
      if (!data[0] && !data[3])
         return false;

      if (data[0]) {
         outputs[0] = nir_convert_to_bit_size(b, data[0], base_type, 32);
         write_mask |= 0x1;
      }

      if (data[3]) {
         unsigned index = s->options->gfx_level >= GFX10 ? 1 : 3;
         outputs[index] = nir_convert_to_bit_size(b, data[3], base_type, 32);
         write_mask |= BITFIELD_BIT(index);
      }
      break;

   case V_028714_SPI_SHADER_32_ABGR:
      for (int i = 0; i < 4; i++) {
         if (data[i]) {
            outputs[i] = nir_convert_to_bit_size(b, data[i], base_type, 32);
            write_mask |= BITFIELD_BIT(i);
         }
      }
      break;

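   /* The remaining formats are 16-bit: each pair of components is packed
    * into one 32-bit export channel.
    */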
   default: {
      nir_op pack_op = nir_op_pack_32_2x16;

      switch (spi_shader_col_format) {
      case V_028714_SPI_SHADER_FP16_ABGR:
         if (type_size == 32)
            pack_op = nir_op_pack_half_2x16;
         break;
      case V_028714_SPI_SHADER_UINT16_ABGR:
         if (type_size == 32) {
            pack_op = nir_op_pack_uint_2x16;
            if (is_int8 || is_int10) {
               /* Clamp 32-bit outputs to the range of 8/10-bit color components. */
               uint32_t max_rgb = is_int8 ? 255 : 1023;

               for (int i = 0; i < 4; i++) {
                  if (!data[i])
                     continue;

                  uint32_t max_value = i == 3 && is_int10 ? 3 : max_rgb;
                  data[i] = nir_umin(b, data[i], nir_imm_int(b, max_value));
               }
            }
         }
         break;
      case V_028714_SPI_SHADER_SINT16_ABGR:
         if (type_size == 32) {
            pack_op = nir_op_pack_sint_2x16;
            if (is_int8 || is_int10) {
               /* Clamp 32-bit outputs to the range of 8/10-bit color components. */
               uint32_t max_rgb = is_int8 ? 127 : 511;
               uint32_t min_rgb = is_int8 ? -128 : -512;

               for (int i = 0; i < 4; i++) {
                  if (!data[i])
                     continue;

                  uint32_t max_value = i == 3 && is_int10 ? 1 : max_rgb;
                  uint32_t min_value = i == 3 && is_int10 ? -2u : min_rgb;

                  data[i] = nir_imin(b, data[i], nir_imm_int(b, max_value));
                  data[i] = nir_imax(b, data[i], nir_imm_int(b, min_value));
               }
            }
         }
         break;
      case V_028714_SPI_SHADER_UNORM16_ABGR:
         pack_op = nir_op_pack_unorm_2x16;
         break;
      case V_028714_SPI_SHADER_SNORM16_ABGR:
         pack_op = nir_op_pack_snorm_2x16;
         break;
      default:
         unreachable("unsupported color export format");
         break;
      }

      for (int i = 0; i < 2; i++) {
         nir_def *lo = data[i * 2];
         nir_def *hi = data[i * 2 + 1];
         if (!lo && !hi)
            continue;

         lo = lo ? lo : nir_undef(b, 1, type_size);
         hi = hi ? hi : nir_undef(b, 1, type_size);
         nir_def *vec = nir_vec2(b, lo, hi);

         outputs[i] = nir_build_alu1(b, pack_op, vec);

         if (s->options->gfx_level >= GFX11)
            write_mask |= BITFIELD_BIT(i);
         else
            write_mask |= 0x3 << (i * 2);
      }

      if (s->options->gfx_level < GFX11)
         flags |= AC_EXP_FLAG_COMPRESSED;
   }
   }

   s->exp[s->exp_num++] = nir_export_amd(b, nir_vec(b, outputs, 4),
                                         .base = target,
                                         .write_mask = write_mask,
                                         .flags = flags);
   return true;
}

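/* With dual_src_blend_swizzle, swizzle the mrt0/mrt1 export arguments across
 * even/odd lanes as the hardware expects (see the illustration further down),
 * either via an ACO pseudo instruction or via explicit lane swizzles.
 */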
static void
emit_ps_dual_src_blend_swizzle(nir_builder *b, lower_ps_state *s, unsigned first_color_export)
{
   assert(s->exp_num > first_color_export + 1);

   nir_intrinsic_instr *mrt0_exp = s->exp[first_color_export];
   nir_intrinsic_instr *mrt1_exp = s->exp[first_color_export + 1];

   /* Instructions that compute mrt1_exp's argument may sit between mrt0_exp
    * and mrt1_exp. Move mrt0_exp next to mrt1_exp so that we can swizzle
    * their arguments.
    */
   unsigned target0 = nir_intrinsic_base(mrt0_exp);
   unsigned target1 = nir_intrinsic_base(mrt1_exp);
   if (target0 > target1) {
      /* The mrt0 export comes after the mrt1 export. This happens when src0
       * is missing: mrt1 was emitted first, followed by an empty mrt0.
       *
       * Swap the pointers.
       */
      nir_intrinsic_instr *tmp = mrt0_exp;
      mrt0_exp = mrt1_exp;
      mrt1_exp = tmp;

      /* Move mrt1_exp down to after mrt0_exp. */
      nir_instr_move(nir_after_instr(&mrt0_exp->instr), &mrt1_exp->instr);
   } else {
      /* Move mrt0_exp down to before mrt1_exp. */
      nir_instr_move(nir_before_instr(&mrt1_exp->instr), &mrt0_exp->instr);
   }

   uint32_t mrt0_write_mask = nir_intrinsic_write_mask(mrt0_exp);
   uint32_t mrt1_write_mask = nir_intrinsic_write_mask(mrt1_exp);
   uint32_t write_mask = mrt0_write_mask | mrt1_write_mask;

   nir_def *mrt0_arg = mrt0_exp->src[0].ssa;
   nir_def *mrt1_arg = mrt1_exp->src[0].ssa;

   /* The swizzle code goes right before mrt0_exp. */
   b->cursor = nir_before_instr(&mrt0_exp->instr);

   /* ACO needs to emit the swizzle code as a pseudo instruction. */
   if (s->options->use_aco) {
      nir_export_dual_src_blend_amd(b, mrt0_arg, mrt1_arg, .write_mask = write_mask);
      nir_instr_remove(&mrt0_exp->instr);
      nir_instr_remove(&mrt1_exp->instr);
      return;
   }

   nir_def *undef = nir_undef(b, 1, 32);
   nir_def *arg0_vec[4] = {undef, undef, undef, undef};
   nir_def *arg1_vec[4] = {undef, undef, undef, undef};

   /* For illustration, originally:
    *   lane0 exports arg00 and arg01,
    *   lane1 exports arg10 and arg11.
    *
    * After the following operation:
    *   lane0 exports arg00 and arg10,
    *   lane1 exports arg01 and arg11.
    */
   u_foreach_bit (i, write_mask) {
      nir_def *arg0 = nir_channel(b, mrt0_arg, i);
      nir_def *arg1 = nir_channel(b, mrt1_arg, i);

      /* Swap the odd,even lanes of arg0. */
      arg0 = nir_quad_swizzle_amd(b, arg0, .swizzle_mask = 0b10110001, .fetch_inactive = true);

      /* Swap the even lanes between arg0 and arg1. */
      nir_def *tid = nir_load_subgroup_invocation(b);
      nir_def *is_even = nir_ieq_imm(b, nir_iand_imm(b, tid, 1), 0);

      nir_def *tmp = arg0;
      arg0 = nir_bcsel(b, is_even, arg1, arg0);
      arg1 = nir_bcsel(b, is_even, tmp, arg1);

      /* Swap the odd,even lanes of arg0 again. */
      arg0 = nir_quad_swizzle_amd(b, arg0, .swizzle_mask = 0b10110001, .fetch_inactive = true);

      arg0_vec[i] = arg0;
      arg1_vec[i] = arg1;
   }

   nir_src_rewrite(&mrt0_exp->src[0], nir_vec(b, arg0_vec, 4));
   nir_src_rewrite(&mrt1_exp->src[0], nir_vec(b, arg1_vec, 4));

   nir_intrinsic_set_write_mask(mrt0_exp, write_mask);
   nir_intrinsic_set_write_mask(mrt1_exp, write_mask);
}

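/* Emit a null export (or an empty mrt0 export on GFX11) when the shader has
 * no other export, which is still required for discard and POPS.
 */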
static void
emit_ps_null_export(nir_builder *b, lower_ps_state *s)
{
   const bool pops = b->shader->info.fs.sample_interlock_ordered ||
                     b->shader->info.fs.sample_interlock_unordered ||
                     b->shader->info.fs.pixel_interlock_ordered ||
                     b->shader->info.fs.pixel_interlock_unordered;

   /* Gfx10+ doesn't need to export anything if we don't need to export the
    * EXEC mask for discard.
    * In Primitive Ordered Pixel Shading, however, GFX11+ explicitly uses the
    * `done` export to exit the ordered section, and before GFX11, shaders
    * with POPS also need an export.
    */
   if (s->options->gfx_level >= GFX10 && !s->options->uses_discard && !pops)
      return;

   /* The `done` export exits the POPS ordered section on GFX11+. Make sure
    * that UniformMemory and ImageMemory (in SPIR-V terms) accesses from the
    * ordered section may not be reordered below it.
    */
   if (s->options->gfx_level >= GFX11 && pops)
      nir_scoped_memory_barrier(b, SCOPE_QUEUE_FAMILY, NIR_MEMORY_RELEASE,
                                nir_var_image | nir_var_mem_ubo | nir_var_mem_ssbo |
                                nir_var_mem_global);

   /* Gfx11 doesn't support null exports; mrt0 should be exported instead. */
   unsigned target = s->options->gfx_level >= GFX11 ?
      V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;

   nir_intrinsic_instr *intrin =
      nir_export_amd(b, nir_undef(b, 4, 32),
                     .base = target,
                     .flags = AC_EXP_FLAG_VALID_MASK | AC_EXP_FLAG_DONE);
   /* Clear the write mask; the builder would otherwise set it to 0xf. */
   nir_intrinsic_set_write_mask(intrin, 0);
}

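/* Emit the MRTZ and color exports for everything gathered in s->outputs and
 * flag the final export as "done".
 */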
static void
export_ps_outputs(nir_builder *b, lower_ps_state *s)
{
   b->cursor = nir_after_impl(b->impl);

   emit_ps_color_clamp_and_alpha_test(b, s);

   if (!s->options->no_depth_export)
      emit_ps_mrtz_export(b, s);

   /* With non-monolithic shaders, RADV exports mrtz in the main part (except
    * on RDNA3 for alpha to coverage) and exports color in the epilog.
    */
   if (s->options->no_color_export)
      return;

   unsigned first_color_export = s->exp_num;

   /* When dual source blending is enabled, both the src0 and src1 exports
    * must be present: try to export both, and add an empty export for
    * whichever is missing.
    */
   if (s->output_types[DUAL_SRC_BLEND_SLOT] != nir_type_invalid ||
       s->options->dual_src_blend_swizzle) {
      unsigned slot;
      if (s->output_types[FRAG_RESULT_COLOR] != nir_type_invalid) {
         /* With dual source blending, there must be only one color buffer. */
         assert(s->options->broadcast_last_cbuf == 0);
         slot = FRAG_RESULT_COLOR;
      } else {
         slot = FRAG_RESULT_DATA0;
      }

      bool src0_exported = emit_ps_color_export(b, s, slot, 0);
      /* src1 uses cbuf1's format info. When dual source blending is enabled
       * it's the same as cbuf0's; when dual source blending is disabled it's
       * used to disable the src1 export.
       */
      bool src1_exported = emit_ps_color_export(b, s, DUAL_SRC_BLEND_SLOT, 1);

      bool need_empty_export =
         /* src1 is missing: an empty src1 export is only needed in the swizzle case. */
         (src0_exported && !src1_exported && s->options->dual_src_blend_swizzle) ||
         /* src0 is missing: an empty src0 export is always needed. */
         (!src0_exported && src1_exported);

      if (need_empty_export) {
         /* Set compacted_mrt_index to the expected value. */
         s->compacted_mrt_index = src0_exported ? 1 : 0;

         unsigned target = get_ps_color_export_target(s);

         s->exp[s->exp_num++] =
            nir_export_amd(b, nir_undef(b, 4, 32), .base = target);
      }
   } else {
      if (s->output_types[FRAG_RESULT_COLOR] != nir_type_invalid) {
         /* Write to all color buffers. */
         for (int cbuf = 0; cbuf <= s->options->broadcast_last_cbuf; cbuf++)
            emit_ps_color_export(b, s, FRAG_RESULT_COLOR, cbuf);
      } else {
         for (int cbuf = 0; cbuf < MAX_DRAW_BUFFERS; cbuf++) {
            unsigned slot = FRAG_RESULT_DATA0 + cbuf;
            emit_ps_color_export(b, s, slot, cbuf);
         }
      }
   }

   if (s->exp_num) {
      if (s->options->dual_src_blend_swizzle) {
         emit_ps_dual_src_blend_swizzle(b, s, first_color_export);
         /* Skip setting the last-export flags below because the exports have
          * been replaced by a pseudo instruction.
          */
         if (s->options->use_aco)
            return;
      }

      /* Specify that this is the last export. */
      nir_intrinsic_instr *final_exp = s->exp[s->exp_num - 1];
      unsigned final_exp_flags = nir_intrinsic_flags(final_exp);
      final_exp_flags |= AC_EXP_FLAG_DONE | AC_EXP_FLAG_VALID_MASK;
      nir_intrinsic_set_flags(final_exp, final_exp_flags);

      /* The `done` export exits the POPS ordered section on GFX11+. Make sure
       * that UniformMemory and ImageMemory (in SPIR-V terms) accesses from the
       * ordered section may not be reordered below it.
       */
      if (s->options->gfx_level >= GFX11 &&
          (b->shader->info.fs.sample_interlock_ordered ||
           b->shader->info.fs.sample_interlock_unordered ||
           b->shader->info.fs.pixel_interlock_ordered ||
           b->shader->info.fs.pixel_interlock_unordered)) {
         b->cursor = nir_before_instr(&final_exp->instr);
         nir_scoped_memory_barrier(b, SCOPE_QUEUE_FAMILY, NIR_MEMORY_RELEASE,
                                   nir_var_image | nir_var_mem_ubo | nir_var_mem_ssbo |
                                   nir_var_mem_global);
      }
   } else {
      emit_ps_null_export(b, s);
   }
}

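/* Entry point: gather the PS outputs, lower barycentric loads and
 * gl_SampleMaskIn, then emit the AMD export intrinsics.
 */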
void
ac_nir_lower_ps(nir_shader *nir, const ac_nir_lower_ps_options *options)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_builder builder = nir_builder_create(impl);
   nir_builder *b = &builder;

   lower_ps_state state = {
      .options = options,
   };

   create_interp_param(b, &state);

   nir_shader_instructions_pass(nir, lower_ps_intrinsic,
                                nir_metadata_block_index | nir_metadata_dominance,
                                &state);

   /* This must be done after lower_ps_intrinsic() so that the pass doesn't
    * lower the load_barycentric intrinsics added here.
    */
   init_interp_param(b, &state);

   export_ps_outputs(b, &state);

   /* Clean up the local nir variables, as RADV won't do this. */
   if (state.lower_load_barycentric)
      nir_lower_vars_to_ssa(nir);
}