1 /*
2 * Copyright 2020 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "si_pipe.h"
8 #include "si_shader_internal.h"
9 #include "si_shader_llvm.h"
10 #include "sid.h"
11
/**
 * Emit the fetch of one channel of a fragment shader input attribute.
 *
 * If at least one of the weights (i, j) is given, the value is interpolated
 * with ac_build_fs_interp(). If both are NULL, the attribute is read without
 * interpolation through ac_build_fs_interp_mov() using parameter 0 (P0).
 *
 * @param ctx         context
 * @param attr_index  attribute index passed to the interpolation intrinsic
 * @param chan        channel of the attribute (passed as an i32 constant)
 * @param prim_mask   primitive mask value forwarded to the intrinsic
 * @param i           first interpolation weight, or NULL
 * @param j           second interpolation weight, or NULL
 * @return the value produced by the interpolation helper
 */
static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx, unsigned attr_index,
                                       unsigned chan, LLVMValueRef prim_mask, LLVMValueRef i,
                                       LLVMValueRef j)
{
   LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, 0);
   LLVMValueRef attr_number = LLVMConstInt(ctx->ac.i32, attr_index, 0);

   /* No weights at all: plain move of the attribute. */
   if (!i && !j) {
      return ac_build_fs_interp_mov(&ctx->ac, 0, /* P0 */
                                    llvm_chan, attr_number, prim_mask);
   }

   return ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number, prim_mask, i, j);
}
24
25 /**
26 * Interpolate a fragment shader input.
27 *
28 * @param ctx context
29 * @param input_index index of the input in hardware
30 * @param semantic_index semantic index
31 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
32 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
33 * @param interp_param interpolation weights (i,j)
34 * @param prim_mask SI_PARAM_PRIM_MASK
35 * @param face SI_PARAM_FRONT_FACE
36 * @param result the return value (4 components)
37 */
interp_fs_color(struct si_shader_context * ctx,unsigned input_index,unsigned semantic_index,unsigned num_interp_inputs,unsigned colors_read_mask,LLVMValueRef interp_param,LLVMValueRef prim_mask,LLVMValueRef face,LLVMValueRef result[4])38 static void interp_fs_color(struct si_shader_context *ctx, unsigned input_index,
39 unsigned semantic_index, unsigned num_interp_inputs,
40 unsigned colors_read_mask, LLVMValueRef interp_param,
41 LLVMValueRef prim_mask, LLVMValueRef face, LLVMValueRef result[4])
42 {
43 LLVMValueRef i = NULL, j = NULL;
44 unsigned chan;
45
46 /* fs.constant returns the param from the middle vertex, so it's not
47 * really useful for flat shading. It's meant to be used for custom
48 * interpolation (but the intrinsic can't fetch from the other two
49 * vertices).
50 *
51 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
52 * to do the right thing. The only reason we use fs.constant is that
53 * fs.interp cannot be used on integers, because they can be equal
54 * to NaN.
55 *
56 * When interp is false we will use fs.constant or for newer llvm,
57 * amdgcn.interp.mov.
58 */
59 bool interp = interp_param != NULL;
60
61 if (interp) {
62 i = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_0, "");
63 j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_1, "");
64 }
65
66 if (ctx->shader->key.ps.part.prolog.color_two_side) {
67 LLVMValueRef is_face_positive;
68
69 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
70 * otherwise it's at offset "num_inputs".
71 */
72 unsigned back_attr_offset = num_interp_inputs;
73 if (semantic_index == 1 && colors_read_mask & 0xf)
74 back_attr_offset += 1;
75
76 is_face_positive = LLVMBuildFCmp(ctx->ac.builder, LLVMRealOLT, ctx->ac.f32_0, face, "");
77
78 for (chan = 0; chan < 4; chan++) {
79 LLVMValueRef front, back;
80
81 front = si_build_fs_interp(ctx, input_index, chan, prim_mask, i, j);
82 back = si_build_fs_interp(ctx, back_attr_offset, chan, prim_mask, i, j);
83
84 result[chan] = LLVMBuildSelect(ctx->ac.builder, is_face_positive, front, back, "");
85 }
86 } else {
87 for (chan = 0; chan < 4; chan++) {
88 result[chan] = si_build_fs_interp(ctx, input_index, chan, prim_mask, i, j);
89 }
90 }
91 }
92
/**
 * Emit the alpha test for the given alpha value.
 *
 * For PIPE_FUNC_NEVER a kill with a constant-false condition is emitted.
 * For every other function handled by the table below, "alpha <func> ref" is
 * evaluated with an ordered float comparison against the alpha_reference
 * argument and the result is handed to ac_build_kill_if_false().
 *
 * @param ctx    context
 * @param alpha  alpha value to test (f16 or f32)
 */
static void si_alpha_test(struct si_shader_context *ctx, LLVMValueRef alpha)
{
   const unsigned func = ctx->shader->key.ps.part.epilog.alpha_func;

   if (func == PIPE_FUNC_NEVER) {
      ac_build_kill_if_false(&ctx->ac, ctx->ac.i1false);
      return;
   }

   /* Functions without an entry stay zero and are rejected by the assert. */
   static LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = {
      [PIPE_FUNC_LESS] = LLVMRealOLT,     [PIPE_FUNC_EQUAL] = LLVMRealOEQ,
      [PIPE_FUNC_LEQUAL] = LLVMRealOLE,   [PIPE_FUNC_GREATER] = LLVMRealOGT,
      [PIPE_FUNC_NOTEQUAL] = LLVMRealONE, [PIPE_FUNC_GEQUAL] = LLVMRealOGE,
   };
   LLVMRealPredicate predicate = cond_map[func];
   assert(predicate);

   LLVMValueRef reference = ac_get_arg(&ctx->ac, ctx->args->alpha_reference);

   /* Compare in the type of the incoming alpha. */
   if (LLVMTypeOf(alpha) == ctx->ac.f16)
      reference = LLVMBuildFPTrunc(ctx->ac.builder, reference, ctx->ac.f16, "");

   LLVMValueRef passes = LLVMBuildFCmp(ctx->ac.builder, predicate, alpha, reference, "");
   ac_build_kill_if_false(&ctx->ac, passes);
}
114
/* List of export arguments collected by the PS epilog before they are emitted. */
struct si_ps_exports {
   unsigned num;                   /* number of args[] entries filled in so far */
   struct ac_export_args args[10]; /* export arguments, emitted in array order */
};
119
/**
 * Gather two values into a vector and bitcast that vector to v2f16.
 *
 * @param ctx   LLVM context
 * @param args  the two values to combine
 * @return the v2f16 value
 */
static LLVMValueRef pack_two_16bit(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef pair = ac_build_gather_values(ctx, args, 2);

   return LLVMBuildBitCast(ctx->builder, pair, ctx->v2f16, "");
}
125
/**
 * Widen a color channel to 32 bits according to its declared type.
 *
 * - SI_TYPE_ANY32:   returned unchanged
 * - SI_TYPE_FLOAT16: extended to f32
 * - SI_TYPE_INT16:   sign-extended to i32, returned as float
 * - SI_TYPE_UINT16:  zero-extended to i32, returned as float
 *
 * @param ctx         context
 * @param color_type  one of the SI_TYPE_* values above
 * @param value       the channel value
 * @return the 32-bit value, or NULL for any other color_type
 */
static LLVMValueRef get_color_32bit(struct si_shader_context *ctx, unsigned color_type,
                                    LLVMValueRef value)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   if (color_type == SI_TYPE_ANY32)
      return value;

   if (color_type == SI_TYPE_FLOAT16)
      return LLVMBuildFPExt(builder, value, ctx->ac.f32, "");

   if (color_type == SI_TYPE_INT16 || color_type == SI_TYPE_UINT16) {
      LLVMValueRef narrow = ac_to_integer(&ctx->ac, value);
      LLVMValueRef wide = color_type == SI_TYPE_INT16
                             ? LLVMBuildSExt(builder, narrow, ctx->ac.i32, "")
                             : LLVMBuildZExt(builder, narrow, ctx->ac.i32, "");

      return ac_to_float(&ctx->ac, wide);
   }

   return NULL;
}
145
146 /* Initialize arguments for the shader export intrinsic */
si_llvm_init_ps_export_args(struct si_shader_context * ctx,LLVMValueRef * values,unsigned cbuf,unsigned compacted_mrt_index,unsigned color_type,struct ac_export_args * args)147 static bool si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValueRef *values,
148 unsigned cbuf, unsigned compacted_mrt_index,
149 unsigned color_type, struct ac_export_args *args)
150 {
151 const union si_shader_key *key = &ctx->shader->key;
152 unsigned col_formats = key->ps.part.epilog.spi_shader_col_format;
153 LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32);
154 unsigned spi_shader_col_format;
155 unsigned chan;
156 bool is_int8, is_int10;
157
158 assert(cbuf < 8);
159
160 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
161 if (spi_shader_col_format == V_028714_SPI_SHADER_ZERO)
162 return false;
163
164 is_int8 = (key->ps.part.epilog.color_is_int8 >> cbuf) & 0x1;
165 is_int10 = (key->ps.part.epilog.color_is_int10 >> cbuf) & 0x1;
166
167 /* Default is 0xf. Adjusted below depending on the format. */
168 args->enabled_channels = 0xf; /* writemask */
169
170 /* Specify whether the EXEC mask represents the valid mask */
171 args->valid_mask = 0;
172
173 /* Specify whether this is the last export */
174 args->done = 0;
175
176 /* Specify the target we are exporting */
177 args->target = V_008DFC_SQ_EXP_MRT + compacted_mrt_index;
178
179 if (key->ps.part.epilog.dual_src_blend_swizzle &&
180 (compacted_mrt_index == 0 || compacted_mrt_index == 1)) {
181 assert(ctx->ac.gfx_level >= GFX11);
182 args->target += 21;
183 }
184
185 args->compr = false;
186 args->out[0] = f32undef;
187 args->out[1] = f32undef;
188 args->out[2] = f32undef;
189 args->out[3] = f32undef;
190
191 LLVMValueRef (*packf)(struct ac_llvm_context * ctx, LLVMValueRef args[2]) = NULL;
192 LLVMValueRef (*packi)(struct ac_llvm_context * ctx, LLVMValueRef args[2], unsigned bits,
193 bool hi) = NULL;
194
195 switch (spi_shader_col_format) {
196 case V_028714_SPI_SHADER_32_R:
197 args->enabled_channels = 1; /* writemask */
198 args->out[0] = get_color_32bit(ctx, color_type, values[0]);
199 break;
200
201 case V_028714_SPI_SHADER_32_GR:
202 args->enabled_channels = 0x3; /* writemask */
203 args->out[0] = get_color_32bit(ctx, color_type, values[0]);
204 args->out[1] = get_color_32bit(ctx, color_type, values[1]);
205 break;
206
207 case V_028714_SPI_SHADER_32_AR:
208 if (ctx->screen->info.gfx_level >= GFX10) {
209 args->enabled_channels = 0x3; /* writemask */
210 args->out[0] = get_color_32bit(ctx, color_type, values[0]);
211 args->out[1] = get_color_32bit(ctx, color_type, values[3]);
212 } else {
213 args->enabled_channels = 0x9; /* writemask */
214 args->out[0] = get_color_32bit(ctx, color_type, values[0]);
215 args->out[3] = get_color_32bit(ctx, color_type, values[3]);
216 }
217 break;
218
219 case V_028714_SPI_SHADER_FP16_ABGR:
220 if (color_type != SI_TYPE_ANY32)
221 packf = pack_two_16bit;
222 else
223 packf = ac_build_cvt_pkrtz_f16;
224 break;
225
226 case V_028714_SPI_SHADER_UNORM16_ABGR:
227 if (color_type != SI_TYPE_ANY32)
228 packf = ac_build_cvt_pknorm_u16_f16;
229 else
230 packf = ac_build_cvt_pknorm_u16;
231 break;
232
233 case V_028714_SPI_SHADER_SNORM16_ABGR:
234 if (color_type != SI_TYPE_ANY32)
235 packf = ac_build_cvt_pknorm_i16_f16;
236 else
237 packf = ac_build_cvt_pknorm_i16;
238 break;
239
240 case V_028714_SPI_SHADER_UINT16_ABGR:
241 if (color_type != SI_TYPE_ANY32)
242 packf = pack_two_16bit;
243 else
244 packi = ac_build_cvt_pk_u16;
245 break;
246
247 case V_028714_SPI_SHADER_SINT16_ABGR:
248 if (color_type != SI_TYPE_ANY32)
249 packf = pack_two_16bit;
250 else
251 packi = ac_build_cvt_pk_i16;
252 break;
253
254 case V_028714_SPI_SHADER_32_ABGR:
255 for (unsigned i = 0; i < 4; i++)
256 args->out[i] = get_color_32bit(ctx, color_type, values[i]);
257 break;
258 }
259
260 /* Pack f16 or norm_i16/u16. */
261 if (packf) {
262 for (chan = 0; chan < 2; chan++) {
263 LLVMValueRef pack_args[2] = {values[2 * chan], values[2 * chan + 1]};
264 LLVMValueRef packed;
265
266 packed = packf(&ctx->ac, pack_args);
267 args->out[chan] = ac_to_float(&ctx->ac, packed);
268 }
269 }
270 /* Pack i16/u16. */
271 if (packi) {
272 for (chan = 0; chan < 2; chan++) {
273 LLVMValueRef pack_args[2] = {ac_to_integer(&ctx->ac, values[2 * chan]),
274 ac_to_integer(&ctx->ac, values[2 * chan + 1])};
275 LLVMValueRef packed;
276
277 packed = packi(&ctx->ac, pack_args, is_int8 ? 8 : is_int10 ? 10 : 16, chan == 1);
278 args->out[chan] = ac_to_float(&ctx->ac, packed);
279 }
280 }
281 if (packf || packi) {
282 if (ctx->screen->info.gfx_level >= GFX11)
283 args->enabled_channels = 0x3;
284 else
285 args->compr = 1; /* COMPR flag */
286 }
287
288 return true;
289 }
290
/**
 * Apply the per-color epilog states to one color output, in this order:
 * color clamping, alpha-to-one, and (for color 0 only) the alpha test.
 *
 * @param ctx    context
 * @param color  the 4 channels of the color; modified in place
 * @param index  color output index; the alpha test only runs for index 0
 */
static void si_llvm_build_clamp_alpha_test(struct si_shader_context *ctx,
                                           LLVMValueRef *color, unsigned index)
{
   const union si_shader_key *key = &ctx->shader->key;

   if (key->ps.part.epilog.clamp_color) {
      for (int c = 0; c < 4; c++)
         color[c] = ac_build_clamp(&ctx->ac, color[c]);
   }

   /* Alpha to one: replace alpha by 1.0 in the type of the color. */
   if (key->ps.part.epilog.alpha_to_one)
      color[3] = LLVMConstReal(LLVMTypeOf(color[0]), 1);

   /* PIPE_FUNC_ALWAYS needs no test at all. */
   if (index != 0 || key->ps.part.epilog.alpha_func == PIPE_FUNC_ALWAYS)
      return;

   si_alpha_test(ctx, color[3]);
}
309
/**
 * Append the color export(s) for one shader color output to "exp".
 *
 * With writes_all_cbufs the same color is set up for all 8 color buffers;
 * otherwise only for color buffer "index". Buffers for which
 * si_llvm_init_ps_export_args() returns false add nothing.
 *
 * @param ctx                 context
 * @param color               the 4 channels of the color
 * @param index               color buffer index (ignored with writes_all_cbufs)
 * @param first_color_export  value of exp->num before any color export was added
 * @param color_type          SI_TYPE_* of the channels
 * @param writes_all_cbufs    broadcast the color to every color buffer
 * @param exp                 export list to append to
 */
static void si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *color, unsigned index,
                                unsigned first_color_export, unsigned color_type,
                                bool writes_all_cbufs, struct si_ps_exports *exp)
{
   unsigned cbuf_begin = index;
   unsigned cbuf_end = index + 1;

   if (writes_all_cbufs) {
      assert(exp->num == first_color_export);

      /* Color buffers with SPI_SHADER_COL_FORMAT=ZERO produce nothing, so
       * always walk all 8.
       */
      cbuf_begin = 0;
      cbuf_end = 8;
   }

   for (unsigned cbuf = cbuf_begin; cbuf < cbuf_end; cbuf++) {
      struct ac_export_args *slot = &exp->args[exp->num];

      if (!si_llvm_init_ps_export_args(ctx, color, cbuf, exp->num - first_color_export,
                                       color_type, slot))
         continue;

      assert(slot->enabled_channels);
      exp->num++;
   }
}
336
337 /**
338 * Return PS outputs in this order:
339 *
340 * v[0:3] = color0.xyzw
341 * v[4:7] = color1.xyzw
342 * ...
343 * vN+0 = Depth
344 * vN+1 = Stencil
345 * vN+2 = SampleMask
346 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
347 *
348 * The alpha-ref SGPR is returned via its original location.
349 */
si_llvm_ps_build_end(struct si_shader_context * ctx)350 void si_llvm_ps_build_end(struct si_shader_context *ctx)
351 {
352 struct si_shader *shader = ctx->shader;
353 struct si_shader_info *info = &shader->selector->info;
354 LLVMBuilderRef builder = ctx->ac.builder;
355 unsigned i, j, vgpr;
356 LLVMValueRef *addrs = ctx->abi.outputs;
357
358 LLVMValueRef color[8][4] = {};
359 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
360 LLVMValueRef ret;
361
362 /* Read the output values. */
363 for (i = 0; i < info->num_outputs; i++) {
364 unsigned semantic = info->output_semantic[i];
365 LLVMTypeRef type = ctx->abi.is_16bit[4 * i] ? ctx->ac.f16 : ctx->ac.f32;
366
367 switch (semantic) {
368 case FRAG_RESULT_DEPTH:
369 depth = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
370 break;
371 case FRAG_RESULT_STENCIL:
372 stencil = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
373 break;
374 case FRAG_RESULT_SAMPLE_MASK:
375 samplemask = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
376 break;
377 default:
378 if (semantic >= FRAG_RESULT_DATA0 && semantic <= FRAG_RESULT_DATA7) {
379 unsigned index = semantic - FRAG_RESULT_DATA0;
380
381 for (j = 0; j < 4; j++) {
382 LLVMValueRef ptr = addrs[4 * i + j];
383 type = ctx->abi.is_16bit[4 * i + j] ? ctx->ac.f16 : ctx->ac.f32;
384 LLVMValueRef result = LLVMBuildLoad2(builder, type, ptr, "");
385 color[index][j] = result;
386 }
387 } else {
388 fprintf(stderr, "Warning: Unhandled fs output type:%d\n", semantic);
389 }
390 break;
391 }
392 }
393
394 /* Fill the return structure. */
395 ret = ctx->return_value;
396
397 /* Set SGPRs. */
398 ret = LLVMBuildInsertValue(
399 builder, ret, ac_to_integer(&ctx->ac, LLVMGetParam(ctx->main_fn.value, SI_PARAM_ALPHA_REF)),
400 SI_SGPR_ALPHA_REF, "");
401
402 /* Set VGPRs */
403 vgpr = SI_SGPR_ALPHA_REF + 1;
404 for (i = 0; i < ARRAY_SIZE(color); i++) {
405 if (!color[i][0])
406 continue;
407
408 if (LLVMTypeOf(color[i][0]) == ctx->ac.f16) {
409 for (j = 0; j < 2; j++) {
410 LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, &color[i][j * 2], 2);
411 tmp = LLVMBuildBitCast(builder, tmp, ctx->ac.f32, "");
412 ret = LLVMBuildInsertValue(builder, ret, tmp, vgpr++, "");
413 }
414 vgpr += 2;
415 } else {
416 for (j = 0; j < 4; j++)
417 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
418 }
419 }
420 if (depth)
421 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
422 if (stencil)
423 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
424 if (samplemask)
425 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
426
427 ctx->return_value = ret;
428 }
429
si_llvm_emit_polygon_stipple(struct si_shader_context * ctx)430 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx)
431 {
432 LLVMBuilderRef builder = ctx->ac.builder;
433 LLVMValueRef desc, offset, row, bit, address[2];
434
435 /* Use the fixed-point gl_FragCoord input.
436 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
437 * per coordinate to get the repeating effect.
438 */
439 address[0] = si_unpack_param(ctx, ctx->args->ac.pos_fixed_pt, 0, 5);
440 address[1] = si_unpack_param(ctx, ctx->args->ac.pos_fixed_pt, 16, 5);
441
442 /* Load the buffer descriptor. */
443 desc = si_prolog_get_internal_binding_slot(ctx, SI_PS_CONST_POLY_STIPPLE);
444
445 /* The stipple pattern is 32x32, each row has 32 bits. */
446 offset = LLVMBuildMul(builder, address[1], LLVMConstInt(ctx->ac.i32, 4, 0), "");
447 row = si_buffer_load_const(ctx, desc, offset);
448 row = ac_to_integer(&ctx->ac, row);
449 bit = LLVMBuildLShr(builder, row, address[0], "");
450 bit = LLVMBuildTrunc(builder, bit, ctx->ac.i1, "");
451 ac_build_kill_if_false(&ctx->ac, bit);
452 }
453
/**
 * Store "data" into the return-value slot(s) that correspond to a shader
 * argument.
 *
 * The slot is the argument's register offset; VGPR arguments are placed
 * after all used SGPRs. An argument of size 1 is inserted directly, an
 * argument of size 2 is split into its two vector elements.
 *
 * @param ctx        context
 * @param ret        return value to update
 * @param data       value to store (a 2-element vector when the argument size is 2)
 * @param arg_index  index of the argument in ctx->args->ac.args
 * @return the updated return value
 */
static LLVMValueRef insert_ret_of_arg(struct si_shader_context *ctx, LLVMValueRef ret,
                                      LLVMValueRef data, unsigned arg_index)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   unsigned slot = ctx->args->ac.args[arg_index].offset;

   if (ctx->args->ac.args[arg_index].file == AC_ARG_VGPR)
      slot += ctx->args->ac.num_sgprs_used;

   if (ctx->args->ac.args[arg_index].size == 1)
      return LLVMBuildInsertValue(builder, ret, data, slot, "");

   assert(ctx->args->ac.args[arg_index].size == 2);

   LLVMValueRef elem0 = LLVMBuildExtractElement(builder, data, ctx->ac.i32_0, "");
   ret = LLVMBuildInsertValue(builder, ret, elem0, slot, "");

   LLVMValueRef elem1 = LLVMBuildExtractElement(builder, data, ctx->ac.i32_1, "");
   return LLVMBuildInsertValue(builder, ret, elem1, slot + 1, "");
}
472
/**
 * Build the pixel shader prolog function. This handles:
 * - two-side color selection and interpolation
 * - overriding interpolation parameters for the API PS
 * - polygon stippling
 * - adjusting the input sample mask
 * - deriving the fragment position from the fixed-point pixel coordinate
 *
 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
 * overridden by other states. (e.g. per-sample interpolation)
 * Interpolated colors are stored after the preloaded VGPRs.
 *
 * The steps below overwrite slots of the same return value one after another,
 * so their order matters: a later step wins over an earlier one.
 *
 * @param ctx  context; ctx->args is handed to si_get_ps_prolog_args() with key
 * @param key  shader part key; only its ps_prolog member is read here
 */
void si_llvm_build_ps_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
{
   struct si_shader_args *args = ctx->args;
   si_get_ps_prolog_args(args, key);

   /* Declare outputs (same as inputs + add colors if needed) */
   LLVMTypeRef return_types[AC_MAX_ARGS];
   int num_returns = 0;

   /* One i32 per used SGPR ... */
   for (int i = 0; i < args->ac.num_sgprs_used; i++)
      return_types[num_returns++] = ctx->ac.i32;

   /* ... followed by one f32 per used VGPR and per color channel read. */
   unsigned num_color_channels = util_bitcount(key->ps_prolog.colors_read);
   unsigned num_output_vgprs = args->ac.num_vgprs_used + num_color_channels;
   for (int i = 0; i < num_output_vgprs; i++)
      return_types[num_returns++] = ctx->ac.f32;

   /* Create the function. */
   si_llvm_create_func(ctx, "ps_prolog", return_types, num_returns, 0);
   LLVMValueRef func = ctx->main_fn.value;

   /* Copy inputs to outputs. This should be no-op, as the registers match,
    * but it will prevent the compiler from overwriting them unintentionally.
    */
   LLVMValueRef ret = ctx->return_value;
   for (int i = 0; i < args->ac.arg_count; i++) {
      LLVMValueRef p = LLVMGetParam(func, i);
      ret = insert_ret_of_arg(ctx, ret, p, i);
   }

   /* Polygon stippling. */
   if (key->ps_prolog.states.poly_stipple)
      si_llvm_emit_polygon_stipple(ctx);

   if (key->ps_prolog.states.bc_optimize_for_persp ||
       key->ps_prolog.states.bc_optimize_for_linear) {
      LLVMValueRef center, centroid, tmp;

      /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
       * The hw doesn't compute CENTROID if the whole wave only
       * contains fully-covered quads.
       */
      LLVMValueRef bc_optimize = ac_get_arg(&ctx->ac, args->ac.prim_mask);
      bc_optimize =
         LLVMBuildLShr(ctx->ac.builder, bc_optimize, LLVMConstInt(ctx->ac.i32, 31, 0), "");
      bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize, ctx->ac.i1, "");

      if (key->ps_prolog.states.bc_optimize_for_persp) {
         center = ac_get_arg(&ctx->ac, args->ac.persp_center);
         centroid = ac_get_arg(&ctx->ac, args->ac.persp_centroid);
         /* Select PERSP_CENTROID. */
         tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center, centroid, "");
         ret = insert_ret_of_arg(ctx, ret, tmp, args->ac.persp_centroid.arg_index);
      }
      if (key->ps_prolog.states.bc_optimize_for_linear) {
         center = ac_get_arg(&ctx->ac, args->ac.linear_center);
         centroid = ac_get_arg(&ctx->ac, args->ac.linear_centroid);
         /* Select LINEAR_CENTROID. */
         tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center, centroid, "");
         ret = insert_ret_of_arg(ctx, ret, tmp, args->ac.linear_centroid.arg_index);
      }
   }

   /* Force per-sample interpolation. */
   if (key->ps_prolog.states.force_persp_sample_interp) {
      LLVMValueRef persp_sample = ac_get_arg(&ctx->ac, args->ac.persp_sample);
      /* Overwrite PERSP_CENTER. */
      ret = insert_ret_of_arg(ctx, ret, persp_sample, args->ac.persp_center.arg_index);
      /* Overwrite PERSP_CENTROID. */
      ret = insert_ret_of_arg(ctx, ret, persp_sample, args->ac.persp_centroid.arg_index);
   }
   if (key->ps_prolog.states.force_linear_sample_interp) {
      LLVMValueRef linear_sample = ac_get_arg(&ctx->ac, args->ac.linear_sample);
      /* Overwrite LINEAR_CENTER. */
      ret = insert_ret_of_arg(ctx, ret, linear_sample, args->ac.linear_center.arg_index);
      /* Overwrite LINEAR_CENTROID. */
      ret = insert_ret_of_arg(ctx, ret, linear_sample, args->ac.linear_centroid.arg_index);
   }

   /* Force center interpolation. */
   if (key->ps_prolog.states.force_persp_center_interp) {
      LLVMValueRef persp_center = ac_get_arg(&ctx->ac, args->ac.persp_center);
      /* Overwrite PERSP_SAMPLE. */
      ret = insert_ret_of_arg(ctx, ret, persp_center, args->ac.persp_sample.arg_index);
      /* Overwrite PERSP_CENTROID. */
      ret = insert_ret_of_arg(ctx, ret, persp_center, args->ac.persp_centroid.arg_index);
   }
   if (key->ps_prolog.states.force_linear_center_interp) {
      LLVMValueRef linear_center = ac_get_arg(&ctx->ac, args->ac.linear_center);
      /* Overwrite LINEAR_SAMPLE. */
      ret = insert_ret_of_arg(ctx, ret, linear_center, args->ac.linear_sample.arg_index);
      /* Overwrite LINEAR_CENTROID. */
      ret = insert_ret_of_arg(ctx, ret, linear_center, args->ac.linear_centroid.arg_index);
   }

   /* Interpolate colors. colors_read holds 4 bits per color; the channels
    * that are read are appended densely after all input registers.
    */
   unsigned color_out_idx = 0;
   unsigned num_input_gprs = args->ac.num_sgprs_used + args->ac.num_vgprs_used;
   for (int i = 0; i < 2; i++) {
      unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;

      if (!writemask)
         continue;

      /* If the interpolation qualifier is not CONSTANT (-1). */
      LLVMValueRef interp_ij = NULL;
      if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
         unsigned index =
            args->ac.num_sgprs_used + key->ps_prolog.color_interp_vgpr_index[i];

         /* Get the (i,j) updated by bc_optimize handling. They are read back
          * from the return value, not from the function arguments, so the
          * overrides applied above are taken into account.
          */
         LLVMValueRef interp[2] = {
            LLVMBuildExtractValue(ctx->ac.builder, ret, index, ""),
            LLVMBuildExtractValue(ctx->ac.builder, ret, index + 1, ""),
         };
         interp_ij = ac_build_gather_values(&ctx->ac, interp, 2);
      }

      LLVMValueRef prim_mask = ac_get_arg(&ctx->ac, args->ac.prim_mask);

      /* The face is only needed for two-sided color selection. */
      LLVMValueRef face = NULL;
      if (key->ps_prolog.states.color_two_side)
         face = ac_get_arg(&ctx->ac, args->ac.front_face);

      LLVMValueRef color[4];
      interp_fs_color(ctx, key->ps_prolog.color_attr_index[i], i, key->ps_prolog.num_interp_inputs,
                      key->ps_prolog.colors_read, interp_ij, prim_mask, face, color);

      /* Return only the channels that are read. */
      while (writemask) {
         unsigned chan = u_bit_scan(&writemask);
         ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan],
                                    num_input_gprs + color_out_idx++, "");
      }
   }

   /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
    * says:
    *
    *    "When per-sample shading is active due to the use of a fragment
    *     input qualified by sample or due to the use of the gl_SampleID
    *     or gl_SamplePosition variables, only the bit for the current
    *     sample is set in gl_SampleMaskIn. When state specifies multiple
    *     fragment shader invocations for a given fragment, the sample
    *     mask for any single fragment shader invocation may specify a
    *     subset of the covered samples for the fragment. In this case,
    *     the bit corresponding to each covered sample will be set in
    *     exactly one fragment shader invocation."
    *
    * The samplemask loaded by hardware is always the coverage of the
    * entire pixel/fragment, so mask bits out based on the sample ID.
    */
   if (key->ps_prolog.states.samplemask_log_ps_iter) {
      /* The sample ID is unpacked from 4 bits of "ancillary" starting at bit 8. */
      LLVMValueRef sample_id = si_unpack_param(ctx, args->ac.ancillary, 8, 4);
      LLVMValueRef sample_mask_in;

      /* Set samplemask_log_ps_iter=3 if full sample shading is enabled even for 2x and 4x MSAA
       * to get this fast path that fully replaces sample_mask_in with sample_id.
       */
      if (key->ps_prolog.states.samplemask_log_ps_iter == 3) {
         /* 0 for helper invocations, otherwise 1 << sample_id. */
         sample_mask_in =
            LLVMBuildSelect(ctx->ac.builder, ac_build_load_helper_invocation(&ctx->ac),
                            ctx->ac.i32_0,
                            LLVMBuildShl(ctx->ac.builder, ctx->ac.i32_1, sample_id, ""), "");
      } else {
         /* coverage & (ps_iter_mask << sample_id) */
         uint32_t ps_iter_mask =
            ac_get_ps_iter_mask(1 << key->ps_prolog.states.samplemask_log_ps_iter);
         sample_mask_in =
            LLVMBuildAnd(ctx->ac.builder,
                         ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, args->ac.sample_coverage)),
                         LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false),
                                      sample_id, ""), "");
      }

      sample_mask_in = ac_to_float(&ctx->ac, sample_mask_in);
      ret = insert_ret_of_arg(ctx, ret, sample_mask_in, args->ac.sample_coverage.arg_index);
   } else if (key->ps_prolog.states.force_samplemask_to_helper_invocation) {
      /* Replace the sample mask by !helper_invocation, zero-extended to i32. */
      LLVMValueRef sample_mask_in =
         LLVMBuildZExt(ctx->ac.builder,
                       LLVMBuildNot(ctx->ac.builder, ac_build_load_helper_invocation(&ctx->ac), ""),
                       ctx->ac.i32, "");
      ret = insert_ret_of_arg(ctx, ret, ac_to_float(&ctx->ac, sample_mask_in),
                              args->ac.sample_coverage.arg_index);
   }

   if (key->ps_prolog.states.get_frag_coord_from_pixel_coord) {
      /* Reinterpret pos_fixed_pt as two 16-bit unsigned integers and convert
       * them to floats.
       */
      LLVMValueRef pixel_coord = ac_get_arg(&ctx->ac, args->ac.pos_fixed_pt);
      pixel_coord = LLVMBuildBitCast(ctx->ac.builder, pixel_coord, ctx->ac.v2i16, "");
      pixel_coord = LLVMBuildUIToFP(ctx->ac.builder, pixel_coord, ctx->ac.v2f32, "");

      /* Add 0.5 to both coordinates unless pixel_center_integer is set. */
      if (!key->ps_prolog.pixel_center_integer) {
         LLVMValueRef vec2_half = LLVMConstVector((LLVMValueRef[]){LLVMConstReal(ctx->ac.f32, 0.5),
                                                                   LLVMConstReal(ctx->ac.f32, 0.5)}, 2);
         pixel_coord = LLVMBuildFAdd(ctx->ac.builder, pixel_coord, vec2_half, "");
      }

      /* Only the first two frag_pos components, and only those that are used. */
      for (unsigned i = 0; i < 2; i++) {
         if (!args->ac.frag_pos[i].used)
            continue;

         ret = insert_ret_of_arg(ctx, ret,
                                 LLVMBuildExtractElement(ctx->ac.builder, pixel_coord,
                                                         LLVMConstInt(ctx->ac.i32, i, 0), ""),
                                 args->ac.frag_pos[i].arg_index);
      }
   }

   /* Tell LLVM to insert WQM instruction sequence when needed. */
   if (key->ps_prolog.wqm) {
      LLVMAddTargetDependentFunctionAttr(func, "amdgpu-ps-wqm-outputs", "");
   }

   si_llvm_build_ret(ctx, ret);
}
696
697 /**
698 * Build the pixel shader epilog function. This handles everything that must be
699 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
700 */
si_llvm_build_ps_epilog(struct si_shader_context * ctx,union si_shader_part_key * key)701 void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part_key *key)
702 {
703 int i;
704 struct si_ps_exports exp = {};
705 LLVMValueRef color[8][4] = {};
706
707 struct si_shader_args *args = ctx->args;
708 struct ac_arg color_args[MAX_DRAW_BUFFERS];
709 struct ac_arg depth_arg, stencil_arg, samplemask_arg;
710 si_get_ps_epilog_args(args, key, color_args, &depth_arg, &stencil_arg, &samplemask_arg);
711
712 /* Create the function. */
713 si_llvm_create_func(ctx, "ps_epilog", NULL, 0, 0);
714 /* Disable elimination of unused inputs. */
715 ac_llvm_add_target_dep_function_attr(ctx->main_fn.value, "InitialPSInputAddr", 0xffffff);
716
717 /* Prepare color. */
718 unsigned colors_written = key->ps_epilog.colors_written;
719 LLVMValueRef mrtz_alpha = NULL;
720
721 while (colors_written) {
722 int write_i = u_bit_scan(&colors_written);
723 unsigned color_type = (key->ps_epilog.color_types >> (write_i * 2)) & 0x3;
724 LLVMValueRef arg = ac_get_arg(&ctx->ac, color_args[write_i]);
725
726 if (color_type != SI_TYPE_ANY32)
727 arg = LLVMBuildBitCast(ctx->ac.builder, arg, LLVMVectorType(ctx->ac.f16, 8), "");
728
729 for (i = 0; i < 4; i++)
730 color[write_i][i] = ac_llvm_extract_elem(&ctx->ac, arg, i);
731
732 if (key->ps_epilog.states.alpha_to_coverage_via_mrtz && write_i == 0)
733 mrtz_alpha = color[0][3];
734
735 si_llvm_build_clamp_alpha_test(ctx, color[write_i], write_i);
736 }
737 bool writes_z = key->ps_epilog.writes_z && !key->ps_epilog.states.kill_z;
738 bool writes_stencil = key->ps_epilog.writes_stencil && !key->ps_epilog.states.kill_stencil;
739 bool writes_samplemask = key->ps_epilog.writes_samplemask && !key->ps_epilog.states.kill_samplemask;
740
741 /* Prepare the mrtz export. */
742 if (writes_z || writes_stencil || writes_samplemask || mrtz_alpha) {
743 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
744
745 if (writes_z)
746 depth = ac_get_arg(&ctx->ac, depth_arg);
747 if (writes_stencil)
748 stencil = ac_get_arg(&ctx->ac, stencil_arg);
749 if (writes_samplemask)
750 samplemask = ac_get_arg(&ctx->ac, samplemask_arg);
751
752 ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, mrtz_alpha, false,
753 &exp.args[exp.num++]);
754 }
755
756 /* Prepare color exports. */
757 const unsigned first_color_export = exp.num;
758 colors_written = key->ps_epilog.colors_written;
759
760 while (colors_written) {
761 int write_i = u_bit_scan(&colors_written);
762 unsigned color_type = (key->ps_epilog.color_types >> (write_i * 2)) & 0x3;
763
764 si_export_mrt_color(ctx, color[write_i], write_i, first_color_export, color_type,
765 key->ps_epilog.writes_all_cbufs, &exp);
766 }
767
768 if (exp.num) {
769 exp.args[exp.num - 1].valid_mask = 1; /* whether the EXEC mask is valid */
770 exp.args[exp.num - 1].done = 1; /* DONE bit */
771
772 if (key->ps_epilog.states.dual_src_blend_swizzle) {
773 assert(ctx->ac.gfx_level >= GFX11);
774 assert((key->ps_epilog.colors_written & 0x3) == 0x3);
775 ac_build_dual_src_blend_swizzle(&ctx->ac, &exp.args[first_color_export],
776 &exp.args[first_color_export + 1]);
777 }
778
779 for (unsigned i = 0; i < exp.num; i++)
780 ac_build_export(&ctx->ac, &exp.args[i]);
781 } else {
782 ac_build_export_null(&ctx->ac, key->ps_epilog.uses_discard);
783 }
784
785 /* Compile. */
786 LLVMBuildRetVoid(ctx->ac.builder);
787 }
788