• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2020 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "si_pipe.h"
8 #include "si_shader_internal.h"
9 #include "si_shader_llvm.h"
10 #include "sid.h"
11 
/**
 * Emit the load of one channel of a fragment shader input attribute.
 *
 * If at least one of the barycentric weights (i, j) is non-NULL, the value is
 * interpolated with ac_build_fs_interp. If both are NULL, the attribute is
 * fetched with ac_build_fs_interp_mov using parameter selector 0 (P0).
 */
static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx, unsigned attr_index,
                                       unsigned chan, LLVMValueRef prim_mask, LLVMValueRef i,
                                       LLVMValueRef j)
{
   LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, 0);
   LLVMValueRef llvm_attr = LLVMConstInt(ctx->ac.i32, attr_index, 0);

   /* No interpolation weights at all: plain attribute move. */
   if (!i && !j)
      return ac_build_fs_interp_mov(&ctx->ac, 0 /* P0 */, llvm_chan, llvm_attr, prim_mask);

   return ac_build_fs_interp(&ctx->ac, llvm_chan, llvm_attr, prim_mask, i, j);
}
24 
25 /**
26  * Interpolate a fragment shader input.
27  *
28  * @param ctx                context
29  * @param input_index        index of the input in hardware
30  * @param semantic_index     semantic index
31  * @param num_interp_inputs  number of all interpolated inputs (= BCOLOR offset)
32  * @param colors_read_mask   color components read (4 bits for each color, 8 bits in total)
33  * @param interp_param       interpolation weights (i,j)
34  * @param prim_mask          SI_PARAM_PRIM_MASK
35  * @param face               SI_PARAM_FRONT_FACE
36  * @param result             the return value (4 components)
37  */
interp_fs_color(struct si_shader_context * ctx,unsigned input_index,unsigned semantic_index,unsigned num_interp_inputs,unsigned colors_read_mask,LLVMValueRef interp_param,LLVMValueRef prim_mask,LLVMValueRef face,LLVMValueRef result[4])38 static void interp_fs_color(struct si_shader_context *ctx, unsigned input_index,
39                             unsigned semantic_index, unsigned num_interp_inputs,
40                             unsigned colors_read_mask, LLVMValueRef interp_param,
41                             LLVMValueRef prim_mask, LLVMValueRef face, LLVMValueRef result[4])
42 {
43    LLVMValueRef i = NULL, j = NULL;
44    unsigned chan;
45 
46    /* fs.constant returns the param from the middle vertex, so it's not
47     * really useful for flat shading. It's meant to be used for custom
48     * interpolation (but the intrinsic can't fetch from the other two
49     * vertices).
50     *
51     * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
52     * to do the right thing. The only reason we use fs.constant is that
53     * fs.interp cannot be used on integers, because they can be equal
54     * to NaN.
55     *
56     * When interp is false we will use fs.constant or for newer llvm,
57     * amdgcn.interp.mov.
58     */
59    bool interp = interp_param != NULL;
60 
61    if (interp) {
62       i = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_0, "");
63       j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_1, "");
64    }
65 
66    if (ctx->shader->key.ps.part.prolog.color_two_side) {
67       LLVMValueRef is_face_positive;
68 
69       /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
70        * otherwise it's at offset "num_inputs".
71        */
72       unsigned back_attr_offset = num_interp_inputs;
73       if (semantic_index == 1 && colors_read_mask & 0xf)
74          back_attr_offset += 1;
75 
76       is_face_positive = LLVMBuildFCmp(ctx->ac.builder, LLVMRealOLT, ctx->ac.f32_0, face, "");
77 
78       for (chan = 0; chan < 4; chan++) {
79          LLVMValueRef front, back;
80 
81          front = si_build_fs_interp(ctx, input_index, chan, prim_mask, i, j);
82          back = si_build_fs_interp(ctx, back_attr_offset, chan, prim_mask, i, j);
83 
84          result[chan] = LLVMBuildSelect(ctx->ac.builder, is_face_positive, front, back, "");
85       }
86    } else {
87       for (chan = 0; chan < 4; chan++) {
88          result[chan] = si_build_fs_interp(ctx, input_index, chan, prim_mask, i, j);
89       }
90    }
91 }
92 
/**
 * Emit the alpha test: kill the fragment when comparing "alpha" against the
 * alpha reference argument with the epilog's alpha_func fails.
 *
 * PIPE_FUNC_NEVER kills unconditionally. The caller is expected not to call
 * this for a function that has no entry in the predicate table (the assert
 * below fires for an unmapped function).
 */
static void si_alpha_test(struct si_shader_context *ctx, LLVMValueRef alpha)
{
   const unsigned func = ctx->shader->key.ps.part.epilog.alpha_func;

   if (func == PIPE_FUNC_NEVER) {
      /* The test can never pass. */
      ac_build_kill_if_false(&ctx->ac, ctx->ac.i1false);
      return;
   }

   /* Unlisted entries are zero-initialized and are rejected by the assert. */
   static const LLVMRealPredicate predicate_for_func[PIPE_FUNC_ALWAYS + 1] = {
      [PIPE_FUNC_LESS] = LLVMRealOLT,
      [PIPE_FUNC_EQUAL] = LLVMRealOEQ,
      [PIPE_FUNC_LEQUAL] = LLVMRealOLE,
      [PIPE_FUNC_GREATER] = LLVMRealOGT,
      [PIPE_FUNC_NOTEQUAL] = LLVMRealONE,
      [PIPE_FUNC_GEQUAL] = LLVMRealOGE,
   };
   const LLVMRealPredicate predicate = predicate_for_func[func];
   assert(predicate);

   /* Bring the reference value down to f16 when the alpha value is f16. */
   LLVMValueRef ref = ac_get_arg(&ctx->ac, ctx->args->alpha_reference);
   if (LLVMTypeOf(alpha) == ctx->ac.f16)
      ref = LLVMBuildFPTrunc(ctx->ac.builder, ref, ctx->ac.f16, "");

   LLVMValueRef passed = LLVMBuildFCmp(ctx->ac.builder, predicate, alpha, ref, "");
   ac_build_kill_if_false(&ctx->ac, passed);
}
114 
/* Accumulates the export instructions of a pixel shader epilog before they
 * are emitted.
 */
struct si_ps_exports {
   unsigned num;                   /* number of entries of args[] filled so far */
   struct ac_export_args args[10]; /* presumably sized for one MRTZ export plus
                                    * the color exports - TODO confirm */
};
119 
/* Gather two values into a vector and reinterpret the result as v2f16. */
static LLVMValueRef pack_two_16bit(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef pair = ac_build_gather_values(ctx, args, 2);

   return LLVMBuildBitCast(ctx->builder, pair, ctx->v2f16, "");
}
125 
/**
 * Widen a color channel to 32 bits according to its SI_TYPE_* color type.
 *
 * FLOAT16 is float-extended to f32. INT16/UINT16 are sign-/zero-extended to
 * i32 and returned through ac_to_float. ANY32 is returned unchanged.
 * Any other color type yields NULL.
 */
static LLVMValueRef get_color_32bit(struct si_shader_context *ctx, unsigned color_type,
                                    LLVMValueRef value)
{
   if (color_type == SI_TYPE_ANY32)
      return value;

   if (color_type == SI_TYPE_FLOAT16)
      return LLVMBuildFPExt(ctx->ac.builder, value, ctx->ac.f32, "");

   const bool is_signed = color_type == SI_TYPE_INT16;
   if (!is_signed && color_type != SI_TYPE_UINT16)
      return NULL;

   /* 16-bit integer: extend in the integer domain, then return it as a float. */
   LLVMValueRef narrow = ac_to_integer(&ctx->ac, value);
   LLVMValueRef wide = is_signed ? LLVMBuildSExt(ctx->ac.builder, narrow, ctx->ac.i32, "")
                                 : LLVMBuildZExt(ctx->ac.builder, narrow, ctx->ac.i32, "");
   return ac_to_float(&ctx->ac, wide);
}
145 
146 /* Initialize arguments for the shader export intrinsic */
si_llvm_init_ps_export_args(struct si_shader_context * ctx,LLVMValueRef * values,unsigned cbuf,unsigned compacted_mrt_index,unsigned color_type,struct ac_export_args * args)147 static bool si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValueRef *values,
148                                         unsigned cbuf, unsigned compacted_mrt_index,
149                                         unsigned color_type, struct ac_export_args *args)
150 {
151    const union si_shader_key *key = &ctx->shader->key;
152    unsigned col_formats = key->ps.part.epilog.spi_shader_col_format;
153    LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32);
154    unsigned spi_shader_col_format;
155    unsigned chan;
156    bool is_int8, is_int10;
157 
158    assert(cbuf < 8);
159 
160    spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
161    if (spi_shader_col_format == V_028714_SPI_SHADER_ZERO)
162       return false;
163 
164    is_int8 = (key->ps.part.epilog.color_is_int8 >> cbuf) & 0x1;
165    is_int10 = (key->ps.part.epilog.color_is_int10 >> cbuf) & 0x1;
166 
167    /* Default is 0xf. Adjusted below depending on the format. */
168    args->enabled_channels = 0xf; /* writemask */
169 
170    /* Specify whether the EXEC mask represents the valid mask */
171    args->valid_mask = 0;
172 
173    /* Specify whether this is the last export */
174    args->done = 0;
175 
176    /* Specify the target we are exporting */
177    args->target = V_008DFC_SQ_EXP_MRT + compacted_mrt_index;
178 
179    if (key->ps.part.epilog.dual_src_blend_swizzle &&
180        (compacted_mrt_index == 0 || compacted_mrt_index == 1)) {
181       assert(ctx->ac.gfx_level >= GFX11);
182       args->target += 21;
183    }
184 
185    args->compr = false;
186    args->out[0] = f32undef;
187    args->out[1] = f32undef;
188    args->out[2] = f32undef;
189    args->out[3] = f32undef;
190 
191    LLVMValueRef (*packf)(struct ac_llvm_context * ctx, LLVMValueRef args[2]) = NULL;
192    LLVMValueRef (*packi)(struct ac_llvm_context * ctx, LLVMValueRef args[2], unsigned bits,
193                          bool hi) = NULL;
194 
195    switch (spi_shader_col_format) {
196    case V_028714_SPI_SHADER_32_R:
197       args->enabled_channels = 1; /* writemask */
198       args->out[0] = get_color_32bit(ctx, color_type, values[0]);
199       break;
200 
201    case V_028714_SPI_SHADER_32_GR:
202       args->enabled_channels = 0x3; /* writemask */
203       args->out[0] = get_color_32bit(ctx, color_type, values[0]);
204       args->out[1] = get_color_32bit(ctx, color_type, values[1]);
205       break;
206 
207    case V_028714_SPI_SHADER_32_AR:
208       if (ctx->screen->info.gfx_level >= GFX10) {
209          args->enabled_channels = 0x3; /* writemask */
210          args->out[0] = get_color_32bit(ctx, color_type, values[0]);
211          args->out[1] = get_color_32bit(ctx, color_type, values[3]);
212       } else {
213          args->enabled_channels = 0x9; /* writemask */
214          args->out[0] = get_color_32bit(ctx, color_type, values[0]);
215          args->out[3] = get_color_32bit(ctx, color_type, values[3]);
216       }
217       break;
218 
219    case V_028714_SPI_SHADER_FP16_ABGR:
220       if (color_type != SI_TYPE_ANY32)
221          packf = pack_two_16bit;
222       else
223          packf = ac_build_cvt_pkrtz_f16;
224       break;
225 
226    case V_028714_SPI_SHADER_UNORM16_ABGR:
227       if (color_type != SI_TYPE_ANY32)
228          packf = ac_build_cvt_pknorm_u16_f16;
229       else
230          packf = ac_build_cvt_pknorm_u16;
231       break;
232 
233    case V_028714_SPI_SHADER_SNORM16_ABGR:
234       if (color_type != SI_TYPE_ANY32)
235          packf = ac_build_cvt_pknorm_i16_f16;
236       else
237          packf = ac_build_cvt_pknorm_i16;
238       break;
239 
240    case V_028714_SPI_SHADER_UINT16_ABGR:
241       if (color_type != SI_TYPE_ANY32)
242          packf = pack_two_16bit;
243       else
244          packi = ac_build_cvt_pk_u16;
245       break;
246 
247    case V_028714_SPI_SHADER_SINT16_ABGR:
248       if (color_type != SI_TYPE_ANY32)
249          packf = pack_two_16bit;
250       else
251          packi = ac_build_cvt_pk_i16;
252       break;
253 
254    case V_028714_SPI_SHADER_32_ABGR:
255       for (unsigned i = 0; i < 4; i++)
256          args->out[i] = get_color_32bit(ctx, color_type, values[i]);
257       break;
258    }
259 
260    /* Pack f16 or norm_i16/u16. */
261    if (packf) {
262       for (chan = 0; chan < 2; chan++) {
263          LLVMValueRef pack_args[2] = {values[2 * chan], values[2 * chan + 1]};
264          LLVMValueRef packed;
265 
266          packed = packf(&ctx->ac, pack_args);
267          args->out[chan] = ac_to_float(&ctx->ac, packed);
268       }
269    }
270    /* Pack i16/u16. */
271    if (packi) {
272       for (chan = 0; chan < 2; chan++) {
273          LLVMValueRef pack_args[2] = {ac_to_integer(&ctx->ac, values[2 * chan]),
274                                       ac_to_integer(&ctx->ac, values[2 * chan + 1])};
275          LLVMValueRef packed;
276 
277          packed = packi(&ctx->ac, pack_args, is_int8 ? 8 : is_int10 ? 10 : 16, chan == 1);
278          args->out[chan] = ac_to_float(&ctx->ac, packed);
279       }
280    }
281    if (packf || packi) {
282       if (ctx->screen->info.gfx_level >= GFX11)
283          args->enabled_channels = 0x3;
284       else
285          args->compr = 1; /* COMPR flag */
286    }
287 
288    return true;
289 }
290 
/**
 * Apply the per-color epilog states to "color" (4 channels), in this order:
 * color clamping, alpha-to-one, and (for color index 0 only) the alpha test.
 */
static void si_llvm_build_clamp_alpha_test(struct si_shader_context *ctx,
                                           LLVMValueRef *color, unsigned index)
{
   const union si_shader_key *key = &ctx->shader->key;

   /* Clamp color */
   if (key->ps.part.epilog.clamp_color) {
      for (unsigned c = 0; c < 4; c++)
         color[c] = ac_build_clamp(&ctx->ac, color[c]);
   }

   /* Alpha to one: the constant takes the type of the first channel. */
   if (key->ps.part.epilog.alpha_to_one)
      color[3] = LLVMConstReal(LLVMTypeOf(color[0]), 1);

   /* Alpha test: skipped entirely when the function always passes. */
   const bool alpha_test_enabled = key->ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS;
   if (index == 0 && alpha_test_enabled)
      si_alpha_test(ctx, color[3]);
}
309 
/**
 * Append the export(s) of one color output to "exp".
 *
 * With writes_all_cbufs, the same color is set up for each of the 8 color
 * buffers; otherwise only for color buffer "index". An export slot is only
 * consumed when si_llvm_init_ps_export_args reports that it filled it.
 */
static void si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *color, unsigned index,
                                unsigned first_color_export, unsigned color_type,
                                bool writes_all_cbufs, struct si_ps_exports *exp)
{
   if (!writes_all_cbufs) {
      /* Export */
      struct ac_export_args *slot = &exp->args[exp->num];

      if (si_llvm_init_ps_export_args(ctx, color, index, exp->num - first_color_export,
                                      color_type, slot)) {
         assert(slot->enabled_channels);
         exp->num++;
      }
      return;
   }

   assert(exp->num == first_color_export);

   /* This will do nothing for color buffers with SPI_SHADER_COL_FORMAT=ZERO, so always
    * iterate over all 8.
    */
   for (unsigned cbuf = 0; cbuf < 8; cbuf++) {
      struct ac_export_args *slot = &exp->args[exp->num];

      if (si_llvm_init_ps_export_args(ctx, color, cbuf, exp->num - first_color_export,
                                      color_type, slot)) {
         assert(slot->enabled_channels);
         exp->num++;
      }
   }
}
336 
337 /**
338  * Return PS outputs in this order:
339  *
340  * v[0:3] = color0.xyzw
341  * v[4:7] = color1.xyzw
342  * ...
343  * vN+0 = Depth
344  * vN+1 = Stencil
345  * vN+2 = SampleMask
346  * vN+3 = SampleMaskIn (used for OpenGL smoothing)
347  *
348  * The alpha-ref SGPR is returned via its original location.
349  */
si_llvm_ps_build_end(struct si_shader_context * ctx)350 void si_llvm_ps_build_end(struct si_shader_context *ctx)
351 {
352    struct si_shader *shader = ctx->shader;
353    struct si_shader_info *info = &shader->selector->info;
354    LLVMBuilderRef builder = ctx->ac.builder;
355    unsigned i, j, vgpr;
356    LLVMValueRef *addrs = ctx->abi.outputs;
357 
358    LLVMValueRef color[8][4] = {};
359    LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
360    LLVMValueRef ret;
361 
362    /* Read the output values. */
363    for (i = 0; i < info->num_outputs; i++) {
364       unsigned semantic = info->output_semantic[i];
365       LLVMTypeRef type = ctx->abi.is_16bit[4 * i] ? ctx->ac.f16 : ctx->ac.f32;
366 
367       switch (semantic) {
368       case FRAG_RESULT_DEPTH:
369          depth = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
370          break;
371       case FRAG_RESULT_STENCIL:
372          stencil = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
373          break;
374       case FRAG_RESULT_SAMPLE_MASK:
375          samplemask = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
376          break;
377       default:
378          if (semantic >= FRAG_RESULT_DATA0 && semantic <= FRAG_RESULT_DATA7) {
379             unsigned index = semantic - FRAG_RESULT_DATA0;
380 
381             for (j = 0; j < 4; j++) {
382                LLVMValueRef ptr = addrs[4 * i + j];
383                type = ctx->abi.is_16bit[4 * i + j] ? ctx->ac.f16 : ctx->ac.f32;
384                LLVMValueRef result = LLVMBuildLoad2(builder, type, ptr, "");
385                color[index][j] = result;
386             }
387          } else {
388             fprintf(stderr, "Warning: Unhandled fs output type:%d\n", semantic);
389          }
390          break;
391       }
392    }
393 
394    /* Fill the return structure. */
395    ret = ctx->return_value;
396 
397    /* Set SGPRs. */
398    ret = LLVMBuildInsertValue(
399       builder, ret, ac_to_integer(&ctx->ac, LLVMGetParam(ctx->main_fn.value, SI_PARAM_ALPHA_REF)),
400       SI_SGPR_ALPHA_REF, "");
401 
402    /* Set VGPRs */
403    vgpr = SI_SGPR_ALPHA_REF + 1;
404    for (i = 0; i < ARRAY_SIZE(color); i++) {
405       if (!color[i][0])
406          continue;
407 
408       if (LLVMTypeOf(color[i][0]) == ctx->ac.f16) {
409          for (j = 0; j < 2; j++) {
410             LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, &color[i][j * 2], 2);
411             tmp = LLVMBuildBitCast(builder, tmp, ctx->ac.f32, "");
412             ret = LLVMBuildInsertValue(builder, ret, tmp, vgpr++, "");
413          }
414          vgpr += 2;
415       } else {
416          for (j = 0; j < 4; j++)
417             ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
418       }
419    }
420    if (depth)
421       ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
422    if (stencil)
423       ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
424    if (samplemask)
425       ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
426 
427    ctx->return_value = ret;
428 }
429 
si_llvm_emit_polygon_stipple(struct si_shader_context * ctx)430 static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx)
431 {
432    LLVMBuilderRef builder = ctx->ac.builder;
433    LLVMValueRef desc, offset, row, bit, address[2];
434 
435    /* Use the fixed-point gl_FragCoord input.
436     * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
437     * per coordinate to get the repeating effect.
438     */
439    address[0] = si_unpack_param(ctx, ctx->args->ac.pos_fixed_pt, 0, 5);
440    address[1] = si_unpack_param(ctx, ctx->args->ac.pos_fixed_pt, 16, 5);
441 
442    /* Load the buffer descriptor. */
443    desc = si_prolog_get_internal_binding_slot(ctx, SI_PS_CONST_POLY_STIPPLE);
444 
445    /* The stipple pattern is 32x32, each row has 32 bits. */
446    offset = LLVMBuildMul(builder, address[1], LLVMConstInt(ctx->ac.i32, 4, 0), "");
447    row = si_buffer_load_const(ctx, desc, offset);
448    row = ac_to_integer(&ctx->ac, row);
449    bit = LLVMBuildLShr(builder, row, address[0], "");
450    bit = LLVMBuildTrunc(builder, bit, ctx->ac.i1, "");
451    ac_build_kill_if_false(&ctx->ac, bit);
452 }
453 
/**
 * Insert "data" into the return value "ret" at the slot(s) that correspond
 * to shader argument "arg_index".
 *
 * VGPR arguments are placed after all used SGPRs. An argument of size 1 is
 * inserted as-is; an argument of size 2 is a 2-element vector whose elements
 * are inserted into two consecutive slots.
 *
 * @return the updated return value
 */
static LLVMValueRef insert_ret_of_arg(struct si_shader_context *ctx, LLVMValueRef ret,
                                      LLVMValueRef data, unsigned arg_index)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   unsigned slot = ctx->args->ac.args[arg_index].offset;
   if (ctx->args->ac.args[arg_index].file == AC_ARG_VGPR)
      slot += ctx->args->ac.num_sgprs_used;

   if (ctx->args->ac.args[arg_index].size == 1)
      return LLVMBuildInsertValue(builder, ret, data, slot, "");

   assert(ctx->args->ac.args[arg_index].size == 2);

   for (unsigned elem = 0; elem < 2; elem++) {
      LLVMValueRef elem_index = elem == 0 ? ctx->ac.i32_0 : ctx->ac.i32_1;
      LLVMValueRef scalar = LLVMBuildExtractElement(builder, data, elem_index, "");

      ret = LLVMBuildInsertValue(builder, ret, scalar, slot + elem, "");
   }
   return ret;
}
472 
473 /**
474  * Build the pixel shader prolog function. This handles:
475  * - two-side color selection and interpolation
476  * - overriding interpolation parameters for the API PS
477  * - polygon stippling
478  *
479  * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
480  * overridden by other states. (e.g. per-sample interpolation)
481  * Interpolated colors are stored after the preloaded VGPRs.
482  */
si_llvm_build_ps_prolog(struct si_shader_context * ctx,union si_shader_part_key * key)483 void si_llvm_build_ps_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
484 {
485    struct si_shader_args *args = ctx->args;
486    si_get_ps_prolog_args(args, key);
487 
488    /* Declare outputs (same as inputs + add colors if needed) */
489    LLVMTypeRef return_types[AC_MAX_ARGS];
490    int num_returns = 0;
491 
492    for (int i = 0; i < args->ac.num_sgprs_used; i++)
493       return_types[num_returns++] = ctx->ac.i32;
494 
495    unsigned num_color_channels = util_bitcount(key->ps_prolog.colors_read);
496    unsigned num_output_vgprs = args->ac.num_vgprs_used + num_color_channels;
497    for (int i = 0; i < num_output_vgprs; i++)
498       return_types[num_returns++] = ctx->ac.f32;
499 
500    /* Create the function. */
501    si_llvm_create_func(ctx, "ps_prolog", return_types, num_returns, 0);
502    LLVMValueRef func = ctx->main_fn.value;
503 
504    /* Copy inputs to outputs. This should be no-op, as the registers match,
505     * but it will prevent the compiler from overwriting them unintentionally.
506     */
507    LLVMValueRef ret = ctx->return_value;
508    for (int i = 0; i < args->ac.arg_count; i++) {
509       LLVMValueRef p = LLVMGetParam(func, i);
510       ret = insert_ret_of_arg(ctx, ret, p, i);
511    }
512 
513    /* Polygon stippling. */
514    if (key->ps_prolog.states.poly_stipple)
515       si_llvm_emit_polygon_stipple(ctx);
516 
517    if (key->ps_prolog.states.bc_optimize_for_persp ||
518        key->ps_prolog.states.bc_optimize_for_linear) {
519       LLVMValueRef center, centroid, tmp;
520 
521       /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
522        * The hw doesn't compute CENTROID if the whole wave only
523        * contains fully-covered quads.
524        */
525       LLVMValueRef bc_optimize = ac_get_arg(&ctx->ac, args->ac.prim_mask);
526       bc_optimize =
527          LLVMBuildLShr(ctx->ac.builder, bc_optimize, LLVMConstInt(ctx->ac.i32, 31, 0), "");
528       bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize, ctx->ac.i1, "");
529 
530       if (key->ps_prolog.states.bc_optimize_for_persp) {
531          center = ac_get_arg(&ctx->ac, args->ac.persp_center);
532          centroid = ac_get_arg(&ctx->ac, args->ac.persp_centroid);
533          /* Select PERSP_CENTROID. */
534          tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center, centroid, "");
535          ret = insert_ret_of_arg(ctx, ret, tmp, args->ac.persp_centroid.arg_index);
536       }
537       if (key->ps_prolog.states.bc_optimize_for_linear) {
538          center = ac_get_arg(&ctx->ac, args->ac.linear_center);
539          centroid = ac_get_arg(&ctx->ac, args->ac.linear_centroid);
540          /* Select PERSP_CENTROID. */
541          tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center, centroid, "");
542          ret = insert_ret_of_arg(ctx, ret, tmp, args->ac.linear_centroid.arg_index);
543       }
544    }
545 
546    /* Force per-sample interpolation. */
547    if (key->ps_prolog.states.force_persp_sample_interp) {
548       LLVMValueRef persp_sample = ac_get_arg(&ctx->ac, args->ac.persp_sample);
549       /* Overwrite PERSP_CENTER. */
550       ret = insert_ret_of_arg(ctx, ret, persp_sample, args->ac.persp_center.arg_index);
551       /* Overwrite PERSP_CENTROID. */
552       ret = insert_ret_of_arg(ctx, ret, persp_sample, args->ac.persp_centroid.arg_index);
553    }
554    if (key->ps_prolog.states.force_linear_sample_interp) {
555       LLVMValueRef linear_sample = ac_get_arg(&ctx->ac, args->ac.linear_sample);
556       /* Overwrite LINEAR_CENTER. */
557       ret = insert_ret_of_arg(ctx, ret, linear_sample, args->ac.linear_center.arg_index);
558       /* Overwrite LINEAR_CENTROID. */
559       ret = insert_ret_of_arg(ctx, ret, linear_sample, args->ac.linear_centroid.arg_index);
560    }
561 
562    /* Force center interpolation. */
563    if (key->ps_prolog.states.force_persp_center_interp) {
564       LLVMValueRef persp_center = ac_get_arg(&ctx->ac, args->ac.persp_center);
565       /* Overwrite PERSP_SAMPLE. */
566       ret = insert_ret_of_arg(ctx, ret, persp_center, args->ac.persp_sample.arg_index);
567       /* Overwrite PERSP_CENTROID. */
568       ret = insert_ret_of_arg(ctx, ret, persp_center, args->ac.persp_centroid.arg_index);
569    }
570    if (key->ps_prolog.states.force_linear_center_interp) {
571       LLVMValueRef linear_center = ac_get_arg(&ctx->ac, args->ac.linear_center);
572       /* Overwrite LINEAR_SAMPLE. */
573       ret = insert_ret_of_arg(ctx, ret, linear_center, args->ac.linear_sample.arg_index);
574       /* Overwrite LINEAR_CENTROID. */
575       ret = insert_ret_of_arg(ctx, ret, linear_center, args->ac.linear_centroid.arg_index);
576    }
577 
578    /* Interpolate colors. */
579    unsigned color_out_idx = 0;
580    unsigned num_input_gprs = args->ac.num_sgprs_used + args->ac.num_vgprs_used;
581    for (int i = 0; i < 2; i++) {
582       unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
583 
584       if (!writemask)
585          continue;
586 
587       /* If the interpolation qualifier is not CONSTANT (-1). */
588       LLVMValueRef interp_ij = NULL;
589       if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
590          unsigned index =
591             args->ac.num_sgprs_used + key->ps_prolog.color_interp_vgpr_index[i];
592 
593          /* Get the (i,j) updated by bc_optimize handling. */
594          LLVMValueRef interp[2] = {
595             LLVMBuildExtractValue(ctx->ac.builder, ret, index, ""),
596             LLVMBuildExtractValue(ctx->ac.builder, ret, index + 1, ""),
597          };
598          interp_ij = ac_build_gather_values(&ctx->ac, interp, 2);
599       }
600 
601       LLVMValueRef prim_mask = ac_get_arg(&ctx->ac, args->ac.prim_mask);
602 
603       LLVMValueRef face = NULL;
604       if (key->ps_prolog.states.color_two_side)
605          face = ac_get_arg(&ctx->ac, args->ac.front_face);
606 
607       LLVMValueRef color[4];
608       interp_fs_color(ctx, key->ps_prolog.color_attr_index[i], i, key->ps_prolog.num_interp_inputs,
609                       key->ps_prolog.colors_read, interp_ij, prim_mask, face, color);
610 
611       while (writemask) {
612          unsigned chan = u_bit_scan(&writemask);
613          ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan],
614                                     num_input_gprs + color_out_idx++, "");
615       }
616    }
617 
618    /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
619     * says:
620     *
621     *    "When per-sample shading is active due to the use of a fragment
622     *     input qualified by sample or due to the use of the gl_SampleID
623     *     or gl_SamplePosition variables, only the bit for the current
624     *     sample is set in gl_SampleMaskIn. When state specifies multiple
625     *     fragment shader invocations for a given fragment, the sample
626     *     mask for any single fragment shader invocation may specify a
627     *     subset of the covered samples for the fragment. In this case,
628     *     the bit corresponding to each covered sample will be set in
629     *     exactly one fragment shader invocation."
630     *
631     * The samplemask loaded by hardware is always the coverage of the
632     * entire pixel/fragment, so mask bits out based on the sample ID.
633     */
634    if (key->ps_prolog.states.samplemask_log_ps_iter) {
635       LLVMValueRef sample_id = si_unpack_param(ctx, args->ac.ancillary, 8, 4);
636       LLVMValueRef sample_mask_in;
637 
638       /* Set samplemask_log_ps_iter=3 if full sample shading is enabled even for 2x and 4x MSAA
639        * to get this fast path that fully replaces sample_mask_in with sample_id.
640        */
641       if (key->ps_prolog.states.samplemask_log_ps_iter == 3) {
642          sample_mask_in =
643             LLVMBuildSelect(ctx->ac.builder, ac_build_load_helper_invocation(&ctx->ac),
644                             ctx->ac.i32_0,
645                             LLVMBuildShl(ctx->ac.builder, ctx->ac.i32_1, sample_id, ""), "");
646       } else {
647          uint32_t ps_iter_mask =
648             ac_get_ps_iter_mask(1 << key->ps_prolog.states.samplemask_log_ps_iter);
649          sample_mask_in =
650             LLVMBuildAnd(ctx->ac.builder,
651                          ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, args->ac.sample_coverage)),
652                          LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false),
653                                       sample_id, ""), "");
654       }
655 
656       sample_mask_in = ac_to_float(&ctx->ac, sample_mask_in);
657       ret = insert_ret_of_arg(ctx, ret, sample_mask_in, args->ac.sample_coverage.arg_index);
658    } else if (key->ps_prolog.states.force_samplemask_to_helper_invocation) {
659       LLVMValueRef sample_mask_in =
660          LLVMBuildZExt(ctx->ac.builder,
661                        LLVMBuildNot(ctx->ac.builder, ac_build_load_helper_invocation(&ctx->ac), ""),
662                        ctx->ac.i32, "");
663       ret = insert_ret_of_arg(ctx, ret, ac_to_float(&ctx->ac, sample_mask_in),
664                               args->ac.sample_coverage.arg_index);
665    }
666 
667    if (key->ps_prolog.states.get_frag_coord_from_pixel_coord) {
668       LLVMValueRef pixel_coord = ac_get_arg(&ctx->ac, args->ac.pos_fixed_pt);
669       pixel_coord = LLVMBuildBitCast(ctx->ac.builder, pixel_coord, ctx->ac.v2i16, "");
670       pixel_coord = LLVMBuildUIToFP(ctx->ac.builder, pixel_coord, ctx->ac.v2f32, "");
671 
672       if (!key->ps_prolog.pixel_center_integer) {
673          LLVMValueRef vec2_half = LLVMConstVector((LLVMValueRef[]){LLVMConstReal(ctx->ac.f32, 0.5),
674                                                                    LLVMConstReal(ctx->ac.f32, 0.5)}, 2);
675          pixel_coord = LLVMBuildFAdd(ctx->ac.builder, pixel_coord, vec2_half, "");
676       }
677 
678       for (unsigned i = 0; i < 2; i++) {
679          if (!args->ac.frag_pos[i].used)
680             continue;
681 
682          ret = insert_ret_of_arg(ctx, ret,
683                                  LLVMBuildExtractElement(ctx->ac.builder, pixel_coord,
684                                                          LLVMConstInt(ctx->ac.i32, i, 0), ""),
685                                  args->ac.frag_pos[i].arg_index);
686       }
687    }
688 
689    /* Tell LLVM to insert WQM instruction sequence when needed. */
690    if (key->ps_prolog.wqm) {
691       LLVMAddTargetDependentFunctionAttr(func, "amdgpu-ps-wqm-outputs", "");
692    }
693 
694    si_llvm_build_ret(ctx, ret);
695 }
696 
697 /**
698  * Build the pixel shader epilog function. This handles everything that must be
699  * emulated for pixel shader exports. (alpha-test, format conversions, etc)
700  */
si_llvm_build_ps_epilog(struct si_shader_context * ctx,union si_shader_part_key * key)701 void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part_key *key)
702 {
703    int i;
704    struct si_ps_exports exp = {};
705    LLVMValueRef color[8][4] = {};
706 
707    struct si_shader_args *args = ctx->args;
708    struct ac_arg color_args[MAX_DRAW_BUFFERS];
709    struct ac_arg depth_arg, stencil_arg, samplemask_arg;
710    si_get_ps_epilog_args(args, key, color_args, &depth_arg, &stencil_arg, &samplemask_arg);
711 
712    /* Create the function. */
713    si_llvm_create_func(ctx, "ps_epilog", NULL, 0, 0);
714    /* Disable elimination of unused inputs. */
715    ac_llvm_add_target_dep_function_attr(ctx->main_fn.value, "InitialPSInputAddr", 0xffffff);
716 
717    /* Prepare color. */
718    unsigned colors_written = key->ps_epilog.colors_written;
719    LLVMValueRef mrtz_alpha = NULL;
720 
721    while (colors_written) {
722       int write_i = u_bit_scan(&colors_written);
723       unsigned color_type = (key->ps_epilog.color_types >> (write_i * 2)) & 0x3;
724       LLVMValueRef arg = ac_get_arg(&ctx->ac, color_args[write_i]);
725 
726       if (color_type != SI_TYPE_ANY32)
727          arg = LLVMBuildBitCast(ctx->ac.builder, arg, LLVMVectorType(ctx->ac.f16, 8), "");
728 
729       for (i = 0; i < 4; i++)
730          color[write_i][i] = ac_llvm_extract_elem(&ctx->ac, arg, i);
731 
732       if (key->ps_epilog.states.alpha_to_coverage_via_mrtz && write_i == 0)
733          mrtz_alpha = color[0][3];
734 
735       si_llvm_build_clamp_alpha_test(ctx, color[write_i], write_i);
736    }
737    bool writes_z = key->ps_epilog.writes_z && !key->ps_epilog.states.kill_z;
738    bool writes_stencil = key->ps_epilog.writes_stencil && !key->ps_epilog.states.kill_stencil;
739    bool writes_samplemask = key->ps_epilog.writes_samplemask && !key->ps_epilog.states.kill_samplemask;
740 
741    /* Prepare the mrtz export. */
742    if (writes_z || writes_stencil || writes_samplemask || mrtz_alpha) {
743       LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
744 
745       if (writes_z)
746          depth = ac_get_arg(&ctx->ac, depth_arg);
747       if (writes_stencil)
748          stencil = ac_get_arg(&ctx->ac, stencil_arg);
749       if (writes_samplemask)
750          samplemask = ac_get_arg(&ctx->ac, samplemask_arg);
751 
752       ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, mrtz_alpha, false,
753                       &exp.args[exp.num++]);
754    }
755 
756    /* Prepare color exports. */
757    const unsigned first_color_export = exp.num;
758    colors_written = key->ps_epilog.colors_written;
759 
760    while (colors_written) {
761       int write_i = u_bit_scan(&colors_written);
762       unsigned color_type = (key->ps_epilog.color_types >> (write_i * 2)) & 0x3;
763 
764       si_export_mrt_color(ctx, color[write_i], write_i, first_color_export, color_type,
765                           key->ps_epilog.writes_all_cbufs, &exp);
766    }
767 
768    if (exp.num) {
769       exp.args[exp.num - 1].valid_mask = 1;  /* whether the EXEC mask is valid */
770       exp.args[exp.num - 1].done = 1;        /* DONE bit */
771 
772       if (key->ps_epilog.states.dual_src_blend_swizzle) {
773          assert(ctx->ac.gfx_level >= GFX11);
774          assert((key->ps_epilog.colors_written & 0x3) == 0x3);
775          ac_build_dual_src_blend_swizzle(&ctx->ac, &exp.args[first_color_export],
776                                          &exp.args[first_color_export + 1]);
777       }
778 
779       for (unsigned i = 0; i < exp.num; i++)
780          ac_build_export(&ctx->ac, &exp.args[i]);
781    } else {
782       ac_build_export_null(&ctx->ac, key->ps_epilog.uses_discard);
783    }
784 
785    /* Compile. */
786    LLVMBuildRetVoid(ctx->ac.builder);
787 }
788