/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
24
25 #include "si_pipe.h"
26 #include "si_shader_internal.h"
27 #include "sid.h"
28
si_get_sample_id(struct si_shader_context * ctx)29 LLVMValueRef si_get_sample_id(struct si_shader_context *ctx)
30 {
31 return si_unpack_param(ctx, ctx->args.ancillary, 8, 4);
32 }
33
/**
 * Load the position of a sample from the internal sample-position buffer.
 *
 * The buffer descriptor is fetched from the internal bindings at slot
 * SI_PS_CONST_SAMPLE_POSITIONS. Each sample occupies 8 bytes (x, y floats).
 *
 * @param abi       shader ABI, used to recover the si_shader_context
 * @param sample_id index of the sample to look up
 * @return a 4-component vector (x, y, 0, 0)
 */
static LLVMValueRef load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   LLVMValueRef desc = ac_get_arg(&ctx->ac, ctx->internal_bindings);
   LLVMValueRef buf_index = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
   LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index);

   /* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
   LLVMValueRef offset0 =
      LLVMBuildMul(ctx->ac.builder, sample_id, LLVMConstInt(ctx->ac.i32, 8, 0), "");
   /* The y component follows x, 4 bytes later. */
   LLVMValueRef offset1 =
      LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->ac.i32, 4, 0), "");

   LLVMValueRef pos[4] = {si_buffer_load_const(ctx, resource, offset0),
                          si_buffer_load_const(ctx, resource, offset1),
                          LLVMConstReal(ctx->ac.f32, 0), LLVMConstReal(ctx->ac.f32, 0)};

   return ac_build_gather_values(&ctx->ac, pos, 4);
}
53
si_nir_emit_fbfetch(struct ac_shader_abi * abi)54 static LLVMValueRef si_nir_emit_fbfetch(struct ac_shader_abi *abi)
55 {
56 struct si_shader_context *ctx = si_shader_context_from_abi(abi);
57 struct ac_image_args args = {};
58 LLVMValueRef ptr, image, fmask;
59
60 /* Ignore src0, because KHR_blend_func_extended disallows multiple render
61 * targets.
62 */
63
64 /* Load the image descriptor. */
65 STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0);
66 STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0_FMASK % 2 == 0);
67
68 ptr = ac_get_arg(&ctx->ac, ctx->internal_bindings);
69 ptr =
70 LLVMBuildPointerCast(ctx->ac.builder, ptr, ac_array_in_const32_addr_space(ctx->ac.v8i32), "");
71 image =
72 ac_build_load_to_sgpr(&ctx->ac, ptr, LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0 / 2, 0));
73
74 unsigned chan = 0;
75
76 args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 0, 16);
77
78 if (!ctx->shader->key.ps.mono.fbfetch_is_1D)
79 args.coords[chan++] = si_unpack_param(ctx, ctx->pos_fixed_pt, 16, 16);
80
81 /* Get the current render target layer index. */
82 if (ctx->shader->key.ps.mono.fbfetch_layered)
83 args.coords[chan++] = si_unpack_param(ctx, ctx->args.ancillary, 16, 11);
84
85 if (ctx->shader->key.ps.mono.fbfetch_msaa)
86 args.coords[chan++] = si_get_sample_id(ctx);
87
88 if (ctx->screen->info.gfx_level < GFX11 &&
89 ctx->shader->key.ps.mono.fbfetch_msaa &&
90 !(ctx->screen->debug_flags & DBG(NO_FMASK))) {
91 fmask = ac_build_load_to_sgpr(&ctx->ac, ptr,
92 LLVMConstInt(ctx->ac.i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0));
93
94 ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords,
95 ctx->shader->key.ps.mono.fbfetch_layered);
96 }
97
98 args.opcode = ac_image_load;
99 args.resource = image;
100 args.dmask = 0xf;
101 args.attributes = AC_FUNC_ATTR_READNONE;
102
103 if (ctx->shader->key.ps.mono.fbfetch_msaa)
104 args.dim =
105 ctx->shader->key.ps.mono.fbfetch_layered ? ac_image_2darraymsaa : ac_image_2dmsaa;
106 else if (ctx->shader->key.ps.mono.fbfetch_is_1D)
107 args.dim = ctx->shader->key.ps.mono.fbfetch_layered ? ac_image_1darray : ac_image_1d;
108 else
109 args.dim = ctx->shader->key.ps.mono.fbfetch_layered ? ac_image_2darray : ac_image_2d;
110
111 return ac_build_image_opcode(&ctx->ac, &args);
112 }
113
/**
 * Fetch one channel of a fragment shader input attribute.
 *
 * When either interpolation weight is non-NULL the attribute is interpolated
 * with (i, j); otherwise the value is fetched without interpolation via
 * ac_build_fs_interp_mov.
 *
 * @param ctx        shader context
 * @param attr_index hardware attribute index
 * @param chan       channel of the attribute (0..3)
 * @param prim_mask  primitive mask value
 * @param i          first interpolation weight, or NULL
 * @param j          second interpolation weight, or NULL
 * @return the fetched channel value
 */
static LLVMValueRef si_build_fs_interp(struct si_shader_context *ctx, unsigned attr_index,
                                       unsigned chan, LLVMValueRef prim_mask, LLVMValueRef i,
                                       LLVMValueRef j)
{
   if (i || j) {
      return ac_build_fs_interp(&ctx->ac, LLVMConstInt(ctx->ac.i32, chan, 0),
                                LLVMConstInt(ctx->ac.i32, attr_index, 0), prim_mask, i, j);
   }
   return ac_build_fs_interp_mov(&ctx->ac, LLVMConstInt(ctx->ac.i32, 2, 0), /* P0 */
                                 LLVMConstInt(ctx->ac.i32, chan, 0),
                                 LLVMConstInt(ctx->ac.i32, attr_index, 0), prim_mask);
}
126
127 /**
128 * Interpolate a fragment shader input.
129 *
130 * @param ctx context
131 * @param input_index index of the input in hardware
132 * @param semantic_index semantic index
133 * @param num_interp_inputs number of all interpolated inputs (= BCOLOR offset)
134 * @param colors_read_mask color components read (4 bits for each color, 8 bits in total)
135 * @param interp_param interpolation weights (i,j)
136 * @param prim_mask SI_PARAM_PRIM_MASK
137 * @param face SI_PARAM_FRONT_FACE
138 * @param result the return value (4 components)
139 */
interp_fs_color(struct si_shader_context * ctx,unsigned input_index,unsigned semantic_index,unsigned num_interp_inputs,unsigned colors_read_mask,LLVMValueRef interp_param,LLVMValueRef prim_mask,LLVMValueRef face,LLVMValueRef result[4])140 static void interp_fs_color(struct si_shader_context *ctx, unsigned input_index,
141 unsigned semantic_index, unsigned num_interp_inputs,
142 unsigned colors_read_mask, LLVMValueRef interp_param,
143 LLVMValueRef prim_mask, LLVMValueRef face, LLVMValueRef result[4])
144 {
145 LLVMValueRef i = NULL, j = NULL;
146 unsigned chan;
147
148 /* fs.constant returns the param from the middle vertex, so it's not
149 * really useful for flat shading. It's meant to be used for custom
150 * interpolation (but the intrinsic can't fetch from the other two
151 * vertices).
152 *
153 * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
154 * to do the right thing. The only reason we use fs.constant is that
155 * fs.interp cannot be used on integers, because they can be equal
156 * to NaN.
157 *
158 * When interp is false we will use fs.constant or for newer llvm,
159 * amdgcn.interp.mov.
160 */
161 bool interp = interp_param != NULL;
162
163 if (interp) {
164 interp_param =
165 LLVMBuildBitCast(ctx->ac.builder, interp_param, ctx->ac.v2f32, "");
166
167 i = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_0, "");
168 j = LLVMBuildExtractElement(ctx->ac.builder, interp_param, ctx->ac.i32_1, "");
169 }
170
171 if (ctx->shader->key.ps.part.prolog.color_two_side) {
172 LLVMValueRef is_face_positive;
173
174 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
175 * otherwise it's at offset "num_inputs".
176 */
177 unsigned back_attr_offset = num_interp_inputs;
178 if (semantic_index == 1 && colors_read_mask & 0xf)
179 back_attr_offset += 1;
180
181 is_face_positive = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, face, ctx->ac.i32_0, "");
182
183 for (chan = 0; chan < 4; chan++) {
184 LLVMValueRef front, back;
185
186 front = si_build_fs_interp(ctx, input_index, chan, prim_mask, i, j);
187 back = si_build_fs_interp(ctx, back_attr_offset, chan, prim_mask, i, j);
188
189 result[chan] = LLVMBuildSelect(ctx->ac.builder, is_face_positive, front, back, "");
190 }
191 } else {
192 for (chan = 0; chan < 4; chan++) {
193 result[chan] = si_build_fs_interp(ctx, input_index, chan, prim_mask, i, j);
194 }
195 }
196 }
197
/**
 * Emit the alpha test: kill the fragment when "alpha <alpha_func> alpha_ref"
 * is false.
 *
 * PIPE_FUNC_NEVER kills unconditionally. PIPE_FUNC_ALWAYS has no entry in the
 * predicate table and trips the assertion; the caller in this file only
 * invokes the test when alpha_func != PIPE_FUNC_ALWAYS.
 *
 * @param ctx   shader context
 * @param alpha alpha value to test (f32 or f16)
 */
static void si_alpha_test(struct si_shader_context *ctx, LLVMValueRef alpha)
{
   if (ctx->shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_NEVER) {
      /* Read-only lookup table; entries not listed stay zero. */
      static const LLVMRealPredicate cond_map[PIPE_FUNC_ALWAYS + 1] = {
         [PIPE_FUNC_LESS] = LLVMRealOLT,     [PIPE_FUNC_EQUAL] = LLVMRealOEQ,
         [PIPE_FUNC_LEQUAL] = LLVMRealOLE,   [PIPE_FUNC_GREATER] = LLVMRealOGT,
         [PIPE_FUNC_NOTEQUAL] = LLVMRealONE, [PIPE_FUNC_GEQUAL] = LLVMRealOGE,
      };
      LLVMRealPredicate cond = cond_map[ctx->shader->key.ps.part.epilog.alpha_func];
      assert(cond);

      LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn, SI_PARAM_ALPHA_REF);
      /* Compare in the precision of the alpha value. */
      if (LLVMTypeOf(alpha) == ctx->ac.f16)
         alpha_ref = LLVMBuildFPTrunc(ctx->ac.builder, alpha_ref, ctx->ac.f16, "");

      LLVMValueRef alpha_pass = LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, "");
      ac_build_kill_if_false(&ctx->ac, alpha_pass);
   } else {
      ac_build_kill_if_false(&ctx->ac, ctx->ac.i1false);
   }
}
219
220 struct si_ps_exports {
221 unsigned num;
222 struct ac_export_args args[10];
223 };
224
/**
 * Gather two values into a 2-element vector and bitcast it to v2f16.
 *
 * The signature matches the "packf" callback used by
 * si_llvm_init_ps_export_args for inputs that are already 16-bit.
 *
 * @param ctx  LLVM context
 * @param args the two values to pack
 * @return the pair as a v2f16 value
 */
static LLVMValueRef pack_two_16bit(struct ac_llvm_context *ctx, LLVMValueRef args[2])
{
   LLVMValueRef tmp = ac_build_gather_values(ctx, args, 2);
   return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2f16, "");
}
230
/**
 * Widen a color channel to 32 bits according to its declared type.
 *
 * - SI_TYPE_FLOAT16: floating-point extension to f32
 * - SI_TYPE_INT16:   sign extension to i32, returned as float bits
 * - SI_TYPE_UINT16:  zero extension to i32, returned as float bits
 * - SI_TYPE_ANY32:   returned unchanged
 *
 * @param ctx        shader context
 * @param color_type one of the SI_TYPE_* values above
 * @param value      the channel value
 * @return the 32-bit value, or NULL for an unrecognized color_type
 */
static LLVMValueRef get_color_32bit(struct si_shader_context *ctx, unsigned color_type,
                                    LLVMValueRef value)
{
   switch (color_type) {
   case SI_TYPE_FLOAT16:
      return LLVMBuildFPExt(ctx->ac.builder, value, ctx->ac.f32, "");
   case SI_TYPE_INT16:
      value = ac_to_integer(&ctx->ac, value);
      value = LLVMBuildSExt(ctx->ac.builder, value, ctx->ac.i32, "");
      return ac_to_float(&ctx->ac, value);
   case SI_TYPE_UINT16:
      value = ac_to_integer(&ctx->ac, value);
      value = LLVMBuildZExt(ctx->ac.builder, value, ctx->ac.i32, "");
      return ac_to_float(&ctx->ac, value);
   case SI_TYPE_ANY32:
      return value;
   }
   return NULL;
}
250
251 /* Initialize arguments for the shader export intrinsic */
si_llvm_init_ps_export_args(struct si_shader_context * ctx,LLVMValueRef * values,unsigned cbuf,unsigned compacted_mrt_index,unsigned color_type,struct ac_export_args * args)252 static bool si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValueRef *values,
253 unsigned cbuf, unsigned compacted_mrt_index,
254 unsigned color_type, struct ac_export_args *args)
255 {
256 const union si_shader_key *key = &ctx->shader->key;
257 unsigned col_formats = key->ps.part.epilog.spi_shader_col_format;
258 LLVMValueRef f32undef = LLVMGetUndef(ctx->ac.f32);
259 unsigned spi_shader_col_format;
260 unsigned chan;
261 bool is_int8, is_int10;
262
263 assert(cbuf < 8);
264
265 spi_shader_col_format = (col_formats >> (cbuf * 4)) & 0xf;
266 if (spi_shader_col_format == V_028714_SPI_SHADER_ZERO)
267 return false;
268
269 is_int8 = (key->ps.part.epilog.color_is_int8 >> cbuf) & 0x1;
270 is_int10 = (key->ps.part.epilog.color_is_int10 >> cbuf) & 0x1;
271
272 /* Default is 0xf. Adjusted below depending on the format. */
273 args->enabled_channels = 0xf; /* writemask */
274
275 /* Specify whether the EXEC mask represents the valid mask */
276 args->valid_mask = 0;
277
278 /* Specify whether this is the last export */
279 args->done = 0;
280
281 /* Specify the target we are exporting */
282 args->target = V_008DFC_SQ_EXP_MRT + compacted_mrt_index;
283
284 if (key->ps.part.epilog.dual_src_blend_swizzle &&
285 (compacted_mrt_index == 0 || compacted_mrt_index == 1)) {
286 assert(ctx->ac.gfx_level >= GFX11);
287 args->target += 21;
288 }
289
290 args->compr = false;
291 args->out[0] = f32undef;
292 args->out[1] = f32undef;
293 args->out[2] = f32undef;
294 args->out[3] = f32undef;
295
296 LLVMValueRef (*packf)(struct ac_llvm_context * ctx, LLVMValueRef args[2]) = NULL;
297 LLVMValueRef (*packi)(struct ac_llvm_context * ctx, LLVMValueRef args[2], unsigned bits,
298 bool hi) = NULL;
299
300 switch (spi_shader_col_format) {
301 case V_028714_SPI_SHADER_32_R:
302 args->enabled_channels = 1; /* writemask */
303 args->out[0] = get_color_32bit(ctx, color_type, values[0]);
304 break;
305
306 case V_028714_SPI_SHADER_32_GR:
307 args->enabled_channels = 0x3; /* writemask */
308 args->out[0] = get_color_32bit(ctx, color_type, values[0]);
309 args->out[1] = get_color_32bit(ctx, color_type, values[1]);
310 break;
311
312 case V_028714_SPI_SHADER_32_AR:
313 if (ctx->screen->info.gfx_level >= GFX10) {
314 args->enabled_channels = 0x3; /* writemask */
315 args->out[0] = get_color_32bit(ctx, color_type, values[0]);
316 args->out[1] = get_color_32bit(ctx, color_type, values[3]);
317 } else {
318 args->enabled_channels = 0x9; /* writemask */
319 args->out[0] = get_color_32bit(ctx, color_type, values[0]);
320 args->out[3] = get_color_32bit(ctx, color_type, values[3]);
321 }
322 break;
323
324 case V_028714_SPI_SHADER_FP16_ABGR:
325 if (color_type != SI_TYPE_ANY32)
326 packf = pack_two_16bit;
327 else
328 packf = ac_build_cvt_pkrtz_f16;
329 break;
330
331 case V_028714_SPI_SHADER_UNORM16_ABGR:
332 if (color_type != SI_TYPE_ANY32)
333 packf = ac_build_cvt_pknorm_u16_f16;
334 else
335 packf = ac_build_cvt_pknorm_u16;
336 break;
337
338 case V_028714_SPI_SHADER_SNORM16_ABGR:
339 if (color_type != SI_TYPE_ANY32)
340 packf = ac_build_cvt_pknorm_i16_f16;
341 else
342 packf = ac_build_cvt_pknorm_i16;
343 break;
344
345 case V_028714_SPI_SHADER_UINT16_ABGR:
346 if (color_type != SI_TYPE_ANY32)
347 packf = pack_two_16bit;
348 else
349 packi = ac_build_cvt_pk_u16;
350 break;
351
352 case V_028714_SPI_SHADER_SINT16_ABGR:
353 if (color_type != SI_TYPE_ANY32)
354 packf = pack_two_16bit;
355 else
356 packi = ac_build_cvt_pk_i16;
357 break;
358
359 case V_028714_SPI_SHADER_32_ABGR:
360 for (unsigned i = 0; i < 4; i++)
361 args->out[i] = get_color_32bit(ctx, color_type, values[i]);
362 break;
363 }
364
365 /* Pack f16 or norm_i16/u16. */
366 if (packf) {
367 for (chan = 0; chan < 2; chan++) {
368 LLVMValueRef pack_args[2] = {values[2 * chan], values[2 * chan + 1]};
369 LLVMValueRef packed;
370
371 packed = packf(&ctx->ac, pack_args);
372 args->out[chan] = ac_to_float(&ctx->ac, packed);
373 }
374 }
375 /* Pack i16/u16. */
376 if (packi) {
377 for (chan = 0; chan < 2; chan++) {
378 LLVMValueRef pack_args[2] = {ac_to_integer(&ctx->ac, values[2 * chan]),
379 ac_to_integer(&ctx->ac, values[2 * chan + 1])};
380 LLVMValueRef packed;
381
382 packed = packi(&ctx->ac, pack_args, is_int8 ? 8 : is_int10 ? 10 : 16, chan == 1);
383 args->out[chan] = ac_to_float(&ctx->ac, packed);
384 }
385 }
386 if (packf || packi) {
387 if (ctx->screen->info.gfx_level >= GFX11)
388 args->enabled_channels = 0x3;
389 else
390 args->compr = 1; /* COMPR flag */
391 }
392
393 return true;
394 }
395
/**
 * Apply the epilog's per-color fixups in place: color clamping, alpha-to-one
 * and (for color 0 only) the alpha test.
 *
 * @param ctx   shader context
 * @param color the four channels of the color; modified in place
 * @param index color output index; the alpha test only runs for index 0
 */
static void si_llvm_build_clamp_alpha_test(struct si_shader_context *ctx,
                                           LLVMValueRef *color, unsigned index)
{
   int i;

   /* Clamp color */
   if (ctx->shader->key.ps.part.epilog.clamp_color)
      for (i = 0; i < 4; i++)
         color[i] = ac_build_clamp(&ctx->ac, color[i]);

   /* Alpha to one (constant has the same type as the other channels) */
   if (ctx->shader->key.ps.part.epilog.alpha_to_one)
      color[3] = LLVMConstReal(LLVMTypeOf(color[0]), 1);

   /* Alpha test */
   if (index == 0 && ctx->shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS)
      si_alpha_test(ctx, color[3]);
}
414
/**
 * Append the export(s) for one color output to the export list.
 *
 * When the epilog key's last_cbuf is greater than zero, the color is
 * broadcast: one export is attempted for every color buffer 0..last_cbuf.
 * Otherwise a single export for color buffer "index" is attempted. Exports
 * whose format is SPI_SHADER_ZERO are skipped, so MRT targets are compacted.
 *
 * @param ctx                shader context
 * @param color              the four channels of the color
 * @param index              color output index
 * @param first_color_export value of exp->num at the first color export
 * @param color_type         SI_TYPE_* of the color values
 * @param exp                export list to append to
 */
static void si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *color, unsigned index,
                                unsigned first_color_export, unsigned color_type,
                                struct si_ps_exports *exp)
{
   /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
   if (ctx->shader->key.ps.part.epilog.last_cbuf > 0) {
      assert(exp->num == first_color_export);

      /* Get the export arguments, also find out what the last one is. */
      for (int c = 0; c <= ctx->shader->key.ps.part.epilog.last_cbuf; c++) {
         if (si_llvm_init_ps_export_args(ctx, color, c, exp->num - first_color_export,
                                         color_type, &exp->args[exp->num])) {
            assert(exp->args[exp->num].enabled_channels);
            exp->num++;
         }
      }
   } else {
      /* Export */
      if (si_llvm_init_ps_export_args(ctx, color, index, exp->num - first_color_export,
                                      color_type, &exp->args[exp->num])) {
         assert(exp->args[exp->num].enabled_channels);
         exp->num++;
      }
   }
}
440
441 /**
442 * Return PS outputs in this order:
443 *
444 * v[0:3] = color0.xyzw
445 * v[4:7] = color1.xyzw
446 * ...
447 * vN+0 = Depth
448 * vN+1 = Stencil
449 * vN+2 = SampleMask
450 * vN+3 = SampleMaskIn (used for OpenGL smoothing)
451 *
452 * The alpha-ref SGPR is returned via its original location.
453 */
si_llvm_ps_build_end(struct si_shader_context * ctx)454 void si_llvm_ps_build_end(struct si_shader_context *ctx)
455 {
456 struct si_shader *shader = ctx->shader;
457 struct si_shader_info *info = &shader->selector->info;
458 LLVMBuilderRef builder = ctx->ac.builder;
459 unsigned i, j, vgpr;
460 LLVMValueRef *addrs = ctx->abi.outputs;
461
462 LLVMValueRef color[8][4] = {};
463 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
464 LLVMValueRef ret;
465
466 /* Read the output values. */
467 for (i = 0; i < info->num_outputs; i++) {
468 unsigned semantic = info->output_semantic[i];
469 LLVMTypeRef type = ctx->abi.is_16bit[4 * i] ? ctx->ac.f16 : ctx->ac.f32;
470
471 switch (semantic) {
472 case FRAG_RESULT_DEPTH:
473 depth = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
474 break;
475 case FRAG_RESULT_STENCIL:
476 stencil = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
477 break;
478 case FRAG_RESULT_SAMPLE_MASK:
479 samplemask = LLVMBuildLoad2(builder, type, addrs[4 * i + 0], "");
480 break;
481 default:
482 if (semantic >= FRAG_RESULT_DATA0 && semantic <= FRAG_RESULT_DATA7) {
483 unsigned index = semantic - FRAG_RESULT_DATA0;
484
485 for (j = 0; j < 4; j++) {
486 LLVMValueRef ptr = addrs[4 * i + j];
487 type = ctx->abi.is_16bit[4 * i + j] ? ctx->ac.f16 : ctx->ac.f32;
488 LLVMValueRef result = LLVMBuildLoad2(builder, type, ptr, "");
489 color[index][j] = result;
490 }
491 } else {
492 fprintf(stderr, "Warning: Unhandled fs output type:%d\n", semantic);
493 }
494 break;
495 }
496 }
497
498 /* Fill the return structure. */
499 ret = ctx->return_value;
500
501 /* Set SGPRs. */
502 ret = LLVMBuildInsertValue(
503 builder, ret, ac_to_integer(&ctx->ac, LLVMGetParam(ctx->main_fn, SI_PARAM_ALPHA_REF)),
504 SI_SGPR_ALPHA_REF, "");
505
506 /* Set VGPRs */
507 vgpr = SI_SGPR_ALPHA_REF + 1;
508 for (i = 0; i < ARRAY_SIZE(color); i++) {
509 if (!color[i][0])
510 continue;
511
512 if (LLVMTypeOf(color[i][0]) == ctx->ac.f16) {
513 for (j = 0; j < 2; j++) {
514 LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, &color[i][j * 2], 2);
515 tmp = LLVMBuildBitCast(builder, tmp, ctx->ac.f32, "");
516 ret = LLVMBuildInsertValue(builder, ret, tmp, vgpr++, "");
517 }
518 vgpr += 2;
519 } else {
520 for (j = 0; j < 4; j++)
521 ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
522 }
523 }
524 if (depth)
525 ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
526 if (stencil)
527 ret = LLVMBuildInsertValue(builder, ret, stencil, vgpr++, "");
528 if (samplemask)
529 ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");
530
531 ctx->return_value = ret;
532 }
533
/**
 * Emit polygon stippling: kill the fragment when its bit in the 32x32
 * stipple pattern is clear.
 *
 * @param ctx                     shader context
 * @param param_internal_bindings internal bindings list holding the stipple
 *                                buffer descriptor at SI_PS_CONST_POLY_STIPPLE
 * @param param_pos_fixed_pt      argument holding the fixed-point position
 */
static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx,
                                         LLVMValueRef param_internal_bindings,
                                         struct ac_arg param_pos_fixed_pt)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef slot, desc, offset, row, bit, address[2];

   /* Use the fixed-point gl_FragCoord input.
    * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
    * per coordinate to get the repeating effect.
    */
   address[0] = si_unpack_param(ctx, param_pos_fixed_pt, 0, 5);
   address[1] = si_unpack_param(ctx, param_pos_fixed_pt, 16, 5);

   /* Load the buffer descriptor. */
   slot = LLVMConstInt(ctx->ac.i32, SI_PS_CONST_POLY_STIPPLE, 0);
   desc = ac_build_load_to_sgpr(&ctx->ac, param_internal_bindings, slot);

   /* The stipple pattern is 32x32, each row has 32 bits (4 bytes). */
   offset = LLVMBuildMul(builder, address[1], LLVMConstInt(ctx->ac.i32, 4, 0), "");
   row = si_buffer_load_const(ctx, desc, offset);
   row = ac_to_integer(&ctx->ac, row);
   /* Shift the fragment's column into bit 0 and keep only that bit. */
   bit = LLVMBuildLShr(builder, row, address[0], "");
   bit = LLVMBuildTrunc(builder, bit, ctx->ac.i1, "");
   ac_build_kill_if_false(&ctx->ac, bit);
}
560
561 /**
562 * Build the pixel shader prolog function. This handles:
563 * - two-side color selection and interpolation
564 * - overriding interpolation parameters for the API PS
565 * - polygon stippling
566 *
567 * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
568 * overriden by other states. (e.g. per-sample interpolation)
569 * Interpolated colors are stored after the preloaded VGPRs.
570 */
si_llvm_build_ps_prolog(struct si_shader_context * ctx,union si_shader_part_key * key)571 void si_llvm_build_ps_prolog(struct si_shader_context *ctx, union si_shader_part_key *key)
572 {
573 LLVMValueRef ret, func;
574 int num_returns, i, num_color_channels;
575
576 memset(&ctx->args, 0, sizeof(ctx->args));
577
578 /* Declare inputs. */
579 LLVMTypeRef return_types[AC_MAX_ARGS];
580 num_returns = 0;
581 num_color_channels = util_bitcount(key->ps_prolog.colors_read);
582 assert(key->ps_prolog.num_input_sgprs + key->ps_prolog.num_input_vgprs + num_color_channels <=
583 AC_MAX_ARGS);
584 for (i = 0; i < key->ps_prolog.num_input_sgprs; i++) {
585 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
586 return_types[num_returns++] = ctx->ac.i32;
587 }
588
589 struct ac_arg pos_fixed_pt;
590 struct ac_arg ancillary;
591 struct ac_arg param_sample_mask;
592 for (i = 0; i < key->ps_prolog.num_input_vgprs; i++) {
593 struct ac_arg *arg = NULL;
594 if (i == key->ps_prolog.ancillary_vgpr_index) {
595 arg = &ancillary;
596 } else if (i == key->ps_prolog.sample_coverage_vgpr_index) {
597 arg = ¶m_sample_mask;
598 } else if (i == key->ps_prolog.num_input_vgprs - 1) {
599 /* POS_FIXED_PT is always last. */
600 arg = &pos_fixed_pt;
601 }
602 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, arg);
603 return_types[num_returns++] = ctx->ac.f32;
604 }
605
606 /* Declare outputs (same as inputs + add colors if needed) */
607 for (i = 0; i < num_color_channels; i++)
608 return_types[num_returns++] = ctx->ac.f32;
609
610 /* Create the function. */
611 si_llvm_create_func(ctx, "ps_prolog", return_types, num_returns, 0);
612 func = ctx->main_fn;
613
614 /* Copy inputs to outputs. This should be no-op, as the registers match,
615 * but it will prevent the compiler from overwriting them unintentionally.
616 */
617 ret = ctx->return_value;
618 for (i = 0; i < ctx->args.arg_count; i++) {
619 LLVMValueRef p = LLVMGetParam(func, i);
620 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, p, i, "");
621 }
622
623 /* Polygon stippling. */
624 if (key->ps_prolog.states.poly_stipple) {
625 LLVMValueRef list = si_prolog_get_internal_bindings(ctx);
626
627 si_llvm_emit_polygon_stipple(ctx, list, pos_fixed_pt);
628 }
629
630 if (key->ps_prolog.states.bc_optimize_for_persp ||
631 key->ps_prolog.states.bc_optimize_for_linear) {
632 unsigned i, base = key->ps_prolog.num_input_sgprs;
633 LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
634
635 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
636 * The hw doesn't compute CENTROID if the whole wave only
637 * contains fully-covered quads.
638 *
639 * PRIM_MASK is after user SGPRs.
640 */
641 bc_optimize = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
642 bc_optimize =
643 LLVMBuildLShr(ctx->ac.builder, bc_optimize, LLVMConstInt(ctx->ac.i32, 31, 0), "");
644 bc_optimize = LLVMBuildTrunc(ctx->ac.builder, bc_optimize, ctx->ac.i1, "");
645
646 if (key->ps_prolog.states.bc_optimize_for_persp) {
647 /* Read PERSP_CENTER. */
648 for (i = 0; i < 2; i++)
649 center[i] = LLVMGetParam(func, base + 2 + i);
650 /* Read PERSP_CENTROID. */
651 for (i = 0; i < 2; i++)
652 centroid[i] = LLVMGetParam(func, base + 4 + i);
653 /* Select PERSP_CENTROID. */
654 for (i = 0; i < 2; i++) {
655 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center[i], centroid[i], "");
656 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, tmp, base + 4 + i, "");
657 }
658 }
659 if (key->ps_prolog.states.bc_optimize_for_linear) {
660 /* Read LINEAR_CENTER. */
661 for (i = 0; i < 2; i++)
662 center[i] = LLVMGetParam(func, base + 8 + i);
663 /* Read LINEAR_CENTROID. */
664 for (i = 0; i < 2; i++)
665 centroid[i] = LLVMGetParam(func, base + 10 + i);
666 /* Select LINEAR_CENTROID. */
667 for (i = 0; i < 2; i++) {
668 tmp = LLVMBuildSelect(ctx->ac.builder, bc_optimize, center[i], centroid[i], "");
669 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, tmp, base + 10 + i, "");
670 }
671 }
672 }
673
674 /* Force per-sample interpolation. */
675 if (key->ps_prolog.states.force_persp_sample_interp) {
676 unsigned i, base = key->ps_prolog.num_input_sgprs;
677 LLVMValueRef persp_sample[2];
678
679 /* Read PERSP_SAMPLE. */
680 for (i = 0; i < 2; i++)
681 persp_sample[i] = LLVMGetParam(func, base + i);
682 /* Overwrite PERSP_CENTER. */
683 for (i = 0; i < 2; i++)
684 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_sample[i], base + 2 + i, "");
685 /* Overwrite PERSP_CENTROID. */
686 for (i = 0; i < 2; i++)
687 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_sample[i], base + 4 + i, "");
688 }
689 if (key->ps_prolog.states.force_linear_sample_interp) {
690 unsigned i, base = key->ps_prolog.num_input_sgprs;
691 LLVMValueRef linear_sample[2];
692
693 /* Read LINEAR_SAMPLE. */
694 for (i = 0; i < 2; i++)
695 linear_sample[i] = LLVMGetParam(func, base + 6 + i);
696 /* Overwrite LINEAR_CENTER. */
697 for (i = 0; i < 2; i++)
698 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_sample[i], base + 8 + i, "");
699 /* Overwrite LINEAR_CENTROID. */
700 for (i = 0; i < 2; i++)
701 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_sample[i], base + 10 + i, "");
702 }
703
704 /* Force center interpolation. */
705 if (key->ps_prolog.states.force_persp_center_interp) {
706 unsigned i, base = key->ps_prolog.num_input_sgprs;
707 LLVMValueRef persp_center[2];
708
709 /* Read PERSP_CENTER. */
710 for (i = 0; i < 2; i++)
711 persp_center[i] = LLVMGetParam(func, base + 2 + i);
712 /* Overwrite PERSP_SAMPLE. */
713 for (i = 0; i < 2; i++)
714 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_center[i], base + i, "");
715 /* Overwrite PERSP_CENTROID. */
716 for (i = 0; i < 2; i++)
717 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, persp_center[i], base + 4 + i, "");
718 }
719 if (key->ps_prolog.states.force_linear_center_interp) {
720 unsigned i, base = key->ps_prolog.num_input_sgprs;
721 LLVMValueRef linear_center[2];
722
723 /* Read LINEAR_CENTER. */
724 for (i = 0; i < 2; i++)
725 linear_center[i] = LLVMGetParam(func, base + 8 + i);
726 /* Overwrite LINEAR_SAMPLE. */
727 for (i = 0; i < 2; i++)
728 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_center[i], base + 6 + i, "");
729 /* Overwrite LINEAR_CENTROID. */
730 for (i = 0; i < 2; i++)
731 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, linear_center[i], base + 10 + i, "");
732 }
733
734 /* Interpolate colors. */
735 unsigned color_out_idx = 0;
736 for (i = 0; i < 2; i++) {
737 unsigned writemask = (key->ps_prolog.colors_read >> (i * 4)) & 0xf;
738 unsigned face_vgpr = key->ps_prolog.num_input_sgprs + key->ps_prolog.face_vgpr_index;
739 LLVMValueRef interp[2], color[4];
740 LLVMValueRef interp_ij = NULL, prim_mask = NULL, face = NULL;
741
742 if (!writemask)
743 continue;
744
745 /* If the interpolation qualifier is not CONSTANT (-1). */
746 if (key->ps_prolog.color_interp_vgpr_index[i] != -1) {
747 unsigned interp_vgpr =
748 key->ps_prolog.num_input_sgprs + key->ps_prolog.color_interp_vgpr_index[i];
749
750 /* Get the (i,j) updated by bc_optimize handling. */
751 interp[0] = LLVMBuildExtractValue(ctx->ac.builder, ret, interp_vgpr, "");
752 interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret, interp_vgpr + 1, "");
753 interp_ij = ac_build_gather_values(&ctx->ac, interp, 2);
754 }
755
756 /* Use the absolute location of the input. */
757 prim_mask = LLVMGetParam(func, SI_PS_NUM_USER_SGPR);
758
759 if (key->ps_prolog.states.color_two_side) {
760 face = LLVMGetParam(func, face_vgpr);
761 face = ac_to_integer(&ctx->ac, face);
762 }
763
764 interp_fs_color(ctx, key->ps_prolog.color_attr_index[i], i, key->ps_prolog.num_interp_inputs,
765 key->ps_prolog.colors_read, interp_ij, prim_mask, face, color);
766
767 while (writemask) {
768 unsigned chan = u_bit_scan(&writemask);
769 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, color[chan],
770 ctx->args.arg_count + color_out_idx++, "");
771 }
772 }
773
774 /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
775 * says:
776 *
777 * "When per-sample shading is active due to the use of a fragment
778 * input qualified by sample or due to the use of the gl_SampleID
779 * or gl_SamplePosition variables, only the bit for the current
780 * sample is set in gl_SampleMaskIn. When state specifies multiple
781 * fragment shader invocations for a given fragment, the sample
782 * mask for any single fragment shader invocation may specify a
783 * subset of the covered samples for the fragment. In this case,
784 * the bit corresponding to each covered sample will be set in
785 * exactly one fragment shader invocation."
786 *
787 * The samplemask loaded by hardware is always the coverage of the
788 * entire pixel/fragment, so mask bits out based on the sample ID.
789 */
790 if (key->ps_prolog.states.samplemask_log_ps_iter) {
791 /* The bit pattern matches that used by fixed function fragment
792 * processing. */
793 static const uint16_t ps_iter_masks[] = {
794 0xffff, /* not used */
795 0x5555, 0x1111, 0x0101, 0x0001,
796 };
797 assert(key->ps_prolog.states.samplemask_log_ps_iter < ARRAY_SIZE(ps_iter_masks));
798
799 uint32_t ps_iter_mask = ps_iter_masks[key->ps_prolog.states.samplemask_log_ps_iter];
800 LLVMValueRef sampleid = si_unpack_param(ctx, ancillary, 8, 4);
801 LLVMValueRef samplemask = ac_get_arg(&ctx->ac, param_sample_mask);
802
803 samplemask = ac_to_integer(&ctx->ac, samplemask);
804 samplemask =
805 LLVMBuildAnd(ctx->ac.builder, samplemask,
806 LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false),
807 sampleid, ""),
808 "");
809 samplemask = ac_to_float(&ctx->ac, samplemask);
810
811 ret = LLVMBuildInsertValue(ctx->ac.builder, ret, samplemask, param_sample_mask.arg_index, "");
812 }
813
814 /* Tell LLVM to insert WQM instruction sequence when needed. */
815 if (key->ps_prolog.wqm) {
816 LLVMAddTargetDependentFunctionAttr(func, "amdgpu-ps-wqm-outputs", "");
817 }
818
819 si_llvm_build_ret(ctx, ret);
820 }
821
822 /**
823 * Build the pixel shader epilog function. This handles everything that must be
824 * emulated for pixel shader exports. (alpha-test, format conversions, etc)
825 */
si_llvm_build_ps_epilog(struct si_shader_context * ctx,union si_shader_part_key * key)826 void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part_key *key)
827 {
828 int i;
829 struct si_ps_exports exp = {};
830 LLVMValueRef color[8][4] = {};
831
832 memset(&ctx->args, 0, sizeof(ctx->args));
833
834 /* Declare input SGPRs. */
835 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->internal_bindings);
836 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->bindless_samplers_and_images);
837 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->const_and_shader_buffers);
838 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->samplers_and_images);
839 si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_ALPHA_REF);
840
841 /* Declare input VGPRs. */
842 unsigned required_num_params =
843 ctx->args.num_sgprs_used + util_bitcount(key->ps_epilog.colors_written) * 4 +
844 key->ps_epilog.writes_z + key->ps_epilog.writes_stencil + key->ps_epilog.writes_samplemask;
845
846 while (ctx->args.arg_count < required_num_params)
847 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
848
849 /* Create the function. */
850 si_llvm_create_func(ctx, "ps_epilog", NULL, 0, 0);
851 /* Disable elimination of unused inputs. */
852 ac_llvm_add_target_dep_function_attr(ctx->main_fn, "InitialPSInputAddr", 0xffffff);
853
854 /* Prepare color. */
855 unsigned vgpr = ctx->args.num_sgprs_used;
856 unsigned colors_written = key->ps_epilog.colors_written;
857
858 while (colors_written) {
859 int write_i = u_bit_scan(&colors_written);
860 unsigned color_type = (key->ps_epilog.color_types >> (write_i * 2)) & 0x3;
861
862 if (color_type != SI_TYPE_ANY32) {
863 for (i = 0; i < 4; i++) {
864 color[write_i][i] = LLVMGetParam(ctx->main_fn, vgpr + i / 2);
865 color[write_i][i] = LLVMBuildBitCast(ctx->ac.builder, color[write_i][i],
866 ctx->ac.v2f16, "");
867 color[write_i][i] = ac_llvm_extract_elem(&ctx->ac, color[write_i][i], i % 2);
868 }
869 vgpr += 4;
870 } else {
871 for (i = 0; i < 4; i++)
872 color[write_i][i] = LLVMGetParam(ctx->main_fn, vgpr++);
873 }
874
875 si_llvm_build_clamp_alpha_test(ctx, color[write_i], write_i);
876 }
877
878 LLVMValueRef mrtz_alpha =
879 key->ps_epilog.states.alpha_to_coverage_via_mrtz ? color[0][3] : NULL;
880
881 /* Prepare the mrtz export. */
882 if (key->ps_epilog.writes_z ||
883 key->ps_epilog.writes_stencil ||
884 key->ps_epilog.writes_samplemask ||
885 mrtz_alpha) {
886 LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
887 unsigned vgpr_index = ctx->args.num_sgprs_used +
888 util_bitcount(key->ps_epilog.colors_written) * 4;
889
890 if (key->ps_epilog.writes_z)
891 depth = LLVMGetParam(ctx->main_fn, vgpr_index++);
892 if (key->ps_epilog.writes_stencil)
893 stencil = LLVMGetParam(ctx->main_fn, vgpr_index++);
894 if (key->ps_epilog.writes_samplemask)
895 samplemask = LLVMGetParam(ctx->main_fn, vgpr_index++);
896
897 ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, mrtz_alpha, false,
898 &exp.args[exp.num++]);
899 }
900
901 /* Prepare color exports. */
902 const unsigned first_color_export = exp.num;
903 colors_written = key->ps_epilog.colors_written;
904
905 while (colors_written) {
906 int write_i = u_bit_scan(&colors_written);
907 unsigned color_type = (key->ps_epilog.color_types >> (write_i * 2)) & 0x3;
908
909 si_export_mrt_color(ctx, color[write_i], write_i, first_color_export, color_type, &exp);
910 }
911
912 if (exp.num) {
913 exp.args[exp.num - 1].valid_mask = 1; /* whether the EXEC mask is valid */
914 exp.args[exp.num - 1].done = 1; /* DONE bit */
915
916 if (key->ps_epilog.states.dual_src_blend_swizzle) {
917 assert(ctx->ac.gfx_level >= GFX11);
918 assert((key->ps_epilog.colors_written & 0x3) == 0x3);
919 ac_build_dual_src_blend_swizzle(&ctx->ac, &exp.args[first_color_export],
920 &exp.args[first_color_export + 1]);
921 }
922
923 for (unsigned i = 0; i < exp.num; i++)
924 ac_build_export(&ctx->ac, &exp.args[i]);
925 } else {
926 ac_build_export_null(&ctx->ac, key->ps_epilog.uses_discard);
927 }
928
929 /* Compile. */
930 LLVMBuildRetVoid(ctx->ac.builder);
931 }
932
si_llvm_build_monolithic_ps(struct si_shader_context * ctx,struct si_shader * shader)933 void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, struct si_shader *shader)
934 {
935 LLVMValueRef parts[3];
936 unsigned num_parts = 0, main_index;
937 LLVMValueRef main_fn = ctx->main_fn;
938
939 union si_shader_part_key prolog_key;
940 si_get_ps_prolog_key(shader, &prolog_key, false);
941
942 if (si_need_ps_prolog(&prolog_key)) {
943 si_llvm_build_ps_prolog(ctx, &prolog_key);
944 parts[num_parts++] = ctx->main_fn;
945 }
946
947 main_index = num_parts;
948 parts[num_parts++] = main_fn;
949
950 union si_shader_part_key epilog_key;
951 si_get_ps_epilog_key(shader, &epilog_key);
952 si_llvm_build_ps_epilog(ctx, &epilog_key);
953 parts[num_parts++] = ctx->main_fn;
954
955 si_build_wrapper_function(ctx, parts, num_parts, main_index, 0, false);
956 }
957
si_llvm_init_ps_callbacks(struct si_shader_context * ctx)958 void si_llvm_init_ps_callbacks(struct si_shader_context *ctx)
959 {
960 ctx->abi.load_sample_position = load_sample_position;
961 ctx->abi.emit_fbfetch = si_nir_emit_fbfetch;
962 }
963