1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 #include "ac_nir.h"
26 #include "ac_rtld.h"
27 #include "nir.h"
28 #include "nir_builder.h"
29 #include "nir_serialize.h"
30 #include "nir/nir_helpers.h"
31 #include "ralloc.h"
32 #include "si_pipe.h"
33 #include "si_shader_internal.h"
34 #include "sid.h"
35 #include "tgsi/tgsi_from_mesa.h"
36 #include "tgsi/tgsi_strings.h"
37 #include "util/u_memory.h"
38 #include "util/mesa-sha1.h"
39
/* Symbol names for the two dwords of the scratch buffer resource descriptor.
 * NOTE(review): presumably resolved/patched at upload time via the runtime
 * linker (ac_rtld.h is included above) — confirm against the rtld callers. */
static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";

static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";

/* Defined later in this file; forward-declared for use by earlier functions. */
static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
45
46 /* Get the number of all interpolated inputs */
si_get_ps_num_interp(struct si_shader * ps)47 unsigned si_get_ps_num_interp(struct si_shader *ps)
48 {
49 struct si_shader_info *info = &ps->selector->info;
50 unsigned num_colors = !!(info->colors_read & 0x0f) + !!(info->colors_read & 0xf0);
51 unsigned num_interp =
52 ps->selector->info.num_inputs + (ps->key.ps.part.prolog.color_two_side ? num_colors : 0);
53
54 assert(num_interp <= 32);
55 return MIN2(num_interp, 32);
56 }
57
58 /** Whether the shader runs as a combination of multiple API shaders */
si_is_multi_part_shader(struct si_shader * shader)59 bool si_is_multi_part_shader(struct si_shader *shader)
60 {
61 if (shader->selector->screen->info.gfx_level <= GFX8 ||
62 shader->selector->stage > MESA_SHADER_GEOMETRY)
63 return false;
64
65 return shader->key.ge.as_ls || shader->key.ge.as_es ||
66 shader->selector->stage == MESA_SHADER_TESS_CTRL ||
67 shader->selector->stage == MESA_SHADER_GEOMETRY;
68 }
69
70 /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
si_is_merged_shader(struct si_shader * shader)71 bool si_is_merged_shader(struct si_shader *shader)
72 {
73 if (shader->selector->stage > MESA_SHADER_GEOMETRY)
74 return false;
75
76 return shader->key.ge.as_ngg || si_is_multi_part_shader(shader);
77 }
78
79 /**
80 * Returns a unique index for a per-patch semantic name and index. The index
81 * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
82 * can be calculated.
83 */
si_shader_io_get_unique_index_patch(unsigned semantic)84 unsigned si_shader_io_get_unique_index_patch(unsigned semantic)
85 {
86 switch (semantic) {
87 case VARYING_SLOT_TESS_LEVEL_OUTER:
88 return 0;
89 case VARYING_SLOT_TESS_LEVEL_INNER:
90 return 1;
91 default:
92 if (semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_PATCH0 + 30)
93 return 2 + (semantic - VARYING_SLOT_PATCH0);
94
95 assert(!"invalid semantic");
96 return 0;
97 }
98 }
99
100 /**
101 * Returns a unique index for a semantic name and index. The index must be
102 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
103 * calculated.
104 */
si_shader_io_get_unique_index(unsigned semantic,bool is_varying)105 unsigned si_shader_io_get_unique_index(unsigned semantic, bool is_varying)
106 {
107 switch (semantic) {
108 case VARYING_SLOT_POS:
109 return 0;
110 default:
111 /* Since some shader stages use the highest used IO index
112 * to determine the size to allocate for inputs/outputs
113 * (in LDS, tess and GS rings). GENERIC should be placed right
114 * after POSITION to make that size as small as possible.
115 */
116 if (semantic >= VARYING_SLOT_VAR0 && semantic <= VARYING_SLOT_VAR31)
117 return 1 + (semantic - VARYING_SLOT_VAR0); /* 1..32 */
118
119 /* Put 16-bit GLES varyings after 32-bit varyings. They can use the same indices as
120 * legacy desktop GL varyings because they are mutually exclusive.
121 */
122 if (semantic >= VARYING_SLOT_VAR0_16BIT && semantic <= VARYING_SLOT_VAR15_16BIT)
123 return 33 + (semantic - VARYING_SLOT_VAR0_16BIT); /* 33..48 */
124
125 assert(!"invalid generic index");
126 return 0;
127
128 /* Legacy desktop GL varyings. */
129 case VARYING_SLOT_FOGC:
130 return 33;
131 case VARYING_SLOT_COL0:
132 return 34;
133 case VARYING_SLOT_COL1:
134 return 35;
135 case VARYING_SLOT_BFC0:
136 /* If it's a varying, COLOR and BCOLOR alias. */
137 if (is_varying)
138 return 34;
139 else
140 return 36;
141 case VARYING_SLOT_BFC1:
142 if (is_varying)
143 return 35;
144 else
145 return 37;
146 case VARYING_SLOT_TEX0:
147 case VARYING_SLOT_TEX1:
148 case VARYING_SLOT_TEX2:
149 case VARYING_SLOT_TEX3:
150 case VARYING_SLOT_TEX4:
151 case VARYING_SLOT_TEX5:
152 case VARYING_SLOT_TEX6:
153 case VARYING_SLOT_TEX7:
154 return 38 + (semantic - VARYING_SLOT_TEX0);
155 case VARYING_SLOT_CLIP_VERTEX:
156 return 46;
157
158 /* Varyings present in both GLES and desktop GL must start at 49 after 16-bit varyings. */
159 case VARYING_SLOT_CLIP_DIST0:
160 return 49;
161 case VARYING_SLOT_CLIP_DIST1:
162 return 50;
163 case VARYING_SLOT_PSIZ:
164 return 51;
165
166 /* These can't be written by LS, HS, and ES. */
167 case VARYING_SLOT_LAYER:
168 return 52;
169 case VARYING_SLOT_VIEWPORT:
170 return 53;
171 case VARYING_SLOT_PRIMITIVE_ID:
172 return 54;
173 }
174 }
175
si_dump_streamout(struct pipe_stream_output_info * so)176 static void si_dump_streamout(struct pipe_stream_output_info *so)
177 {
178 unsigned i;
179
180 if (so->num_outputs) {
181 fprintf(stderr, "STREAMOUT\n");
182
183 fprintf(stderr, " STRIDES: {");
184 for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++)
185 fprintf(stderr, "%u%s", so->stride[i], i < 3 ? ", " : "");
186 fprintf(stderr, "}\n");
187 }
188
189 for (i = 0; i < so->num_outputs; i++) {
190 unsigned mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;
191 fprintf(stderr, " %i: STREAM%u: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
192 i, so->output[i].stream, so->output[i].output_buffer,
193 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
194 so->output[i].register_index, mask & 1 ? "x" : "", mask & 2 ? "y" : "",
195 mask & 4 ? "z" : "", mask & 8 ? "w" : "");
196 }
197 }
198
declare_streamout_params(struct si_shader_context * ctx,struct pipe_stream_output_info * so)199 static void declare_streamout_params(struct si_shader_context *ctx,
200 struct pipe_stream_output_info *so)
201 {
202 if (ctx->screen->use_ngg_streamout) {
203 if (ctx->stage == MESA_SHADER_TESS_EVAL)
204 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
205 return;
206 }
207
208 /* Streamout SGPRs. */
209 if (so->num_outputs) {
210 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_config);
211 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_write_index);
212 } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
213 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
214 }
215
216 /* A streamout buffer offset is loaded if the stride is non-zero. */
217 for (int i = 0; i < 4; i++) {
218 if (!so->stride[i])
219 continue;
220
221 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_offset[i]);
222 }
223 }
224
si_get_max_workgroup_size(const struct si_shader * shader)225 unsigned si_get_max_workgroup_size(const struct si_shader *shader)
226 {
227 switch (shader->selector->stage) {
228 case MESA_SHADER_VERTEX:
229 case MESA_SHADER_TESS_EVAL:
230 return shader->key.ge.as_ngg ? 128 : 0;
231
232 case MESA_SHADER_TESS_CTRL:
233 /* Return this so that LLVM doesn't remove s_barrier
234 * instructions on chips where we use s_barrier. */
235 return shader->selector->screen->info.gfx_level >= GFX7 ? 128 : 0;
236
237 case MESA_SHADER_GEOMETRY:
238 return shader->selector->screen->info.gfx_level >= GFX9 ? 128 : 0;
239
240 case MESA_SHADER_COMPUTE:
241 break; /* see below */
242
243 default:
244 return 0;
245 }
246
247 /* Compile a variable block size using the maximum variable size. */
248 if (shader->selector->info.base.workgroup_size_variable)
249 return SI_MAX_VARIABLE_THREADS_PER_BLOCK;
250
251 uint16_t *local_size = shader->selector->info.base.workgroup_size;
252 unsigned max_work_group_size = (uint32_t)local_size[0] *
253 (uint32_t)local_size[1] *
254 (uint32_t)local_size[2];
255 assert(max_work_group_size);
256 return max_work_group_size;
257 }
258
/* Declare the SGPR holding the constant/shader-buffer pointer. When the
 * shader uses exactly one UBO and no SSBOs, a direct float pointer is used
 * instead of a descriptor pointer. */
static void declare_const_and_shader_buffers(struct si_shader_context *ctx, bool assign_params)
{
   const struct si_shader_info *info = &ctx->shader->selector->info;
   bool single_ubo = info->base.num_ubos == 1 && info->base.num_ssbos == 0;
   enum ac_arg_type type = single_ubo ? AC_ARG_CONST_FLOAT_PTR : AC_ARG_CONST_DESC_PTR;
   struct ac_arg *dest = assign_params ? &ctx->const_and_shader_buffers
                                       : &ctx->other_const_and_shader_buffers;

   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, type, dest);
}
273
/* Declare the SGPR holding the sampler/image descriptor pointer. */
static void declare_samplers_and_images(struct si_shader_context *ctx, bool assign_params)
{
   struct ac_arg *dest = assign_params ? &ctx->samplers_and_images
                                       : &ctx->other_samplers_and_images;

   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, dest);
}
279
/* Declare the per-stage descriptor pointers. The order matters: the
 * const/shader-buffer SGPR comes before the sampler/image SGPR. */
static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, bool for_this_stage)
{
   declare_const_and_shader_buffers(ctx, for_this_stage);
   declare_samplers_and_images(ctx, for_this_stage);
}
285
declare_global_desc_pointers(struct si_shader_context * ctx)286 static void declare_global_desc_pointers(struct si_shader_context *ctx)
287 {
288 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->internal_bindings);
289 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
290 &ctx->bindless_samplers_and_images);
291 }
292
declare_vb_descriptor_input_sgprs(struct si_shader_context * ctx)293 static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
294 {
295 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->args.vertex_buffers);
296
297 unsigned num_vbos_in_user_sgprs = ctx->shader->selector->info.num_vbos_in_user_sgprs;
298 if (num_vbos_in_user_sgprs) {
299 unsigned user_sgprs = ctx->args.num_sgprs_used;
300
301 if (si_is_merged_shader(ctx->shader))
302 user_sgprs -= 8;
303 assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
304
305 /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
306 for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
307 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
308
309 assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
310 for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
311 ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
312 }
313 }
314
declare_vs_input_vgprs(struct si_shader_context * ctx,unsigned * num_prolog_vgprs)315 static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs)
316 {
317 struct si_shader *shader = ctx->shader;
318
319 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id);
320 if (shader->key.ge.as_ls) {
321 if (ctx->screen->info.gfx_level >= GFX11) {
322 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
323 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
324 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
325 } else if (ctx->screen->info.gfx_level >= GFX10) {
326 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_rel_patch_id);
327 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
328 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
329 } else {
330 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_rel_patch_id);
331 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
332 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
333 }
334 } else if (ctx->screen->info.gfx_level >= GFX10) {
335 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
336 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
337 &ctx->args.vs_prim_id); /* user vgpr or PrimID (legacy) */
338 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
339 } else {
340 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
341 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_prim_id);
342 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
343 }
344
345 if (!shader->is_gs_copy_shader) {
346 /* Vertex load indices. */
347 if (shader->selector->info.num_inputs) {
348 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0);
349 for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
350 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
351 }
352 *num_prolog_vgprs += shader->selector->info.num_inputs;
353 }
354 }
355
declare_vs_blit_inputs(struct si_shader_context * ctx,unsigned vs_blit_property)356 static void declare_vs_blit_inputs(struct si_shader_context *ctx, unsigned vs_blit_property)
357 {
358 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_blit_inputs); /* i16 x1, y1 */
359 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x1, y1 */
360 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */
361
362 if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
363 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
364 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
365 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
366 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
367 } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
368 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
369 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
370 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
371 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
372 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
373 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
374 }
375 }
376
declare_tes_input_vgprs(struct si_shader_context * ctx)377 static void declare_tes_input_vgprs(struct si_shader_context *ctx)
378 {
379 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.tes_u);
380 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.tes_v);
381 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_rel_patch_id);
382 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
383 }
384
/* Pseudo shader-stage values used as switch labels in si_init_shader_args.
 * They start at MESA_ALL_SHADER_STAGES so they can't collide with any real
 * gl_shader_stage value. */
enum
{
   /* Convenient merged shader definitions. */
   SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES,
   SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};
391
/* Add a shader argument while asserting that it lands at the expected
 * argument index (used where the hardware ABI fixes the position, e.g. the
 * SI_PARAM_* layout of the pixel shader). */
void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
                        enum ac_arg_type type, struct ac_arg *arg, unsigned idx)
{
   assert(args->arg_count == idx);
   ac_add_arg(args, file, registers, type, arg);
}
398
si_init_shader_args(struct si_shader_context * ctx,bool ngg_cull_shader)399 void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
400 {
401 struct si_shader *shader = ctx->shader;
402 unsigned i, num_returns, num_return_sgprs;
403 unsigned num_prolog_vgprs = 0;
404 unsigned stage = ctx->stage;
405
406 memset(&ctx->args, 0, sizeof(ctx->args));
407
408 /* Set MERGED shaders. */
409 if (ctx->screen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY) {
410 if (shader->key.ge.as_ls || stage == MESA_SHADER_TESS_CTRL)
411 stage = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
412 else if (shader->key.ge.as_es || shader->key.ge.as_ngg || stage == MESA_SHADER_GEOMETRY)
413 stage = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
414 }
415
416 switch (stage) {
417 case MESA_SHADER_VERTEX:
418 declare_global_desc_pointers(ctx);
419
420 if (shader->selector->info.base.vs.blit_sgprs_amd) {
421 declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd);
422
423 /* VGPRs */
424 declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
425 break;
426 }
427
428 declare_per_stage_desc_pointers(ctx, true);
429 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
430
431 if (ctx->shader->is_gs_copy_shader) {
432 declare_streamout_params(ctx, &ctx->so);
433 /* VGPRs */
434 declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
435 break;
436 }
437
438 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
439 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
440 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
441 declare_vb_descriptor_input_sgprs(ctx);
442
443 if (shader->key.ge.as_es) {
444 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.es2gs_offset);
445 } else if (shader->key.ge.as_ls) {
446 /* no extra parameters */
447 } else {
448 declare_streamout_params(ctx, &ctx->so);
449 }
450
451 /* VGPRs */
452 declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
453 break;
454
455 case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
456 declare_global_desc_pointers(ctx);
457 declare_per_stage_desc_pointers(ctx, true);
458 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
459 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
460 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
461 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
462 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
463 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset);
464
465 /* VGPRs */
466 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
467 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
468
469 /* param_tcs_offchip_offset and param_tcs_factor_offset are
470 * placed after the user SGPRs.
471 */
472 for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
473 ac_add_return(&ctx->args, AC_ARG_SGPR);
474 for (i = 0; i < 11; i++)
475 ac_add_return(&ctx->args, AC_ARG_VGPR);
476 break;
477
478 case SI_SHADER_MERGED_VERTEX_TESSCTRL:
479 /* Merged stages have 8 system SGPRs at the beginning. */
480 /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
481 /* Gfx11+: SPI_SHADER_PGM_LO/HI_HS */
482 declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_TESS_CTRL);
483 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
484 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.merged_wave_info);
485 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset);
486 if (ctx->screen->info.gfx_level >= GFX11)
487 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_wave_id);
488 else
489 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset);
490 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
491 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
492
493 declare_global_desc_pointers(ctx);
494 declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_VERTEX);
495
496 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
497 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
498 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
499 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
500 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
501 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
502 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
503 if (ctx->stage == MESA_SHADER_VERTEX)
504 declare_vb_descriptor_input_sgprs(ctx);
505
506 /* VGPRs (first TCS, then VS) */
507 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
508 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
509
510 if (ctx->stage == MESA_SHADER_VERTEX) {
511 declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
512
513 /* LS return values are inputs to the TCS main shader part. */
514 for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
515 ac_add_return(&ctx->args, AC_ARG_SGPR);
516 for (i = 0; i < 2; i++)
517 ac_add_return(&ctx->args, AC_ARG_VGPR);
518
519 /* VS outputs passed via VGPRs to TCS. */
520 if (shader->key.ge.opt.same_patch_vertices) {
521 unsigned num_outputs = util_last_bit64(shader->selector->info.outputs_written);
522 for (i = 0; i < num_outputs * 4; i++)
523 ac_add_return(&ctx->args, AC_ARG_VGPR);
524 }
525 } else {
526 /* TCS inputs are passed via VGPRs from VS. */
527 if (shader->key.ge.opt.same_patch_vertices) {
528 unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->info.outputs_written);
529 for (i = 0; i < num_inputs * 4; i++)
530 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
531 }
532
533 /* TCS return values are inputs to the TCS epilog.
534 *
535 * param_tcs_offchip_offset, param_tcs_factor_offset,
536 * param_tcs_offchip_layout, and internal_bindings
537 * should be passed to the epilog.
538 */
539 for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
540 ac_add_return(&ctx->args, AC_ARG_SGPR);
541 for (i = 0; i < 11; i++)
542 ac_add_return(&ctx->args, AC_ARG_VGPR);
543 }
544 break;
545
546 case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
547 /* Merged stages have 8 system SGPRs at the beginning. */
548 /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
549 /* Gfx11+: SPI_SHADER_PGM_LO/HI_GS */
550 declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_GEOMETRY);
551
552 if (ctx->shader->key.ge.as_ngg)
553 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_tg_info);
554 else
555 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs2vs_offset);
556
557 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.merged_wave_info);
558 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
559 if (ctx->screen->info.gfx_level >= GFX11)
560 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_attr_offset);
561 else
562 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset);
563 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
564 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
565
566 declare_global_desc_pointers(ctx);
567 if (ctx->stage != MESA_SHADER_VERTEX || !shader->selector->info.base.vs.blit_sgprs_amd) {
568 declare_per_stage_desc_pointers(
569 ctx, (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL));
570 }
571
572 if (ctx->stage == MESA_SHADER_VERTEX && shader->selector->info.base.vs.blit_sgprs_amd) {
573 declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd);
574 } else {
575 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
576
577 if (ctx->stage == MESA_SHADER_VERTEX) {
578 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
579 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
580 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
581 } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
582 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
583 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
584 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
585 } else {
586 /* GS */
587 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
588 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
589 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
590 }
591
592 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->small_prim_cull_info);
593 if (ctx->screen->info.gfx_level >= GFX11)
594 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_attr_address);
595 else
596 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
597
598 if (ctx->stage == MESA_SHADER_VERTEX)
599 declare_vb_descriptor_input_sgprs(ctx);
600 }
601
602 /* VGPRs (first GS, then VS/TES) */
603 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]);
604 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]);
605 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
606 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
607 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]);
608
609 if (ctx->stage == MESA_SHADER_VERTEX) {
610 declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
611 } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
612 declare_tes_input_vgprs(ctx);
613 }
614
615 if ((ctx->shader->key.ge.as_es || ngg_cull_shader) &&
616 (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) {
617 unsigned num_user_sgprs, num_vgprs;
618
619 if (ctx->stage == MESA_SHADER_VERTEX && ngg_cull_shader) {
620 /* For the NGG cull shader, add 1 SGPR to hold
621 * the vertex buffer pointer.
622 */
623 num_user_sgprs = GFX9_GS_NUM_USER_SGPR + 1;
624
625 if (shader->selector->info.num_vbos_in_user_sgprs) {
626 assert(num_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
627 num_user_sgprs =
628 SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->info.num_vbos_in_user_sgprs * 4;
629 }
630 } else {
631 num_user_sgprs = GFX9_GS_NUM_USER_SGPR;
632 }
633
634 /* The NGG cull shader has to return all 9 VGPRs.
635 *
636 * The normal merged ESGS shader only has to return the 5 VGPRs
637 * for the GS stage.
638 */
639 num_vgprs = ngg_cull_shader ? 9 : 5;
640
641 /* ES return values are inputs to GS. */
642 for (i = 0; i < 8 + num_user_sgprs; i++)
643 ac_add_return(&ctx->args, AC_ARG_SGPR);
644 for (i = 0; i < num_vgprs; i++)
645 ac_add_return(&ctx->args, AC_ARG_VGPR);
646 }
647 break;
648
649 case MESA_SHADER_TESS_EVAL:
650 declare_global_desc_pointers(ctx);
651 declare_per_stage_desc_pointers(ctx, true);
652 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
653 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
654 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
655
656 if (shader->key.ge.as_es) {
657 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
658 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
659 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.es2gs_offset);
660 } else {
661 declare_streamout_params(ctx, &ctx->so);
662 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
663 }
664
665 /* VGPRs */
666 declare_tes_input_vgprs(ctx);
667 break;
668
669 case MESA_SHADER_GEOMETRY:
670 declare_global_desc_pointers(ctx);
671 declare_per_stage_desc_pointers(ctx, true);
672 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs2vs_offset);
673 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_wave_id);
674
675 /* VGPRs */
676 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]);
677 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]);
678 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
679 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]);
680 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[3]);
681 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[4]);
682 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[5]);
683 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
684 break;
685
686 case MESA_SHADER_FRAGMENT:
687 declare_global_desc_pointers(ctx);
688 declare_per_stage_desc_pointers(ctx, true);
689 si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, SI_PARAM_ALPHA_REF);
690 si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.prim_mask,
691 SI_PARAM_PRIM_MASK);
692
693 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,
694 SI_PARAM_PERSP_SAMPLE);
695 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_center,
696 SI_PARAM_PERSP_CENTER);
697 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_centroid,
698 SI_PARAM_PERSP_CENTROID);
699 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);
700 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_sample,
701 SI_PARAM_LINEAR_SAMPLE);
702 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_center,
703 SI_PARAM_LINEAR_CENTER);
704 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid,
705 SI_PARAM_LINEAR_CENTROID);
706 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
707 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0],
708 SI_PARAM_POS_X_FLOAT);
709 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1],
710 SI_PARAM_POS_Y_FLOAT);
711 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[2],
712 SI_PARAM_POS_Z_FLOAT);
713 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[3],
714 SI_PARAM_POS_W_FLOAT);
715 shader->info.face_vgpr_index = ctx->args.num_vgprs_used;
716 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.front_face,
717 SI_PARAM_FRONT_FACE);
718 shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;
719 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.ancillary,
720 SI_PARAM_ANCILLARY);
721 shader->info.sample_coverage_vgpr_index = ctx->args.num_vgprs_used;
722 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.sample_coverage,
723 SI_PARAM_SAMPLE_COVERAGE);
724 si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->pos_fixed_pt,
725 SI_PARAM_POS_FIXED_PT);
726
727 /* Color inputs from the prolog. */
728 if (shader->selector->info.colors_read) {
729 unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);
730
731 for (i = 0; i < num_color_elements; i++)
732 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
733
734 num_prolog_vgprs += num_color_elements;
735 }
736
737 /* Outputs for the epilog. */
738 num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
739 num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +
740 shader->selector->info.writes_z + shader->selector->info.writes_stencil +
741 shader->selector->info.writes_samplemask + 1 /* SampleMaskIn */;
742
743 for (i = 0; i < num_return_sgprs; i++)
744 ac_add_return(&ctx->args, AC_ARG_SGPR);
745 for (; i < num_returns; i++)
746 ac_add_return(&ctx->args, AC_ARG_VGPR);
747 break;
748
749 case MESA_SHADER_COMPUTE:
750 declare_global_desc_pointers(ctx);
751 declare_per_stage_desc_pointers(ctx, true);
752 if (shader->selector->info.uses_grid_size)
753 ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups);
754 if (shader->selector->info.uses_variable_block_size)
755 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->block_size);
756
757 unsigned cs_user_data_dwords =
758 shader->selector->info.base.cs.user_data_components_amd;
759 if (cs_user_data_dwords) {
760 ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &ctx->cs_user_data);
761 }
762
763 /* Some descriptors can be in user SGPRs. */
764 /* Shader buffers in user SGPRs. */
765 for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) {
766 while (ctx->args.num_sgprs_used % 4 != 0)
767 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
768
769 ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->cs_shaderbuf[i]);
770 }
771 /* Images in user SGPRs. */
772 for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) {
773 unsigned num_sgprs = BITSET_TEST(shader->selector->info.base.image_buffers, i) ? 4 : 8;
774
775 while (ctx->args.num_sgprs_used % num_sgprs != 0)
776 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
777
778 ac_add_arg(&ctx->args, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &ctx->cs_image[i]);
779 }
780
781 /* Hardware SGPRs. */
782 for (i = 0; i < 3; i++) {
783 if (shader->selector->info.uses_block_id[i]) {
784 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.workgroup_ids[i]);
785 }
786 }
787 if (shader->selector->info.uses_subgroup_info)
788 ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);
789
790 /* Hardware VGPRs. */
791 /* Thread IDs are packed in VGPR0, 10 bits per component or stored in 3 separate VGPRs */
792 if (ctx->screen->info.gfx_level >= GFX11 ||
793 (!ctx->screen->info.has_graphics && ctx->screen->info.family >= CHIP_ALDEBARAN))
794 ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.local_invocation_ids);
795 else
796 ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, &ctx->args.local_invocation_ids);
797 break;
798 default:
799 assert(0 && "unimplemented shader");
800 return;
801 }
802
803 shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
804 shader->info.num_input_vgprs = ctx->args.num_vgprs_used;
805
806 assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
807 shader->info.num_input_vgprs -= num_prolog_vgprs;
808 }
809
810 /* For the UMR disassembler. */
811 #define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
812 #define DEBUGGER_NUM_MARKERS 5
813
/* Return the LDS allocation granularity in bytes for the given stage. */
static unsigned get_lds_granularity(struct si_screen *screen, gl_shader_stage stage)
{
   if (screen->info.gfx_level >= GFX11 && stage == MESA_SHADER_FRAGMENT)
      return 1024;

   return screen->info.gfx_level >= GFX7 ? 512 : 256;
}
819
si_shader_binary_open(struct si_screen * screen,struct si_shader * shader,struct ac_rtld_binary * rtld)820 static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
821 struct ac_rtld_binary *rtld)
822 {
823 const struct si_shader_selector *sel = shader->selector;
824 const char *part_elfs[5];
825 size_t part_sizes[5];
826 unsigned num_parts = 0;
827
828 #define add_part(shader_or_part) \
829 if (shader_or_part) { \
830 part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \
831 part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \
832 num_parts++; \
833 }
834
835 add_part(shader->prolog);
836 add_part(shader->previous_stage);
837 add_part(shader);
838 add_part(shader->epilog);
839
840 #undef add_part
841
842 struct ac_rtld_symbol lds_symbols[2];
843 unsigned num_lds_symbols = 0;
844
845 if (sel && screen->info.gfx_level >= GFX9 && !shader->is_gs_copy_shader &&
846 (sel->stage == MESA_SHADER_GEOMETRY ||
847 (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg))) {
848 struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
849 sym->name = "esgs_ring";
850 sym->size = shader->gs_info.esgs_ring_size * 4;
851 sym->align = 64 * 1024;
852 }
853
854 if (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
855 struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
856 sym->name = "ngg_emit";
857 sym->size = shader->ngg.ngg_emit_size * 4;
858 sym->align = 4;
859 }
860
861 bool ok = ac_rtld_open(
862 rtld, (struct ac_rtld_open_info){.info = &screen->info,
863 .options =
864 {
865 .halt_at_entry = screen->options.halt_shaders,
866 },
867 .shader_type = sel->stage,
868 .wave_size = shader->wave_size,
869 .num_parts = num_parts,
870 .elf_ptrs = part_elfs,
871 .elf_sizes = part_sizes,
872 .num_shared_lds_symbols = num_lds_symbols,
873 .shared_lds_symbols = lds_symbols});
874
875 if (rtld->lds_size > 0) {
876 unsigned alloc_granularity = get_lds_granularity(screen, sel->stage);
877 shader->config.lds_size = DIV_ROUND_UP(rtld->lds_size, alloc_granularity);
878 }
879
880 return ok;
881 }
882
si_get_shader_binary_size(struct si_screen * screen,struct si_shader * shader)883 static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
884 {
885 struct ac_rtld_binary rtld;
886 si_shader_binary_open(screen, shader, &rtld);
887 uint64_t size = rtld.exec_size;
888 ac_rtld_close(&rtld);
889 return size;
890 }
891
si_get_external_symbol(enum amd_gfx_level gfx_level,void * data,const char * name,uint64_t * value)892 static bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
893 uint64_t *value)
894 {
895 uint64_t *scratch_va = data;
896
897 if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
898 *value = (uint32_t)*scratch_va;
899 return true;
900 }
901 if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
902 /* Enable scratch coalescing. */
903 *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32);
904
905 if (gfx_level >= GFX11)
906 *value |= S_008F04_SWIZZLE_ENABLE_GFX11(1);
907 else
908 *value |= S_008F04_SWIZZLE_ENABLE_GFX6(1);
909 return true;
910 }
911
912 return false;
913 }
914
si_shader_binary_upload(struct si_screen * sscreen,struct si_shader * shader,uint64_t scratch_va)915 bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
916 uint64_t scratch_va)
917 {
918 struct ac_rtld_binary binary;
919 if (!si_shader_binary_open(sscreen, shader, &binary))
920 return false;
921
922 si_resource_reference(&shader->bo, NULL);
923 shader->bo = si_aligned_buffer_create(
924 &sscreen->b,
925 (sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY) |
926 SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT,
927 PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256);
928 if (!shader->bo)
929 return false;
930
931 /* Upload. */
932 struct ac_rtld_upload_info u = {};
933 u.binary = &binary;
934 u.get_external_symbol = si_get_external_symbol;
935 u.cb_data = &scratch_va;
936 u.rx_va = shader->bo->gpu_address;
937 u.rx_ptr = sscreen->ws->buffer_map(sscreen->ws,
938 shader->bo->buf, NULL,
939 PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
940 if (!u.rx_ptr)
941 return false;
942
943 int size = ac_rtld_upload(&u);
944
945 if (sscreen->debug_flags & DBG(SQTT)) {
946 /* Remember the uploaded code */
947 shader->binary.uploaded_code_size = size;
948 shader->binary.uploaded_code = malloc(size);
949 memcpy(shader->binary.uploaded_code, u.rx_ptr, size);
950 }
951
952 sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf);
953 ac_rtld_close(&binary);
954
955 return size >= 0;
956 }
957
/**
 * Print/log the disassembly stored in the ".AMDGPU.disasm" section of a
 * shader part's ELF binary.
 *
 * \param debug  if non-NULL and it has a callback, the disassembly is sent
 *               through it one line at a time
 * \param name   human-readable part name ("main", "prolog", "epilog", ...)
 * \param file   if non-NULL, the disassembly is also printed to this stream
 */
static void si_shader_dump_disassembly(struct si_screen *screen,
                                       const struct si_shader_binary *binary,
                                       gl_shader_stage stage, unsigned wave_size,
                                       struct util_debug_callback *debug, const char *name,
                                       FILE *file)
{
   struct ac_rtld_binary rtld_binary;

   if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
                                      .info = &screen->info,
                                      .shader_type = stage,
                                      .wave_size = wave_size,
                                      .num_parts = 1,
                                      .elf_ptrs = &binary->elf_buffer,
                                      .elf_sizes = &binary->elf_size}))
      return;

   const char *disasm;
   size_t nbytes;

   if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
      goto out;

   /* The byte count is passed to "%.*s" as an int below. */
   if (nbytes > INT_MAX)
      goto out;

   if (debug && debug->debug_message) {
      /* Very long debug messages are cut off, so send the
       * disassembly one line at a time. This causes more
       * overhead, but on the plus side it simplifies
       * parsing of resulting logs.
       */
      util_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");

      uint64_t line = 0;
      while (line < nbytes) {
         int count = nbytes - line;
         const char *nl = memchr(disasm + line, '\n', nbytes - line);
         if (nl)
            count = nl - (disasm + line);

         if (count) {
            util_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
         }

         line += count + 1;
      }

      util_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
   }

   if (file) {
      fprintf(file, "Shader %s disassembly:\n", name);
      /* Use precision ("%.*s"), not field width ("%*s"): the section data is
       * not guaranteed to be NUL-terminated, and a width only pads — it does
       * not bound how far fprintf reads.
       */
      fprintf(file, "%.*s", (int)nbytes, disasm);
   }

out:
   ac_rtld_close(&rtld_binary);
}
1017
/* Estimate the maximum number of waves per SIMD for this shader and store it
 * in shader->info.max_simd_waves. The result is the minimum over the SGPR,
 * VGPR, and (for PS/CS) LDS budgets of one SIMD.
 */
static void si_calculate_max_simd_waves(struct si_shader *shader)
{
   struct si_screen *sscreen = shader->selector->screen;
   struct ac_shader_config *conf = &shader->config;
   unsigned num_inputs = shader->selector->info.num_inputs;
   unsigned lds_increment = get_lds_granularity(sscreen, shader->selector->stage);
   unsigned lds_per_wave = 0;
   unsigned max_simd_waves;

   /* Start from the hardware maximum and clamp it below. */
   max_simd_waves = sscreen->info.max_wave64_per_simd;

   /* Compute LDS usage for PS. */
   switch (shader->selector->stage) {
   case MESA_SHADER_FRAGMENT:
      /* The minimum usage per wave is (num_inputs * 48). The maximum
       * usage is (num_inputs * 48 * 16).
       * We can get anything in between and it varies between waves.
       *
       * The 48 bytes per input for a single primitive is equal to
       * 4 bytes/component * 4 components/input * 3 points.
       *
       * Other stages don't know the size at compile time or don't
       * allocate LDS per wave, but instead they do it per thread group.
       */
      lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment);
      break;
   case MESA_SHADER_COMPUTE: {
      /* LDS is allocated per workgroup; spread it across the waves of the
       * largest possible workgroup.
       */
      unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
      lds_per_wave = (conf->lds_size * lds_increment) /
                     DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
   }
   break;
   default:;
   }

   /* Compute the per-SIMD wave counts. */
   if (conf->num_sgprs) {
      max_simd_waves =
         MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
   }

   if (conf->num_vgprs) {
      /* Always print wave limits as Wave64, so that we can compare
       * Wave32 and Wave64 with shader-db fairly. */
      unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
      max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
   }

   /* NOTE(review): divides the workgroup LDS size by 4, presumably the
    * per-SIMD share of a CU/WGP — confirm against the target's SIMD count.
    */
   unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
   if (lds_per_wave)
      max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);

   shader->info.max_simd_waves = max_simd_waves;
}
1072
si_shader_dump_stats_for_shader_db(struct si_screen * screen,struct si_shader * shader,struct util_debug_callback * debug)1073 void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
1074 struct util_debug_callback *debug)
1075 {
1076 const struct ac_shader_config *conf = &shader->config;
1077 static const char *stages[] = {"VS", "TCS", "TES", "GS", "PS", "CS"};
1078
1079 if (screen->options.debug_disassembly)
1080 si_shader_dump_disassembly(screen, &shader->binary, shader->selector->stage,
1081 shader->wave_size, debug, "main", NULL);
1082
1083 util_debug_message(debug, SHADER_INFO,
1084 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
1085 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
1086 "Spilled VGPRs: %d PrivMem VGPRs: %d DivergentLoop: %d, InlineUniforms: %d, "
1087 "ParamExports: %u, (%s, W%u)",
1088 conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
1089 conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
1090 conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
1091 shader->selector->info.has_divergent_loop,
1092 shader->selector->info.base.num_inlinable_uniforms,
1093 shader->info.nr_param_exports,
1094 stages[shader->selector->stage], shader->wave_size);
1095 }
1096
si_shader_dump_stats(struct si_screen * sscreen,struct si_shader * shader,FILE * file,bool check_debug_option)1097 static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
1098 bool check_debug_option)
1099 {
1100 const struct ac_shader_config *conf = &shader->config;
1101
1102 if (!check_debug_option || si_can_dump_shader(sscreen, shader->selector->stage)) {
1103 if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
1104 fprintf(file,
1105 "*** SHADER CONFIG ***\n"
1106 "SPI_PS_INPUT_ADDR = 0x%04x\n"
1107 "SPI_PS_INPUT_ENA = 0x%04x\n",
1108 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
1109 }
1110
1111 fprintf(file,
1112 "*** SHADER STATS ***\n"
1113 "SGPRS: %d\n"
1114 "VGPRS: %d\n"
1115 "Spilled SGPRs: %d\n"
1116 "Spilled VGPRs: %d\n"
1117 "Private memory VGPRs: %d\n"
1118 "Code Size: %d bytes\n"
1119 "LDS: %d bytes\n"
1120 "Scratch: %d bytes per wave\n"
1121 "Max Waves: %d\n"
1122 "********************\n\n\n",
1123 conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
1124 shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
1125 conf->lds_size * get_lds_granularity(sscreen, shader->selector->stage),
1126 conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
1127 }
1128 }
1129
si_get_shader_name(const struct si_shader * shader)1130 const char *si_get_shader_name(const struct si_shader *shader)
1131 {
1132 switch (shader->selector->stage) {
1133 case MESA_SHADER_VERTEX:
1134 if (shader->key.ge.as_es)
1135 return "Vertex Shader as ES";
1136 else if (shader->key.ge.as_ls)
1137 return "Vertex Shader as LS";
1138 else if (shader->key.ge.as_ngg)
1139 return "Vertex Shader as ESGS";
1140 else
1141 return "Vertex Shader as VS";
1142 case MESA_SHADER_TESS_CTRL:
1143 return "Tessellation Control Shader";
1144 case MESA_SHADER_TESS_EVAL:
1145 if (shader->key.ge.as_es)
1146 return "Tessellation Evaluation Shader as ES";
1147 else if (shader->key.ge.as_ngg)
1148 return "Tessellation Evaluation Shader as ESGS";
1149 else
1150 return "Tessellation Evaluation Shader as VS";
1151 case MESA_SHADER_GEOMETRY:
1152 if (shader->is_gs_copy_shader)
1153 return "GS Copy Shader as VS";
1154 else
1155 return "Geometry Shader";
1156 case MESA_SHADER_FRAGMENT:
1157 return "Pixel Shader";
1158 case MESA_SHADER_COMPUTE:
1159 return "Compute Shader";
1160 default:
1161 return "Unknown Shader";
1162 }
1163 }
1164
si_shader_dump(struct si_screen * sscreen,struct si_shader * shader,struct util_debug_callback * debug,FILE * file,bool check_debug_option)1165 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
1166 struct util_debug_callback *debug, FILE *file, bool check_debug_option)
1167 {
1168 gl_shader_stage stage = shader->selector->stage;
1169
1170 if (!check_debug_option || si_can_dump_shader(sscreen, stage))
1171 si_dump_shader_key(shader, file);
1172
1173 if (!check_debug_option && shader->binary.llvm_ir_string) {
1174 if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
1175 fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
1176 fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
1177 }
1178
1179 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
1180 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
1181 }
1182
1183 if (!check_debug_option ||
1184 (si_can_dump_shader(sscreen, stage) && !(sscreen->debug_flags & DBG(NO_ASM)))) {
1185
1186 fprintf(file, "\n%s:\n", si_get_shader_name(shader));
1187
1188 if (shader->prolog)
1189 si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, shader->wave_size, debug,
1190 "prolog", file);
1191 if (shader->previous_stage)
1192 si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
1193 shader->wave_size, debug, "previous stage", file);
1194 si_shader_dump_disassembly(sscreen, &shader->binary, stage, shader->wave_size, debug, "main",
1195 file);
1196
1197 if (shader->epilog)
1198 si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, shader->wave_size, debug,
1199 "epilog", file);
1200 fprintf(file, "\n");
1201 }
1202
1203 si_shader_dump_stats(sscreen, shader, file, check_debug_option);
1204 }
1205
si_dump_shader_key_vs(const union si_shader_key * key,const struct si_vs_prolog_bits * prolog,const char * prefix,FILE * f)1206 static void si_dump_shader_key_vs(const union si_shader_key *key,
1207 const struct si_vs_prolog_bits *prolog, const char *prefix,
1208 FILE *f)
1209 {
1210 fprintf(f, " %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one);
1211 fprintf(f, " %s.instance_divisor_is_fetched = %u\n", prefix,
1212 prolog->instance_divisor_is_fetched);
1213 fprintf(f, " %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix);
1214
1215 fprintf(f, " mono.vs.fetch_opencode = %x\n", key->ge.mono.vs_fetch_opencode);
1216 fprintf(f, " mono.vs.fix_fetch = {");
1217 for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
1218 union si_vs_fix_fetch fix = key->ge.mono.vs_fix_fetch[i];
1219 if (i)
1220 fprintf(f, ", ");
1221 if (!fix.bits)
1222 fprintf(f, "0");
1223 else
1224 fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
1225 fix.u.format);
1226 }
1227 fprintf(f, "}\n");
1228 }
1229
/* Print the full shader key (per-stage part/mono/opt bits) to "f" for
 * debugging dumps.
 */
static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
{
   const union si_shader_key *key = &shader->key;
   gl_shader_stage stage = shader->selector->stage;

   fprintf(f, "SHADER KEY\n");
   /* Identify the source shader the key belongs to. */
   fprintf(f, " source_sha1 = {");
   _mesa_sha1_print(f, shader->selector->info.base.source_sha1);
   fprintf(f, "}\n");

   /* Stage-specific key bits. */
   switch (stage) {
   case MESA_SHADER_VERTEX:
      si_dump_shader_key_vs(key, &key->ge.part.vs.prolog, "part.vs.prolog", f);
      fprintf(f, " as_es = %u\n", key->ge.as_es);
      fprintf(f, " as_ls = %u\n", key->ge.as_ls);
      fprintf(f, " as_ngg = %u\n", key->ge.as_ngg);
      fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
      break;

   case MESA_SHADER_TESS_CTRL:
      /* On GFX9+ the VS is merged into the TCS, so it has an LS prolog. */
      if (shader->selector->screen->info.gfx_level >= GFX9) {
         si_dump_shader_key_vs(key, &key->ge.part.tcs.ls_prolog, "part.tcs.ls_prolog", f);
      }
      fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->ge.part.tcs.epilog.prim_mode);
      fprintf(f, " opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
      fprintf(f, " opt.same_patch_vertices = %u\n", key->ge.opt.same_patch_vertices);
      break;

   case MESA_SHADER_TESS_EVAL:
      fprintf(f, " as_es = %u\n", key->ge.as_es);
      fprintf(f, " as_ngg = %u\n", key->ge.as_ngg);
      fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
      break;

   case MESA_SHADER_GEOMETRY:
      if (shader->is_gs_copy_shader)
         break;

      /* On GFX9+ the preceding VS is merged into the GS. */
      if (shader->selector->screen->info.gfx_level >= GFX9 &&
          key->ge.part.gs.es->stage == MESA_SHADER_VERTEX) {
         si_dump_shader_key_vs(key, &key->ge.part.gs.vs_prolog, "part.gs.vs_prolog", f);
      }
      fprintf(f, " mono.u.gs_tri_strip_adj_fix = %u\n", key->ge.mono.u.gs_tri_strip_adj_fix);
      fprintf(f, " as_ngg = %u\n", key->ge.as_ngg);
      break;

   case MESA_SHADER_COMPUTE:
      break;

   case MESA_SHADER_FRAGMENT:
      fprintf(f, " prolog.color_two_side = %u\n", key->ps.part.prolog.color_two_side);
      fprintf(f, " prolog.flatshade_colors = %u\n", key->ps.part.prolog.flatshade_colors);
      fprintf(f, " prolog.poly_stipple = %u\n", key->ps.part.prolog.poly_stipple);
      fprintf(f, " prolog.force_persp_sample_interp = %u\n",
              key->ps.part.prolog.force_persp_sample_interp);
      fprintf(f, " prolog.force_linear_sample_interp = %u\n",
              key->ps.part.prolog.force_linear_sample_interp);
      fprintf(f, " prolog.force_persp_center_interp = %u\n",
              key->ps.part.prolog.force_persp_center_interp);
      fprintf(f, " prolog.force_linear_center_interp = %u\n",
              key->ps.part.prolog.force_linear_center_interp);
      fprintf(f, " prolog.bc_optimize_for_persp = %u\n",
              key->ps.part.prolog.bc_optimize_for_persp);
      fprintf(f, " prolog.bc_optimize_for_linear = %u\n",
              key->ps.part.prolog.bc_optimize_for_linear);
      fprintf(f, " prolog.samplemask_log_ps_iter = %u\n",
              key->ps.part.prolog.samplemask_log_ps_iter);
      fprintf(f, " epilog.spi_shader_col_format = 0x%x\n",
              key->ps.part.epilog.spi_shader_col_format);
      fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.part.epilog.color_is_int8);
      fprintf(f, " epilog.color_is_int10 = 0x%X\n", key->ps.part.epilog.color_is_int10);
      fprintf(f, " epilog.last_cbuf = %u\n", key->ps.part.epilog.last_cbuf);
      fprintf(f, " epilog.alpha_func = %u\n", key->ps.part.epilog.alpha_func);
      fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.part.epilog.alpha_to_one);
      fprintf(f, " epilog.alpha_to_coverage_via_mrtz = %u\n", key->ps.part.epilog.alpha_to_coverage_via_mrtz);
      fprintf(f, " epilog.clamp_color = %u\n", key->ps.part.epilog.clamp_color);
      fprintf(f, " epilog.dual_src_blend_swizzle = %u\n", key->ps.part.epilog.dual_src_blend_swizzle);
      fprintf(f, " mono.poly_line_smoothing = %u\n", key->ps.mono.poly_line_smoothing);
      fprintf(f, " mono.point_smoothing = %u\n", key->ps.mono.point_smoothing);
      fprintf(f, " mono.interpolate_at_sample_force_center = %u\n",
              key->ps.mono.interpolate_at_sample_force_center);
      fprintf(f, " mono.fbfetch_msaa = %u\n", key->ps.mono.fbfetch_msaa);
      fprintf(f, " mono.fbfetch_is_1D = %u\n", key->ps.mono.fbfetch_is_1D);
      fprintf(f, " mono.fbfetch_layered = %u\n", key->ps.mono.fbfetch_layered);
      break;

   default:
      assert(0);
   }

   /* Optimization bits that only apply to the last pre-rasterization stage. */
   if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||
        stage == MESA_SHADER_VERTEX) &&
       !key->ge.as_es && !key->ge.as_ls) {
      fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs);
      fprintf(f, " opt.kill_pointsize = 0x%x\n", key->ge.opt.kill_pointsize);
      fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances);
      fprintf(f, " opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling);
   }

   /* prefer_mono lives in different key unions for GE stages vs PS. */
   if (stage <= MESA_SHADER_GEOMETRY)
      fprintf(f, " opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
   else
      fprintf(f, " opt.prefer_mono = %u\n", key->ps.opt.prefer_mono);

   /* Same split for the inlined-uniform state. */
   if (stage <= MESA_SHADER_GEOMETRY) {
      if (key->ge.opt.inline_uniforms) {
         fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
                 key->ge.opt.inline_uniforms,
                 key->ge.opt.inlined_uniform_values[0],
                 key->ge.opt.inlined_uniform_values[1],
                 key->ge.opt.inlined_uniform_values[2],
                 key->ge.opt.inlined_uniform_values[3]);
      } else {
         fprintf(f, " opt.inline_uniforms = 0\n");
      }
   } else {
      if (key->ps.opt.inline_uniforms) {
         fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
                 key->ps.opt.inline_uniforms,
                 key->ps.opt.inlined_uniform_values[0],
                 key->ps.opt.inlined_uniform_values[1],
                 key->ps.opt.inlined_uniform_values[2],
                 key->ps.opt.inlined_uniform_values[3]);
      } else {
         fprintf(f, " opt.inline_uniforms = 0\n");
      }
   }
}
1358
si_vs_needs_prolog(const struct si_shader_selector * sel,const struct si_vs_prolog_bits * prolog_key,const union si_shader_key * key,bool ngg_cull_shader,bool is_gs)1359 bool si_vs_needs_prolog(const struct si_shader_selector *sel,
1360 const struct si_vs_prolog_bits *prolog_key,
1361 const union si_shader_key *key, bool ngg_cull_shader,
1362 bool is_gs)
1363 {
1364 assert(sel->stage == MESA_SHADER_VERTEX);
1365
1366 /* VGPR initialization fixup for Vega10 and Raven is always done in the
1367 * VS prolog. */
1368 return sel->info.vs_needs_prolog || prolog_key->ls_vgpr_fix ||
1369 /* The 2nd VS prolog loads input VGPRs from LDS */
1370 (key->ge.opt.ngg_culling && !ngg_cull_shader && !is_gs);
1371 }
1372
1373 /**
1374 * Compute the VS prolog key, which contains all the information needed to
1375 * build the VS prolog function, and set shader->info bits where needed.
1376 *
1377 * \param info Shader info of the vertex shader.
1378 * \param num_input_sgprs Number of input SGPRs for the vertex shader.
 * \param ngg_cull_shader Whether the preceding shader part is the NGG cull shader.
1380 * \param prolog_key Key of the VS prolog
1381 * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS.
1382 * \param key Output shader part key.
1383 */
si_get_vs_prolog_key(const struct si_shader_info * info,unsigned num_input_sgprs,bool ngg_cull_shader,const struct si_vs_prolog_bits * prolog_key,struct si_shader * shader_out,union si_shader_part_key * key)1384 void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs,
1385 bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key,
1386 struct si_shader *shader_out, union si_shader_part_key *key)
1387 {
1388 memset(key, 0, sizeof(*key));
1389 key->vs_prolog.states = *prolog_key;
1390 key->vs_prolog.wave32 = shader_out->wave_size == 32;
1391 key->vs_prolog.num_input_sgprs = num_input_sgprs;
1392 key->vs_prolog.num_inputs = info->num_inputs;
1393 key->vs_prolog.as_ls = shader_out->key.ge.as_ls;
1394 key->vs_prolog.as_es = shader_out->key.ge.as_es;
1395 key->vs_prolog.as_ngg = shader_out->key.ge.as_ngg;
1396
1397 if (shader_out->selector->stage != MESA_SHADER_GEOMETRY &&
1398 !ngg_cull_shader && shader_out->key.ge.opt.ngg_culling)
1399 key->vs_prolog.load_vgprs_after_culling = 1;
1400
1401 if (shader_out->selector->stage == MESA_SHADER_TESS_CTRL) {
1402 key->vs_prolog.as_ls = 1;
1403 key->vs_prolog.num_merged_next_stage_vgprs = 2;
1404 } else if (shader_out->selector->stage == MESA_SHADER_GEOMETRY) {
1405 key->vs_prolog.as_es = 1;
1406 key->vs_prolog.num_merged_next_stage_vgprs = 5;
1407 } else if (shader_out->key.ge.as_ngg) {
1408 key->vs_prolog.num_merged_next_stage_vgprs = 5;
1409 }
1410
1411 /* Only one of these combinations can be set. as_ngg can be set with as_es. */
1412 assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg +
1413 (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) <= 1);
1414
1415 /* Enable loading the InstanceID VGPR. */
1416 uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
1417
1418 if ((key->vs_prolog.states.instance_divisor_is_one |
1419 key->vs_prolog.states.instance_divisor_is_fetched) &
1420 input_mask)
1421 shader_out->info.uses_instanceid = true;
1422 }
1423
1424 /* TODO: convert to nir_shader_instructions_pass */
/* Remove output stores that the shader key marks as dead for the bound
 * pipeline state: killed varyings, gl_PointSize, and killed clip distances /
 * gl_ClipVertex. Returns true if any store was removed.
 */
static bool si_nir_kill_outputs(nir_shader *nir, const union si_shader_key *key)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);

   /* The kill bits only exist for geometry-pipeline stages; exit early when
    * there is nothing to kill.
    */
   if (nir->info.stage > MESA_SHADER_GEOMETRY ||
       (!key->ge.opt.kill_outputs &&
        !key->ge.opt.kill_pointsize &&
        !key->ge.opt.kill_clip_distances)) {
      nir_metadata_preserve(impl, nir_metadata_all);
      return false;
   }

   bool progress = false;

   nir_builder b;
   nir_builder_init(&b, impl);

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         assert(intr->num_components == 1); /* only scalar stores expected */
         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

         /* kill_outputs is a bitmask indexed by the driver's unique I/O slot. */
         if (nir_slot_is_varying(sem.location) &&
             key->ge.opt.kill_outputs &
             (1ull << si_shader_io_get_unique_index(sem.location, true))) {
            nir_remove_varying(intr);
            progress = true;
         }

         if (key->ge.opt.kill_pointsize && sem.location == VARYING_SLOT_PSIZ) {
            nir_remove_sysval_output(intr);
            progress = true;
         }

         /* TODO: We should only kill specific clip planes as required by kill_clip_distance,
          * not whole gl_ClipVertex. Lower ClipVertex in NIR.
          */
         if ((key->ge.opt.kill_clip_distances & SI_USER_CLIP_PLANE_MASK) == SI_USER_CLIP_PLANE_MASK &&
             sem.location == VARYING_SLOT_CLIP_VERTEX) {
            nir_remove_sysval_output(intr);
            progress = true;
         }

         if (key->ge.opt.kill_clip_distances &&
             (sem.location == VARYING_SLOT_CLIP_DIST0 ||
              sem.location == VARYING_SLOT_CLIP_DIST1)) {
            assert(nir_intrinsic_src_type(intr) == nir_type_float32);
            /* Each CLIP_DISTn slot holds 4 float components; compute the
             * flat clip-distance index of this scalar store.
             */
            unsigned index = (sem.location - VARYING_SLOT_CLIP_DIST0) * 4 +
                             nir_intrinsic_component(intr);

            if ((key->ge.opt.kill_clip_distances >> index) & 0x1) {
               nir_remove_sysval_output(intr);
               progress = true;
            }
         }
      }
   }

   /* Only instructions were removed; control flow is untouched, so dominance
    * and block indices remain valid.
    */
   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_dominance |
                                  nir_metadata_block_index);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}
1504
si_map_io_driver_location(unsigned semantic)1505 static unsigned si_map_io_driver_location(unsigned semantic)
1506 {
1507 if ((semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_TESS_MAX) ||
1508 semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
1509 semantic == VARYING_SLOT_TESS_LEVEL_OUTER)
1510 return si_shader_io_get_unique_index_patch(semantic);
1511
1512 return si_shader_io_get_unique_index(semantic, false);
1513 }
1514
/* Lower I/O that crosses HW-stage boundaries (merged stages / rings) to
 * LDS or memory accesses. Returns true if any lowering pass ran.
 *
 * \param tcs_vgpr_only_inputs  mask of TCS inputs that are passed in VGPRs
 *                              only and never stored to LDS (merged LS+HS)
 */
static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
                               uint64_t tcs_vgpr_only_inputs)
{
   struct si_shader_selector *sel = shader->selector;
   const union si_shader_key *key = &shader->key;

   if (nir->info.stage == MESA_SHADER_VERTEX) {
      if (key->ge.as_ls) {
         /* VS before TCS: outputs go to LDS. */
         NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem, si_map_io_driver_location,
                    key->ge.opt.same_patch_vertices, tcs_vgpr_only_inputs);
         return true;
      } else if (key->ge.as_es) {
         /* VS before GS: outputs go to the ES->GS ring. */
         NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
                    sel->screen->info.gfx_level, sel->info.esgs_itemsize);
         return true;
      }
   } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
      NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem, si_map_io_driver_location,
                 key->ge.opt.same_patch_vertices);
      NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, si_map_io_driver_location,
                 sel->screen->info.gfx_level,
                 false, /* does not matter as we disabled final tess factor write */
                 ~0ULL, ~0ULL, /* no TES inputs filter */
                 util_last_bit64(sel->info.outputs_written),
                 util_last_bit64(sel->info.patch_outputs_written),
                 shader->wave_size,
                 /* ALL TCS inputs are passed by register. */
                 key->ge.opt.same_patch_vertices &&
                 !(sel->info.base.inputs_read & ~sel->info.tcs_vgpr_only_inputs),
                 sel->info.tessfactors_are_def_in_all_invocs, false);
      return true;
   } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
      NIR_PASS_V(nir, ac_nir_lower_tes_inputs_to_mem, si_map_io_driver_location);

      if (key->ge.as_es) {
         /* TES before GS: outputs go to the ES->GS ring. */
         NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
                    sel->screen->info.gfx_level, sel->info.esgs_itemsize);
      }

      return true;
   } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      NIR_PASS_V(nir, ac_nir_lower_gs_inputs_to_mem, si_map_io_driver_location,
                 sel->screen->info.gfx_level, key->ge.mono.u.gs_tri_strip_adj_fix);
      return true;
   }

   return false;
}
1563
si_deserialize_shader(struct si_shader_selector * sel)1564 struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel)
1565 {
1566 struct pipe_screen *screen = &sel->screen->b;
1567 const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
1568 pipe_shader_type_from_mesa(sel->stage));
1569
1570 struct blob_reader blob_reader;
1571 blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
1572 return nir_deserialize(NULL, options, &blob_reader);
1573 }
1574
/* Return the NIR shader to compile for this variant, with all shader-key
 * dependent lowering applied: output killing, uniform inlining, poly/point
 * smoothing, IO lowering to memory, and late optimizations.
 *
 * \param shader               the shader variant being compiled
 * \param free_nir             set to true when the returned NIR is a private
 *                             (deserialized) copy the caller must free
 * \param tcs_vgpr_only_inputs forwarded to the LS output lowering
 * \return the NIR shader, or NULL if the selector has neither live NIR nor
 *         a serialized blob
 */
struct nir_shader *si_get_nir_shader(struct si_shader *shader, bool *free_nir,
                                     uint64_t tcs_vgpr_only_inputs)
{
   struct si_shader_selector *sel = shader->selector;
   const union si_shader_key *key = &shader->key;

   nir_shader *nir;
   *free_nir = false;

   if (sel->nir) {
      nir = sel->nir;
   } else if (sel->nir_binary) {
      /* Only the serialized blob was kept; deserialize a private copy. */
      nir = si_deserialize_shader(sel);
      *free_nir = true;
   } else {
      return NULL;
   }

   bool progress = false;

   const char *original_name = NULL;
   if (unlikely(should_print_nir(nir))) {
      /* Modify the shader's name so that each variant gets its own name. */
      original_name = ralloc_strdup(nir, nir->info.name);
      ralloc_asprintf_append((char **)&nir->info.name, "-%08x", _mesa_hash_data(key, sizeof(*key)));

      /* Dummy pass to get the starting point. */
      printf("nir_dummy_pass\n");
      nir_print_shader(nir, stdout);
   }

   /* Kill outputs according to the shader key. */
   if (sel->stage <= MESA_SHADER_GEOMETRY)
      NIR_PASS(progress, nir, si_nir_kill_outputs, key);

   bool inline_uniforms = false;
   uint32_t *inlined_uniform_values;
   si_get_inline_uniform_state((union si_shader_key*)key, sel->pipe_shader_type,
                               &inline_uniforms, &inlined_uniform_values);

   if (inline_uniforms) {
      /* Inlining mutates the NIR, so it must be our private copy. */
      assert(*free_nir);

      /* Most places use shader information from the default variant, not
       * the optimized variant. These are the things that the driver looks at
       * in optimized variants and the list of things that we need to do.
       *
       * The driver takes into account these things if they suddenly disappear
       * from the shader code:
       * - Register usage and code size decrease (obvious)
       * - Eliminated PS system values are disabled by LLVM
       *   (FragCoord, FrontFace, barycentrics)
       * - VS/TES/GS outputs feeding PS are eliminated if outputs are undef.
       *   The storage for eliminated outputs is also not allocated.
       * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
       * - TCS output stores are eliminated
       *
       * TODO: These are things the driver ignores in the final shader code
       * and relies on the default shader info.
       * - Other system values are not eliminated
       * - PS.NUM_INTERP = bitcount64(inputs_read), renumber inputs
       *   to remove holes
       * - uses_discard - if it changed to false
       * - writes_memory - if it changed to false
       * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not
       *   eliminated
       * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS)
       *   GS outputs are eliminated except for the temporary LDS.
       *   Clip distances, gl_PointSize, and PS outputs are eliminated based
       *   on current states, so we don't care about the shader code.
       *
       * TODO: Merged shaders don't inline uniforms for the first stage.
       * VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS.
       * (key == NULL for the first stage here)
       *
       * TODO: Compute shaders don't support inlinable uniforms, because they
       * don't have shader variants.
       *
       * TODO: The driver uses a linear search to find a shader variant. This
       * can be really slow if we get too many variants due to uniform inlining.
       */
      NIR_PASS_V(nir, nir_inline_uniforms,
                 nir->info.num_inlinable_uniforms,
                 inlined_uniform_values,
                 nir->info.inlinable_uniform_dw_offsets);
      progress = true;
   }

   if (sel->stage == MESA_SHADER_FRAGMENT && key->ps.mono.poly_line_smoothing)
      NIR_PASS(progress, nir, nir_lower_poly_line_smooth, SI_NUM_SMOOTH_AA_SAMPLES);

   if (sel->stage == MESA_SHADER_FRAGMENT && key->ps.mono.point_smoothing)
      NIR_PASS(progress, nir, nir_lower_point_smooth);

   if (progress)
      si_nir_opts(sel->screen, nir, true);

   /* Lower large variables that are always constant with load_constant intrinsics, which
    * get turned into PC-relative loads from a data section next to the shader.
    *
    * Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
    * this should be done after that.
    *
    * The pass crashes if there are dead temps of lowered IO interface types, so remove
    * them first.
    */
   bool progress2 = false;
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
   NIR_PASS(progress2, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);

   /* Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
    * this should be done after that.
    */
   progress2 |= ac_nir_lower_indirect_derefs(nir, sel->screen->info.gfx_level);

   bool opt_offsets = si_lower_io_to_mem(shader, nir, tcs_vgpr_only_inputs);

   if (progress2 || opt_offsets)
      si_nir_opts(sel->screen, nir, false);

   if (opt_offsets) {
      static const nir_opt_offsets_options offset_options = {
         .uniform_max = 0,
         .buffer_max = ~0,
         .shared_max = ~0,
      };
      NIR_PASS_V(nir, nir_opt_offsets, &offset_options);
   }

   if (progress || progress2 || opt_offsets)
      si_nir_late_opts(nir);

   NIR_PASS_V(nir, nir_divergence_analysis);

   /* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
    * 200 is tuned for Viewperf. It should be done last.
    */
   NIR_PASS_V(nir, nir_group_loads, nir_group_same_resource_only, 200);

   if (unlikely(original_name)) {
      /* Restore the original name and free the per-variant copy. */
      ralloc_free((void*)nir->info.name);
      nir->info.name = original_name;
   }

   return nir;
}
1721
/* Re-scan the (possibly optimized) NIR and OR the VMEM usage flags into
 * the shader's binary info, so they reflect the code actually compiled.
 */
void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir)
{
   struct si_shader_info scanned;

   si_nir_scan_shader(shader->selector->screen, nir, &scanned);

   shader->info.uses_vmem_load_other |= scanned.uses_vmem_load_other;
   shader->info.uses_vmem_sampler_or_bvh |= scanned.uses_vmem_sampler_or_bvh;
}
1730
/* Assign param export offsets to the varying outputs of the entrypoint.
 *
 * Walks every scalar store_output intrinsic and gives each exportable
 * varying slot (non-sysval, not marked no_varying, stream 0) the next free
 * param export index, recording it in vs_output_param_offset and
 * output_param_mask. Slots listed in slot_remap then inherit the offset of
 * the slot they were deduplicated into.
 *
 * \param nir                    shader whose entrypoint is scanned
 * \param info                   scanned shader info, used to cross-check semantics
 * \param slot_remap             per-slot redirection for duplicated outputs
 *                               (-1 = not remapped)
 * \param num_param_exports      in/out running count of assigned param exports
 * \param output_param_mask      out: bit set per output "base" that got a param
 * \param vs_output_param_offset in/out per-slot offsets; entries still equal to
 *                               AC_EXP_PARAM_DEFAULT_VAL_0000 count as unassigned
 */
static void si_nir_assign_param_offsets(nir_shader *nir, const struct si_shader_info *info,
                                        int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS],
                                        uint8_t *num_param_exports, uint64_t *output_param_mask,
                                        uint8_t vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS])
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         assert(intr->num_components == 1); /* only scalar stores expected */
         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

         /* Assign the param index if it's unassigned. */
         if (nir_slot_is_varying(sem.location) && !sem.no_varying &&
             (sem.gs_streams & 0x3) == 0 &&
             vs_output_param_offset[sem.location] == AC_EXP_PARAM_DEFAULT_VAL_0000) {
            /* The semantic and the base should be the same as in si_shader_info. */
            assert(sem.location == info->output_semantic[nir_intrinsic_base(intr)]);
            /* It must not be remapped (duplicated). */
            assert(slot_remap[sem.location] == -1);

            vs_output_param_offset[sem.location] = (*num_param_exports)++;
            *output_param_mask |= BITFIELD64_BIT(nir_intrinsic_base(intr));
         }
      }
   }

   /* Duplicated outputs are redirected here. */
   for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
      if (slot_remap[i] >= 0)
         vs_output_param_offset[i] = vs_output_param_offset[slot_remap[i]];
   }
}
1776
/* Compile the main part of a shader variant with the LLVM backend.
 *
 * Besides invoking si_llvm_compile_shader, this assigns param export
 * offsets, gathers streamout info, computes vs_output_ps_input_cntl,
 * compiles the GS copy shader for legacy GS, validates compute register
 * usage, and fills in derived info (input VGPR/SGPR counts, max waves).
 *
 * \return true on success.
 */
bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                       struct si_shader *shader, struct util_debug_callback *debug)
{
   struct si_shader_selector *sel = shader->selector;
   bool free_nir;
   struct nir_shader *nir = si_get_nir_shader(shader, &free_nir, 0);

   /* Assign param export indices. */
   if ((sel->stage == MESA_SHADER_VERTEX ||
        sel->stage == MESA_SHADER_TESS_EVAL ||
        (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)) &&
       !shader->key.ge.as_ls && !shader->key.ge.as_es) {
      /* Initialize this first. */
      shader->info.nr_param_exports = 0;
      shader->info.vs_output_param_mask = 0;

      /* memset with a per-byte value below relies on 1-byte entries. */
      STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
      memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
             sizeof(shader->info.vs_output_param_offset));

      /* A slot remapping table for duplicated outputs, so that 1 vertex shader output can be
       * mapped to multiple fragment shader inputs.
       */
      int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS];
      memset(slot_remap, -1, NUM_TOTAL_VARYING_SLOTS);

      /* This sets DEFAULT_VAL for constant outputs in vs_output_param_offset. */
      /* TODO: This doesn't affect GS. */
      NIR_PASS_V(nir, ac_nir_optimize_outputs, false, slot_remap,
                 shader->info.vs_output_param_offset);

      /* Assign the non-constant outputs. */
      /* TODO: Use this for the GS copy shader too. */
      si_nir_assign_param_offsets(nir, &sel->info, slot_remap, &shader->info.nr_param_exports,
                                  &shader->info.vs_output_param_mask,
                                  shader->info.vs_output_param_offset);

      if (shader->key.ge.mono.u.vs_export_prim_id) {
         /* PrimitiveID is exported last, after all regular outputs. */
         shader->info.vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = shader->info.nr_param_exports++;
         shader->info.vs_output_param_mask |= BITFIELD64_BIT(sel->info.num_outputs);
      }
   }

   struct pipe_stream_output_info so = {};
   if (si_shader_uses_streamout(shader))
      nir_gather_stream_output_info(nir, &so);

   /* Dump NIR before doing NIR->LLVM conversion in case the
    * conversion fails. */
   if (si_can_dump_shader(sscreen, sel->stage) &&
       !(sscreen->debug_flags & DBG(NO_NIR))) {
      nir_print_shader(nir, stderr);
      si_dump_streamout(&so);
   }

   /* Initialize vs_output_ps_input_cntl to default. */
   for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++)
      shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED;
   shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0;

   si_update_shader_binary_info(shader, nir);

   shader->info.uses_instanceid = sel->info.uses_instanceid;
   /* scratch_size is in bytes; one VGPR holds 4 bytes per lane. */
   shader->info.private_mem_vgprs = DIV_ROUND_UP(nir->scratch_size, 4);

   /* Set the FP ALU behavior. */
   /* By default, we disable denormals for FP32 and enable them for FP16 and FP64
    * for performance and correctness reasons. FP32 denormals can't be enabled because
    * they break output modifiers and v_mad_f32 and are very slow on GFX6-7.
    *
    * float_controls_execution_mode defines the set of valid behaviors. Contradicting flags
    * can be set simultaneously, which means we are allowed to choose, but not really because
    * some options cause GLCTS failures.
    */
   unsigned float_mode = V_00B028_FP_16_64_DENORMS;

   if (!(nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) &&
       nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
      float_mode |= V_00B028_FP_32_ROUND_TOWARDS_ZERO;

   if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
                                                    FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64)) &&
       nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
                                                  FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
      float_mode |= V_00B028_FP_16_64_ROUND_TOWARDS_ZERO;

   if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_PRESERVE_FP16 |
                                                    FLOAT_CONTROLS_DENORM_PRESERVE_FP64)) &&
       nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
                                                  FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64))
      float_mode &= ~V_00B028_FP_16_64_DENORMS;

   /* TODO: ACO could compile non-monolithic shaders here (starting
    * with PS and NGG VS), but monolithic shaders should be compiled
    * by LLVM due to more complicated compilation.
    */
   if (!si_llvm_compile_shader(sscreen, compiler, shader, &so, debug, nir, free_nir))
      return false;

   shader->config.float_mode = float_mode;

   /* The GS copy shader is compiled next. */
   if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
      shader->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, &so, debug);
      if (!shader->gs_copy_shader) {
         fprintf(stderr, "radeonsi: can't create GS copy shader\n");
         return false;
      }
   }

   /* Compute vs_output_ps_input_cntl. */
   if ((sel->stage == MESA_SHADER_VERTEX ||
        sel->stage == MESA_SHADER_TESS_EVAL ||
        sel->stage == MESA_SHADER_GEOMETRY) &&
       !shader->key.ge.as_ls && !shader->key.ge.as_es) {
      ubyte *vs_output_param_offset = shader->info.vs_output_param_offset;

      /* Legacy GS feeds PS through the copy shader; use its offsets. */
      if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg)
         vs_output_param_offset = shader->gs_copy_shader->info.vs_output_param_offset;

      /* We must use the original shader info before the removal of duplicated shader outputs. */
      /* VS and TES should also set primitive ID output if it's used. */
      unsigned num_outputs_with_prim_id = sel->info.num_outputs +
                                          shader->key.ge.mono.u.vs_export_prim_id;

      for (unsigned i = 0; i < num_outputs_with_prim_id; i++) {
         unsigned semantic = sel->info.output_semantic[i];
         unsigned offset = vs_output_param_offset[semantic];
         unsigned ps_input_cntl;

         if (offset <= AC_EXP_PARAM_OFFSET_31) {
            /* The input is loaded from parameter memory. */
            ps_input_cntl = S_028644_OFFSET(offset);
         } else {
            /* The input is a DEFAULT_VAL constant. */
            assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
                   offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
            offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;

            /* OFFSET=0x20 means that DEFAULT_VAL is used. */
            ps_input_cntl = S_028644_OFFSET(0x20) |
                            S_028644_DEFAULT_VAL(offset);
         }

         shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl;
      }
   }

   /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */
   if (sel->stage == MESA_SHADER_COMPUTE) {
      unsigned max_vgprs =
         sscreen->info.num_physical_wave64_vgprs_per_simd * (shader->wave_size == 32 ? 2 : 1);
      unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
      unsigned max_sgprs_per_wave = 128;
      unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
      unsigned threads_per_tg = si_get_max_workgroup_size(shader);
      unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, shader->wave_size);
      unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);

      max_vgprs = max_vgprs / waves_per_simd;
      max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);

      if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
         fprintf(stderr,
                 "LLVM failed to compile a shader correctly: "
                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
                 shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);

         /* Just terminate the process, because dependent
          * shaders can hang due to bad input data, but use
          * the env var to allow shader-db to work.
          */
         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
            abort();
      }
   }

   /* Add the scratch offset to input SGPRs. */
   if (sel->screen->info.gfx_level < GFX11 &&
       shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader))
      shader->info.num_input_sgprs += 1; /* scratch byte offset */

   /* Calculate the number of fragment input VGPRs. */
   if (sel->stage == MESA_SHADER_FRAGMENT) {
      shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(
         &shader->config, &shader->info.face_vgpr_index, &shader->info.ancillary_vgpr_index,
         &shader->info.sample_coverage_vgpr_index);
   }

   si_calculate_max_simd_waves(shader);
   si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
   return true;
}
1970
1971 /**
1972 * Create, compile and return a shader part (prolog or epilog).
1973 *
1974 * \param sscreen screen
1975 * \param list list of shader parts of the same category
1976 * \param type shader type
1977 * \param key shader part key
1978 * \param prolog whether the part being requested is a prolog
1979 * \param tm LLVM target machine
1980 * \param debug debug callback
1981 * \param build the callback responsible for building the main function
1982 * \return non-NULL on success
1983 */
1984 static struct si_shader_part *
si_get_shader_part(struct si_screen * sscreen,struct si_shader_part ** list,gl_shader_stage stage,bool prolog,union si_shader_part_key * key,struct ac_llvm_compiler * compiler,struct util_debug_callback * debug,void (* build)(struct si_shader_context *,union si_shader_part_key *),const char * name)1985 si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
1986 gl_shader_stage stage, bool prolog, union si_shader_part_key *key,
1987 struct ac_llvm_compiler *compiler, struct util_debug_callback *debug,
1988 void (*build)(struct si_shader_context *, union si_shader_part_key *),
1989 const char *name)
1990 {
1991 struct si_shader_part *result;
1992
1993 simple_mtx_lock(&sscreen->shader_parts_mutex);
1994
1995 /* Find existing. */
1996 for (result = *list; result; result = result->next) {
1997 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
1998 simple_mtx_unlock(&sscreen->shader_parts_mutex);
1999 return result;
2000 }
2001 }
2002
2003 /* Compile a new one. */
2004 result = CALLOC_STRUCT(si_shader_part);
2005 result->key = *key;
2006
2007 struct si_shader_selector sel = {};
2008 sel.screen = sscreen;
2009
2010 struct si_shader shader = {};
2011 shader.selector = &sel;
2012 bool wave32 = false;
2013
2014 switch (stage) {
2015 case MESA_SHADER_VERTEX:
2016 shader.key.ge.as_ls = key->vs_prolog.as_ls;
2017 shader.key.ge.as_es = key->vs_prolog.as_es;
2018 shader.key.ge.as_ngg = key->vs_prolog.as_ngg;
2019 wave32 = key->vs_prolog.wave32;
2020 break;
2021 case MESA_SHADER_TESS_CTRL:
2022 assert(!prolog);
2023 shader.key.ge.part.tcs.epilog = key->tcs_epilog.states;
2024 wave32 = key->tcs_epilog.wave32;
2025 break;
2026 case MESA_SHADER_FRAGMENT:
2027 if (prolog) {
2028 shader.key.ps.part.prolog = key->ps_prolog.states;
2029 wave32 = key->ps_prolog.wave32;
2030 } else {
2031 shader.key.ps.part.epilog = key->ps_epilog.states;
2032 wave32 = key->ps_epilog.wave32;
2033 }
2034 break;
2035 default:
2036 unreachable("bad shader part");
2037 }
2038
2039 struct si_shader_context ctx;
2040 si_llvm_context_init(&ctx, sscreen, compiler, wave32 ? 32 : 64);
2041
2042 ctx.shader = &shader;
2043 ctx.stage = stage;
2044
2045 build(&ctx, key);
2046
2047 /* Compile. */
2048 si_llvm_optimize_module(&ctx);
2049
2050 if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, &ctx.ac, debug,
2051 ctx.stage, name, false)) {
2052 FREE(result);
2053 result = NULL;
2054 goto out;
2055 }
2056
2057 result->next = *list;
2058 *list = result;
2059
2060 out:
2061 si_llvm_dispose(&ctx);
2062 simple_mtx_unlock(&sscreen->shader_parts_mutex);
2063 return result;
2064 }
2065
si_get_vs_prolog(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug,struct si_shader * main_part,const struct si_vs_prolog_bits * key)2066 static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2067 struct si_shader *shader, struct util_debug_callback *debug,
2068 struct si_shader *main_part, const struct si_vs_prolog_bits *key)
2069 {
2070 struct si_shader_selector *vs = main_part->selector;
2071
2072 if (!si_vs_needs_prolog(vs, key, &shader->key, false,
2073 shader->selector->stage == MESA_SHADER_GEOMETRY))
2074 return true;
2075
2076 /* Get the prolog. */
2077 union si_shader_part_key prolog_key;
2078 si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader,
2079 &prolog_key);
2080
2081 shader->prolog =
2082 si_get_shader_part(sscreen, &sscreen->vs_prologs, MESA_SHADER_VERTEX, true, &prolog_key,
2083 compiler, debug, si_llvm_build_vs_prolog, "Vertex Shader Prolog");
2084 return shader->prolog != NULL;
2085 }
2086
2087 /**
2088 * Select and compile (or reuse) vertex shader parts (prolog & epilog).
2089 */
si_shader_select_vs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)2090 static bool si_shader_select_vs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2091 struct si_shader *shader, struct util_debug_callback *debug)
2092 {
2093 return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, &shader->key.ge.part.vs.prolog);
2094 }
2095
si_get_tcs_epilog_key(struct si_shader * shader,union si_shader_part_key * key)2096 void si_get_tcs_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
2097 {
2098 memset(key, 0, sizeof(*key));
2099 key->tcs_epilog.wave32 = shader->wave_size == 32;
2100 key->tcs_epilog.states = shader->key.ge.part.tcs.epilog;
2101
2102 /* If output patches are wholly in one wave, we don't need a barrier. */
2103 key->tcs_epilog.noop_s_barrier =
2104 shader->wave_size % shader->selector->info.base.tess.tcs_vertices_out == 0;
2105 }
2106
2107 /**
2108 * Select and compile (or reuse) TCS parts (epilog).
2109 */
si_shader_select_tcs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)2110 static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2111 struct si_shader *shader, struct util_debug_callback *debug)
2112 {
2113 if (sscreen->info.gfx_level >= GFX9) {
2114 struct si_shader *ls_main_part = shader->key.ge.part.tcs.ls->main_shader_part_ls;
2115
2116 if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
2117 &shader->key.ge.part.tcs.ls_prolog))
2118 return false;
2119
2120 shader->previous_stage = ls_main_part;
2121 }
2122
2123 /* Get the epilog. */
2124 union si_shader_part_key epilog_key;
2125 si_get_tcs_epilog_key(shader, &epilog_key);
2126
2127 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, MESA_SHADER_TESS_CTRL, false,
2128 &epilog_key, compiler, debug, si_llvm_build_tcs_epilog,
2129 "Tessellation Control Shader Epilog");
2130 return shader->epilog != NULL;
2131 }
2132
2133 /**
2134 * Select and compile (or reuse) GS parts (prolog).
2135 */
si_shader_select_gs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)2136 static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2137 struct si_shader *shader, struct util_debug_callback *debug)
2138 {
2139 if (sscreen->info.gfx_level >= GFX9) {
2140 struct si_shader *es_main_part;
2141
2142 if (shader->key.ge.as_ngg)
2143 es_main_part = shader->key.ge.part.gs.es->main_shader_part_ngg_es;
2144 else
2145 es_main_part = shader->key.ge.part.gs.es->main_shader_part_es;
2146
2147 if (shader->key.ge.part.gs.es->stage == MESA_SHADER_VERTEX &&
2148 !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
2149 &shader->key.ge.part.gs.vs_prolog))
2150 return false;
2151
2152 shader->previous_stage = es_main_part;
2153 }
2154
2155 return true;
2156 }
2157
2158 /**
2159 * Compute the PS prolog key, which contains all the information needed to
2160 * build the PS prolog function, and set related bits in shader->config.
2161 */
void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key,
                          bool separate_prolog)
{
   struct si_shader_info *info = &shader->selector->info;

   memset(key, 0, sizeof(*key));
   key->ps_prolog.states = shader->key.ps.part.prolog;
   key->ps_prolog.wave32 = shader->wave_size == 32;
   key->ps_prolog.colors_read = info->colors_read;
   key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
   key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
   /* WQM is needed when helper invocations must produce valid color/interp
    * data (quad derivatives), and the prolog touches interpolation state.
    */
   key->ps_prolog.wqm =
      info->base.fs.needs_quad_helper_invocations &&
      (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
       key->ps_prolog.states.force_linear_sample_interp ||
       key->ps_prolog.states.force_persp_center_interp ||
       key->ps_prolog.states.force_linear_center_interp ||
       key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear);
   key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
   key->ps_prolog.sample_coverage_vgpr_index = shader->info.sample_coverage_vgpr_index;

   /* Polygon stippling reads the stipple pattern from memory. */
   if (shader->key.ps.part.prolog.poly_stipple)
      shader->info.uses_vmem_load_other = true;

   if (info->colors_read) {
      ubyte *color = shader->selector->info.color_attr_index;

      if (shader->key.ps.part.prolog.color_two_side) {
         /* BCOLORs are stored after the last input. */
         key->ps_prolog.num_interp_inputs = info->num_inputs;
         key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
         if (separate_prolog)
            shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
      }

      /* Set up interpolation parameters for each of the two color inputs. */
      for (unsigned i = 0; i < 2; i++) {
         unsigned interp = info->color_interpolate[i];
         unsigned location = info->color_interpolate_loc[i];

         if (!(info->colors_read & (0xf << i * 4)))
            continue;

         key->ps_prolog.color_attr_index[i] = color[i];

         if (shader->key.ps.part.prolog.flatshade_colors && interp == INTERP_MODE_COLOR)
            interp = INTERP_MODE_FLAT;

         switch (interp) {
         case INTERP_MODE_FLAT:
            /* -1 = no interpolation VGPR needed (flat). */
            key->ps_prolog.color_interp_vgpr_index[i] = -1;
            break;
         case INTERP_MODE_SMOOTH:
         case INTERP_MODE_COLOR:
            /* Force the interpolation location for colors here. */
            if (shader->key.ps.part.prolog.force_persp_sample_interp)
               location = TGSI_INTERPOLATE_LOC_SAMPLE;
            if (shader->key.ps.part.prolog.force_persp_center_interp)
               location = TGSI_INTERPOLATE_LOC_CENTER;

            /* The indices 0/2/4 pick the persp sample/center/centroid (i,j)
             * VGPR pairs; the matching SPI_PS_INPUT_ENA bit is set alongside.
             */
            switch (location) {
            case TGSI_INTERPOLATE_LOC_SAMPLE:
               key->ps_prolog.color_interp_vgpr_index[i] = 0;
               if (separate_prolog) {
                  shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
               }
               break;
            case TGSI_INTERPOLATE_LOC_CENTER:
               key->ps_prolog.color_interp_vgpr_index[i] = 2;
               if (separate_prolog) {
                  shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
               }
               break;
            case TGSI_INTERPOLATE_LOC_CENTROID:
               key->ps_prolog.color_interp_vgpr_index[i] = 4;
               if (separate_prolog) {
                  shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
               }
               break;
            default:
               assert(0);
            }
            break;
         case INTERP_MODE_NOPERSPECTIVE:
            /* Force the interpolation location for colors here. */
            if (shader->key.ps.part.prolog.force_linear_sample_interp)
               location = TGSI_INTERPOLATE_LOC_SAMPLE;
            if (shader->key.ps.part.prolog.force_linear_center_interp)
               location = TGSI_INTERPOLATE_LOC_CENTER;

            /* The VGPR assignment for non-monolithic shaders
             * works because InitialPSInputAddr is set on the
             * main shader and PERSP_PULL_MODEL is never used.
             */
            switch (location) {
            case TGSI_INTERPOLATE_LOC_SAMPLE:
               key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 6 : 9;
               if (separate_prolog) {
                  shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
               }
               break;
            case TGSI_INTERPOLATE_LOC_CENTER:
               key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 8 : 11;
               if (separate_prolog) {
                  shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
               }
               break;
            case TGSI_INTERPOLATE_LOC_CENTROID:
               key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 10 : 13;
               if (separate_prolog) {
                  shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
               }
               break;
            default:
               assert(0);
            }
            break;
         default:
            assert(0);
         }
      }
   }
}
2284
2285 /**
2286 * Check whether a PS prolog is required based on the key.
2287 */
si_need_ps_prolog(const union si_shader_part_key * key)2288 bool si_need_ps_prolog(const union si_shader_part_key *key)
2289 {
2290 return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
2291 key->ps_prolog.states.force_linear_sample_interp ||
2292 key->ps_prolog.states.force_persp_center_interp ||
2293 key->ps_prolog.states.force_linear_center_interp ||
2294 key->ps_prolog.states.bc_optimize_for_persp ||
2295 key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
2296 key->ps_prolog.states.samplemask_log_ps_iter;
2297 }
2298
2299 /**
2300 * Compute the PS epilog key, which contains all the information needed to
2301 * build the PS epilog function.
2302 */
si_get_ps_epilog_key(struct si_shader * shader,union si_shader_part_key * key)2303 void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
2304 {
2305 struct si_shader_info *info = &shader->selector->info;
2306 memset(key, 0, sizeof(*key));
2307 key->ps_epilog.wave32 = shader->wave_size == 32;
2308 key->ps_epilog.uses_discard = si_shader_uses_discard(shader);
2309 key->ps_epilog.colors_written = info->colors_written;
2310 key->ps_epilog.color_types = info->output_color_types;
2311 key->ps_epilog.writes_z = info->writes_z;
2312 key->ps_epilog.writes_stencil = info->writes_stencil;
2313 key->ps_epilog.writes_samplemask = info->writes_samplemask;
2314 key->ps_epilog.states = shader->key.ps.part.epilog;
2315 }
2316
/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 *
 * On success, shader->prolog (optional) and shader->epilog are set, and
 * shader->config.spi_ps_input_ena is fixed up so the enabled input bits are
 * consistent with the selected parts and the prolog key's forced
 * interpolation modes.
 *
 * \return false if compiling either part failed.
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                                      struct si_shader *shader, struct util_debug_callback *debug)
{
   union si_shader_part_key prolog_key;
   union si_shader_part_key epilog_key;

   /* Get the prolog. */
   si_get_ps_prolog_key(shader, &prolog_key, true);

   /* The prolog is a no-op if these aren't set. */
   if (si_need_ps_prolog(&prolog_key)) {
      shader->prolog =
         si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key,
                            compiler, debug, si_llvm_build_ps_prolog, "Fragment Shader Prolog");
      if (!shader->prolog)
         return false;
   }

   /* Get the epilog. The epilog is always required (it does the exports). */
   si_get_ps_epilog_key(shader, &epilog_key);

   shader->epilog =
      si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key,
                         compiler, debug, si_llvm_build_ps_epilog, "Fragment Shader Epilog");
   if (!shader->epilog)
      return false;

   /* Enable POS_FIXED_PT if polygon stippling is enabled. */
   if (shader->key.ps.part.prolog.poly_stipple) {
      shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
      /* The main shader must have requested this input (addr is a superset
       * of ena). */
      assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
   }

   /* Set up the enable bits for per-sample shading if needed.
    *
    * For each "force_*_sample/center" state: if the corresponding
    * center/centroid (resp. sample/centroid) weights are enabled, replace
    * them with the forced location so the prolog's overridden VGPRs are the
    * ones the hardware actually provides. */
   if (shader->key.ps.part.prolog.force_persp_sample_interp &&
       (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
   }
   if (shader->key.ps.part.prolog.force_linear_sample_interp &&
       (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
   }
   if (shader->key.ps.part.prolog.force_persp_center_interp &&
       (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
   }
   if (shader->key.ps.part.prolog.force_linear_center_interp &&
       (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
   }

   /* POS_W_FLOAT requires that one of the perspective weights is enabled.
    * (Bits 0-3 of spi_ps_input_ena are the four PERSP_*_ENA fields.) */
   if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
       !(shader->config.spi_ps_input_ena & 0xf)) {
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
      assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
   }

   /* At least one pair of interpolation weights must be enabled.
    * (Bits 0-6 cover all PERSP_* and LINEAR_* enables.) */
   if (!(shader->config.spi_ps_input_ena & 0x7f)) {
      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
      assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
   }

   /* Samplemask fixup requires the sample ID. */
   if (shader->key.ps.part.prolog.samplemask_log_ps_iter) {
      shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
      assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
   }

   return true;
}
2404
/* Intentional no-op: the early return disables the workaround entirely.
 * If tessellation is all offchip and on-chip GS isn't used, this
 * workaround is not needed.
 */
void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
{
   /* If tessellation is all offchip and on-chip GS isn't used, this
    * workaround is not needed.
    */
   return;

   /* NOTE(review): everything below is unreachable; presumably kept for
    * reference in case the workaround must be re-enabled — confirm against
    * git history before removing.
    *
    * SPI barrier management bug:
    * Make sure we have at least 4k of LDS in use to avoid the bug.
    * It applies to workgroup sizes of more than one wavefront.
    */
   if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
      *lds_size = MAX2(*lds_size, 8);
}
2419
si_fix_resource_usage(struct si_screen * sscreen,struct si_shader * shader)2420 void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
2421 {
2422 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
2423
2424 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
2425
2426 if (shader->selector->stage == MESA_SHADER_COMPUTE &&
2427 si_get_max_workgroup_size(shader) > shader->wave_size) {
2428 si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
2429 }
2430 }
2431
/**
 * Create a shader variant: either compile it monolithically or assemble it
 * from a pre-compiled main part plus selected prolog/epilog parts, then
 * finalize register/resource counts and upload the binary.
 *
 * \return false on any compile, allocation, or upload failure.
 */
bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                              struct si_shader *shader, struct util_debug_callback *debug)
{
   struct si_shader_selector *sel = shader->selector;
   struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);

   /* LS, ES, VS are compiled on demand if the main part hasn't been
    * compiled for that stage.
    *
    * GS are compiled on demand if the main part hasn't been compiled
    * for the chosen NGG-ness.
    *
    * Vertex shaders are compiled on demand when a vertex fetch
    * workaround must be applied.
    */
   if (shader->is_monolithic) {
      /* Monolithic shader (compiled as a whole, has many variants,
       * may take a long time to compile).
       */
      if (!si_compile_shader(sscreen, compiler, shader, debug))
         return false;
   } else {
      /* The shader consists of several parts:
       *
       * - the middle part is the user shader, it has 1 variant only
       *   and it was compiled during the creation of the shader
       *   selector
       * - the prolog part is inserted at the beginning
       * - the epilog part is inserted at the end
       *
       * The prolog and epilog have many (but simple) variants.
       *
       * Starting with gfx9, geometry and tessellation control
       * shaders also contain the prolog and user shader parts of
       * the previous shader stage.
       */

      if (!mainp)
         return false;

      /* Copy the compiled shader data over. The binary is shared with the
       * main part, so it must not be freed with this variant. */
      shader->is_binary_shared = true;
      shader->binary = mainp->binary;
      shader->config = mainp->config;
      shader->info = mainp->info;

      /* Select prologs and/or epilogs per stage. */
      switch (sel->stage) {
      case MESA_SHADER_VERTEX:
         if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
            return false;
         break;
      case MESA_SHADER_TESS_CTRL:
         if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
            return false;
         break;
      case MESA_SHADER_TESS_EVAL:
         /* TES has no prolog/epilog parts. */
         break;
      case MESA_SHADER_GEOMETRY:
         if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
            return false;

         /* Clone the GS copy shader for the shader variant.
          * We can't just copy the pointer because we change the pm4 state and
          * si_shader_selector::gs_copy_shader must be immutable because it's shared
          * by multiple contexts.
          */
         if (!shader->key.ge.as_ngg) {
            assert(sel->main_shader_part == mainp);
            assert(sel->main_shader_part->gs_copy_shader);
            assert(sel->main_shader_part->gs_copy_shader->bo);
            assert(!sel->main_shader_part->gs_copy_shader->previous_stage_sel);
            assert(!sel->main_shader_part->gs_copy_shader->scratch_bo);

            /* Shallow copy, then take a BO reference for the clone. */
            shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
            memcpy(shader->gs_copy_shader, sel->main_shader_part->gs_copy_shader,
                   sizeof(*shader->gs_copy_shader));
            /* Increase the reference count. */
            pipe_reference(NULL, &shader->gs_copy_shader->bo->b.b.reference);
            /* Initialize some fields differently. */
            shader->gs_copy_shader->shader_log = NULL;
            shader->gs_copy_shader->is_binary_shared = true;
            util_queue_fence_init(&shader->gs_copy_shader->ready);
         }
         break;
      case MESA_SHADER_FRAGMENT:
         if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
            return false;

         /* Make sure we have at least as many VGPRs as there
          * are allocated inputs.
          */
         shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
         break;
      default:;
      }

      assert(shader->wave_size == mainp->wave_size);
      assert(!shader->previous_stage || shader->wave_size == shader->previous_stage->wave_size);

      /* Update SGPR and VGPR counts: the merged shader's register usage is
       * the max over all of its parts. */
      if (shader->prolog) {
         shader->config.num_sgprs =
            MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs);
         shader->config.num_vgprs =
            MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs);
      }
      if (shader->previous_stage) {
         shader->config.num_sgprs =
            MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
         shader->config.num_vgprs =
            MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
         shader->config.spilled_sgprs =
            MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
         shader->config.spilled_vgprs =
            MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
         shader->info.private_mem_vgprs =
            MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
         shader->config.scratch_bytes_per_wave =
            MAX2(shader->config.scratch_bytes_per_wave,
                 shader->previous_stage->config.scratch_bytes_per_wave);
         /* Boolean usage flags are ORed, not maxed. */
         shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;
         shader->info.uses_vmem_load_other |= shader->previous_stage->info.uses_vmem_load_other;
         shader->info.uses_vmem_sampler_or_bvh |= shader->previous_stage->info.uses_vmem_sampler_or_bvh;
      }
      if (shader->epilog) {
         shader->config.num_sgprs =
            MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs);
         shader->config.num_vgprs =
            MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs);
      }
      si_calculate_max_simd_waves(shader);
   }

   /* NGG subgroup info (gfx10+) or legacy gfx9 GS info. */
   if (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
      assert(!shader->key.ge.as_es && !shader->key.ge.as_ls);
      if (!gfx10_ngg_calculate_subgroup_info(shader)) {
         fprintf(stderr, "Failed to compute subgroup info\n");
         return false;
      }
   } else if (sscreen->info.gfx_level >= GFX9 && sel->stage == MESA_SHADER_GEOMETRY) {
      gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
   }

   shader->uses_vs_state_provoking_vertex =
      sscreen->use_ngg &&
      /* Used to convert triangle strips from GS to triangles. */
      ((sel->stage == MESA_SHADER_GEOMETRY &&
        util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) ||
       (sel->stage == MESA_SHADER_VERTEX &&
        /* Used to export PrimitiveID from the correct vertex. */
        shader->key.ge.mono.u.vs_export_prim_id));

   shader->uses_gs_state_outprim = sscreen->use_ngg &&
                                   /* Only used by streamout in vertex shaders. */
                                   sel->stage == MESA_SHADER_VERTEX &&
                                   si_shader_uses_streamout(shader);

   /* Whether the variant consumes the base-instance VGPR/SGPR, directly or
    * via its merged previous stage (gfx9+ TCS/GS). */
   if (sel->stage == MESA_SHADER_VERTEX) {
      shader->uses_base_instance = sel->info.uses_base_instance ||
                                   shader->key.ge.part.vs.prolog.instance_divisor_is_one ||
                                   shader->key.ge.part.vs.prolog.instance_divisor_is_fetched;
   } else if (sel->stage == MESA_SHADER_TESS_CTRL) {
      shader->uses_base_instance = shader->previous_stage_sel &&
                                   (shader->previous_stage_sel->info.uses_base_instance ||
                                    shader->key.ge.part.tcs.ls_prolog.instance_divisor_is_one ||
                                    shader->key.ge.part.tcs.ls_prolog.instance_divisor_is_fetched);
   } else if (sel->stage == MESA_SHADER_GEOMETRY) {
      shader->uses_base_instance = shader->previous_stage_sel &&
                                   (shader->previous_stage_sel->info.uses_base_instance ||
                                    shader->key.ge.part.gs.vs_prolog.instance_divisor_is_one ||
                                    shader->key.ge.part.gs.vs_prolog.instance_divisor_is_fetched);
   }

   si_fix_resource_usage(sscreen, shader);

   /* Upload. Dump happens regardless of success so failures can be debugged. */
   bool ok = si_shader_binary_upload(sscreen, shader, 0);
   si_shader_dump(sscreen, shader, debug, stderr, true);

   if (!ok)
      fprintf(stderr, "LLVM failed to upload shader\n");
   return ok;
}
2616
si_shader_binary_clean(struct si_shader_binary * binary)2617 void si_shader_binary_clean(struct si_shader_binary *binary)
2618 {
2619 free((void *)binary->elf_buffer);
2620 binary->elf_buffer = NULL;
2621
2622 free(binary->llvm_ir_string);
2623 binary->llvm_ir_string = NULL;
2624
2625 free(binary->uploaded_code);
2626 binary->uploaded_code = NULL;
2627 binary->uploaded_code_size = 0;
2628 }
2629
si_shader_destroy(struct si_shader * shader)2630 void si_shader_destroy(struct si_shader *shader)
2631 {
2632 if (shader->scratch_bo)
2633 si_resource_reference(&shader->scratch_bo, NULL);
2634
2635 si_resource_reference(&shader->bo, NULL);
2636
2637 if (!shader->is_binary_shared)
2638 si_shader_binary_clean(&shader->binary);
2639
2640 free(shader->shader_log);
2641 }
2642