• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2012 Advanced Micro Devices, Inc.
3  * All Rights Reserved.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * on the rights to use, copy, modify, merge, publish, distribute, sub
9  * license, and/or sell copies of the Software, and to permit persons to whom
10  * the Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 #include "ac_nir.h"
26 #include "ac_rtld.h"
27 #include "nir.h"
28 #include "nir_builder.h"
29 #include "nir_serialize.h"
30 #include "nir/nir_helpers.h"
31 #include "ralloc.h"
32 #include "si_pipe.h"
33 #include "si_shader_internal.h"
34 #include "sid.h"
35 #include "tgsi/tgsi_from_mesa.h"
36 #include "tgsi/tgsi_strings.h"
37 #include "util/u_memory.h"
38 #include "util/mesa-sha1.h"
39 
/* Names of the linker symbols that hold the two dwords of the scratch buffer
 * resource descriptor. NOTE(review): presumably resolved/patched via ac_rtld
 * (included above) when scratch is in use — confirm against the rtld code. */
static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";

static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";

/* Defined later in this file; used by shader dumping. */
static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
46 /* Get the number of all interpolated inputs */
si_get_ps_num_interp(struct si_shader * ps)47 unsigned si_get_ps_num_interp(struct si_shader *ps)
48 {
49    struct si_shader_info *info = &ps->selector->info;
50    unsigned num_colors = !!(info->colors_read & 0x0f) + !!(info->colors_read & 0xf0);
51    unsigned num_interp =
52       ps->selector->info.num_inputs + (ps->key.ps.part.prolog.color_two_side ? num_colors : 0);
53 
54    assert(num_interp <= 32);
55    return MIN2(num_interp, 32);
56 }
57 
58 /** Whether the shader runs as a combination of multiple API shaders */
si_is_multi_part_shader(struct si_shader * shader)59 bool si_is_multi_part_shader(struct si_shader *shader)
60 {
61    if (shader->selector->screen->info.gfx_level <= GFX8 ||
62        shader->selector->stage > MESA_SHADER_GEOMETRY)
63       return false;
64 
65    return shader->key.ge.as_ls || shader->key.ge.as_es ||
66           shader->selector->stage == MESA_SHADER_TESS_CTRL ||
67           shader->selector->stage == MESA_SHADER_GEOMETRY;
68 }
69 
70 /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
si_is_merged_shader(struct si_shader * shader)71 bool si_is_merged_shader(struct si_shader *shader)
72 {
73    if (shader->selector->stage > MESA_SHADER_GEOMETRY)
74       return false;
75 
76    return shader->key.ge.as_ngg || si_is_multi_part_shader(shader);
77 }
78 
79 /**
80  * Returns a unique index for a per-patch semantic name and index. The index
81  * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
82  * can be calculated.
83  */
si_shader_io_get_unique_index_patch(unsigned semantic)84 unsigned si_shader_io_get_unique_index_patch(unsigned semantic)
85 {
86    switch (semantic) {
87    case VARYING_SLOT_TESS_LEVEL_OUTER:
88       return 0;
89    case VARYING_SLOT_TESS_LEVEL_INNER:
90       return 1;
91    default:
92       if (semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_PATCH0 + 30)
93          return 2 + (semantic - VARYING_SLOT_PATCH0);
94 
95       assert(!"invalid semantic");
96       return 0;
97    }
98 }
99 
100 /**
101  * Returns a unique index for a semantic name and index. The index must be
102  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
103  * calculated.
104  */
si_shader_io_get_unique_index(unsigned semantic,bool is_varying)105 unsigned si_shader_io_get_unique_index(unsigned semantic, bool is_varying)
106 {
107    switch (semantic) {
108    case VARYING_SLOT_POS:
109       return 0;
110    default:
111       /* Since some shader stages use the highest used IO index
112        * to determine the size to allocate for inputs/outputs
113        * (in LDS, tess and GS rings). GENERIC should be placed right
114        * after POSITION to make that size as small as possible.
115        */
116       if (semantic >= VARYING_SLOT_VAR0 && semantic <= VARYING_SLOT_VAR31)
117          return 1 + (semantic - VARYING_SLOT_VAR0); /* 1..32 */
118 
119       /* Put 16-bit GLES varyings after 32-bit varyings. They can use the same indices as
120        * legacy desktop GL varyings because they are mutually exclusive.
121        */
122       if (semantic >= VARYING_SLOT_VAR0_16BIT && semantic <= VARYING_SLOT_VAR15_16BIT)
123          return 33 + (semantic - VARYING_SLOT_VAR0_16BIT); /* 33..48 */
124 
125       assert(!"invalid generic index");
126       return 0;
127 
128    /* Legacy desktop GL varyings. */
129    case VARYING_SLOT_FOGC:
130       return 33;
131    case VARYING_SLOT_COL0:
132       return 34;
133    case VARYING_SLOT_COL1:
134       return 35;
135    case VARYING_SLOT_BFC0:
136       /* If it's a varying, COLOR and BCOLOR alias. */
137       if (is_varying)
138          return 34;
139       else
140          return 36;
141    case VARYING_SLOT_BFC1:
142       if (is_varying)
143          return 35;
144       else
145          return 37;
146    case VARYING_SLOT_TEX0:
147    case VARYING_SLOT_TEX1:
148    case VARYING_SLOT_TEX2:
149    case VARYING_SLOT_TEX3:
150    case VARYING_SLOT_TEX4:
151    case VARYING_SLOT_TEX5:
152    case VARYING_SLOT_TEX6:
153    case VARYING_SLOT_TEX7:
154       return 38 + (semantic - VARYING_SLOT_TEX0);
155    case VARYING_SLOT_CLIP_VERTEX:
156       return 46;
157 
158    /* Varyings present in both GLES and desktop GL must start at 49 after 16-bit varyings. */
159    case VARYING_SLOT_CLIP_DIST0:
160       return 49;
161    case VARYING_SLOT_CLIP_DIST1:
162       return 50;
163    case VARYING_SLOT_PSIZ:
164       return 51;
165 
166    /* These can't be written by LS, HS, and ES. */
167    case VARYING_SLOT_LAYER:
168       return 52;
169    case VARYING_SLOT_VIEWPORT:
170       return 53;
171    case VARYING_SLOT_PRIMITIVE_ID:
172       return 54;
173    }
174 }
175 
si_dump_streamout(struct pipe_stream_output_info * so)176 static void si_dump_streamout(struct pipe_stream_output_info *so)
177 {
178    unsigned i;
179 
180    if (so->num_outputs) {
181       fprintf(stderr, "STREAMOUT\n");
182 
183       fprintf(stderr, "  STRIDES: {");
184       for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++)
185          fprintf(stderr, "%u%s", so->stride[i], i < 3 ? ", " : "");
186       fprintf(stderr, "}\n");
187    }
188 
189    for (i = 0; i < so->num_outputs; i++) {
190       unsigned mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;
191       fprintf(stderr, "  %i: STREAM%u: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n",
192               i, so->output[i].stream, so->output[i].output_buffer,
193               so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
194               so->output[i].register_index, mask & 1 ? "x" : "", mask & 2 ? "y" : "",
195               mask & 4 ? "z" : "", mask & 8 ? "w" : "");
196    }
197 }
198 
/* Declare the SGPR inputs used by legacy (non-NGG) streamout.
 *
 * The order of ac_add_arg calls defines the shader's input SGPR layout, so
 * it must not be changed. */
static void declare_streamout_params(struct si_shader_context *ctx,
                                     struct pipe_stream_output_info *so)
{
   if (ctx->screen->use_ngg_streamout) {
      /* NGG streamout needs none of the SGPRs below. TES still reserves one
       * SGPR slot here — NOTE(review): presumably to keep a later argument
       * at a fixed position; confirm against the TES argument layout. */
      if (ctx->stage == MESA_SHADER_TESS_EVAL)
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      return;
   }

   /* Streamout SGPRs. */
   if (so->num_outputs) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_config);
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_write_index);
   } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
      /* No outputs: TES still declares one placeholder SGPR. */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   }

   /* A streamout buffer offset is loaded if the stride is non-zero. */
   for (int i = 0; i < 4; i++) {
      if (!so->stride[i])
         continue;

      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_offset[i]);
   }
}
224 
si_get_max_workgroup_size(const struct si_shader * shader)225 unsigned si_get_max_workgroup_size(const struct si_shader *shader)
226 {
227    switch (shader->selector->stage) {
228    case MESA_SHADER_VERTEX:
229    case MESA_SHADER_TESS_EVAL:
230       return shader->key.ge.as_ngg ? 128 : 0;
231 
232    case MESA_SHADER_TESS_CTRL:
233       /* Return this so that LLVM doesn't remove s_barrier
234        * instructions on chips where we use s_barrier. */
235       return shader->selector->screen->info.gfx_level >= GFX7 ? 128 : 0;
236 
237    case MESA_SHADER_GEOMETRY:
238       return shader->selector->screen->info.gfx_level >= GFX9 ? 128 : 0;
239 
240    case MESA_SHADER_COMPUTE:
241       break; /* see below */
242 
243    default:
244       return 0;
245    }
246 
247    /* Compile a variable block size using the maximum variable size. */
248    if (shader->selector->info.base.workgroup_size_variable)
249       return SI_MAX_VARIABLE_THREADS_PER_BLOCK;
250 
251    uint16_t *local_size = shader->selector->info.base.workgroup_size;
252    unsigned max_work_group_size = (uint32_t)local_size[0] *
253                                   (uint32_t)local_size[1] *
254                                   (uint32_t)local_size[2];
255    assert(max_work_group_size);
256    return max_work_group_size;
257 }
258 
/* Declare the SGPR pointer for constant and shader buffers. When the shader
 * uses exactly one UBO and no SSBOs, a direct AC_ARG_CONST_FLOAT_PTR is used
 * instead of a descriptor-list pointer. assign_params selects whether the
 * arg is stored for this stage or for the "other" stage of a merged shader. */
static void declare_const_and_shader_buffers(struct si_shader_context *ctx, bool assign_params)
{
   const struct si_shader_info *info = &ctx->shader->selector->info;
   bool single_ubo_only = info->base.num_ubos == 1 && info->base.num_ssbos == 0;
   enum ac_arg_type const_shader_buf_type =
      single_ubo_only ? AC_ARG_CONST_FLOAT_PTR : AC_ARG_CONST_DESC_PTR;
   struct ac_arg *dst = assign_params ? &ctx->const_and_shader_buffers
                                      : &ctx->other_const_and_shader_buffers;

   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type, dst);
}
273 
/* Declare the SGPR pointer for the sampler/image descriptor list.
 * assign_params selects this stage's slot vs. the "other" merged stage's. */
static void declare_samplers_and_images(struct si_shader_context *ctx, bool assign_params)
{
   struct ac_arg *dst = assign_params ? &ctx->samplers_and_images
                                      : &ctx->other_samplers_and_images;

   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, dst);
}
279 
/* Declare both per-stage descriptor pointers (const/shader buffers, then
 * samplers/images). The call order defines the SGPR layout. When
 * assign_params is false, the args are recorded in the other_* fields,
 * i.e. they belong to the other stage of a merged shader. */
static void declare_per_stage_desc_pointers(struct si_shader_context *ctx, bool assign_params)
{
   declare_const_and_shader_buffers(ctx, assign_params);
   declare_samplers_and_images(ctx, assign_params);
}
285 
/* Declare the SGPR pointers shared by all stages: internal bindings and the
 * bindless sampler/image descriptor array. Order defines the SGPR layout. */
static void declare_global_desc_pointers(struct si_shader_context *ctx)
{
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->internal_bindings);
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
              &ctx->bindless_samplers_and_images);
}
292 
/* Declare the vertex-buffer descriptor list pointer and, optionally, whole
 * vertex-buffer descriptors (4 dwords each) passed directly in user SGPRs. */
static void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
{
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->args.vertex_buffers);

   unsigned num_vbos_in_user_sgprs = ctx->shader->selector->info.num_vbos_in_user_sgprs;
   if (num_vbos_in_user_sgprs) {
      unsigned user_sgprs = ctx->args.num_sgprs_used;

      /* Merged stages have 8 system SGPRs at the beginning that don't count
       * as user SGPRs. */
      if (si_is_merged_shader(ctx->shader))
         user_sgprs -= 8;
      assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);

      /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
      for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

      assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
      for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
         ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
   }
}
314 
/* Declare the input VGPRs of a vertex shader (possibly running as LS).
 *
 * The VGPR layout is fixed by hardware and varies with the gfx level and
 * with whether the VS runs as LS, so the declaration order must not change.
 * *num_prolog_vgprs is incremented by the number of vertex-index VGPRs that
 * the prolog supplies. */
static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs)
{
   struct si_shader *shader = ctx->shader;

   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id);
   if (shader->key.ge.as_ls) {
      if (ctx->screen->info.gfx_level >= GFX11) {
         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
      } else if (ctx->screen->info.gfx_level >= GFX10) {
         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_rel_patch_id);
         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
      } else {
         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_rel_patch_id);
         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
      }
   } else if (ctx->screen->info.gfx_level >= GFX10) {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
                 &ctx->args.vs_prim_id); /* user vgpr or PrimID (legacy) */
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
   } else {
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_prim_id);
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
   }

   if (!shader->is_gs_copy_shader) {
      /* Vertex load indices: only the first one is tracked; the rest are
       * declared as anonymous VGPRs following it. */
      if (shader->selector->info.num_inputs) {
         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0);
         for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
            ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
      }
      *num_prolog_vgprs += shader->selector->info.num_inputs;
   }
}
355 
/* Declare the SGPR inputs of the internal VS used for blits. The layout
 * depends on the blit property: position only, position+color, or
 * position+texcoords. Only the first SGPR is tracked; the rest follow it. */
static void declare_vs_blit_inputs(struct si_shader_context *ctx, unsigned vs_blit_property)
{
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_blit_inputs); /* i16 x1, y1 */
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);                 /* i16 x2, y2 */
   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL);               /* depth */

   if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
   } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
   }
}
376 
/* Declare the four TES input VGPRs: the tess coordinates (u, v), the
 * relative patch ID, and the patch primitive ID. Order is fixed by hw. */
static void declare_tes_input_vgprs(struct si_shader_context *ctx)
{
   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.tes_u);
   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.tes_v);
   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_rel_patch_id);
   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
}
384 
enum
{
   /* Convenient merged shader definitions: pseudo-stage values used by
    * si_init_shader_args for GFX9+ merged HW stages. They start right after
    * the real MESA_SHADER_* stages so they never collide with them. */
   SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES,
   SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};
391 
/* Like ac_add_arg, but assert that the argument is added at the expected
 * index idx, keeping hardcoded parameter indices (e.g. the SI_PARAM_* values
 * used for fragment shaders) in sync with the actual argument layout. */
void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
                        enum ac_arg_type type, struct ac_arg *arg, unsigned idx)
{
   assert(args->arg_count == idx);
   ac_add_arg(args, file, registers, type, arg);
}
398 
si_init_shader_args(struct si_shader_context * ctx,bool ngg_cull_shader)399 void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
400 {
401    struct si_shader *shader = ctx->shader;
402    unsigned i, num_returns, num_return_sgprs;
403    unsigned num_prolog_vgprs = 0;
404    unsigned stage = ctx->stage;
405 
406    memset(&ctx->args, 0, sizeof(ctx->args));
407 
408    /* Set MERGED shaders. */
409    if (ctx->screen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY) {
410       if (shader->key.ge.as_ls || stage == MESA_SHADER_TESS_CTRL)
411          stage = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
412       else if (shader->key.ge.as_es || shader->key.ge.as_ngg || stage == MESA_SHADER_GEOMETRY)
413          stage = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
414    }
415 
416    switch (stage) {
417    case MESA_SHADER_VERTEX:
418       declare_global_desc_pointers(ctx);
419 
420       if (shader->selector->info.base.vs.blit_sgprs_amd) {
421          declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd);
422 
423          /* VGPRs */
424          declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
425          break;
426       }
427 
428       declare_per_stage_desc_pointers(ctx, true);
429       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
430 
431       if (ctx->shader->is_gs_copy_shader) {
432          declare_streamout_params(ctx, &ctx->so);
433          /* VGPRs */
434          declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
435          break;
436       }
437 
438       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
439       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
440       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
441       declare_vb_descriptor_input_sgprs(ctx);
442 
443       if (shader->key.ge.as_es) {
444          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.es2gs_offset);
445       } else if (shader->key.ge.as_ls) {
446          /* no extra parameters */
447       } else {
448          declare_streamout_params(ctx, &ctx->so);
449       }
450 
451       /* VGPRs */
452       declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
453       break;
454 
455    case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
456       declare_global_desc_pointers(ctx);
457       declare_per_stage_desc_pointers(ctx, true);
458       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
459       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
460       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
461       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
462       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
463       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset);
464 
465       /* VGPRs */
466       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
467       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
468 
469       /* param_tcs_offchip_offset and param_tcs_factor_offset are
470        * placed after the user SGPRs.
471        */
472       for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
473          ac_add_return(&ctx->args, AC_ARG_SGPR);
474       for (i = 0; i < 11; i++)
475          ac_add_return(&ctx->args, AC_ARG_VGPR);
476       break;
477 
478    case SI_SHADER_MERGED_VERTEX_TESSCTRL:
479       /* Merged stages have 8 system SGPRs at the beginning. */
480       /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
481       /* Gfx11+:  SPI_SHADER_PGM_LO/HI_HS */
482       declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_TESS_CTRL);
483       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
484       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.merged_wave_info);
485       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset);
486       if (ctx->screen->info.gfx_level >= GFX11)
487          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_wave_id);
488       else
489          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset);
490       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
491       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
492 
493       declare_global_desc_pointers(ctx);
494       declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_VERTEX);
495 
496       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
497       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
498       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
499       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
500       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
501       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
502       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
503       if (ctx->stage == MESA_SHADER_VERTEX)
504          declare_vb_descriptor_input_sgprs(ctx);
505 
506       /* VGPRs (first TCS, then VS) */
507       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
508       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
509 
510       if (ctx->stage == MESA_SHADER_VERTEX) {
511          declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
512 
513          /* LS return values are inputs to the TCS main shader part. */
514          for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
515             ac_add_return(&ctx->args, AC_ARG_SGPR);
516          for (i = 0; i < 2; i++)
517             ac_add_return(&ctx->args, AC_ARG_VGPR);
518 
519          /* VS outputs passed via VGPRs to TCS. */
520          if (shader->key.ge.opt.same_patch_vertices) {
521             unsigned num_outputs = util_last_bit64(shader->selector->info.outputs_written);
522             for (i = 0; i < num_outputs * 4; i++)
523                ac_add_return(&ctx->args, AC_ARG_VGPR);
524          }
525       } else {
526          /* TCS inputs are passed via VGPRs from VS. */
527          if (shader->key.ge.opt.same_patch_vertices) {
528             unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->info.outputs_written);
529             for (i = 0; i < num_inputs * 4; i++)
530                ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
531          }
532 
533          /* TCS return values are inputs to the TCS epilog.
534           *
535           * param_tcs_offchip_offset, param_tcs_factor_offset,
536           * param_tcs_offchip_layout, and internal_bindings
537           * should be passed to the epilog.
538           */
539          for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
540             ac_add_return(&ctx->args, AC_ARG_SGPR);
541          for (i = 0; i < 11; i++)
542             ac_add_return(&ctx->args, AC_ARG_VGPR);
543       }
544       break;
545 
546    case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
547       /* Merged stages have 8 system SGPRs at the beginning. */
548       /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
549       /* Gfx11+:  SPI_SHADER_PGM_LO/HI_GS */
550       declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_GEOMETRY);
551 
552       if (ctx->shader->key.ge.as_ngg)
553          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_tg_info);
554       else
555          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs2vs_offset);
556 
557       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.merged_wave_info);
558       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
559       if (ctx->screen->info.gfx_level >= GFX11)
560          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_attr_offset);
561       else
562          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset);
563       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
564       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
565 
566       declare_global_desc_pointers(ctx);
567       if (ctx->stage != MESA_SHADER_VERTEX || !shader->selector->info.base.vs.blit_sgprs_amd) {
568          declare_per_stage_desc_pointers(
569             ctx, (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL));
570       }
571 
572       if (ctx->stage == MESA_SHADER_VERTEX && shader->selector->info.base.vs.blit_sgprs_amd) {
573          declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd);
574       } else {
575          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
576 
577          if (ctx->stage == MESA_SHADER_VERTEX) {
578             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
579             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
580             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
581          } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
582             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
583             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
584             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
585          } else {
586             /* GS */
587             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
588             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
589             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
590          }
591 
592          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->small_prim_cull_info);
593          if (ctx->screen->info.gfx_level >= GFX11)
594             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->gs_attr_address);
595          else
596             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
597 
598          if (ctx->stage == MESA_SHADER_VERTEX)
599             declare_vb_descriptor_input_sgprs(ctx);
600       }
601 
602       /* VGPRs (first GS, then VS/TES) */
603       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]);
604       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]);
605       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
606       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
607       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]);
608 
609       if (ctx->stage == MESA_SHADER_VERTEX) {
610          declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
611       } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
612          declare_tes_input_vgprs(ctx);
613       }
614 
615       if ((ctx->shader->key.ge.as_es || ngg_cull_shader) &&
616           (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) {
617          unsigned num_user_sgprs, num_vgprs;
618 
619          if (ctx->stage == MESA_SHADER_VERTEX && ngg_cull_shader) {
620             /* For the NGG cull shader, add 1 SGPR to hold
621              * the vertex buffer pointer.
622              */
623             num_user_sgprs = GFX9_GS_NUM_USER_SGPR + 1;
624 
625             if (shader->selector->info.num_vbos_in_user_sgprs) {
626                assert(num_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
627                num_user_sgprs =
628                   SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->info.num_vbos_in_user_sgprs * 4;
629             }
630          } else {
631             num_user_sgprs = GFX9_GS_NUM_USER_SGPR;
632          }
633 
634          /* The NGG cull shader has to return all 9 VGPRs.
635           *
636           * The normal merged ESGS shader only has to return the 5 VGPRs
637           * for the GS stage.
638           */
639          num_vgprs = ngg_cull_shader ? 9 : 5;
640 
641          /* ES return values are inputs to GS. */
642          for (i = 0; i < 8 + num_user_sgprs; i++)
643             ac_add_return(&ctx->args, AC_ARG_SGPR);
644          for (i = 0; i < num_vgprs; i++)
645             ac_add_return(&ctx->args, AC_ARG_VGPR);
646       }
647       break;
648 
649    case MESA_SHADER_TESS_EVAL:
650       declare_global_desc_pointers(ctx);
651       declare_per_stage_desc_pointers(ctx, true);
652       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
653       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
654       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
655 
656       if (shader->key.ge.as_es) {
657          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
658          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
659          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.es2gs_offset);
660       } else {
661          declare_streamout_params(ctx, &ctx->so);
662          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
663       }
664 
665       /* VGPRs */
666       declare_tes_input_vgprs(ctx);
667       break;
668 
669    case MESA_SHADER_GEOMETRY:
670       declare_global_desc_pointers(ctx);
671       declare_per_stage_desc_pointers(ctx, true);
672       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs2vs_offset);
673       ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_wave_id);
674 
675       /* VGPRs */
676       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]);
677       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]);
678       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
679       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]);
680       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[3]);
681       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[4]);
682       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[5]);
683       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
684       break;
685 
686    case MESA_SHADER_FRAGMENT:
687       declare_global_desc_pointers(ctx);
688       declare_per_stage_desc_pointers(ctx, true);
689       si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, SI_PARAM_ALPHA_REF);
690       si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.prim_mask,
691                          SI_PARAM_PRIM_MASK);
692 
693       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,
694                          SI_PARAM_PERSP_SAMPLE);
695       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_center,
696                          SI_PARAM_PERSP_CENTER);
697       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_centroid,
698                          SI_PARAM_PERSP_CENTROID);
699       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);
700       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_sample,
701                          SI_PARAM_LINEAR_SAMPLE);
702       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_center,
703                          SI_PARAM_LINEAR_CENTER);
704       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid,
705                          SI_PARAM_LINEAR_CENTROID);
706       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
707       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0],
708                          SI_PARAM_POS_X_FLOAT);
709       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1],
710                          SI_PARAM_POS_Y_FLOAT);
711       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[2],
712                          SI_PARAM_POS_Z_FLOAT);
713       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[3],
714                          SI_PARAM_POS_W_FLOAT);
715       shader->info.face_vgpr_index = ctx->args.num_vgprs_used;
716       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.front_face,
717                          SI_PARAM_FRONT_FACE);
718       shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;
719       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.ancillary,
720                          SI_PARAM_ANCILLARY);
721       shader->info.sample_coverage_vgpr_index = ctx->args.num_vgprs_used;
722       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.sample_coverage,
723                          SI_PARAM_SAMPLE_COVERAGE);
724       si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->pos_fixed_pt,
725                          SI_PARAM_POS_FIXED_PT);
726 
727       /* Color inputs from the prolog. */
728       if (shader->selector->info.colors_read) {
729          unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);
730 
731          for (i = 0; i < num_color_elements; i++)
732             ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
733 
734          num_prolog_vgprs += num_color_elements;
735       }
736 
737       /* Outputs for the epilog. */
738       num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
739       num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +
740                     shader->selector->info.writes_z + shader->selector->info.writes_stencil +
741                     shader->selector->info.writes_samplemask + 1 /* SampleMaskIn */;
742 
743       for (i = 0; i < num_return_sgprs; i++)
744          ac_add_return(&ctx->args, AC_ARG_SGPR);
745       for (; i < num_returns; i++)
746          ac_add_return(&ctx->args, AC_ARG_VGPR);
747       break;
748 
749    case MESA_SHADER_COMPUTE:
750       declare_global_desc_pointers(ctx);
751       declare_per_stage_desc_pointers(ctx, true);
752       if (shader->selector->info.uses_grid_size)
753          ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups);
754       if (shader->selector->info.uses_variable_block_size)
755          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->block_size);
756 
757       unsigned cs_user_data_dwords =
758          shader->selector->info.base.cs.user_data_components_amd;
759       if (cs_user_data_dwords) {
760          ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &ctx->cs_user_data);
761       }
762 
763       /* Some descriptors can be in user SGPRs. */
764       /* Shader buffers in user SGPRs. */
765       for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) {
766          while (ctx->args.num_sgprs_used % 4 != 0)
767             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
768 
769          ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->cs_shaderbuf[i]);
770       }
771       /* Images in user SGPRs. */
772       for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) {
773          unsigned num_sgprs = BITSET_TEST(shader->selector->info.base.image_buffers, i) ? 4 : 8;
774 
775          while (ctx->args.num_sgprs_used % num_sgprs != 0)
776             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
777 
778          ac_add_arg(&ctx->args, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &ctx->cs_image[i]);
779       }
780 
781       /* Hardware SGPRs. */
782       for (i = 0; i < 3; i++) {
783          if (shader->selector->info.uses_block_id[i]) {
784             ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.workgroup_ids[i]);
785          }
786       }
787       if (shader->selector->info.uses_subgroup_info)
788          ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);
789 
790       /* Hardware VGPRs. */
791       /* Thread IDs are packed in VGPR0, 10 bits per component or stored in 3 separate VGPRs */
792       if (ctx->screen->info.gfx_level >= GFX11 ||
793           (!ctx->screen->info.has_graphics && ctx->screen->info.family >= CHIP_ALDEBARAN))
794          ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.local_invocation_ids);
795       else
796          ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, &ctx->args.local_invocation_ids);
797       break;
798    default:
799       assert(0 && "unimplemented shader");
800       return;
801    }
802 
803    shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
804    shader->info.num_input_vgprs = ctx->args.num_vgprs_used;
805 
806    assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
807    shader->info.num_input_vgprs -= num_prolog_vgprs;
808 }
809 
810 /* For the UMR disassembler. */
811 #define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
812 #define DEBUGGER_NUM_MARKERS        5
813 
/* Return the LDS allocation granularity in bytes for the given shader stage.
 * GFX11+ pixel shaders allocate LDS in 1024-byte units; GFX7+ uses 512 bytes;
 * older chips use 256 bytes.
 */
static unsigned get_lds_granularity(struct si_screen *screen, gl_shader_stage stage)
{
   if (screen->info.gfx_level >= GFX11 && stage == MESA_SHADER_FRAGMENT)
      return 1024;

   return screen->info.gfx_level >= GFX7 ? 512 : 256;
}
819 
/* Link all ELF parts of a shader (prolog, previous merged stage, main part,
 * epilog) into one runtime-linkable binary and record the resulting LDS
 * allocation in shader->config.lds_size (in allocation-granularity units).
 *
 * On success the caller owns "rtld" and must close it with ac_rtld_close().
 */
static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
                                  struct ac_rtld_binary *rtld)
{
   const struct si_shader_selector *sel = shader->selector;
   const char *part_elfs[5];
   size_t part_sizes[5];
   unsigned num_parts = 0;

/* Append one ELF part to the list if that part exists. */
#define add_part(shader_or_part)                                                                   \
   if (shader_or_part) {                                                                           \
      part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer;                                  \
      part_sizes[num_parts] = (shader_or_part)->binary.elf_size;                                   \
      num_parts++;                                                                                 \
   }

   add_part(shader->prolog);
   add_part(shader->previous_stage);
   add_part(shader);
   add_part(shader->epilog);

#undef add_part

   struct ac_rtld_symbol lds_symbols[2];
   unsigned num_lds_symbols = 0;

   /* GFX9+ merged ESGS (or NGG geometry-path) shaders keep the ES->GS ring
    * in LDS; export it as a shared LDS symbol so the linker lays it out.
    * NOTE(review): "sel" is NULL-checked here but dereferenced unconditionally
    * below (sel->stage) — presumably it can never actually be NULL; confirm.
    */
   if (sel && screen->info.gfx_level >= GFX9 && !shader->is_gs_copy_shader &&
       (sel->stage == MESA_SHADER_GEOMETRY ||
        (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg))) {
      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
      sym->name = "esgs_ring";
      sym->size = shader->gs_info.esgs_ring_size * 4; /* dwords -> bytes */
      sym->align = 64 * 1024;
   }

   /* NGG GS additionally needs LDS space for vertex emission. */
   if (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
      sym->name = "ngg_emit";
      sym->size = shader->ngg.ngg_emit_size * 4; /* dwords -> bytes */
      sym->align = 4;
   }

   bool ok = ac_rtld_open(
      rtld, (struct ac_rtld_open_info){.info = &screen->info,
                                       .options =
                                          {
                                             .halt_at_entry = screen->options.halt_shaders,
                                          },
                                       .shader_type = sel->stage,
                                       .wave_size = shader->wave_size,
                                       .num_parts = num_parts,
                                       .elf_ptrs = part_elfs,
                                       .elf_sizes = part_sizes,
                                       .num_shared_lds_symbols = num_lds_symbols,
                                       .shared_lds_symbols = lds_symbols});

   /* Convert the linked LDS byte size into hardware allocation units. */
   if (rtld->lds_size > 0) {
      unsigned alloc_granularity = get_lds_granularity(screen, sel->stage);
      shader->config.lds_size = DIV_ROUND_UP(rtld->lds_size, alloc_granularity);
   }

   return ok;
}
882 
si_get_shader_binary_size(struct si_screen * screen,struct si_shader * shader)883 static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
884 {
885    struct ac_rtld_binary rtld;
886    si_shader_binary_open(screen, shader, &rtld);
887    uint64_t size = rtld.exec_size;
888    ac_rtld_close(&rtld);
889    return size;
890 }
891 
si_get_external_symbol(enum amd_gfx_level gfx_level,void * data,const char * name,uint64_t * value)892 static bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
893                                    uint64_t *value)
894 {
895    uint64_t *scratch_va = data;
896 
897    if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
898       *value = (uint32_t)*scratch_va;
899       return true;
900    }
901    if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
902       /* Enable scratch coalescing. */
903       *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32);
904 
905       if (gfx_level >= GFX11)
906          *value |= S_008F04_SWIZZLE_ENABLE_GFX11(1);
907       else
908          *value |= S_008F04_SWIZZLE_ENABLE_GFX6(1);
909       return true;
910    }
911 
912    return false;
913 }
914 
si_shader_binary_upload(struct si_screen * sscreen,struct si_shader * shader,uint64_t scratch_va)915 bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
916                              uint64_t scratch_va)
917 {
918    struct ac_rtld_binary binary;
919    if (!si_shader_binary_open(sscreen, shader, &binary))
920       return false;
921 
922    si_resource_reference(&shader->bo, NULL);
923    shader->bo = si_aligned_buffer_create(
924       &sscreen->b,
925       (sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY) |
926       SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT,
927       PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256);
928    if (!shader->bo)
929       return false;
930 
931    /* Upload. */
932    struct ac_rtld_upload_info u = {};
933    u.binary = &binary;
934    u.get_external_symbol = si_get_external_symbol;
935    u.cb_data = &scratch_va;
936    u.rx_va = shader->bo->gpu_address;
937    u.rx_ptr = sscreen->ws->buffer_map(sscreen->ws,
938       shader->bo->buf, NULL,
939       PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
940    if (!u.rx_ptr)
941       return false;
942 
943    int size = ac_rtld_upload(&u);
944 
945    if (sscreen->debug_flags & DBG(SQTT)) {
946       /* Remember the uploaded code */
947       shader->binary.uploaded_code_size = size;
948       shader->binary.uploaded_code = malloc(size);
949       memcpy(shader->binary.uploaded_code, u.rx_ptr, size);
950    }
951 
952    sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf);
953    ac_rtld_close(&binary);
954 
955    return size >= 0;
956 }
957 
/* Dump the ".AMDGPU.disasm" section of a shader binary, either line by line
 * through the debug callback, to "file", or both. Silently returns if the
 * binary has no disassembly section or it is unreasonably large.
 */
static void si_shader_dump_disassembly(struct si_screen *screen,
                                       const struct si_shader_binary *binary,
                                       gl_shader_stage stage, unsigned wave_size,
                                       struct util_debug_callback *debug, const char *name,
                                       FILE *file)
{
   struct ac_rtld_binary rtld_binary;

   if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
                                      .info = &screen->info,
                                      .shader_type = stage,
                                      .wave_size = wave_size,
                                      .num_parts = 1,
                                      .elf_ptrs = &binary->elf_buffer,
                                      .elf_sizes = &binary->elf_size}))
      return;

   const char *disasm;
   size_t nbytes;

   if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
      goto out;

   /* "%.*s" takes an int precision below; bail on absurd sizes. */
   if (nbytes > INT_MAX)
      goto out;

   if (debug && debug->debug_message) {
      /* Very long debug messages are cut off, so send the
       * disassembly one line at a time. This causes more
       * overhead, but on the plus side it simplifies
       * parsing of resulting logs.
       */
      util_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");

      uint64_t line = 0;
      while (line < nbytes) {
         int count = nbytes - line;
         const char *nl = memchr(disasm + line, '\n', nbytes - line);
         if (nl)
            count = nl - (disasm + line);

         if (count) {
            util_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
         }

         line += count + 1;
      }

      util_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
   }

   if (file) {
      fprintf(file, "Shader %s disassembly:\n", name);
      /* Fix: use "%.*s" (precision) instead of "%*s" (field width). The
       * section data is not guaranteed to be NUL-terminated, so a width
       * specifier would read past "nbytes" looking for the terminator. */
      fprintf(file, "%.*s", (int)nbytes, disasm);
   }

out:
   ac_rtld_close(&rtld_binary);
}
1017 
/* Estimate the maximum number of waves per SIMD this shader can reach,
 * limited by SGPR count, VGPR count, and LDS usage, and store the result in
 * shader->info.max_simd_waves. Wave counts are expressed in Wave64 units so
 * Wave32 and Wave64 shaders are comparable (e.g. in shader-db).
 */
static void si_calculate_max_simd_waves(struct si_shader *shader)
{
   struct si_screen *sscreen = shader->selector->screen;
   struct ac_shader_config *conf = &shader->config;
   unsigned num_inputs = shader->selector->info.num_inputs;
   unsigned lds_increment = get_lds_granularity(sscreen, shader->selector->stage);
   unsigned lds_per_wave = 0;
   unsigned max_simd_waves;

   /* Start from the hardware limit and clamp it down below. */
   max_simd_waves = sscreen->info.max_wave64_per_simd;

   /* Compute LDS usage for PS. */
   switch (shader->selector->stage) {
   case MESA_SHADER_FRAGMENT:
      /* The minimum usage per wave is (num_inputs * 48). The maximum
       * usage is (num_inputs * 48 * 16).
       * We can get anything in between and it varies between waves.
       *
       * The 48 bytes per input for a single primitive is equal to
       * 4 bytes/component * 4 components/input * 3 points.
       *
       * Other stages don't know the size at compile time or don't
       * allocate LDS per wave, but instead they do it per thread group.
       */
      lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment);
      break;
   case MESA_SHADER_COMPUTE: {
         /* Spread the workgroup's LDS allocation across its waves. */
         unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
         lds_per_wave = (conf->lds_size * lds_increment) /
                        DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
      }
      break;
   default:;
   }

   /* Compute the per-SIMD wave counts. */
   if (conf->num_sgprs) {
      max_simd_waves =
         MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
   }

   if (conf->num_vgprs) {
      /* Always print wave limits as Wave64, so that we can compare
       * Wave32 and Wave64 with shader-db fairly. */
      unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
      max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
   }

   /* LDS limit per SIMD (a workgroup's LDS is shared by 4 SIMDs). */
   unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
   if (lds_per_wave)
      max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);

   shader->info.max_simd_waves = max_simd_waves;
}
1072 
si_shader_dump_stats_for_shader_db(struct si_screen * screen,struct si_shader * shader,struct util_debug_callback * debug)1073 void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
1074                                         struct util_debug_callback *debug)
1075 {
1076    const struct ac_shader_config *conf = &shader->config;
1077    static const char *stages[] = {"VS", "TCS", "TES", "GS", "PS", "CS"};
1078 
1079    if (screen->options.debug_disassembly)
1080       si_shader_dump_disassembly(screen, &shader->binary, shader->selector->stage,
1081                                  shader->wave_size, debug, "main", NULL);
1082 
1083    util_debug_message(debug, SHADER_INFO,
1084                       "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
1085                       "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
1086                       "Spilled VGPRs: %d PrivMem VGPRs: %d DivergentLoop: %d, InlineUniforms: %d, "
1087                       "ParamExports: %u, (%s, W%u)",
1088                       conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
1089                       conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
1090                       conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
1091                       shader->selector->info.has_divergent_loop,
1092                       shader->selector->info.base.num_inlinable_uniforms,
1093                       shader->info.nr_param_exports,
1094                       stages[shader->selector->stage], shader->wave_size);
1095 }
1096 
si_shader_dump_stats(struct si_screen * sscreen,struct si_shader * shader,FILE * file,bool check_debug_option)1097 static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
1098                                  bool check_debug_option)
1099 {
1100    const struct ac_shader_config *conf = &shader->config;
1101 
1102    if (!check_debug_option || si_can_dump_shader(sscreen, shader->selector->stage)) {
1103       if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
1104          fprintf(file,
1105                  "*** SHADER CONFIG ***\n"
1106                  "SPI_PS_INPUT_ADDR = 0x%04x\n"
1107                  "SPI_PS_INPUT_ENA  = 0x%04x\n",
1108                  conf->spi_ps_input_addr, conf->spi_ps_input_ena);
1109       }
1110 
1111       fprintf(file,
1112               "*** SHADER STATS ***\n"
1113               "SGPRS: %d\n"
1114               "VGPRS: %d\n"
1115               "Spilled SGPRs: %d\n"
1116               "Spilled VGPRs: %d\n"
1117               "Private memory VGPRs: %d\n"
1118               "Code Size: %d bytes\n"
1119               "LDS: %d bytes\n"
1120               "Scratch: %d bytes per wave\n"
1121               "Max Waves: %d\n"
1122               "********************\n\n\n",
1123               conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
1124               shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
1125               conf->lds_size * get_lds_granularity(sscreen, shader->selector->stage),
1126               conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
1127    }
1128 }
1129 
si_get_shader_name(const struct si_shader * shader)1130 const char *si_get_shader_name(const struct si_shader *shader)
1131 {
1132    switch (shader->selector->stage) {
1133    case MESA_SHADER_VERTEX:
1134       if (shader->key.ge.as_es)
1135          return "Vertex Shader as ES";
1136       else if (shader->key.ge.as_ls)
1137          return "Vertex Shader as LS";
1138       else if (shader->key.ge.as_ngg)
1139          return "Vertex Shader as ESGS";
1140       else
1141          return "Vertex Shader as VS";
1142    case MESA_SHADER_TESS_CTRL:
1143       return "Tessellation Control Shader";
1144    case MESA_SHADER_TESS_EVAL:
1145       if (shader->key.ge.as_es)
1146          return "Tessellation Evaluation Shader as ES";
1147       else if (shader->key.ge.as_ngg)
1148          return "Tessellation Evaluation Shader as ESGS";
1149       else
1150          return "Tessellation Evaluation Shader as VS";
1151    case MESA_SHADER_GEOMETRY:
1152       if (shader->is_gs_copy_shader)
1153          return "GS Copy Shader as VS";
1154       else
1155          return "Geometry Shader";
1156    case MESA_SHADER_FRAGMENT:
1157       return "Pixel Shader";
1158    case MESA_SHADER_COMPUTE:
1159       return "Compute Shader";
1160    default:
1161       return "Unknown Shader";
1162    }
1163 }
1164 
si_shader_dump(struct si_screen * sscreen,struct si_shader * shader,struct util_debug_callback * debug,FILE * file,bool check_debug_option)1165 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
1166                     struct util_debug_callback *debug, FILE *file, bool check_debug_option)
1167 {
1168    gl_shader_stage stage = shader->selector->stage;
1169 
1170    if (!check_debug_option || si_can_dump_shader(sscreen, stage))
1171       si_dump_shader_key(shader, file);
1172 
1173    if (!check_debug_option && shader->binary.llvm_ir_string) {
1174       if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
1175          fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
1176          fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
1177       }
1178 
1179       fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
1180       fprintf(file, "%s\n", shader->binary.llvm_ir_string);
1181    }
1182 
1183    if (!check_debug_option ||
1184        (si_can_dump_shader(sscreen, stage) && !(sscreen->debug_flags & DBG(NO_ASM)))) {
1185 
1186       fprintf(file, "\n%s:\n", si_get_shader_name(shader));
1187 
1188       if (shader->prolog)
1189          si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, shader->wave_size, debug,
1190                                     "prolog", file);
1191       if (shader->previous_stage)
1192          si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
1193                                     shader->wave_size, debug, "previous stage", file);
1194       si_shader_dump_disassembly(sscreen, &shader->binary, stage, shader->wave_size, debug, "main",
1195                                  file);
1196 
1197       if (shader->epilog)
1198          si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, shader->wave_size, debug,
1199                                     "epilog", file);
1200       fprintf(file, "\n");
1201    }
1202 
1203    si_shader_dump_stats(sscreen, shader, file, check_debug_option);
1204 }
1205 
si_dump_shader_key_vs(const union si_shader_key * key,const struct si_vs_prolog_bits * prolog,const char * prefix,FILE * f)1206 static void si_dump_shader_key_vs(const union si_shader_key *key,
1207                                   const struct si_vs_prolog_bits *prolog, const char *prefix,
1208                                   FILE *f)
1209 {
1210    fprintf(f, "  %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one);
1211    fprintf(f, "  %s.instance_divisor_is_fetched = %u\n", prefix,
1212            prolog->instance_divisor_is_fetched);
1213    fprintf(f, "  %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix);
1214 
1215    fprintf(f, "  mono.vs.fetch_opencode = %x\n", key->ge.mono.vs_fetch_opencode);
1216    fprintf(f, "  mono.vs.fix_fetch = {");
1217    for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
1218       union si_vs_fix_fetch fix = key->ge.mono.vs_fix_fetch[i];
1219       if (i)
1220          fprintf(f, ", ");
1221       if (!fix.bits)
1222          fprintf(f, "0");
1223       else
1224          fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
1225                  fix.u.format);
1226    }
1227    fprintf(f, "}\n");
1228 }
1229 
/* Print the full shader key to "f" in a stable, human-readable format:
 * the source SHA1, stage-specific prolog/epilog/mono bits, and the
 * optimization flags shared by the geometry-pipeline stages.
 */
static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
{
   const union si_shader_key *key = &shader->key;
   gl_shader_stage stage = shader->selector->stage;

   fprintf(f, "SHADER KEY\n");
   fprintf(f, "  source_sha1 = {");
   _mesa_sha1_print(f, shader->selector->info.base.source_sha1);
   fprintf(f, "}\n");

   /* Stage-specific key fields. */
   switch (stage) {
   case MESA_SHADER_VERTEX:
      si_dump_shader_key_vs(key, &key->ge.part.vs.prolog, "part.vs.prolog", f);
      fprintf(f, "  as_es = %u\n", key->ge.as_es);
      fprintf(f, "  as_ls = %u\n", key->ge.as_ls);
      fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
      fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
      break;

   case MESA_SHADER_TESS_CTRL:
      /* On GFX9+, the LS stage is merged into TCS, so the key carries its
       * VS prolog bits too. */
      if (shader->selector->screen->info.gfx_level >= GFX9) {
         si_dump_shader_key_vs(key, &key->ge.part.tcs.ls_prolog, "part.tcs.ls_prolog", f);
      }
      fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->ge.part.tcs.epilog.prim_mode);
      fprintf(f, "  opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
      fprintf(f, "  opt.same_patch_vertices = %u\n", key->ge.opt.same_patch_vertices);
      break;

   case MESA_SHADER_TESS_EVAL:
      fprintf(f, "  as_es = %u\n", key->ge.as_es);
      fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
      fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
      break;

   case MESA_SHADER_GEOMETRY:
      if (shader->is_gs_copy_shader)
         break;

      /* On GFX9+, the ES stage is merged into GS; print its VS prolog bits
       * when the ES is a vertex shader. */
      if (shader->selector->screen->info.gfx_level >= GFX9 &&
          key->ge.part.gs.es->stage == MESA_SHADER_VERTEX) {
         si_dump_shader_key_vs(key, &key->ge.part.gs.vs_prolog, "part.gs.vs_prolog", f);
      }
      fprintf(f, "  mono.u.gs_tri_strip_adj_fix = %u\n", key->ge.mono.u.gs_tri_strip_adj_fix);
      fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
      break;

   case MESA_SHADER_COMPUTE:
      break;

   case MESA_SHADER_FRAGMENT:
      fprintf(f, "  prolog.color_two_side = %u\n", key->ps.part.prolog.color_two_side);
      fprintf(f, "  prolog.flatshade_colors = %u\n", key->ps.part.prolog.flatshade_colors);
      fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.part.prolog.poly_stipple);
      fprintf(f, "  prolog.force_persp_sample_interp = %u\n",
              key->ps.part.prolog.force_persp_sample_interp);
      fprintf(f, "  prolog.force_linear_sample_interp = %u\n",
              key->ps.part.prolog.force_linear_sample_interp);
      fprintf(f, "  prolog.force_persp_center_interp = %u\n",
              key->ps.part.prolog.force_persp_center_interp);
      fprintf(f, "  prolog.force_linear_center_interp = %u\n",
              key->ps.part.prolog.force_linear_center_interp);
      fprintf(f, "  prolog.bc_optimize_for_persp = %u\n",
              key->ps.part.prolog.bc_optimize_for_persp);
      fprintf(f, "  prolog.bc_optimize_for_linear = %u\n",
              key->ps.part.prolog.bc_optimize_for_linear);
      fprintf(f, "  prolog.samplemask_log_ps_iter = %u\n",
              key->ps.part.prolog.samplemask_log_ps_iter);
      fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n",
              key->ps.part.epilog.spi_shader_col_format);
      fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.part.epilog.color_is_int8);
      fprintf(f, "  epilog.color_is_int10 = 0x%X\n", key->ps.part.epilog.color_is_int10);
      fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.part.epilog.last_cbuf);
      fprintf(f, "  epilog.alpha_func = %u\n", key->ps.part.epilog.alpha_func);
      fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.part.epilog.alpha_to_one);
      fprintf(f, "  epilog.alpha_to_coverage_via_mrtz = %u\n", key->ps.part.epilog.alpha_to_coverage_via_mrtz);
      fprintf(f, "  epilog.clamp_color = %u\n", key->ps.part.epilog.clamp_color);
      fprintf(f, "  epilog.dual_src_blend_swizzle = %u\n", key->ps.part.epilog.dual_src_blend_swizzle);
      fprintf(f, "  mono.poly_line_smoothing = %u\n", key->ps.mono.poly_line_smoothing);
      fprintf(f, "  mono.point_smoothing = %u\n", key->ps.mono.point_smoothing);
      fprintf(f, "  mono.interpolate_at_sample_force_center = %u\n",
              key->ps.mono.interpolate_at_sample_force_center);
      fprintf(f, "  mono.fbfetch_msaa = %u\n", key->ps.mono.fbfetch_msaa);
      fprintf(f, "  mono.fbfetch_is_1D = %u\n", key->ps.mono.fbfetch_is_1D);
      fprintf(f, "  mono.fbfetch_layered = %u\n", key->ps.mono.fbfetch_layered);
      break;

   default:
      assert(0);
   }

   /* Output-killing optimizations only apply to the last pre-raster stage
    * (not ES/LS variants). */
   if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||
        stage == MESA_SHADER_VERTEX) &&
       !key->ge.as_es && !key->ge.as_ls) {
      fprintf(f, "  opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs);
      fprintf(f, "  opt.kill_pointsize = 0x%x\n", key->ge.opt.kill_pointsize);
      fprintf(f, "  opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances);
      fprintf(f, "  opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling);
   }

   /* "ge" and "ps" are union members; pick the right one by stage. */
   if (stage <= MESA_SHADER_GEOMETRY)
      fprintf(f, "  opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
   else
      fprintf(f, "  opt.prefer_mono = %u\n", key->ps.opt.prefer_mono);

   if (stage <= MESA_SHADER_GEOMETRY) {
      if (key->ge.opt.inline_uniforms) {
         fprintf(f, "  opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
                 key->ge.opt.inline_uniforms,
                 key->ge.opt.inlined_uniform_values[0],
                 key->ge.opt.inlined_uniform_values[1],
                 key->ge.opt.inlined_uniform_values[2],
                 key->ge.opt.inlined_uniform_values[3]);
      } else {
         fprintf(f, "  opt.inline_uniforms = 0\n");
      }
   } else {
      if (key->ps.opt.inline_uniforms) {
         fprintf(f, "  opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
                 key->ps.opt.inline_uniforms,
                 key->ps.opt.inlined_uniform_values[0],
                 key->ps.opt.inlined_uniform_values[1],
                 key->ps.opt.inlined_uniform_values[2],
                 key->ps.opt.inlined_uniform_values[3]);
      } else {
         fprintf(f, "  opt.inline_uniforms = 0\n");
      }
   }
}
1358 
si_vs_needs_prolog(const struct si_shader_selector * sel,const struct si_vs_prolog_bits * prolog_key,const union si_shader_key * key,bool ngg_cull_shader,bool is_gs)1359 bool si_vs_needs_prolog(const struct si_shader_selector *sel,
1360                         const struct si_vs_prolog_bits *prolog_key,
1361                         const union si_shader_key *key, bool ngg_cull_shader,
1362                         bool is_gs)
1363 {
1364    assert(sel->stage == MESA_SHADER_VERTEX);
1365 
1366    /* VGPR initialization fixup for Vega10 and Raven is always done in the
1367     * VS prolog. */
1368    return sel->info.vs_needs_prolog || prolog_key->ls_vgpr_fix ||
1369           /* The 2nd VS prolog loads input VGPRs from LDS */
1370           (key->ge.opt.ngg_culling && !ngg_cull_shader && !is_gs);
1371 }
1372 
/**
 * Compute the VS prolog key, which contains all the information needed to
 * build the VS prolog function, and set shader->info bits where needed.
 *
 * \param info             Shader info of the vertex shader.
 * \param num_input_sgprs  Number of input SGPRs for the vertex shader.
 * \param ngg_cull_shader  Whether the preceding shader part is the NGG cull shader.
 * \param prolog_key       Key of the VS prolog
 * \param shader_out       The vertex shader, or the next shader if merging LS+HS or ES+GS.
 * \param key              Output shader part key.
 */
void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs,
                          bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key,
                          struct si_shader *shader_out, union si_shader_part_key *key)
{
   memset(key, 0, sizeof(*key));
   key->vs_prolog.states = *prolog_key;
   key->vs_prolog.wave32 = shader_out->wave_size == 32;
   key->vs_prolog.num_input_sgprs = num_input_sgprs;
   key->vs_prolog.num_inputs = info->num_inputs;
   key->vs_prolog.as_ls = shader_out->key.ge.as_ls;
   key->vs_prolog.as_es = shader_out->key.ge.as_es;
   key->vs_prolog.as_ngg = shader_out->key.ge.as_ngg;

   /* The 2nd prolog after NGG culling reloads input VGPRs from LDS
    * (GS never does; see si_vs_needs_prolog).
    */
   if (shader_out->selector->stage != MESA_SHADER_GEOMETRY &&
       !ngg_cull_shader && shader_out->key.ge.opt.ngg_culling)
      key->vs_prolog.load_vgprs_after_culling = 1;

   /* Merged-shader cases (LS+HS, ES+GS): force the corresponding mode and
    * record how many next-stage VGPRs precede the VS input VGPRs.
    */
   if (shader_out->selector->stage == MESA_SHADER_TESS_CTRL) {
      key->vs_prolog.as_ls = 1;
      key->vs_prolog.num_merged_next_stage_vgprs = 2;
   } else if (shader_out->selector->stage == MESA_SHADER_GEOMETRY) {
      key->vs_prolog.as_es = 1;
      key->vs_prolog.num_merged_next_stage_vgprs = 5;
   } else if (shader_out->key.ge.as_ngg) {
      key->vs_prolog.num_merged_next_stage_vgprs = 5;
   }

   /* Only one of these combinations can be set. as_ngg can be set with as_es. */
   assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg +
          (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) <= 1);

   /* Enable loading the InstanceID VGPR. */
   uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);

   /* If any enabled input uses an instance divisor, the shader reads
    * InstanceID; reflect that in shader_out->info.
    */
   if ((key->vs_prolog.states.instance_divisor_is_one |
        key->vs_prolog.states.instance_divisor_is_fetched) &
       input_mask)
      shader_out->info.uses_instanceid = true;
}
1423 
/* TODO: convert to nir_shader_instructions_pass */
/* Remove output stores that the shader key says are unused by later stages:
 * killed varyings (kill_outputs), gl_PointSize (kill_pointsize), and clip
 * distances / gl_ClipVertex (kill_clip_distances). Returns true on progress.
 */
static bool si_nir_kill_outputs(nir_shader *nir, const union si_shader_key *key)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);

   /* Early out: only geometry-pipeline stages use the ge key, and there is
    * nothing to do when no kill bits are set.
    */
   if (nir->info.stage > MESA_SHADER_GEOMETRY ||
       (!key->ge.opt.kill_outputs &&
        !key->ge.opt.kill_pointsize &&
        !key->ge.opt.kill_clip_distances)) {
      nir_metadata_preserve(impl, nir_metadata_all);
      return false;
   }

   bool progress = false;

   nir_builder b;
   nir_builder_init(&b, impl);

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         assert(intr->num_components == 1); /* only scalar stores expected */
         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

         /* Drop varying stores whose unique slot index is in the kill mask. */
         if (nir_slot_is_varying(sem.location) &&
             key->ge.opt.kill_outputs &
             (1ull << si_shader_io_get_unique_index(sem.location, true))) {
            nir_remove_varying(intr);
            progress = true;
         }

         if (key->ge.opt.kill_pointsize && sem.location == VARYING_SLOT_PSIZ) {
            nir_remove_sysval_output(intr);
            progress = true;
         }

         /* TODO: We should only kill specific clip planes as required by kill_clip_distance,
          * not whole gl_ClipVertex. Lower ClipVertex in NIR.
          */
         if ((key->ge.opt.kill_clip_distances & SI_USER_CLIP_PLANE_MASK) == SI_USER_CLIP_PLANE_MASK &&
             sem.location == VARYING_SLOT_CLIP_VERTEX) {
            nir_remove_sysval_output(intr);
            progress = true;
         }

         /* Kill individual clip-distance components selected by the key.
          * CLIP_DIST0 holds components 0-3, CLIP_DIST1 holds 4-7.
          */
         if (key->ge.opt.kill_clip_distances &&
             (sem.location == VARYING_SLOT_CLIP_DIST0 ||
              sem.location == VARYING_SLOT_CLIP_DIST1)) {
            assert(nir_intrinsic_src_type(intr) == nir_type_float32);
            unsigned index = (sem.location - VARYING_SLOT_CLIP_DIST0) * 4 +
                             nir_intrinsic_component(intr);

            if ((key->ge.opt.kill_clip_distances >> index) & 0x1) {
               nir_remove_sysval_output(intr);
               progress = true;
            }
         }
      }
   }

   /* Instructions were removed but control flow was not changed, so
    * dominance and block indices remain valid.
    */
   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_dominance |
                                  nir_metadata_block_index);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}
1504 
si_map_io_driver_location(unsigned semantic)1505 static unsigned si_map_io_driver_location(unsigned semantic)
1506 {
1507    if ((semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_TESS_MAX) ||
1508        semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
1509        semantic == VARYING_SLOT_TESS_LEVEL_OUTER)
1510       return si_shader_io_get_unique_index_patch(semantic);
1511 
1512    return si_shader_io_get_unique_index(semantic, false);
1513 }
1514 
/* Lower I/O that crosses merged-stage boundaries to memory/LDS accesses,
 * depending on the stage and the variant key. Returns true if any lowering
 * pass ran, which tells the caller to re-run NIR optimizations and offset
 * folding afterwards.
 */
static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
                               uint64_t tcs_vgpr_only_inputs)
{
   struct si_shader_selector *sel = shader->selector;
   const union si_shader_key *key = &shader->key;

   if (nir->info.stage == MESA_SHADER_VERTEX) {
      if (key->ge.as_ls) {
         /* VS before TCS (merged LS+HS): lower LS outputs. */
         NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem, si_map_io_driver_location,
                    key->ge.opt.same_patch_vertices, tcs_vgpr_only_inputs);
         return true;
      } else if (key->ge.as_es) {
         /* VS before GS: lower ES outputs (esgs_itemsize sizes the ring entry). */
         NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
                    sel->screen->info.gfx_level, sel->info.esgs_itemsize);
         return true;
      }
   } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
      NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem, si_map_io_driver_location,
                 key->ge.opt.same_patch_vertices);
      NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, si_map_io_driver_location,
                 sel->screen->info.gfx_level,
                 false, /* does not matter as we disabled final tess factor write */
                 ~0ULL, ~0ULL, /* no TES inputs filter */
                 util_last_bit64(sel->info.outputs_written),
                 util_last_bit64(sel->info.patch_outputs_written),
                 shader->wave_size,
                 /* ALL TCS inputs are passed by register. */
                 key->ge.opt.same_patch_vertices &&
                 !(sel->info.base.inputs_read & ~sel->info.tcs_vgpr_only_inputs),
                 sel->info.tessfactors_are_def_in_all_invocs, false);
      return true;
   } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
      NIR_PASS_V(nir, ac_nir_lower_tes_inputs_to_mem, si_map_io_driver_location);

      /* TES before GS also writes ES outputs. */
      if (key->ge.as_es) {
         NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
                    sel->screen->info.gfx_level, sel->info.esgs_itemsize);
      }

      return true;
   } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      NIR_PASS_V(nir, ac_nir_lower_gs_inputs_to_mem, si_map_io_driver_location,
                 sel->screen->info.gfx_level, key->ge.mono.u.gs_tri_strip_adj_fix);
      return true;
   }

   /* Other stages (e.g. FS, CS) have no cross-stage I/O to lower here. */
   return false;
}
1563 
si_deserialize_shader(struct si_shader_selector * sel)1564 struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel)
1565 {
1566    struct pipe_screen *screen = &sel->screen->b;
1567    const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
1568                                                       pipe_shader_type_from_mesa(sel->stage));
1569 
1570    struct blob_reader blob_reader;
1571    blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
1572    return nir_deserialize(NULL, options, &blob_reader);
1573 }
1574 
/* Get the NIR shader for a variant (from the selector or by deserializing
 * its blob) and run all key-dependent lowering and optimization passes on it.
 * On success *free_nir tells the caller whether it owns the returned shader.
 * Returns NULL if the selector has neither a NIR shader nor a serialized one.
 */
struct nir_shader *si_get_nir_shader(struct si_shader *shader, bool *free_nir,
                                     uint64_t tcs_vgpr_only_inputs)
{
   struct si_shader_selector *sel = shader->selector;
   const union si_shader_key *key = &shader->key;

   nir_shader *nir;
   *free_nir = false;

   if (sel->nir) {
      nir = sel->nir;
   } else if (sel->nir_binary) {
      nir = si_deserialize_shader(sel);
      *free_nir = true;
   } else {
      return NULL;
   }

   bool progress = false;

   const char *original_name = NULL;
   if (unlikely(should_print_nir(nir))) {
      /* Modify the shader's name so that each variant gets its own name. */
      original_name = ralloc_strdup(nir, nir->info.name);
      ralloc_asprintf_append((char **)&nir->info.name, "-%08x", _mesa_hash_data(key, sizeof(*key)));

      /* Dummy pass to get the starting point. */
      printf("nir_dummy_pass\n");
      nir_print_shader(nir, stdout);
   }

   /* Kill outputs according to the shader key. */
   if (sel->stage <= MESA_SHADER_GEOMETRY)
      NIR_PASS(progress, nir, si_nir_kill_outputs, key);

   bool inline_uniforms = false;
   uint32_t *inlined_uniform_values;
   si_get_inline_uniform_state((union si_shader_key*)key, sel->pipe_shader_type,
                               &inline_uniforms, &inlined_uniform_values);

   if (inline_uniforms) {
      /* Inlining mutates the shader, so it must be a private copy. */
      assert(*free_nir);

      /* Most places use shader information from the default variant, not
       * the optimized variant. These are the things that the driver looks at
       * in optimized variants and the list of things that we need to do.
       *
       * The driver takes into account these things if they suddenly disappear
       * from the shader code:
       * - Register usage and code size decrease (obvious)
       * - Eliminated PS system values are disabled by LLVM
       *   (FragCoord, FrontFace, barycentrics)
       * - VS/TES/GS outputs feeding PS are eliminated if outputs are undef.
       *   The storage for eliminated outputs is also not allocated.
       * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
       * - TCS output stores are eliminated
       *
       * TODO: These are things the driver ignores in the final shader code
       * and relies on the default shader info.
       * - Other system values are not eliminated
       * - PS.NUM_INTERP = bitcount64(inputs_read), renumber inputs
       *   to remove holes
       * - uses_discard - if it changed to false
       * - writes_memory - if it changed to false
       * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not
       *   eliminated
       * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS)
       *   GS outputs are eliminated except for the temporary LDS.
       *   Clip distances, gl_PointSize, and PS outputs are eliminated based
       *   on current states, so we don't care about the shader code.
       *
       * TODO: Merged shaders don't inline uniforms for the first stage.
       * VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS.
       * (key == NULL for the first stage here)
       *
       * TODO: Compute shaders don't support inlinable uniforms, because they
       * don't have shader variants.
       *
       * TODO: The driver uses a linear search to find a shader variant. This
       * can be really slow if we get too many variants due to uniform inlining.
       */
      NIR_PASS_V(nir, nir_inline_uniforms,
                 nir->info.num_inlinable_uniforms,
                 inlined_uniform_values,
                 nir->info.inlinable_uniform_dw_offsets);
      progress = true;
   }

   if (sel->stage == MESA_SHADER_FRAGMENT && key->ps.mono.poly_line_smoothing)
      NIR_PASS(progress, nir, nir_lower_poly_line_smooth, SI_NUM_SMOOTH_AA_SAMPLES);

   if (sel->stage == MESA_SHADER_FRAGMENT && key->ps.mono.point_smoothing)
      NIR_PASS(progress, nir, nir_lower_point_smooth);

   if (progress)
      si_nir_opts(sel->screen, nir, true);

   /* Lower large variables that are always constant with load_constant intrinsics, which
    * get turned into PC-relative loads from a data section next to the shader.
    *
    * Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
    * this should be done after that.
    *
    * The pass crashes if there are dead temps of lowered IO interface types, so remove
    * them first.
    */
   bool progress2 = false;
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
   NIR_PASS(progress2, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);

   /* Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
    * this should be done after that.
    */
   progress2 |= ac_nir_lower_indirect_derefs(nir, sel->screen->info.gfx_level);

   bool opt_offsets = si_lower_io_to_mem(shader, nir, tcs_vgpr_only_inputs);

   if (progress2 || opt_offsets)
      si_nir_opts(sel->screen, nir, false);

   /* Fold constant offsets produced by the I/O-to-memory lowering into the
    * intrinsics' base/offset fields.
    */
   if (opt_offsets) {
      static const nir_opt_offsets_options offset_options = {
         .uniform_max = 0,
         .buffer_max = ~0,
         .shared_max = ~0,
      };
      NIR_PASS_V(nir, nir_opt_offsets, &offset_options);
   }

   if (progress || progress2 || opt_offsets)
      si_nir_late_opts(nir);

   NIR_PASS_V(nir, nir_divergence_analysis);

   /* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
    * 200 is tuned for Viewperf. It should be done last.
    */
   NIR_PASS_V(nir, nir_group_loads, nir_group_same_resource_only, 200);

   /* Restore the original name that was replaced for variant printing. */
   if (unlikely(original_name)) {
      ralloc_free((void*)nir->info.name);
      nir->info.name = original_name;
   }

   return nir;
}
1721 
/* Scan the given NIR shader and OR its VMEM-usage flags into the compiled
 * shader's binary info.
 */
void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir)
{
   struct si_shader_info scanned;

   si_nir_scan_shader(shader->selector->screen, nir, &scanned);

   shader->info.uses_vmem_load_other |= scanned.uses_vmem_load_other;
   shader->info.uses_vmem_sampler_or_bvh |= scanned.uses_vmem_sampler_or_bvh;
}
1730 
/* Assign consecutive param-export indices to every non-constant varying
 * output store in the shader, filling vs_output_param_offset (indexed by
 * semantic), num_param_exports, and output_param_mask (indexed by output
 * base). Slots with a slot_remap entry are duplicates and are redirected to
 * the offset of the slot they duplicate.
 */
static void si_nir_assign_param_offsets(nir_shader *nir, const struct si_shader_info *info,
                                        int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS],
                                        uint8_t *num_param_exports, uint64_t *output_param_mask,
                                        uint8_t vs_output_param_offset[NUM_TOTAL_VARYING_SLOTS])
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         assert(intr->num_components == 1); /* only scalar stores expected */
         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

         /* Assign the param index if it's unassigned. Only varyings that are
          * actually exported (not no_varying, stream 0) get an index;
          * AC_EXP_PARAM_DEFAULT_VAL_0000 marks "not yet assigned".
          */
         if (nir_slot_is_varying(sem.location) && !sem.no_varying &&
             (sem.gs_streams & 0x3) == 0 &&
             vs_output_param_offset[sem.location] == AC_EXP_PARAM_DEFAULT_VAL_0000) {
            /* The semantic and the base should be the same as in si_shader_info. */
            assert(sem.location == info->output_semantic[nir_intrinsic_base(intr)]);
            /* It must not be remapped (duplicated). */
            assert(slot_remap[sem.location] == -1);

            vs_output_param_offset[sem.location] = (*num_param_exports)++;
            *output_param_mask |= BITFIELD64_BIT(nir_intrinsic_base(intr));
         }
      }
   }

   /* Duplicated outputs are redirected here. */
   for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
      if (slot_remap[i] >= 0)
         vs_output_param_offset[i] = vs_output_param_offset[slot_remap[i]];
   }
}
1776 
/* Compile one shader variant: fetch and lower its NIR, assign param-export
 * slots for the last pre-rasterization stage, run the LLVM backend, compile
 * the GS copy shader if needed, and fill the shader's runtime info
 * (PS input controls, input SGPR/VGPR counts, etc.). Returns true on success.
 */
bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                       struct si_shader *shader, struct util_debug_callback *debug)
{
   struct si_shader_selector *sel = shader->selector;
   bool free_nir;
   struct nir_shader *nir = si_get_nir_shader(shader, &free_nir, 0);

   /* Assign param export indices. */
   /* Only the stage that feeds the rasterizer exports params: VS/TES not
    * running as LS/ES, or GS in NGG mode.
    */
   if ((sel->stage == MESA_SHADER_VERTEX ||
        sel->stage == MESA_SHADER_TESS_EVAL ||
        (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)) &&
       !shader->key.ge.as_ls && !shader->key.ge.as_es) {
      /* Initialize this first. */
      shader->info.nr_param_exports = 0;
      shader->info.vs_output_param_mask = 0;

      STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
      /* DEFAULT_VAL_0000 doubles as the "unassigned" marker consumed by
       * si_nir_assign_param_offsets.
       */
      memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
             sizeof(shader->info.vs_output_param_offset));

      /* A slot remapping table for duplicated outputs, so that 1 vertex shader output can be
       * mapped to multiple fragment shader inputs.
       */
      int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS];
      memset(slot_remap, -1, NUM_TOTAL_VARYING_SLOTS);

      /* This sets DEFAULT_VAL for constant outputs in vs_output_param_offset. */
      /* TODO: This doesn't affect GS. */
      NIR_PASS_V(nir, ac_nir_optimize_outputs, false, slot_remap,
                 shader->info.vs_output_param_offset);

      /* Assign the non-constant outputs. */
      /* TODO: Use this for the GS copy shader too. */
      si_nir_assign_param_offsets(nir, &sel->info, slot_remap, &shader->info.nr_param_exports,
                                  &shader->info.vs_output_param_mask,
                                  shader->info.vs_output_param_offset);

      /* PrimitiveID is exported as an extra param appended after the
       * regular outputs.
       */
      if (shader->key.ge.mono.u.vs_export_prim_id) {
         shader->info.vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = shader->info.nr_param_exports++;
         shader->info.vs_output_param_mask |= BITFIELD64_BIT(sel->info.num_outputs);
      }
   }

   struct pipe_stream_output_info so = {};
   if (si_shader_uses_streamout(shader))
      nir_gather_stream_output_info(nir, &so);

   /* Dump NIR before doing NIR->LLVM conversion in case the
    * conversion fails. */
   if (si_can_dump_shader(sscreen, sel->stage) &&
       !(sscreen->debug_flags & DBG(NO_NIR))) {
      nir_print_shader(nir, stderr);
      si_dump_streamout(&so);
   }

   /* Initialize vs_output_ps_input_cntl to default. */
   for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++)
      shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED;
   shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0;

   si_update_shader_binary_info(shader, nir);

   shader->info.uses_instanceid = sel->info.uses_instanceid;
   /* Scratch size is in bytes; VGPRs are 4 bytes per lane. */
   shader->info.private_mem_vgprs = DIV_ROUND_UP(nir->scratch_size, 4);

   /* Set the FP ALU behavior. */
   /* By default, we disable denormals for FP32 and enable them for FP16 and FP64
    * for performance and correctness reasons. FP32 denormals can't be enabled because
    * they break output modifiers and v_mad_f32 and are very slow on GFX6-7.
    *
    * float_controls_execution_mode defines the set of valid behaviors. Contradicting flags
    * can be set simultaneously, which means we are allowed to choose, but not really because
    * some options cause GLCTS failures.
    */
   unsigned float_mode = V_00B028_FP_16_64_DENORMS;

   /* FP32 round-towards-zero, only when RTE isn't also requested. */
   if (!(nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) &&
       nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
      float_mode |= V_00B028_FP_32_ROUND_TOWARDS_ZERO;

   /* FP16/FP64 share one hardware rounding-mode setting. */
   if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
                                                    FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64)) &&
       nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
                                                  FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
      float_mode |= V_00B028_FP_16_64_ROUND_TOWARDS_ZERO;

   /* FP16/FP64 flush-to-zero when requested and preserve isn't. */
   if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_PRESERVE_FP16 |
                                                    FLOAT_CONTROLS_DENORM_PRESERVE_FP64)) &&
       nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
                                                  FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64))
      float_mode &= ~V_00B028_FP_16_64_DENORMS;

   /* TODO: ACO could compile non-monolithic shaders here (starting
    * with PS and NGG VS), but monolithic shaders should be compiled
    * by LLVM due to more complicated compilation.
    */
   if (!si_llvm_compile_shader(sscreen, compiler, shader, &so, debug, nir, free_nir))
      return false;

   shader->config.float_mode = float_mode;

   /* The GS copy shader is compiled next. */
   if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
      shader->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, &so, debug);
      if (!shader->gs_copy_shader) {
         fprintf(stderr, "radeonsi: can't create GS copy shader\n");
         return false;
      }
   }

   /* Compute vs_output_ps_input_cntl. */
   if ((sel->stage == MESA_SHADER_VERTEX ||
        sel->stage == MESA_SHADER_TESS_EVAL ||
        sel->stage == MESA_SHADER_GEOMETRY) &&
       !shader->key.ge.as_ls && !shader->key.ge.as_es) {
      ubyte *vs_output_param_offset = shader->info.vs_output_param_offset;

      /* Legacy (non-NGG) GS exports via its copy shader, so use that
       * shader's param offsets.
       */
      if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg)
         vs_output_param_offset = shader->gs_copy_shader->info.vs_output_param_offset;

      /* We must use the original shader info before the removal of duplicated shader outputs. */
      /* VS and TES should also set primitive ID output if it's used. */
      unsigned num_outputs_with_prim_id = sel->info.num_outputs +
                                          shader->key.ge.mono.u.vs_export_prim_id;

      for (unsigned i = 0; i < num_outputs_with_prim_id; i++) {
         unsigned semantic = sel->info.output_semantic[i];
         unsigned offset = vs_output_param_offset[semantic];
         unsigned ps_input_cntl;

         if (offset <= AC_EXP_PARAM_OFFSET_31) {
            /* The input is loaded from parameter memory. */
            ps_input_cntl = S_028644_OFFSET(offset);
         } else {
            /* The input is a DEFAULT_VAL constant. */
            assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
                   offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
            offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;

            /* OFFSET=0x20 means that DEFAULT_VAL is used. */
            ps_input_cntl = S_028644_OFFSET(0x20) |
                            S_028644_DEFAULT_VAL(offset);
         }

         shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl;
      }
   }

   /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */
   if (sel->stage == MESA_SHADER_COMPUTE) {
      unsigned max_vgprs =
         sscreen->info.num_physical_wave64_vgprs_per_simd * (shader->wave_size == 32 ? 2 : 1);
      unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
      unsigned max_sgprs_per_wave = 128;
      unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
      unsigned threads_per_tg = si_get_max_workgroup_size(shader);
      unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, shader->wave_size);
      unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);

      max_vgprs = max_vgprs / waves_per_simd;
      max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);

      if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
         fprintf(stderr,
                 "LLVM failed to compile a shader correctly: "
                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
                 shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);

         /* Just terminate the process, because dependent
          * shaders can hang due to bad input data, but use
          * the env var to allow shader-db to work.
          */
         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
            abort();
      }
   }

   /* Add the scratch offset to input SGPRs. */
   if (sel->screen->info.gfx_level < GFX11 &&
       shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader))
      shader->info.num_input_sgprs += 1; /* scratch byte offset */

   /* Calculate the number of fragment input VGPRs. */
   if (sel->stage == MESA_SHADER_FRAGMENT) {
      shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(
         &shader->config, &shader->info.face_vgpr_index, &shader->info.ancillary_vgpr_index,
         &shader->info.sample_coverage_vgpr_index);
   }

   si_calculate_max_simd_waves(shader);
   si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
   return true;
}
1970 
1971 /**
1972  * Create, compile and return a shader part (prolog or epilog).
1973  *
1974  * \param sscreen  screen
1975  * \param list     list of shader parts of the same category
1976  * \param type     shader type
1977  * \param key      shader part key
1978  * \param prolog   whether the part being requested is a prolog
1979  * \param tm       LLVM target machine
1980  * \param debug    debug callback
1981  * \param build    the callback responsible for building the main function
1982  * \return         non-NULL on success
1983  */
1984 static struct si_shader_part *
si_get_shader_part(struct si_screen * sscreen,struct si_shader_part ** list,gl_shader_stage stage,bool prolog,union si_shader_part_key * key,struct ac_llvm_compiler * compiler,struct util_debug_callback * debug,void (* build)(struct si_shader_context *,union si_shader_part_key *),const char * name)1985 si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
1986                    gl_shader_stage stage, bool prolog, union si_shader_part_key *key,
1987                    struct ac_llvm_compiler *compiler, struct util_debug_callback *debug,
1988                    void (*build)(struct si_shader_context *, union si_shader_part_key *),
1989                    const char *name)
1990 {
1991    struct si_shader_part *result;
1992 
1993    simple_mtx_lock(&sscreen->shader_parts_mutex);
1994 
1995    /* Find existing. */
1996    for (result = *list; result; result = result->next) {
1997       if (memcmp(&result->key, key, sizeof(*key)) == 0) {
1998          simple_mtx_unlock(&sscreen->shader_parts_mutex);
1999          return result;
2000       }
2001    }
2002 
2003    /* Compile a new one. */
2004    result = CALLOC_STRUCT(si_shader_part);
2005    result->key = *key;
2006 
2007    struct si_shader_selector sel = {};
2008    sel.screen = sscreen;
2009 
2010    struct si_shader shader = {};
2011    shader.selector = &sel;
2012    bool wave32 = false;
2013 
2014    switch (stage) {
2015    case MESA_SHADER_VERTEX:
2016       shader.key.ge.as_ls = key->vs_prolog.as_ls;
2017       shader.key.ge.as_es = key->vs_prolog.as_es;
2018       shader.key.ge.as_ngg = key->vs_prolog.as_ngg;
2019       wave32 = key->vs_prolog.wave32;
2020       break;
2021    case MESA_SHADER_TESS_CTRL:
2022       assert(!prolog);
2023       shader.key.ge.part.tcs.epilog = key->tcs_epilog.states;
2024       wave32 = key->tcs_epilog.wave32;
2025       break;
2026    case MESA_SHADER_FRAGMENT:
2027       if (prolog) {
2028          shader.key.ps.part.prolog = key->ps_prolog.states;
2029          wave32 = key->ps_prolog.wave32;
2030       } else {
2031          shader.key.ps.part.epilog = key->ps_epilog.states;
2032          wave32 = key->ps_epilog.wave32;
2033       }
2034       break;
2035    default:
2036       unreachable("bad shader part");
2037    }
2038 
2039    struct si_shader_context ctx;
2040    si_llvm_context_init(&ctx, sscreen, compiler, wave32 ? 32 : 64);
2041 
2042    ctx.shader = &shader;
2043    ctx.stage = stage;
2044 
2045    build(&ctx, key);
2046 
2047    /* Compile. */
2048    si_llvm_optimize_module(&ctx);
2049 
2050    if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, &ctx.ac, debug,
2051                         ctx.stage, name, false)) {
2052       FREE(result);
2053       result = NULL;
2054       goto out;
2055    }
2056 
2057    result->next = *list;
2058    *list = result;
2059 
2060 out:
2061    si_llvm_dispose(&ctx);
2062    simple_mtx_unlock(&sscreen->shader_parts_mutex);
2063    return result;
2064 }
2065 
si_get_vs_prolog(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug,struct si_shader * main_part,const struct si_vs_prolog_bits * key)2066 static bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2067                              struct si_shader *shader, struct util_debug_callback *debug,
2068                              struct si_shader *main_part, const struct si_vs_prolog_bits *key)
2069 {
2070    struct si_shader_selector *vs = main_part->selector;
2071 
2072    if (!si_vs_needs_prolog(vs, key, &shader->key, false,
2073                            shader->selector->stage == MESA_SHADER_GEOMETRY))
2074       return true;
2075 
2076    /* Get the prolog. */
2077    union si_shader_part_key prolog_key;
2078    si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader,
2079                         &prolog_key);
2080 
2081    shader->prolog =
2082       si_get_shader_part(sscreen, &sscreen->vs_prologs, MESA_SHADER_VERTEX, true, &prolog_key,
2083                          compiler, debug, si_llvm_build_vs_prolog, "Vertex Shader Prolog");
2084    return shader->prolog != NULL;
2085 }
2086 
2087 /**
2088  * Select and compile (or reuse) vertex shader parts (prolog & epilog).
2089  */
si_shader_select_vs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)2090 static bool si_shader_select_vs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2091                                       struct si_shader *shader, struct util_debug_callback *debug)
2092 {
2093    return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, &shader->key.ge.part.vs.prolog);
2094 }
2095 
si_get_tcs_epilog_key(struct si_shader * shader,union si_shader_part_key * key)2096 void si_get_tcs_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
2097 {
2098    memset(key, 0, sizeof(*key));
2099    key->tcs_epilog.wave32 = shader->wave_size == 32;
2100    key->tcs_epilog.states = shader->key.ge.part.tcs.epilog;
2101 
2102    /* If output patches are wholly in one wave, we don't need a barrier. */
2103    key->tcs_epilog.noop_s_barrier =
2104       shader->wave_size % shader->selector->info.base.tess.tcs_vertices_out == 0;
2105 }
2106 
2107 /**
2108  * Select and compile (or reuse) TCS parts (epilog).
2109  */
si_shader_select_tcs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)2110 static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2111                                        struct si_shader *shader, struct util_debug_callback *debug)
2112 {
2113    if (sscreen->info.gfx_level >= GFX9) {
2114       struct si_shader *ls_main_part = shader->key.ge.part.tcs.ls->main_shader_part_ls;
2115 
2116       if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
2117                             &shader->key.ge.part.tcs.ls_prolog))
2118          return false;
2119 
2120       shader->previous_stage = ls_main_part;
2121    }
2122 
2123    /* Get the epilog. */
2124    union si_shader_part_key epilog_key;
2125    si_get_tcs_epilog_key(shader, &epilog_key);
2126 
2127    shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, MESA_SHADER_TESS_CTRL, false,
2128                                        &epilog_key, compiler, debug, si_llvm_build_tcs_epilog,
2129                                        "Tessellation Control Shader Epilog");
2130    return shader->epilog != NULL;
2131 }
2132 
2133 /**
2134  * Select and compile (or reuse) GS parts (prolog).
2135  */
si_shader_select_gs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)2136 static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
2137                                       struct si_shader *shader, struct util_debug_callback *debug)
2138 {
2139    if (sscreen->info.gfx_level >= GFX9) {
2140       struct si_shader *es_main_part;
2141 
2142       if (shader->key.ge.as_ngg)
2143          es_main_part = shader->key.ge.part.gs.es->main_shader_part_ngg_es;
2144       else
2145          es_main_part = shader->key.ge.part.gs.es->main_shader_part_es;
2146 
2147       if (shader->key.ge.part.gs.es->stage == MESA_SHADER_VERTEX &&
2148           !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
2149                             &shader->key.ge.part.gs.vs_prolog))
2150          return false;
2151 
2152       shader->previous_stage = es_main_part;
2153    }
2154 
2155    return true;
2156 }
2157 
/**
 * Compute the PS prolog key, which contains all the information needed to
 * build the PS prolog function, and set related bits in shader->config.
 *
 * \param shader          the pixel shader
 * \param key             output key; zeroed then filled in
 * \param separate_prolog true when the prolog is a separate shader part
 *                        (non-monolithic); in that case the SPI_PS_INPUT_ENA
 *                        bits required by color interpolation are also set
 *                        in shader->config.
 */
void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key,
                          bool separate_prolog)
{
   struct si_shader_info *info = &shader->selector->info;

   memset(key, 0, sizeof(*key));
   key->ps_prolog.states = shader->key.ps.part.prolog;
   key->ps_prolog.wave32 = shader->wave_size == 32;
   key->ps_prolog.colors_read = info->colors_read;
   key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
   key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
   /* WQM is requested when helper invocations need correct values and the
    * prolog actually does interpolation-related work (colors read or any
    * forced/optimized interpolation mode).
    */
   key->ps_prolog.wqm =
      info->base.fs.needs_quad_helper_invocations &&
      (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
       key->ps_prolog.states.force_linear_sample_interp ||
       key->ps_prolog.states.force_persp_center_interp ||
       key->ps_prolog.states.force_linear_center_interp ||
       key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear);
   key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
   key->ps_prolog.sample_coverage_vgpr_index = shader->info.sample_coverage_vgpr_index;

   /* The polygon-stipple prolog reads the stipple pattern from memory. */
   if (shader->key.ps.part.prolog.poly_stipple)
      shader->info.uses_vmem_load_other = true;

   if (info->colors_read) {
      ubyte *color = shader->selector->info.color_attr_index;

      if (shader->key.ps.part.prolog.color_two_side) {
         /* BCOLORs are stored after the last input. */
         key->ps_prolog.num_interp_inputs = info->num_inputs;
         key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
         if (separate_prolog)
            shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
      }

      /* Two color inputs at most (COLOR0/COLOR1); colors_read holds 4 bits
       * (one per component) per color.
       */
      for (unsigned i = 0; i < 2; i++) {
         unsigned interp = info->color_interpolate[i];
         unsigned location = info->color_interpolate_loc[i];

         if (!(info->colors_read & (0xf << i * 4)))
            continue;

         key->ps_prolog.color_attr_index[i] = color[i];

         if (shader->key.ps.part.prolog.flatshade_colors && interp == INTERP_MODE_COLOR)
            interp = INTERP_MODE_FLAT;

         switch (interp) {
         case INTERP_MODE_FLAT:
            /* -1 = no interpolation weights needed (flat shading). */
            key->ps_prolog.color_interp_vgpr_index[i] = -1;
            break;
         case INTERP_MODE_SMOOTH:
         case INTERP_MODE_COLOR:
            /* Force the interpolation location for colors here. */
            if (shader->key.ps.part.prolog.force_persp_sample_interp)
               location = TGSI_INTERPOLATE_LOC_SAMPLE;
            if (shader->key.ps.part.prolog.force_persp_center_interp)
               location = TGSI_INTERPOLATE_LOC_CENTER;

            /* VGPR indices 0/2/4 pick the PERSP sample/center/centroid (i,j)
             * pairs at the start of the PS input VGPR layout.
             */
            switch (location) {
            case TGSI_INTERPOLATE_LOC_SAMPLE:
               key->ps_prolog.color_interp_vgpr_index[i] = 0;
               if (separate_prolog) {
                  shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
               }
               break;
            case TGSI_INTERPOLATE_LOC_CENTER:
               key->ps_prolog.color_interp_vgpr_index[i] = 2;
               if (separate_prolog) {
                  shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
               }
               break;
            case TGSI_INTERPOLATE_LOC_CENTROID:
               key->ps_prolog.color_interp_vgpr_index[i] = 4;
               if (separate_prolog) {
                  shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
               }
               break;
            default:
               assert(0);
            }
            break;
         case INTERP_MODE_NOPERSPECTIVE:
            /* Force the interpolation location for colors here. */
            if (shader->key.ps.part.prolog.force_linear_sample_interp)
               location = TGSI_INTERPOLATE_LOC_SAMPLE;
            if (shader->key.ps.part.prolog.force_linear_center_interp)
               location = TGSI_INTERPOLATE_LOC_CENTER;

            /* The VGPR assignment for non-monolithic shaders
             * works because InitialPSInputAddr is set on the
             * main shader and PERSP_PULL_MODEL is never used.
             */
            /* LINEAR weights follow the PERSP ones; the monolithic offsets
             * (9/11/13) additionally account for the 3 PERSP_PULL_MODEL
             * VGPRs — NOTE(review): offsets mirror the hw PS input VGPR
             * order; confirm against the ISA docs before changing.
             */
            switch (location) {
            case TGSI_INTERPOLATE_LOC_SAMPLE:
               key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 6 : 9;
               if (separate_prolog) {
                  shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
               }
               break;
            case TGSI_INTERPOLATE_LOC_CENTER:
               key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 8 : 11;
               if (separate_prolog) {
                  shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
               }
               break;
            case TGSI_INTERPOLATE_LOC_CENTROID:
               key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 10 : 13;
               if (separate_prolog) {
                  shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
               }
               break;
            default:
               assert(0);
            }
            break;
         default:
            assert(0);
         }
      }
   }
}
2284 
2285 /**
2286  * Check whether a PS prolog is required based on the key.
2287  */
si_need_ps_prolog(const union si_shader_part_key * key)2288 bool si_need_ps_prolog(const union si_shader_part_key *key)
2289 {
2290    return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
2291           key->ps_prolog.states.force_linear_sample_interp ||
2292           key->ps_prolog.states.force_persp_center_interp ||
2293           key->ps_prolog.states.force_linear_center_interp ||
2294           key->ps_prolog.states.bc_optimize_for_persp ||
2295           key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
2296           key->ps_prolog.states.samplemask_log_ps_iter;
2297 }
2298 
2299 /**
2300  * Compute the PS epilog key, which contains all the information needed to
2301  * build the PS epilog function.
2302  */
si_get_ps_epilog_key(struct si_shader * shader,union si_shader_part_key * key)2303 void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
2304 {
2305    struct si_shader_info *info = &shader->selector->info;
2306    memset(key, 0, sizeof(*key));
2307    key->ps_epilog.wave32 = shader->wave_size == 32;
2308    key->ps_epilog.uses_discard = si_shader_uses_discard(shader);
2309    key->ps_epilog.colors_written = info->colors_written;
2310    key->ps_epilog.color_types = info->output_color_types;
2311    key->ps_epilog.writes_z = info->writes_z;
2312    key->ps_epilog.writes_stencil = info->writes_stencil;
2313    key->ps_epilog.writes_samplemask = info->writes_samplemask;
2314    key->ps_epilog.states = shader->key.ps.part.epilog;
2315 }
2316 
/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 *
 * Also patches shader->config.spi_ps_input_ena so that the enabled input
 * weights match what the selected parts and hardware rules require.
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                                      struct si_shader *shader, struct util_debug_callback *debug)
{
   union si_shader_part_key prolog_key;
   union si_shader_part_key epilog_key;

   /* Get the prolog. */
   si_get_ps_prolog_key(shader, &prolog_key, true);

   /* The prolog is a no-op if these aren't set. */
   if (si_need_ps_prolog(&prolog_key)) {
      shader->prolog =
         si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key,
                            compiler, debug, si_llvm_build_ps_prolog, "Fragment Shader Prolog");
      if (!shader->prolog)
         return false;
   }

   /* Get the epilog. */
   si_get_ps_epilog_key(shader, &epilog_key);

   shader->epilog =
      si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key,
                         compiler, debug, si_llvm_build_ps_epilog, "Fragment Shader Epilog");
   if (!shader->epilog)
      return false;

   /* Enable POS_FIXED_PT if polygon stippling is enabled. */
   if (shader->key.ps.part.prolog.poly_stipple) {
      shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
      assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
   }

   /* Set up the enable bits for per-sample shading if needed.
    * Each forced-location case clears the center/centroid (or sample)
    * weights and enables only the forced location instead.
    */
   if (shader->key.ps.part.prolog.force_persp_sample_interp &&
       (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
   }
   if (shader->key.ps.part.prolog.force_linear_sample_interp &&
       (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
   }
   if (shader->key.ps.part.prolog.force_persp_center_interp &&
       (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
   }
   if (shader->key.ps.part.prolog.force_linear_center_interp &&
       (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
   }

   /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
   if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
       !(shader->config.spi_ps_input_ena & 0xf)) {
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
      assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
   }

   /* At least one pair of interpolation weights must be enabled. */
   if (!(shader->config.spi_ps_input_ena & 0x7f)) {
      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
      assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
   }

   /* Samplemask fixup requires the sample ID. */
   if (shader->key.ps.part.prolog.samplemask_log_ps_iter) {
      shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
      assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
   }

   return true;
}
2404 
/* SPI barrier management workaround for Bonaire/Kabini.
 *
 * NOTE: the unconditional return below intentionally disables the
 * workaround; the code after it is unreachable and kept only to document
 * what the workaround did. NOTE(review): presumably disabled because the
 * driver no longer hits the affected configuration — confirm before
 * re-enabling or deleting.
 */
void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
{
   /* If tessellation is all offchip and on-chip GS isn't used, this
    * workaround is not needed.
    */
   return;

   /* SPI barrier management bug:
    *   Make sure we have at least 4k of LDS in use to avoid the bug.
    *   It applies to workgroup sizes of more than one wavefront.
    */
   if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
      *lds_size = MAX2(*lds_size, 8);
}
2419 
si_fix_resource_usage(struct si_screen * sscreen,struct si_shader * shader)2420 void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
2421 {
2422    unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
2423 
2424    shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
2425 
2426    if (shader->selector->stage == MESA_SHADER_COMPUTE &&
2427        si_get_max_workgroup_size(shader) > shader->wave_size) {
2428       si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
2429    }
2430 }
2431 
/* Create a shader variant: either compile it as a monolithic whole or stitch
 * it together from the precompiled main part plus selected prologs/epilogs,
 * then compute derived state and upload the binary.
 *
 * Returns false on compilation, selection, or upload failure.
 */
bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                              struct si_shader *shader, struct util_debug_callback *debug)
{
   struct si_shader_selector *sel = shader->selector;
   struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);

   /* LS, ES, VS are compiled on demand if the main part hasn't been
    * compiled for that stage.
    *
    * GS are compiled on demand if the main part hasn't been compiled
    * for the chosen NGG-ness.
    *
    * Vertex shaders are compiled on demand when a vertex fetch
    * workaround must be applied.
    */
   if (shader->is_monolithic) {
      /* Monolithic shader (compiled as a whole, has many variants,
       * may take a long time to compile).
       */
      if (!si_compile_shader(sscreen, compiler, shader, debug))
         return false;
   } else {
      /* The shader consists of several parts:
       *
       * - the middle part is the user shader, it has 1 variant only
       *   and it was compiled during the creation of the shader
       *   selector
       * - the prolog part is inserted at the beginning
       * - the epilog part is inserted at the end
       *
       * The prolog and epilog have many (but simple) variants.
       *
       * Starting with gfx9, geometry and tessellation control
       * shaders also contain the prolog and user shader parts of
       * the previous shader stage.
       */

      if (!mainp)
         return false;

      /* Copy the compiled shader data over. */
      shader->is_binary_shared = true;
      shader->binary = mainp->binary;
      shader->config = mainp->config;
      shader->info = mainp->info;

      /* Select prologs and/or epilogs. */
      switch (sel->stage) {
      case MESA_SHADER_VERTEX:
         if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
            return false;
         break;
      case MESA_SHADER_TESS_CTRL:
         if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
            return false;
         break;
      case MESA_SHADER_TESS_EVAL:
         break;
      case MESA_SHADER_GEOMETRY:
         if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
            return false;

         /* Clone the GS copy shader for the shader variant.
          * We can't just copy the pointer because we change the pm4 state and
          * si_shader_selector::gs_copy_shader must be immutable because it's shared
          * by multiple contexts.
          */
         if (!shader->key.ge.as_ngg) {
            assert(sel->main_shader_part == mainp);
            assert(sel->main_shader_part->gs_copy_shader);
            assert(sel->main_shader_part->gs_copy_shader->bo);
            assert(!sel->main_shader_part->gs_copy_shader->previous_stage_sel);
            assert(!sel->main_shader_part->gs_copy_shader->scratch_bo);

            /* NOTE(review): CALLOC_STRUCT result is used without a NULL
             * check — would crash on allocation failure; confirm whether
             * abort-on-OOM is the intended policy here.
             */
            shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
            memcpy(shader->gs_copy_shader, sel->main_shader_part->gs_copy_shader,
                   sizeof(*shader->gs_copy_shader));
            /* Increase the reference count. */
            pipe_reference(NULL, &shader->gs_copy_shader->bo->b.b.reference);
            /* Initialize some fields differently. */
            shader->gs_copy_shader->shader_log = NULL;
            shader->gs_copy_shader->is_binary_shared = true;
            util_queue_fence_init(&shader->gs_copy_shader->ready);
         }
         break;
      case MESA_SHADER_FRAGMENT:
         if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
            return false;

         /* Make sure we have at least as many VGPRs as there
          * are allocated inputs.
          */
         shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
         break;
      default:;
      }

      assert(shader->wave_size == mainp->wave_size);
      assert(!shader->previous_stage || shader->wave_size == shader->previous_stage->wave_size);

      /* Update SGPR and VGPR counts.
       * The combined shader needs the max of every part's register usage,
       * the sum-like stats (spills, scratch) of the previous stage, and the
       * union of its feature flags.
       */
      if (shader->prolog) {
         shader->config.num_sgprs =
            MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs);
         shader->config.num_vgprs =
            MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs);
      }
      if (shader->previous_stage) {
         shader->config.num_sgprs =
            MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
         shader->config.num_vgprs =
            MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
         shader->config.spilled_sgprs =
            MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
         shader->config.spilled_vgprs =
            MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
         shader->info.private_mem_vgprs =
            MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
         shader->config.scratch_bytes_per_wave =
            MAX2(shader->config.scratch_bytes_per_wave,
                 shader->previous_stage->config.scratch_bytes_per_wave);
         shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;
         shader->info.uses_vmem_load_other |= shader->previous_stage->info.uses_vmem_load_other;
         shader->info.uses_vmem_sampler_or_bvh |= shader->previous_stage->info.uses_vmem_sampler_or_bvh;
      }
      if (shader->epilog) {
         shader->config.num_sgprs =
            MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs);
         shader->config.num_vgprs =
            MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs);
      }
      si_calculate_max_simd_waves(shader);
   }

   /* NGG subgroup info / legacy gfx9 GS info. */
   if (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
      assert(!shader->key.ge.as_es && !shader->key.ge.as_ls);
      if (!gfx10_ngg_calculate_subgroup_info(shader)) {
         fprintf(stderr, "Failed to compute subgroup info\n");
         return false;
      }
   } else if (sscreen->info.gfx_level >= GFX9 && sel->stage == MESA_SHADER_GEOMETRY) {
      gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
   }

   shader->uses_vs_state_provoking_vertex =
      sscreen->use_ngg &&
      /* Used to convert triangle strips from GS to triangles. */
      ((sel->stage == MESA_SHADER_GEOMETRY &&
        util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) ||
       (sel->stage == MESA_SHADER_VERTEX &&
        /* Used to export PrimitiveID from the correct vertex. */
        shader->key.ge.mono.u.vs_export_prim_id));

   shader->uses_gs_state_outprim = sscreen->use_ngg &&
                                   /* Only used by streamout in vertex shaders. */
                                   sel->stage == MESA_SHADER_VERTEX &&
                                   si_shader_uses_streamout(shader);

   /* Instance divisors are applied in the (merged) VS prolog, so the prolog
    * key bits of whichever stage owns the VS prolog contribute here.
    */
   if (sel->stage == MESA_SHADER_VERTEX) {
      shader->uses_base_instance = sel->info.uses_base_instance ||
                                   shader->key.ge.part.vs.prolog.instance_divisor_is_one ||
                                   shader->key.ge.part.vs.prolog.instance_divisor_is_fetched;
   } else if (sel->stage == MESA_SHADER_TESS_CTRL) {
      shader->uses_base_instance = shader->previous_stage_sel &&
                                   (shader->previous_stage_sel->info.uses_base_instance ||
                                    shader->key.ge.part.tcs.ls_prolog.instance_divisor_is_one ||
                                    shader->key.ge.part.tcs.ls_prolog.instance_divisor_is_fetched);
   } else if (sel->stage == MESA_SHADER_GEOMETRY) {
      shader->uses_base_instance = shader->previous_stage_sel &&
                                   (shader->previous_stage_sel->info.uses_base_instance ||
                                    shader->key.ge.part.gs.vs_prolog.instance_divisor_is_one ||
                                    shader->key.ge.part.gs.vs_prolog.instance_divisor_is_fetched);
   }

   si_fix_resource_usage(sscreen, shader);

   /* Upload. */
   bool ok = si_shader_binary_upload(sscreen, shader, 0);
   si_shader_dump(sscreen, shader, debug, stderr, true);

   if (!ok)
      fprintf(stderr, "LLVM failed to upload shader\n");
   return ok;
}
2616 
si_shader_binary_clean(struct si_shader_binary * binary)2617 void si_shader_binary_clean(struct si_shader_binary *binary)
2618 {
2619    free((void *)binary->elf_buffer);
2620    binary->elf_buffer = NULL;
2621 
2622    free(binary->llvm_ir_string);
2623    binary->llvm_ir_string = NULL;
2624 
2625    free(binary->uploaded_code);
2626    binary->uploaded_code = NULL;
2627    binary->uploaded_code_size = 0;
2628 }
2629 
si_shader_destroy(struct si_shader * shader)2630 void si_shader_destroy(struct si_shader *shader)
2631 {
2632    if (shader->scratch_bo)
2633       si_resource_reference(&shader->scratch_bo, NULL);
2634 
2635    si_resource_reference(&shader->bo, NULL);
2636 
2637    if (!shader->is_binary_shared)
2638       si_shader_binary_clean(&shader->binary);
2639 
2640    free(shader->shader_log);
2641 }
2642