• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2012 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "si_shader.h"
8 #include "ac_nir.h"
9 #include "ac_rtld.h"
10 #include "nir.h"
11 #include "nir_builder.h"
12 #include "nir_serialize.h"
13 #include "nir_xfb_info.h"
14 #include "si_pipe.h"
15 #include "si_shader_internal.h"
16 #include "sid.h"
17 #include "tgsi/tgsi_from_mesa.h"
18 #include "util/u_memory.h"
19 #include "util/mesa-sha1.h"
20 #include "util/ralloc.h"
21 #include "util/u_upload_mgr.h"
22 
23 #if LLVM_AVAILABLE
24 #include <llvm/Config/llvm-config.h> /* for LLVM_VERSION_MAJOR */
25 #else
26 #define LLVM_VERSION_MAJOR 0
27 #endif
28 
29 static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";
30 
31 static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";
32 
33 static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
34 static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader);
35 
36 /* Get the number of all interpolated inputs */
si_get_ps_num_interp(struct si_shader * ps)37 unsigned si_get_ps_num_interp(struct si_shader *ps)
38 {
39    unsigned num_colors = !!(ps->info.ps_colors_read & 0x0f) + !!(ps->info.ps_colors_read & 0xf0);
40    unsigned num_interp =
41       ps->info.num_ps_inputs + (ps->key.ps.part.prolog.color_two_side ? num_colors : 0);
42 
43    assert(num_interp <= 32);
44    return MIN2(num_interp, 32);
45 }
46 
47 /** Whether the shader runs as a combination of multiple API shaders */
si_is_multi_part_shader(struct si_shader * shader)48 bool si_is_multi_part_shader(struct si_shader *shader)
49 {
50    if (shader->selector->screen->info.gfx_level <= GFX8 ||
51        shader->selector->stage > MESA_SHADER_GEOMETRY)
52       return false;
53 
54    return shader->key.ge.as_ls || shader->key.ge.as_es ||
55           shader->selector->stage == MESA_SHADER_TESS_CTRL ||
56           shader->selector->stage == MESA_SHADER_GEOMETRY;
57 }
58 
59 /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
si_is_merged_shader(struct si_shader * shader)60 bool si_is_merged_shader(struct si_shader *shader)
61 {
62    if (shader->selector->stage > MESA_SHADER_GEOMETRY || shader->is_gs_copy_shader)
63       return false;
64 
65    return shader->key.ge.as_ngg || si_is_multi_part_shader(shader);
66 }
67 
68 /**
69  * Returns a unique index for a semantic name and index. The index must be
70  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
71  * calculated.
72  */
si_shader_io_get_unique_index(unsigned semantic)73 unsigned si_shader_io_get_unique_index(unsigned semantic)
74 {
75    switch (semantic) {
76    case VARYING_SLOT_POS:
77       return SI_UNIQUE_SLOT_POS;
78    default:
79       if (semantic >= VARYING_SLOT_VAR0 && semantic <= VARYING_SLOT_VAR31)
80          return SI_UNIQUE_SLOT_VAR0 + (semantic - VARYING_SLOT_VAR0);
81 
82       if (semantic >= VARYING_SLOT_VAR0_16BIT && semantic <= VARYING_SLOT_VAR15_16BIT)
83          return SI_UNIQUE_SLOT_VAR0_16BIT + (semantic - VARYING_SLOT_VAR0_16BIT);
84 
85       assert(!"invalid generic index");
86       return 0;
87 
88    /* Legacy desktop GL varyings. */
89    case VARYING_SLOT_FOGC:
90       return SI_UNIQUE_SLOT_FOGC;
91    case VARYING_SLOT_COL0:
92       return SI_UNIQUE_SLOT_COL0;
93    case VARYING_SLOT_COL1:
94       return SI_UNIQUE_SLOT_COL1;
95    case VARYING_SLOT_BFC0:
96       return SI_UNIQUE_SLOT_BFC0;
97    case VARYING_SLOT_BFC1:
98       return SI_UNIQUE_SLOT_BFC1;
99    case VARYING_SLOT_TEX0:
100    case VARYING_SLOT_TEX1:
101    case VARYING_SLOT_TEX2:
102    case VARYING_SLOT_TEX3:
103    case VARYING_SLOT_TEX4:
104    case VARYING_SLOT_TEX5:
105    case VARYING_SLOT_TEX6:
106    case VARYING_SLOT_TEX7:
107       return SI_UNIQUE_SLOT_TEX0 + (semantic - VARYING_SLOT_TEX0);
108    case VARYING_SLOT_CLIP_VERTEX:
109       return SI_UNIQUE_SLOT_CLIP_VERTEX;
110 
111    /* Varyings present in both GLES and desktop GL. */
112    case VARYING_SLOT_CLIP_DIST0:
113       return SI_UNIQUE_SLOT_CLIP_DIST0;
114    case VARYING_SLOT_CLIP_DIST1:
115       return SI_UNIQUE_SLOT_CLIP_DIST1;
116    case VARYING_SLOT_PSIZ:
117       return SI_UNIQUE_SLOT_PSIZ;
118    case VARYING_SLOT_LAYER:
119       return SI_UNIQUE_SLOT_LAYER;
120    case VARYING_SLOT_VIEWPORT:
121       return SI_UNIQUE_SLOT_VIEWPORT;
122    case VARYING_SLOT_PRIMITIVE_ID:
123       return SI_UNIQUE_SLOT_PRIMITIVE_ID;
124    }
125 }
126 
/* Declare the SGPR arguments carrying streamout (transform feedback) state.
 *
 * The argument order is part of the hardware/prolog ABI and must not change.
 */
static void declare_streamout_params(struct si_shader_args *args, struct si_shader *shader,
                                     const shader_info *info)
{
   if (shader->selector->screen->info.gfx_level >= GFX11) {
      /* NGG streamout. */
      /* TES still declares one placeholder SGPR here to keep the arg layout
       * consistent with the non-streamout TES path below.
       */
      if (info->stage == MESA_SHADER_TESS_EVAL)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      return;
   }

   /* Streamout SGPRs. */
   if (si_shader_uses_streamout(shader)) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_config);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_write_index);

      /* A streamout buffer offset is loaded if the stride is non-zero. */
      for (int i = 0; i < 4; i++) {
         if (!info->xfb_stride[i])
            continue;

         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_offset[i]);
      }
   } else if (info->stage == MESA_SHADER_TESS_EVAL) {
      /* Placeholder so TES arg layout matches the streamout-enabled variant. */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   }
}
153 
/* Return the maximum number of threads per workgroup for this shader,
 * which determines how the compiler sizes barriers and LDS usage.
 */
unsigned si_get_max_workgroup_size(const struct si_shader *shader)
{
   /* The GS copy shader executes as a hardware VS. */
   gl_shader_stage stage = shader->is_gs_copy_shader ?
      MESA_SHADER_VERTEX : shader->selector->stage;

   assert(shader->wave_size);

   switch (stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      /* Use the largest workgroup size for streamout */
      if (shader->key.ge.as_ngg)
         return si_shader_uses_streamout(shader) ? 256 : 128;

      /* As part of merged shader. */
      return shader->selector->screen->info.gfx_level >= GFX9 &&
         (shader->key.ge.as_ls || shader->key.ge.as_es) ? 128 : shader->wave_size;

   case MESA_SHADER_TESS_CTRL:
      /* Return this so that LLVM doesn't remove s_barrier
       * instructions on chips where we use s_barrier. */
      return shader->selector->screen->info.gfx_level >= GFX7 ? 128 : shader->wave_size;

   case MESA_SHADER_GEOMETRY:
      /* GS can always generate up to 256 vertices. */
      return shader->selector->screen->info.gfx_level >= GFX9 ? 256 : shader->wave_size;

   case MESA_SHADER_COMPUTE:
      break; /* see below */

   default:
      return shader->wave_size;
   }

   /* Compile a variable block size using the maximum variable size. */
   if (shader->selector->info.base.workgroup_size_variable)
      return SI_MAX_VARIABLE_THREADS_PER_BLOCK;

   /* Fixed workgroup size: product of the three declared dimensions. */
   uint16_t *local_size = shader->selector->info.base.workgroup_size;
   unsigned max_work_group_size = (uint32_t)local_size[0] *
                                  (uint32_t)local_size[1] *
                                  (uint32_t)local_size[2];
   assert(max_work_group_size);
   return max_work_group_size;
}
199 
/* Declare the SGPR pointing at the constant/shader-buffer descriptors.
 * When the shader uses exactly one UBO and no SSBOs, the pointer can be a
 * plain constant float pointer instead of a descriptor-array pointer.
 */
static void declare_const_and_shader_buffers(struct si_shader_args *args, struct si_shader *shader,
                                             const shader_info *info, bool assign_params)
{
   const bool single_ubo_only = info->num_ubos == 1 && info->num_ssbos == 0;
   enum ac_arg_type ptr_type =
      single_ubo_only ? AC_ARG_CONST_FLOAT_PTR : AC_ARG_CONST_DESC_PTR;
   struct ac_arg *dest = assign_params ? &args->const_and_shader_buffers
                                       : &args->other_const_and_shader_buffers;

   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, ptr_type, dest);
}
214 
/* Declare the SGPR pointing at the sampler/image descriptor array. */
static void declare_samplers_and_images(struct si_shader_args *args, bool assign_params)
{
   struct ac_arg *dest = assign_params ? &args->samplers_and_images
                                       : &args->other_samplers_and_images;

   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, dest);
}
220 
/* Declare the per-stage descriptor pointers: buffers first, then
 * samplers/images. assign_params selects whether the args bind to this
 * stage's fields or the "other" (merged-stage partner) fields.
 */
static void declare_per_stage_desc_pointers(struct si_shader_args *args, struct si_shader *shader,
                                            const shader_info *info, bool assign_params)
{
   declare_const_and_shader_buffers(args, shader, info, assign_params);
   declare_samplers_and_images(args, assign_params);
}
227 
/* Declare the descriptor pointers shared by all stages: internal driver
 * bindings and the bindless sampler/image descriptor array.
 */
static void declare_global_desc_pointers(struct si_shader_args *args)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->internal_bindings);
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
              &args->bindless_samplers_and_images);
}
234 
/* Declare the vertex-buffer descriptor arguments: a pointer to the VB
 * descriptor list, plus optionally some descriptors inlined in user SGPRs.
 */
static void declare_vb_descriptor_input_sgprs(struct si_shader_args *args,
                                              struct si_shader *shader)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->ac.vertex_buffers);

   unsigned num_vbos_in_user_sgprs = shader->selector->info.num_vbos_in_user_sgprs;
   if (num_vbos_in_user_sgprs) {
      unsigned user_sgprs = args->ac.num_sgprs_used;

      /* Merged shaders reserve 8 system SGPRs before user SGPRs; subtract
       * them so the padding below is computed against user-SGPR indices.
       */
      if (si_is_merged_shader(shader))
         user_sgprs -= 8;
      assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);

      /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
      for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

      /* Each inlined VB descriptor occupies 4 consecutive SGPRs. */
      assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(args->vb_descriptors));
      for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 4, AC_ARG_INT, &args->vb_descriptors[i]);
   }
}
257 
/* Declare the VS input VGPRs. The VGPR order is fixed by the hardware and
 * differs per gfx level and per LS/ES/NGG configuration.
 */
static void declare_vs_input_vgprs(struct si_shader_args *args, struct si_shader *shader)
{
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vertex_id);

   if (shader->selector->screen->info.gfx_level >= GFX12) {
      /* GFX12: instance ID directly follows vertex ID. */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
   } else if (shader->key.ge.as_ls) {
      /* VS compiled as the LS half of the merged LSHS stage. */
      if (shader->selector->screen->info.gfx_level >= GFX11) {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      } else if (shader->selector->screen->info.gfx_level >= GFX10) {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      } else {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
      }
   } else if (shader->selector->screen->info.gfx_level >= GFX10) {
      /* GFX10-11, non-LS. */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
                 /* user vgpr or PrimID (legacy) */
                 shader->key.ge.as_ngg ? NULL : &args->ac.vs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
   } else {
      /* GFX6-9 legacy layout. */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
   }
}
290 
/* Declare the SGPR inputs for the internal blit vertex shader. The number
 * of SGPRs (info->vs.blit_sgprs_amd) encodes which variant is used:
 * position only, position+color, or position+texcoords, optionally followed
 * by an attribute ring address on GFX11+.
 */
static void declare_vs_blit_inputs(struct si_shader *shader, struct si_shader_args *args,
                                   const shader_info *info)
{
   /* GFX11+ appends one extra SGPR holding the attribute ring address. */
   bool has_attribute_ring_address = shader->selector->screen->info.gfx_level >= GFX11;

   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_blit_inputs); /* i16 x1, y1 */
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);                  /* i16 x1, y1 */
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL);                /* depth */

   if (info->vs.blit_sgprs_amd ==
       SI_VS_BLIT_SGPRS_POS_COLOR + has_attribute_ring_address) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
      if (has_attribute_ring_address)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* attribute ring address */
   } else if (info->vs.blit_sgprs_amd ==
              SI_VS_BLIT_SGPRS_POS_TEXCOORD + has_attribute_ring_address) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
      if (has_attribute_ring_address)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* attribute ring address */
   }
}
320 
/* Declare the TES input VGPRs: tess coords (u, v), then the relative patch
 * ID and patch ID. This order is fixed by the hardware.
 */
static void declare_tes_input_vgprs(struct si_shader_args *args)
{
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_u);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_v);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_rel_patch_id);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_patch_id);
}
328 
enum
{
   /* Convenient merged shader definitions. */
   /* These start at MESA_ALL_SHADER_STAGES so they never collide with real
    * gl_shader_stage values in the stage_case switch below.
    */
   SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES,
   SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};
335 
/* Add an argument while asserting that it lands at the expected,
 * hardware-defined argument index (idx), e.g. the SI_PARAM_* PS layout.
 */
static void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
                               enum ac_arg_type type, struct ac_arg *arg, unsigned idx)
{
   assert(args->arg_count == idx);
   ac_add_arg(args, file, registers, type, arg);
}
342 
si_init_shader_args(struct si_shader * shader,struct si_shader_args * args,const shader_info * info)343 static void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args,
344                                 const shader_info *info)
345 {
346    unsigned i, num_returns, num_return_sgprs;
347    unsigned num_prolog_vgprs = 0;
348    struct si_shader_selector *sel = shader->selector;
349    unsigned stage = shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : info->stage;
350    unsigned stage_case = stage;
351 
352    memset(args, 0, sizeof(*args));
353 
354    /* Set MERGED shaders. */
355    if (sel->screen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY) {
356       if (shader->key.ge.as_ls || stage == MESA_SHADER_TESS_CTRL)
357          stage_case = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
358       else if (shader->key.ge.as_es || shader->key.ge.as_ngg || stage == MESA_SHADER_GEOMETRY)
359          stage_case = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
360    }
361 
362    switch (stage_case) {
363    case MESA_SHADER_VERTEX:
364       declare_global_desc_pointers(args);
365 
366       if (info->vs.blit_sgprs_amd) {
367          declare_vs_blit_inputs(shader, args, info);
368       } else {
369          declare_per_stage_desc_pointers(args, shader, info, true);
370          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
371 
372          if (shader->is_gs_copy_shader) {
373             declare_streamout_params(args, shader, info);
374          } else {
375             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
376             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
377             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
378             declare_vb_descriptor_input_sgprs(args, shader);
379 
380             if (shader->key.ge.as_es) {
381                ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset);
382             } else if (shader->key.ge.as_ls) {
383                /* no extra parameters */
384             } else {
385                declare_streamout_params(args, shader, info);
386             }
387          }
388       }
389 
390       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
391       if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
392          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
393 
394       /* VGPRs */
395       declare_vs_input_vgprs(args, shader);
396 
397       break;
398 
399    case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
400       declare_global_desc_pointers(args);
401       declare_per_stage_desc_pointers(args, shader, info, true);
402       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
403       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
404       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
405       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
406       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
407 
408       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
409       if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
410          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
411 
412       /* VGPRs */
413       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
414       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);
415       break;
416 
417    case SI_SHADER_MERGED_VERTEX_TESSCTRL:
418       /* Merged stages have 8 system SGPRs at the beginning. */
419       /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
420       /* Gfx11+:  SPI_SHADER_PGM_LO/HI_HS */
421       declare_per_stage_desc_pointers(args, shader, info, stage == MESA_SHADER_TESS_CTRL);
422       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
423       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info);
424       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
425       if (sel->screen->info.gfx_level >= GFX11)
426          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_wave_id);
427       else
428          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
429       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
430       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
431 
432       declare_global_desc_pointers(args);
433       declare_per_stage_desc_pointers(args, shader, info, stage == MESA_SHADER_VERTEX);
434 
435       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
436       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
437       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
438       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
439       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
440       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
441 
442       /* VGPRs (first TCS, then VS) */
443       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
444       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);
445 
446       if (stage == MESA_SHADER_VERTEX) {
447          declare_vs_input_vgprs(args, shader);
448 
449          /* Need to keep LS/HS arg index same for shared args when ACO,
450           * so this is not able to be before shared VGPRs.
451           */
452          declare_vb_descriptor_input_sgprs(args, shader);
453 
454          /* LS return values are inputs to the TCS main shader part. */
455          if (!shader->is_monolithic || shader->key.ge.opt.same_patch_vertices) {
456             for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
457                ac_add_return(&args->ac, AC_ARG_SGPR);
458             for (i = 0; i < 2; i++)
459                ac_add_return(&args->ac, AC_ARG_VGPR);
460 
461             /* VS outputs passed via VGPRs to TCS. */
462             if (shader->key.ge.opt.same_patch_vertices && !info->use_aco_amd) {
463                unsigned num_outputs = util_last_bit64(shader->selector->info.ls_es_outputs_written);
464                for (i = 0; i < num_outputs * 4; i++)
465                   ac_add_return(&args->ac, AC_ARG_VGPR);
466             }
467          }
468       } else {
469          /* TCS inputs are passed via VGPRs from VS. */
470          if (shader->key.ge.opt.same_patch_vertices && !info->use_aco_amd) {
471             unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->info.ls_es_outputs_written);
472             for (i = 0; i < num_inputs * 4; i++)
473                ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
474          }
475       }
476       break;
477 
478    case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
479       /* Merged stages have 8 system SGPRs at the beginning. */
480       /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
481       /* Gfx11+:  SPI_SHADER_PGM_LO/HI_GS */
482       declare_per_stage_desc_pointers(args, shader, info, stage == MESA_SHADER_GEOMETRY);
483 
484       if (shader->key.ge.as_ngg)
485          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_tg_info);
486       else
487          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset);
488 
489       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info);
490       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
491       if (sel->screen->info.gfx_level >= GFX11)
492          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_attr_offset);
493       else
494          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
495       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
496       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
497 
498       declare_global_desc_pointers(args);
499       if (stage != MESA_SHADER_VERTEX || !info->vs.blit_sgprs_amd) {
500          declare_per_stage_desc_pointers(
501             args, shader, info, (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL));
502       }
503 
504       if (stage == MESA_SHADER_VERTEX && info->vs.blit_sgprs_amd) {
505          declare_vs_blit_inputs(shader, args, info);
506       } else {
507          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
508 
509          if (stage == MESA_SHADER_VERTEX) {
510             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
511             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
512             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
513          } else if (stage == MESA_SHADER_TESS_EVAL) {
514             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
515             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
516             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
517          } else {
518             /* GS */
519             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
520             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
521             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
522          }
523 
524          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->small_prim_cull_info);
525          if (sel->screen->info.gfx_level >= GFX11)
526             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->gs_attr_address);
527          else
528             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
529       }
530 
531       /* VGPRs (first GS, then VS/TES) */
532       if (sel->screen->info.gfx_level >= GFX12) {
533          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
534          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
535          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
536       } else {
537          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
538          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
539          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
540          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
541          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);
542       }
543 
544       if (stage == MESA_SHADER_VERTEX) {
545          declare_vs_input_vgprs(args, shader);
546 
547          /* Need to keep ES/GS arg index same for shared args when ACO,
548           * so this is not able to be before shared VGPRs.
549           */
550          if (!info->vs.blit_sgprs_amd)
551             declare_vb_descriptor_input_sgprs(args, shader);
552       } else if (stage == MESA_SHADER_TESS_EVAL) {
553          declare_tes_input_vgprs(args);
554       }
555 
556       if (shader->key.ge.as_es && !shader->is_monolithic &&
557           (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL)) {
558          /* ES return values are inputs to GS. */
559          for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
560             ac_add_return(&args->ac, AC_ARG_SGPR);
561          for (i = 0; i < (sel->screen->info.gfx_level >= GFX12 ? 3 : 5); i++)
562             ac_add_return(&args->ac, AC_ARG_VGPR);
563       }
564       break;
565 
566    case MESA_SHADER_TESS_EVAL:
567       declare_global_desc_pointers(args);
568       declare_per_stage_desc_pointers(args, shader, info, true);
569       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
570       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
571       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
572 
573       if (shader->key.ge.as_es) {
574          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
575          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
576          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset);
577       } else {
578          declare_streamout_params(args, shader, info);
579          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
580       }
581 
582       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
583       if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
584          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
585 
586       /* VGPRs */
587       declare_tes_input_vgprs(args);
588       break;
589 
590    case MESA_SHADER_GEOMETRY:
591       declare_global_desc_pointers(args);
592       declare_per_stage_desc_pointers(args, shader, info, true);
593       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset);
594       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_wave_id);
595 
596       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
597       if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
598          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
599 
600       /* VGPRs */
601       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
602       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
603       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
604       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);
605       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[3]);
606       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[4]);
607       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[5]);
608       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
609       break;
610 
611    case MESA_SHADER_FRAGMENT:
612       declare_global_desc_pointers(args);
613       declare_per_stage_desc_pointers(args, shader, info, true);
614       si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->alpha_reference,
615                          SI_PARAM_ALPHA_REF);
616       si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.prim_mask,
617                          SI_PARAM_PRIM_MASK);
618 
619       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_sample,
620                          SI_PARAM_PERSP_SAMPLE);
621       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_center,
622                          SI_PARAM_PERSP_CENTER);
623       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_centroid,
624                          SI_PARAM_PERSP_CENTROID);
625       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);
626       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_sample,
627                          SI_PARAM_LINEAR_SAMPLE);
628       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_center,
629                          SI_PARAM_LINEAR_CENTER);
630       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_centroid,
631                          SI_PARAM_LINEAR_CENTROID);
632       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
633       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[0],
634                          SI_PARAM_POS_X_FLOAT);
635       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[1],
636                          SI_PARAM_POS_Y_FLOAT);
637       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[2],
638                          SI_PARAM_POS_Z_FLOAT);
639       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[3],
640                          SI_PARAM_POS_W_FLOAT);
641       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.front_face,
642                          SI_PARAM_FRONT_FACE);
643       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.ancillary,
644                          SI_PARAM_ANCILLARY);
645       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.sample_coverage,
646                          SI_PARAM_SAMPLE_COVERAGE);
647       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.pos_fixed_pt,
648                          SI_PARAM_POS_FIXED_PT);
649 
650       if (info->use_aco_amd) {
651          ac_compact_ps_vgpr_args(&args->ac, shader->config.spi_ps_input_addr);
652 
653          /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
654          if (sel->screen->info.gfx_level < GFX11)
655             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
656       }
657 
658       /* Monolithic PS emit prolog and epilog in NIR directly. */
659       if (!shader->is_monolithic) {
660          /* Color inputs from the prolog. */
661          if (shader->selector->info.colors_read) {
662             unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);
663 
664             for (i = 0; i < num_color_elements; i++)
665                ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, i ? NULL : &args->color_start);
666 
667             num_prolog_vgprs += num_color_elements;
668          }
669 
670          /* Outputs for the epilog. */
671          num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
672          /* These must always be declared even if Z/stencil/samplemask are killed. */
673          num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +
674                        sel->info.writes_z + sel->info.writes_stencil + sel->info.writes_samplemask +
675                        1 /* SampleMaskIn */;
676 
677          for (i = 0; i < num_return_sgprs; i++)
678             ac_add_return(&args->ac, AC_ARG_SGPR);
679          for (; i < num_returns; i++)
680             ac_add_return(&args->ac, AC_ARG_VGPR);
681       }
682       break;
683 
684    case MESA_SHADER_COMPUTE:
685    case MESA_SHADER_KERNEL:
686       declare_global_desc_pointers(args);
687       declare_per_stage_desc_pointers(args, shader, info, true);
688       if (shader->selector->info.uses_grid_size)
689          ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.num_work_groups);
690       if (shader->selector->info.uses_variable_block_size)
691          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->block_size);
692 
693       unsigned cs_user_data_dwords =
694          info->cs.user_data_components_amd;
695       if (cs_user_data_dwords) {
696          ac_add_arg(&args->ac, AC_ARG_SGPR, MIN2(cs_user_data_dwords, 4), AC_ARG_INT,
697                     &args->cs_user_data[0]);
698          if (cs_user_data_dwords > 4) {
699             ac_add_arg(&args->ac, AC_ARG_SGPR, cs_user_data_dwords - 4, AC_ARG_INT,
700                        &args->cs_user_data[1]);
701          }
702       }
703 
704       /* Some descriptors can be in user SGPRs. */
705       /* Shader buffers in user SGPRs. */
706       for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) {
707          while (args->ac.num_sgprs_used % 4 != 0)
708             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
709 
710          ac_add_arg(&args->ac, AC_ARG_SGPR, 4, AC_ARG_INT, &args->cs_shaderbuf[i]);
711       }
712       /* Images in user SGPRs. */
713       for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) {
714          unsigned num_sgprs = BITSET_TEST(info->image_buffers, i) ? 4 : 8;
715 
716          while (args->ac.num_sgprs_used % num_sgprs != 0)
717             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
718 
719          ac_add_arg(&args->ac, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &args->cs_image[i]);
720       }
721 
722       /* Hardware SGPRs. */
723       for (i = 0; i < 3; i++) {
724          if (shader->selector->info.uses_block_id[i]) {
725             /* GFX12 loads workgroup IDs into ttmp registers, so they are not input SGPRs, but we
726              * still need to set this to indicate that they are enabled (for ac_nir_to_llvm).
727              */
728             if (sel->screen->info.gfx_level >= GFX12)
729                args->ac.workgroup_ids[i].used = true;
730             else
731                ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.workgroup_ids[i]);
732          }
733       }
734       if (shader->selector->info.uses_tg_size)
735          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tg_size);
736 
737       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
738       if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
739          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
740 
741       /* Hardware VGPRs. */
742       /* Thread IDs are packed in VGPR0, 10 bits per component or stored in 3 separate VGPRs */
743       if (sel->screen->info.gfx_level >= GFX11 ||
744           (!sel->screen->info.has_graphics && sel->screen->info.family >= CHIP_MI200)) {
745          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_ids_packed);
746       } else {
747          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_id_x);
748          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_id_y);
749          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_id_z);
750       }
751       break;
752    default:
753       assert(0 && "unimplemented shader");
754       return;
755    }
756 
757    shader->info.num_input_sgprs = args->ac.num_sgprs_used;
758    shader->info.num_input_vgprs = args->ac.num_vgprs_used;
759 
760    assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
761    shader->info.num_input_vgprs -= num_prolog_vgprs;
762 }
763 
/* Return the LDS allocation granularity in bytes for the given shader stage
 * on this chip generation.
 */
static unsigned get_lds_granularity(struct si_screen *screen, gl_shader_stage stage)
{
   /* GFX11+ allocates PS LDS in 1024B blocks. */
   if (screen->info.gfx_level >= GFX11 && stage == MESA_SHADER_FRAGMENT)
      return 1024;

   /* GFX7+ uses 512B blocks, older chips 256B. */
   return screen->info.gfx_level >= GFX7 ? 512 : 256;
}
769 
si_shader_binary_open(struct si_screen * screen,struct si_shader * shader,struct ac_rtld_binary * rtld)770 static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
771                                   struct ac_rtld_binary *rtld)
772 {
773    const struct si_shader_selector *sel = shader->selector;
774    const char *part_elfs[5];
775    size_t part_sizes[5];
776    unsigned num_parts = 0;
777 
778 #define add_part(shader_or_part)                                                                   \
779    if (shader_or_part) {                                                                           \
780       part_elfs[num_parts] = (shader_or_part)->binary.code_buffer;                                 \
781       part_sizes[num_parts] = (shader_or_part)->binary.code_size;                                  \
782       num_parts++;                                                                                 \
783    }
784 
785    add_part(shader->prolog);
786    add_part(shader->previous_stage);
787    add_part(shader);
788    add_part(shader->epilog);
789 
790 #undef add_part
791 
792    struct ac_rtld_symbol lds_symbols[2];
793    unsigned num_lds_symbols = 0;
794 
795    if (sel && screen->info.gfx_level >= GFX9 && !shader->is_gs_copy_shader &&
796        (sel->stage == MESA_SHADER_GEOMETRY ||
797         (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg))) {
798       struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
799       sym->name = "esgs_ring";
800       sym->size = shader->gs_info.esgs_ring_size * 4;
801       sym->align = 64 * 1024;
802    }
803 
804    if (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
805       struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
806       sym->name = "ngg_emit";
807       sym->size = shader->ngg.ngg_emit_size * 4;
808       sym->align = 4;
809    }
810 
811    bool ok = ac_rtld_open(
812       rtld, (struct ac_rtld_open_info){.info = &screen->info,
813                                        .options =
814                                           {
815                                              .halt_at_entry = screen->options.halt_shaders,
816                                              .waitcnt_wa = num_parts > 1 &&
817                                                            screen->info.needs_llvm_wait_wa,
818                                           },
819                                        .shader_type = sel->stage,
820                                        .wave_size = shader->wave_size,
821                                        .num_parts = num_parts,
822                                        .elf_ptrs = part_elfs,
823                                        .elf_sizes = part_sizes,
824                                        .num_shared_lds_symbols = num_lds_symbols,
825                                        .shared_lds_symbols = lds_symbols});
826 
827    if (rtld->lds_size > 0) {
828       unsigned alloc_granularity = get_lds_granularity(screen, sel->stage);
829       shader->config.lds_size = DIV_ROUND_UP(rtld->lds_size, alloc_granularity);
830    }
831 
832    return ok;
833 }
834 
get_shader_binaries(struct si_shader * shader,struct si_shader_binary * bin[4])835 static unsigned get_shader_binaries(struct si_shader *shader, struct si_shader_binary *bin[4])
836 {
837    unsigned num_bin = 0;
838 
839    if (shader->prolog)
840       bin[num_bin++] = &shader->prolog->binary;
841 
842    if (shader->previous_stage)
843       bin[num_bin++] = &shader->previous_stage->binary;
844 
845    bin[num_bin++] = &shader->binary;
846 
847    if (shader->epilog)
848       bin[num_bin++] = &shader->epilog->binary;
849 
850    return num_bin;
851 }
852 
853 /* si_get_shader_binary_size should only be called once per shader
854  * and the result should be stored in shader->complete_shader_binary_size.
855  */
si_get_shader_binary_size(struct si_screen * screen,struct si_shader * shader)856 unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
857 {
858    if (shader->binary.type == SI_SHADER_BINARY_ELF) {
859       struct ac_rtld_binary rtld;
860       si_shader_binary_open(screen, shader, &rtld);
861       uint64_t size = rtld.exec_size;
862       ac_rtld_close(&rtld);
863       return size;
864    } else {
865       struct si_shader_binary *bin[4];
866       unsigned num_bin = get_shader_binaries(shader, bin);
867 
868       unsigned size = 0;
869       for (unsigned i = 0; i < num_bin; i++) {
870          assert(bin[i]->type == SI_SHADER_BINARY_RAW);
871          size += bin[i]->exec_size;
872       }
873       return size;
874    }
875 }
876 
si_get_shader_prefetch_size(struct si_shader * shader)877 unsigned si_get_shader_prefetch_size(struct si_shader *shader)
878 {
879    struct si_screen *sscreen = shader->selector->screen;
880    /* This excludes arrays of constants after instructions. */
881    unsigned exec_size =
882       ac_align_shader_binary_for_prefetch(&sscreen->info,
883                                           shader->complete_shader_binary_size);
884 
885    /* INST_PREF_SIZE uses 128B granularity.
886     * - GFX11: max 128 * 63 = 8064
887     * - GFX12: max 128 * 255 = 32640
888     */
889    unsigned max_pref_size = shader->selector->screen->info.gfx_level >= GFX12 ? 255 : 63;
890    unsigned exec_size_gran128 = DIV_ROUND_UP(exec_size, 128);
891 
892    return MIN2(max_pref_size, exec_size_gran128);
893 }
894 
si_get_external_symbol(enum amd_gfx_level gfx_level,void * data,const char * name,uint64_t * value)895 static bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
896                                    uint64_t *value)
897 {
898    uint64_t *scratch_va = data;
899 
900    if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
901       *value = (uint32_t)*scratch_va;
902       return true;
903    }
904    if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
905       /* Enable scratch coalescing. */
906       *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32);
907 
908       if (gfx_level >= GFX11)
909          *value |= S_008F04_SWIZZLE_ENABLE_GFX11(1);
910       else
911          *value |= S_008F04_SWIZZLE_ENABLE_GFX6(1);
912       return true;
913    }
914 
915    return false;
916 }
917 
/* Prepare GPU memory for a shader upload and return a CPU-writable pointer
 * for the binary.
 *
 * If bo_offset >= 0, the shader is placed at that offset inside the existing
 * shader->bo (used for SQTT pipeline dumps); otherwise a new BO of at least
 * binary_size is allocated and shader->bo / shader->gpu_address are updated.
 *
 * With dma_upload, the returned pointer is a staging allocation
 * (*upload_ctx/*staging/*staging_offset are set for the later CP DMA copy in
 * post_upload_binary); without it, the pointer maps shader->bo directly.
 * Returns NULL on allocation or mapping failure.
 */
static void *pre_upload_binary(struct si_screen *sscreen, struct si_shader *shader,
                               unsigned binary_size, bool dma_upload,
                               struct si_context **upload_ctx,
                               struct pipe_resource **staging,
                               unsigned *staging_offset,
                               int64_t bo_offset)
{
   /* Pad the allocation so the instruction prefetch past the end stays in bounds. */
   unsigned aligned_size = ac_align_shader_binary_for_prefetch(&sscreen->info, binary_size);

   if (bo_offset >= 0) {
      /* sqtt needs to upload shaders as a pipeline, where all shaders
       * are contiguous in memory.
       * In this case, bo_offset will be positive and we don't have to
       * realloc a new bo.
       */
      shader->gpu_address = shader->bo->gpu_address + bo_offset;
      dma_upload = false;
   } else {
      /* Drop any previous BO and allocate a fresh one for this binary. */
      si_resource_reference(&shader->bo, NULL);
      shader->bo = si_aligned_buffer_create(
         &sscreen->b,
         SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT |
         (dma_upload ? PIPE_RESOURCE_FLAG_UNMAPPABLE : 0),
         PIPE_USAGE_IMMUTABLE, align(aligned_size, SI_CPDMA_ALIGNMENT), 256);
      if (!shader->bo)
         return NULL;

      shader->gpu_address = shader->bo->gpu_address;
      bo_offset = 0;
   }

   if (dma_upload) {
      /* First upload into a staging buffer. */
      *upload_ctx = si_get_aux_context(&sscreen->aux_context.shader_upload);

      void *ret;
      u_upload_alloc((*upload_ctx)->b.stream_uploader, 0, binary_size, 256,
                     staging_offset, staging, &ret);
      if (!ret)
         /* Release the aux context taken above before reporting failure. */
         si_put_aux_context_flush(&sscreen->aux_context.shader_upload);

      return ret;
   } else {
      /* Map the BO directly; unmapped again in post_upload_binary. */
      void *ptr = sscreen->ws->buffer_map(sscreen->ws,
         shader->bo->buf, NULL,
         PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
      if (!ptr)
         return NULL;

      return ptr + bo_offset;
   }
}
970 
/* Finish a shader upload started by pre_upload_binary: optionally keep a CPU
 * copy for SQTT, then either CP-DMA-copy the staging buffer into the shader
 * BO (dma_upload) or simply unmap the directly-mapped BO.
 */
static void post_upload_binary(struct si_screen *sscreen, struct si_shader *shader,
                               void *code, unsigned code_size,
                               unsigned binary_size, bool dma_upload,
                               struct si_context *upload_ctx,
                               struct pipe_resource *staging,
                               unsigned staging_offset)
{
   if (sscreen->debug_flags & DBG(SQTT)) {
      /* Remember the uploaded code */
      shader->binary.uploaded_code = malloc(code_size);
      /* Bug fix: the malloc result was previously passed to memcpy unchecked;
       * on allocation failure record no uploaded copy instead of crashing. */
      if (shader->binary.uploaded_code) {
         shader->binary.uploaded_code_size = code_size;
         memcpy(shader->binary.uploaded_code, code, code_size);
      } else {
         shader->binary.uploaded_code_size = 0;
      }
   }

   if (dma_upload) {
      /* Then copy from the staging buffer to VRAM.
       *
       * We can't use the upload copy in si_buffer_transfer_unmap because that might use
       * a compute shader, and we can't use shaders in the code that is responsible for making
       * them available.
       */
      si_cp_dma_copy_buffer(upload_ctx, &shader->bo->b.b, staging, 0, staging_offset,
                            binary_size);
      si_barrier_after_simple_buffer_op(upload_ctx, 0, &shader->bo->b.b, staging);
      upload_ctx->barrier_flags |= SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_L2;

#if 0 /* debug: validate whether the copy was successful */
      uint32_t *dst_binary = malloc(binary_size);
      uint32_t *src_binary = (uint32_t*)code;
      pipe_buffer_read(&upload_ctx->b, &shader->bo->b.b, 0, binary_size, dst_binary);
      puts("dst_binary == src_binary:");
      for (unsigned i = 0; i < binary_size / 4; i++) {
         printf("   %08x == %08x\n", dst_binary[i], src_binary[i]);
      }
      free(dst_binary);
      exit(0);
#endif

      si_put_aux_context_flush(&sscreen->aux_context.shader_upload);
      pipe_resource_reference(&staging, NULL);
   } else {
      sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf);
   }
}
1015 
/* Upload an ELF (LLVM-compiled) shader binary to GPU memory, resolving
 * scratch-buffer symbols against scratch_va.
 * Returns the uploaded size, or -1 on failure.
 */
static int upload_binary_elf(struct si_screen *sscreen, struct si_shader *shader,
                             uint64_t scratch_va, bool dma_upload, int64_t bo_offset)
{
   struct ac_rtld_binary binary;
   if (!si_shader_binary_open(sscreen, shader, &binary))
      return -1;

   struct si_context *upload_ctx = NULL;
   struct pipe_resource *staging = NULL;
   unsigned staging_offset = 0;

   void *rx_ptr = pre_upload_binary(sscreen, shader, binary.rx_size, dma_upload,
                                    &upload_ctx, &staging, &staging_offset,
                                    bo_offset);
   if (!rx_ptr) {
      /* Bug fix: the successfully opened rtld binary was leaked on this
       * error path. */
      ac_rtld_close(&binary);
      return -1;
   }

   /* Upload. */
   struct ac_rtld_upload_info u = {};
   u.binary = &binary;
   u.get_external_symbol = si_get_external_symbol;
   u.cb_data = &scratch_va;
   u.rx_va = shader->gpu_address;
   u.rx_ptr = rx_ptr;

   int size = ac_rtld_upload(&u);

   post_upload_binary(sscreen, shader, rx_ptr, size, binary.rx_size, dma_upload,
                      upload_ctx, staging, staging_offset);

   ac_rtld_close(&binary);

   return size;
}
1050 
calculate_needed_lds_size(struct si_screen * sscreen,struct si_shader * shader)1051 static void calculate_needed_lds_size(struct si_screen *sscreen, struct si_shader *shader)
1052 {
1053    gl_shader_stage stage =
1054       shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : shader->selector->stage;
1055 
1056    if (sscreen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY &&
1057        (stage == MESA_SHADER_GEOMETRY || shader->key.ge.as_ngg)) {
1058       unsigned size_in_dw = shader->gs_info.esgs_ring_size;
1059 
1060       if (stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)
1061          size_in_dw += shader->ngg.ngg_emit_size;
1062 
1063       if (shader->key.ge.as_ngg) {
1064          unsigned scratch_dw_size = gfx10_ngg_get_scratch_dw_size(shader);
1065          if (scratch_dw_size) {
1066             /* scratch base address needs to be 8 byte aligned */
1067             size_in_dw = ALIGN(size_in_dw, 2);
1068             size_in_dw += scratch_dw_size;
1069          }
1070       }
1071 
1072       shader->config.lds_size =
1073          DIV_ROUND_UP(size_in_dw * 4, get_lds_granularity(sscreen, stage));
1074    }
1075 }
1076 
/* Upload a raw (non-ELF) shader binary composed of up to 4 parts. The layout
 * in the BO is: all parts' executable code first (back to back), then all
 * parts' constant data. Symbols referencing const data are patched to account
 * for that reordering. Returns the total uploaded size, or -1 on failure.
 */
static int upload_binary_raw(struct si_screen *sscreen, struct si_shader *shader,
                             uint64_t scratch_va, bool dma_upload, int64_t bo_offset)
{
   struct si_shader_binary *bin[4];
   unsigned num_bin = get_shader_binaries(shader, bin);

   /* Sum up sizes: code_size includes trailing const data, exec_size does not. */
   unsigned code_size = 0, exec_size = 0;
   for (unsigned i = 0; i < num_bin; i++) {
      assert(bin[i]->type == SI_SHADER_BINARY_RAW);
      code_size += bin[i]->code_size;
      exec_size += bin[i]->exec_size;
   }

   struct si_context *upload_ctx = NULL;
   struct pipe_resource *staging = NULL;
   unsigned staging_offset = 0;

   void *rx_ptr = pre_upload_binary(sscreen, shader, code_size, dma_upload,
                                    &upload_ctx, &staging, &staging_offset,
                                    bo_offset);
   if (!rx_ptr)
      return -1;

   /* Exec code is packed at the start; const data starts after all exec code. */
   unsigned exec_offset = 0, data_offset = exec_size;
   for (unsigned i = 0; i < num_bin; i++) {
      memcpy(rx_ptr + exec_offset, bin[i]->code_buffer, bin[i]->exec_size);

      if (bin[i]->num_symbols) {
         /* Offset needed to add to const data symbol because of inserting other
          * shader part between exec code and const data.
          */
         unsigned const_offset = data_offset - exec_offset - bin[i]->exec_size;

         /* Prolog and epilog have no symbols. */
         struct si_shader *sh = bin[i] == &shader->binary ? shader : shader->previous_stage;
         assert(sh && bin[i] == &sh->binary);

         si_aco_resolve_symbols(sh, rx_ptr + exec_offset, (const uint32_t *)bin[i]->code_buffer,
                                scratch_va, const_offset);
      }

      exec_offset += bin[i]->exec_size;

      /* Append this part's const data (if any) to the data region. */
      unsigned data_size = bin[i]->code_size - bin[i]->exec_size;
      if (data_size) {
         memcpy(rx_ptr + data_offset, bin[i]->code_buffer + bin[i]->exec_size, data_size);
         data_offset += data_size;
      }
   }

   post_upload_binary(sscreen, shader, rx_ptr, code_size, code_size, dma_upload,
                      upload_ctx, staging, staging_offset);

   calculate_needed_lds_size(sscreen, shader);
   return code_size;
}
1133 
si_shader_binary_upload_at(struct si_screen * sscreen,struct si_shader * shader,uint64_t scratch_va,int64_t bo_offset)1134 int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shader,
1135                                uint64_t scratch_va, int64_t bo_offset)
1136 {
1137    bool dma_upload = !(sscreen->debug_flags & DBG(NO_DMA_SHADERS)) && sscreen->info.has_cp_dma &&
1138                      sscreen->info.has_dedicated_vram && !sscreen->info.all_vram_visible &&
1139                      bo_offset < 0;
1140 
1141    if (shader->binary.type == SI_SHADER_BINARY_ELF) {
1142       return upload_binary_elf(sscreen, shader, scratch_va, dma_upload, bo_offset);
1143    } else {
1144       assert(shader->binary.type == SI_SHADER_BINARY_RAW);
1145       return upload_binary_raw(sscreen, shader, scratch_va, dma_upload, bo_offset);
1146    }
1147 }
1148 
/* Upload a shader binary into a newly allocated BO (bo_offset = -1). */
int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
                            uint64_t scratch_va)
{
   const int64_t alloc_new_bo = -1;

   return si_shader_binary_upload_at(sscreen, shader, scratch_va, alloc_new_bo);
}
1154 
print_disassembly(const char * disasm,size_t nbytes,const char * name,FILE * file,struct util_debug_callback * debug)1155 static void print_disassembly(const char *disasm, size_t nbytes,
1156                               const char *name, FILE *file,
1157                               struct util_debug_callback *debug)
1158 {
1159    if (debug && debug->debug_message) {
1160       /* Very long debug messages are cut off, so send the
1161        * disassembly one line at a time. This causes more
1162        * overhead, but on the plus side it simplifies
1163        * parsing of resulting logs.
1164        */
1165       util_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");
1166 
1167       uint64_t line = 0;
1168       while (line < nbytes) {
1169          int count = nbytes - line;
1170          const char *nl = memchr(disasm + line, '\n', nbytes - line);
1171          if (nl)
1172             count = nl - (disasm + line);
1173 
1174          if (count) {
1175             util_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
1176          }
1177 
1178          line += count + 1;
1179       }
1180 
1181       util_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
1182    }
1183 
1184    if (file) {
1185       fprintf(file, "Shader %s disassembly:\n", name);
1186       fprintf(file, "%*s", (int)nbytes, disasm);
1187    }
1188 }
1189 
/* Dump the disassembly of one shader binary to "file" and/or the debug
 * callback. RAW binaries carry their disassembly string directly; ELF
 * binaries are opened with rtld and the ".AMDGPU.disasm" section is printed.
 */
static void si_shader_dump_disassembly(struct si_screen *screen,
                                       const struct si_shader_binary *binary,
                                       gl_shader_stage stage, unsigned wave_size,
                                       struct util_debug_callback *debug, const char *name,
                                       FILE *file)
{
   if (binary->type == SI_SHADER_BINARY_RAW) {
      print_disassembly(binary->disasm_string, binary->disasm_size, name, file, debug);
      return;
   }

   struct ac_rtld_binary rtld_binary;
   struct ac_rtld_open_info open_info = {
      .info = &screen->info,
      .shader_type = stage,
      .wave_size = wave_size,
      .num_parts = 1,
      .elf_ptrs = &binary->code_buffer,
      .elf_sizes = &binary->code_size,
   };

   if (!ac_rtld_open(&rtld_binary, open_info))
      return;

   const char *disasm;
   size_t nbytes;

   /* Print only if the section exists and fits the int-sized print bound. */
   if (ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes) &&
       nbytes <= INT_MAX)
      print_disassembly(disasm, nbytes, name, file, debug);

   ac_rtld_close(&rtld_binary);
}
1226 
/* Compute shader->info.max_simd_waves: the maximum number of waves of this
 * shader that can be resident on one SIMD, limited in turn by the hardware
 * wave cap, SGPR usage, VGPR usage, and (for PS/CS) LDS usage.
 */
static void si_calculate_max_simd_waves(struct si_shader *shader)
{
   struct si_screen *sscreen = shader->selector->screen;
   struct ac_shader_config *conf = &shader->config;
   unsigned lds_increment = get_lds_granularity(sscreen, shader->selector->stage);
   unsigned lds_per_wave = 0;
   unsigned max_simd_waves;

   max_simd_waves = sscreen->info.max_waves_per_simd;

   /* Compute LDS usage for PS. */
   switch (shader->selector->stage) {
   case MESA_SHADER_FRAGMENT:
      /* The minimum usage per wave is (num_inputs * 48). The maximum
       * usage is (num_inputs * 48 * 16).
       * We can get anything in between and it varies between waves.
       *
       * The 48 bytes per input for a single primitive is equal to
       * 4 bytes/component * 4 components/input * 3 points.
       *
       * Other stages don't know the size at compile time or don't
       * allocate LDS per wave, but instead they do it per thread group.
       */
      lds_per_wave = conf->lds_size * lds_increment +
                     align(shader->info.num_ps_inputs * 48, lds_increment);
      break;
   case MESA_SHADER_COMPUTE: {
         /* CS allocates LDS per workgroup; divide it across the workgroup's waves. */
         unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
         lds_per_wave = (conf->lds_size * lds_increment) /
                        DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
      }
      break;
   default:;
   }

   /* Compute the per-SIMD wave counts. */
   if (conf->num_sgprs) {
      max_simd_waves =
         MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
   }

   if (conf->num_vgprs) {
      /* GFX 10.3 internally:
       * - aligns VGPRS to 16 for Wave32 and 8 for Wave64
       * - aligns LDS to 1024
       *
       * For shader-db stats, set num_vgprs that the hw actually uses.
       */
      unsigned num_vgprs = conf->num_vgprs;
      if (sscreen->info.gfx_level >= GFX10_3) {
         unsigned real_vgpr_gran = sscreen->info.num_physical_wave64_vgprs_per_simd / 64;
         num_vgprs = util_align_npot(num_vgprs, real_vgpr_gran * (shader->wave_size == 32 ? 2 : 1));
      } else {
         num_vgprs = align(num_vgprs, shader->wave_size == 32 ? 8 : 4);
      }

      /* Always print wave limits as Wave64, so that we can compare
       * Wave32 and Wave64 with shader-db fairly. */
      unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
      max_simd_waves = MIN2(max_simd_waves, max_vgprs / num_vgprs);
   }

   /* NOTE(review): divides the per-workgroup LDS size by 4 to get a per-SIMD
    * budget — presumably because a workgroup spans the 4 SIMDs of a CU;
    * confirm against hardware docs. */
   unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
   if (lds_per_wave)
      max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);

   shader->info.max_simd_waves = max_simd_waves;
}
1295 
si_shader_dump_stats_for_shader_db(struct si_screen * screen,struct si_shader * shader,struct util_debug_callback * debug)1296 void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
1297                                         struct util_debug_callback *debug)
1298 {
1299    const struct ac_shader_config *conf = &shader->config;
1300    static const char *stages[] = {"VS", "TCS", "TES", "GS", "PS", "CS"};
1301 
1302    if (screen->options.debug_disassembly)
1303       si_shader_dump_disassembly(screen, &shader->binary, shader->selector->stage,
1304                                  shader->wave_size, debug, "main", NULL);
1305 
1306    unsigned num_ls_outputs = 0;
1307    unsigned num_hs_outputs = 0;
1308    unsigned num_es_outputs = 0;
1309    unsigned num_gs_outputs = 0;
1310    unsigned num_vs_outputs = 0;
1311    unsigned num_ps_outputs = 0;
1312 
1313    if (shader->selector->stage <= MESA_SHADER_GEOMETRY) {
1314       /* This doesn't include pos exports because only param exports are interesting
1315        * for performance and can be optimized.
1316        */
1317       if (shader->key.ge.as_ls)
1318          num_ls_outputs = si_shader_lshs_vertex_stride(shader) / 16;
1319       else if (shader->selector->stage == MESA_SHADER_TESS_CTRL)
1320          num_hs_outputs = util_last_bit64(shader->selector->info.tcs_outputs_written_for_tes);
1321       else if (shader->key.ge.as_es)
1322          num_es_outputs = shader->selector->info.esgs_vertex_stride / 16;
1323       else if (shader->gs_copy_shader)
1324          num_gs_outputs = shader->gs_copy_shader->info.nr_param_exports;
1325       else if (shader->selector->stage == MESA_SHADER_GEOMETRY)
1326          num_gs_outputs = shader->info.nr_param_exports;
1327       else if (shader->selector->stage == MESA_SHADER_VERTEX ||
1328                shader->selector->stage == MESA_SHADER_TESS_EVAL)
1329          num_vs_outputs = shader->info.nr_param_exports;
1330       else
1331          unreachable("invalid shader key");
1332    } else if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
1333       num_ps_outputs = util_bitcount(shader->selector->info.colors_written) +
1334                        (shader->ps.writes_z ||
1335                         shader->ps.writes_stencil ||
1336                         shader->ps.writes_samplemask);
1337    }
1338 
1339    util_debug_message(debug, SHADER_INFO,
1340                       "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
1341                       "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
1342                       "Spilled VGPRs: %d PrivMem VGPRs: %d LSOutputs: %u HSOutputs: %u "
1343                       "HSPatchOuts: %u ESOutputs: %u GSOutputs: %u VSOutputs: %u PSOutputs: %u "
1344                       "InlineUniforms: %u DivergentLoop: %u (%s, W%u)",
1345                       conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
1346                       conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
1347                       conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
1348                       num_ls_outputs, num_hs_outputs,
1349                       util_last_bit(shader->selector->info.patch_outputs_written_for_tes),
1350                       num_es_outputs, num_gs_outputs, num_vs_outputs, num_ps_outputs,
1351                       shader->selector->info.base.num_inlinable_uniforms,
1352                       shader->selector->info.has_divergent_loop,
1353                       stages[shader->selector->stage], shader->wave_size);
1354 }
1355 
/* Return whether the given dump type should be printed for this stage.
 *
 * Both the per-stage debug bit and the per-dump-type filter bit must be
 * set in sscreen->debug_flags.
 */
bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage,
                        enum si_shader_dump_type dump_type)
{
   /* Debug flags that enable each dump type. */
   static const uint64_t filter[] = {
      [SI_DUMP_SHADER_KEY] = DBG(NIR) | DBG(INIT_LLVM) | DBG(LLVM) | DBG(INIT_ACO) | DBG(ACO) | DBG(ASM),
      [SI_DUMP_INIT_NIR] = DBG(INIT_NIR),
      [SI_DUMP_NIR] = DBG(NIR),
      [SI_DUMP_INIT_LLVM_IR] = DBG(INIT_LLVM),
      [SI_DUMP_LLVM_IR] = DBG(LLVM),
      [SI_DUMP_INIT_ACO_IR] = DBG(INIT_ACO),
      [SI_DUMP_ACO_IR] = DBG(ACO),
      [SI_DUMP_ASM] = DBG(ASM),
      [SI_DUMP_STATS] = DBG(STATS),
      [SI_DUMP_ALWAYS] = DBG(VS) | DBG(TCS) | DBG(TES) | DBG(GS) | DBG(PS) | DBG(CS),
   };
   assert(dump_type < ARRAY_SIZE(filter));

   if (!(sscreen->debug_flags & (1 << stage)))
      return false;

   return (sscreen->debug_flags & filter[dump_type]) != 0;
}
1376 
/* Print the hardware shader config (fragment shaders only) and the
 * resource-usage statistics to "file" in human-readable form.
 *
 * NOTE(review): check_debug_option is not read in this function body;
 * presumably it is kept to mirror si_shader_dump()'s signature -- confirm.
 */
static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
                                 bool check_debug_option)
{
   const struct ac_shader_config *conf = &shader->config;

   /* Only fragment shaders have the SPI_PS_INPUT_* config registers. */
   if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
      fprintf(file,
              "*** SHADER CONFIG ***\n"
              "SPI_PS_INPUT_ADDR = 0x%04x\n"
              "SPI_PS_INPUT_ENA  = 0x%04x\n",
              conf->spi_ps_input_addr, conf->spi_ps_input_ena);
   }

   /* LDS size is stored in HW granularity units; convert to bytes. */
   fprintf(file,
           "*** SHADER STATS ***\n"
           "SGPRS: %d\n"
           "VGPRS: %d\n"
           "Spilled SGPRs: %d\n"
           "Spilled VGPRs: %d\n"
           "Private memory VGPRs: %d\n"
           "Code Size: %d bytes\n"
           "LDS: %d bytes\n"
           "Scratch: %d bytes per wave\n"
           "Max Waves: %d\n"
           "********************\n\n\n",
           conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
           shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
           conf->lds_size * get_lds_granularity(sscreen, shader->selector->stage),
           conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
}
1407 
si_get_shader_name(const struct si_shader * shader)1408 const char *si_get_shader_name(const struct si_shader *shader)
1409 {
1410    switch (shader->selector->stage) {
1411    case MESA_SHADER_VERTEX:
1412       if (shader->key.ge.as_es)
1413          return "Vertex Shader as ES";
1414       else if (shader->key.ge.as_ls)
1415          return "Vertex Shader as LS";
1416       else if (shader->key.ge.as_ngg)
1417          return "Vertex Shader as ESGS";
1418       else
1419          return "Vertex Shader as VS";
1420    case MESA_SHADER_TESS_CTRL:
1421       return "Tessellation Control Shader";
1422    case MESA_SHADER_TESS_EVAL:
1423       if (shader->key.ge.as_es)
1424          return "Tessellation Evaluation Shader as ES";
1425       else if (shader->key.ge.as_ngg)
1426          return "Tessellation Evaluation Shader as ESGS";
1427       else
1428          return "Tessellation Evaluation Shader as VS";
1429    case MESA_SHADER_GEOMETRY:
1430       if (shader->is_gs_copy_shader)
1431          return "GS Copy Shader as VS";
1432       else
1433          return "Geometry Shader";
1434    case MESA_SHADER_FRAGMENT:
1435       return "Pixel Shader";
1436    case MESA_SHADER_COMPUTE:
1437       return "Compute Shader";
1438    default:
1439       return "Unknown Shader";
1440    }
1441 }
1442 
si_shader_dump(struct si_screen * sscreen,struct si_shader * shader,struct util_debug_callback * debug,FILE * file,bool check_debug_option)1443 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
1444                     struct util_debug_callback *debug, FILE *file, bool check_debug_option)
1445 {
1446    gl_shader_stage stage = shader->selector->stage;
1447 
1448    if (!check_debug_option || si_can_dump_shader(sscreen, stage, SI_DUMP_SHADER_KEY))
1449       si_dump_shader_key(shader, file);
1450 
1451    if (!check_debug_option && shader->binary.llvm_ir_string) {
1452       /* This is only used with ddebug. */
1453       if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
1454          fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
1455          fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
1456       }
1457 
1458       fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
1459       fprintf(file, "%s\n", shader->binary.llvm_ir_string);
1460    }
1461 
1462    if (!check_debug_option || (si_can_dump_shader(sscreen, stage, SI_DUMP_ASM))) {
1463       fprintf(file, "\n%s:\n", si_get_shader_name(shader));
1464 
1465       if (shader->prolog)
1466          si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, shader->wave_size, debug,
1467                                     "prolog", file);
1468       if (shader->previous_stage)
1469          si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
1470                                     shader->wave_size, debug, "previous stage", file);
1471       si_shader_dump_disassembly(sscreen, &shader->binary, stage, shader->wave_size, debug, "main",
1472                                  file);
1473 
1474       if (shader->epilog)
1475          si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, shader->wave_size, debug,
1476                                     "epilog", file);
1477       fprintf(file, "\n");
1478 
1479       si_shader_dump_stats(sscreen, shader, file, check_debug_option);
1480    }
1481 }
1482 
si_dump_shader_key_vs(const union si_shader_key * key,FILE * f)1483 static void si_dump_shader_key_vs(const union si_shader_key *key, FILE *f)
1484 {
1485    fprintf(f, "  mono.instance_divisor_is_one = %u\n", key->ge.mono.instance_divisor_is_one);
1486    fprintf(f, "  mono.instance_divisor_is_fetched = %u\n",
1487            key->ge.mono.instance_divisor_is_fetched);
1488    fprintf(f, "  mono.vs.fetch_opencode = %x\n", key->ge.mono.vs_fetch_opencode);
1489    fprintf(f, "  mono.vs.fix_fetch = {");
1490    for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
1491       union si_vs_fix_fetch fix = key->ge.mono.vs_fix_fetch[i];
1492       if (i)
1493          fprintf(f, ", ");
1494       if (!fix.bits)
1495          fprintf(f, "0");
1496       else
1497          fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
1498                  fix.u.format);
1499    }
1500    fprintf(f, "}\n");
1501 }
1502 
/* Print the whole shader key to "f" for debugging.
 *
 * Only the fields relevant to the shader's stage are printed. Stages that
 * can have a VS merged into them on GFX9+ (HS, GS) also print the VS
 * vertex-fetch portion of the key.
 */
static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
{
   const union si_shader_key *key = &shader->key;
   gl_shader_stage stage = shader->selector->stage;

   fprintf(f, "SHADER KEY\n");
   fprintf(f, "  source_blake3 = {");
   _mesa_blake3_print(f, shader->selector->info.base.source_blake3);
   fprintf(f, "}\n");

   /* Stage-specific key fields. */
   switch (stage) {
   case MESA_SHADER_VERTEX:
      si_dump_shader_key_vs(key, f);
      fprintf(f, "  as_es = %u\n", key->ge.as_es);
      fprintf(f, "  as_ls = %u\n", key->ge.as_ls);
      fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
      fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
      break;

   case MESA_SHADER_TESS_CTRL:
      /* On GFX9+ the VS is merged into the HS, so its key fields apply here. */
      if (shader->selector->screen->info.gfx_level >= GFX9)
         si_dump_shader_key_vs(key, f);

      fprintf(f, "  opt.tes_prim_mode = %u\n", key->ge.opt.tes_prim_mode);
      fprintf(f, "  opt.tes_reads_tess_factors = %u\n", key->ge.opt.tes_reads_tess_factors);
      fprintf(f, "  opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
      fprintf(f, "  opt.same_patch_vertices = %u\n", key->ge.opt.same_patch_vertices);
      break;

   case MESA_SHADER_TESS_EVAL:
      fprintf(f, "  as_es = %u\n", key->ge.as_es);
      fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
      fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
      break;

   case MESA_SHADER_GEOMETRY:
      /* The GS copy shader has no interesting key fields. */
      if (shader->is_gs_copy_shader)
         break;

      /* On GFX9+ the ES is merged into the GS; print its VS key if the ES
       * stage is a vertex shader.
       */
      if (shader->selector->screen->info.gfx_level >= GFX9 &&
          key->ge.part.gs.es->stage == MESA_SHADER_VERTEX)
         si_dump_shader_key_vs(key, f);

      fprintf(f, "  mono.u.gs_tri_strip_adj_fix = %u\n", key->ge.mono.u.gs_tri_strip_adj_fix);
      fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
      break;

   case MESA_SHADER_COMPUTE:
      break;

   case MESA_SHADER_FRAGMENT:
      /* PS prolog (interpolation fixups) and epilog (export format / alpha
       * test) key fields.
       */
      fprintf(f, "  prolog.color_two_side = %u\n", key->ps.part.prolog.color_two_side);
      fprintf(f, "  prolog.flatshade_colors = %u\n", key->ps.part.prolog.flatshade_colors);
      fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.part.prolog.poly_stipple);
      fprintf(f, "  prolog.force_persp_sample_interp = %u\n",
              key->ps.part.prolog.force_persp_sample_interp);
      fprintf(f, "  prolog.force_linear_sample_interp = %u\n",
              key->ps.part.prolog.force_linear_sample_interp);
      fprintf(f, "  prolog.force_persp_center_interp = %u\n",
              key->ps.part.prolog.force_persp_center_interp);
      fprintf(f, "  prolog.force_linear_center_interp = %u\n",
              key->ps.part.prolog.force_linear_center_interp);
      fprintf(f, "  prolog.bc_optimize_for_persp = %u\n",
              key->ps.part.prolog.bc_optimize_for_persp);
      fprintf(f, "  prolog.bc_optimize_for_linear = %u\n",
              key->ps.part.prolog.bc_optimize_for_linear);
      fprintf(f, "  prolog.samplemask_log_ps_iter = %u\n",
              key->ps.part.prolog.samplemask_log_ps_iter);
      fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n",
              key->ps.part.epilog.spi_shader_col_format);
      fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.part.epilog.color_is_int8);
      fprintf(f, "  epilog.color_is_int10 = 0x%X\n", key->ps.part.epilog.color_is_int10);
      fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.part.epilog.last_cbuf);
      fprintf(f, "  epilog.alpha_func = %u\n", key->ps.part.epilog.alpha_func);
      fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.part.epilog.alpha_to_one);
      fprintf(f, "  epilog.alpha_to_coverage_via_mrtz = %u\n", key->ps.part.epilog.alpha_to_coverage_via_mrtz);
      fprintf(f, "  epilog.clamp_color = %u\n", key->ps.part.epilog.clamp_color);
      fprintf(f, "  epilog.dual_src_blend_swizzle = %u\n", key->ps.part.epilog.dual_src_blend_swizzle);
      fprintf(f, "  epilog.rbplus_depth_only_opt = %u\n", key->ps.part.epilog.rbplus_depth_only_opt);
      fprintf(f, "  epilog.kill_z = %u\n", key->ps.part.epilog.kill_z);
      fprintf(f, "  epilog.kill_stencil = %u\n", key->ps.part.epilog.kill_stencil);
      fprintf(f, "  epilog.kill_samplemask = %u\n", key->ps.part.epilog.kill_samplemask);
      fprintf(f, "  mono.poly_line_smoothing = %u\n", key->ps.mono.poly_line_smoothing);
      fprintf(f, "  mono.point_smoothing = %u\n", key->ps.mono.point_smoothing);
      fprintf(f, "  mono.interpolate_at_sample_force_center = %u\n",
              key->ps.mono.interpolate_at_sample_force_center);
      fprintf(f, "  mono.fbfetch_msaa = %u\n", key->ps.mono.fbfetch_msaa);
      fprintf(f, "  mono.fbfetch_is_1D = %u\n", key->ps.mono.fbfetch_is_1D);
      fprintf(f, "  mono.fbfetch_layered = %u\n", key->ps.mono.fbfetch_layered);
      break;

   default:
      assert(0);
   }

   /* Output-killing optimizations only apply to the stage that feeds the
    * rasterizer (not ES/LS variants).
    */
   if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||
        stage == MESA_SHADER_VERTEX) &&
       !key->ge.as_es && !key->ge.as_ls) {
      fprintf(f, "  mono.remove_streamout = 0x%x\n", key->ge.mono.remove_streamout);
      fprintf(f, "  opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs);
      fprintf(f, "  opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances);
      fprintf(f, "  opt.kill_pointsize = %u\n", key->ge.opt.kill_pointsize);
      fprintf(f, "  opt.kill_layer = %u\n", key->ge.opt.kill_layer);
      fprintf(f, "  opt.remove_streamout = %u\n", key->ge.opt.remove_streamout);
      fprintf(f, "  opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling);
      fprintf(f, "  opt.ngg_vs_streamout_num_verts_per_prim = %u\n",
              key->ge.opt.ngg_vs_streamout_num_verts_per_prim);
   }

   /* prefer_mono lives in different key unions for geometry vs pixel stages. */
   if (stage <= MESA_SHADER_GEOMETRY)
      fprintf(f, "  opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
   else
      fprintf(f, "  opt.prefer_mono = %u\n", key->ps.opt.prefer_mono);

   /* Inlined uniform values (also union-dependent). */
   if (stage <= MESA_SHADER_GEOMETRY) {
      if (key->ge.opt.inline_uniforms) {
         fprintf(f, "  opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
                 key->ge.opt.inline_uniforms,
                 key->ge.opt.inlined_uniform_values[0],
                 key->ge.opt.inlined_uniform_values[1],
                 key->ge.opt.inlined_uniform_values[2],
                 key->ge.opt.inlined_uniform_values[3]);
      } else {
         fprintf(f, "  opt.inline_uniforms = 0\n");
      }
   } else {
      if (key->ps.opt.inline_uniforms) {
         fprintf(f, "  opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
                 key->ps.opt.inline_uniforms,
                 key->ps.opt.inlined_uniform_values[0],
                 key->ps.opt.inlined_uniform_values[1],
                 key->ps.opt.inlined_uniform_values[2],
                 key->ps.opt.inlined_uniform_values[3]);
      } else {
         fprintf(f, "  opt.inline_uniforms = 0\n");
      }
   }
}
1641 
1642 /* TODO: convert to nir_shader_instructions_pass */
/* Remove output stores that the shader key marks as unused by later
 * stages: generic varyings (kill_outputs), point size, layer, and clip
 * distances. LAYER is additionally always removed as a varying because
 * the FS loads it as a system value instead.
 *
 * Returns true if any instruction was removed (progress).
 */
static bool si_nir_kill_outputs(nir_shader *nir, const union si_shader_key *key)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);
   assert(nir->info.stage <= MESA_SHADER_GEOMETRY);

   /* Early out when nothing can be killed and no LAYER output exists. */
   if (!key->ge.opt.kill_outputs &&
       !key->ge.opt.kill_pointsize &&
       !key->ge.opt.kill_layer &&
       !key->ge.opt.kill_clip_distances &&
       !(nir->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER))) {
      nir_metadata_preserve(impl, nir_metadata_all);
      return false;
   }

   bool progress = false;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         assert(intr->num_components == 1); /* only scalar stores expected */
         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

         /* Generic varyings killed by bit index in kill_outputs. */
         if (nir_slot_is_varying(sem.location, MESA_SHADER_FRAGMENT) &&
             key->ge.opt.kill_outputs &
             (1ull << si_shader_io_get_unique_index(sem.location)))
            progress |= nir_remove_varying(intr, MESA_SHADER_FRAGMENT);

         /* System-value outputs killed by their dedicated key bits. */
         switch (sem.location) {
         case VARYING_SLOT_PSIZ:
            if (key->ge.opt.kill_pointsize)
               progress |= nir_remove_sysval_output(intr, MESA_SHADER_FRAGMENT);
            break;

         case VARYING_SLOT_CLIP_VERTEX:
            /* TODO: We should only kill specific clip planes as required by kill_clip_distance,
             * not whole gl_ClipVertex. Lower ClipVertex in NIR.
             */
            if ((key->ge.opt.kill_clip_distances & SI_USER_CLIP_PLANE_MASK) ==
                SI_USER_CLIP_PLANE_MASK)
               progress |= nir_remove_sysval_output(intr, MESA_SHADER_FRAGMENT);
            break;

         case VARYING_SLOT_CLIP_DIST0:
         case VARYING_SLOT_CLIP_DIST1:
            if (key->ge.opt.kill_clip_distances) {
               assert(nir_intrinsic_src_type(intr) == nir_type_float32);
               /* Each CLIP_DIST slot holds 4 scalar distances; compute the
                * flat clip-distance index from slot + component.
                */
               unsigned index = (sem.location - VARYING_SLOT_CLIP_DIST0) * 4 +
                                nir_intrinsic_component(intr);

               if (key->ge.opt.kill_clip_distances & BITFIELD_BIT(index))
                  progress |= nir_remove_sysval_output(intr, MESA_SHADER_FRAGMENT);
            }
            break;

         case VARYING_SLOT_LAYER:
            /* LAYER is never passed to FS. Instead, we load it there as a system value. */
            progress |= nir_remove_varying(intr, MESA_SHADER_FRAGMENT);

            if (key->ge.opt.kill_layer)
               progress |= nir_remove_sysval_output(intr, MESA_SHADER_FRAGMENT);
            break;
         }
      }
   }

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_control_flow);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}
1727 
1728 /* Remove PS output components from NIR if they are disabled by spi_shader_col_format. */
/* Per-instruction callback: remove or trim PS color-output stores whose
 * components are disabled by spi_shader_col_format in the epilog key.
 * Depth/stencil are left alone; sample mask is removed when the key says
 * to kill it. Returns true on any change.
 */
static bool kill_ps_outputs_cb(struct nir_builder *b, nir_instr *instr, void *_key)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_store_output)
      return false;

   /* No indirect indexing allowed. */
   ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
   assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

   unsigned location = nir_intrinsic_io_semantics(intr).location;
   const union si_shader_key *key = _key;

   switch (location) {
   case FRAG_RESULT_DEPTH:
   case FRAG_RESULT_STENCIL:
      return false;

   case FRAG_RESULT_SAMPLE_MASK:
      if (key->ps.part.epilog.kill_samplemask) {
         nir_instr_remove(instr);
         return true;
      }
      return false;
   }

   /* Color outputs. */
   unsigned component = nir_intrinsic_component(intr);
   unsigned comp_mask = BITFIELD_RANGE(component, intr->num_components);
   unsigned cb_shader_mask = ac_get_cb_shader_mask(key->ps.part.epilog.spi_shader_col_format);

   /* Preserve alpha if ALPHA_TESTING is enabled. */
   if (key->ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS ||
       key->ps.part.epilog.alpha_to_coverage_via_mrtz)
      cb_shader_mask |= 1 << 3;

   /* If COLOR is broadcasted to multiple color buffers, combine their masks. */
   if (location == FRAG_RESULT_COLOR) {
      for (unsigned i = 1; i <= key->ps.part.epilog.last_cbuf; i++)
         cb_shader_mask |= (cb_shader_mask >> (i * 4)) & 0xf;
   }

   /* 4 mask bits per color buffer; pick this output's nibble. */
   unsigned index = location == FRAG_RESULT_COLOR ? 0 : location - FRAG_RESULT_DATA0;
   unsigned output_mask = (cb_shader_mask >> (index * 4)) & 0xf;

   /* All stored components enabled: keep the store unchanged. */
   if ((output_mask & comp_mask) == comp_mask)
      return false;

   /* No stored component enabled: drop the whole store. */
   if (!(output_mask & comp_mask)) {
      nir_instr_remove(instr);
      return true;
   }

   /* Fill disabled components with undef. */
   b->cursor = nir_before_instr(instr);
   nir_def *new_value = intr->src[0].ssa;
   nir_def *undef = nir_undef(b, 1, new_value->bit_size);

   unsigned kill_mask = (~output_mask & comp_mask) >> component;
   u_foreach_bit(i, kill_mask) {
      new_value = nir_vector_insert_imm(b, new_value, undef, i);
   }

   nir_src_rewrite(&intr->src[0], new_value);
   return true;
}
1798 
/* Run kill_ps_outputs_cb over the whole fragment shader. */
static bool si_nir_kill_ps_outputs(nir_shader *nir, const union si_shader_key *key)
{
   assert(nir->info.stage == MESA_SHADER_FRAGMENT);

   return nir_shader_instructions_pass(nir, kill_ps_outputs_cb, nir_metadata_control_flow,
                                       (void *)key);
}
1805 
/* Wrap stores to front/back color outputs in a conditional saturate,
 * selected at runtime by the clamp_vertex_color state.
 */
static bool clamp_vertex_color_instr(nir_builder *b,
                                     nir_intrinsic_instr *intrin, void *state)
{
   if (intrin->intrinsic != nir_intrinsic_store_output)
      return false;

   /* Only front/back color outputs are affected. */
   switch (nir_intrinsic_io_semantics(intrin).location) {
   case VARYING_SLOT_COL0:
   case VARYING_SLOT_COL1:
   case VARYING_SLOT_BFC0:
   case VARYING_SLOT_BFC1:
      break;
   default:
      return false;
   }

   /* no indirect output */
   assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));
   /* only scalar output */
   assert(intrin->src[0].ssa->num_components == 1);

   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *value = intrin->src[0].ssa;
   nir_def *do_clamp = nir_load_clamp_vertex_color_amd(b);
   nir_def *result = nir_bcsel(b, do_clamp, nir_fsat(b, value), value);
   nir_src_rewrite(&intrin->src[0], result);

   return true;
}
1831 
/* Apply clamp_vertex_color_instr to all color output stores; returns
 * false immediately when the shader writes no color outputs.
 */
static bool si_nir_clamp_vertex_color(nir_shader *nir)
{
   const uint64_t color_outputs =
      VARYING_BIT_COL0 | VARYING_BIT_COL1 | VARYING_BIT_BFC0 | VARYING_BIT_BFC1;

   if (!(nir->info.outputs_written & color_outputs))
      return false;

   return nir_shader_intrinsics_pass(nir, clamp_vertex_color_instr,
                                     nir_metadata_control_flow, NULL);
}
1842 
si_map_io_driver_location(unsigned semantic)1843 static unsigned si_map_io_driver_location(unsigned semantic)
1844 {
1845    if ((semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_TESS_MAX) ||
1846        semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
1847        semantic == VARYING_SLOT_TESS_LEVEL_OUTER)
1848       return ac_shader_io_get_unique_index_patch(semantic);
1849 
1850    return si_shader_io_get_unique_index(semantic);
1851 }
1852 
/* Lower cross-stage shader I/O to LDS/memory accesses for merged and
 * multi-part pipelines (LS->HS, ES->GS, HS->TES).
 *
 * Returns true if any lowering pass was applied for this stage/key
 * combination, false otherwise.
 */
static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir)
{
   struct si_shader_selector *sel = shader->selector;
   struct si_shader_selector *next_sel = shader->next_shader ? shader->next_shader->selector : sel;
   const union si_shader_key *key = &shader->key;
   /* GFX9+ merges LS into HS; monolithic compiles can then pass some TCS
    * inputs through temporaries instead of LDS.
    */
   const bool is_gfx9_mono_tcs = shader->is_monolithic &&
                                 next_sel->stage == MESA_SHADER_TESS_CTRL &&
                                 sel->screen->info.gfx_level >= GFX9;

   if (nir->info.stage == MESA_SHADER_VERTEX) {
      if (key->ge.as_ls) {
         /* VS as LS: outputs go to LDS for the HS to read. */
         NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem,
                    is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
                    sel->screen->info.gfx_level,
                    key->ge.opt.same_patch_vertices,
                    is_gfx9_mono_tcs ? next_sel->info.tcs_inputs_via_temp : 0,
                    is_gfx9_mono_tcs ? next_sel->info.tcs_inputs_via_lds : ~0ull);
         return true;
      } else if (key->ge.as_es) {
         /* VS as ES: outputs go to the ES-GS ring/LDS. */
         NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
                    sel->screen->info.gfx_level, sel->info.esgs_vertex_stride, ~0ULL);
         return true;
      }
   } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
      NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem,
                 is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
                 sel->screen->info.gfx_level, key->ge.opt.same_patch_vertices,
                 sel->info.tcs_inputs_via_temp, sel->info.tcs_inputs_via_lds);

      /* Used by hs_emit_write_tess_factors() when monolithic shader. */
      if (nir->info.tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED)
         nir->info.tess._primitive_mode = key->ge.opt.tes_prim_mode;

      nir_tcs_info tcs_info;
      nir_gather_tcs_info(nir, &tcs_info, nir->info.tess._primitive_mode,
                          nir->info.tess.spacing);

      NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, &tcs_info, si_map_io_driver_location,
                 sel->screen->info.gfx_level,
                 ~0ULL, ~0U, /* no TES inputs filter */
                 shader->wave_size);
      return true;
   } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
      NIR_PASS_V(nir, ac_nir_lower_tes_inputs_to_mem, si_map_io_driver_location);

      /* TES can itself feed a GS, in which case its outputs are ES outputs. */
      if (key->ge.as_es) {
         NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
                    sel->screen->info.gfx_level, sel->info.esgs_vertex_stride, ~0ULL);
      }

      return true;
   } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      NIR_PASS_V(nir, ac_nir_lower_gs_inputs_to_mem, si_map_io_driver_location,
                 sel->screen->info.gfx_level, key->ge.mono.u.gs_tri_strip_adj_fix);
      return true;
   }

   return false;
}
1912 
/* Lower the shader to NGG form: builds the ac_nir_lower_ngg options from
 * the shader key and screen info, then runs the nogs (VS/TES) or gs
 * lowering pass, followed by re-scalarizing any vector output stores the
 * pass may have generated.
 */
static void si_lower_ngg(struct si_shader *shader, nir_shader *nir)
{
   struct si_shader_selector *sel = shader->selector;
   const union si_shader_key *key = &shader->key;
   assert(key->ge.as_ngg);

   /* Clip distances that survive the key's kill mask, plus cull distances. */
   uint8_t clip_cull_dist_mask =
      (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) |
      sel->info.culldist_mask;

   ac_nir_lower_ngg_options options = {
      .family = sel->screen->info.family,
      .gfx_level = sel->screen->info.gfx_level,
      .max_workgroup_size = si_get_max_workgroup_size(shader),
      .wave_size = shader->wave_size,
      .can_cull = si_shader_culling_enabled(shader),
      .disable_streamout = !si_shader_uses_streamout(shader),
      .vs_output_param_offset = shader->info.vs_output_param_offset,
      .has_param_exports = shader->info.nr_param_exports,
      .clip_cull_dist_mask = clip_cull_dist_mask,
      .kill_pointsize = key->ge.opt.kill_pointsize,
      .kill_layer = key->ge.opt.kill_layer,
      .force_vrs = sel->screen->options.vrs2x2,
      .use_gfx12_xfb_intrinsic = !nir->info.use_aco_amd,
   };

   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_TESS_EVAL) {
      /* Per instance inputs, used to remove instance load after culling. */
      unsigned instance_rate_inputs = 0;

      if (nir->info.stage == MESA_SHADER_VERTEX) {
         instance_rate_inputs = key->ge.mono.instance_divisor_is_one |
                                key->ge.mono.instance_divisor_is_fetched;

         /* Manually mark the instance ID used, so the shader can repack it. */
         if (instance_rate_inputs)
            BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
      } else {
         /* Manually mark the primitive ID used, so the shader can repack it. */
         if (key->ge.mono.u.vs_export_prim_id)
            BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
      }

      unsigned clip_plane_enable =
         SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(key->ge.opt.ngg_culling);
      unsigned num_vertices = si_get_num_vertices_per_output_prim(shader);

      options.num_vertices_per_primitive = num_vertices ? num_vertices : 3;
      options.early_prim_export = gfx10_ngg_export_prim_early(shader);
      options.passthrough = gfx10_is_ngg_passthrough(shader);
      options.use_edgeflags = gfx10_has_variable_edgeflags(shader);
      options.has_gen_prim_query = options.has_xfb_prim_query =
         sel->screen->info.gfx_level >= GFX11 && !nir->info.vs.blit_sgprs_amd;
      options.export_primitive_id = key->ge.mono.u.vs_export_prim_id;
      options.instance_rate_inputs = instance_rate_inputs;
      options.user_clip_plane_enable_mask = clip_plane_enable;

      NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options);
   } else {
      assert(nir->info.stage == MESA_SHADER_GEOMETRY);

      options.gs_out_vtx_bytes = sel->info.gsvs_vertex_size;
      options.has_gen_prim_query = options.has_xfb_prim_query =
         sel->screen->info.gfx_level >= GFX11;
      options.has_gs_invocations_query = sel->screen->info.gfx_level < GFX11;
      options.has_gs_primitives_query = true;

      /* For monolithic ES/GS to add vscnt wait when GS export pos0. */
      if (key->ge.part.gs.es)
         nir->info.writes_memory |= key->ge.part.gs.es->info.base.writes_memory;

      NIR_PASS_V(nir, ac_nir_lower_ngg_gs, &options);
   }

   /* may generate some vector output store */
   NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
}
1991 
si_deserialize_shader(struct si_shader_selector * sel)1992 struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel)
1993 {
1994    struct pipe_screen *screen = &sel->screen->b;
1995    const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR, sel->stage);
1996 
1997    struct blob_reader blob_reader;
1998    blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
1999    return nir_deserialize(NULL, options, &blob_reader);
2000 }
2001 
/* Assign a sequential param-export index to every FS-visible varying output
 * and recompute the outputs_written masks.
 *
 * Walks all scalar store_output intrinsics in the entrypoint, records which
 * slots are actually still written (earlier passes may have removed outputs,
 * so nir->info can be stale), and hands out info->nr_param_exports indices in
 * instruction order. Slots listed in slot_remap are duplicates and inherit
 * the offset of the slot they were remapped to.
 *
 * slot_remap: per-slot redirection table filled by ac_nir_optimize_outputs;
 *             -1 means "not remapped".
 */
static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shader,
                                        int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS])
{
   struct si_shader_selector *sel = shader->selector;
   struct si_shader_binary_info *info = &shader->info;

   /* Rebuilt from the stores we actually see, then written back below. */
   uint64_t outputs_written = 0;
   uint32_t outputs_written_16bit = 0;

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         assert(intr->num_components == 1); /* only scalar stores expected */
         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

         /* 16-bit varyings live in a separate 32-bit mask. */
         if (sem.location >= VARYING_SLOT_VAR0_16BIT)
            outputs_written_16bit |= BITFIELD_BIT(sem.location - VARYING_SLOT_VAR0_16BIT);
         else
            outputs_written |= BITFIELD64_BIT(sem.location);

         /* Assign the param index if it's unassigned. Only stream-0 varyings
          * that can reach the FS get a slot; no_varying outputs are skipped.
          */
         if (nir_slot_is_varying(sem.location, MESA_SHADER_FRAGMENT) && !sem.no_varying &&
             (sem.gs_streams & 0x3) == 0 &&
             info->vs_output_param_offset[sem.location] == AC_EXP_PARAM_DEFAULT_VAL_0000) {
            /* The semantic and the base should be the same as in si_shader_info. */
            assert(sem.location == sel->info.output_semantic[nir_intrinsic_base(intr)]);
            /* It must not be remapped (duplicated). */
            assert(slot_remap[sem.location] == -1);

            info->vs_output_param_offset[sem.location] = info->nr_param_exports++;
         }
      }
   }

   /* Duplicated outputs are redirected here. */
   for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
      if (slot_remap[i] >= 0)
         info->vs_output_param_offset[i] = info->vs_output_param_offset[slot_remap[i]];
   }

   /* The primitive ID export (if requested by the key) always goes last. */
   if (shader->key.ge.mono.u.vs_export_prim_id) {
      info->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = info->nr_param_exports++;
   }

   /* Update outputs written info, we may remove some outputs before. */
   nir->info.outputs_written = outputs_written;
   nir->info.outputs_written_16bit = outputs_written_16bit;
}
2063 
/* Compute the final param-export layout for the last pre-rasterization stage:
 * constant outputs become DEFAULT_VAL encodings, everything else gets a
 * sequential export index.
 */
static void si_assign_param_offsets(nir_shader *nir, struct si_shader *shader)
{
   /* Start with an empty export list; every slot defaults to the
    * "constant (0, 0, 0, 0)" encoding until assigned below.
    */
   shader->info.nr_param_exports = 0;

   STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
          sizeof(shader->info.vs_output_param_offset));

   /* Redirection table for duplicated outputs, so that a single vertex
    * shader output can feed multiple fragment shader inputs.
    */
   int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS];
   memset(slot_remap, -1, sizeof(slot_remap));

   /* This sets DEFAULT_VAL for constant outputs in vs_output_param_offset
    * and fills slot_remap for duplicated outputs.
    */
   /* TODO: This doesn't affect GS. */
   NIR_PASS_V(nir, ac_nir_optimize_outputs, false, slot_remap,
              shader->info.vs_output_param_offset);

   /* Hand out indices to the remaining (non-constant) outputs. */
   /* TODO: Use this for the GS copy shader too. */
   si_nir_assign_param_offsets(nir, shader, slot_remap);
}
2088 
si_get_nr_pos_exports(const struct si_shader_selector * sel,const union si_shader_key * key)2089 static unsigned si_get_nr_pos_exports(const struct si_shader_selector *sel,
2090                                       const union si_shader_key *key)
2091 {
2092    const struct si_shader_info *info = &sel->info;
2093 
2094    /* Must have a position export. */
2095    unsigned nr_pos_exports = 1;
2096 
2097    if ((info->writes_psize && !key->ge.opt.kill_pointsize) ||
2098        (info->writes_edgeflag && !key->ge.as_ngg) ||
2099        (info->writes_layer && !key->ge.opt.kill_layer) ||
2100        info->writes_viewport_index || sel->screen->options.vrs2x2) {
2101       nr_pos_exports++;
2102    }
2103 
2104    unsigned clipdist_mask =
2105       (info->clipdist_mask & ~key->ge.opt.kill_clip_distances) | info->culldist_mask;
2106 
2107    for (int i = 0; i < 2; i++) {
2108       if (clipdist_mask & BITFIELD_RANGE(i * 4, 4))
2109          nr_pos_exports++;
2110    }
2111 
2112    return nr_pos_exports;
2113 }
2114 
lower_ps_load_color_intrinsic(nir_builder * b,nir_instr * instr,void * state)2115 static bool lower_ps_load_color_intrinsic(nir_builder *b, nir_instr *instr, void *state)
2116 {
2117    nir_def **colors = (nir_def **)state;
2118 
2119    if (instr->type != nir_instr_type_intrinsic)
2120       return false;
2121 
2122    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
2123 
2124    if (intrin->intrinsic != nir_intrinsic_load_color0 &&
2125        intrin->intrinsic != nir_intrinsic_load_color1)
2126       return false;
2127 
2128    unsigned index = intrin->intrinsic == nir_intrinsic_load_color0 ? 0 : 1;
2129    assert(colors[index]);
2130 
2131    nir_def_replace(&intrin->def, colors[index]);
2132    return true;
2133 }
2134 
/* Lower nir_intrinsic_load_color0/1 in a monolithic PS.
 *
 * At the start of the shader, load (and if needed interpolate) the front
 * color inputs, optionally the back colors for two-side lighting, select
 * between them with gl_FrontFacing, and then rewrite all load_color0/1
 * intrinsics to use the resulting values.
 *
 * key:  shader key; prolog bits select flat shading, two-side color and the
 *       interpolation overrides.
 * info: freshly rescanned shader info (uniform inlining/colormask may have
 *       eliminated color inputs, so the selector's info can be stale).
 *
 * Returns true on progress.
 */
static bool si_nir_lower_ps_color_input(nir_shader *nir, const union si_shader_key *key,
                                        const struct si_shader_info *info)
{
   bool progress = false;
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_builder builder = nir_builder_at(nir_before_impl(impl));
   nir_builder *b = &builder;

   /* Build ready to be used colors at the beginning of the shader. */
   nir_def *colors[2] = {0};
   for (int i = 0; i < 2; i++) {
      /* Skip colors whose 4 components are all unread. */
      if (!(info->colors_read & (0xf << (i * 4))))
         continue;

      unsigned color_base = info->color_attr_index[i];
      /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
       * otherwise it's at offset "num_inputs".
       */
      unsigned back_color_base = info->num_inputs;
      if (i == 1 && (info->colors_read & 0xf))
         back_color_base += 1;

      /* INTERP_MODE_COLOR means "follow the flat-shading state", which is
       * baked into the key for monolithic shaders.
       */
      enum glsl_interp_mode interp_mode = info->color_interpolate[i];
      if (interp_mode == INTERP_MODE_COLOR) {
         interp_mode = key->ps.part.prolog.flatshade_colors ?
            INTERP_MODE_FLAT : INTERP_MODE_SMOOTH;
      }

      nir_def *back_color = NULL;
      if (interp_mode == INTERP_MODE_FLAT) {
         /* Flat colors need no barycentrics: plain input loads. */
         colors[i] = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
                                   .base = color_base,
                                   .io_semantics.location = VARYING_SLOT_COL0 + i,
                                   .io_semantics.num_slots = 1);

         if (key->ps.part.prolog.color_two_side) {
            back_color = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
                                        .base = back_color_base,
                                        .io_semantics.location = VARYING_SLOT_BFC0 + i,
                                        .io_semantics.num_slots = 1);
         }
      } else {
         /* Pick the barycentric intrinsic matching the interpolation
          * location (center/centroid/sample).
          */
         nir_intrinsic_op op = 0;
         switch (info->color_interpolate_loc[i]) {
         case TGSI_INTERPOLATE_LOC_CENTER:
            op = nir_intrinsic_load_barycentric_pixel;
            break;
         case TGSI_INTERPOLATE_LOC_CENTROID:
            op = nir_intrinsic_load_barycentric_centroid;
            break;
         case TGSI_INTERPOLATE_LOC_SAMPLE:
            op = nir_intrinsic_load_barycentric_sample;
            break;
         default:
            unreachable("invalid color interpolate location");
            break;
         }

         nir_def *barycentric = nir_load_barycentric(b, op, interp_mode);

         colors[i] =
            nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0),
                                        .base = color_base,
                                        .io_semantics.location = VARYING_SLOT_COL0 + i,
                                        .io_semantics.num_slots = 1);

         if (key->ps.part.prolog.color_two_side) {
            back_color =
               nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0),
                                           .base = back_color_base,
                                           .io_semantics.location = VARYING_SLOT_BFC0 + i,
                                           .io_semantics.num_slots = 1);
         }
      }

      /* Two-side lighting: pick front or back color per fragment. */
      if (back_color) {
         nir_def *is_front_face = nir_load_front_face(b, 1);
         colors[i] = nir_bcsel(b, is_front_face, colors[i], back_color);
      }

      progress = true;
   }

   /* lower nir_load_color0/1 to use the color value. */
   return nir_shader_instructions_pass(nir, lower_ps_load_color_intrinsic,
                                       nir_metadata_control_flow,
                                       colors) || progress;
}
2224 
/* Prepend code that discards fragments according to the 32x32 polygon
 * stipple pattern stored in an internal constant buffer.
 */
static void si_nir_emit_polygon_stipple(nir_shader *nir, struct si_shader_args *args)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_builder b_storage = nir_builder_at(nir_before_impl(impl));
   nir_builder *b = &b_storage;

   /* Descriptor of the stipple pattern buffer. */
   nir_def *desc =
      si_nir_load_internal_binding(b, args, SI_PS_CONST_POLY_STIPPLE, 4);

   /* The pattern is 32x32 and repeats, so only the low 5 bits of each
    * fixed-point gl_FragCoord component are needed.
    */
   nir_def *frag_x = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 0, 5);
   nir_def *frag_y = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 16, 5);

   nir_def *zero = nir_imm_int(b, 0);
   /* Each of the 32 rows is one 32-bit word: byte offset = y * 4. */
   nir_def *row_offset = nir_ishl_imm(b, frag_y, 2);
   nir_def *row_bits = nir_load_buffer_amd(b, 1, 32, desc, row_offset, zero, zero);
   nir_def *stipple_bit = nir_ubfe(b, row_bits, frag_x, nir_imm_int(b, 1));

   /* Kill the fragment when its stipple bit is 0. */
   nir_def *keep = nir_i2b(b, stipple_bit);
   nir_discard_if(b, nir_inot(b, keep));
}
2252 
si_should_clear_lds(struct si_screen * sscreen,const struct nir_shader * shader)2253 bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader)
2254 {
2255    return gl_shader_stage_is_compute(shader->info.stage) &&
2256       shader->info.shared_size > 0 && sscreen->options.clear_lds;
2257 }
2258 
clamp_shadow_comparison_value(nir_builder * b,nir_instr * instr,void * state)2259 static bool clamp_shadow_comparison_value(nir_builder *b, nir_instr *instr, void *state)
2260 {
2261    if (instr->type != nir_instr_type_tex)
2262       return false;
2263 
2264    nir_tex_instr *tex = nir_instr_as_tex(instr);
2265    if (!tex->is_shadow)
2266       return false;
2267 
2268    b->cursor = nir_before_instr(instr);
2269 
2270    int samp_index = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
2271    int comp_index = nir_tex_instr_src_index(tex, nir_tex_src_comparator);
2272    assert(samp_index >= 0 && comp_index >= 0);
2273 
2274    nir_def *sampler = tex->src[samp_index].src.ssa;
2275    nir_def *compare = tex->src[comp_index].src.ssa;
2276    /* Must have been lowered to descriptor. */
2277    assert(sampler->num_components > 1);
2278 
2279    nir_def *upgraded = nir_channel(b, sampler, 3);
2280    upgraded = nir_i2b(b, nir_ubfe_imm(b, upgraded, 29, 1));
2281 
2282    nir_def *clamped = nir_fsat(b, compare);
2283    compare = nir_bcsel(b, upgraded, clamped, compare);
2284 
2285    nir_src_rewrite(&tex->src[comp_index].src, compare);
2286    return true;
2287 }
2288 
si_nir_clamp_shadow_comparison_value(nir_shader * nir)2289 static bool si_nir_clamp_shadow_comparison_value(nir_shader *nir)
2290 {
2291    /* Section 8.23.1 (Depth Texture Comparison Mode) of the
2292     * OpenGL 4.5 spec says:
2293     *
2294     *    "If the texture’s internal format indicates a fixed-point
2295     *     depth texture, then D_t and D_ref are clamped to the
2296     *     range [0, 1]; otherwise no clamping is performed."
2297     *
2298     * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
2299     * so the depth comparison value isn't clamped for Z16 and
2300     * Z24 anymore. Do it manually here for GFX8-9; GFX10 has
2301     * an explicitly clamped 32-bit float format.
2302     */
2303    return nir_shader_instructions_pass(nir, clamp_shadow_comparison_value,
2304                                        nir_metadata_control_flow,
2305                                        NULL);
2306 }
2307 
si_get_nir_shader(struct si_shader * shader,struct si_shader_args * args,bool * free_nir,ac_nir_gs_output_info * output_info)2308 static struct nir_shader *si_get_nir_shader(struct si_shader *shader, struct si_shader_args *args,
2309                                             bool *free_nir, ac_nir_gs_output_info *output_info)
2310 {
2311    struct si_shader_selector *sel = shader->selector;
2312    const union si_shader_key *key = &shader->key;
2313 
2314    nir_shader *nir;
2315    *free_nir = false;
2316 
2317    if (sel->nir) {
2318       nir = sel->nir;
2319    } else if (sel->nir_binary) {
2320       nir = si_deserialize_shader(sel);
2321       *free_nir = true;
2322    } else {
2323       return NULL;
2324    }
2325 
2326    bool progress = false;
2327    bool late_opts = false;
2328 
2329    const char *original_name = NULL;
2330    if (unlikely(should_print_nir(nir))) {
2331       /* Modify the shader's name so that each variant gets its own name. */
2332       original_name = ralloc_strdup(nir, nir->info.name);
2333       ralloc_asprintf_append((char **)&nir->info.name, "-%08x", _mesa_hash_data(key, sizeof(*key)));
2334 
2335       /* Dummy pass to get the starting point. */
2336       printf("nir_dummy_pass\n");
2337       nir_print_shader(nir, stdout);
2338    }
2339 
2340    si_init_shader_args(shader, args, &nir->info);
2341 
2342    /* Kill outputs according to the shader key. */
2343    if (nir->info.stage <= MESA_SHADER_GEOMETRY)
2344       NIR_PASS(progress, nir, si_nir_kill_outputs, key);
2345 
2346    NIR_PASS(progress, nir, ac_nir_lower_tex,
2347             &(ac_nir_lower_tex_options){
2348                .gfx_level = sel->screen->info.gfx_level,
2349                .lower_array_layer_round_even = !sel->screen->info.conformant_trunc_coord,
2350             });
2351 
2352    if (nir->info.uses_resource_info_query)
2353       NIR_PASS(progress, nir, ac_nir_lower_resinfo, sel->screen->info.gfx_level);
2354 
2355    bool inline_uniforms = false;
2356    uint32_t *inlined_uniform_values;
2357    si_get_inline_uniform_state((union si_shader_key*)key, nir->info.stage,
2358                                &inline_uniforms, &inlined_uniform_values);
2359 
2360    if (inline_uniforms) {
2361       assert(*free_nir);
2362 
2363       /* Most places use shader information from the default variant, not
2364        * the optimized variant. These are the things that the driver looks at
2365        * in optimized variants and the list of things that we need to do.
2366        *
2367        * The driver takes into account these things if they suddenly disappear
2368        * from the shader code:
2369        * - Register usage and code size decrease (obvious)
2370        * - Eliminated PS system values are disabled by LLVM
2371        *   (FragCoord, FrontFace, barycentrics)
2372        * - VS/TES/GS param exports are eliminated if they are undef.
2373        *   The param space for eliminated outputs is also not allocated.
2374        * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
2375        * - TCS output stores are eliminated
2376        * - Eliminated PS inputs are removed from PS.NUM_INTERP.
2377        *
2378        * TODO: These are things the driver ignores in the final shader code
2379        * and relies on the default shader info.
2380        * - System values in VS, TCS, TES, GS are not eliminated
2381        * - uses_discard - if it changed to false
2382        * - writes_memory - if it changed to false
2383        * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not
2384        *   eliminated
2385        * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS)
2386        *   GS outputs are eliminated except for the temporary LDS.
2387        *   Clip distances, gl_PointSize, gl_Layer and PS outputs are eliminated based
2388        *   on current states, so we don't care about the shader code.
2389        *
2390        * TODO: Merged shaders don't inline uniforms for the first stage.
2391        * VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS.
2392        * (key == NULL for the first stage here)
2393        *
2394        * TODO: Compute shaders don't support inlinable uniforms, because they
2395        * don't have shader variants.
2396        *
2397        * TODO: The driver uses a linear search to find a shader variant. This
2398        * can be really slow if we get too many variants due to uniform inlining.
2399        */
2400       NIR_PASS_V(nir, nir_inline_uniforms, nir->info.num_inlinable_uniforms,
2401                  inlined_uniform_values, nir->info.inlinable_uniform_dw_offsets);
2402       progress = true;
2403    }
2404 
2405    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
2406       /* This uses the epilog key, so only monolithic shaders can call this. */
2407       if (shader->is_monolithic)
2408          NIR_PASS(progress, nir, si_nir_kill_ps_outputs, key);
2409 
2410       if (key->ps.mono.poly_line_smoothing)
2411          NIR_PASS(progress, nir, nir_lower_poly_line_smooth, SI_NUM_SMOOTH_AA_SAMPLES);
2412 
2413       if (key->ps.mono.point_smoothing)
2414          NIR_PASS(progress, nir, nir_lower_point_smooth, true);
2415 
2416       NIR_PASS(progress, nir, nir_lower_fragcoord_wtrans);
2417    }
2418 
2419    /* This must be before si_nir_lower_resource. */
2420    if (!sel->screen->info.has_image_opcodes)
2421       NIR_PASS(progress, nir, ac_nir_lower_image_opcodes);
2422 
2423    /* LLVM does not work well with this, so is handled in llvm backend waterfall. */
2424    if (nir->info.use_aco_amd && sel->info.has_non_uniform_tex_access) {
2425       nir_lower_non_uniform_access_options options = {
2426          .types = nir_lower_non_uniform_texture_access,
2427       };
2428       NIR_PASS(progress, nir, nir_lower_non_uniform_access, &options);
2429    }
2430 
2431    bool is_last_vgt_stage =
2432       (nir->info.stage == MESA_SHADER_VERTEX ||
2433        nir->info.stage == MESA_SHADER_TESS_EVAL ||
2434        (nir->info.stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)) &&
2435       !shader->key.ge.as_ls && !shader->key.ge.as_es;
2436 
2437    /* Legacy GS is not last VGT stage because it has GS copy shader. */
2438    bool is_legacy_gs = nir->info.stage == MESA_SHADER_GEOMETRY && !key->ge.as_ngg;
2439 
2440    if (is_last_vgt_stage || is_legacy_gs)
2441       NIR_PASS(progress, nir, si_nir_clamp_vertex_color);
2442 
2443    if (progress) {
2444       si_nir_opts(sel->screen, nir, true);
2445       late_opts = true;
2446       progress = false;
2447    }
2448 
2449    /* Lower large variables that are always constant with load_constant intrinsics, which
2450     * get turned into PC-relative loads from a data section next to the shader.
2451     *
2452     * Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
2453     * this should be done after that.
2454     *
2455     * The pass crashes if there are dead temps of lowered IO interface types, so remove
2456     * them first.
2457     */
2458    NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
2459    NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
2460 
2461    /* Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
2462     * this should be done after that.
2463     */
2464    progress |= ac_nir_lower_indirect_derefs(nir, sel->screen->info.gfx_level);
2465 
2466    if (nir->info.stage == MESA_SHADER_VERTEX)
2467       NIR_PASS(progress, nir, si_nir_lower_vs_inputs, shader, args);
2468 
2469    progress |= si_lower_io_to_mem(shader, nir);
2470 
2471    if (is_last_vgt_stage) {
2472       /* Assign param export indices. */
2473       si_assign_param_offsets(nir, shader);
2474 
2475       /* Assign num of position exports. */
2476       shader->info.nr_pos_exports = si_get_nr_pos_exports(sel, key);
2477 
2478       if (key->ge.as_ngg) {
2479          /* Lower last VGT NGG shader stage. */
2480          si_lower_ngg(shader, nir);
2481       } else if (nir->info.stage == MESA_SHADER_VERTEX ||
2482                  nir->info.stage == MESA_SHADER_TESS_EVAL) {
2483          /* Lower last VGT none-NGG VS/TES shader stage. */
2484          unsigned clip_cull_mask =
2485             (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) |
2486             sel->info.culldist_mask;
2487 
2488          NIR_PASS_V(nir, ac_nir_lower_legacy_vs,
2489                     sel->screen->info.gfx_level,
2490                     clip_cull_mask,
2491                     shader->info.vs_output_param_offset,
2492                     shader->info.nr_param_exports,
2493                     shader->key.ge.mono.u.vs_export_prim_id,
2494                     !si_shader_uses_streamout(shader),
2495                     key->ge.opt.kill_pointsize,
2496                     key->ge.opt.kill_layer,
2497                     sel->screen->options.vrs2x2);
2498       }
2499       progress = true;
2500    } else if (is_legacy_gs) {
2501       NIR_PASS_V(nir, ac_nir_lower_legacy_gs, false, sel->screen->use_ngg, output_info);
2502       progress = true;
2503    } else if (nir->info.stage == MESA_SHADER_FRAGMENT && shader->is_monolithic) {
2504       /* Uniform inlining can eliminate PS inputs, and colormask can remove PS outputs,
2505        * which can also cause the elimination of PS inputs. Remove holes after removed PS inputs
2506        * by renumbering them. This can only happen with monolithic PS. Colors are unaffected
2507        * because they are still represented by nir_intrinsic_load_color0/1.
2508        */
2509       NIR_PASS_V(nir, nir_recompute_io_bases, nir_var_shader_in);
2510 
2511       /* Two-side color selection and interpolation: Get the latest shader info because
2512        * uniform inlining and colormask can fully eliminate color inputs.
2513        */
2514       struct si_shader_info info;
2515       si_nir_scan_shader(sel->screen, nir, &info);
2516 
2517       if (info.colors_read)
2518          NIR_PASS(progress, nir, si_nir_lower_ps_color_input, &shader->key, &info);
2519 
2520       /* We need to set this early for lowering nir_intrinsic_load_point_coord_maybe_flipped,
2521        * which can only occur with monolithic PS.
2522        */
2523       shader->info.num_ps_inputs = info.num_inputs;
2524       shader->info.ps_colors_read = info.colors_read;
2525 
2526       ac_nir_lower_ps_early_options early_options = {
2527          .force_persp_sample_interp = key->ps.part.prolog.force_persp_sample_interp,
2528          .force_linear_sample_interp = key->ps.part.prolog.force_linear_sample_interp,
2529          .force_persp_center_interp = key->ps.part.prolog.force_persp_center_interp,
2530          .force_linear_center_interp = key->ps.part.prolog.force_linear_center_interp,
2531          .ps_iter_samples = 1 << key->ps.part.prolog.samplemask_log_ps_iter,
2532 
2533          .clamp_color = key->ps.part.epilog.clamp_color,
2534          .alpha_test_alpha_to_one = key->ps.part.epilog.alpha_to_one,
2535          .alpha_func = key->ps.part.epilog.alpha_func,
2536          .keep_alpha_for_mrtz = key->ps.part.epilog.alpha_to_coverage_via_mrtz,
2537          .spi_shader_col_format_hint = key->ps.part.epilog.spi_shader_col_format,
2538          .kill_z = key->ps.part.epilog.kill_z,
2539          .kill_stencil = key->ps.part.epilog.kill_stencil,
2540          .kill_samplemask = key->ps.part.epilog.kill_samplemask,
2541       };
2542 
2543       NIR_PASS_V(nir, ac_nir_lower_ps_early, &early_options);
2544 
2545       ac_nir_lower_ps_late_options late_options = {
2546          .gfx_level = sel->screen->info.gfx_level,
2547          .family = sel->screen->info.family,
2548          .use_aco = nir->info.use_aco_amd,
2549          .bc_optimize_for_persp = key->ps.part.prolog.bc_optimize_for_persp,
2550          .bc_optimize_for_linear = key->ps.part.prolog.bc_optimize_for_linear,
2551          .uses_discard = si_shader_uses_discard(shader),
2552          .alpha_to_coverage_via_mrtz = key->ps.part.epilog.alpha_to_coverage_via_mrtz,
2553          .dual_src_blend_swizzle = key->ps.part.epilog.dual_src_blend_swizzle,
2554          .spi_shader_col_format = key->ps.part.epilog.spi_shader_col_format,
2555          .color_is_int8 = key->ps.part.epilog.color_is_int8,
2556          .color_is_int10 = key->ps.part.epilog.color_is_int10,
2557          .alpha_to_one = key->ps.part.epilog.alpha_to_one,
2558       };
2559 
2560       NIR_PASS_V(nir, ac_nir_lower_ps_late, &late_options);
2561 
2562       if (key->ps.part.prolog.poly_stipple)
2563          NIR_PASS_V(nir, si_nir_emit_polygon_stipple, args);
2564 
2565       progress = true;
2566    }
2567 
2568    assert(shader->wave_size == 32 || shader->wave_size == 64);
2569 
2570    NIR_PASS(progress, nir, nir_lower_subgroups,
2571             &(struct nir_lower_subgroups_options) {
2572                .subgroup_size = shader->wave_size,
2573                .ballot_bit_size = shader->wave_size,
2574                .ballot_components = 1,
2575                .lower_to_scalar = true,
2576                .lower_subgroup_masks = true,
2577                .lower_relative_shuffle = true,
2578                .lower_rotate_to_shuffle = !nir->info.use_aco_amd,
2579                .lower_shuffle_to_32bit = true,
2580                .lower_vote_eq = true,
2581                .lower_vote_bool_eq = true,
2582                .lower_quad_broadcast_dynamic = true,
2583                .lower_quad_broadcast_dynamic_to_const = sel->screen->info.gfx_level <= GFX7,
2584                .lower_shuffle_to_swizzle_amd = true,
2585                .lower_ballot_bit_count_to_mbcnt_amd = true,
2586                .lower_inverse_ballot = !nir->info.use_aco_amd && LLVM_VERSION_MAJOR < 17,
2587                .lower_boolean_reduce = nir->info.use_aco_amd,
2588                .lower_boolean_shuffle = true,
2589             });
2590 
2591    NIR_PASS(progress, nir, nir_lower_pack);
2592    NIR_PASS(progress, nir, nir_opt_idiv_const, 8);
2593    NIR_PASS(progress, nir, nir_lower_idiv,
2594             &(nir_lower_idiv_options){
2595                .allow_fp16 = sel->screen->info.gfx_level >= GFX9,
2596             });
2597 
2598    if (si_should_clear_lds(sel->screen, nir)) {
2599       const unsigned chunk_size = 16; /* max single store size */
2600       const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
2601       NIR_PASS_V(nir, nir_clear_shared_memory, shared_size, chunk_size);
2602    }
2603 
2604    NIR_PASS_V(nir, nir_divergence_analysis); /* required by ac_nir_flag_smem_for_loads */
2605    NIR_PASS(progress, nir, ac_nir_flag_smem_for_loads, sel->screen->info.gfx_level,
2606             !sel->info.base.use_aco_amd, true);
2607    NIR_PASS(progress, nir, nir_lower_io_to_scalar,
2608             nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_shared | nir_var_mem_global,
2609             ac_nir_scalarize_overfetching_loads_callback, &sel->screen->info.gfx_level);
2610    NIR_PASS(progress, nir, si_nir_lower_resource, shader, args);
2611 
2612    if (progress) {
2613       si_nir_opts(sel->screen, nir, false);
2614       progress = false;
2615       late_opts = true;
2616    }
2617 
2618    NIR_PASS(progress, nir, nir_opt_load_store_vectorize,
2619             &(nir_load_store_vectorize_options){
2620                .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_shared | nir_var_mem_global |
2621                         nir_var_shader_temp,
2622                .callback = ac_nir_mem_vectorize_callback,
2623                .cb_data = &(struct ac_nir_config){sel->screen->info.gfx_level, sel->info.base.use_aco_amd},
2624                /* On GFX6, read2/write2 is out-of-bounds if the offset register is negative, even if
2625                 * the final offset is not.
2626                 */
2627                .has_shared2_amd = sel->screen->info.gfx_level >= GFX7,
2628             });
2629    NIR_PASS(progress, nir, nir_opt_shrink_stores, false);
2630 
2631    nir_divergence_analysis(nir);
2632    NIR_PASS(progress, nir, ac_nir_flag_smem_for_loads, sel->screen->info.gfx_level,
2633             !nir->info.use_aco_amd, false);
2634    NIR_PASS(progress, nir, ac_nir_lower_mem_access_bit_sizes,
2635             sel->screen->info.gfx_level, !nir->info.use_aco_amd);
2636 
2637    if (nir->info.stage == MESA_SHADER_KERNEL) {
2638       NIR_PASS(progress, nir, ac_nir_lower_global_access);
2639 
2640       if (nir->info.bit_sizes_int & (8 | 16)) {
2641          if (sel->screen->info.gfx_level >= GFX8)
2642             nir_divergence_analysis(nir);
2643 
2644          NIR_PASS(progress, nir, nir_lower_bit_size, ac_nir_lower_bit_size_callback,
2645                   &sel->screen->info.gfx_level);
2646       }
2647    }
2648 
2649    /* This must be after lowering resources to descriptor loads and before lowering intrinsics
2650     * to args and lowering int64.
2651     */
2652    if (nir->info.use_aco_amd)
2653       progress |= ac_nir_optimize_uniform_atomics(nir);
2654 
2655    NIR_PASS(progress, nir, nir_lower_int64);
2656    NIR_PASS(progress, nir, si_nir_lower_abi, shader, args);
2657    NIR_PASS(progress, nir, ac_nir_lower_intrinsics_to_args, sel->screen->info.gfx_level,
2658             sel->screen->info.has_ls_vgpr_init_bug,
2659             si_select_hw_stage(nir->info.stage, key, sel->screen->info.gfx_level),
2660             shader->wave_size, si_get_max_workgroup_size(shader), &args->ac);
2661 
2662    /* LLVM keep non-uniform sampler as index, so can't do this in NIR.
2663     * Must be done after si_nir_lower_resource().
2664     */
2665    if (nir->info.use_aco_amd && sel->info.has_shadow_comparison &&
2666        sel->screen->info.gfx_level >= GFX8 && sel->screen->info.gfx_level <= GFX9) {
2667       NIR_PASS(progress, nir, si_nir_clamp_shadow_comparison_value);
2668    }
2669 
2670    if (progress) {
2671       si_nir_opts(sel->screen, nir, false);
2672       progress = false;
2673       late_opts = true;
2674    }
2675 
2676    static const nir_opt_offsets_options offset_options = {
2677       .uniform_max = 0,
2678       .buffer_max = ~0,
2679       .shared_max = ~0,
2680    };
2681    NIR_PASS_V(nir, nir_opt_offsets, &offset_options);
2682 
2683    if (late_opts)
2684       si_nir_late_opts(nir);
2685 
2686    NIR_PASS(progress, nir, nir_opt_sink,
2687             nir_move_const_undef | nir_move_copies | nir_move_alu | nir_move_comparisons |
2688             nir_move_load_ubo | nir_move_load_ssbo);
2689    NIR_PASS(progress, nir, nir_opt_move,
2690             nir_move_const_undef | nir_move_copies | nir_move_alu | nir_move_comparisons |
2691             nir_move_load_ubo);
2692    /* Run nir_opt_move again to make sure that comparisons are as close as possible to the first
2693     * use to prevent SCC spilling.
2694     */
2695    NIR_PASS(progress, nir, nir_opt_move, nir_move_comparisons);
2696 
2697    /* This must be done after si_nir_late_opts() because it may generate vec const. */
2698    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
2699 
2700    /* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
2701     * 200 is tuned for Viewperf. It should be done last.
2702     */
2703    NIR_PASS_V(nir, nir_group_loads, nir_group_same_resource_only, 200);
2704 
2705    if (unlikely(original_name)) {
2706       ralloc_free((void*)nir->info.name);
2707       nir->info.name = original_name;
2708    }
2709 
2710    return nir;
2711 }
2712 
/* Re-scan the final NIR and fold the gathered usage info into shader->info. */
void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir)
{
   struct si_shader_info scan;

   si_nir_scan_shader(shader->selector->screen, nir, &scan);

   /* Accumulate VMEM usage flags found in the scanned NIR. */
   shader->info.uses_vmem_load_other |= scan.uses_vmem_load_other;
   shader->info.uses_vmem_sampler_or_bvh |= scan.uses_vmem_sampler_or_bvh;

   if (nir->info.stage != MESA_SHADER_FRAGMENT)
      return;

   /* Since uniform inlining can remove PS inputs, set the latest info about PS inputs here. */
   shader->info.num_ps_inputs = scan.num_inputs;
   shader->info.ps_colors_read = scan.colors_read;

   /* A non-monolithic PS doesn't know if back colors are enabled, so copy 2 more. */
   const unsigned num_copied = MIN2(scan.num_inputs + 2, SI_NUM_INTERP);
   memcpy(shader->info.ps_inputs, scan.input, num_copied * sizeof(scan.input[0]));
}
2731 
/* Generate code for the hardware VS shader stage to go with a geometry shader.
 *
 * On pre-NGG (legacy) GS, the GS writes to a ring buffer and a separate
 * "GS copy shader" runs as a hardware VS to read stream-0 outputs back and
 * export them. This builds, compiles and uploads that copy shader.
 *
 * Returns a newly-allocated si_shader on success, NULL on failure
 * (allocation or compile/upload error). The caller owns the result.
 */
static struct si_shader *
si_nir_generate_gs_copy_shader(struct si_screen *sscreen,
                               struct ac_llvm_compiler *compiler,
                               struct si_shader *gs_shader,
                               nir_shader *gs_nir,
                               struct util_debug_callback *debug,
                               ac_nir_gs_output_info *output_info)
{
   struct si_shader *shader;
   struct si_shader_selector *gs_selector = gs_shader->selector;
   struct si_shader_info *gsinfo = &gs_selector->info;
   union si_shader_key *gskey = &gs_shader->key;

   shader = CALLOC_STRUCT(si_shader);
   if (!shader)
      return NULL;

   /* We can leave the fence as permanently signaled because the GS copy
    * shader only becomes visible globally after it has been compiled. */
   util_queue_fence_init(&shader->ready);

   shader->selector = gs_selector;
   shader->is_gs_copy_shader = true;
   shader->wave_size = si_determine_wave_size(sscreen, shader);

   /* memset with a non-zero byte value below relies on 1-byte elements. */
   STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
          sizeof(shader->info.vs_output_param_offset));

   /* Assign consecutive parameter-export offsets to the varyings the copy
    * shader will export; everything else keeps the DEFAULT_VAL marker set above.
    */
   for (unsigned i = 0; i < gsinfo->num_outputs; i++) {
      unsigned semantic = gsinfo->output_semantic[i];

      /* Skip if no channel writes to stream 0.
       * (Each 2-bit field of output_streams[i] appears to hold the stream index
       * of one component, so all four fields being non-zero means no component
       * targets stream 0 — TODO confirm against si_shader_info.)
       */
      if (!nir_slot_is_varying(semantic, MESA_SHADER_FRAGMENT) ||
          (gsinfo->output_streams[i] & 0x03 &&
           gsinfo->output_streams[i] & 0x0c &&
           gsinfo->output_streams[i] & 0x30 &&
           gsinfo->output_streams[i] & 0xc0))
         continue;

      shader->info.vs_output_param_offset[semantic] = shader->info.nr_param_exports++;
   }

   shader->info.nr_pos_exports = si_get_nr_pos_exports(gs_selector, gskey);

   /* Clip distances the key kills are dropped; cull distances are always kept. */
   unsigned clip_cull_mask =
      (gsinfo->clipdist_mask & ~gskey->ge.opt.kill_clip_distances) | gsinfo->culldist_mask;

   nir_shader *nir =
      ac_nir_create_gs_copy_shader(gs_nir,
                                   sscreen->info.gfx_level,
                                   clip_cull_mask,
                                   shader->info.vs_output_param_offset,
                                   shader->info.nr_param_exports,
                                   !si_shader_uses_streamout(gs_shader),
                                   gskey->ge.opt.kill_pointsize,
                                   gskey->ge.opt.kill_layer,
                                   sscreen->options.vrs2x2,
                                   output_info);

   struct si_shader_args args;
   si_init_shader_args(shader, &args, &gs_nir->info);

   /* Lower driver ABI intrinsics, then map the remaining intrinsics to the
    * hardware VS argument layout (wave size 64, workgroup size 64).
    */
   NIR_PASS_V(nir, si_nir_lower_abi, shader, &args);
   NIR_PASS_V(nir, ac_nir_lower_intrinsics_to_args, sscreen->info.gfx_level,
              sscreen->info.has_ls_vgpr_init_bug, AC_HW_VERTEX_SHADER, 64, 64, &args.ac);

   si_nir_opts(gs_selector->screen, nir, false);

   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);

   if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY, SI_DUMP_NIR)) {
      fprintf(stderr, "GS Copy Shader:\n");
      nir_print_shader(nir, stderr);
   }

   /* Compile with LLVM unless the GS was compiled with ACO (or LLVM is absent). */
   bool ok =
#if AMD_LLVM_AVAILABLE
      !gs_nir->info.use_aco_amd ? si_llvm_compile_shader(sscreen, compiler, shader,
                                                                   &args, debug, nir) :
#endif
      si_aco_compile_shader(shader, &args, nir, debug);

   if (ok) {
      assert(!shader->config.scratch_bytes_per_wave);
      ok = si_shader_binary_upload(sscreen, shader, 0) >= 0;
      si_shader_dump(sscreen, shader, debug, stderr, true);
   }
   /* The NIR is only needed for compilation; free it regardless of outcome. */
   ralloc_free(nir);

   if (!ok) {
      FREE(shader);
      shader = NULL;
   } else {
      si_fix_resource_usage(sscreen, shader);
   }
   return shader;
}
2831 
/* Backing storage for ac_nir_gs_output_info, which only holds pointers.
 * Filled by si_init_gs_output_info(); the embedded `info` member points
 * into the arrays below. Slots < VARYING_SLOT_VAR0_16BIT index the 64-entry
 * arrays directly; 16-bit slots index the 16-entry arrays relative to
 * VARYING_SLOT_VAR0_16BIT.
 */
struct si_gs_output_info {
   /* Per-slot stream byte copied from si_shader_info::output_streams. */
   uint8_t streams[64];
   uint8_t streams_16bit_lo[16];
   uint8_t streams_16bit_hi[16];

   /* Per-slot component usage mask copied from si_shader_info::output_usagemask. */
   uint8_t usage_mask[64];
   uint8_t usage_mask_16bit_lo[16];
   uint8_t usage_mask_16bit_hi[16];

   /* The ac_nir-facing view; its pointers reference the arrays above. */
   ac_nir_gs_output_info info;
};
2843 
2844 static void
si_init_gs_output_info(struct si_shader_info * info,struct si_gs_output_info * out_info)2845 si_init_gs_output_info(struct si_shader_info *info, struct si_gs_output_info *out_info)
2846 {
2847    for (int i = 0; i < info->num_outputs; i++) {
2848       unsigned slot = info->output_semantic[i];
2849       if (slot < VARYING_SLOT_VAR0_16BIT) {
2850          out_info->streams[slot] = info->output_streams[i];
2851          out_info->usage_mask[slot] = info->output_usagemask[i];
2852       } else {
2853          unsigned index = slot - VARYING_SLOT_VAR0_16BIT;
2854          /* TODO: 16bit need separated fields for lo/hi part. */
2855          out_info->streams_16bit_lo[index] = info->output_streams[i];
2856          out_info->streams_16bit_hi[index] = info->output_streams[i];
2857          out_info->usage_mask_16bit_lo[index] = info->output_usagemask[i];
2858          out_info->usage_mask_16bit_hi[index] = info->output_usagemask[i];
2859       }
2860    }
2861 
2862    ac_nir_gs_output_info *ac_info = &out_info->info;
2863 
2864    ac_info->streams = out_info->streams;
2865    ac_info->streams_16bit_lo = out_info->streams_16bit_lo;
2866    ac_info->streams_16bit_hi = out_info->streams_16bit_hi;
2867 
2868    ac_info->sysval_mask = out_info->usage_mask;
2869    ac_info->varying_mask = out_info->usage_mask;
2870    ac_info->varying_mask_16bit_lo = out_info->usage_mask_16bit_lo;
2871    ac_info->varying_mask_16bit_hi = out_info->usage_mask_16bit_hi;
2872 
2873    /* TODO: construct 16bit slot per component store type. */
2874    ac_info->types_16bit_lo = ac_info->types_16bit_hi = NULL;
2875 }
2876 
si_fixup_spi_ps_input_config(struct si_shader * shader)2877 static void si_fixup_spi_ps_input_config(struct si_shader *shader)
2878 {
2879    const union si_shader_key *key = &shader->key;
2880 
2881    /* Enable POS_FIXED_PT if polygon stippling is enabled. */
2882    if (key->ps.part.prolog.poly_stipple)
2883       shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
2884 
2885    /* Set up the enable bits for per-sample shading if needed. */
2886    if (key->ps.part.prolog.force_persp_sample_interp &&
2887        (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2888         G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2889       shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
2890       shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2891       shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
2892    }
2893    if (key->ps.part.prolog.force_linear_sample_interp &&
2894        (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2895         G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2896       shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
2897       shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2898       shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
2899    }
2900    if (key->ps.part.prolog.force_persp_center_interp &&
2901        (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2902         G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2903       shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
2904       shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2905       shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2906    }
2907    if (key->ps.part.prolog.force_linear_center_interp &&
2908        (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2909         G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2910       shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
2911       shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2912       shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2913    }
2914 
2915    /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
2916    if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
2917        !(shader->config.spi_ps_input_ena & 0xf)) {
2918       shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2919    }
2920 
2921    /* At least one pair of interpolation weights must be enabled. */
2922    if (!(shader->config.spi_ps_input_ena & 0x7f))
2923       shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2924 
2925    /* Samplemask fixup requires the sample ID. */
2926    if (key->ps.part.prolog.samplemask_log_ps_iter)
2927       shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
2928 }
2929 
/* Compute the initial SPI_PS_INPUT_ENA/ADDR configuration for a pixel shader
 * from the selector info and the shader key. For monolithic shaders the result
 * is final (after si_fixup_spi_ps_input_config); for part-mode shaders, ADDR
 * additionally reserves slots the prolog may need.
 */
static void
si_set_spi_ps_input_config(struct si_shader *shader)
{
   const struct si_shader_selector *sel = shader->selector;
   const struct si_shader_info *info = &sel->info;
   const union si_shader_key *key = &shader->key;

   /* TODO: This should be determined from the final NIR instead of the input NIR,
    * otherwise LLVM will have a performance advantage here because it determines
    * VGPR inputs for each shader variant after LLVM optimizations.
    */
   shader->config.spi_ps_input_ena =
      S_0286CC_PERSP_CENTER_ENA(info->uses_persp_center) |
      S_0286CC_PERSP_CENTROID_ENA(info->uses_persp_centroid) |
      S_0286CC_PERSP_SAMPLE_ENA(info->uses_persp_sample) |
      S_0286CC_LINEAR_CENTER_ENA(info->uses_linear_center) |
      S_0286CC_LINEAR_CENTROID_ENA(info->uses_linear_centroid) |
      S_0286CC_LINEAR_SAMPLE_ENA(info->uses_linear_sample) |
      S_0286CC_FRONT_FACE_ENA(info->uses_frontface && !key->ps.opt.force_front_face_input) |
      S_0286CC_SAMPLE_COVERAGE_ENA(info->reads_samplemask) |
      S_0286CC_ANCILLARY_ENA(info->uses_sampleid || info->uses_layer_id);

   /* Enable one POS_*_FLOAT field per frag-coord/sample-pos component read. */
   uint8_t mask = info->reads_frag_coord_mask | info->reads_sample_pos_mask;
   u_foreach_bit(i, mask) {
      shader->config.spi_ps_input_ena |= S_0286CC_POS_X_FLOAT_ENA(1) << i;
   }

   /* Two-sided color selection in the prolog needs the facedness bit. */
   if (key->ps.part.prolog.color_two_side)
      shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);

   /* INTERP_MODE_COLOR, same as SMOOTH if flat shading is disabled. */
   if (info->uses_interp_color && !key->ps.part.prolog.flatshade_colors) {
      shader->config.spi_ps_input_ena |=
         S_0286CC_PERSP_SAMPLE_ENA(info->uses_persp_sample_color) |
         S_0286CC_PERSP_CENTER_ENA(info->uses_persp_center_color) |
         S_0286CC_PERSP_CENTROID_ENA(info->uses_persp_centroid_color);
   }

   /* nir_lower_poly_line_smooth use nir_load_sample_mask_in */
   if (key->ps.mono.poly_line_smoothing)
      shader->config.spi_ps_input_ena |= S_0286CC_SAMPLE_COVERAGE_ENA(1);

   /* nir_lower_point_smooth use nir_load_point_coord_maybe_flipped which is lowered
    * to nir_load_barycentric_pixel and nir_load_interpolated_input.
    */
   if (key->ps.mono.point_smoothing)
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);

   /* See fetch_framebuffer() for used args when fbfetch output. */
   if (info->base.fs.uses_fbfetch_output) {
      shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);

      if (key->ps.mono.fbfetch_layered || key->ps.mono.fbfetch_msaa)
         shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
   }

   if (shader->is_monolithic) {
      si_fixup_spi_ps_input_config(shader);
      shader->config.spi_ps_input_addr = shader->config.spi_ps_input_ena;
   } else {
      /* Part mode will call si_fixup_spi_ps_input_config() when combining multi
       * shader part in si_shader_select_ps_parts().
       *
       * Reserve register locations for VGPR inputs the PS prolog may need.
       */
      shader->config.spi_ps_input_addr =
         shader->config.spi_ps_input_ena |
         SI_SPI_PS_INPUT_ADDR_FOR_PROLOG;
   }
}
3000 
3001 static void
debug_message_stderr(void * data,unsigned * id,enum util_debug_type ptype,const char * fmt,va_list args)3002 debug_message_stderr(void *data, unsigned *id, enum util_debug_type ptype,
3003                       const char *fmt, va_list args)
3004 {
3005    vfprintf(stderr, fmt, args);
3006    fprintf(stderr, "\n");
3007 }
3008 
3009 static void
determine_shader_variant_info(struct si_screen * sscreen,struct si_shader * shader)3010 determine_shader_variant_info(struct si_screen *sscreen, struct si_shader *shader)
3011 {
3012    struct si_shader_selector *sel = shader->selector;
3013 
3014    if (sel->stage == MESA_SHADER_FRAGMENT) {
3015       shader->ps.writes_z = sel->info.writes_z && !shader->key.ps.part.epilog.kill_z;
3016       shader->ps.writes_stencil = sel->info.writes_stencil &&
3017                                   !shader->key.ps.part.epilog.kill_stencil;
3018       shader->ps.writes_samplemask = sel->info.writes_samplemask &&
3019                                      !shader->key.ps.part.epilog.kill_samplemask;
3020    }
3021 }
3022 
si_compile_shader(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3023 bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3024                        struct si_shader *shader, struct util_debug_callback *debug)
3025 {
3026    bool ret = true;
3027    struct si_shader_selector *sel = shader->selector;
3028 
3029    determine_shader_variant_info(sscreen, shader);
3030 
3031    /* ACO need spi_ps_input in advance to init args and used in compiler. */
3032    if (sel->stage == MESA_SHADER_FRAGMENT && sel->info.base.use_aco_amd)
3033       si_set_spi_ps_input_config(shader);
3034 
3035    /* We need this info only when legacy GS. */
3036    struct si_gs_output_info legacy_gs_output_info;
3037    if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
3038       memset(&legacy_gs_output_info, 0, sizeof(legacy_gs_output_info));
3039       si_init_gs_output_info(&sel->info, &legacy_gs_output_info);
3040    }
3041 
3042    struct si_shader_args args;
3043    bool free_nir;
3044    struct nir_shader *nir =
3045       si_get_nir_shader(shader, &args, &free_nir, &legacy_gs_output_info.info);
3046 
3047    /* Dump NIR before doing NIR->LLVM conversion in case the
3048     * conversion fails. */
3049    if (si_can_dump_shader(sscreen, nir->info.stage, SI_DUMP_NIR)) {
3050       nir_print_shader(nir, stderr);
3051 
3052       if (nir->xfb_info)
3053          nir_print_xfb_info(nir->xfb_info, stderr);
3054    }
3055 
3056    /* Initialize vs_output_ps_input_cntl to default. */
3057    for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++)
3058       shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED;
3059    shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0;
3060 
3061    si_update_shader_binary_info(shader, nir);
3062 
3063    /* uses_instanceid may be set by si_nir_lower_vs_inputs(). */
3064    shader->info.uses_instanceid |= sel->info.uses_instanceid;
3065    shader->info.private_mem_vgprs = DIV_ROUND_UP(nir->scratch_size, 4);
3066 
3067    /* Set the FP ALU behavior. */
3068    /* By default, we disable denormals for FP32 and enable them for FP16 and FP64
3069     * for performance and correctness reasons. FP32 denormals can't be enabled because
3070     * they break output modifiers and v_mad_f32 and are very slow on GFX6-7.
3071     *
3072     * float_controls_execution_mode defines the set of valid behaviors. Contradicting flags
3073     * can be set simultaneously, which means we are allowed to choose, but not really because
3074     * some options cause GLCTS failures.
3075     */
3076    unsigned float_mode = V_00B028_FP_16_64_DENORMS;
3077 
3078    if (!(nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) &&
3079        nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
3080       float_mode |= V_00B028_FP_32_ROUND_TOWARDS_ZERO;
3081 
3082    if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
3083                                                     FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64)) &&
3084        nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
3085                                                   FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
3086       float_mode |= V_00B028_FP_16_64_ROUND_TOWARDS_ZERO;
3087 
3088    if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_PRESERVE_FP16 |
3089                                                     FLOAT_CONTROLS_DENORM_PRESERVE_FP64)) &&
3090        nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
3091                                                   FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64))
3092       float_mode &= ~V_00B028_FP_16_64_DENORMS;
3093 
3094    ret =
3095 #if AMD_LLVM_AVAILABLE
3096       !nir->info.use_aco_amd ? si_llvm_compile_shader(sscreen, compiler, shader, &args,
3097                                                       debug, nir) :
3098 #endif
3099       si_aco_compile_shader(shader, &args, nir, debug);
3100 
3101    if (!ret)
3102       goto out;
3103 
3104    shader->config.float_mode = float_mode;
3105 
3106    /* The GS copy shader is compiled next. */
3107    if (nir->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
3108       shader->gs_copy_shader =
3109          si_nir_generate_gs_copy_shader(sscreen, compiler, shader, nir, debug,
3110                                         &legacy_gs_output_info.info);
3111       if (!shader->gs_copy_shader) {
3112          fprintf(stderr, "radeonsi: can't create GS copy shader\n");
3113          ret = false;
3114          goto out;
3115       }
3116    }
3117 
3118    /* Compute vs_output_ps_input_cntl. */
3119    if ((nir->info.stage == MESA_SHADER_VERTEX ||
3120         nir->info.stage == MESA_SHADER_TESS_EVAL ||
3121         nir->info.stage == MESA_SHADER_GEOMETRY) &&
3122        !shader->key.ge.as_ls && !shader->key.ge.as_es) {
3123       uint8_t *vs_output_param_offset = shader->info.vs_output_param_offset;
3124 
3125       if (nir->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg)
3126          vs_output_param_offset = shader->gs_copy_shader->info.vs_output_param_offset;
3127 
3128       /* We must use the original shader info before the removal of duplicated shader outputs. */
3129       /* VS and TES should also set primitive ID output if it's used. */
3130       unsigned num_outputs_with_prim_id = sel->info.num_outputs +
3131                                           shader->key.ge.mono.u.vs_export_prim_id;
3132 
3133       for (unsigned i = 0; i < num_outputs_with_prim_id; i++) {
3134          unsigned semantic = sel->info.output_semantic[i];
3135          unsigned offset = vs_output_param_offset[semantic];
3136          unsigned ps_input_cntl;
3137 
3138          if (offset <= AC_EXP_PARAM_OFFSET_31) {
3139             /* The input is loaded from parameter memory. */
3140             ps_input_cntl = S_028644_OFFSET(offset);
3141          } else {
3142             /* The input is a DEFAULT_VAL constant. */
3143             assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
3144                    offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
3145             offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
3146 
3147             /* OFFSET=0x20 means that DEFAULT_VAL is used. */
3148             ps_input_cntl = S_028644_OFFSET(0x20) |
3149                             S_028644_DEFAULT_VAL(offset);
3150          }
3151 
3152          shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl;
3153       }
3154    }
3155 
3156    /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */
3157    if (gl_shader_stage_is_compute(nir->info.stage)) {
3158       unsigned max_vgprs =
3159          sscreen->info.num_physical_wave64_vgprs_per_simd * (shader->wave_size == 32 ? 2 : 1);
3160       unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
3161       unsigned max_sgprs_per_wave = 128;
3162       unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
3163       unsigned threads_per_tg = si_get_max_workgroup_size(shader);
3164       unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, shader->wave_size);
3165       unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);
3166 
3167       max_vgprs = max_vgprs / waves_per_simd;
3168       max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);
3169 
3170       if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
3171          fprintf(stderr,
3172                  "LLVM failed to compile a shader correctly: "
3173                  "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
3174                  shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);
3175 
3176          /* Just terminate the process, because dependent
3177           * shaders can hang due to bad input data, but use
3178           * the env var to allow shader-db to work.
3179           */
3180          if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
3181             abort();
3182       }
3183    }
3184 
3185    /* Add/remove the scratch offset to/from input SGPRs. */
3186    if (!sel->screen->info.has_scratch_base_registers &&
3187        !si_is_merged_shader(shader)) {
3188       if (nir->info.use_aco_amd) {
3189          /* When aco scratch_offset arg is added explicitly at the beginning.
3190           * After compile if no scratch used, reduce the input sgpr count.
3191           */
3192          if (!shader->config.scratch_bytes_per_wave)
3193             shader->info.num_input_sgprs--;
3194       } else {
3195          /* scratch_offset arg is added by llvm implicitly */
3196          if (shader->info.num_input_sgprs)
3197             shader->info.num_input_sgprs++;
3198       }
3199    }
3200 
3201    /* Calculate the number of fragment input VGPRs. */
3202    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
3203       shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(
3204          &shader->config, &shader->info.num_fragcoord_components);
3205    }
3206 
3207    si_calculate_max_simd_waves(shader);
3208 
3209    if (si_can_dump_shader(sscreen, nir->info.stage, SI_DUMP_STATS)) {
3210       struct util_debug_callback out_stderr = {
3211          .debug_message = debug_message_stderr,
3212       };
3213 
3214       si_shader_dump_stats_for_shader_db(sscreen, shader, &out_stderr);
3215    } else {
3216       si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
3217    }
3218 
3219 out:
3220    if (free_nir)
3221       ralloc_free(nir);
3222 
3223    return ret;
3224 }
3225 
3226 /**
3227  * Create, compile and return a shader part (prolog or epilog).
3228  *
3229  * \param sscreen  screen
3230  * \param list     list of shader parts of the same category
3231  * \param type     shader type
3232  * \param key      shader part key
3233  * \param prolog   whether the part being requested is a prolog
3234  * \param tm       LLVM target machine
3235  * \param debug    debug callback
3236  * \return         non-NULL on success
3237  */
3238 static struct si_shader_part *
si_get_shader_part(struct si_screen * sscreen,struct si_shader_part ** list,gl_shader_stage stage,bool prolog,union si_shader_part_key * key,struct ac_llvm_compiler * compiler,struct util_debug_callback * debug,const char * name)3239 si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
3240                    gl_shader_stage stage, bool prolog, union si_shader_part_key *key,
3241                    struct ac_llvm_compiler *compiler, struct util_debug_callback *debug,
3242                    const char *name)
3243 {
3244    struct si_shader_part *result;
3245 
3246    simple_mtx_lock(&sscreen->shader_parts_mutex);
3247 
3248    /* Find existing. */
3249    for (result = *list; result; result = result->next) {
3250       if (memcmp(&result->key, key, sizeof(*key)) == 0) {
3251          simple_mtx_unlock(&sscreen->shader_parts_mutex);
3252          return result;
3253       }
3254    }
3255 
3256    /* Compile a new one. */
3257    result = CALLOC_STRUCT(si_shader_part);
3258    result->key = *key;
3259 
3260    bool ok =
3261 #if AMD_LLVM_AVAILABLE
3262       !(sscreen->use_aco ||
3263         (stage == MESA_SHADER_FRAGMENT &&
3264          ((prolog && key->ps_prolog.use_aco) ||
3265           (!prolog && key->ps_epilog.use_aco)))) ?
3266       si_llvm_build_shader_part(sscreen, stage, prolog, compiler, debug, name, result) :
3267 #endif
3268       si_aco_build_shader_part(sscreen, stage, prolog, debug, name, result);
3269 
3270    if (ok) {
3271       result->next = *list;
3272       *list = result;
3273    } else {
3274       FREE(result);
3275       result = NULL;
3276    }
3277 
3278    simple_mtx_unlock(&sscreen->shader_parts_mutex);
3279    return result;
3280 }
3281 
3282 
3283 /**
3284  * Select and compile (or reuse) TCS parts (epilog).
3285  */
si_shader_select_tcs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3286 static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3287                                        struct si_shader *shader, struct util_debug_callback *debug)
3288 {
3289    if (sscreen->info.gfx_level >= GFX9) {
3290       assert(shader->wave_size == 32 || shader->wave_size == 64);
3291       unsigned index = shader->wave_size / 32 - 1;
3292       shader->previous_stage = shader->key.ge.part.tcs.ls->main_shader_part_ls[index];
3293    }
3294 
3295    return true;
3296 }
3297 
3298 /**
3299  * Select and compile (or reuse) GS parts (prolog).
3300  */
si_shader_select_gs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3301 static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3302                                       struct si_shader *shader, struct util_debug_callback *debug)
3303 {
3304    if (sscreen->info.gfx_level >= GFX9) {
3305       if (shader->key.ge.as_ngg) {
3306          assert(shader->wave_size == 32 || shader->wave_size == 64);
3307          unsigned index = shader->wave_size / 32 - 1;
3308          shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_ngg_es[index];
3309       } else {
3310          shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_es;
3311       }
3312    }
3313 
3314    return true;
3315 }
3316 
/**
 * Compute the PS prolog key, which contains all the information needed to
 * build the PS prolog function, and set related bits in shader->config.
 *
 * Besides filling \p key, this also ORs SPI_PS_INPUT_ENA bits into
 * shader->config for the interpolants the prolog will consume.
 */
static void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key)
{
   struct si_shader_info *info = &shader->selector->info;

   memset(key, 0, sizeof(*key));
   key->ps_prolog.states = shader->key.ps.part.prolog;
   key->ps_prolog.use_aco = info->base.use_aco_amd;
   key->ps_prolog.wave32 = shader->wave_size == 32;
   key->ps_prolog.colors_read = shader->info.ps_colors_read;
   key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
   /* WQM is needed when helper invocations are required AND the prolog
    * actually interpolates something (colors or a forced interp state). */
   key->ps_prolog.wqm =
      info->base.fs.needs_quad_helper_invocations &&
      (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
       key->ps_prolog.states.force_linear_sample_interp ||
       key->ps_prolog.states.force_persp_center_interp ||
       key->ps_prolog.states.force_linear_center_interp ||
       key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear);
   key->ps_prolog.num_fragcoord_components = shader->info.num_fragcoord_components;

   /* The poly-stipple prolog loads the stipple pattern from memory. */
   if (shader->key.ps.part.prolog.poly_stipple)
      shader->info.uses_vmem_load_other = true;

   if (shader->info.ps_colors_read) {
      uint8_t *color = shader->selector->info.color_attr_index;

      if (shader->key.ps.part.prolog.color_two_side) {
         /* BCOLORs are stored after the last input. */
         key->ps_prolog.num_interp_inputs = shader->info.num_ps_inputs;
         shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
      }

      /* Handle COLOR0 and COLOR1 independently. */
      for (unsigned i = 0; i < 2; i++) {
         unsigned interp = info->color_interpolate[i];
         unsigned location = info->color_interpolate_loc[i];

         /* ps_colors_read has 4 component bits per color. */
         if (!(shader->info.ps_colors_read & (0xf << i * 4)))
            continue;

         key->ps_prolog.color_attr_index[i] = color[i];

         if (shader->key.ps.part.prolog.flatshade_colors && interp == INTERP_MODE_COLOR)
            interp = INTERP_MODE_FLAT;

         /* color_interp_vgpr_index selects the first VGPR of the (i,j)
          * barycentric pair in the PS input VGPR layout (see the argument
          * order in si_get_ps_prolog_args); -1 means flat (no interp). */
         switch (interp) {
         case INTERP_MODE_FLAT:
            key->ps_prolog.color_interp_vgpr_index[i] = -1;
            break;
         case INTERP_MODE_SMOOTH:
         case INTERP_MODE_COLOR:
            /* Force the interpolation location for colors here. */
            if (shader->key.ps.part.prolog.force_persp_sample_interp)
               location = TGSI_INTERPOLATE_LOC_SAMPLE;
            if (shader->key.ps.part.prolog.force_persp_center_interp)
               location = TGSI_INTERPOLATE_LOC_CENTER;

            switch (location) {
            case TGSI_INTERPOLATE_LOC_SAMPLE:
               key->ps_prolog.color_interp_vgpr_index[i] = 0;
               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTER:
               key->ps_prolog.color_interp_vgpr_index[i] = 2;
               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTROID:
               key->ps_prolog.color_interp_vgpr_index[i] = 4;
               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
               break;
            default:
               assert(0);
            }
            break;
         case INTERP_MODE_NOPERSPECTIVE:
            /* Force the interpolation location for colors here. */
            if (shader->key.ps.part.prolog.force_linear_sample_interp)
               location = TGSI_INTERPOLATE_LOC_SAMPLE;
            if (shader->key.ps.part.prolog.force_linear_center_interp)
               location = TGSI_INTERPOLATE_LOC_CENTER;

            /* The VGPR assignment for non-monolithic shaders
             * works because InitialPSInputAddr is set on the
             * main shader and PERSP_PULL_MODEL is never used.
             */
            switch (location) {
            case TGSI_INTERPOLATE_LOC_SAMPLE:
               key->ps_prolog.color_interp_vgpr_index[i] = 6;
               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTER:
               key->ps_prolog.color_interp_vgpr_index[i] = 8;
               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTROID:
               key->ps_prolog.color_interp_vgpr_index[i] = 10;
               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
               break;
            default:
               assert(0);
            }
            break;
         default:
            assert(0);
         }
      }
   }
}
3427 
3428 /**
3429  * Check whether a PS prolog is required based on the key.
3430  */
si_need_ps_prolog(const union si_shader_part_key * key)3431 static bool si_need_ps_prolog(const union si_shader_part_key *key)
3432 {
3433    return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
3434           key->ps_prolog.states.force_linear_sample_interp ||
3435           key->ps_prolog.states.force_persp_center_interp ||
3436           key->ps_prolog.states.force_linear_center_interp ||
3437           key->ps_prolog.states.bc_optimize_for_persp ||
3438           key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
3439           key->ps_prolog.states.samplemask_log_ps_iter;
3440 }
3441 
3442 /**
3443  * Compute the PS epilog key, which contains all the information needed to
3444  * build the PS epilog function.
3445  */
si_get_ps_epilog_key(struct si_shader * shader,union si_shader_part_key * key)3446 static void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
3447 {
3448    struct si_shader_info *info = &shader->selector->info;
3449    memset(key, 0, sizeof(*key));
3450    key->ps_epilog.use_aco = info->base.use_aco_amd;
3451    key->ps_epilog.wave32 = shader->wave_size == 32;
3452    key->ps_epilog.uses_discard = si_shader_uses_discard(shader);
3453    key->ps_epilog.colors_written = info->colors_written;
3454    key->ps_epilog.color_types = info->output_color_types;
3455    key->ps_epilog.writes_z = info->writes_z;
3456    key->ps_epilog.writes_stencil = info->writes_stencil;
3457    key->ps_epilog.writes_samplemask = info->writes_samplemask;
3458    key->ps_epilog.states = shader->key.ps.part.epilog;
3459 }
3460 
/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 *
 * \return false if creating either needed part failed.
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                                      struct si_shader *shader, struct util_debug_callback *debug)
{
   union si_shader_part_key prolog_key;
   union si_shader_part_key epilog_key;

   /* Get the prolog. */
   si_get_ps_prolog_key(shader, &prolog_key);

   /* The prolog is a no-op if these aren't set; shader->prolog stays NULL then. */
   if (si_need_ps_prolog(&prolog_key)) {
      shader->prolog =
         si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key,
                            compiler, debug, "Fragment Shader Prolog");
      if (!shader->prolog)
         return false;
   }

   /* Get the epilog. Unlike the prolog, it is always required. */
   si_get_ps_epilog_key(shader, &epilog_key);

   shader->epilog =
      si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key,
                         compiler, debug, "Fragment Shader Epilog");
   if (!shader->epilog)
      return false;

   si_fixup_spi_ps_input_config(shader);

   /* Make sure spi_ps_input_addr bits is superset of spi_ps_input_ena. */
   unsigned spi_ps_input_ena = shader->config.spi_ps_input_ena;
   unsigned spi_ps_input_addr = shader->config.spi_ps_input_addr;
   assert((spi_ps_input_ena & spi_ps_input_addr) == spi_ps_input_ena);

   return true;
}
3500 
void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
{
   /* If tessellation is all offchip and on-chip GS isn't used, this
    * workaround is not needed.
    *
    * NOTE(review): the unconditional return below deliberately disables
    * this workaround; the code after it is unreachable and kept for
    * reference in case it must be re-enabled.
    */
   return;

   /* SPI barrier management bug:
    *   Make sure we have at least 4k of LDS in use to avoid the bug.
    *   It applies to workgroup sizes of more than one wavefront.
    */
   if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
      *lds_size = MAX2(*lds_size, 8);
}
3515 
si_fix_resource_usage(struct si_screen * sscreen,struct si_shader * shader)3516 static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
3517 {
3518    unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
3519 
3520    shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
3521 
3522    if (shader->selector->stage == MESA_SHADER_COMPUTE &&
3523        si_get_max_workgroup_size(shader) > shader->wave_size) {
3524       si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
3525    }
3526 }
3527 
si_create_shader_variant(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3528 bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3529                               struct si_shader *shader, struct util_debug_callback *debug)
3530 {
3531    struct si_shader_selector *sel = shader->selector;
3532    struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key, shader->wave_size);
3533 
3534    /* LS, ES, VS are compiled on demand if the main part hasn't been
3535     * compiled for that stage.
3536     *
3537     * GS are compiled on demand if the main part hasn't been compiled
3538     * for the chosen NGG-ness.
3539     *
3540     * Vertex shaders are compiled on demand when a vertex fetch
3541     * workaround must be applied.
3542     */
3543    if (shader->is_monolithic) {
3544       /* Monolithic shader (compiled as a whole, has many variants,
3545        * may take a long time to compile).
3546        */
3547       if (!si_compile_shader(sscreen, compiler, shader, debug))
3548          return false;
3549    } else {
3550       /* The shader consists of several parts:
3551        *
3552        * - the middle part is the user shader, it has 1 variant only
3553        *   and it was compiled during the creation of the shader
3554        *   selector
3555        * - the prolog part is inserted at the beginning
3556        * - the epilog part is inserted at the end
3557        *
3558        * The prolog and epilog have many (but simple) variants.
3559        *
3560        * Starting with gfx9, geometry and tessellation control
3561        * shaders also contain the prolog and user shader parts of
3562        * the previous shader stage.
3563        */
3564 
3565       if (!mainp)
3566          return false;
3567 
3568       determine_shader_variant_info(sscreen, shader);
3569 
3570       /* Copy the compiled shader data over. */
3571       shader->is_binary_shared = true;
3572       shader->binary = mainp->binary;
3573       shader->config = mainp->config;
3574       shader->info = mainp->info;
3575 
3576       /* Select prologs and/or epilogs. */
3577       switch (sel->stage) {
3578       case MESA_SHADER_TESS_CTRL:
3579          if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
3580             return false;
3581          break;
3582       case MESA_SHADER_GEOMETRY:
3583          if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
3584             return false;
3585 
3586          /* Clone the GS copy shader for the shader variant.
3587           * We can't just copy the pointer because we change the pm4 state and
3588           * si_shader_selector::gs_copy_shader must be immutable because it's shared
3589           * by multiple contexts.
3590           */
3591          if (!shader->key.ge.as_ngg) {
3592             assert(mainp->gs_copy_shader);
3593             assert(mainp->gs_copy_shader->bo);
3594             assert(!mainp->gs_copy_shader->previous_stage_sel);
3595             assert(!mainp->gs_copy_shader->scratch_va);
3596 
3597             shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
3598             memcpy(shader->gs_copy_shader, mainp->gs_copy_shader,
3599                    sizeof(*shader->gs_copy_shader));
3600             /* Increase the reference count. */
3601             pipe_reference(NULL, &shader->gs_copy_shader->bo->b.b.reference);
3602             /* Initialize some fields differently. */
3603             shader->gs_copy_shader->shader_log = NULL;
3604             shader->gs_copy_shader->is_binary_shared = true;
3605             util_queue_fence_init(&shader->gs_copy_shader->ready);
3606          }
3607          break;
3608       case MESA_SHADER_FRAGMENT:
3609          if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
3610             return false;
3611 
3612          /* Make sure we have at least as many VGPRs as there
3613           * are allocated inputs.
3614           */
3615          shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
3616          break;
3617       default:;
3618       }
3619 
3620       assert(shader->wave_size == mainp->wave_size);
3621       assert(!shader->previous_stage || shader->wave_size == shader->previous_stage->wave_size);
3622 
3623       /* Update SGPR and VGPR counts. */
3624       if (shader->prolog) {
3625          shader->config.num_sgprs =
3626             MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs);
3627          shader->config.num_vgprs =
3628             MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs);
3629       }
3630       if (shader->previous_stage) {
3631          shader->config.num_sgprs =
3632             MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
3633          shader->config.num_vgprs =
3634             MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
3635          shader->config.spilled_sgprs =
3636             MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
3637          shader->config.spilled_vgprs =
3638             MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
3639          shader->info.private_mem_vgprs =
3640             MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
3641          shader->config.scratch_bytes_per_wave =
3642             MAX2(shader->config.scratch_bytes_per_wave,
3643                  shader->previous_stage->config.scratch_bytes_per_wave);
3644          shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;
3645          shader->info.uses_vmem_load_other |= shader->previous_stage->info.uses_vmem_load_other;
3646          shader->info.uses_vmem_sampler_or_bvh |= shader->previous_stage->info.uses_vmem_sampler_or_bvh;
3647       }
3648       if (shader->epilog) {
3649          shader->config.num_sgprs =
3650             MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs);
3651          shader->config.num_vgprs =
3652             MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs);
3653       }
3654       si_calculate_max_simd_waves(shader);
3655    }
3656 
3657    if (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
3658       assert(!shader->key.ge.as_es && !shader->key.ge.as_ls);
3659       if (!gfx10_ngg_calculate_subgroup_info(shader)) {
3660          fprintf(stderr, "Failed to compute subgroup info\n");
3661          return false;
3662       }
3663    } else if (sscreen->info.gfx_level >= GFX9 && sel->stage == MESA_SHADER_GEOMETRY) {
3664       gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
3665    }
3666 
3667    shader->uses_vs_state_provoking_vertex =
3668       sscreen->use_ngg &&
3669       /* Used to convert triangle strips from GS to triangles. */
3670       ((sel->stage == MESA_SHADER_GEOMETRY &&
3671         util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) ||
3672        (sel->stage == MESA_SHADER_VERTEX &&
3673         /* Used to export PrimitiveID from the correct vertex. */
3674         shader->key.ge.mono.u.vs_export_prim_id));
3675 
3676    shader->uses_gs_state_outprim = sscreen->use_ngg &&
3677                                    /* Only used by streamout and the PrimID export in vertex
3678                                     * shaders. */
3679                                    sel->stage == MESA_SHADER_VERTEX &&
3680                                    (si_shader_uses_streamout(shader) ||
3681                                     shader->uses_vs_state_provoking_vertex);
3682 
3683    if (sel->stage == MESA_SHADER_VERTEX) {
3684       shader->uses_base_instance = sel->info.uses_base_instance ||
3685                                    shader->key.ge.mono.instance_divisor_is_one ||
3686                                    shader->key.ge.mono.instance_divisor_is_fetched;
3687    } else if (sel->stage == MESA_SHADER_TESS_CTRL) {
3688       shader->uses_base_instance = shader->previous_stage_sel &&
3689                                    (shader->previous_stage_sel->info.uses_base_instance ||
3690                                     shader->key.ge.mono.instance_divisor_is_one ||
3691                                     shader->key.ge.mono.instance_divisor_is_fetched);
3692    } else if (sel->stage == MESA_SHADER_GEOMETRY) {
3693       shader->uses_base_instance = shader->previous_stage_sel &&
3694                                    (shader->previous_stage_sel->info.uses_base_instance ||
3695                                     shader->key.ge.mono.instance_divisor_is_one ||
3696                                     shader->key.ge.mono.instance_divisor_is_fetched);
3697    }
3698 
3699    si_fix_resource_usage(sscreen, shader);
3700 
3701    /* Upload. */
3702    bool ok = si_shader_binary_upload(sscreen, shader, 0) >= 0;
3703 
3704    shader->complete_shader_binary_size = si_get_shader_binary_size(sscreen, shader);
3705 
3706    si_shader_dump(sscreen, shader, debug, stderr, true);
3707 
3708    if (!ok)
3709       fprintf(stderr, "LLVM failed to upload shader\n");
3710    return ok;
3711 }
3712 
si_shader_binary_clean(struct si_shader_binary * binary)3713 void si_shader_binary_clean(struct si_shader_binary *binary)
3714 {
3715    free((void *)binary->code_buffer);
3716    binary->code_buffer = NULL;
3717 
3718    free(binary->llvm_ir_string);
3719    binary->llvm_ir_string = NULL;
3720 
3721    free((void *)binary->symbols);
3722    binary->symbols = NULL;
3723 
3724    free(binary->uploaded_code);
3725    binary->uploaded_code = NULL;
3726    binary->uploaded_code_size = 0;
3727 }
3728 
si_shader_destroy(struct si_shader * shader)3729 void si_shader_destroy(struct si_shader *shader)
3730 {
3731    si_resource_reference(&shader->bo, NULL);
3732 
3733    if (!shader->is_binary_shared)
3734       si_shader_binary_clean(&shader->binary);
3735 
3736    free(shader->shader_log);
3737 }
3738 
si_get_prev_stage_nir_shader(struct si_shader * shader,struct si_shader * prev_shader,struct si_shader_args * args,bool * free_nir)3739 nir_shader *si_get_prev_stage_nir_shader(struct si_shader *shader,
3740                                          struct si_shader *prev_shader,
3741                                          struct si_shader_args *args,
3742                                          bool *free_nir)
3743 {
3744    const struct si_shader_selector *sel = shader->selector;
3745    const union si_shader_key *key = &shader->key;
3746 
3747    if (sel->stage == MESA_SHADER_TESS_CTRL) {
3748       struct si_shader_selector *ls = key->ge.part.tcs.ls;
3749 
3750       prev_shader->selector = ls;
3751       prev_shader->key.ge.as_ls = 1;
3752    } else {
3753       struct si_shader_selector *es = key->ge.part.gs.es;
3754 
3755       prev_shader->selector = es;
3756       prev_shader->key.ge.as_es = 1;
3757       prev_shader->key.ge.as_ngg = key->ge.as_ngg;
3758    }
3759 
3760    prev_shader->next_shader = shader;
3761    prev_shader->key.ge.mono = key->ge.mono;
3762    prev_shader->key.ge.opt = key->ge.opt;
3763    prev_shader->key.ge.opt.inline_uniforms = false; /* only TCS/GS can inline uniforms */
3764    /* kill_outputs was computed based on second shader's outputs so we can't use it to
3765     * kill first shader's outputs.
3766     */
3767    prev_shader->key.ge.opt.kill_outputs = 0;
3768    prev_shader->is_monolithic = true;
3769    prev_shader->wave_size = shader->wave_size;
3770 
3771    nir_shader *nir = si_get_nir_shader(prev_shader, args, free_nir, NULL);
3772    si_update_shader_binary_info(shader, nir);
3773 
3774    shader->info.uses_instanceid |=
3775       prev_shader->selector->info.uses_instanceid || prev_shader->info.uses_instanceid;
3776 
3777    return nir;
3778 }
3779 
/* Declare the input arguments of the PS prolog in the exact hardware order.
 * The VGPR order here defines the color_interp_vgpr_index values used by
 * si_get_ps_prolog_key (persp pairs at 0/2/4, linear pairs at 6/8/10,
 * because PERSP_PULL_MODEL is skipped).
 */
void si_get_ps_prolog_args(struct si_shader_args *args,
                           const union si_shader_part_key *key)
{
   memset(args, 0, sizeof(*args));

   const unsigned num_input_sgprs = key->ps_prolog.num_input_sgprs;

   /* NOTE(review): VLA sized by the key; assumed small and bounded by the
    * SGPR argument limit — confirm there is no path with a huge count. */
   struct ac_arg input_sgprs[num_input_sgprs];
   for (unsigned i = 0; i < num_input_sgprs; i++)
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, input_sgprs + i);

   /* assumes num_input_sgprs > SI_PS_NUM_USER_SGPR (and thus also
    * > SI_SGPR_INTERNAL_BINDINGS) — TODO confirm against callers. */
   args->internal_bindings = input_sgprs[SI_SGPR_INTERNAL_BINDINGS];
   /* Use the absolute location of the input. */
   args->ac.prim_mask = input_sgprs[SI_PS_NUM_USER_SGPR];

   /* Barycentric coordinate pairs, two VGPRs each. */
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_sample);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_center);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_centroid);
   /* skip PERSP_PULL_MODEL */
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_sample);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_center);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_centroid);
   /* skip LINE_STIPPLE_TEX */

   /* POS_X|Y|Z|W_FLOAT */
   for (unsigned i = 0; i < key->ps_prolog.num_fragcoord_components; i++)
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);

   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.front_face);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.ancillary);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.sample_coverage);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.pos_fixed_pt);
}
3813 
si_get_ps_epilog_args(struct si_shader_args * args,const union si_shader_part_key * key,struct ac_arg colors[MAX_DRAW_BUFFERS],struct ac_arg * depth,struct ac_arg * stencil,struct ac_arg * sample_mask)3814 void si_get_ps_epilog_args(struct si_shader_args *args,
3815                            const union si_shader_part_key *key,
3816                            struct ac_arg colors[MAX_DRAW_BUFFERS],
3817                            struct ac_arg *depth, struct ac_arg *stencil,
3818                            struct ac_arg *sample_mask)
3819 {
3820    memset(args, 0, sizeof(*args));
3821 
3822    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3823    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3824    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3825    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3826    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &args->alpha_reference);
3827 
3828    u_foreach_bit (i, key->ps_epilog.colors_written) {
3829       ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_FLOAT, colors + i);
3830    }
3831 
3832    if (key->ps_epilog.writes_z)
3833       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, depth);
3834 
3835    if (key->ps_epilog.writes_stencil)
3836       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, stencil);
3837 
3838    if (key->ps_epilog.writes_samplemask)
3839       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, sample_mask);
3840 }
3841