• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2012 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "si_shader.h"
8 #include "ac_nir.h"
9 #include "ac_rtld.h"
10 #include "nir.h"
11 #include "nir_builder.h"
12 #include "nir_serialize.h"
13 #include "nir_xfb_info.h"
14 #include "si_pipe.h"
15 #include "si_shader_internal.h"
16 #include "sid.h"
17 #include "tgsi/tgsi_from_mesa.h"
18 #include "util/u_memory.h"
19 #include "util/mesa-sha1.h"
20 #include "util/ralloc.h"
21 #include "util/u_upload_mgr.h"
22 
23 #if LLVM_AVAILABLE
24 #include <llvm/Config/llvm-config.h> /* for LLVM_VERSION_MAJOR */
25 #else
26 #define LLVM_VERSION_MAJOR 0
27 #endif
28 
29 static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";
30 
31 static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";
32 
33 static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
34 static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader);
35 
36 /* Get the number of all interpolated inputs */
si_get_ps_num_interp(struct si_shader * ps)37 unsigned si_get_ps_num_interp(struct si_shader *ps)
38 {
39    unsigned num_interp = ps->info.num_ps_inputs;
40 
41    /* Back colors are added by the PS prolog when needed. */
42    if (!ps->is_monolithic && ps->key.ps.part.prolog.color_two_side)
43       num_interp += !!(ps->info.ps_colors_read & 0x0f) + !!(ps->info.ps_colors_read & 0xf0);
44 
45    assert(num_interp <= 32);
46    return MIN2(num_interp, 32);
47 }
48 
49 /** Whether the shader runs as a combination of multiple API shaders */
si_is_multi_part_shader(struct si_shader * shader)50 bool si_is_multi_part_shader(struct si_shader *shader)
51 {
52    if (shader->selector->screen->info.gfx_level <= GFX8 ||
53        shader->selector->stage > MESA_SHADER_GEOMETRY)
54       return false;
55 
56    return shader->key.ge.as_ls || shader->key.ge.as_es ||
57           shader->selector->stage == MESA_SHADER_TESS_CTRL ||
58           shader->selector->stage == MESA_SHADER_GEOMETRY;
59 }
60 
61 /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
si_is_merged_shader(struct si_shader * shader)62 bool si_is_merged_shader(struct si_shader *shader)
63 {
64    if (shader->selector->stage > MESA_SHADER_GEOMETRY || shader->is_gs_copy_shader)
65       return false;
66 
67    return shader->key.ge.as_ngg || si_is_multi_part_shader(shader);
68 }
69 
70 /**
71  * Returns a unique index for a semantic name and index. The index must be
72  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
73  * calculated.
74  */
si_shader_io_get_unique_index(unsigned semantic)75 unsigned si_shader_io_get_unique_index(unsigned semantic)
76 {
77    switch (semantic) {
78    case VARYING_SLOT_POS:
79       return SI_UNIQUE_SLOT_POS;
80    default:
81       if (semantic >= VARYING_SLOT_VAR0 && semantic <= VARYING_SLOT_VAR31)
82          return SI_UNIQUE_SLOT_VAR0 + (semantic - VARYING_SLOT_VAR0);
83 
84       if (semantic >= VARYING_SLOT_VAR0_16BIT && semantic <= VARYING_SLOT_VAR15_16BIT)
85          return SI_UNIQUE_SLOT_VAR0_16BIT + (semantic - VARYING_SLOT_VAR0_16BIT);
86 
87       assert(!"invalid generic index");
88       return 0;
89 
90    /* Legacy desktop GL varyings. */
91    case VARYING_SLOT_FOGC:
92       return SI_UNIQUE_SLOT_FOGC;
93    case VARYING_SLOT_COL0:
94       return SI_UNIQUE_SLOT_COL0;
95    case VARYING_SLOT_COL1:
96       return SI_UNIQUE_SLOT_COL1;
97    case VARYING_SLOT_BFC0:
98       return SI_UNIQUE_SLOT_BFC0;
99    case VARYING_SLOT_BFC1:
100       return SI_UNIQUE_SLOT_BFC1;
101    case VARYING_SLOT_TEX0:
102    case VARYING_SLOT_TEX1:
103    case VARYING_SLOT_TEX2:
104    case VARYING_SLOT_TEX3:
105    case VARYING_SLOT_TEX4:
106    case VARYING_SLOT_TEX5:
107    case VARYING_SLOT_TEX6:
108    case VARYING_SLOT_TEX7:
109       return SI_UNIQUE_SLOT_TEX0 + (semantic - VARYING_SLOT_TEX0);
110    case VARYING_SLOT_CLIP_VERTEX:
111       return SI_UNIQUE_SLOT_CLIP_VERTEX;
112 
113    /* Varyings present in both GLES and desktop GL. */
114    case VARYING_SLOT_CLIP_DIST0:
115       return SI_UNIQUE_SLOT_CLIP_DIST0;
116    case VARYING_SLOT_CLIP_DIST1:
117       return SI_UNIQUE_SLOT_CLIP_DIST1;
118    case VARYING_SLOT_PSIZ:
119       return SI_UNIQUE_SLOT_PSIZ;
120    case VARYING_SLOT_LAYER:
121       return SI_UNIQUE_SLOT_LAYER;
122    case VARYING_SLOT_VIEWPORT:
123       return SI_UNIQUE_SLOT_VIEWPORT;
124    case VARYING_SLOT_PRIMITIVE_ID:
125       return SI_UNIQUE_SLOT_PRIMITIVE_ID;
126    }
127 }
128 
/* Declare the streamout (transform feedback) SGPR arguments for a VS/TES
 * hardware stage (also used for the GS copy shader).
 *
 * The ac_add_arg calls define the SGPR layout the hardware/prolog expects,
 * so their order must not change.
 */
static void declare_streamout_params(struct si_shader_args *args, struct si_shader *shader,
                                     const shader_info *info)
{
   if (shader->selector->screen->info.gfx_level >= GFX11) {
      /* NGG streamout. */
      if (info->stage == MESA_SHADER_TESS_EVAL)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      return;
   }

   /* Streamout SGPRs. */
   if (si_shader_uses_streamout(shader)) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_config);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_write_index);

      /* A streamout buffer offset is loaded if the stride is non-zero. */
      for (int i = 0; i < 4; i++) {
         if (!info->xfb_stride[i])
            continue;

         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_offset[i]);
      }
   } else if (info->stage == MESA_SHADER_TESS_EVAL) {
      /* Keep the TES SGPR layout fixed even without streamout. */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   }
}
155 
si_get_max_workgroup_size(const struct si_shader * shader)156 unsigned si_get_max_workgroup_size(const struct si_shader *shader)
157 {
158    gl_shader_stage stage = shader->is_gs_copy_shader ?
159       MESA_SHADER_VERTEX : shader->selector->stage;
160 
161    assert(shader->wave_size);
162 
163    switch (stage) {
164    case MESA_SHADER_VERTEX:
165    case MESA_SHADER_TESS_EVAL:
166       /* Use the largest workgroup size for streamout */
167       if (shader->key.ge.as_ngg)
168          return si_shader_uses_streamout(shader) ? 256 : 128;
169 
170       /* As part of merged shader. */
171       return shader->selector->screen->info.gfx_level >= GFX9 &&
172          (shader->key.ge.as_ls || shader->key.ge.as_es) ? 128 : shader->wave_size;
173 
174    case MESA_SHADER_TESS_CTRL:
175       /* Return this so that LLVM doesn't remove s_barrier
176        * instructions on chips where we use s_barrier. */
177       return shader->selector->screen->info.gfx_level >= GFX7 ? 128 : shader->wave_size;
178 
179    case MESA_SHADER_GEOMETRY:
180       /* GS can always generate up to 256 vertices. */
181       return shader->selector->screen->info.gfx_level >= GFX9 ? 256 : shader->wave_size;
182 
183    case MESA_SHADER_COMPUTE:
184       break; /* see below */
185 
186    default:
187       return shader->wave_size;
188    }
189 
190    /* Compile a variable block size using the maximum variable size. */
191    if (shader->selector->info.base.workgroup_size_variable)
192       return SI_MAX_VARIABLE_THREADS_PER_BLOCK;
193 
194    uint16_t *local_size = shader->selector->info.base.workgroup_size;
195    unsigned max_work_group_size = (uint32_t)local_size[0] *
196                                   (uint32_t)local_size[1] *
197                                   (uint32_t)local_size[2];
198    assert(max_work_group_size);
199    return max_work_group_size;
200 }
201 
declare_const_and_shader_buffers(struct si_shader_args * args,struct si_shader * shader,const shader_info * info,bool assign_params)202 static void declare_const_and_shader_buffers(struct si_shader_args *args, struct si_shader *shader,
203                                              const shader_info *info, bool assign_params)
204 {
205    enum ac_arg_type const_shader_buf_type;
206 
207    if (info->num_ubos == 1 && info->num_ssbos == 0)
208       const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
209    else
210       const_shader_buf_type = AC_ARG_CONST_DESC_PTR;
211 
212    ac_add_arg(
213       &args->ac, AC_ARG_SGPR, 1, const_shader_buf_type,
214       assign_params ? &args->const_and_shader_buffers : &args->other_const_and_shader_buffers);
215 }
216 
/* Declare the SGPR holding the samplers-and-images descriptor pointer.
 * "assign_params" selects which struct member receives the arg index
 * (current stage vs. the other stage of a merged shader).
 */
static void declare_samplers_and_images(struct si_shader_args *args, bool assign_params)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
              assign_params ? &args->samplers_and_images : &args->other_samplers_and_images);
}
222 
/* Declare the per-stage descriptor pointers (const/shader buffers followed by
 * samplers/images). Declaration order defines the SGPR layout — keep it.
 */
static void declare_per_stage_desc_pointers(struct si_shader_args *args, struct si_shader *shader,
                                            const shader_info *info, bool assign_params)
{
   declare_const_and_shader_buffers(args, shader, info, assign_params);
   declare_samplers_and_images(args, assign_params);
}
229 
/* Declare the descriptor pointers common to all stages: internal bindings
 * and the bindless samplers/images descriptor pointer.
 */
static void declare_global_desc_pointers(struct si_shader_args *args)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->internal_bindings);
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
              &args->bindless_samplers_and_images);
}
236 
/* Declare the vertex-buffer descriptor list pointer and, when enabled,
 * VB descriptors preloaded into user SGPRs (4 SGPRs per descriptor).
 */
static void declare_vb_descriptor_input_sgprs(struct si_shader_args *args,
                                              struct si_shader *shader)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->ac.vertex_buffers);

   unsigned num_vbos_in_user_sgprs = shader->selector->info.num_vbos_in_user_sgprs;
   if (num_vbos_in_user_sgprs) {
      unsigned user_sgprs = args->ac.num_sgprs_used;

      /* Merged shaders declare 8 system SGPRs first, which are not part of
       * the user SGPR layout, so subtract them here. */
      if (si_is_merged_shader(shader))
         user_sgprs -= 8;
      assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);

      /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
      for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

      assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(args->vb_descriptors));
      for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 4, AC_ARG_INT, &args->vb_descriptors[i]);
   }
}
259 
/* Declare the VS input VGPRs.
 *
 * The VGPR positions are fixed by the hardware and differ per gfx level and
 * per whether the VS runs as LS (merged with TCS), so the order of the
 * ac_add_arg calls below must not change.
 */
static void declare_vs_input_vgprs(struct si_shader_args *args, struct si_shader *shader)
{
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vertex_id);

   if (shader->selector->screen->info.gfx_level >= GFX12) {
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
   } else if (shader->key.ge.as_ls) {
      if (shader->selector->screen->info.gfx_level >= GFX11) {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      } else if (shader->selector->screen->info.gfx_level >= GFX10) {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      } else {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
      }
   } else if (shader->selector->screen->info.gfx_level >= GFX10) {
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
                 /* user vgpr or PrimID (legacy) */
                 shader->key.ge.as_ngg ? NULL : &args->ac.vs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
   } else {
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
   }
}
292 
/* Declare the SGPR inputs of the internal blit vertex shader.
 *
 * The first 3 SGPRs are the two packed i16 corner coordinates and the depth.
 * Depending on the blit mode (info->vs.blit_sgprs_amd), either 4 color
 * components or 6 texcoord components follow. On GFX11+ one extra SGPR for
 * the attribute ring address is appended, which is why
 * has_attribute_ring_address is added to the mode constants below.
 */
static void declare_vs_blit_inputs(struct si_shader *shader, struct si_shader_args *args,
                                   const shader_info *info)
{
   bool has_attribute_ring_address = shader->selector->screen->info.gfx_level >= GFX11;

   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_blit_inputs); /* i16 x1, y1 */
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);                  /* i16 x2, y2 */
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL);                /* depth */

   if (info->vs.blit_sgprs_amd ==
       SI_VS_BLIT_SGPRS_POS_COLOR + has_attribute_ring_address) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
      if (has_attribute_ring_address)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* attribute ring address */
   } else if (info->vs.blit_sgprs_amd ==
              SI_VS_BLIT_SGPRS_POS_TEXCOORD + has_attribute_ring_address) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
      if (has_attribute_ring_address)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* attribute ring address */
   }
}
322 
/* Declare the TES input VGPRs: tess coords (u, v), the relative patch id and
 * the patch id, in the hardware's fixed order.
 */
static void declare_tes_input_vgprs(struct si_shader_args *args)
{
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_u);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_v);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_rel_patch_id);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_patch_id);
}
330 
/* Extra "stage_case" values used by si_init_shader_args for merged HW
 * stages. They start at MESA_ALL_SHADER_STAGES so they can share a switch
 * with the regular gl_shader_stage values without colliding.
 */
enum
{
   /* Convenient merged shader definitions. */
   SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES,
   SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};
337 
/* Add an argument and assert that it lands at the expected index "idx".
 * Used for the PS input layout, where each arg must match its fixed
 * SI_PARAM_* position.
 */
static void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
                               enum ac_arg_type type, struct ac_arg *arg, unsigned idx)
{
   assert(args->arg_count == idx);
   ac_add_arg(args, file, registers, type, arg);
}
344 
si_init_shader_args(struct si_shader * shader,struct si_shader_args * args,const shader_info * info)345 static void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args,
346                                 const shader_info *info)
347 {
348    unsigned i, num_returns, num_return_sgprs;
349    unsigned num_prolog_vgprs = 0;
350    struct si_shader_selector *sel = shader->selector;
351    unsigned stage = shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : info->stage;
352    unsigned stage_case = stage;
353 
354    memset(args, 0, sizeof(*args));
355 
356    /* Set MERGED shaders. */
357    if (sel->screen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY) {
358       if (shader->key.ge.as_ls || stage == MESA_SHADER_TESS_CTRL)
359          stage_case = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
360       else if (shader->key.ge.as_es || shader->key.ge.as_ngg || stage == MESA_SHADER_GEOMETRY)
361          stage_case = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
362    }
363 
364    switch (stage_case) {
365    case MESA_SHADER_VERTEX:
366       declare_global_desc_pointers(args);
367 
368       if (info->vs.blit_sgprs_amd) {
369          declare_vs_blit_inputs(shader, args, info);
370       } else {
371          declare_per_stage_desc_pointers(args, shader, info, true);
372          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
373 
374          if (shader->is_gs_copy_shader) {
375             declare_streamout_params(args, shader, info);
376          } else {
377             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
378             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
379             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
380             declare_vb_descriptor_input_sgprs(args, shader);
381 
382             if (shader->key.ge.as_es) {
383                ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset);
384             } else if (shader->key.ge.as_ls) {
385                /* no extra parameters */
386             } else {
387                declare_streamout_params(args, shader, info);
388             }
389          }
390       }
391 
392       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
393       if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
394          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
395 
396       /* VGPRs */
397       declare_vs_input_vgprs(args, shader);
398 
399       break;
400 
401    case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
402       declare_global_desc_pointers(args);
403       declare_per_stage_desc_pointers(args, shader, info, true);
404       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
405       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
406       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
407       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
408       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
409 
410       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
411       if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
412          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
413 
414       /* VGPRs */
415       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
416       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);
417       break;
418 
419    case SI_SHADER_MERGED_VERTEX_TESSCTRL:
420       /* Merged stages have 8 system SGPRs at the beginning. */
421       /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
422       /* Gfx11+:  SPI_SHADER_PGM_LO/HI_HS */
423       declare_per_stage_desc_pointers(args, shader, info, stage == MESA_SHADER_TESS_CTRL);
424       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
425       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info);
426       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
427       if (sel->screen->info.gfx_level >= GFX11)
428          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_wave_id);
429       else
430          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
431       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
432       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
433 
434       declare_global_desc_pointers(args);
435       declare_per_stage_desc_pointers(args, shader, info, stage == MESA_SHADER_VERTEX);
436 
437       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
438       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
439       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
440       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
441       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
442       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
443 
444       /* VGPRs (first TCS, then VS) */
445       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
446       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);
447 
448       if (stage == MESA_SHADER_VERTEX) {
449          declare_vs_input_vgprs(args, shader);
450 
451          /* Need to keep LS/HS arg index same for shared args when ACO,
452           * so this is not able to be before shared VGPRs.
453           */
454          declare_vb_descriptor_input_sgprs(args, shader);
455 
456          /* LS return values are inputs to the TCS main shader part. */
457          if (!shader->is_monolithic || shader->key.ge.opt.same_patch_vertices) {
458             for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
459                ac_add_return(&args->ac, AC_ARG_SGPR);
460             for (i = 0; i < 2; i++)
461                ac_add_return(&args->ac, AC_ARG_VGPR);
462 
463             /* VS outputs passed via VGPRs to TCS. */
464             if (shader->key.ge.opt.same_patch_vertices && !info->use_aco_amd) {
465                unsigned num_outputs = util_last_bit64(shader->selector->info.ls_es_outputs_written);
466                for (i = 0; i < num_outputs * 4; i++)
467                   ac_add_return(&args->ac, AC_ARG_VGPR);
468             }
469          }
470       } else {
471          /* TCS inputs are passed via VGPRs from VS. */
472          if (shader->key.ge.opt.same_patch_vertices && !info->use_aco_amd) {
473             unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->info.ls_es_outputs_written);
474             for (i = 0; i < num_inputs * 4; i++)
475                ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
476          }
477       }
478       break;
479 
480    case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
481       /* Merged stages have 8 system SGPRs at the beginning. */
482       /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
483       /* Gfx11+:  SPI_SHADER_PGM_LO/HI_GS */
484       declare_per_stage_desc_pointers(args, shader, info, stage == MESA_SHADER_GEOMETRY);
485 
486       if (shader->key.ge.as_ngg)
487          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_tg_info);
488       else
489          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset);
490 
491       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info);
492       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
493       if (sel->screen->info.gfx_level >= GFX11)
494          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_attr_offset);
495       else
496          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
497       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
498       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
499 
500       declare_global_desc_pointers(args);
501       if (stage != MESA_SHADER_VERTEX || !info->vs.blit_sgprs_amd) {
502          declare_per_stage_desc_pointers(
503             args, shader, info, (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL));
504       }
505 
506       if (stage == MESA_SHADER_VERTEX && info->vs.blit_sgprs_amd) {
507          declare_vs_blit_inputs(shader, args, info);
508       } else {
509          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
510 
511          if (stage == MESA_SHADER_VERTEX) {
512             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
513             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
514             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
515          } else if (stage == MESA_SHADER_TESS_EVAL) {
516             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
517             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
518             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
519          } else {
520             /* GS */
521             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
522             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
523             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
524          }
525 
526          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->small_prim_cull_info);
527          if (sel->screen->info.gfx_level >= GFX11)
528             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->gs_attr_address);
529          else
530             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
531       }
532 
533       /* VGPRs (first GS, then VS/TES) */
534       if (sel->screen->info.gfx_level >= GFX12) {
535          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
536          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
537          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
538       } else {
539          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
540          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
541          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
542          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
543          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);
544       }
545 
546       if (stage == MESA_SHADER_VERTEX) {
547          declare_vs_input_vgprs(args, shader);
548 
549          /* Need to keep ES/GS arg index same for shared args when ACO,
550           * so this is not able to be before shared VGPRs.
551           */
552          if (!info->vs.blit_sgprs_amd)
553             declare_vb_descriptor_input_sgprs(args, shader);
554       } else if (stage == MESA_SHADER_TESS_EVAL) {
555          declare_tes_input_vgprs(args);
556       }
557 
558       if (shader->key.ge.as_es && !shader->is_monolithic &&
559           (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL)) {
560          /* ES return values are inputs to GS. */
561          for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
562             ac_add_return(&args->ac, AC_ARG_SGPR);
563          for (i = 0; i < (sel->screen->info.gfx_level >= GFX12 ? 3 : 5); i++)
564             ac_add_return(&args->ac, AC_ARG_VGPR);
565       }
566       break;
567 
568    case MESA_SHADER_TESS_EVAL:
569       declare_global_desc_pointers(args);
570       declare_per_stage_desc_pointers(args, shader, info, true);
571       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
572       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
573       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
574 
575       if (shader->key.ge.as_es) {
576          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
577          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
578          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset);
579       } else {
580          declare_streamout_params(args, shader, info);
581          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
582       }
583 
584       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
585       if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
586          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
587 
588       /* VGPRs */
589       declare_tes_input_vgprs(args);
590       break;
591 
592    case MESA_SHADER_GEOMETRY:
593       declare_global_desc_pointers(args);
594       declare_per_stage_desc_pointers(args, shader, info, true);
595       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset);
596       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_wave_id);
597 
598       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
599       if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
600          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
601 
602       /* VGPRs */
603       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
604       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
605       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
606       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);
607       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[3]);
608       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[4]);
609       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[5]);
610       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
611       break;
612 
613    case MESA_SHADER_FRAGMENT:
614       declare_global_desc_pointers(args);
615       declare_per_stage_desc_pointers(args, shader, info, true);
616       si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->sample_locs[0],
617                          SI_PARAM_SAMPLE_LOCS0);
618       si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->sample_locs[1],
619                          SI_PARAM_SAMPLE_LOCS1);
620       si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->alpha_reference,
621                          SI_PARAM_ALPHA_REF);
622       si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.prim_mask,
623                          SI_PARAM_PRIM_MASK);
624 
625       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_sample,
626                          SI_PARAM_PERSP_SAMPLE);
627       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_center,
628                          SI_PARAM_PERSP_CENTER);
629       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_centroid,
630                          SI_PARAM_PERSP_CENTROID);
631       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);
632       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_sample,
633                          SI_PARAM_LINEAR_SAMPLE);
634       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_center,
635                          SI_PARAM_LINEAR_CENTER);
636       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_centroid,
637                          SI_PARAM_LINEAR_CENTROID);
638       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
639       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[0],
640                          SI_PARAM_POS_X_FLOAT);
641       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[1],
642                          SI_PARAM_POS_Y_FLOAT);
643       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[2],
644                          SI_PARAM_POS_Z_FLOAT);
645       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[3],
646                          SI_PARAM_POS_W_FLOAT);
647       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.front_face,
648                          SI_PARAM_FRONT_FACE);
649       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.ancillary,
650                          SI_PARAM_ANCILLARY);
651       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.sample_coverage,
652                          SI_PARAM_SAMPLE_COVERAGE);
653       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.pos_fixed_pt,
654                          SI_PARAM_POS_FIXED_PT);
655 
656       if (info->use_aco_amd) {
657          ac_compact_ps_vgpr_args(&args->ac, shader->config.spi_ps_input_addr);
658 
659          /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
660          if (sel->screen->info.gfx_level < GFX11)
661             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
662       }
663 
664       /* Monolithic PS emit prolog and epilog in NIR directly. */
665       if (!shader->is_monolithic) {
666          /* Color inputs from the prolog. */
667          if (shader->selector->info.colors_read) {
668             unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);
669 
670             for (i = 0; i < num_color_elements; i++)
671                ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, i ? NULL : &args->color_start);
672 
673             num_prolog_vgprs += num_color_elements;
674          }
675 
676          /* Outputs for the epilog. */
677          num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
678          /* These must always be declared even if Z/stencil/samplemask are killed. */
679          num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +
680                        sel->info.writes_z + sel->info.writes_stencil + sel->info.writes_samplemask +
681                        1 /* SampleMaskIn */;
682 
683          for (i = 0; i < num_return_sgprs; i++)
684             ac_add_return(&args->ac, AC_ARG_SGPR);
685          for (; i < num_returns; i++)
686             ac_add_return(&args->ac, AC_ARG_VGPR);
687       }
688       break;
689 
690    case MESA_SHADER_COMPUTE:
691    case MESA_SHADER_KERNEL:
692       declare_global_desc_pointers(args);
693       declare_per_stage_desc_pointers(args, shader, info, true);
694       if (shader->selector->info.uses_grid_size)
695          ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.num_work_groups);
696       if (shader->selector->info.uses_variable_block_size)
697          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->block_size);
698 
699       unsigned cs_user_data_dwords =
700          info->cs.user_data_components_amd;
701       if (cs_user_data_dwords) {
702          ac_add_arg(&args->ac, AC_ARG_SGPR, MIN2(cs_user_data_dwords, 4), AC_ARG_INT,
703                     &args->cs_user_data[0]);
704          if (cs_user_data_dwords > 4) {
705             ac_add_arg(&args->ac, AC_ARG_SGPR, cs_user_data_dwords - 4, AC_ARG_INT,
706                        &args->cs_user_data[1]);
707          }
708       }
709 
710       /* Some descriptors can be in user SGPRs. */
711       /* Shader buffers in user SGPRs. */
712       for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) {
713          while (args->ac.num_sgprs_used % 4 != 0)
714             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
715 
716          ac_add_arg(&args->ac, AC_ARG_SGPR, 4, AC_ARG_INT, &args->cs_shaderbuf[i]);
717       }
718       /* Images in user SGPRs. */
719       for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) {
720          unsigned num_sgprs = BITSET_TEST(info->image_buffers, i) ? 4 : 8;
721 
722          while (args->ac.num_sgprs_used % num_sgprs != 0)
723             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
724 
725          ac_add_arg(&args->ac, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &args->cs_image[i]);
726       }
727 
728       /* Hardware SGPRs. */
729       for (i = 0; i < 3; i++) {
730          if (shader->selector->info.uses_block_id[i]) {
731             /* GFX12 loads workgroup IDs into ttmp registers, so they are not input SGPRs, but we
732              * still need to set this to indicate that they are enabled (for ac_nir_to_llvm).
733              */
734             if (sel->screen->info.gfx_level >= GFX12)
735                args->ac.workgroup_ids[i].used = true;
736             else
737                ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.workgroup_ids[i]);
738          }
739       }
740       if (shader->selector->info.uses_tg_size)
741          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tg_size);
742 
743       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
744       if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
745          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
746 
747       /* Hardware VGPRs. */
748       /* Thread IDs are packed in VGPR0, 10 bits per component or stored in 3 separate VGPRs */
749       if (sel->screen->info.gfx_level >= GFX11 ||
750           (!sel->screen->info.has_graphics && sel->screen->info.family >= CHIP_MI200)) {
751          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_ids_packed);
752       } else {
753          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_id_x);
754          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_id_y);
755          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_id_z);
756       }
757       break;
758    default:
759       assert(0 && "unimplemented shader");
760       return;
761    }
762 
763    shader->info.num_input_sgprs = args->ac.num_sgprs_used;
764    shader->info.num_input_vgprs = args->ac.num_vgprs_used;
765 
766    assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
767    shader->info.num_input_vgprs -= num_prolog_vgprs;
768 }
769 
/* Return the LDS allocation granularity in bytes for the given chip
 * generation and shader stage.
 */
static unsigned get_lds_granularity(struct si_screen *screen, gl_shader_stage stage)
{
   /* GFX11+ allocates PS LDS in 1024B units; GFX7+ uses 512B; older chips 256B. */
   if (screen->info.gfx_level >= GFX11 && stage == MESA_SHADER_FRAGMENT)
      return 1024;

   if (screen->info.gfx_level >= GFX7)
      return 512;

   return 256;
}
775 
/* Open all ELF parts of a shader (prolog, merged previous stage, main part,
 * epilog) as a single ac_rtld binary, declaring the shared LDS symbols the
 * linker must resolve. Also derives shader->config.lds_size from the linked
 * LDS usage. Returns the ac_rtld_open result.
 */
static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
                                  struct ac_rtld_binary *rtld)
{
   const struct si_shader_selector *sel = shader->selector;
   const char *part_elfs[5];
   size_t part_sizes[5];
   unsigned num_parts = 0;

   /* Gather the ELF images of all present parts in execution order. */
#define add_part(shader_or_part)                                                                   \
   if (shader_or_part) {                                                                           \
      part_elfs[num_parts] = (shader_or_part)->binary.code_buffer;                                 \
      part_sizes[num_parts] = (shader_or_part)->binary.code_size;                                  \
      num_parts++;                                                                                 \
   }

   add_part(shader->prolog);
   add_part(shader->previous_stage);
   add_part(shader);
   add_part(shader->epilog);

#undef add_part

   /* Shared LDS symbols the runtime linker resolves across parts. */
   struct ac_rtld_symbol lds_symbols[2];
   unsigned num_lds_symbols = 0;

   /* ESGS ring in LDS for GFX9+ merged/NGG geometry paths (not the GS copy
    * shader, which runs standalone).
    */
   if (sel && screen->info.gfx_level >= GFX9 && !shader->is_gs_copy_shader &&
       (sel->stage == MESA_SHADER_GEOMETRY ||
        (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg))) {
      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
      sym->name = "esgs_ring";
      sym->size = shader->gs_info.esgs_ring_size * 4;
      sym->align = 64 * 1024;
   }

   /* NGG GS vertex-emit area. NOTE(review): sel is NULL-checked above but
    * dereferenced unconditionally here — presumably sel can never be NULL on
    * this path; confirm before relying on the check above.
    */
   if (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
      sym->name = "ngg_emit";
      sym->size = shader->ngg.ngg_emit_size * 4;
      sym->align = 4;
   }

   bool ok = ac_rtld_open(
      rtld, (struct ac_rtld_open_info){.info = &screen->info,
                                       .options =
                                          {
                                             .halt_at_entry = screen->options.halt_shaders,
                                             /* The wait workaround is only needed when parts are
                                              * concatenated. */
                                             .waitcnt_wa = num_parts > 1 &&
                                                           screen->info.needs_llvm_wait_wa,
                                          },
                                       .shader_type = sel->stage,
                                       .wave_size = shader->wave_size,
                                       .num_parts = num_parts,
                                       .elf_ptrs = part_elfs,
                                       .elf_sizes = part_sizes,
                                       .num_shared_lds_symbols = num_lds_symbols,
                                       .shared_lds_symbols = lds_symbols});

   /* Convert linked LDS bytes into HW allocation-granularity units. */
   if (rtld->lds_size > 0) {
      unsigned alloc_granularity = get_lds_granularity(screen, sel->stage);
      shader->config.lds_size = DIV_ROUND_UP(rtld->lds_size, alloc_granularity);
   }

   return ok;
}
840 
get_shader_binaries(struct si_shader * shader,struct si_shader_binary * bin[4])841 static unsigned get_shader_binaries(struct si_shader *shader, struct si_shader_binary *bin[4])
842 {
843    unsigned num_bin = 0;
844 
845    if (shader->prolog)
846       bin[num_bin++] = &shader->prolog->binary;
847 
848    if (shader->previous_stage)
849       bin[num_bin++] = &shader->previous_stage->binary;
850 
851    bin[num_bin++] = &shader->binary;
852 
853    if (shader->epilog)
854       bin[num_bin++] = &shader->epilog->binary;
855 
856    return num_bin;
857 }
858 
859 /* si_get_shader_binary_size should only be called once per shader
860  * and the result should be stored in shader->complete_shader_binary_size.
861  */
si_get_shader_binary_size(struct si_screen * screen,struct si_shader * shader)862 unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
863 {
864    if (shader->binary.type == SI_SHADER_BINARY_ELF) {
865       struct ac_rtld_binary rtld;
866       si_shader_binary_open(screen, shader, &rtld);
867       uint64_t size = rtld.exec_size;
868       ac_rtld_close(&rtld);
869       return size;
870    } else {
871       struct si_shader_binary *bin[4];
872       unsigned num_bin = get_shader_binaries(shader, bin);
873 
874       unsigned size = 0;
875       for (unsigned i = 0; i < num_bin; i++) {
876          assert(bin[i]->type == SI_SHADER_BINARY_RAW);
877          size += bin[i]->exec_size;
878       }
879       return size;
880    }
881 }
882 
si_get_shader_prefetch_size(struct si_shader * shader)883 unsigned si_get_shader_prefetch_size(struct si_shader *shader)
884 {
885    struct si_screen *sscreen = shader->selector->screen;
886    /* This excludes arrays of constants after instructions. */
887    unsigned exec_size =
888       ac_align_shader_binary_for_prefetch(&sscreen->info,
889                                           shader->complete_shader_binary_size);
890 
891    /* INST_PREF_SIZE uses 128B granularity.
892     * - GFX11: max 128 * 63 = 8064
893     * - GFX12: max 128 * 255 = 32640
894     */
895    unsigned max_pref_size = shader->selector->screen->info.gfx_level >= GFX12 ? 255 : 63;
896    unsigned exec_size_gran128 = DIV_ROUND_UP(exec_size, 128);
897 
898    return MIN2(max_pref_size, exec_size_gran128);
899 }
900 
si_get_external_symbol(enum amd_gfx_level gfx_level,void * data,const char * name,uint64_t * value)901 static bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
902                                    uint64_t *value)
903 {
904    uint64_t *scratch_va = data;
905 
906    if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
907       *value = (uint32_t)*scratch_va;
908       return true;
909    }
910    if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
911       /* Enable scratch coalescing. */
912       *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32);
913 
914       if (gfx_level >= GFX11)
915          *value |= S_008F04_SWIZZLE_ENABLE_GFX11(1);
916       else
917          *value |= S_008F04_SWIZZLE_ENABLE_GFX6(1);
918       return true;
919    }
920 
921    return false;
922 }
923 
/* Allocate (or reuse) the shader BO and return a CPU-writable pointer for the
 * binary upload.
 *
 * If bo_offset >= 0, the caller (SQTT pipeline upload) supplies an existing BO
 * and the shader is placed at that offset; DMA upload is disabled. Otherwise a
 * new BO is created.
 *
 * When DMA upload is used, the returned pointer is into a staging buffer
 * (*upload_ctx, *staging, *staging_offset are filled for post_upload_binary);
 * otherwise it points directly into the mapped BO. Returns NULL on failure.
 */
static void *pre_upload_binary(struct si_screen *sscreen, struct si_shader *shader,
                               unsigned binary_size, bool dma_upload,
                               struct si_context **upload_ctx,
                               struct pipe_resource **staging,
                               unsigned *staging_offset,
                               int64_t bo_offset)
{
   unsigned aligned_size = ac_align_shader_binary_for_prefetch(&sscreen->info, binary_size);

   if (bo_offset >= 0) {
      /* sqtt needs to upload shaders as a pipeline, where all shaders
       * are contiguous in memory.
       * In this case, bo_offset will be positive and we don't have to
       * realloc a new bo.
       */
      shader->gpu_address = shader->bo->gpu_address + bo_offset;
      dma_upload = false;
   } else {
      /* Drop any previous BO and allocate a fresh one. */
      si_resource_reference(&shader->bo, NULL);
      shader->bo = si_aligned_buffer_create(
         &sscreen->b,
         SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT |
         (dma_upload ? PIPE_RESOURCE_FLAG_UNMAPPABLE : 0),
         PIPE_USAGE_IMMUTABLE, align(aligned_size, SI_CPDMA_ALIGNMENT), 256);
      if (!shader->bo)
         return NULL;

      shader->gpu_address = shader->bo->gpu_address;
      bo_offset = 0;
   }

   if (dma_upload) {
      /* First upload into a staging buffer. */
      *upload_ctx = si_get_aux_context(&sscreen->aux_context.shader_upload);

      void *ret;
      u_upload_alloc((*upload_ctx)->b.stream_uploader, 0, binary_size, 256,
                     staging_offset, staging, &ret);
      /* On failure, release the aux context before returning NULL below. */
      if (!ret)
         si_put_aux_context_flush(&sscreen->aux_context.shader_upload);

      return ret;
   } else {
      /* Map the BO directly; unmapped later in post_upload_binary. */
      void *ptr = sscreen->ws->buffer_map(sscreen->ws,
         shader->bo->buf, NULL,
         PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
      if (!ptr)
         return NULL;

      return ptr + bo_offset;
   }
}
976 
/* Finish a shader upload started by pre_upload_binary: optionally keep a CPU
 * copy of the code for SQTT, then either CP-DMA the staging buffer into VRAM
 * (dma_upload) or simply unmap the directly-mapped BO.
 */
static void post_upload_binary(struct si_screen *sscreen, struct si_shader *shader,
                               void *code, unsigned code_size,
                               unsigned binary_size, bool dma_upload,
                               struct si_context *upload_ctx,
                               struct pipe_resource *staging,
                               unsigned staging_offset)
{
   if (sscreen->debug_flags & DBG(SQTT)) {
      /* Remember the uploaded code so SQTT can dump it later. */
      shader->binary.uploaded_code = malloc(code_size);
      /* Fix: guard against malloc failure instead of memcpy'ing through NULL. */
      if (shader->binary.uploaded_code) {
         shader->binary.uploaded_code_size = code_size;
         memcpy(shader->binary.uploaded_code, code, code_size);
      } else {
         shader->binary.uploaded_code_size = 0;
      }
   }

   if (dma_upload) {
      /* Then copy from the staging buffer to VRAM.
       *
       * We can't use the upload copy in si_buffer_transfer_unmap because that might use
       * a compute shader, and we can't use shaders in the code that is responsible for making
       * them available.
       */
      si_cp_dma_copy_buffer(upload_ctx, &shader->bo->b.b, staging, 0, staging_offset,
                            binary_size);
      si_barrier_after_simple_buffer_op(upload_ctx, 0, &shader->bo->b.b, staging);
      upload_ctx->barrier_flags |= SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_L2;

#if 0 /* debug: validate whether the copy was successful */
      uint32_t *dst_binary = malloc(binary_size);
      uint32_t *src_binary = (uint32_t*)code;
      pipe_buffer_read(&upload_ctx->b, &shader->bo->b.b, 0, binary_size, dst_binary);
      puts("dst_binary == src_binary:");
      for (unsigned i = 0; i < binary_size / 4; i++) {
         printf("   %08x == %08x\n", dst_binary[i], src_binary[i]);
      }
      free(dst_binary);
      exit(0);
#endif

      si_put_aux_context_flush(&sscreen->aux_context.shader_upload);
      pipe_resource_reference(&staging, NULL);
   } else {
      sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf);
   }
}
1021 
/* Link all ELF shader parts with ac_rtld and upload the result to the shader
 * BO, resolving scratch-descriptor symbols via si_get_external_symbol.
 * Returns the uploaded size, or -1 on failure.
 */
static int upload_binary_elf(struct si_screen *sscreen, struct si_shader *shader,
                             uint64_t scratch_va, bool dma_upload, int64_t bo_offset)
{
   struct ac_rtld_binary binary;
   if (!si_shader_binary_open(sscreen, shader, &binary))
      return -1;

   struct si_context *upload_ctx = NULL;
   struct pipe_resource *staging = NULL;
   unsigned staging_offset = 0;

   void *rx_ptr = pre_upload_binary(sscreen, shader, binary.rx_size, dma_upload,
                                    &upload_ctx, &staging, &staging_offset,
                                    bo_offset);
   if (!rx_ptr) {
      /* Fix: close the opened rtld binary on this error path so it isn't
       * leaked (the success path below already closes it).
       */
      ac_rtld_close(&binary);
      return -1;
   }

   /* Upload. */
   struct ac_rtld_upload_info u = {};
   u.binary = &binary;
   u.get_external_symbol = si_get_external_symbol;
   u.cb_data = &scratch_va;
   u.rx_va = shader->gpu_address;
   u.rx_ptr = rx_ptr;

   int size = ac_rtld_upload(&u);

   post_upload_binary(sscreen, shader, rx_ptr, size, binary.rx_size, dma_upload,
                      upload_ctx, staging, staging_offset);

   ac_rtld_close(&binary);

   return size;
}
1056 
calculate_needed_lds_size(struct si_screen * sscreen,struct si_shader * shader)1057 static void calculate_needed_lds_size(struct si_screen *sscreen, struct si_shader *shader)
1058 {
1059    gl_shader_stage stage =
1060       shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : shader->selector->stage;
1061 
1062    if (sscreen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY &&
1063        (stage == MESA_SHADER_GEOMETRY || shader->key.ge.as_ngg)) {
1064       unsigned size_in_dw = shader->gs_info.esgs_ring_size;
1065 
1066       if (stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)
1067          size_in_dw += shader->ngg.ngg_emit_size;
1068 
1069       if (shader->key.ge.as_ngg) {
1070          unsigned scratch_dw_size = gfx10_ngg_get_scratch_dw_size(shader);
1071          if (scratch_dw_size) {
1072             /* scratch base address needs to be 8 byte aligned */
1073             size_in_dw = ALIGN(size_in_dw, 2);
1074             size_in_dw += scratch_dw_size;
1075          }
1076       }
1077 
1078       shader->config.lds_size =
1079          DIV_ROUND_UP(size_in_dw * 4, get_lds_granularity(sscreen, stage));
1080    }
1081 }
1082 
/* Upload a shader consisting of RAW (non-ELF, ACO-produced) binaries. All
 * parts' executable code is packed first, followed by all parts' constant
 * data, and symbols are patched via si_aco_resolve_symbols. Returns the total
 * uploaded size, or -1 on failure.
 */
static int upload_binary_raw(struct si_screen *sscreen, struct si_shader *shader,
                             uint64_t scratch_va, bool dma_upload, int64_t bo_offset)
{
   struct si_shader_binary *bin[4];
   unsigned num_bin = get_shader_binaries(shader, bin);

   /* Total bytes (code + data) and executable-only bytes across all parts. */
   unsigned code_size = 0, exec_size = 0;
   for (unsigned i = 0; i < num_bin; i++) {
      assert(bin[i]->type == SI_SHADER_BINARY_RAW);
      code_size += bin[i]->code_size;
      exec_size += bin[i]->exec_size;
   }

   struct si_context *upload_ctx = NULL;
   struct pipe_resource *staging = NULL;
   unsigned staging_offset = 0;

   void *rx_ptr = pre_upload_binary(sscreen, shader, code_size, dma_upload,
                                    &upload_ctx, &staging, &staging_offset,
                                    bo_offset);
   if (!rx_ptr)
      return -1;

   /* Layout: [exec of all parts][const data of all parts]. exec_offset walks
    * the code region, data_offset walks the data region that starts right
    * after all executable code.
    */
   unsigned exec_offset = 0, data_offset = exec_size;
   for (unsigned i = 0; i < num_bin; i++) {
      memcpy(rx_ptr + exec_offset, bin[i]->code_buffer, bin[i]->exec_size);

      if (bin[i]->num_symbols) {
         /* Offset needed to add to const data symbol because of inserting other
          * shader part between exec code and const data.
          */
         unsigned const_offset = data_offset - exec_offset - bin[i]->exec_size;

         /* Prolog and epilog have no symbols. */
         struct si_shader *sh = bin[i] == &shader->binary ? shader : shader->previous_stage;
         assert(sh && bin[i] == &sh->binary);

         si_aco_resolve_symbols(sh, rx_ptr + exec_offset, (const uint32_t *)bin[i]->code_buffer,
                                scratch_va, const_offset);
      }

      exec_offset += bin[i]->exec_size;

      /* Append this part's constant data (everything after its exec code). */
      unsigned data_size = bin[i]->code_size - bin[i]->exec_size;
      if (data_size) {
         memcpy(rx_ptr + data_offset, bin[i]->code_buffer + bin[i]->exec_size, data_size);
         data_offset += data_size;
      }
   }

   post_upload_binary(sscreen, shader, rx_ptr, code_size, code_size, dma_upload,
                      upload_ctx, staging, staging_offset);

   /* RAW binaries bypass the rtld path, so derive lds_size here. */
   calculate_needed_lds_size(sscreen, shader);
   return code_size;
}
1139 
si_shader_binary_upload_at(struct si_screen * sscreen,struct si_shader * shader,uint64_t scratch_va,int64_t bo_offset)1140 int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shader,
1141                                uint64_t scratch_va, int64_t bo_offset)
1142 {
1143    bool dma_upload = !(sscreen->debug_flags & DBG(NO_DMA_SHADERS)) && sscreen->info.has_cp_dma &&
1144                      sscreen->info.has_dedicated_vram && !sscreen->info.all_vram_visible &&
1145                      bo_offset < 0;
1146 
1147    if (shader->binary.type == SI_SHADER_BINARY_ELF) {
1148       return upload_binary_elf(sscreen, shader, scratch_va, dma_upload, bo_offset);
1149    } else {
1150       assert(shader->binary.type == SI_SHADER_BINARY_RAW);
1151       return upload_binary_raw(sscreen, shader, scratch_va, dma_upload, bo_offset);
1152    }
1153 }
1154 
/* Upload a shader binary into a newly allocated BO (bo_offset = -1 tells
 * si_shader_binary_upload_at to allocate). Returns the uploaded size or -1.
 */
int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
                            uint64_t scratch_va)
{
   const int64_t alloc_new_bo = -1;

   return si_shader_binary_upload_at(sscreen, shader, scratch_va, alloc_new_bo);
}
1160 
print_disassembly(const char * disasm,size_t nbytes,const char * name,FILE * file,struct util_debug_callback * debug)1161 static void print_disassembly(const char *disasm, size_t nbytes,
1162                               const char *name, FILE *file,
1163                               struct util_debug_callback *debug)
1164 {
1165    if (debug && debug->debug_message) {
1166       /* Very long debug messages are cut off, so send the
1167        * disassembly one line at a time. This causes more
1168        * overhead, but on the plus side it simplifies
1169        * parsing of resulting logs.
1170        */
1171       util_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");
1172 
1173       uint64_t line = 0;
1174       while (line < nbytes) {
1175          int count = nbytes - line;
1176          const char *nl = memchr(disasm + line, '\n', nbytes - line);
1177          if (nl)
1178             count = nl - (disasm + line);
1179 
1180          if (count) {
1181             util_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
1182          }
1183 
1184          line += count + 1;
1185       }
1186 
1187       util_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
1188    }
1189 
1190    if (file) {
1191       fprintf(file, "Shader %s disassembly:\n", name);
1192       fprintf(file, "%*s", (int)nbytes, disasm);
1193    }
1194 }
1195 
/* Dump a shader binary's disassembly. RAW binaries carry their disassembly
 * string directly; ELF binaries are opened with ac_rtld and the
 * ".AMDGPU.disasm" section is extracted.
 */
static void si_shader_dump_disassembly(struct si_screen *screen,
                                       const struct si_shader_binary *binary,
                                       gl_shader_stage stage, unsigned wave_size,
                                       struct util_debug_callback *debug, const char *name,
                                       FILE *file)
{
   if (binary->type == SI_SHADER_BINARY_RAW) {
      print_disassembly(binary->disasm_string, binary->disasm_size, name, file, debug);
      return;
   }

   struct ac_rtld_binary rtld_binary;

   if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
                                      .info = &screen->info,
                                      .shader_type = stage,
                                      .wave_size = wave_size,
                                      .num_parts = 1,
                                      .elf_ptrs = &binary->code_buffer,
                                      .elf_sizes = &binary->code_size}))
      return;

   const char *disasm;
   size_t nbytes;

   if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
      goto out;

   /* print_disassembly casts nbytes to int for printf, so bail on overflow. */
   if (nbytes > INT_MAX)
      goto out;

   print_disassembly(disasm, nbytes, name, file, debug);

out:
   /* Single cleanup point for all exits after the successful open. */
   ac_rtld_close(&rtld_binary);
}
1232 
/* Estimate the maximum number of waves per SIMD this shader can reach, based
 * on its SGPR, VGPR, and LDS usage, and store it in
 * shader->info.max_simd_waves.
 */
static void si_calculate_max_simd_waves(struct si_shader *shader)
{
   struct si_screen *sscreen = shader->selector->screen;
   struct ac_shader_config *conf = &shader->config;
   unsigned lds_increment = get_lds_granularity(sscreen, shader->selector->stage);
   unsigned lds_per_wave = 0;
   unsigned max_simd_waves;

   /* Start from the hardware limit and lower it per resource below. */
   max_simd_waves = sscreen->info.max_waves_per_simd;

   /* Compute LDS usage for PS. */
   switch (shader->selector->stage) {
   case MESA_SHADER_FRAGMENT:
      /* The minimum usage per wave is (num_inputs * 48). The maximum
       * usage is (num_inputs * 48 * 16).
       * We can get anything in between and it varies between waves.
       *
       * The 48 bytes per input for a single primitive is equal to
       * 4 bytes/component * 4 components/input * 3 points.
       *
       * Other stages don't know the size at compile time or don't
       * allocate LDS per wave, but instead they do it per thread group.
       */
      lds_per_wave = conf->lds_size * lds_increment +
                     align(shader->info.num_ps_inputs * 48, lds_increment);
      break;
   case MESA_SHADER_COMPUTE: {
         /* Spread the workgroup's LDS over its waves. */
         unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
         lds_per_wave = (conf->lds_size * lds_increment) /
                        DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
      }
      break;
   default:;
   }

   /* Compute the per-SIMD wave counts. */
   if (conf->num_sgprs) {
      max_simd_waves =
         MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
   }

   if (conf->num_vgprs) {
      /* GFX 10.3 internally:
       * - aligns VGPRS to 16 for Wave32 and 8 for Wave64
       * - aligns LDS to 1024
       *
       * For shader-db stats, set num_vgprs that the hw actually uses.
       */
      unsigned num_vgprs = conf->num_vgprs;
      if (sscreen->info.gfx_level >= GFX10_3) {
         unsigned real_vgpr_gran = sscreen->info.num_physical_wave64_vgprs_per_simd / 64;
         num_vgprs = util_align_npot(num_vgprs, real_vgpr_gran * (shader->wave_size == 32 ? 2 : 1));
      } else {
         num_vgprs = align(num_vgprs, shader->wave_size == 32 ? 8 : 4);
      }

      /* Always print wave limits as Wave64, so that we can compare
       * Wave32 and Wave64 with shader-db fairly. */
      unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
      max_simd_waves = MIN2(max_simd_waves, max_vgprs / num_vgprs);
   }

   /* NOTE(review): the divisor 4 presumably reflects workgroup LDS shared
    * across the SIMDs of a CU/WGP — confirm against the hardware docs.
    */
   unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
   if (lds_per_wave)
      max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);

   shader->info.max_simd_waves = max_simd_waves;
}
1301 
/* Emit a single-line statistics record through the debug callback, in the
 * format parsed by shader-db. Per-stage output counts are computed so that
 * output-reduction optimizations show up in the stats.
 */
void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
                                        struct util_debug_callback *debug)
{
   const struct ac_shader_config *conf = &shader->config;
   /* Indexed by gl_shader_stage (VS..CS). */
   static const char *stages[] = {"VS", "TCS", "TES", "GS", "PS", "CS"};

   if (screen->options.debug_disassembly)
      si_shader_dump_disassembly(screen, &shader->binary, shader->selector->stage,
                                 shader->wave_size, debug, "main", NULL);

   /* Only the counter matching the shader's HW role stays non-zero below. */
   unsigned num_ls_outputs = 0;
   unsigned num_hs_outputs = 0;
   unsigned num_es_outputs = 0;
   unsigned num_gs_outputs = 0;
   unsigned num_vs_outputs = 0;
   unsigned num_ps_outputs = 0;

   if (shader->selector->stage <= MESA_SHADER_GEOMETRY) {
      /* This doesn't include pos exports because only param exports are interesting
       * for performance and can be optimized.
       */
      if (shader->key.ge.as_ls)
         num_ls_outputs = si_shader_lshs_vertex_stride(shader) / 16;
      else if (shader->selector->stage == MESA_SHADER_TESS_CTRL)
         num_hs_outputs = util_last_bit64(shader->selector->info.tcs_outputs_written_for_tes);
      else if (shader->key.ge.as_es)
         num_es_outputs = shader->selector->info.esgs_vertex_stride / 16;
      else if (shader->gs_copy_shader)
         num_gs_outputs = shader->gs_copy_shader->info.nr_param_exports;
      else if (shader->selector->stage == MESA_SHADER_GEOMETRY)
         num_gs_outputs = shader->info.nr_param_exports;
      else if (shader->selector->stage == MESA_SHADER_VERTEX ||
               shader->selector->stage == MESA_SHADER_TESS_EVAL)
         num_vs_outputs = shader->info.nr_param_exports;
      else
         unreachable("invalid shader key");
   } else if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
      /* Color outputs plus at most one Z/stencil/samplemask export. */
      num_ps_outputs = util_bitcount(shader->selector->info.colors_written) +
                       (shader->ps.writes_z ||
                        shader->ps.writes_stencil ||
                        shader->ps.writes_samplemask);
   }

   /* Keep the field order in sync with shader-db's report parser. */
   util_debug_message(debug, SHADER_INFO,
                      "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
                      "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
                      "Spilled VGPRs: %d PrivMem VGPRs: %d LSOutputs: %u HSOutputs: %u "
                      "HSPatchOuts: %u ESOutputs: %u GSOutputs: %u VSOutputs: %u PSOutputs: %u "
                      "InlineUniforms: %u DivergentLoop: %u (%s, W%u)",
                      conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
                      conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
                      conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
                      num_ls_outputs, num_hs_outputs,
                      util_last_bit(shader->selector->info.patch_outputs_written_for_tes),
                      num_es_outputs, num_gs_outputs, num_vs_outputs, num_ps_outputs,
                      shader->selector->info.base.num_inlinable_uniforms,
                      shader->selector->info.has_divergent_loop,
                      stages[shader->selector->stage], shader->wave_size);
}
1361 
/* Return whether shaders of the given stage should be dumped for the given
 * dump type, based on the screen's debug flags.
 *
 * Two conditions must both hold: the per-stage debug bit (DBG(VS)..DBG(CS))
 * must be set, and at least one flag from the dump type's filter must be set.
 */
bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage,
                        enum si_shader_dump_type dump_type)
{
   /* Which debug flags enable each dump type. Const so the table lives in
    * read-only data and cannot be modified by accident.
    */
   static const uint64_t filter[] = {
      [SI_DUMP_SHADER_KEY] = DBG(NIR) | DBG(INIT_LLVM) | DBG(LLVM) | DBG(INIT_ACO) | DBG(ACO) | DBG(ASM),
      [SI_DUMP_INIT_NIR] = DBG(INIT_NIR),
      [SI_DUMP_NIR] = DBG(NIR),
      [SI_DUMP_INIT_LLVM_IR] = DBG(INIT_LLVM),
      [SI_DUMP_LLVM_IR] = DBG(LLVM),
      [SI_DUMP_INIT_ACO_IR] = DBG(INIT_ACO),
      [SI_DUMP_ACO_IR] = DBG(ACO),
      [SI_DUMP_ASM] = DBG(ASM),
      [SI_DUMP_STATS] = DBG(STATS),
      [SI_DUMP_ALWAYS] = DBG(VS) | DBG(TCS) | DBG(TES) | DBG(GS) | DBG(PS) | DBG(CS),
   };
   assert(dump_type < ARRAY_SIZE(filter));

   /* Parenthesized explicitly: bitwise-& results used as booleans. */
   return (sscreen->debug_flags & (1 << stage)) &&
          (sscreen->debug_flags & filter[dump_type]);
}
1382 
/* Print the PS input config registers (fragment shaders only) followed by
 * resource-usage statistics in human-readable, multi-line form.
 *
 * NOTE(review): check_debug_option is not referenced in this function;
 * callers pass it for consistency with the other dump helpers.
 */
static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
                                 bool check_debug_option)
{
   const struct ac_shader_config *conf = &shader->config;

   if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
      fprintf(file,
              "*** SHADER CONFIG ***\n"
              "SPI_PS_INPUT_ADDR = 0x%04x\n"
              "SPI_PS_INPUT_ENA  = 0x%04x\n",
              conf->spi_ps_input_addr, conf->spi_ps_input_ena);
   }

   fprintf(file,
           "*** SHADER STATS ***\n"
           "SGPRS: %d\n"
           "VGPRS: %d\n"
           "Spilled SGPRs: %d\n"
           "Spilled VGPRs: %d\n"
           "Private memory VGPRs: %d\n"
           "Code Size: %d bytes\n"
           "LDS: %d bytes\n"
           "Scratch: %d bytes per wave\n"
           "Max Waves: %d\n"
           "********************\n\n\n",
           conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
           shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
           /* lds_size is in allocation-granularity units; convert to bytes. */
           conf->lds_size * get_lds_granularity(sscreen, shader->selector->stage),
           conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
}
1413 
si_get_shader_name(const struct si_shader * shader)1414 const char *si_get_shader_name(const struct si_shader *shader)
1415 {
1416    switch (shader->selector->stage) {
1417    case MESA_SHADER_VERTEX:
1418       if (shader->key.ge.as_es)
1419          return "Vertex Shader as ES";
1420       else if (shader->key.ge.as_ls)
1421          return "Vertex Shader as LS";
1422       else if (shader->key.ge.as_ngg)
1423          return "Vertex Shader as ESGS";
1424       else
1425          return "Vertex Shader as VS";
1426    case MESA_SHADER_TESS_CTRL:
1427       return "Tessellation Control Shader";
1428    case MESA_SHADER_TESS_EVAL:
1429       if (shader->key.ge.as_es)
1430          return "Tessellation Evaluation Shader as ES";
1431       else if (shader->key.ge.as_ngg)
1432          return "Tessellation Evaluation Shader as ESGS";
1433       else
1434          return "Tessellation Evaluation Shader as VS";
1435    case MESA_SHADER_GEOMETRY:
1436       if (shader->is_gs_copy_shader)
1437          return "GS Copy Shader as VS";
1438       else
1439          return "Geometry Shader";
1440    case MESA_SHADER_FRAGMENT:
1441       return "Pixel Shader";
1442    case MESA_SHADER_COMPUTE:
1443       return "Compute Shader";
1444    default:
1445       return "Unknown Shader";
1446    }
1447 }
1448 
si_shader_dump(struct si_screen * sscreen,struct si_shader * shader,struct util_debug_callback * debug,FILE * file,bool check_debug_option)1449 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
1450                     struct util_debug_callback *debug, FILE *file, bool check_debug_option)
1451 {
1452    gl_shader_stage stage = shader->selector->stage;
1453 
1454    if (!check_debug_option || si_can_dump_shader(sscreen, stage, SI_DUMP_SHADER_KEY))
1455       si_dump_shader_key(shader, file);
1456 
1457    if (!check_debug_option && shader->binary.llvm_ir_string) {
1458       /* This is only used with ddebug. */
1459       if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
1460          fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
1461          fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
1462       }
1463 
1464       fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
1465       fprintf(file, "%s\n", shader->binary.llvm_ir_string);
1466    }
1467 
1468    if (!check_debug_option || (si_can_dump_shader(sscreen, stage, SI_DUMP_ASM))) {
1469       fprintf(file, "\n%s:\n", si_get_shader_name(shader));
1470 
1471       if (shader->prolog)
1472          si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, shader->wave_size, debug,
1473                                     "prolog", file);
1474       if (shader->previous_stage)
1475          si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
1476                                     shader->wave_size, debug, "previous stage", file);
1477       si_shader_dump_disassembly(sscreen, &shader->binary, stage, shader->wave_size, debug, "main",
1478                                  file);
1479 
1480       if (shader->epilog)
1481          si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, shader->wave_size, debug,
1482                                     "epilog", file);
1483       fprintf(file, "\n");
1484 
1485       si_shader_dump_stats(sscreen, shader, file, check_debug_option);
1486    }
1487 }
1488 
si_dump_shader_key_vs(const union si_shader_key * key,FILE * f)1489 static void si_dump_shader_key_vs(const union si_shader_key *key, FILE *f)
1490 {
1491    fprintf(f, "  mono.instance_divisor_is_one = %u\n", key->ge.mono.instance_divisor_is_one);
1492    fprintf(f, "  mono.instance_divisor_is_fetched = %u\n",
1493            key->ge.mono.instance_divisor_is_fetched);
1494    fprintf(f, "  mono.vs.fetch_opencode = %x\n", key->ge.mono.vs_fetch_opencode);
1495    fprintf(f, "  mono.vs.fix_fetch = {");
1496    for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
1497       union si_vs_fix_fetch fix = key->ge.mono.vs_fix_fetch[i];
1498       if (i)
1499          fprintf(f, ", ");
1500       if (!fix.bits)
1501          fprintf(f, "0");
1502       else
1503          fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
1504                  fix.u.format);
1505    }
1506    fprintf(f, "}\n");
1507 }
1508 
/* Print the full shader key for debugging. Only the fields relevant to the
 * shader's stage (and HW role) are printed.
 */
static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
{
   const union si_shader_key *key = &shader->key;
   gl_shader_stage stage = shader->selector->stage;

   fprintf(f, "SHADER KEY\n");
   fprintf(f, "  source_blake3 = {");
   _mesa_blake3_print(f, shader->selector->info.base.source_blake3);
   fprintf(f, "}\n");

   switch (stage) {
   case MESA_SHADER_VERTEX:
      si_dump_shader_key_vs(key, f);
      fprintf(f, "  as_es = %u\n", key->ge.as_es);
      fprintf(f, "  as_ls = %u\n", key->ge.as_ls);
      fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
      fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
      break;

   case MESA_SHADER_TESS_CTRL:
      /* GFX9+ merges the VS into the TCS, so the VS key fields apply too. */
      if (shader->selector->screen->info.gfx_level >= GFX9)
         si_dump_shader_key_vs(key, f);

      fprintf(f, "  opt.tes_prim_mode = %u\n", key->ge.opt.tes_prim_mode);
      fprintf(f, "  opt.tes_reads_tess_factors = %u\n", key->ge.opt.tes_reads_tess_factors);
      fprintf(f, "  opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
      fprintf(f, "  opt.same_patch_vertices = %u\n", key->ge.opt.same_patch_vertices);
      break;

   case MESA_SHADER_TESS_EVAL:
      fprintf(f, "  as_es = %u\n", key->ge.as_es);
      fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
      fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
      break;

   case MESA_SHADER_GEOMETRY:
      /* The GS copy shader has no key fields of its own. */
      if (shader->is_gs_copy_shader)
         break;

      /* GFX9+ merges the ES into the GS; dump the VS fields when the ES
       * half is a vertex shader.
       */
      if (shader->selector->screen->info.gfx_level >= GFX9 &&
          key->ge.part.gs.es->stage == MESA_SHADER_VERTEX)
         si_dump_shader_key_vs(key, f);

      fprintf(f, "  mono.u.gs_tri_strip_adj_fix = %u\n", key->ge.mono.u.gs_tri_strip_adj_fix);
      fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
      break;

   case MESA_SHADER_COMPUTE:
      break;

   case MESA_SHADER_FRAGMENT:
      /* PS prolog, epilog, and monolithic-only key fields. */
      fprintf(f, "  prolog.color_two_side = %u\n", key->ps.part.prolog.color_two_side);
      fprintf(f, "  prolog.flatshade_colors = %u\n", key->ps.part.prolog.flatshade_colors);
      fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.part.prolog.poly_stipple);
      fprintf(f, "  prolog.force_persp_sample_interp = %u\n",
              key->ps.part.prolog.force_persp_sample_interp);
      fprintf(f, "  prolog.force_linear_sample_interp = %u\n",
              key->ps.part.prolog.force_linear_sample_interp);
      fprintf(f, "  prolog.force_persp_center_interp = %u\n",
              key->ps.part.prolog.force_persp_center_interp);
      fprintf(f, "  prolog.force_linear_center_interp = %u\n",
              key->ps.part.prolog.force_linear_center_interp);
      fprintf(f, "  prolog.bc_optimize_for_persp = %u\n",
              key->ps.part.prolog.bc_optimize_for_persp);
      fprintf(f, "  prolog.bc_optimize_for_linear = %u\n",
              key->ps.part.prolog.bc_optimize_for_linear);
      fprintf(f, "  prolog.samplemask_log_ps_iter = %u\n",
              key->ps.part.prolog.samplemask_log_ps_iter);
      fprintf(f, "  prolog.get_frag_coord_from_pixel_coord = %u\n",
              key->ps.part.prolog.get_frag_coord_from_pixel_coord);
      fprintf(f, "  prolog.force_samplemask_to_helper_invocation = %u\n",
              key->ps.part.prolog.force_samplemask_to_helper_invocation);
      fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n",
              key->ps.part.epilog.spi_shader_col_format);
      fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.part.epilog.color_is_int8);
      fprintf(f, "  epilog.color_is_int10 = 0x%X\n", key->ps.part.epilog.color_is_int10);
      fprintf(f, "  epilog.alpha_func = %u\n", key->ps.part.epilog.alpha_func);
      fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.part.epilog.alpha_to_one);
      fprintf(f, "  epilog.alpha_to_coverage_via_mrtz = %u\n", key->ps.part.epilog.alpha_to_coverage_via_mrtz);
      fprintf(f, "  epilog.clamp_color = %u\n", key->ps.part.epilog.clamp_color);
      fprintf(f, "  epilog.dual_src_blend_swizzle = %u\n", key->ps.part.epilog.dual_src_blend_swizzle);
      fprintf(f, "  epilog.rbplus_depth_only_opt = %u\n", key->ps.part.epilog.rbplus_depth_only_opt);
      fprintf(f, "  epilog.kill_z = %u\n", key->ps.part.epilog.kill_z);
      fprintf(f, "  epilog.kill_stencil = %u\n", key->ps.part.epilog.kill_stencil);
      fprintf(f, "  epilog.kill_samplemask = %u\n", key->ps.part.epilog.kill_samplemask);
      fprintf(f, "  mono.poly_line_smoothing = %u\n", key->ps.mono.poly_line_smoothing);
      fprintf(f, "  mono.point_smoothing = %u\n", key->ps.mono.point_smoothing);
      fprintf(f, "  mono.interpolate_at_sample_force_center = %u\n",
              key->ps.mono.interpolate_at_sample_force_center);
      fprintf(f, "  mono.fbfetch_msaa = %u\n", key->ps.mono.fbfetch_msaa);
      fprintf(f, "  mono.fbfetch_is_1D = %u\n", key->ps.mono.fbfetch_is_1D);
      fprintf(f, "  mono.fbfetch_layered = %u\n", key->ps.mono.fbfetch_layered);
      break;

   default:
      assert(0);
   }

   /* Fields shared by all geometry-pipeline shaders that feed the rasterizer
    * (i.e. the last pre-rasterization stage).
    */
   if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||
        stage == MESA_SHADER_VERTEX) &&
       !key->ge.as_es && !key->ge.as_ls) {
      fprintf(f, "  mono.remove_streamout = 0x%x\n", key->ge.mono.remove_streamout);
      fprintf(f, "  opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs);
      fprintf(f, "  opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances);
      fprintf(f, "  opt.kill_pointsize = %u\n", key->ge.opt.kill_pointsize);
      fprintf(f, "  opt.kill_layer = %u\n", key->ge.opt.kill_layer);
      fprintf(f, "  opt.remove_streamout = %u\n", key->ge.opt.remove_streamout);
      fprintf(f, "  opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling);
      fprintf(f, "  opt.ngg_vs_streamout_num_verts_per_prim = %u\n",
              key->ge.opt.ngg_vs_streamout_num_verts_per_prim);
   }

   /* prefer_mono lives in different key unions for GE vs. PS stages. */
   if (stage <= MESA_SHADER_GEOMETRY)
      fprintf(f, "  opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
   else
      fprintf(f, "  opt.prefer_mono = %u\n", key->ps.opt.prefer_mono);

   /* Same for the inlined-uniform fields. */
   if (stage <= MESA_SHADER_GEOMETRY) {
      if (key->ge.opt.inline_uniforms) {
         fprintf(f, "  opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
                 key->ge.opt.inline_uniforms,
                 key->ge.opt.inlined_uniform_values[0],
                 key->ge.opt.inlined_uniform_values[1],
                 key->ge.opt.inlined_uniform_values[2],
                 key->ge.opt.inlined_uniform_values[3]);
      } else {
         fprintf(f, "  opt.inline_uniforms = 0\n");
      }
   } else {
      if (key->ps.opt.inline_uniforms) {
         fprintf(f, "  opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
                 key->ps.opt.inline_uniforms,
                 key->ps.opt.inlined_uniform_values[0],
                 key->ps.opt.inlined_uniform_values[1],
                 key->ps.opt.inlined_uniform_values[2],
                 key->ps.opt.inlined_uniform_values[3]);
      } else {
         fprintf(f, "  opt.inline_uniforms = 0\n");
      }
   }
}
1650 
1651 /* TODO: convert to nir_shader_instructions_pass */
/* Remove output stores that the shader key marks as unused by later stages
 * (kill_outputs, kill_pointsize, kill_layer, kill_clip_distances), so the
 * computations feeding them can be dead-code-eliminated afterwards.
 *
 * Returns true if any store was removed or downgraded.
 */
static bool si_nir_kill_outputs(nir_shader *nir, const union si_shader_key *key)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);
   assert(nir->info.stage <= MESA_SHADER_GEOMETRY);

   /* Early out when there is nothing to kill. LAYER is special: its varying
    * store is always removed even without kill_layer (see below), so the
    * pass must run whenever LAYER is written.
    */
   if (!key->ge.opt.kill_outputs &&
       !key->ge.opt.kill_pointsize &&
       !key->ge.opt.kill_layer &&
       !key->ge.opt.kill_clip_distances &&
       !(nir->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER))) {
      nir_metadata_preserve(impl, nir_metadata_all);
      return false;
   }

   bool progress = false;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         assert(intr->num_components == 1); /* only scalar stores expected */
         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

         /* Kill regular varyings listed in the kill_outputs bitmask. */
         if (nir_slot_is_varying(sem.location, MESA_SHADER_FRAGMENT) &&
             key->ge.opt.kill_outputs &
             (1ull << si_shader_io_get_unique_index(sem.location)))
            progress |= nir_remove_varying(intr, MESA_SHADER_FRAGMENT);

         /* Handle slots consumed as system values / fixed-function state. */
         switch (sem.location) {
         case VARYING_SLOT_PSIZ:
            if (key->ge.opt.kill_pointsize)
               progress |= nir_remove_sysval_output(intr, MESA_SHADER_FRAGMENT);
            break;

         case VARYING_SLOT_CLIP_VERTEX:
            /* TODO: We should only kill specific clip planes as required by kill_clip_distance,
             * not whole gl_ClipVertex. Lower ClipVertex in NIR.
             */
            if ((key->ge.opt.kill_clip_distances & SI_USER_CLIP_PLANE_MASK) ==
                SI_USER_CLIP_PLANE_MASK)
               progress |= nir_remove_sysval_output(intr, MESA_SHADER_FRAGMENT);
            break;

         case VARYING_SLOT_CLIP_DIST0:
         case VARYING_SLOT_CLIP_DIST1:
            if (key->ge.opt.kill_clip_distances) {
               assert(nir_intrinsic_src_type(intr) == nir_type_float32);
               /* Each CLIP_DISTn slot holds 4 scalar clip distances. */
               unsigned index = (sem.location - VARYING_SLOT_CLIP_DIST0) * 4 +
                                nir_intrinsic_component(intr);

               if (key->ge.opt.kill_clip_distances & BITFIELD_BIT(index))
                  progress |= nir_remove_sysval_output(intr, MESA_SHADER_FRAGMENT);
            }
            break;

         case VARYING_SLOT_LAYER:
            /* LAYER is never passed to FS. Instead, we load it there as a system value. */
            progress |= nir_remove_varying(intr, MESA_SHADER_FRAGMENT);

            if (key->ge.opt.kill_layer)
               progress |= nir_remove_sysval_output(intr, MESA_SHADER_FRAGMENT);
            break;
         }
      }
   }

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_control_flow);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}
1736 
si_map_io_driver_location(unsigned semantic)1737 static unsigned si_map_io_driver_location(unsigned semantic)
1738 {
1739    if ((semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_TESS_MAX) ||
1740        semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
1741        semantic == VARYING_SLOT_TESS_LEVEL_OUTER)
1742       return ac_shader_io_get_unique_index_patch(semantic);
1743 
1744    return si_shader_io_get_unique_index(semantic);
1745 }
1746 
/* Lower I/O that crosses HW stage boundaries (LS->HS, ES->GS, HS->TES) to
 * LDS/memory accesses using the ac_nir lowering passes.
 *
 * Returns true if any lowering was applied for this stage/key combination.
 */
static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir)
{
   struct si_shader_selector *sel = shader->selector;
   /* For merged shaders, next_shader points at the stage that consumes our
    * outputs; otherwise fall back to our own selector.
    */
   struct si_shader_selector *next_sel = shader->next_shader ? shader->next_shader->selector : sel;
   const union si_shader_key *key = &shader->key;
   /* GFX9+ merges VS+TCS. For a monolithic merged pair, some VS outputs can
    * bypass LDS and be passed through temporaries, so the driver-location
    * remap callback is skipped (NULL).
    */
   const bool is_gfx9_mono_tcs = shader->is_monolithic &&
                                 next_sel->stage == MESA_SHADER_TESS_CTRL &&
                                 sel->screen->info.gfx_level >= GFX9;

   if (nir->info.stage == MESA_SHADER_VERTEX) {
      if (key->ge.as_ls) {
         NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem,
                    is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
                    sel->screen->info.gfx_level,
                    key->ge.opt.same_patch_vertices,
                    is_gfx9_mono_tcs ? next_sel->info.tcs_inputs_via_temp : 0,
                    is_gfx9_mono_tcs ? next_sel->info.tcs_inputs_via_lds : ~0ull);
         return true;
      } else if (key->ge.as_es) {
         NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
                    sel->screen->info.gfx_level, sel->info.esgs_vertex_stride, ~0ULL);
         return true;
      }
   } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
      NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem,
                 is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
                 sel->screen->info.gfx_level, key->ge.opt.same_patch_vertices,
                 sel->info.tcs_inputs_via_temp, sel->info.tcs_inputs_via_lds);

      /* Used by hs_emit_write_tess_factors() when monolithic shader. */
      if (nir->info.tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED)
         nir->info.tess._primitive_mode = key->ge.opt.tes_prim_mode;

      nir_tcs_info tcs_info;
      nir_gather_tcs_info(nir, &tcs_info, nir->info.tess._primitive_mode,
                          nir->info.tess.spacing);

      NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, &tcs_info, si_map_io_driver_location,
                 sel->screen->info.gfx_level,
                 ~0ULL, ~0U, /* no TES inputs filter */
                 shader->wave_size);
      return true;
   } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
      NIR_PASS_V(nir, ac_nir_lower_tes_inputs_to_mem, si_map_io_driver_location);

      /* A TES feeding a GS also writes its outputs as an ES. */
      if (key->ge.as_es) {
         NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
                    sel->screen->info.gfx_level, sel->info.esgs_vertex_stride, ~0ULL);
      }

      return true;
   } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
      NIR_PASS_V(nir, ac_nir_lower_gs_inputs_to_mem, si_map_io_driver_location,
                 sel->screen->info.gfx_level, key->ge.mono.u.gs_tri_strip_adj_fix);
      return true;
   }

   return false;
}
1806 
/* Run the ac_nir NGG lowering on a VS/TES (nogs path) or GS, filling in the
 * lowering options from the shader key and screen capabilities.
 */
static void si_lower_ngg(struct si_shader *shader, nir_shader *nir)
{
   struct si_shader_selector *sel = shader->selector;
   const union si_shader_key *key = &shader->key;
   assert(key->ge.as_ngg);

   /* Clip distances the key kills are dropped; cull distances are kept. */
   uint8_t clip_cull_dist_mask =
      (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) |
      sel->info.culldist_mask;

   ac_nir_lower_ngg_options options = {
      .hw_info = &sel->screen->info,
      .max_workgroup_size = si_get_max_workgroup_size(shader),
      .wave_size = shader->wave_size,
      .can_cull = si_shader_culling_enabled(shader),
      .disable_streamout = !si_shader_uses_streamout(shader),
      .vs_output_param_offset = shader->info.vs_output_param_offset,
      .has_param_exports = shader->info.nr_param_exports,
      .clip_cull_dist_mask = clip_cull_dist_mask,
      .kill_pointsize = key->ge.opt.kill_pointsize,
      .kill_layer = key->ge.opt.kill_layer,
      .force_vrs = sel->screen->options.vrs2x2,
      /* LLVM lowers the GFX12 xfb intrinsic itself; ACO does not. */
      .use_gfx12_xfb_intrinsic = !nir->info.use_aco_amd,
   };

   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_TESS_EVAL) {
      /* Per instance inputs, used to remove instance load after culling. */
      unsigned instance_rate_inputs = 0;

      if (nir->info.stage == MESA_SHADER_VERTEX) {
         instance_rate_inputs = key->ge.mono.instance_divisor_is_one |
                                key->ge.mono.instance_divisor_is_fetched;

         /* Manually mark the instance ID used, so the shader can repack it. */
         if (instance_rate_inputs)
            BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
      } else {
         /* Manually mark the primitive ID used, so the shader can repack it. */
         if (key->ge.mono.u.vs_export_prim_id)
            BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
      }

      unsigned clip_plane_enable =
         SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(key->ge.opt.ngg_culling);
      unsigned num_vertices = si_get_num_vertices_per_output_prim(shader);

      /* Default to triangles when the output prim type is not known. */
      options.num_vertices_per_primitive = num_vertices ? num_vertices : 3;
      options.early_prim_export = gfx10_ngg_export_prim_early(shader);
      options.passthrough = gfx10_is_ngg_passthrough(shader);
      options.use_edgeflags = gfx10_has_variable_edgeflags(shader);
      options.has_gen_prim_query = options.has_xfb_prim_query =
         sel->screen->info.gfx_level >= GFX11 && !nir->info.vs.blit_sgprs_amd;
      options.export_primitive_id = key->ge.mono.u.vs_export_prim_id;
      options.instance_rate_inputs = instance_rate_inputs;
      options.user_clip_plane_enable_mask = clip_plane_enable;

      NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options);
   } else {
      assert(nir->info.stage == MESA_SHADER_GEOMETRY);

      options.gs_out_vtx_bytes = sel->info.gsvs_vertex_size;
      options.has_gen_prim_query = options.has_xfb_prim_query =
         sel->screen->info.gfx_level >= GFX11;
      options.has_gs_invocations_query = sel->screen->info.gfx_level < GFX11;
      options.has_gs_primitives_query = true;

      /* For monolithic ES/GS to add vscnt wait when GS export pos0. */
      if (key->ge.part.gs.es)
         nir->info.writes_memory |= key->ge.part.gs.es->info.base.writes_memory;

      NIR_PASS_V(nir, ac_nir_lower_ngg_gs, &options);
   }

   /* may generate some vector output store */
   NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
}
1884 
si_deserialize_shader(struct si_shader_selector * sel)1885 struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel)
1886 {
1887    struct pipe_screen *screen = &sel->screen->b;
1888    const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR, sel->stage);
1889 
1890    struct blob_reader blob_reader;
1891    blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
1892    return nir_deserialize(NULL, options, &blob_reader);
1893 }
1894 
/* Walk all output stores and assign a param export index to every varying
 * that doesn't already have one, honoring the duplicated-output remap table.
 * Also recomputes nir->info.outputs_written(_16bit) from the stores that
 * actually remain after earlier kill passes.
 */
static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shader,
                                        int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS])
{
   struct si_shader_selector *sel = shader->selector;
   struct si_shader_binary_info *info = &shader->info;

   uint64_t outputs_written = 0;
   uint32_t outputs_written_16bit = 0;

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         assert(intr->num_components == 1); /* only scalar stores expected */
         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

         /* 16-bit varyings live in their own written-outputs bitfield. */
         if (sem.location >= VARYING_SLOT_VAR0_16BIT)
            outputs_written_16bit |= BITFIELD_BIT(sem.location - VARYING_SLOT_VAR0_16BIT);
         else
            outputs_written |= BITFIELD64_BIT(sem.location);

         /* Assign the param index if it's unassigned. Only stream-0 varyings
          * that feed the FS get param exports.
          */
         if (nir_slot_is_varying(sem.location, MESA_SHADER_FRAGMENT) && !sem.no_varying &&
             (sem.gs_streams & 0x3) == 0 &&
             info->vs_output_param_offset[sem.location] == AC_EXP_PARAM_DEFAULT_VAL_0000) {
            /* The semantic and the base should be the same as in si_shader_info. */
            assert(sem.location == sel->info.output_semantic[nir_intrinsic_base(intr)]);
            /* It must not be remapped (duplicated). */
            assert(slot_remap[sem.location] == -1);

            info->vs_output_param_offset[sem.location] = info->nr_param_exports++;
         }
      }
   }

   /* Duplicated outputs are redirected here. */
   for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
      if (slot_remap[i] >= 0)
         info->vs_output_param_offset[i] = info->vs_output_param_offset[slot_remap[i]];
   }

   /* The exported primitive ID consumes one extra param slot. */
   if (shader->key.ge.mono.u.vs_export_prim_id) {
      info->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = info->nr_param_exports++;
   }

   /* Update outputs written info, we may remove some outputs before. */
   nir->info.outputs_written = outputs_written;
   nir->info.outputs_written_16bit = outputs_written_16bit;
}
1956 
/* Compute vs_output_param_offset and nr_param_exports for the shader:
 * first let ac_nir_optimize_outputs fold constant/duplicated outputs, then
 * assign export indices to the remaining varyings.
 */
static void si_assign_param_offsets(nir_shader *nir, struct si_shader *shader)
{
   /* Initialize this first. */
   shader->info.nr_param_exports = 0;

   STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
          sizeof(shader->info.vs_output_param_offset));

   /* A slot remapping table for duplicated outputs, so that 1 vertex shader output can be
    * mapped to multiple fragment shader inputs.
    */
   int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS];
   /* sizeof keeps the byte count correct even if the element type changes. */
   memset(slot_remap, -1, sizeof(slot_remap));

   /* This sets DEFAULT_VAL for constant outputs in vs_output_param_offset. */
   /* TODO: This doesn't affect GS. */
   NIR_PASS_V(nir, ac_nir_optimize_outputs, false, slot_remap,
              shader->info.vs_output_param_offset);

   /* Assign the non-constant outputs. */
   /* TODO: Use this for the GS copy shader too. */
   si_nir_assign_param_offsets(nir, shader, slot_remap);
}
1981 
si_get_nr_pos_exports(const struct si_shader_selector * sel,const union si_shader_key * key)1982 static unsigned si_get_nr_pos_exports(const struct si_shader_selector *sel,
1983                                       const union si_shader_key *key)
1984 {
1985    const struct si_shader_info *info = &sel->info;
1986 
1987    /* Must have a position export. */
1988    unsigned nr_pos_exports = 1;
1989 
1990    if ((info->writes_psize && !key->ge.opt.kill_pointsize) ||
1991        (info->writes_edgeflag && !key->ge.as_ngg) ||
1992        (info->writes_layer && !key->ge.opt.kill_layer) ||
1993        info->writes_viewport_index || sel->screen->options.vrs2x2) {
1994       nr_pos_exports++;
1995    }
1996 
1997    unsigned clipdist_mask =
1998       (info->clipdist_mask & ~key->ge.opt.kill_clip_distances) | info->culldist_mask;
1999 
2000    for (int i = 0; i < 2; i++) {
2001       if (clipdist_mask & BITFIELD_RANGE(i * 4, 4))
2002          nr_pos_exports++;
2003    }
2004 
2005    return nr_pos_exports;
2006 }
2007 
/* Instruction callback: replace load_color0/load_color1 with the precomputed
 * color SSA defs passed in via "state" (an array of 2 nir_def pointers).
 */
static bool lower_ps_load_color_intrinsic(nir_builder *b, nir_instr *instr, void *state)
{
   nir_def **colors = (nir_def **)state;

   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   unsigned index;

   switch (intrin->intrinsic) {
   case nir_intrinsic_load_color0:
      index = 0;
      break;
   case nir_intrinsic_load_color1:
      index = 1;
      break;
   default:
      return false;
   }

   /* The color must have been built at the shader entry. */
   assert(colors[index]);

   nir_def_replace(&intrin->def, colors[index]);
   return true;
}
2027 
/* Lower PS color inputs for monolithic shaders.
 *
 * Builds ready-to-use COL0/COL1 values at the top of the entry point according
 * to the shader key (flat shading, interpolation location, two-side lighting),
 * then rewrites all load_color0/1 intrinsics to use those values.
 *
 * Returns true if any instruction was added or replaced.
 */
static bool si_nir_lower_ps_color_input(nir_shader *nir, const union si_shader_key *key,
                                        const struct si_shader_info *info)
{
   bool progress = false;
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   /* Insert the interpolation code before everything else in the shader. */
   nir_builder builder = nir_builder_at(nir_before_impl(impl));
   nir_builder *b = &builder;

   /* Build ready to be used colors at the beginning of the shader. */
   nir_def *colors[2] = {0};
   for (int i = 0; i < 2; i++) {
      /* Skip colors whose 4 components are all unread. */
      if (!(info->colors_read & (0xf << (i * 4))))
         continue;

      /* INTERP_MODE_COLOR means "follow the flat-shading state", which is
       * baked into the key for monolithic shaders.
       */
      enum glsl_interp_mode interp_mode = info->color_interpolate[i];
      if (interp_mode == INTERP_MODE_COLOR) {
         interp_mode = key->ps.part.prolog.flatshade_colors ?
            INTERP_MODE_FLAT : INTERP_MODE_SMOOTH;
      }

      nir_def *back_color = NULL;
      if (interp_mode == INTERP_MODE_FLAT) {
         /* Flat colors are plain input loads — no barycentrics needed. */
         colors[i] = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
                                   .io_semantics.location = VARYING_SLOT_COL0 + i,
                                   .io_semantics.num_slots = 1);

         if (key->ps.part.prolog.color_two_side) {
            /* Also load the back-face color for two-side lighting. */
            back_color = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
                                        .io_semantics.location = VARYING_SLOT_BFC0 + i,
                                        .io_semantics.num_slots = 1);
         }
      } else {
         /* Pick the barycentric intrinsic matching the sample location. */
         nir_intrinsic_op op = 0;
         switch (info->color_interpolate_loc[i]) {
         case TGSI_INTERPOLATE_LOC_CENTER:
            op = nir_intrinsic_load_barycentric_pixel;
            break;
         case TGSI_INTERPOLATE_LOC_CENTROID:
            op = nir_intrinsic_load_barycentric_centroid;
            break;
         case TGSI_INTERPOLATE_LOC_SAMPLE:
            op = nir_intrinsic_load_barycentric_sample;
            break;
         default:
            unreachable("invalid color interpolate location");
            break;
         }

         nir_def *barycentric = nir_load_barycentric(b, op, interp_mode);

         colors[i] =
            nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0),
                                        .io_semantics.location = VARYING_SLOT_COL0 + i,
                                        .io_semantics.num_slots = 1);

         if (key->ps.part.prolog.color_two_side) {
            /* Back-face color uses the same barycentrics as the front color. */
            back_color =
               nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0),
                                           .io_semantics.location = VARYING_SLOT_BFC0 + i,
                                           .io_semantics.num_slots = 1);
         }
      }

      /* Two-side lighting: select front or back color by facedness. */
      if (back_color) {
         nir_def *is_front_face = nir_load_front_face(b, 1);
         colors[i] = nir_bcsel(b, is_front_face, colors[i], back_color);
      }

      progress = true;
   }

   /* lower nir_load_color0/1 to use the color value. */
   return nir_shader_instructions_pass(nir, lower_ps_load_color_intrinsic,
                                       nir_metadata_control_flow,
                                       colors) || progress;
}
2105 
/* Emit polygon stipple at the beginning of the fragment shader: look up the
 * 32x32 stipple pattern (stored as 32 rows of 32 bits in a buffer) using the
 * pixel coordinate modulo 32, and discard the fragment if its bit is 0.
 *
 * Always returns true (the shader is always modified).
 */
static bool si_nir_emit_polygon_stipple(nir_shader *nir)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   nir_builder builder = nir_builder_at(nir_before_impl(impl));
   nir_builder *b = &builder;

   /* Load the buffer descriptor. */
   nir_def *desc = nir_load_polygon_stipple_buffer_amd(b);

   /* Use the fixed-point gl_FragCoord input.
    * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
    * per coordinate to get the repeating effect.
    */
   nir_def *pixel_coord = nir_u2u32(b, nir_iand_imm(b, nir_load_pixel_coord(b), 0x1f));

   nir_def *zero = nir_imm_int(b, 0);
   /* The stipple pattern is 32x32, each row has 32 bits. */
   /* Byte offset of the row = y * 4 (one 32-bit word per row). */
   nir_def *offset = nir_ishl_imm(b, nir_channel(b, pixel_coord, 1), 2);
   nir_def *row = nir_load_buffer_amd(b, 1, 32, desc, offset, zero, zero);
   /* Extract bit x of the row. */
   nir_def *bit = nir_ubfe(b, row, nir_channel(b, pixel_coord, 0), nir_imm_int(b, 1));

   /* Discard the fragment when its stipple bit is 0. */
   nir_def *pass = nir_i2b(b, bit);
   nir_discard_if(b, nir_inot(b, pass));

   nir_metadata_preserve(impl, nir_metadata_control_flow);
   return true;
}
2134 
si_should_clear_lds(struct si_screen * sscreen,const struct nir_shader * shader)2135 bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader)
2136 {
2137    return gl_shader_stage_is_compute(shader->info.stage) &&
2138       shader->info.shared_size > 0 && sscreen->options.clear_lds;
2139 }
2140 
/* Instruction callback: for shadow texture instructions, clamp the depth
 * comparison value to [0, 1] when the bound sampler's descriptor indicates
 * a format that requires it (see si_nir_clamp_shadow_comparison_value for
 * the spec rationale). The clamp is applied dynamically with a bcsel so the
 * same code works for both upgraded and non-upgraded formats.
 */
static bool clamp_shadow_comparison_value(nir_builder *b, nir_instr *instr, void *state)
{
   if (instr->type != nir_instr_type_tex)
      return false;

   nir_tex_instr *tex = nir_instr_as_tex(instr);
   if (!tex->is_shadow)
      return false;

   b->cursor = nir_before_instr(instr);

   int samp_index = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
   int comp_index = nir_tex_instr_src_index(tex, nir_tex_src_comparator);
   assert(samp_index >= 0 && comp_index >= 0);

   nir_def *sampler = tex->src[samp_index].src.ssa;
   nir_def *compare = tex->src[comp_index].src.ssa;
   /* Must have been lowered to descriptor. */
   assert(sampler->num_components > 1);

   /* Bit 29 of sampler descriptor dword 3 flags whether clamping is needed.
    * NOTE(review): presumably set by the driver when the depth format was
    * upgraded to Z32_FLOAT — confirm against the descriptor setup code.
    */
   nir_def *upgraded = nir_channel(b, sampler, 3);
   upgraded = nir_i2b(b, nir_ubfe_imm(b, upgraded, 29, 1));

   /* Select the saturated comparison value only when the flag is set. */
   nir_def *clamped = nir_fsat(b, compare);
   compare = nir_bcsel(b, upgraded, clamped, compare);

   nir_src_rewrite(&tex->src[comp_index].src, compare);
   return true;
}
2170 
/* Run clamp_shadow_comparison_value over the whole shader.
 *
 * Section 8.23.1 (Depth Texture Comparison Mode) of the OpenGL 4.5 spec says:
 *
 *    "If the texture's internal format indicates a fixed-point
 *     depth texture, then D_t and D_ref are clamped to the
 *     range [0, 1]; otherwise no clamping is performed."
 *
 * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT, so the depth
 * comparison value isn't clamped for Z16 and Z24 anymore. Do it manually
 * here for GFX8-9; GFX10 has an explicitly clamped 32-bit float format.
 *
 * Returns true if any tex instruction was modified.
 */
static bool si_nir_clamp_shadow_comparison_value(nir_shader *nir)
{
   bool progress = nir_shader_instructions_pass(nir, clamp_shadow_comparison_value,
                                                nir_metadata_control_flow, NULL);
   return progress;
}
2189 
2190 static void
si_init_gs_output_info(struct si_shader_info * info,struct si_gs_output_info * out_info)2191 si_init_gs_output_info(struct si_shader_info *info, struct si_gs_output_info *out_info)
2192 {
2193    for (int i = 0; i < info->num_outputs; i++) {
2194       unsigned slot = info->output_semantic[i];
2195       if (slot < VARYING_SLOT_VAR0_16BIT) {
2196          out_info->streams[slot] = info->output_streams[i];
2197          out_info->usage_mask[slot] = info->output_usagemask[i];
2198       } else {
2199          unsigned index = slot - VARYING_SLOT_VAR0_16BIT;
2200          /* TODO: 16bit need separated fields for lo/hi part. */
2201          out_info->streams_16bit_lo[index] = info->output_streams[i];
2202          out_info->streams_16bit_hi[index] = info->output_streams[i];
2203          out_info->usage_mask_16bit_lo[index] = info->output_usagemask[i];
2204          out_info->usage_mask_16bit_hi[index] = info->output_usagemask[i];
2205       }
2206    }
2207 
2208    ac_nir_gs_output_info *ac_info = &out_info->info;
2209 
2210    ac_info->streams = out_info->streams;
2211    ac_info->streams_16bit_lo = out_info->streams_16bit_lo;
2212    ac_info->streams_16bit_hi = out_info->streams_16bit_hi;
2213 
2214    ac_info->sysval_mask = out_info->usage_mask;
2215    ac_info->varying_mask = out_info->usage_mask;
2216    ac_info->varying_mask_16bit_lo = out_info->usage_mask_16bit_lo;
2217    ac_info->varying_mask_16bit_hi = out_info->usage_mask_16bit_hi;
2218 
2219    /* TODO: construct 16bit slot per component store type. */
2220    ac_info->types_16bit_lo = ac_info->types_16bit_hi = NULL;
2221 }
2222 
2223 /* Run passes that eliminate code and affect shader_info. These should be run before linking
2224  * and shader_info gathering. Lowering passes can be run here too, but only if they lead to
2225  * better code or lower undesirable representations (like derefs). Lowering passes that prevent
2226  * linking optimizations or destroy shader_info shouldn't be run here.
2227  */
/* Pre-link optimizations: kill key-dependent outputs, inline uniforms,
 * lower monolithic-PS prolog/epilog state, then lower indirect indexing.
 *
 * Returns true if any pass left unflushed progress (the caller is expected
 * to run the optimization loop afterwards).
 */
static bool run_pre_link_optimization_passes(struct si_nir_shader_ctx *ctx)
{
   struct si_shader *shader = ctx->shader;
   struct si_shader_selector *sel = shader->selector;
   const union si_shader_key *key = &shader->key;
   nir_shader *nir = ctx->nir;
   bool progress = false;

   /* Kill outputs according to the shader key. */
   if (nir->info.stage <= MESA_SHADER_GEOMETRY)
      NIR_PASS(progress, nir, si_nir_kill_outputs, key);

   bool inline_uniforms = false;
   uint32_t *inlined_uniform_values;
   si_get_inline_uniform_state((union si_shader_key*)key, nir->info.stage,
                               &inline_uniforms, &inlined_uniform_values);

   if (inline_uniforms) {
      /* Most places use shader information from the default variant, not
       * the optimized variant. These are the things that the driver looks at
       * in optimized variants and the list of things that we need to do.
       *
       * The driver takes into account these things if they suddenly disappear
       * from the shader code:
       * - Register usage and code size decrease (obvious)
       * - Eliminated PS system values are disabled
       * - VS/TES/GS param exports are eliminated if they are undef.
       *   The param space for eliminated outputs is also not allocated.
       * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
       * - TCS output stores are eliminated
       * - Eliminated PS inputs are removed from PS.NUM_INTERP.
       *
       * TODO: These are things the driver ignores in the final shader code
       * and relies on the default shader info.
       * - System values in VS, TCS, TES, GS are not eliminated
       * - uses_discard - if it changed to false
       * - writes_memory - if it changed to false
       * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not
       *   eliminated
       * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS)
       *   GS outputs are eliminated except for the temporary LDS.
       *   Clip distances, gl_PointSize, gl_Layer and PS outputs are eliminated based
       *   on current states, so we don't care about the shader code.
       *
       * TODO: Merged shaders don't inline uniforms for the first stage.
       * VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS.
       * (key == NULL for the first stage here)
       *
       * TODO: Compute shaders don't support inlinable uniforms, because they
       * don't have shader variants.
       *
       * TODO: The driver uses a linear search to find a shader variant. This
       * can be really slow if we get too many variants due to uniform inlining.
       */
      NIR_PASS_V(nir, nir_inline_uniforms, nir->info.num_inlinable_uniforms,
                 inlined_uniform_values, nir->info.inlinable_uniform_dw_offsets);
      progress = true;
   }

   NIR_PASS(progress, nir, nir_opt_shrink_stores, false);

   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      /* This uses the prolog/epilog keys, so only monolithic shaders can call this. */
      if (shader->is_monolithic) {
         /* This lowers load_color intrinsics to COLn/BFCn input loads and two-side color
          * selection.
          */
         if (sel->info.colors_read)
            NIR_PASS(progress, nir, si_nir_lower_ps_color_input, &shader->key, &sel->info);

         /* This adds discard and barycentrics. */
         if (key->ps.mono.point_smoothing)
            NIR_PASS(progress, nir, nir_lower_point_smooth, true);

         /* This eliminates system values and unused shader output components. */
         ac_nir_lower_ps_early_options early_options = {
            .force_center_interp_no_msaa = key->ps.part.prolog.force_persp_center_interp ||
                                           key->ps.part.prolog.force_linear_center_interp ||
                                           key->ps.part.prolog.force_samplemask_to_helper_invocation ||
                                           key->ps.mono.interpolate_at_sample_force_center,
            .load_sample_positions_always_loads_current_ones = true,
            .force_front_face = key->ps.opt.force_front_face_input,
            .optimize_frag_coord = true,
            .frag_coord_is_center = true,
            /* This does a lot of things. See the description in ac_nir_lower_ps_early_options. */
            .ps_iter_samples = key->ps.part.prolog.samplemask_log_ps_iter ?
                                 (1 << key->ps.part.prolog.samplemask_log_ps_iter) :
                                 (key->ps.part.prolog.force_persp_sample_interp ||
                                  key->ps.part.prolog.force_linear_sample_interp ? 2 :
                                  (key->ps.part.prolog.get_frag_coord_from_pixel_coord ? 1 : 0)),

            .fbfetch_is_1D = key->ps.mono.fbfetch_is_1D,
            .fbfetch_layered = key->ps.mono.fbfetch_layered,
            .fbfetch_msaa = key->ps.mono.fbfetch_msaa,
            .fbfetch_apply_fmask = sel->screen->info.gfx_level < GFX11 &&
                                   !(sel->screen->debug_flags & DBG(NO_FMASK)),

            .clamp_color = key->ps.part.epilog.clamp_color,
            .alpha_test_alpha_to_one = key->ps.part.epilog.alpha_to_one,
            .alpha_func = key->ps.part.epilog.alpha_func,
            .keep_alpha_for_mrtz = key->ps.part.epilog.alpha_to_coverage_via_mrtz,
            .spi_shader_col_format_hint = key->ps.part.epilog.spi_shader_col_format,
            .kill_z = key->ps.part.epilog.kill_z,
            .kill_stencil = key->ps.part.epilog.kill_stencil,
            .kill_samplemask = key->ps.part.epilog.kill_samplemask,
         };

         NIR_PASS(progress, nir, ac_nir_lower_ps_early, &early_options);

         /* This adds gl_SampleMaskIn. It must be after ac_nir_lower_ps_early that lowers
          * sample_mask_in to load_helper_invocation because we only want to do that for user
          * shaders while keeping the real sample mask for smoothing, which is produced using
          * MSAA overrasterization over a single-sample color buffer.
          */
         if (key->ps.mono.poly_line_smoothing)
            NIR_PASS(progress, nir, nir_lower_poly_line_smooth, SI_NUM_SMOOTH_AA_SAMPLES);

         /* This adds discard. */
         if (key->ps.part.prolog.poly_stipple)
            NIR_PASS(progress, nir, si_nir_emit_polygon_stipple);
      } else {
         /* Non-monolithic PS: run the early lowering with neutral options;
          * the real prolog/epilog state is applied by separate shader parts.
          */
         ac_nir_lower_ps_early_options early_options = {
            .optimize_frag_coord = true,
            .frag_coord_is_center = true,
            .alpha_func = COMPARE_FUNC_ALWAYS,
            .spi_shader_col_format_hint = ~0,
         };
         NIR_PASS(progress, nir, ac_nir_lower_ps_early, &early_options);
      }
   }

   /* Flush accumulated progress through the optimization loop before the
    * indirect-indexing lowering below.
    */
   if (progress) {
      si_nir_opts(sel->screen, nir, true);
      progress = false;
   }

   /* Remove dead temps before we lower indirect indexing. */
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);

   /* Lower indirect indexing last.
    *
    * Shader variant optimizations (such as uniform inlining, replacing barycentrics, and IO
    * elimination) can help eliminate indirect indexing, so this should be done after that.
    *
    * Note that the code can still contain tautologies such as "array1[i] == array2[i]" when
    * array1 and array2 have provably equal values (NIR doesn't have a pass that can do that),
    * which NIR can optimize only after we lower indirecting indexing, so it's important that
    * we lower it before we gather shader_info.
    */

   /* Lower indirect indexing of large constant arrays to the load_constant intrinsic, which
    * will be turned into PC-relative loads from a data section next to the shader.
    */
   NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);

   /* Lower all other indirect indexing to if-else ladders or scratch. */
   progress |= ac_nir_lower_indirect_derefs(nir, sel->screen->info.gfx_level);
   return progress;
}
2387 
2388 /* Late optimization passes and lowering passes. The majority of lowering passes are here.
2389  * These passes should have no impact on linking optimizations and shouldn't affect shader_info
2390  * (those should be run before this) because any changes in shader_info won't be reflected
2391  * in hw registers from now on.
2392  */
/* Late lowering pipeline: lower textures/resources/IO/ABI, run the
 * stage-specific final-VGT or PS/GS lowering, then the generic lowerings
 * (subgroups, int64, mem access sizes) and the final scheduling passes.
 *
 * "progress" carries over unflushed progress from the pre-link passes so a
 * single optimization loop can service both.
 */
static void run_late_optimization_and_lowering_passes(struct si_nir_shader_ctx *ctx,
                                                      bool progress)
{
   struct si_shader *shader = ctx->shader;
   struct si_shader_selector *sel = shader->selector;
   const union si_shader_key *key = &shader->key;
   nir_shader *nir = ctx->nir;

   /* Set up the argument layout (SGPRs/VGPRs) for this shader variant. */
   si_init_shader_args(shader, &ctx->args, &nir->info);

   if (nir->info.stage == MESA_SHADER_FRAGMENT)
      NIR_PASS(progress, nir, nir_lower_fragcoord_wtrans);

   NIR_PASS(progress, nir, ac_nir_lower_tex,
            &(ac_nir_lower_tex_options){
               .gfx_level = sel->screen->info.gfx_level,
               .lower_array_layer_round_even = !sel->screen->info.conformant_trunc_coord,
            });

   if (nir->info.uses_resource_info_query)
      NIR_PASS(progress, nir, ac_nir_lower_resinfo, sel->screen->info.gfx_level);

   /* This must be before si_nir_lower_resource. */
   if (!sel->screen->info.has_image_opcodes)
      NIR_PASS(progress, nir, ac_nir_lower_image_opcodes);

   /* LLVM does not work well with this, so is handled in llvm backend waterfall. */
   if (nir->info.use_aco_amd && sel->info.has_non_uniform_tex_access) {
      nir_lower_non_uniform_access_options options = {
         .types = nir_lower_non_uniform_texture_access,
      };
      NIR_PASS(progress, nir, nir_lower_non_uniform_access, &options);
   }

   /* Legacy GS is not the last VGT stage because there is also the GS copy shader. */
   bool is_last_vgt_stage =
      (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_TESS_EVAL ||
       (nir->info.stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)) &&
      !shader->key.ge.as_ls && !shader->key.ge.as_es;

   if (nir->info.stage == MESA_SHADER_VERTEX)
      NIR_PASS(progress, nir, si_nir_lower_vs_inputs, shader, &ctx->args);

   /* Lower cross-stage IO (LS->HS, ES->GS, ...) to LDS/memory accesses. */
   progress |= si_lower_io_to_mem(shader, nir);

   if (is_last_vgt_stage) {
      /* Assign param export indices. */
      si_assign_param_offsets(nir, shader);

      /* Assign num of position exports. */
      shader->info.nr_pos_exports = si_get_nr_pos_exports(sel, key);

      if (key->ge.as_ngg) {
         /* Lower last VGT NGG shader stage. */
         si_lower_ngg(shader, nir);
      } else if (nir->info.stage == MESA_SHADER_VERTEX ||
                 nir->info.stage == MESA_SHADER_TESS_EVAL) {
         /* Lower last VGT none-NGG VS/TES shader stage. */
         unsigned clip_cull_mask =
            (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) |
            sel->info.culldist_mask;

         NIR_PASS_V(nir, ac_nir_lower_legacy_vs,
                    sel->screen->info.gfx_level,
                    clip_cull_mask,
                    shader->info.vs_output_param_offset,
                    shader->info.nr_param_exports,
                    shader->key.ge.mono.u.vs_export_prim_id,
                    !si_shader_uses_streamout(shader),
                    key->ge.opt.kill_pointsize,
                    key->ge.opt.kill_layer,
                    sel->screen->options.vrs2x2);
      }
      progress = true;
   } else if (nir->info.stage == MESA_SHADER_GEOMETRY && !key->ge.as_ngg) {
      /* Legacy (non-NGG) GS: emit to the GS ring; a GS copy shader does the exports. */
      si_init_gs_output_info(&sel->info, &ctx->legacy_gs_output_info);
      NIR_PASS_V(nir, ac_nir_lower_legacy_gs, false, sel->screen->use_ngg,
                 &ctx->legacy_gs_output_info.info);
      progress = true;
   } else if (nir->info.stage == MESA_SHADER_FRAGMENT && shader->is_monolithic) {
      ac_nir_lower_ps_late_options late_options = {
         .gfx_level = sel->screen->info.gfx_level,
         .family = sel->screen->info.family,
         .use_aco = nir->info.use_aco_amd,
         .bc_optimize_for_persp = key->ps.part.prolog.bc_optimize_for_persp,
         .bc_optimize_for_linear = key->ps.part.prolog.bc_optimize_for_linear,
         .uses_discard = si_shader_uses_discard(shader),
         .alpha_to_coverage_via_mrtz = key->ps.part.epilog.alpha_to_coverage_via_mrtz,
         .dual_src_blend_swizzle = key->ps.part.epilog.dual_src_blend_swizzle,
         .spi_shader_col_format = key->ps.part.epilog.spi_shader_col_format,
         .color_is_int8 = key->ps.part.epilog.color_is_int8,
         .color_is_int10 = key->ps.part.epilog.color_is_int10,
         .alpha_to_one = key->ps.part.epilog.alpha_to_one,
      };

      NIR_PASS(progress, nir, ac_nir_lower_ps_late, &late_options);
   }

   assert(shader->wave_size == 32 || shader->wave_size == 64);

   NIR_PASS(progress, nir, nir_lower_subgroups,
            &(struct nir_lower_subgroups_options) {
               .subgroup_size = shader->wave_size,
               .ballot_bit_size = shader->wave_size,
               .ballot_components = 1,
               .lower_to_scalar = true,
               .lower_subgroup_masks = true,
               .lower_relative_shuffle = true,
               .lower_rotate_to_shuffle = !nir->info.use_aco_amd,
               .lower_shuffle_to_32bit = true,
               .lower_vote_eq = true,
               .lower_vote_bool_eq = true,
               .lower_quad_broadcast_dynamic = true,
               .lower_quad_broadcast_dynamic_to_const = sel->screen->info.gfx_level <= GFX7,
               .lower_shuffle_to_swizzle_amd = true,
               .lower_ballot_bit_count_to_mbcnt_amd = true,
               .lower_inverse_ballot = !nir->info.use_aco_amd && LLVM_VERSION_MAJOR < 17,
               .lower_boolean_reduce = nir->info.use_aco_amd,
               .lower_boolean_shuffle = true,
            });

   NIR_PASS(progress, nir, nir_lower_pack);
   NIR_PASS(progress, nir, nir_opt_idiv_const, 8);
   NIR_PASS(progress, nir, nir_lower_idiv,
            &(nir_lower_idiv_options){
               .allow_fp16 = sel->screen->info.gfx_level >= GFX9,
            });

   if (si_should_clear_lds(sel->screen, nir)) {
      const unsigned chunk_size = 16; /* max single store size */
      const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
      NIR_PASS_V(nir, nir_clear_shared_memory, shared_size, chunk_size);
   }

   NIR_PASS_V(nir, nir_divergence_analysis); /* required by ac_nir_flag_smem_for_loads */
   NIR_PASS(progress, nir, ac_nir_flag_smem_for_loads, sel->screen->info.gfx_level,
            !sel->info.base.use_aco_amd, true);
   NIR_PASS(progress, nir, nir_lower_io_to_scalar,
            nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_shared | nir_var_mem_global,
            ac_nir_scalarize_overfetching_loads_callback, &sel->screen->info.gfx_level);
   NIR_PASS(progress, nir, si_nir_lower_resource, shader, &ctx->args);

   /* Flush progress before vectorization so it sees optimized code. */
   if (progress) {
      si_nir_opts(sel->screen, nir, false);
      progress = false;
   }

   NIR_PASS(progress, nir, nir_opt_load_store_vectorize,
            &(nir_load_store_vectorize_options){
               .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_shared | nir_var_mem_global |
                        nir_var_shader_temp,
               .callback = ac_nir_mem_vectorize_callback,
               .cb_data = &(struct ac_nir_config){sel->screen->info.gfx_level, sel->info.base.use_aco_amd},
               /* On GFX6, read2/write2 is out-of-bounds if the offset register is negative, even if
                * the final offset is not.
                */
               .has_shared2_amd = sel->screen->info.gfx_level >= GFX7,
            });
   NIR_PASS(progress, nir, ac_nir_lower_mem_access_bit_sizes,
            sel->screen->info.gfx_level, !nir->info.use_aco_amd);

   if (nir->info.stage == MESA_SHADER_KERNEL) {
      NIR_PASS(progress, nir, ac_nir_lower_global_access);

      if (nir->info.bit_sizes_int & (8 | 16)) {
         if (sel->screen->info.gfx_level >= GFX8)
            nir_divergence_analysis(nir);

         NIR_PASS(progress, nir, nir_lower_bit_size, ac_nir_lower_bit_size_callback,
                  &sel->screen->info.gfx_level);
      }
   }

   /* This must be after lowering resources to descriptor loads and before lowering intrinsics
    * to args and lowering int64.
    */
   if (nir->info.use_aco_amd)
      progress |= ac_nir_optimize_uniform_atomics(nir);

   NIR_PASS(progress, nir, nir_lower_int64);
   NIR_PASS(progress, nir, si_nir_lower_abi, shader, &ctx->args);
   NIR_PASS(progress, nir, ac_nir_lower_intrinsics_to_args, sel->screen->info.gfx_level,
            sel->screen->info.has_ls_vgpr_init_bug,
            si_select_hw_stage(nir->info.stage, key, sel->screen->info.gfx_level),
            shader->wave_size, si_get_max_workgroup_size(shader), &ctx->args.ac);

   /* LLVM keep non-uniform sampler as index, so can't do this in NIR.
    * Must be done after si_nir_lower_resource().
    */
   if (nir->info.use_aco_amd && sel->info.has_shadow_comparison &&
       sel->screen->info.gfx_level >= GFX8 && sel->screen->info.gfx_level <= GFX9) {
      NIR_PASS(progress, nir, si_nir_clamp_shadow_comparison_value);
   }

   if (progress) {
      si_nir_opts(sel->screen, nir, false);
      progress = false;
   }

   static const nir_opt_offsets_options offset_options = {
      .uniform_max = 0,
      .buffer_max = ~0,
      .shared_max = ~0,
   };
   NIR_PASS_V(nir, nir_opt_offsets, &offset_options);

   si_nir_late_opts(nir);

   /* Instruction scheduling: sink then re-hoist to shorten live ranges. */
   NIR_PASS(progress, nir, nir_opt_sink,
            nir_move_const_undef | nir_move_copies | nir_move_alu | nir_move_comparisons |
            nir_move_load_ubo | nir_move_load_ssbo);
   NIR_PASS(progress, nir, nir_opt_move,
            nir_move_const_undef | nir_move_copies | nir_move_alu | nir_move_comparisons |
            nir_move_load_ubo);
   /* Run nir_opt_move again to make sure that comparisons are as close as possible to the first
    * use to prevent SCC spilling.
    */
   NIR_PASS(progress, nir, nir_opt_move, nir_move_comparisons);

   /* This must be done after si_nir_late_opts() because it may generate vec const. */
   NIR_PASS(_, nir, nir_lower_load_const_to_scalar);

   /* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
    * 200 is tuned for Viewperf. It should be done last.
    */
   NIR_PASS_V(nir, nir_group_loads, nir_group_same_resource_only, 200);
}
2621 
get_input_nir(struct si_shader * shader,struct si_nir_shader_ctx * ctx)2622 static void get_input_nir(struct si_shader *shader, struct si_nir_shader_ctx *ctx)
2623 {
2624    struct si_shader_selector *sel = shader->selector;
2625 
2626    ctx->shader = shader;
2627    ctx->free_nir = !sel->nir && sel->nir_binary;
2628    ctx->nir = sel->nir ? sel->nir : (sel->nir_binary ? si_deserialize_shader(sel) : NULL);
2629    assert(ctx->nir);
2630 
2631    if (unlikely(should_print_nir(ctx->nir))) {
2632       /* Modify the shader's name so that each variant gets its own name. */
2633       ctx->nir->info.name = ralloc_asprintf(ctx->nir, "%s-%08x", ctx->nir->info.name,
2634                                             _mesa_hash_data(&shader->key, sizeof(shader->key)));
2635 
2636       /* Dummy pass to get the starting point. */
2637       printf("nir_dummy_pass\n");
2638       nir_print_shader(ctx->nir, stdout);
2639    }
2640 }
2641 
get_prev_stage_input_nir(struct si_shader * shader,struct si_linked_shaders * linked)2642 static void get_prev_stage_input_nir(struct si_shader *shader, struct si_linked_shaders *linked)
2643 {
2644    const union si_shader_key *key = &shader->key;
2645 
2646    if (shader->selector->stage == MESA_SHADER_TESS_CTRL) {
2647       linked->producer_shader.selector = key->ge.part.tcs.ls;
2648       linked->producer_shader.key.ge.as_ls = 1;
2649    } else {
2650       linked->producer_shader.selector = key->ge.part.gs.es;
2651       linked->producer_shader.key.ge.as_es = 1;
2652       linked->producer_shader.key.ge.as_ngg = key->ge.as_ngg;
2653    }
2654 
2655    linked->producer_shader.next_shader = shader;
2656    linked->producer_shader.key.ge.mono = key->ge.mono;
2657    linked->producer_shader.key.ge.opt = key->ge.opt;
2658    linked->producer_shader.key.ge.opt.inline_uniforms = false; /* only TCS/GS can inline uniforms */
2659    /* kill_outputs was computed based on second shader's outputs so we can't use it to
2660     * kill first shader's outputs.
2661     */
2662    linked->producer_shader.key.ge.opt.kill_outputs = 0;
2663    linked->producer_shader.is_monolithic = true;
2664    linked->producer_shader.wave_size = shader->wave_size;
2665 
2666    get_input_nir(&linked->producer_shader, &linked->producer);
2667 }
2668 
si_set_spi_ps_input_config_for_separate_prolog(struct si_shader * shader)2669 static void si_set_spi_ps_input_config_for_separate_prolog(struct si_shader *shader)
2670 {
2671    const union si_shader_key *key = &shader->key;
2672 
2673    /* Enable POS_FIXED_PT if polygon stippling is enabled. */
2674    if (key->ps.part.prolog.poly_stipple)
2675       shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
2676 
2677    /* Set up the enable bits for per-sample shading if needed. */
2678    if (key->ps.part.prolog.force_persp_sample_interp &&
2679        (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2680         G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2681       shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
2682       shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2683       shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
2684    }
2685    if (key->ps.part.prolog.force_linear_sample_interp &&
2686        (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2687         G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2688       shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
2689       shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2690       shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
2691    }
2692    if (key->ps.part.prolog.force_persp_center_interp &&
2693        (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2694         G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2695       shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
2696       shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2697       shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2698    }
2699    if (key->ps.part.prolog.force_linear_center_interp &&
2700        (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2701         G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2702       shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
2703       shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2704       shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2705    }
2706 
2707    /* The sample mask fixup requires the sample ID. */
2708    if (key->ps.part.prolog.samplemask_log_ps_iter)
2709       shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
2710 
2711    if (key->ps.part.prolog.force_samplemask_to_helper_invocation) {
2712       assert(key->ps.part.prolog.samplemask_log_ps_iter == 0);
2713       assert(!key->ps.mono.poly_line_smoothing);
2714       shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
2715    }
2716 
2717    /* The sample mask fixup has an optimization that replaces the sample mask with the sample ID. */
2718    if (key->ps.part.prolog.samplemask_log_ps_iter == 3)
2719       shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
2720 
2721    if (key->ps.part.prolog.get_frag_coord_from_pixel_coord) {
2722       shader->config.spi_ps_input_ena &= C_0286CC_POS_X_FLOAT_ENA;
2723       shader->config.spi_ps_input_ena &= C_0286CC_POS_Y_FLOAT_ENA;
2724       shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
2725    }
2726 }
2727 
si_fixup_spi_ps_input_config(struct si_shader * shader)2728 static void si_fixup_spi_ps_input_config(struct si_shader *shader)
2729 {
2730    /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
2731    if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
2732        !(shader->config.spi_ps_input_ena & 0xf)) {
2733       shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
2734    }
2735 
2736    /* At least one pair of interpolation weights must be enabled. */
2737    if (!(shader->config.spi_ps_input_ena & 0x7f))
2738       shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
2739 }
2740 
/* Gather per-variant info from the final variant NIR.
 *
 * For fragment shaders this builds shader->info.ps_inputs (semantic and
 * interpolation mode per input slot), records which color components a
 * non-monolithic PS reads, and computes SPI_PS_INPUT_ENA/ADDR from the
 * system values and frag-coord components the shader uses.
 */
static void
si_get_shader_variant_info(struct si_shader *shader, nir_shader *nir)
{
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
   assert(shader->selector->info.base.use_aco_amd == nir->info.use_aco_amd);
   const BITSET_WORD *sysvals = nir->info.system_values_read;

   /* ACO needs spi_ps_input_ena before si_init_shader_args. */
   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      /* Find out which frag coord components are used. */
      uint8_t frag_coord_mask = 0;

      /* Since flat+convergent and non-flat components can occur in the same vec4, start with
       * all PS inputs as flat and change them to smooth when we find a component that's
       * interpolated.
       */
      for (unsigned i = 0; i < ARRAY_SIZE(shader->info.ps_inputs); i++)
         shader->info.ps_inputs[i].interpolate = INTERP_MODE_FLAT;

      /* Scan every intrinsic to find input loads and frag-coord/sample-pos reads. */
      nir_foreach_block(block, nir_shader_get_entrypoint(nir)) {
         nir_foreach_instr(instr, block) {
            if (instr->type == nir_instr_type_intrinsic) {
               nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

               switch (intr->intrinsic) {
               case nir_intrinsic_load_frag_coord:
               case nir_intrinsic_load_sample_pos:
                  frag_coord_mask |= nir_def_components_read(&intr->def);
                  break;
               case nir_intrinsic_load_input:
               case nir_intrinsic_load_interpolated_input: {
                  nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
                  unsigned index = nir_intrinsic_base(intr);
                  assert(sem.num_slots == 1);

                  shader->info.num_ps_inputs = MAX2(shader->info.num_ps_inputs, index + 1);
                  shader->info.ps_inputs[index].semantic = sem.location;
                  /* Determine interpolation mode. This only cares about FLAT/SMOOTH/COLOR.
                   * COLOR is only for nir_intrinsic_load_color0/1.
                   */
                  if (intr->intrinsic == nir_intrinsic_load_interpolated_input) {
                     shader->info.ps_inputs[index].interpolate = INTERP_MODE_SMOOTH;
                     if (intr->def.bit_size == 16)
                        shader->info.ps_inputs[index].fp16_lo_hi_valid |= 0x1 << sem.high_16bits;
                  }
                  break;
               }
               case nir_intrinsic_load_color0:
                  /* load_color0/1 only exist when the prolog loads colors,
                   * i.e. in non-monolithic PS.
                   */
                  assert(!shader->is_monolithic);
                  shader->info.ps_colors_read |= nir_def_components_read(&intr->def);
                  break;
               case nir_intrinsic_load_color1:
                  assert(!shader->is_monolithic);
                  /* COLOR1 components occupy the high 4 bits of ps_colors_read. */
                  shader->info.ps_colors_read |= nir_def_components_read(&intr->def) << 4;
                  break;
               default:
                  break;
               }
            }
         }
      }

      /* Add both front and back color inputs. */
      if (!shader->is_monolithic) {
         unsigned index = shader->info.num_ps_inputs;

         for (unsigned back = 0; back < 2; back++) {
            for (unsigned i = 0; i < 2; i++) {
               if ((shader->info.ps_colors_read >> (i * 4)) & 0xf) {
                  assert(index < ARRAY_SIZE(shader->info.ps_inputs));
                  shader->info.ps_inputs[index].semantic =
                     (back ? VARYING_SLOT_BFC0 : VARYING_SLOT_COL0) + i;

                  /* INTERP_MODE_NONE means the API default, which is mapped
                   * to INTERP_MODE_COLOR here.
                   */
                  enum glsl_interp_mode mode = i ? nir->info.fs.color1_interp
                                                 : nir->info.fs.color0_interp;
                  shader->info.ps_inputs[index].interpolate =
                     mode == INTERP_MODE_NONE ? INTERP_MODE_COLOR : mode;
                  index++;

                  /* Back-face colors don't increment num_ps_inputs. si_emit_spi_map will use
                   * back-face colors conditionally only when needed.
                   */
                  if (!back)
                     shader->info.num_ps_inputs++;
               }
            }
         }
      }

      /* Translate used system values and frag-coord components into the
       * SPI_PS_INPUT_ENA register value.
       */
      shader->config.spi_ps_input_ena =
         S_0286CC_PERSP_SAMPLE_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE)) |
         S_0286CC_PERSP_CENTER_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL)) |
         S_0286CC_PERSP_CENTROID_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID)) |
         S_0286CC_LINEAR_SAMPLE_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE)) |
         S_0286CC_LINEAR_CENTER_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL)) |
         S_0286CC_LINEAR_CENTROID_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID)) |
         S_0286CC_POS_X_FLOAT_ENA(!!(frag_coord_mask & 0x1)) |
         S_0286CC_POS_Y_FLOAT_ENA(!!(frag_coord_mask & 0x2)) |
         S_0286CC_POS_Z_FLOAT_ENA(!!(frag_coord_mask & 0x4)) |
         S_0286CC_POS_W_FLOAT_ENA(!!(frag_coord_mask & 0x8)) |
         S_0286CC_FRONT_FACE_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_FRONT_FACE) |
                                 BITSET_TEST(sysvals, SYSTEM_VALUE_FRONT_FACE_FSIGN)) |
         S_0286CC_ANCILLARY_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_SAMPLE_ID) |
                                BITSET_TEST(sysvals, SYSTEM_VALUE_LAYER_ID)) |
         S_0286CC_SAMPLE_COVERAGE_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_SAMPLE_MASK_IN)) |
         S_0286CC_POS_FIXED_PT_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_PIXEL_COORD));

      if (shader->is_monolithic) {
         si_fixup_spi_ps_input_config(shader);
         shader->config.spi_ps_input_addr = shader->config.spi_ps_input_ena;
      } else {
         /* Part mode will call si_fixup_spi_ps_input_config() when combining multi
          * shader part in si_shader_select_ps_parts().
          *
          * Reserve register locations for VGPR inputs the PS prolog may need.
          */
         shader->config.spi_ps_input_addr = shader->config.spi_ps_input_ena |
                                            SI_SPI_PS_INPUT_ADDR_FOR_PROLOG;
      }
   }
}
2862 
/* Produce the final variant NIR for a shader — and, for GFX9+ monolithic
 * TCS/GS, also for the merged previous stage — running pre-link and late
 * optimization/lowering passes and gathering variant info along the way.
 */
static void get_nir_shaders(struct si_shader *shader, struct si_linked_shaders *linked)
{
   memset(linked, 0, sizeof(*linked));
   get_input_nir(shader, &linked->consumer);

   /* GFX9+ merges LS+HS and ES+GS, so a monolithic TCS/GS variant compiles
    * the previous stage's NIR too.
    */
   if (shader->selector->screen->info.gfx_level >= GFX9 && shader->is_monolithic &&
       (shader->selector->stage == MESA_SHADER_TESS_CTRL ||
        shader->selector->stage == MESA_SHADER_GEOMETRY))
      get_prev_stage_input_nir(shader, linked);

   /* Per-shader flag: whether the pre-link passes changed anything, so the
    * cleanup opts can be deferred/batched below.
    */
   bool progress[SI_NUM_LINKED_SHADERS] = {0};

   for (unsigned i = 0; i < SI_NUM_LINKED_SHADERS; i++) {
      if (linked->shader[i].nir) {
         progress[i] = run_pre_link_optimization_passes(&linked->shader[i]);
      }
   }

   /* TODO: run linking optimizations here if we have LS+HS or ES+GS */

   /* TODO: gather shader_info here */
   if (shader->selector->stage <= MESA_SHADER_GEOMETRY) {
      /* Instance divisor lowering reads InstanceID even if the shader didn't. */
      shader->info.uses_instanceid |=
         shader->key.ge.mono.instance_divisor_is_one ||
         shader->key.ge.mono.instance_divisor_is_fetched;

      if (linked->producer.nir) {
         shader->info.uses_instanceid |=
            linked->producer.shader->selector->info.uses_instanceid ||
            linked->producer.shader->info.uses_instanceid;
      }
   }

   if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
      /* NOTE(review): index 1 appears to alias linked->consumer — confirm
       * against the si_linked_shaders declaration.
       */
      if (progress[1]) {
         si_nir_opts(shader->selector->screen, linked->consumer.nir, false);
         progress[1] = false;
      }

      /* Remove holes after removed PS inputs by renumbering them. Holes can only occur with
       * monolithic PS.
       */
      if (shader->is_monolithic)
         NIR_PASS_V(linked->consumer.nir, nir_recompute_io_bases, nir_var_shader_in);

      /* Must run on the cleaned-up NIR so the PS input table and
       * SPI_PS_INPUT_ENA reflect the final inputs.
       */
      si_get_shader_variant_info(shader, linked->consumer.nir);
   }

   for (unsigned i = 0; i < SI_NUM_LINKED_SHADERS; i++) {
      if (linked->shader[i].nir) {
         run_late_optimization_and_lowering_passes(&linked->shader[i], progress[i]);
      }
   }

   /* TODO: gather this where other shader_info is gathered */
   for (unsigned i = 0; i < SI_NUM_LINKED_SHADERS; i++) {
      if (linked->shader[i].nir) {
         struct si_shader_info info;
         si_nir_scan_shader(shader->selector->screen, linked->shader[i].nir, &info, true);

         shader->info.uses_vmem_load_other |= info.uses_vmem_load_other;
         shader->info.uses_vmem_sampler_or_bvh |= info.uses_vmem_sampler_or_bvh;
      }
   }
}
2928 
/* Generate code for the hardware VS shader stage to go with a geometry shader.
 *
 * On pre-NGG paths the GS writes its outputs to a ring buffer; this "copy
 * shader" runs as a HW VS that reads the ring and performs the position and
 * parameter exports. Returns NULL on allocation or compile failure; the
 * caller owns the returned si_shader.
 */
static struct si_shader *
si_nir_generate_gs_copy_shader(struct si_screen *sscreen,
                               struct ac_llvm_compiler *compiler,
                               struct si_shader *gs_shader,
                               nir_shader *gs_nir,
                               struct util_debug_callback *debug,
                               ac_nir_gs_output_info *output_info)
{
   struct si_shader *shader;
   struct si_shader_selector *gs_selector = gs_shader->selector;
   struct si_shader_info *gsinfo = &gs_selector->info;
   union si_shader_key *gskey = &gs_shader->key;

   shader = CALLOC_STRUCT(si_shader);
   if (!shader)
      return NULL;

   /* We can leave the fence as permanently signaled because the GS copy
    * shader only becomes visible globally after it has been compiled. */
   util_queue_fence_init(&shader->ready);

   shader->selector = gs_selector;
   shader->is_gs_copy_shader = true;
   shader->wave_size = si_determine_wave_size(sscreen, shader);

   /* memset writes the enum value as a byte pattern, so it must be 1 byte wide. */
   STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
          sizeof(shader->info.vs_output_param_offset));

   /* Assign sequential param-export slots to the varyings the copy shader
    * must export.
    */
   for (unsigned i = 0; i < gsinfo->num_outputs; i++) {
      unsigned semantic = gsinfo->output_semantic[i];

      /* Skip if no channel writes to stream 0. */
      if (!nir_slot_is_varying(semantic, MESA_SHADER_FRAGMENT) ||
          (gsinfo->output_streams[i] & 0x03 &&
           gsinfo->output_streams[i] & 0x0c &&
           gsinfo->output_streams[i] & 0x30 &&
           gsinfo->output_streams[i] & 0xc0))
         continue;

      shader->info.vs_output_param_offset[semantic] = shader->info.nr_param_exports++;
   }

   shader->info.nr_pos_exports = si_get_nr_pos_exports(gs_selector, gskey);

   unsigned clip_cull_mask =
      (gsinfo->clipdist_mask & ~gskey->ge.opt.kill_clip_distances) | gsinfo->culldist_mask;

   nir_shader *nir =
      ac_nir_create_gs_copy_shader(gs_nir,
                                   sscreen->info.gfx_level,
                                   clip_cull_mask,
                                   shader->info.vs_output_param_offset,
                                   shader->info.nr_param_exports,
                                   !si_shader_uses_streamout(gs_shader),
                                   gskey->ge.opt.kill_pointsize,
                                   gskey->ge.opt.kill_layer,
                                   sscreen->options.vrs2x2,
                                   output_info);

   struct si_linked_shaders linked;
   memset(&linked, 0, sizeof(linked));
   linked.consumer.nir = nir;

   si_init_shader_args(shader, &linked.consumer.args, &gs_nir->info);

   /* The copy shader runs as a HW vertex shader; wave size and workgroup
    * size are both hardcoded to 64 here.
    */
   NIR_PASS_V(nir, si_nir_lower_abi, shader, &linked.consumer.args);
   NIR_PASS_V(nir, ac_nir_lower_intrinsics_to_args, sscreen->info.gfx_level,
              sscreen->info.has_ls_vgpr_init_bug, AC_HW_VERTEX_SHADER, 64, 64,
              &linked.consumer.args.ac);

   si_nir_opts(gs_selector->screen, nir, false);

   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);

   if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY, SI_DUMP_NIR)) {
      fprintf(stderr, "GS Copy Shader:\n");
      nir_print_shader(nir, stderr);
   }

   /* Compile with LLVM unless the NIR was produced for ACO. */
   bool ok =
#if AMD_LLVM_AVAILABLE
      !gs_nir->info.use_aco_amd ? si_llvm_compile_shader(sscreen, compiler, shader, &linked, debug) :
#endif
      si_aco_compile_shader(shader, &linked, debug);

   if (ok) {
      assert(!shader->config.scratch_bytes_per_wave);
      ok = si_shader_binary_upload(sscreen, shader, 0) >= 0;
      si_shader_dump(sscreen, shader, debug, stderr, true);
   }
   /* The NIR is only needed for compilation; free it in both outcomes. */
   ralloc_free(nir);

   if (!ok) {
      FREE(shader);
      shader = NULL;
   } else {
      si_fix_resource_usage(sscreen, shader);
   }
   return shader;
}
3031 
3032 static void
debug_message_stderr(void * data,unsigned * id,enum util_debug_type ptype,const char * fmt,va_list args)3033 debug_message_stderr(void *data, unsigned *id, enum util_debug_type ptype,
3034                       const char *fmt, va_list args)
3035 {
3036    vfprintf(stderr, fmt, args);
3037    fprintf(stderr, "\n");
3038 }
3039 
3040 static void
determine_shader_variant_info(struct si_screen * sscreen,struct si_shader * shader)3041 determine_shader_variant_info(struct si_screen *sscreen, struct si_shader *shader)
3042 {
3043    struct si_shader_selector *sel = shader->selector;
3044 
3045    if (sel->stage == MESA_SHADER_FRAGMENT) {
3046       shader->ps.writes_z = sel->info.writes_z && !shader->key.ps.part.epilog.kill_z;
3047       shader->ps.writes_stencil = sel->info.writes_stencil &&
3048                                   !shader->key.ps.part.epilog.kill_stencil;
3049       shader->ps.writes_samplemask = sel->info.writes_samplemask &&
3050                                      !shader->key.ps.part.epilog.kill_samplemask;
3051    }
3052 }
3053 
/* Compile one shader variant: produce the final NIR, compile it with LLVM or
 * ACO, compile the legacy GS copy shader if needed, and fill out remaining
 * variant info (float mode, PS input CNTLs, input register counts, stats).
 * Returns false on compile failure.
 */
bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                       struct si_shader *shader, struct util_debug_callback *debug)
{
   bool ret = true;
   struct si_shader_selector *sel = shader->selector;

   determine_shader_variant_info(sscreen, shader);

   /* Run all variant-specific NIR lowering/optimization and, for merged
    * stages, get the previous stage's NIR as well.
    */
   struct si_linked_shaders linked;
   get_nir_shaders(shader, &linked);
   nir_shader *nir = linked.consumer.nir;

   /* Dump NIR before doing NIR->LLVM conversion in case the
    * conversion fails. */
   if (si_can_dump_shader(sscreen, nir->info.stage, SI_DUMP_NIR)) {
      nir_print_shader(nir, stderr);

      if (nir->xfb_info)
         nir_print_xfb_info(nir->xfb_info, stderr);
   }

   /* Initialize vs_output_ps_input_cntl to default. */
   for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++)
      shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED;
   shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0;

   /* uses_instanceid may be set by si_nir_lower_vs_inputs(). */
   shader->info.uses_instanceid |= sel->info.uses_instanceid;
   /* Scratch size is in bytes; VGPRs are 4 bytes per lane. */
   shader->info.private_mem_vgprs = DIV_ROUND_UP(nir->scratch_size, 4);

   /* Set the FP ALU behavior. */
   /* By default, we disable denormals for FP32 and enable them for FP16 and FP64
    * for performance and correctness reasons. FP32 denormals can't be enabled because
    * they break output modifiers and v_mad_f32 and are very slow on GFX6-7.
    *
    * float_controls_execution_mode defines the set of valid behaviors. Contradicting flags
    * can be set simultaneously, which means we are allowed to choose, but not really because
    * some options cause GLCTS failures.
    */
   unsigned float_mode = V_00B028_FP_16_64_DENORMS;

   if (!(nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) &&
       nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
      float_mode |= V_00B028_FP_32_ROUND_TOWARDS_ZERO;

   if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
                                                    FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64)) &&
       nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
                                                  FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
      float_mode |= V_00B028_FP_16_64_ROUND_TOWARDS_ZERO;

   if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_PRESERVE_FP16 |
                                                    FLOAT_CONTROLS_DENORM_PRESERVE_FP64)) &&
       nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
                                                  FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64))
      float_mode &= ~V_00B028_FP_16_64_DENORMS;

   /* Compile with LLVM unless the NIR was produced for ACO (or LLVM is
    * unavailable at build time).
    */
   ret =
#if AMD_LLVM_AVAILABLE
      !nir->info.use_aco_amd ? si_llvm_compile_shader(sscreen, compiler, shader, &linked, debug) :
#endif
      si_aco_compile_shader(shader, &linked, debug);

   if (!ret)
      goto out;

   shader->config.float_mode = float_mode;

   /* The GS copy shader is compiled next. */
   if (nir->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
      shader->gs_copy_shader =
         si_nir_generate_gs_copy_shader(sscreen, compiler, shader, nir, debug,
                                        &linked.consumer.legacy_gs_output_info.info);
      if (!shader->gs_copy_shader) {
         fprintf(stderr, "radeonsi: can't create GS copy shader\n");
         ret = false;
         goto out;
      }
   }

   /* Compute vs_output_ps_input_cntl for every stage that can feed the PS
    * as the hardware VS (not LS and not ES).
    */
   if ((nir->info.stage == MESA_SHADER_VERTEX ||
        nir->info.stage == MESA_SHADER_TESS_EVAL ||
        nir->info.stage == MESA_SHADER_GEOMETRY) &&
       !shader->key.ge.as_ls && !shader->key.ge.as_es) {
      uint8_t *vs_output_param_offset = shader->info.vs_output_param_offset;

      /* Legacy GS: the copy shader, not the GS itself, does the param exports. */
      if (nir->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg)
         vs_output_param_offset = shader->gs_copy_shader->info.vs_output_param_offset;

      /* We must use the original shader info before the removal of duplicated shader outputs. */
      /* VS and TES should also set primitive ID output if it's used. */
      unsigned num_outputs_with_prim_id = sel->info.num_outputs +
                                          shader->key.ge.mono.u.vs_export_prim_id;

      for (unsigned i = 0; i < num_outputs_with_prim_id; i++) {
         unsigned semantic = sel->info.output_semantic[i];
         unsigned offset = vs_output_param_offset[semantic];
         unsigned ps_input_cntl;

         if (offset <= AC_EXP_PARAM_OFFSET_31) {
            /* The input is loaded from parameter memory. */
            ps_input_cntl = S_028644_OFFSET(offset);
         } else {
            /* The input is a DEFAULT_VAL constant. */
            assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
                   offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
            offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;

            /* OFFSET=0x20 means that DEFAULT_VAL is used. */
            ps_input_cntl = S_028644_OFFSET(0x20) |
                            S_028644_DEFAULT_VAL(offset);
         }

         shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl;
      }
   }

   /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */
   if (gl_shader_stage_is_compute(nir->info.stage)) {
      unsigned max_vgprs =
         sscreen->info.num_physical_wave64_vgprs_per_simd * (shader->wave_size == 32 ? 2 : 1);
      unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
      unsigned max_sgprs_per_wave = 128;
      unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
      unsigned threads_per_tg = si_get_max_workgroup_size(shader);
      unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, shader->wave_size);
      unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);

      max_vgprs = max_vgprs / waves_per_simd;
      max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);

      if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
         fprintf(stderr,
                 "LLVM failed to compile a shader correctly: "
                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
                 shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);

         /* Just terminate the process, because dependent
          * shaders can hang due to bad input data, but use
          * the env var to allow shader-db to work.
          */
         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
            abort();
      }
   }

   /* Add/remove the scratch offset to/from input SGPRs. */
   if (!sel->screen->info.has_scratch_base_registers &&
       !si_is_merged_shader(shader)) {
      if (nir->info.use_aco_amd) {
         /* When aco scratch_offset arg is added explicitly at the beginning.
          * After compile if no scratch used, reduce the input sgpr count.
          */
         if (!shader->config.scratch_bytes_per_wave)
            shader->info.num_input_sgprs--;
      } else {
         /* scratch_offset arg is added by llvm implicitly */
         if (shader->info.num_input_sgprs)
            shader->info.num_input_sgprs++;
      }
   }

   /* Calculate the number of fragment input VGPRs. */
   if (nir->info.stage == MESA_SHADER_FRAGMENT)
      shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(&shader->config);

   si_calculate_max_simd_waves(shader);

   /* With SI_DUMP_STATS, force the stats to stderr regardless of the
    * caller's debug callback.
    */
   if (si_can_dump_shader(sscreen, nir->info.stage, SI_DUMP_STATS)) {
      struct util_debug_callback out_stderr = {
         .debug_message = debug_message_stderr,
      };

      si_shader_dump_stats_for_shader_db(sscreen, shader, &out_stderr);
   } else {
      si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
   }

out:
   /* Free any NIR that was deserialized for this compile. */
   for (unsigned i = 0; i < SI_NUM_LINKED_SHADERS; i++) {
      if (linked.shader[i].free_nir)
         ralloc_free(linked.shader[i].nir);
   }

   return ret;
}
3241 
3242 /**
3243  * Create, compile and return a shader part (prolog or epilog).
3244  *
3245  * \param sscreen  screen
3246  * \param list     list of shader parts of the same category
3247  * \param type     shader type
3248  * \param key      shader part key
3249  * \param prolog   whether the part being requested is a prolog
3250  * \param tm       LLVM target machine
3251  * \param debug    debug callback
3252  * \return         non-NULL on success
3253  */
3254 static struct si_shader_part *
si_get_shader_part(struct si_screen * sscreen,struct si_shader_part ** list,gl_shader_stage stage,bool prolog,union si_shader_part_key * key,struct ac_llvm_compiler * compiler,struct util_debug_callback * debug,const char * name)3255 si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
3256                    gl_shader_stage stage, bool prolog, union si_shader_part_key *key,
3257                    struct ac_llvm_compiler *compiler, struct util_debug_callback *debug,
3258                    const char *name)
3259 {
3260    struct si_shader_part *result;
3261 
3262    simple_mtx_lock(&sscreen->shader_parts_mutex);
3263 
3264    /* Find existing. */
3265    for (result = *list; result; result = result->next) {
3266       if (memcmp(&result->key, key, sizeof(*key)) == 0) {
3267          simple_mtx_unlock(&sscreen->shader_parts_mutex);
3268          return result;
3269       }
3270    }
3271 
3272    /* Compile a new one. */
3273    result = CALLOC_STRUCT(si_shader_part);
3274    result->key = *key;
3275 
3276    bool ok =
3277 #if AMD_LLVM_AVAILABLE
3278       !(sscreen->use_aco ||
3279         (stage == MESA_SHADER_FRAGMENT &&
3280          ((prolog && key->ps_prolog.use_aco) ||
3281           (!prolog && key->ps_epilog.use_aco)))) ?
3282       si_llvm_build_shader_part(sscreen, stage, prolog, compiler, debug, name, result) :
3283 #endif
3284       si_aco_build_shader_part(sscreen, stage, prolog, debug, name, result);
3285 
3286    if (ok) {
3287       result->next = *list;
3288       *list = result;
3289    } else {
3290       FREE(result);
3291       result = NULL;
3292    }
3293 
3294    simple_mtx_unlock(&sscreen->shader_parts_mutex);
3295    return result;
3296 }
3297 
3298 
3299 /**
3300  * Select and compile (or reuse) TCS parts (epilog).
3301  */
si_shader_select_tcs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3302 static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3303                                        struct si_shader *shader, struct util_debug_callback *debug)
3304 {
3305    if (sscreen->info.gfx_level >= GFX9) {
3306       assert(shader->wave_size == 32 || shader->wave_size == 64);
3307       unsigned index = shader->wave_size / 32 - 1;
3308       shader->previous_stage = shader->key.ge.part.tcs.ls->main_shader_part_ls[index];
3309    }
3310 
3311    return true;
3312 }
3313 
3314 /**
3315  * Select and compile (or reuse) GS parts (prolog).
3316  */
si_shader_select_gs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3317 static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3318                                       struct si_shader *shader, struct util_debug_callback *debug)
3319 {
3320    if (sscreen->info.gfx_level >= GFX9) {
3321       if (shader->key.ge.as_ngg) {
3322          assert(shader->wave_size == 32 || shader->wave_size == 64);
3323          unsigned index = shader->wave_size / 32 - 1;
3324          shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_ngg_es[index];
3325       } else {
3326          shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_es;
3327       }
3328    }
3329 
3330    return true;
3331 }
3332 
3333 /**
3334  * Compute the PS prolog key, which contains all the information needed to
3335  * build the PS prolog function, and set related bits in shader->config.
3336  */
si_get_ps_prolog_key(struct si_shader * shader,union si_shader_part_key * key)3337 static void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key)
3338 {
3339    struct si_shader_info *info = &shader->selector->info;
3340 
3341    memset(key, 0, sizeof(*key));
3342    key->ps_prolog.states = shader->key.ps.part.prolog;
3343    key->ps_prolog.use_aco = info->base.use_aco_amd;
3344    key->ps_prolog.wave32 = shader->wave_size == 32;
3345    key->ps_prolog.colors_read = shader->info.ps_colors_read;
3346    key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
3347    key->ps_prolog.wqm =
3348       info->base.fs.needs_quad_helper_invocations &&
3349       (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
3350        key->ps_prolog.states.force_linear_sample_interp ||
3351        key->ps_prolog.states.force_persp_center_interp ||
3352        key->ps_prolog.states.force_linear_center_interp ||
3353        key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear ||
3354        key->ps_prolog.states.samplemask_log_ps_iter ||
3355        key->ps_prolog.states.get_frag_coord_from_pixel_coord ||
3356        key->ps_prolog.states.force_samplemask_to_helper_invocation);
3357    key->ps_prolog.fragcoord_usage_mask =
3358       G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_ena) |
3359       (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_ena) << 1) |
3360       (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_ena) << 2) |
3361       (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) << 3);
3362    key->ps_prolog.pixel_center_integer = key->ps_prolog.fragcoord_usage_mask &&
3363                                          shader->selector->info.base.fs.pixel_center_integer;
3364 
3365    if (shader->key.ps.part.prolog.poly_stipple)
3366       shader->info.uses_vmem_load_other = true;
3367 
3368    if (shader->info.ps_colors_read) {
3369       uint8_t *color = shader->selector->info.color_attr_index;
3370 
3371       if (shader->key.ps.part.prolog.color_two_side) {
3372          /* BCOLORs are stored after the last input. */
3373          key->ps_prolog.num_interp_inputs = shader->info.num_ps_inputs;
3374          shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
3375       }
3376 
3377       for (unsigned i = 0; i < 2; i++) {
3378          unsigned interp = info->color_interpolate[i];
3379          unsigned location = info->color_interpolate_loc[i];
3380 
3381          if (!(shader->info.ps_colors_read & (0xf << i * 4)))
3382             continue;
3383 
3384          key->ps_prolog.color_attr_index[i] = color[i];
3385 
3386          if (shader->key.ps.part.prolog.flatshade_colors && interp == INTERP_MODE_COLOR)
3387             interp = INTERP_MODE_FLAT;
3388 
3389          switch (interp) {
3390          case INTERP_MODE_FLAT:
3391             key->ps_prolog.color_interp_vgpr_index[i] = -1;
3392             break;
3393          case INTERP_MODE_SMOOTH:
3394          case INTERP_MODE_COLOR:
3395             /* Force the interpolation location for colors here. */
3396             if (shader->key.ps.part.prolog.force_persp_sample_interp)
3397                location = TGSI_INTERPOLATE_LOC_SAMPLE;
3398             if (shader->key.ps.part.prolog.force_persp_center_interp)
3399                location = TGSI_INTERPOLATE_LOC_CENTER;
3400 
3401             switch (location) {
3402             case TGSI_INTERPOLATE_LOC_SAMPLE:
3403                key->ps_prolog.color_interp_vgpr_index[i] = 0;
3404                shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
3405                break;
3406             case TGSI_INTERPOLATE_LOC_CENTER:
3407                key->ps_prolog.color_interp_vgpr_index[i] = 2;
3408                shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
3409                break;
3410             case TGSI_INTERPOLATE_LOC_CENTROID:
3411                key->ps_prolog.color_interp_vgpr_index[i] = 4;
3412                shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
3413                break;
3414             default:
3415                assert(0);
3416             }
3417             break;
3418          case INTERP_MODE_NOPERSPECTIVE:
3419             /* Force the interpolation location for colors here. */
3420             if (shader->key.ps.part.prolog.force_linear_sample_interp)
3421                location = TGSI_INTERPOLATE_LOC_SAMPLE;
3422             if (shader->key.ps.part.prolog.force_linear_center_interp)
3423                location = TGSI_INTERPOLATE_LOC_CENTER;
3424 
3425             /* The VGPR assignment for non-monolithic shaders
3426              * works because InitialPSInputAddr is set on the
3427              * main shader and PERSP_PULL_MODEL is never used.
3428              */
3429             switch (location) {
3430             case TGSI_INTERPOLATE_LOC_SAMPLE:
3431                key->ps_prolog.color_interp_vgpr_index[i] = 6;
3432                shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
3433                break;
3434             case TGSI_INTERPOLATE_LOC_CENTER:
3435                key->ps_prolog.color_interp_vgpr_index[i] = 8;
3436                shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
3437                break;
3438             case TGSI_INTERPOLATE_LOC_CENTROID:
3439                key->ps_prolog.color_interp_vgpr_index[i] = 10;
3440                shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
3441                break;
3442             default:
3443                assert(0);
3444             }
3445             break;
3446          default:
3447             assert(0);
3448          }
3449       }
3450    }
3451 }
3452 
3453 /**
3454  * Check whether a PS prolog is required based on the key.
3455  */
si_need_ps_prolog(const union si_shader_part_key * key)3456 static bool si_need_ps_prolog(const union si_shader_part_key *key)
3457 {
3458    return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
3459           key->ps_prolog.states.force_linear_sample_interp ||
3460           key->ps_prolog.states.force_persp_center_interp ||
3461           key->ps_prolog.states.force_linear_center_interp ||
3462           key->ps_prolog.states.bc_optimize_for_persp ||
3463           key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
3464           key->ps_prolog.states.samplemask_log_ps_iter ||
3465           key->ps_prolog.states.get_frag_coord_from_pixel_coord ||
3466           key->ps_prolog.states.force_samplemask_to_helper_invocation;
3467 }
3468 
3469 /**
3470  * Compute the PS epilog key, which contains all the information needed to
3471  * build the PS epilog function.
3472  */
si_get_ps_epilog_key(struct si_shader * shader,union si_shader_part_key * key)3473 static void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
3474 {
3475    struct si_shader_info *info = &shader->selector->info;
3476    memset(key, 0, sizeof(*key));
3477    key->ps_epilog.use_aco = info->base.use_aco_amd;
3478    key->ps_epilog.wave32 = shader->wave_size == 32;
3479    key->ps_epilog.uses_discard = si_shader_uses_discard(shader);
3480    key->ps_epilog.colors_written = info->colors_written;
3481    key->ps_epilog.color_types = info->output_color_types;
3482    key->ps_epilog.writes_all_cbufs = info->color0_writes_all_cbufs &&
3483                                      /* Check whether a non-zero color buffer is bound. */
3484                                      !!(shader->key.ps.part.epilog.spi_shader_col_format & 0xfffffff0);
3485    key->ps_epilog.writes_z = info->writes_z;
3486    key->ps_epilog.writes_stencil = info->writes_stencil;
3487    key->ps_epilog.writes_samplemask = info->writes_samplemask;
3488    key->ps_epilog.states = shader->key.ps.part.epilog;
3489 }
3490 
3491 /**
3492  * Select and compile (or reuse) pixel shader parts (prolog & epilog).
3493  */
si_shader_select_ps_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3494 static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3495                                       struct si_shader *shader, struct util_debug_callback *debug)
3496 {
3497    union si_shader_part_key prolog_key;
3498    union si_shader_part_key epilog_key;
3499 
3500    /* Get the prolog. */
3501    si_get_ps_prolog_key(shader, &prolog_key);
3502 
3503    /* The prolog is a no-op if these aren't set. */
3504    if (si_need_ps_prolog(&prolog_key)) {
3505       shader->prolog =
3506          si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key,
3507                             compiler, debug, "Fragment Shader Prolog");
3508       if (!shader->prolog)
3509          return false;
3510    }
3511 
3512    /* Get the epilog. */
3513    si_get_ps_epilog_key(shader, &epilog_key);
3514 
3515    shader->epilog =
3516       si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key,
3517                          compiler, debug, "Fragment Shader Epilog");
3518    if (!shader->epilog)
3519       return false;
3520 
3521    si_set_spi_ps_input_config_for_separate_prolog(shader);
3522    si_fixup_spi_ps_input_config(shader);
3523 
3524    /* Make sure spi_ps_input_addr bits is superset of spi_ps_input_ena. */
3525    unsigned spi_ps_input_ena = shader->config.spi_ps_input_ena;
3526    unsigned spi_ps_input_addr = shader->config.spi_ps_input_addr;
3527    assert((spi_ps_input_ena & spi_ps_input_addr) == spi_ps_input_ena);
3528 
3529    return true;
3530 }
3531 
si_multiwave_lds_size_workaround(struct si_screen * sscreen,unsigned * lds_size)3532 void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
3533 {
3534    /* If tessellation is all offchip and on-chip GS isn't used, this
3535     * workaround is not needed.
3536     */
3537    return;
3538 
3539    /* SPI barrier management bug:
3540     *   Make sure we have at least 4k of LDS in use to avoid the bug.
3541     *   It applies to workgroup sizes of more than one wavefront.
3542     */
3543    if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
3544       *lds_size = MAX2(*lds_size, 8);
3545 }
3546 
si_fix_resource_usage(struct si_screen * sscreen,struct si_shader * shader)3547 static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
3548 {
3549    unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
3550 
3551    shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
3552 
3553    if (shader->selector->stage == MESA_SHADER_COMPUTE &&
3554        si_get_max_workgroup_size(shader) > shader->wave_size) {
3555       si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
3556    }
3557 }
3558 
si_create_shader_variant(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3559 bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3560                               struct si_shader *shader, struct util_debug_callback *debug)
3561 {
3562    struct si_shader_selector *sel = shader->selector;
3563    struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key, shader->wave_size);
3564 
3565    /* LS, ES, VS are compiled on demand if the main part hasn't been
3566     * compiled for that stage.
3567     *
3568     * GS are compiled on demand if the main part hasn't been compiled
3569     * for the chosen NGG-ness.
3570     *
3571     * Vertex shaders are compiled on demand when a vertex fetch
3572     * workaround must be applied.
3573     */
3574    if (shader->is_monolithic) {
3575       /* Monolithic shader (compiled as a whole, has many variants,
3576        * may take a long time to compile).
3577        */
3578       if (!si_compile_shader(sscreen, compiler, shader, debug))
3579          return false;
3580    } else {
3581       /* The shader consists of several parts:
3582        *
3583        * - the middle part is the user shader, it has 1 variant only
3584        *   and it was compiled during the creation of the shader
3585        *   selector
3586        * - the prolog part is inserted at the beginning
3587        * - the epilog part is inserted at the end
3588        *
3589        * The prolog and epilog have many (but simple) variants.
3590        *
3591        * Starting with gfx9, geometry and tessellation control
3592        * shaders also contain the prolog and user shader parts of
3593        * the previous shader stage.
3594        */
3595 
3596       if (!mainp)
3597          return false;
3598 
3599       determine_shader_variant_info(sscreen, shader);
3600 
3601       /* Copy the compiled shader data over. */
3602       shader->is_binary_shared = true;
3603       shader->binary = mainp->binary;
3604       shader->config = mainp->config;
3605       shader->info = mainp->info;
3606 
3607       /* Select prologs and/or epilogs. */
3608       switch (sel->stage) {
3609       case MESA_SHADER_TESS_CTRL:
3610          if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
3611             return false;
3612          break;
3613       case MESA_SHADER_GEOMETRY:
3614          if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
3615             return false;
3616 
3617          /* Clone the GS copy shader for the shader variant.
3618           * We can't just copy the pointer because we change the pm4 state and
3619           * si_shader_selector::gs_copy_shader must be immutable because it's shared
3620           * by multiple contexts.
3621           */
3622          if (!shader->key.ge.as_ngg) {
3623             assert(mainp->gs_copy_shader);
3624             assert(mainp->gs_copy_shader->bo);
3625             assert(!mainp->gs_copy_shader->previous_stage_sel);
3626             assert(!mainp->gs_copy_shader->scratch_va);
3627 
3628             shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
3629             memcpy(shader->gs_copy_shader, mainp->gs_copy_shader,
3630                    sizeof(*shader->gs_copy_shader));
3631             /* Increase the reference count. */
3632             pipe_reference(NULL, &shader->gs_copy_shader->bo->b.b.reference);
3633             /* Initialize some fields differently. */
3634             shader->gs_copy_shader->shader_log = NULL;
3635             shader->gs_copy_shader->is_binary_shared = true;
3636             util_queue_fence_init(&shader->gs_copy_shader->ready);
3637          }
3638          break;
3639       case MESA_SHADER_FRAGMENT:
3640          if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
3641             return false;
3642 
3643          /* Make sure we have at least as many VGPRs as there
3644           * are allocated inputs.
3645           */
3646          shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
3647          break;
3648       default:;
3649       }
3650 
3651       assert(shader->wave_size == mainp->wave_size);
3652       assert(!shader->previous_stage || shader->wave_size == shader->previous_stage->wave_size);
3653 
3654       /* Update SGPR and VGPR counts. */
3655       if (shader->prolog) {
3656          shader->config.num_sgprs =
3657             MAX2(shader->config.num_sgprs, shader->prolog->num_sgprs);
3658          shader->config.num_vgprs =
3659             MAX2(shader->config.num_vgprs, shader->prolog->num_vgprs);
3660       }
3661       if (shader->previous_stage) {
3662          shader->config.num_sgprs =
3663             MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
3664          shader->config.num_vgprs =
3665             MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
3666          shader->config.spilled_sgprs =
3667             MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
3668          shader->config.spilled_vgprs =
3669             MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
3670          shader->info.private_mem_vgprs =
3671             MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
3672          shader->config.scratch_bytes_per_wave =
3673             MAX2(shader->config.scratch_bytes_per_wave,
3674                  shader->previous_stage->config.scratch_bytes_per_wave);
3675          shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;
3676          shader->info.uses_vmem_load_other |= shader->previous_stage->info.uses_vmem_load_other;
3677          shader->info.uses_vmem_sampler_or_bvh |= shader->previous_stage->info.uses_vmem_sampler_or_bvh;
3678       }
3679       if (shader->epilog) {
3680          shader->config.num_sgprs =
3681             MAX2(shader->config.num_sgprs, shader->epilog->num_sgprs);
3682          shader->config.num_vgprs =
3683             MAX2(shader->config.num_vgprs, shader->epilog->num_vgprs);
3684       }
3685       si_calculate_max_simd_waves(shader);
3686    }
3687 
3688    if (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
3689       assert(!shader->key.ge.as_es && !shader->key.ge.as_ls);
3690       if (!gfx10_ngg_calculate_subgroup_info(shader)) {
3691          fprintf(stderr, "Failed to compute subgroup info\n");
3692          return false;
3693       }
3694    } else if (sscreen->info.gfx_level >= GFX9 && sel->stage == MESA_SHADER_GEOMETRY) {
3695       gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
3696    }
3697 
3698    shader->uses_vs_state_provoking_vertex =
3699       sscreen->use_ngg &&
3700       /* Used to convert triangle strips from GS to triangles. */
3701       ((sel->stage == MESA_SHADER_GEOMETRY &&
3702         util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) ||
3703        (sel->stage == MESA_SHADER_VERTEX &&
3704         /* Used to export PrimitiveID from the correct vertex. */
3705         shader->key.ge.mono.u.vs_export_prim_id));
3706 
3707    shader->uses_gs_state_outprim = sscreen->use_ngg &&
3708                                    /* Only used by streamout and the PrimID export in vertex
3709                                     * shaders. */
3710                                    sel->stage == MESA_SHADER_VERTEX &&
3711                                    (si_shader_uses_streamout(shader) ||
3712                                     shader->uses_vs_state_provoking_vertex);
3713 
3714    if (sel->stage == MESA_SHADER_VERTEX) {
3715       shader->uses_base_instance = sel->info.uses_base_instance ||
3716                                    shader->key.ge.mono.instance_divisor_is_one ||
3717                                    shader->key.ge.mono.instance_divisor_is_fetched;
3718    } else if (sel->stage == MESA_SHADER_TESS_CTRL) {
3719       shader->uses_base_instance = shader->previous_stage_sel &&
3720                                    (shader->previous_stage_sel->info.uses_base_instance ||
3721                                     shader->key.ge.mono.instance_divisor_is_one ||
3722                                     shader->key.ge.mono.instance_divisor_is_fetched);
3723    } else if (sel->stage == MESA_SHADER_GEOMETRY) {
3724       shader->uses_base_instance = shader->previous_stage_sel &&
3725                                    (shader->previous_stage_sel->info.uses_base_instance ||
3726                                     shader->key.ge.mono.instance_divisor_is_one ||
3727                                     shader->key.ge.mono.instance_divisor_is_fetched);
3728    }
3729 
3730    si_fix_resource_usage(sscreen, shader);
3731 
3732    /* Upload. */
3733    bool ok = si_shader_binary_upload(sscreen, shader, 0) >= 0;
3734 
3735    shader->complete_shader_binary_size = si_get_shader_binary_size(sscreen, shader);
3736 
3737    si_shader_dump(sscreen, shader, debug, stderr, true);
3738 
3739    if (!ok)
3740       fprintf(stderr, "LLVM failed to upload shader\n");
3741    return ok;
3742 }
3743 
si_shader_binary_clean(struct si_shader_binary * binary)3744 void si_shader_binary_clean(struct si_shader_binary *binary)
3745 {
3746    free((void *)binary->code_buffer);
3747    binary->code_buffer = NULL;
3748 
3749    free(binary->llvm_ir_string);
3750    binary->llvm_ir_string = NULL;
3751 
3752    free((void *)binary->symbols);
3753    binary->symbols = NULL;
3754 
3755    free(binary->uploaded_code);
3756    binary->uploaded_code = NULL;
3757    binary->uploaded_code_size = 0;
3758 }
3759 
si_shader_destroy(struct si_shader * shader)3760 void si_shader_destroy(struct si_shader *shader)
3761 {
3762    si_resource_reference(&shader->bo, NULL);
3763 
3764    if (!shader->is_binary_shared)
3765       si_shader_binary_clean(&shader->binary);
3766 
3767    free(shader->shader_log);
3768 }
3769 
si_get_ps_prolog_args(struct si_shader_args * args,const union si_shader_part_key * key)3770 void si_get_ps_prolog_args(struct si_shader_args *args,
3771                            const union si_shader_part_key *key)
3772 {
3773    memset(args, 0, sizeof(*args));
3774 
3775    const unsigned num_input_sgprs = key->ps_prolog.num_input_sgprs;
3776 
3777    struct ac_arg input_sgprs[num_input_sgprs];
3778    for (unsigned i = 0; i < num_input_sgprs; i++)
3779       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, input_sgprs + i);
3780 
3781    args->internal_bindings = input_sgprs[SI_SGPR_INTERNAL_BINDINGS];
3782    /* Use the absolute location of the input. */
3783    args->ac.prim_mask = input_sgprs[SI_PS_NUM_USER_SGPR];
3784 
3785    ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_sample);
3786    ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_center);
3787    ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_centroid);
3788    /* skip PERSP_PULL_MODEL */
3789    ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_sample);
3790    ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_center);
3791    ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_centroid);
3792    /* skip LINE_STIPPLE_TEX */
3793 
3794    /* POS_X|Y|Z|W_FLOAT */
3795    u_foreach_bit(i, key->ps_prolog.fragcoord_usage_mask)
3796       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[i]);
3797 
3798    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.front_face);
3799    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.ancillary);
3800    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.sample_coverage);
3801    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.pos_fixed_pt);
3802 }
3803 
si_get_ps_epilog_args(struct si_shader_args * args,const union si_shader_part_key * key,struct ac_arg colors[MAX_DRAW_BUFFERS],struct ac_arg * depth,struct ac_arg * stencil,struct ac_arg * sample_mask)3804 void si_get_ps_epilog_args(struct si_shader_args *args,
3805                            const union si_shader_part_key *key,
3806                            struct ac_arg colors[MAX_DRAW_BUFFERS],
3807                            struct ac_arg *depth, struct ac_arg *stencil,
3808                            struct ac_arg *sample_mask)
3809 {
3810    memset(args, 0, sizeof(*args));
3811 
3812    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3813    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3814    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3815    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3816    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3817    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3818    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &args->alpha_reference);
3819 
3820    u_foreach_bit (i, key->ps_epilog.colors_written) {
3821       ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_FLOAT, colors + i);
3822    }
3823 
3824    if (key->ps_epilog.writes_z)
3825       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, depth);
3826 
3827    if (key->ps_epilog.writes_stencil)
3828       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, stencil);
3829 
3830    if (key->ps_epilog.writes_samplemask)
3831       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, sample_mask);
3832 }
3833