• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2012 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "si_shader.h"
8 #include "ac_nir.h"
9 #include "ac_rtld.h"
10 #include "nir.h"
11 #include "nir_builder.h"
12 #include "nir_serialize.h"
13 #include "nir_xfb_info.h"
14 #include "si_pipe.h"
15 #include "si_shader_internal.h"
16 #include "sid.h"
17 #include "tgsi/tgsi_from_mesa.h"
18 #include "util/u_memory.h"
19 #include "util/mesa-sha1.h"
20 #include "util/ralloc.h"
21 #include "util/u_upload_mgr.h"
22 
/* ELF symbol names looked up in compiled shader binaries; presumably patched
 * with the scratch buffer resource descriptor words at upload time.
 * NOTE(review): resolution path appears to go through ac_rtld — confirm. */
static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";

static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";
26 
27 static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
28 static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader);
29 
30 /* Get the number of all interpolated inputs */
si_get_ps_num_interp(struct si_shader * ps)31 unsigned si_get_ps_num_interp(struct si_shader *ps)
32 {
33    unsigned num_colors = !!(ps->info.ps_colors_read & 0x0f) + !!(ps->info.ps_colors_read & 0xf0);
34    unsigned num_interp =
35       ps->info.num_ps_inputs + (ps->key.ps.part.prolog.color_two_side ? num_colors : 0);
36 
37    assert(num_interp <= 32);
38    return MIN2(num_interp, 32);
39 }
40 
41 /** Whether the shader runs as a combination of multiple API shaders */
si_is_multi_part_shader(struct si_shader * shader)42 bool si_is_multi_part_shader(struct si_shader *shader)
43 {
44    if (shader->selector->screen->info.gfx_level <= GFX8 ||
45        shader->selector->stage > MESA_SHADER_GEOMETRY)
46       return false;
47 
48    return shader->key.ge.as_ls || shader->key.ge.as_es ||
49           shader->selector->stage == MESA_SHADER_TESS_CTRL ||
50           shader->selector->stage == MESA_SHADER_GEOMETRY;
51 }
52 
53 /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
si_is_merged_shader(struct si_shader * shader)54 bool si_is_merged_shader(struct si_shader *shader)
55 {
56    if (shader->selector->stage > MESA_SHADER_GEOMETRY || shader->is_gs_copy_shader)
57       return false;
58 
59    return shader->key.ge.as_ngg || si_is_multi_part_shader(shader);
60 }
61 
62 /**
63  * Returns a unique index for a semantic name and index. The index must be
64  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
65  * calculated.
66  */
si_shader_io_get_unique_index(unsigned semantic)67 unsigned si_shader_io_get_unique_index(unsigned semantic)
68 {
69    switch (semantic) {
70    case VARYING_SLOT_POS:
71       return SI_UNIQUE_SLOT_POS;
72    default:
73       if (semantic >= VARYING_SLOT_VAR0 && semantic <= VARYING_SLOT_VAR31)
74          return SI_UNIQUE_SLOT_VAR0 + (semantic - VARYING_SLOT_VAR0);
75 
76       if (semantic >= VARYING_SLOT_VAR0_16BIT && semantic <= VARYING_SLOT_VAR15_16BIT)
77          return SI_UNIQUE_SLOT_VAR0_16BIT + (semantic - VARYING_SLOT_VAR0_16BIT);
78 
79       assert(!"invalid generic index");
80       return 0;
81 
82    /* Legacy desktop GL varyings. */
83    case VARYING_SLOT_FOGC:
84       return SI_UNIQUE_SLOT_FOGC;
85    case VARYING_SLOT_COL0:
86       return SI_UNIQUE_SLOT_COL0;
87    case VARYING_SLOT_COL1:
88       return SI_UNIQUE_SLOT_COL1;
89    case VARYING_SLOT_BFC0:
90       return SI_UNIQUE_SLOT_BFC0;
91    case VARYING_SLOT_BFC1:
92       return SI_UNIQUE_SLOT_BFC1;
93    case VARYING_SLOT_TEX0:
94    case VARYING_SLOT_TEX1:
95    case VARYING_SLOT_TEX2:
96    case VARYING_SLOT_TEX3:
97    case VARYING_SLOT_TEX4:
98    case VARYING_SLOT_TEX5:
99    case VARYING_SLOT_TEX6:
100    case VARYING_SLOT_TEX7:
101       return SI_UNIQUE_SLOT_TEX0 + (semantic - VARYING_SLOT_TEX0);
102    case VARYING_SLOT_CLIP_VERTEX:
103       return SI_UNIQUE_SLOT_CLIP_VERTEX;
104 
105    /* Varyings present in both GLES and desktop GL. */
106    case VARYING_SLOT_CLIP_DIST0:
107       return SI_UNIQUE_SLOT_CLIP_DIST0;
108    case VARYING_SLOT_CLIP_DIST1:
109       return SI_UNIQUE_SLOT_CLIP_DIST1;
110    case VARYING_SLOT_PSIZ:
111       return SI_UNIQUE_SLOT_PSIZ;
112    case VARYING_SLOT_LAYER:
113       return SI_UNIQUE_SLOT_LAYER;
114    case VARYING_SLOT_VIEWPORT:
115       return SI_UNIQUE_SLOT_VIEWPORT;
116    case VARYING_SLOT_PRIMITIVE_ID:
117       return SI_UNIQUE_SLOT_PRIMITIVE_ID;
118    }
119 }
120 
declare_streamout_params(struct si_shader_args * args,struct si_shader * shader)121 static void declare_streamout_params(struct si_shader_args *args, struct si_shader *shader)
122 {
123    struct si_shader_selector *sel = shader->selector;
124 
125    if (shader->selector->screen->info.gfx_level >= GFX11) {
126       /* NGG streamout. */
127       if (sel->stage == MESA_SHADER_TESS_EVAL)
128          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
129       return;
130    }
131 
132    /* Streamout SGPRs. */
133    if (si_shader_uses_streamout(shader)) {
134       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_config);
135       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_write_index);
136 
137       /* A streamout buffer offset is loaded if the stride is non-zero. */
138       for (int i = 0; i < 4; i++) {
139          if (!sel->info.base.xfb_stride[i])
140             continue;
141 
142          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_offset[i]);
143       }
144    } else if (sel->stage == MESA_SHADER_TESS_EVAL) {
145       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
146    }
147 }
148 
si_get_max_workgroup_size(const struct si_shader * shader)149 unsigned si_get_max_workgroup_size(const struct si_shader *shader)
150 {
151    gl_shader_stage stage = shader->is_gs_copy_shader ?
152       MESA_SHADER_VERTEX : shader->selector->stage;
153 
154    switch (stage) {
155    case MESA_SHADER_VERTEX:
156    case MESA_SHADER_TESS_EVAL:
157       /* Use the largest workgroup size for streamout */
158       if (shader->key.ge.as_ngg)
159          return si_shader_uses_streamout(shader) ? 256 : 128;
160 
161       /* As part of merged shader. */
162       return shader->selector->screen->info.gfx_level >= GFX9 &&
163          (shader->key.ge.as_ls || shader->key.ge.as_es) ? 128 : 0;
164 
165    case MESA_SHADER_TESS_CTRL:
166       /* Return this so that LLVM doesn't remove s_barrier
167        * instructions on chips where we use s_barrier. */
168       return shader->selector->screen->info.gfx_level >= GFX7 ? 128 : 0;
169 
170    case MESA_SHADER_GEOMETRY:
171       /* GS can always generate up to 256 vertices. */
172       return shader->selector->screen->info.gfx_level >= GFX9 ? 256 : 0;
173 
174    case MESA_SHADER_COMPUTE:
175       break; /* see below */
176 
177    default:
178       return 0;
179    }
180 
181    /* Compile a variable block size using the maximum variable size. */
182    if (shader->selector->info.base.workgroup_size_variable)
183       return SI_MAX_VARIABLE_THREADS_PER_BLOCK;
184 
185    uint16_t *local_size = shader->selector->info.base.workgroup_size;
186    unsigned max_work_group_size = (uint32_t)local_size[0] *
187                                   (uint32_t)local_size[1] *
188                                   (uint32_t)local_size[2];
189    assert(max_work_group_size);
190    return max_work_group_size;
191 }
192 
declare_const_and_shader_buffers(struct si_shader_args * args,struct si_shader * shader,bool assign_params)193 static void declare_const_and_shader_buffers(struct si_shader_args *args,
194                                              struct si_shader *shader,
195                                              bool assign_params)
196 {
197    enum ac_arg_type const_shader_buf_type;
198 
199    if (shader->selector->info.base.num_ubos == 1 &&
200        shader->selector->info.base.num_ssbos == 0)
201       const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
202    else
203       const_shader_buf_type = AC_ARG_CONST_DESC_PTR;
204 
205    ac_add_arg(
206       &args->ac, AC_ARG_SGPR, 1, const_shader_buf_type,
207       assign_params ? &args->const_and_shader_buffers : &args->other_const_and_shader_buffers);
208 }
209 
declare_samplers_and_images(struct si_shader_args * args,bool assign_params)210 static void declare_samplers_and_images(struct si_shader_args *args, bool assign_params)
211 {
212    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
213               assign_params ? &args->samplers_and_images : &args->other_samplers_and_images);
214 }
215 
/* Declare the per-stage descriptor pointers. Arguments are added in
 * declaration order, so const/shader buffers always precede
 * samplers/images in the SGPR layout. */
static void declare_per_stage_desc_pointers(struct si_shader_args *args,
                                            struct si_shader *shader,
                                            bool assign_params)
{
   declare_const_and_shader_buffers(args, shader, assign_params);
   declare_samplers_and_images(args, assign_params);
}
223 
declare_global_desc_pointers(struct si_shader_args * args)224 static void declare_global_desc_pointers(struct si_shader_args *args)
225 {
226    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->internal_bindings);
227    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
228               &args->bindless_samplers_and_images);
229 }
230 
declare_vb_descriptor_input_sgprs(struct si_shader_args * args,struct si_shader * shader)231 static void declare_vb_descriptor_input_sgprs(struct si_shader_args *args,
232                                               struct si_shader *shader)
233 {
234    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->ac.vertex_buffers);
235 
236    unsigned num_vbos_in_user_sgprs = shader->selector->info.num_vbos_in_user_sgprs;
237    if (num_vbos_in_user_sgprs) {
238       unsigned user_sgprs = args->ac.num_sgprs_used;
239 
240       if (si_is_merged_shader(shader))
241          user_sgprs -= 8;
242       assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
243 
244       /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
245       for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
246          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
247 
248       assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(args->vb_descriptors));
249       for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
250          ac_add_arg(&args->ac, AC_ARG_SGPR, 4, AC_ARG_INT, &args->vb_descriptors[i]);
251    }
252 }
253 
declare_vs_input_vgprs(struct si_shader_args * args,struct si_shader * shader)254 static void declare_vs_input_vgprs(struct si_shader_args *args, struct si_shader *shader)
255 {
256    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vertex_id);
257    if (shader->key.ge.as_ls) {
258       if (shader->selector->screen->info.gfx_level >= GFX11) {
259          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
260          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
261          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
262       } else if (shader->selector->screen->info.gfx_level >= GFX10) {
263          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id);
264          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
265          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
266       } else {
267          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id);
268          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
269          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
270       }
271    } else if (shader->selector->screen->info.gfx_level >= GFX10) {
272       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
273       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
274                  /* user vgpr or PrimID (legacy) */
275                  shader->key.ge.as_ngg ? NULL : &args->ac.vs_prim_id);
276       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
277    } else {
278       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
279       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_prim_id);
280       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
281    }
282 }
283 
/* Declare the SGPR inputs of a blit vertex shader.
 *
 * The first three SGPRs (packed corner coordinates and depth) are always
 * present. Depending on blit_sgprs_amd, either 4 color SGPRs or 6 texcoord
 * SGPRs follow. On GFX11+ one extra SGPR carrying the attribute ring
 * address is appended, which also shifts the blit_sgprs_amd count by one —
 * hence the "+ has_attribute_ring_address" in both comparisons below.
 */
static void declare_vs_blit_inputs(struct si_shader *shader, struct si_shader_args *args)
{
   bool has_attribute_ring_address = shader->selector->screen->info.gfx_level >= GFX11;

   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_blit_inputs); /* i16 x1, y1 */
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);                  /* i16 x2, y2 */
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL);                /* depth */

   if (shader->selector->info.base.vs.blit_sgprs_amd ==
       SI_VS_BLIT_SGPRS_POS_COLOR + has_attribute_ring_address) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
      if (has_attribute_ring_address)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* attribute ring address */
   } else if (shader->selector->info.base.vs.blit_sgprs_amd ==
              SI_VS_BLIT_SGPRS_POS_TEXCOORD + has_attribute_ring_address) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
      if (has_attribute_ring_address)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* attribute ring address */
   }
}
312 
declare_tes_input_vgprs(struct si_shader_args * args)313 static void declare_tes_input_vgprs(struct si_shader_args *args)
314 {
315    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_u);
316    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_v);
317    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_rel_patch_id);
318    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_patch_id);
319 }
320 
enum
{
   /* Convenient merged shader definitions.
    * These extend gl_shader_stage with the merged HW cases used by the
    * stage_case switch in si_init_shader_args; starting the values at
    * MESA_ALL_SHADER_STAGES keeps them from colliding with real stages. */
   SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES,
   SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};
327 
si_add_arg_checked(struct ac_shader_args * args,enum ac_arg_regfile file,unsigned registers,enum ac_arg_type type,struct ac_arg * arg,unsigned idx)328 void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
329                         enum ac_arg_type type, struct ac_arg *arg, unsigned idx)
330 {
331    assert(args->arg_count == idx);
332    ac_add_arg(args, file, registers, type, arg);
333 }
334 
si_init_shader_args(struct si_shader * shader,struct si_shader_args * args)335 void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args)
336 {
337    unsigned i, num_returns, num_return_sgprs;
338    unsigned num_prolog_vgprs = 0;
339    struct si_shader_selector *sel = shader->selector;
340    unsigned stage = shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : sel->stage;
341    unsigned stage_case = stage;
342 
343    memset(args, 0, sizeof(*args));
344 
345    /* Set MERGED shaders. */
346    if (sel->screen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY) {
347       if (shader->key.ge.as_ls || stage == MESA_SHADER_TESS_CTRL)
348          stage_case = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
349       else if (shader->key.ge.as_es || shader->key.ge.as_ngg || stage == MESA_SHADER_GEOMETRY)
350          stage_case = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
351    }
352 
353    switch (stage_case) {
354    case MESA_SHADER_VERTEX:
355       declare_global_desc_pointers(args);
356 
357       if (sel->info.base.vs.blit_sgprs_amd) {
358          declare_vs_blit_inputs(shader, args);
359       } else {
360          declare_per_stage_desc_pointers(args, shader, true);
361          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
362 
363          if (shader->is_gs_copy_shader) {
364             declare_streamout_params(args, shader);
365          } else {
366             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
367             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
368             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
369             declare_vb_descriptor_input_sgprs(args, shader);
370 
371             if (shader->key.ge.as_es) {
372                ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset);
373             } else if (shader->key.ge.as_ls) {
374                /* no extra parameters */
375             } else {
376                declare_streamout_params(args, shader);
377             }
378          }
379       }
380 
381       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
382       if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11)
383          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
384 
385       /* VGPRs */
386       declare_vs_input_vgprs(args, shader);
387 
388       break;
389 
390    case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
391       declare_global_desc_pointers(args);
392       declare_per_stage_desc_pointers(args, shader, true);
393       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
394       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
395       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
396       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
397       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
398 
399       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
400       if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11)
401          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
402 
403       /* VGPRs */
404       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
405       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);
406 
407       /* For monolithic shaders, the TCS epilog code is generated by
408        * ac_nir_lower_hs_outputs_to_mem.
409        */
410       if (!shader->is_monolithic) {
411          /* param_tcs_offchip_offset and param_tcs_factor_offset are
412           * placed after the user SGPRs.
413           */
414          for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
415             ac_add_return(&args->ac, AC_ARG_SGPR);
416          for (i = 0; i < 11; i++)
417             ac_add_return(&args->ac, AC_ARG_VGPR);
418       }
419       break;
420 
421    case SI_SHADER_MERGED_VERTEX_TESSCTRL:
422       /* Merged stages have 8 system SGPRs at the beginning. */
423       /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
424       /* Gfx11+:  SPI_SHADER_PGM_LO/HI_HS */
425       declare_per_stage_desc_pointers(args, shader, stage == MESA_SHADER_TESS_CTRL);
426       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
427       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info);
428       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
429       if (sel->screen->info.gfx_level >= GFX11)
430          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_wave_id);
431       else
432          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
433       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
434       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
435 
436       declare_global_desc_pointers(args);
437       declare_per_stage_desc_pointers(args, shader, stage == MESA_SHADER_VERTEX);
438 
439       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
440       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
441       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
442       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
443       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
444       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
445 
446       /* VGPRs (first TCS, then VS) */
447       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
448       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);
449 
450       if (stage == MESA_SHADER_VERTEX) {
451          declare_vs_input_vgprs(args, shader);
452 
453          /* Need to keep LS/HS arg index same for shared args when ACO,
454           * so this is not able to be before shared VGPRs.
455           */
456          declare_vb_descriptor_input_sgprs(args, shader);
457 
458          /* LS return values are inputs to the TCS main shader part. */
459          if (!shader->is_monolithic || shader->key.ge.opt.same_patch_vertices) {
460             for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
461                ac_add_return(&args->ac, AC_ARG_SGPR);
462             for (i = 0; i < 2; i++)
463                ac_add_return(&args->ac, AC_ARG_VGPR);
464 
465             /* VS outputs passed via VGPRs to TCS. */
466             if (shader->key.ge.opt.same_patch_vertices && !sel->screen->use_aco) {
467                unsigned num_outputs = util_last_bit64(shader->selector->info.outputs_written_before_tes_gs);
468                for (i = 0; i < num_outputs * 4; i++)
469                   ac_add_return(&args->ac, AC_ARG_VGPR);
470             }
471          }
472       } else {
473          /* TCS inputs are passed via VGPRs from VS. */
474          if (shader->key.ge.opt.same_patch_vertices && !sel->screen->use_aco) {
475             unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->info.outputs_written_before_tes_gs);
476             for (i = 0; i < num_inputs * 4; i++)
477                ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
478          }
479 
480          /* For monolithic shaders, the TCS epilog code is generated by
481           * ac_nir_lower_hs_outputs_to_mem.
482           */
483          if (!shader->is_monolithic) {
484             /* TCS return values are inputs to the TCS epilog.
485              *
486              * param_tcs_offchip_offset, param_tcs_factor_offset,
487              * param_tcs_offchip_layout, and internal_bindings
488              * should be passed to the epilog.
489              */
490             for (i = 0; i <= 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR; i++)
491                ac_add_return(&args->ac, AC_ARG_SGPR);
492             for (i = 0; i < 11; i++)
493                ac_add_return(&args->ac, AC_ARG_VGPR);
494          }
495       }
496       break;
497 
498    case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
499       /* Merged stages have 8 system SGPRs at the beginning. */
500       /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
501       /* Gfx11+:  SPI_SHADER_PGM_LO/HI_GS */
502       declare_per_stage_desc_pointers(args, shader, stage == MESA_SHADER_GEOMETRY);
503 
504       if (shader->key.ge.as_ngg)
505          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_tg_info);
506       else
507          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset);
508 
509       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info);
510       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
511       if (sel->screen->info.gfx_level >= GFX11)
512          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_attr_offset);
513       else
514          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
515       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
516       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
517 
518       declare_global_desc_pointers(args);
519       if (stage != MESA_SHADER_VERTEX || !sel->info.base.vs.blit_sgprs_amd) {
520          declare_per_stage_desc_pointers(
521             args, shader, (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL));
522       }
523 
524       if (stage == MESA_SHADER_VERTEX && sel->info.base.vs.blit_sgprs_amd) {
525          declare_vs_blit_inputs(shader, args);
526       } else {
527          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
528 
529          if (stage == MESA_SHADER_VERTEX) {
530             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
531             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
532             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
533          } else if (stage == MESA_SHADER_TESS_EVAL) {
534             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
535             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
536             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
537          } else {
538             /* GS */
539             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
540             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
541             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
542          }
543 
544          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->small_prim_cull_info);
545          if (sel->screen->info.gfx_level >= GFX11)
546             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->gs_attr_address);
547          else
548             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
549       }
550 
551       /* VGPRs (first GS, then VS/TES) */
552       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
553       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
554       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
555       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
556       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);
557 
558       if (stage == MESA_SHADER_VERTEX) {
559          declare_vs_input_vgprs(args, shader);
560 
561          /* Need to keep ES/GS arg index same for shared args when ACO,
562           * so this is not able to be before shared VGPRs.
563           */
564          if (!sel->info.base.vs.blit_sgprs_amd)
565             declare_vb_descriptor_input_sgprs(args, shader);
566       } else if (stage == MESA_SHADER_TESS_EVAL) {
567          declare_tes_input_vgprs(args);
568       }
569 
570       if (shader->key.ge.as_es && !shader->is_monolithic &&
571           (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL)) {
572          /* ES return values are inputs to GS. */
573          for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
574             ac_add_return(&args->ac, AC_ARG_SGPR);
575          for (i = 0; i < 5; i++)
576             ac_add_return(&args->ac, AC_ARG_VGPR);
577       }
578       break;
579 
580    case MESA_SHADER_TESS_EVAL:
581       declare_global_desc_pointers(args);
582       declare_per_stage_desc_pointers(args, shader, true);
583       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
584       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
585       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
586 
587       if (shader->key.ge.as_es) {
588          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
589          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
590          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset);
591       } else {
592          declare_streamout_params(args, shader);
593          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
594       }
595 
596       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
597       if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11)
598          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
599 
600       /* VGPRs */
601       declare_tes_input_vgprs(args);
602       break;
603 
604    case MESA_SHADER_GEOMETRY:
605       declare_global_desc_pointers(args);
606       declare_per_stage_desc_pointers(args, shader, true);
607       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset);
608       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_wave_id);
609 
610       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
611       if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11)
612          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
613 
614       /* VGPRs */
615       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
616       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
617       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
618       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);
619       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[3]);
620       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[4]);
621       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[5]);
622       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
623       break;
624 
625    case MESA_SHADER_FRAGMENT:
626       declare_global_desc_pointers(args);
627       declare_per_stage_desc_pointers(args, shader, true);
628       si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->alpha_reference,
629                          SI_PARAM_ALPHA_REF);
630       si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.prim_mask,
631                          SI_PARAM_PRIM_MASK);
632 
633       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_sample,
634                          SI_PARAM_PERSP_SAMPLE);
635       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_center,
636                          SI_PARAM_PERSP_CENTER);
637       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_centroid,
638                          SI_PARAM_PERSP_CENTROID);
639       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);
640       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_sample,
641                          SI_PARAM_LINEAR_SAMPLE);
642       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_center,
643                          SI_PARAM_LINEAR_CENTER);
644       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_centroid,
645                          SI_PARAM_LINEAR_CENTROID);
646       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
647       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[0],
648                          SI_PARAM_POS_X_FLOAT);
649       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[1],
650                          SI_PARAM_POS_Y_FLOAT);
651       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[2],
652                          SI_PARAM_POS_Z_FLOAT);
653       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[3],
654                          SI_PARAM_POS_W_FLOAT);
655       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.front_face,
656                          SI_PARAM_FRONT_FACE);
657       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.ancillary,
658                          SI_PARAM_ANCILLARY);
659       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.sample_coverage,
660                          SI_PARAM_SAMPLE_COVERAGE);
661       si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.pos_fixed_pt,
662                          SI_PARAM_POS_FIXED_PT);
663 
664       if (sel->screen->use_aco) {
665          ac_compact_ps_vgpr_args(&args->ac, shader->config.spi_ps_input_addr);
666 
667          /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
668          if (sel->screen->info.gfx_level < GFX11)
669             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
670       }
671 
672       /* Monolithic PS emit prolog and epilog in NIR directly. */
673       if (!shader->is_monolithic) {
674          /* Color inputs from the prolog. */
675          if (shader->selector->info.colors_read) {
676             unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);
677 
678             for (i = 0; i < num_color_elements; i++)
679                ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, i ? NULL : &args->color_start);
680 
681             num_prolog_vgprs += num_color_elements;
682          }
683 
684          /* Outputs for the epilog. */
685          num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
686          num_returns =
687             num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +
688             shader->selector->info.writes_z + shader->selector->info.writes_stencil +
689             shader->ps.writes_samplemask + 1 /* SampleMaskIn */;
690 
691          for (i = 0; i < num_return_sgprs; i++)
692             ac_add_return(&args->ac, AC_ARG_SGPR);
693          for (; i < num_returns; i++)
694             ac_add_return(&args->ac, AC_ARG_VGPR);
695       }
696       break;
697 
698    case MESA_SHADER_COMPUTE:
699       declare_global_desc_pointers(args);
700       declare_per_stage_desc_pointers(args, shader, true);
701       if (shader->selector->info.uses_grid_size)
702          ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.num_work_groups);
703       if (shader->selector->info.uses_variable_block_size)
704          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->block_size);
705 
706       unsigned cs_user_data_dwords =
707          shader->selector->info.base.cs.user_data_components_amd;
708       if (cs_user_data_dwords) {
709          ac_add_arg(&args->ac, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &args->cs_user_data);
710       }
711 
712       /* Some descriptors can be in user SGPRs. */
713       /* Shader buffers in user SGPRs. */
714       for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) {
715          while (args->ac.num_sgprs_used % 4 != 0)
716             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
717 
718          ac_add_arg(&args->ac, AC_ARG_SGPR, 4, AC_ARG_INT, &args->cs_shaderbuf[i]);
719       }
720       /* Images in user SGPRs. */
721       for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) {
722          unsigned num_sgprs = BITSET_TEST(shader->selector->info.base.image_buffers, i) ? 4 : 8;
723 
724          while (args->ac.num_sgprs_used % num_sgprs != 0)
725             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
726 
727          ac_add_arg(&args->ac, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &args->cs_image[i]);
728       }
729 
730       /* Hardware SGPRs. */
731       for (i = 0; i < 3; i++) {
732          if (shader->selector->info.uses_block_id[i]) {
733             ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.workgroup_ids[i]);
734          }
735       }
736       if (shader->selector->info.uses_tg_size)
737          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tg_size);
738 
739       /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
740       if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11)
741          ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
742 
743       /* Hardware VGPRs. */
744       /* Thread IDs are packed in VGPR0, 10 bits per component or stored in 3 separate VGPRs */
745       if (sel->screen->info.gfx_level >= GFX11 ||
746           (!sel->screen->info.has_graphics && sel->screen->info.family >= CHIP_MI200))
747          ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_ids);
748       else
749          ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, &args->ac.local_invocation_ids);
750       break;
751    default:
752       assert(0 && "unimplemented shader");
753       return;
754    }
755 
756    shader->info.num_input_sgprs = args->ac.num_sgprs_used;
757    shader->info.num_input_vgprs = args->ac.num_vgprs_used;
758 
759    assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
760    shader->info.num_input_vgprs -= num_prolog_vgprs;
761 }
762 
/* Return the LDS allocation granularity in bytes for the given HW generation
 * and shader stage (GFX11+ PS allocates LDS in 1024-byte units; GFX7+ in 512;
 * older chips in 256).
 */
static unsigned get_lds_granularity(struct si_screen *screen, gl_shader_stage stage)
{
   if (screen->info.gfx_level >= GFX11 && stage == MESA_SHADER_FRAGMENT)
      return 1024;

   return screen->info.gfx_level >= GFX7 ? 512 : 256;
}
768 
/* Open a linked view of all ELF shader parts (prolog, previous merged stage,
 * main part, epilog) through ac_rtld.
 *
 * Also declares the shared LDS symbols ("esgs_ring", "ngg_emit") needed by
 * GFX9+ merged/NGG shaders and, if the linked binary uses LDS, converts the
 * byte size into shader->config.lds_size in HW allocation-granularity units.
 *
 * The caller must ac_rtld_close(rtld) when done.
 * Returns the result of ac_rtld_open.
 */
bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
                           struct ac_rtld_binary *rtld)
{
   const struct si_shader_selector *sel = shader->selector;
   const char *part_elfs[5];
   size_t part_sizes[5];
   unsigned num_parts = 0;

/* Append one shader part's ELF to the part arrays if it exists. */
#define add_part(shader_or_part)                                                                   \
   if (shader_or_part) {                                                                           \
      part_elfs[num_parts] = (shader_or_part)->binary.code_buffer;                                 \
      part_sizes[num_parts] = (shader_or_part)->binary.code_size;                                  \
      num_parts++;                                                                                 \
   }

   add_part(shader->prolog);
   add_part(shader->previous_stage);
   add_part(shader);
   add_part(shader->epilog);

#undef add_part

   struct ac_rtld_symbol lds_symbols[2];
   unsigned num_lds_symbols = 0;

   /* NOTE(review): "sel" is NULL-checked here but dereferenced unconditionally
    * below (sel->stage) — confirm whether sel can actually be NULL; if not,
    * this check is dead.
    */
   if (sel && screen->info.gfx_level >= GFX9 && !shader->is_gs_copy_shader &&
       (sel->stage == MESA_SHADER_GEOMETRY ||
        (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg))) {
      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
      sym->name = "esgs_ring";
      sym->size = shader->gs_info.esgs_ring_size * 4; /* dwords -> bytes */
      sym->align = 64 * 1024;
   }

   if (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
      sym->name = "ngg_emit";
      sym->size = shader->ngg.ngg_emit_size * 4; /* dwords -> bytes */
      sym->align = 4;
   }

   bool ok = ac_rtld_open(
      rtld, (struct ac_rtld_open_info){.info = &screen->info,
                                       .options =
                                          {
                                             .halt_at_entry = screen->options.halt_shaders,
                                          },
                                       .shader_type = sel->stage,
                                       .wave_size = shader->wave_size,
                                       .num_parts = num_parts,
                                       .elf_ptrs = part_elfs,
                                       .elf_sizes = part_sizes,
                                       .num_shared_lds_symbols = num_lds_symbols,
                                       .shared_lds_symbols = lds_symbols});

   /* Convert linked LDS bytes into the HW's per-stage allocation units. */
   if (rtld->lds_size > 0) {
      unsigned alloc_granularity = get_lds_granularity(screen, sel->stage);
      shader->config.lds_size = DIV_ROUND_UP(rtld->lds_size, alloc_granularity);
   }

   return ok;
}
831 
get_shader_binaries(struct si_shader * shader,struct si_shader_binary * bin[4])832 static unsigned get_shader_binaries(struct si_shader *shader, struct si_shader_binary *bin[4])
833 {
834    unsigned num_bin = 0;
835 
836    if (shader->prolog)
837       bin[num_bin++] = &shader->prolog->binary;
838 
839    if (shader->previous_stage)
840       bin[num_bin++] = &shader->previous_stage->binary;
841 
842    bin[num_bin++] = &shader->binary;
843 
844    if (shader->epilog)
845       bin[num_bin++] = &shader->epilog->binary;
846 
847    return num_bin;
848 }
849 
si_get_shader_binary_size(struct si_screen * screen,struct si_shader * shader)850 static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
851 {
852    if (shader->binary.type == SI_SHADER_BINARY_ELF) {
853       struct ac_rtld_binary rtld;
854       si_shader_binary_open(screen, shader, &rtld);
855       uint64_t size = rtld.exec_size;
856       ac_rtld_close(&rtld);
857       return size;
858    } else {
859       struct si_shader_binary *bin[4];
860       unsigned num_bin = get_shader_binaries(shader, bin);
861 
862       unsigned size = 0;
863       for (unsigned i = 0; i < num_bin; i++) {
864          assert(bin[i]->type == SI_SHADER_BINARY_RAW);
865          size += bin[i]->exec_size;
866       }
867       return size;
868    }
869 }
870 
si_get_external_symbol(enum amd_gfx_level gfx_level,void * data,const char * name,uint64_t * value)871 bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
872                             uint64_t *value)
873 {
874    uint64_t *scratch_va = data;
875 
876    if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
877       *value = (uint32_t)*scratch_va;
878       return true;
879    }
880    if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
881       /* Enable scratch coalescing. */
882       *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32);
883 
884       if (gfx_level >= GFX11)
885          *value |= S_008F04_SWIZZLE_ENABLE_GFX11(1);
886       else
887          *value |= S_008F04_SWIZZLE_ENABLE_GFX6(1);
888       return true;
889    }
890 
891    return false;
892 }
893 
/* Allocate the shader BO and return a CPU-writable pointer into which
 * "binary_size" bytes of shader code can be written.
 *
 * If dma_upload is set, the pointer is staging memory sub-allocated from the
 * shader-upload aux context's stream uploader (*upload_ctx, *staging and
 * *staging_offset are filled in for post_upload_binary to complete the copy).
 * Otherwise the BO itself is mapped unsynchronized and written directly.
 *
 * Returns NULL on BO-allocation or staging-allocation failure.
 */
static void *pre_upload_binary(struct si_screen *sscreen, struct si_shader *shader,
                               unsigned binary_size, bool dma_upload,
                               struct si_context **upload_ctx,
                               struct pipe_resource **staging,
                               unsigned *staging_offset)
{
   /* Pad so that prefetching past the end of the binary is harmless. */
   unsigned aligned_size = ac_align_shader_binary_for_prefetch(&sscreen->info, binary_size);

   /* Drop any previously uploaded BO before allocating a new one. */
   si_resource_reference(&shader->bo, NULL);
   shader->bo = si_aligned_buffer_create(
      &sscreen->b,
      SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT |
      (dma_upload || sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY) |
      (dma_upload ? PIPE_RESOURCE_FLAG_UNMAPPABLE : 0),
      PIPE_USAGE_IMMUTABLE, align(aligned_size, SI_CPDMA_ALIGNMENT), 256);
   if (!shader->bo)
      return NULL;

   shader->gpu_address = shader->bo->gpu_address;

   if (dma_upload) {
      /* First upload into a staging buffer. */
      *upload_ctx = si_get_aux_context(&sscreen->aux_context.shader_upload);

      void *ret;
      u_upload_alloc((*upload_ctx)->b.stream_uploader, 0, binary_size, 256,
                     staging_offset, staging, &ret);
      if (!ret)
         si_put_aux_context_flush(&sscreen->aux_context.shader_upload);

      return ret;
   } else {
      /* Map the BO directly; unsynchronized is safe because the BO was just
       * created and nothing references it yet. */
      return sscreen->ws->buffer_map(sscreen->ws,
         shader->bo->buf, NULL,
         PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
   }
}
931 
post_upload_binary(struct si_screen * sscreen,struct si_shader * shader,void * code,unsigned code_size,unsigned binary_size,bool dma_upload,struct si_context * upload_ctx,struct pipe_resource * staging,unsigned staging_offset)932 static void post_upload_binary(struct si_screen *sscreen, struct si_shader *shader,
933                                void *code, unsigned code_size,
934                                unsigned binary_size, bool dma_upload,
935                                struct si_context *upload_ctx,
936                                struct pipe_resource *staging,
937                                unsigned staging_offset)
938 {
939    if (sscreen->debug_flags & DBG(SQTT)) {
940       /* Remember the uploaded code */
941       shader->binary.uploaded_code_size = code_size;
942       shader->binary.uploaded_code = malloc(code_size);
943       memcpy(shader->binary.uploaded_code, code, code_size);
944    }
945 
946    if (dma_upload) {
947       /* Then copy from the staging buffer to VRAM.
948        *
949        * We can't use the upload copy in si_buffer_transfer_unmap because that might use
950        * a compute shader, and we can't use shaders in the code that is responsible for making
951        * them available.
952        */
953       si_cp_dma_copy_buffer(upload_ctx, &shader->bo->b.b, staging, 0, staging_offset,
954                             binary_size, SI_OP_SYNC_AFTER, SI_COHERENCY_SHADER,
955                             sscreen->info.gfx_level >= GFX7 ? L2_LRU : L2_BYPASS);
956       upload_ctx->flags |= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_L2;
957 
958 #if 0 /* debug: validate whether the copy was successful */
959       uint32_t *dst_binary = malloc(binary_size);
960       uint32_t *src_binary = (uint32_t*)code;
961       pipe_buffer_read(&upload_ctx->b, &shader->bo->b.b, 0, binary_size, dst_binary);
962       puts("dst_binary == src_binary:");
963       for (unsigned i = 0; i < binary_size / 4; i++) {
964          printf("   %08x == %08x\n", dst_binary[i], src_binary[i]);
965       }
966       free(dst_binary);
967       exit(0);
968 #endif
969 
970       si_put_aux_context_flush(&sscreen->aux_context.shader_upload);
971       pipe_resource_reference(&staging, NULL);
972    } else {
973       sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf);
974    }
975 }
976 
upload_binary_elf(struct si_screen * sscreen,struct si_shader * shader,uint64_t scratch_va,bool dma_upload)977 static bool upload_binary_elf(struct si_screen *sscreen, struct si_shader *shader,
978                               uint64_t scratch_va, bool dma_upload)
979 {
980    struct ac_rtld_binary binary;
981    if (!si_shader_binary_open(sscreen, shader, &binary))
982       return false;
983 
984    struct si_context *upload_ctx = NULL;
985    struct pipe_resource *staging = NULL;
986    unsigned staging_offset = 0;
987 
988    void *rx_ptr = pre_upload_binary(sscreen, shader, binary.rx_size, dma_upload,
989                                     &upload_ctx, &staging, &staging_offset);
990    if (!rx_ptr)
991       return false;
992 
993    /* Upload. */
994    struct ac_rtld_upload_info u = {};
995    u.binary = &binary;
996    u.get_external_symbol = si_get_external_symbol;
997    u.cb_data = &scratch_va;
998    u.rx_va = shader->bo->gpu_address;
999    u.rx_ptr = rx_ptr;
1000 
1001    int size = ac_rtld_upload(&u);
1002 
1003    post_upload_binary(sscreen, shader, rx_ptr, size, binary.rx_size, dma_upload,
1004                       upload_ctx, staging, staging_offset);
1005 
1006    ac_rtld_close(&binary);
1007 
1008    return size >= 0;
1009 }
1010 
calculate_needed_lds_size(struct si_screen * sscreen,struct si_shader * shader)1011 static void calculate_needed_lds_size(struct si_screen *sscreen, struct si_shader *shader)
1012 {
1013    gl_shader_stage stage =
1014       shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : shader->selector->stage;
1015 
1016    if (sscreen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY &&
1017        (stage == MESA_SHADER_GEOMETRY || shader->key.ge.as_ngg)) {
1018       unsigned size_in_dw = shader->gs_info.esgs_ring_size;
1019 
1020       if (stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)
1021          size_in_dw += shader->ngg.ngg_emit_size;
1022 
1023       if (shader->key.ge.as_ngg) {
1024          unsigned scratch_dw_size = gfx10_ngg_get_scratch_dw_size(shader);
1025          if (scratch_dw_size) {
1026             /* scratch base address needs to be 8 byte aligned */
1027             size_in_dw = ALIGN(size_in_dw, 2);
1028             size_in_dw += scratch_dw_size;
1029          }
1030       }
1031 
1032       shader->config.lds_size =
1033          DIV_ROUND_UP(size_in_dw * 4, get_lds_granularity(sscreen, stage));
1034    }
1035 }
1036 
/* Upload RAW (non-ELF, ACO-produced) shader binaries.
 *
 * Layout in the BO: all parts' exec code is packed first, in part order,
 * followed by all parts' const data, also in part order. Symbols that point
 * into a part's const data are patched via si_aco_resolve_symbols to account
 * for the code of later parts being inserted between a part's exec section
 * and its data. Always returns true once the mapping succeeded.
 */
static bool upload_binary_raw(struct si_screen *sscreen, struct si_shader *shader,
                              uint64_t scratch_va, bool dma_upload)
{
   struct si_shader_binary *bin[4];
   unsigned num_bin = get_shader_binaries(shader, bin);

   /* Total bytes (code_size) and executable-only bytes (exec_size). */
   unsigned code_size = 0, exec_size = 0;
   for (unsigned i = 0; i < num_bin; i++) {
      assert(bin[i]->type == SI_SHADER_BINARY_RAW);
      code_size += bin[i]->code_size;
      exec_size += bin[i]->exec_size;
   }

   struct si_context *upload_ctx = NULL;
   struct pipe_resource *staging = NULL;
   unsigned staging_offset = 0;

   void *rx_ptr = pre_upload_binary(sscreen, shader, code_size, dma_upload,
                                    &upload_ctx, &staging, &staging_offset);
   if (!rx_ptr)
      return false;

   /* Exec sections start at 0; data sections start after all exec code. */
   unsigned exec_offset = 0, data_offset = exec_size;
   for (unsigned i = 0; i < num_bin; i++) {
      memcpy(rx_ptr + exec_offset, bin[i]->code_buffer, bin[i]->exec_size);

      if (bin[i]->num_symbols) {
         /* Offset needed to add to const data symbol because of inserting other
          * shader part between exec code and const data.
          */
         unsigned const_offset = data_offset - exec_offset - bin[i]->exec_size;

         /* Prolog and epilog have no symbols. */
         struct si_shader *sh = bin[i] == &shader->binary ? shader : shader->previous_stage;
         assert(sh && bin[i] == &sh->binary);

         si_aco_resolve_symbols(sh, rx_ptr + exec_offset, (const uint32_t *)bin[i]->code_buffer,
                                scratch_va, const_offset);
      }

      exec_offset += bin[i]->exec_size;

      /* Trailing const data, if any, goes into the data region. */
      unsigned data_size = bin[i]->code_size - bin[i]->exec_size;
      if (data_size) {
         memcpy(rx_ptr + data_offset, bin[i]->code_buffer + bin[i]->exec_size, data_size);
         data_offset += data_size;
      }
   }

   post_upload_binary(sscreen, shader, rx_ptr, code_size, code_size, dma_upload,
                      upload_ctx, staging, staging_offset);

   calculate_needed_lds_size(sscreen, shader);
   return true;
}
1092 
si_shader_binary_upload(struct si_screen * sscreen,struct si_shader * shader,uint64_t scratch_va)1093 bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
1094                              uint64_t scratch_va)
1095 {
1096    bool dma_upload = !(sscreen->debug_flags & DBG(NO_DMA_SHADERS)) &&
1097                      sscreen->info.has_dedicated_vram && !sscreen->info.all_vram_visible;
1098 
1099    if (shader->binary.type == SI_SHADER_BINARY_ELF) {
1100       return upload_binary_elf(sscreen, shader, scratch_va, dma_upload);
1101    } else {
1102       assert(shader->binary.type == SI_SHADER_BINARY_RAW);
1103       return upload_binary_raw(sscreen, shader, scratch_va, dma_upload);
1104    }
1105 }
1106 
print_disassembly(const char * disasm,size_t nbytes,const char * name,FILE * file,struct util_debug_callback * debug)1107 static void print_disassembly(const char *disasm, size_t nbytes,
1108                               const char *name, FILE *file,
1109                               struct util_debug_callback *debug)
1110 {
1111    if (debug && debug->debug_message) {
1112       /* Very long debug messages are cut off, so send the
1113        * disassembly one line at a time. This causes more
1114        * overhead, but on the plus side it simplifies
1115        * parsing of resulting logs.
1116        */
1117       util_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");
1118 
1119       uint64_t line = 0;
1120       while (line < nbytes) {
1121          int count = nbytes - line;
1122          const char *nl = memchr(disasm + line, '\n', nbytes - line);
1123          if (nl)
1124             count = nl - (disasm + line);
1125 
1126          if (count) {
1127             util_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
1128          }
1129 
1130          line += count + 1;
1131       }
1132 
1133       util_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
1134    }
1135 
1136    if (file) {
1137       fprintf(file, "Shader %s disassembly:\n", name);
1138       fprintf(file, "%*s", (int)nbytes, disasm);
1139    }
1140 }
1141 
/* Dump the disassembly of one shader binary.
 *
 * RAW binaries already carry a disassembly string. ELF binaries are opened
 * with ac_rtld and the ".AMDGPU.disasm" section is printed; if the section is
 * missing or too large for an int, nothing is printed.
 */
static void si_shader_dump_disassembly(struct si_screen *screen,
                                       const struct si_shader_binary *binary,
                                       gl_shader_stage stage, unsigned wave_size,
                                       struct util_debug_callback *debug, const char *name,
                                       FILE *file)
{
   if (binary->type == SI_SHADER_BINARY_RAW) {
      print_disassembly(binary->disasm_string, binary->disasm_size, name, file, debug);
      return;
   }

   struct ac_rtld_binary rtld_binary;

   if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
                                      .info = &screen->info,
                                      .shader_type = stage,
                                      .wave_size = wave_size,
                                      .num_parts = 1,
                                      .elf_ptrs = &binary->code_buffer,
                                      .elf_sizes = &binary->code_size}))
      return;

   const char *disasm;
   size_t nbytes;

   if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
      goto out;

   /* print_disassembly casts nbytes to int for printing. */
   if (nbytes > INT_MAX)
      goto out;

   print_disassembly(disasm, nbytes, name, file, debug);

out:
   ac_rtld_close(&rtld_binary);
}
1178 
/* Estimate the maximum number of waves per SIMD this shader can run with,
 * limited by SGPR count, VGPR count and LDS usage. The result is stored in
 * shader->info.max_simd_waves (reported as Wave64 for fair shader-db stats).
 */
static void si_calculate_max_simd_waves(struct si_shader *shader)
{
   struct si_screen *sscreen = shader->selector->screen;
   struct ac_shader_config *conf = &shader->config;
   unsigned lds_increment = get_lds_granularity(sscreen, shader->selector->stage);
   unsigned lds_per_wave = 0;
   unsigned max_simd_waves;

   max_simd_waves = sscreen->info.max_waves_per_simd;

   /* Compute LDS usage for PS. */
   switch (shader->selector->stage) {
   case MESA_SHADER_FRAGMENT:
      /* The minimum usage per wave is (num_inputs * 48). The maximum
       * usage is (num_inputs * 48 * 16).
       * We can get anything in between and it varies between waves.
       *
       * The 48 bytes per input for a single primitive is equal to
       * 4 bytes/component * 4 components/input * 3 points.
       *
       * Other stages don't know the size at compile time or don't
       * allocate LDS per wave, but instead they do it per thread group.
       */
      lds_per_wave = conf->lds_size * lds_increment +
                     align(shader->info.num_ps_inputs * 48, lds_increment);
      break;
   case MESA_SHADER_COMPUTE: {
         /* Spread the workgroup's LDS over all of its waves. */
         unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
         lds_per_wave = (conf->lds_size * lds_increment) /
                        DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
      }
      break;
   default:;
   }

   /* Compute the per-SIMD wave counts. */
   if (conf->num_sgprs) {
      max_simd_waves =
         MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
   }

   if (conf->num_vgprs) {
      /* GFX 10.3 internally:
       * - aligns VGPRS to 16 for Wave32 and 8 for Wave64
       * - aligns LDS to 1024
       *
       * For shader-db stats, set num_vgprs that the hw actually uses.
       */
      unsigned num_vgprs = conf->num_vgprs;
      if (sscreen->info.gfx_level >= GFX10_3) {
         unsigned real_vgpr_gran = sscreen->info.num_physical_wave64_vgprs_per_simd / 64;
         num_vgprs = util_align_npot(num_vgprs, real_vgpr_gran * (shader->wave_size == 32 ? 2 : 1));
      } else {
         num_vgprs = align(num_vgprs, shader->wave_size == 32 ? 8 : 4);
      }

      /* Always print wave limits as Wave64, so that we can compare
       * Wave32 and Wave64 with shader-db fairly. */
      unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
      max_simd_waves = MIN2(max_simd_waves, max_vgprs / num_vgprs);
   }

   /* NOTE(review): dividing workgroup LDS by 4 presumably converts to
    * per-SIMD capacity (4 SIMDs per CU/WGP) — confirm against hw docs. */
   unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
   if (lds_per_wave)
      max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);

   shader->info.max_simd_waves = max_simd_waves;
}
1247 
/* Emit a one-line "Shader Stats:" message for shader-db consumption through
 * the debug callback, optionally preceded by the disassembly.
 *
 * The per-stage output counts deliberately exclude position exports — only
 * parameter exports are interesting for performance tuning.
 */
void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
                                        struct util_debug_callback *debug)
{
   const struct ac_shader_config *conf = &shader->config;
   /* Indexed by gl_shader_stage — assumes VS..CS enum order; see stage use below. */
   static const char *stages[] = {"VS", "TCS", "TES", "GS", "PS", "CS"};

   if (screen->options.debug_disassembly)
      si_shader_dump_disassembly(screen, &shader->binary, shader->selector->stage,
                                 shader->wave_size, debug, "main", NULL);

   unsigned num_ls_outputs = 0;
   unsigned num_hs_outputs = 0;
   unsigned num_es_outputs = 0;
   unsigned num_gs_outputs = 0;
   unsigned num_vs_outputs = 0;
   unsigned num_ps_outputs = 0;

   if (shader->selector->stage <= MESA_SHADER_GEOMETRY) {
      /* This doesn't include pos exports because only param exports are interesting
       * for performance and can be optimized.
       */
      if (shader->key.ge.as_ls)
         num_ls_outputs = shader->selector->info.lshs_vertex_stride / 16;
      else if (shader->selector->stage == MESA_SHADER_TESS_CTRL)
         num_hs_outputs = util_last_bit64(shader->selector->info.outputs_written_before_tes_gs);
      else if (shader->key.ge.as_es)
         num_es_outputs = shader->selector->info.esgs_vertex_stride / 16;
      else if (shader->gs_copy_shader)
         num_gs_outputs = shader->gs_copy_shader->info.nr_param_exports;
      else if (shader->selector->stage == MESA_SHADER_GEOMETRY)
         num_gs_outputs = shader->info.nr_param_exports;
      else if (shader->selector->stage == MESA_SHADER_VERTEX ||
               shader->selector->stage == MESA_SHADER_TESS_EVAL)
         num_vs_outputs = shader->info.nr_param_exports;
      else
         unreachable("invalid shader key");
   } else if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
      /* Color outputs plus one combined slot if any of Z/stencil/samplemask
       * is written. */
      num_ps_outputs = util_bitcount(shader->selector->info.colors_written) +
                       (shader->selector->info.writes_z ||
                        shader->selector->info.writes_stencil ||
                        shader->ps.writes_samplemask);
   }

   util_debug_message(debug, SHADER_INFO,
                      "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
                      "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
                      "Spilled VGPRs: %d PrivMem VGPRs: %d LSOutputs: %u HSOutputs: %u "
                      "HSPatchOuts: %u ESOutputs: %u GSOutputs: %u VSOutputs: %u PSOutputs: %u "
                      "InlineUniforms: %u DivergentLoop: %u (%s, W%u)",
                      conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
                      conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
                      conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
                      num_ls_outputs, num_hs_outputs,
                      util_last_bit64(shader->selector->info.patch_outputs_written),
                      num_es_outputs, num_gs_outputs, num_vs_outputs, num_ps_outputs,
                      shader->selector->info.base.num_inlinable_uniforms,
                      shader->selector->info.has_divergent_loop,
                      stages[shader->selector->stage], shader->wave_size);
}
1307 
/* Return whether shader dumping of the given kind is enabled for "stage".
 *
 * Requires both the per-stage debug flag (bit "stage": DBG(VS)..DBG(CS)) and
 * at least one of the dump-type flags listed in the filter table to be set in
 * sscreen->debug_flags.
 */
bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage,
                        enum si_shader_dump_type dump_type)
{
   /* const added: this lookup table is read-only; lets it live in .rodata. */
   static const uint64_t filter[] = {
      [SI_DUMP_SHADER_KEY] = DBG(NIR) | DBG(INIT_LLVM) | DBG(LLVM) | DBG(INIT_ACO) | DBG(ACO) | DBG(ASM),
      [SI_DUMP_INIT_NIR] = DBG(INIT_NIR),
      [SI_DUMP_NIR] = DBG(NIR),
      [SI_DUMP_INIT_LLVM_IR] = DBG(INIT_LLVM),
      [SI_DUMP_LLVM_IR] = DBG(LLVM),
      [SI_DUMP_INIT_ACO_IR] = DBG(INIT_ACO),
      [SI_DUMP_ACO_IR] = DBG(ACO),
      [SI_DUMP_ASM] = DBG(ASM),
      [SI_DUMP_STATS] = DBG(STATS),
      [SI_DUMP_ALWAYS] = DBG(VS) | DBG(TCS) | DBG(TES) | DBG(GS) | DBG(PS) | DBG(CS),
   };
   assert(dump_type < ARRAY_SIZE(filter));

   return sscreen->debug_flags & (1 << stage) &&
          sscreen->debug_flags & filter[dump_type];
}
1328 
si_shader_dump_stats(struct si_screen * sscreen,struct si_shader * shader,FILE * file,bool check_debug_option)1329 static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
1330                                  bool check_debug_option)
1331 {
1332    const struct ac_shader_config *conf = &shader->config;
1333 
1334    if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
1335       fprintf(file,
1336               "*** SHADER CONFIG ***\n"
1337               "SPI_PS_INPUT_ADDR = 0x%04x\n"
1338               "SPI_PS_INPUT_ENA  = 0x%04x\n",
1339               conf->spi_ps_input_addr, conf->spi_ps_input_ena);
1340    }
1341 
1342    fprintf(file,
1343            "*** SHADER STATS ***\n"
1344            "SGPRS: %d\n"
1345            "VGPRS: %d\n"
1346            "Spilled SGPRs: %d\n"
1347            "Spilled VGPRs: %d\n"
1348            "Private memory VGPRs: %d\n"
1349            "Code Size: %d bytes\n"
1350            "LDS: %d bytes\n"
1351            "Scratch: %d bytes per wave\n"
1352            "Max Waves: %d\n"
1353            "********************\n\n\n",
1354            conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
1355            shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
1356            conf->lds_size * get_lds_granularity(sscreen, shader->selector->stage),
1357            conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
1358 }
1359 
si_get_shader_name(const struct si_shader * shader)1360 const char *si_get_shader_name(const struct si_shader *shader)
1361 {
1362    switch (shader->selector->stage) {
1363    case MESA_SHADER_VERTEX:
1364       if (shader->key.ge.as_es)
1365          return "Vertex Shader as ES";
1366       else if (shader->key.ge.as_ls)
1367          return "Vertex Shader as LS";
1368       else if (shader->key.ge.as_ngg)
1369          return "Vertex Shader as ESGS";
1370       else
1371          return "Vertex Shader as VS";
1372    case MESA_SHADER_TESS_CTRL:
1373       return "Tessellation Control Shader";
1374    case MESA_SHADER_TESS_EVAL:
1375       if (shader->key.ge.as_es)
1376          return "Tessellation Evaluation Shader as ES";
1377       else if (shader->key.ge.as_ngg)
1378          return "Tessellation Evaluation Shader as ESGS";
1379       else
1380          return "Tessellation Evaluation Shader as VS";
1381    case MESA_SHADER_GEOMETRY:
1382       if (shader->is_gs_copy_shader)
1383          return "GS Copy Shader as VS";
1384       else
1385          return "Geometry Shader";
1386    case MESA_SHADER_FRAGMENT:
1387       return "Pixel Shader";
1388    case MESA_SHADER_COMPUTE:
1389       return "Compute Shader";
1390    default:
1391       return "Unknown Shader";
1392    }
1393 }
1394 
/* Dump the shader key, LLVM IR (ddebug path only), the disassembly of all
 * shader parts, and statistics to "file".
 *
 * When check_debug_option is true, each section is printed only if the
 * corresponding debug flag enables it (see si_can_dump_shader); when false,
 * everything available is dumped unconditionally.
 */
void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
                    struct util_debug_callback *debug, FILE *file, bool check_debug_option)
{
   gl_shader_stage stage = shader->selector->stage;

   if (!check_debug_option || si_can_dump_shader(sscreen, stage, SI_DUMP_SHADER_KEY))
      si_dump_shader_key(shader, file);

   if (!check_debug_option && shader->binary.llvm_ir_string) {
      /* This is only used with ddebug. */
      if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
         fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
         fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
      }

      fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
      fprintf(file, "%s\n", shader->binary.llvm_ir_string);
   }

   if (!check_debug_option || (si_can_dump_shader(sscreen, stage, SI_DUMP_ASM))) {
      fprintf(file, "\n%s:\n", si_get_shader_name(shader));

      /* Dump the binary parts in execution order: prolog, merged previous
       * stage (if any), main part, epilog.
       */
      if (shader->prolog)
         si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, shader->wave_size, debug,
                                    "prolog", file);
      if (shader->previous_stage)
         si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
                                    shader->wave_size, debug, "previous stage", file);
      si_shader_dump_disassembly(sscreen, &shader->binary, stage, shader->wave_size, debug, "main",
                                 file);

      if (shader->epilog)
         si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, shader->wave_size, debug,
                                    "epilog", file);
      fprintf(file, "\n");

      si_shader_dump_stats(sscreen, shader, file, check_debug_option);
   }
}
1434 
si_dump_shader_key_vs(const union si_shader_key * key,FILE * f)1435 static void si_dump_shader_key_vs(const union si_shader_key *key, FILE *f)
1436 {
1437    fprintf(f, "  mono.instance_divisor_is_one = %u\n", key->ge.mono.instance_divisor_is_one);
1438    fprintf(f, "  mono.instance_divisor_is_fetched = %u\n",
1439            key->ge.mono.instance_divisor_is_fetched);
1440    fprintf(f, "  mono.vs.fetch_opencode = %x\n", key->ge.mono.vs_fetch_opencode);
1441    fprintf(f, "  mono.vs.fix_fetch = {");
1442    for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
1443       union si_vs_fix_fetch fix = key->ge.mono.vs_fix_fetch[i];
1444       if (i)
1445          fprintf(f, ", ");
1446       if (!fix.bits)
1447          fprintf(f, "0");
1448       else
1449          fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
1450                  fix.u.format);
1451    }
1452    fprintf(f, "}\n");
1453 }
1454 
/* Print the full shader key for debugging: the source SHA1, the
 * stage-specific key fields, and the key fields shared between stages.
 */
static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
{
   const union si_shader_key *key = &shader->key;
   gl_shader_stage stage = shader->selector->stage;

   fprintf(f, "SHADER KEY\n");
   fprintf(f, "  source_sha1 = {");
   _mesa_sha1_print(f, shader->selector->info.base.source_sha1);
   fprintf(f, "}\n");

   /* Stage-specific key fields. */
   switch (stage) {
   case MESA_SHADER_VERTEX:
      si_dump_shader_key_vs(key, f);
      fprintf(f, "  as_es = %u\n", key->ge.as_es);
      fprintf(f, "  as_ls = %u\n", key->ge.as_ls);
      fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
      fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
      break;

   case MESA_SHADER_TESS_CTRL:
      /* On GFX9+ the VS is merged into the TCS, so its fetch key applies. */
      if (shader->selector->screen->info.gfx_level >= GFX9)
         si_dump_shader_key_vs(key, f);

      fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->ge.part.tcs.epilog.prim_mode);
      fprintf(f, "  opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
      fprintf(f, "  opt.same_patch_vertices = %u\n", key->ge.opt.same_patch_vertices);
      break;

   case MESA_SHADER_TESS_EVAL:
      fprintf(f, "  as_es = %u\n", key->ge.as_es);
      fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
      fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
      break;

   case MESA_SHADER_GEOMETRY:
      if (shader->is_gs_copy_shader)
         break;

      /* On GFX9+ the ES is merged into the GS; dump the VS fetch key when
       * the merged ES is a vertex shader.
       */
      if (shader->selector->screen->info.gfx_level >= GFX9 &&
          key->ge.part.gs.es->stage == MESA_SHADER_VERTEX)
         si_dump_shader_key_vs(key, f);

      fprintf(f, "  mono.u.gs_tri_strip_adj_fix = %u\n", key->ge.mono.u.gs_tri_strip_adj_fix);
      fprintf(f, "  as_ngg = %u\n", key->ge.as_ngg);
      break;

   case MESA_SHADER_COMPUTE:
      break;

   case MESA_SHADER_FRAGMENT:
      fprintf(f, "  prolog.color_two_side = %u\n", key->ps.part.prolog.color_two_side);
      fprintf(f, "  prolog.flatshade_colors = %u\n", key->ps.part.prolog.flatshade_colors);
      fprintf(f, "  prolog.poly_stipple = %u\n", key->ps.part.prolog.poly_stipple);
      fprintf(f, "  prolog.force_persp_sample_interp = %u\n",
              key->ps.part.prolog.force_persp_sample_interp);
      fprintf(f, "  prolog.force_linear_sample_interp = %u\n",
              key->ps.part.prolog.force_linear_sample_interp);
      fprintf(f, "  prolog.force_persp_center_interp = %u\n",
              key->ps.part.prolog.force_persp_center_interp);
      fprintf(f, "  prolog.force_linear_center_interp = %u\n",
              key->ps.part.prolog.force_linear_center_interp);
      fprintf(f, "  prolog.bc_optimize_for_persp = %u\n",
              key->ps.part.prolog.bc_optimize_for_persp);
      fprintf(f, "  prolog.bc_optimize_for_linear = %u\n",
              key->ps.part.prolog.bc_optimize_for_linear);
      fprintf(f, "  prolog.samplemask_log_ps_iter = %u\n",
              key->ps.part.prolog.samplemask_log_ps_iter);
      fprintf(f, "  epilog.spi_shader_col_format = 0x%x\n",
              key->ps.part.epilog.spi_shader_col_format);
      fprintf(f, "  epilog.color_is_int8 = 0x%X\n", key->ps.part.epilog.color_is_int8);
      fprintf(f, "  epilog.color_is_int10 = 0x%X\n", key->ps.part.epilog.color_is_int10);
      fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.part.epilog.last_cbuf);
      fprintf(f, "  epilog.alpha_func = %u\n", key->ps.part.epilog.alpha_func);
      fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.part.epilog.alpha_to_one);
      fprintf(f, "  epilog.alpha_to_coverage_via_mrtz = %u\n", key->ps.part.epilog.alpha_to_coverage_via_mrtz);
      fprintf(f, "  epilog.clamp_color = %u\n", key->ps.part.epilog.clamp_color);
      fprintf(f, "  epilog.dual_src_blend_swizzle = %u\n", key->ps.part.epilog.dual_src_blend_swizzle);
      fprintf(f, "  epilog.rbplus_depth_only_opt = %u\n", key->ps.part.epilog.rbplus_depth_only_opt);
      fprintf(f, "  epilog.kill_samplemask = %u\n", key->ps.part.epilog.kill_samplemask);
      fprintf(f, "  mono.poly_line_smoothing = %u\n", key->ps.mono.poly_line_smoothing);
      fprintf(f, "  mono.point_smoothing = %u\n", key->ps.mono.point_smoothing);
      fprintf(f, "  mono.interpolate_at_sample_force_center = %u\n",
              key->ps.mono.interpolate_at_sample_force_center);
      fprintf(f, "  mono.fbfetch_msaa = %u\n", key->ps.mono.fbfetch_msaa);
      fprintf(f, "  mono.fbfetch_is_1D = %u\n", key->ps.mono.fbfetch_is_1D);
      fprintf(f, "  mono.fbfetch_layered = %u\n", key->ps.mono.fbfetch_layered);
      break;

   default:
      assert(0);
   }

   /* Output-kill options only apply to the last pre-rasterization stage
    * (not ES/LS parts of merged shaders).
    */
   if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||
        stage == MESA_SHADER_VERTEX) &&
       !key->ge.as_es && !key->ge.as_ls) {
      fprintf(f, "  opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs);
      fprintf(f, "  opt.kill_pointsize = 0x%x\n", key->ge.opt.kill_pointsize);
      fprintf(f, "  opt.kill_layer = 0x%x\n", key->ge.opt.kill_layer);
      fprintf(f, "  opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances);
      fprintf(f, "  opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling);
      fprintf(f, "  opt.remove_streamout = 0x%x\n", key->ge.opt.remove_streamout);
   }

   /* prefer_mono and inline_uniforms live in different key unions for
    * geometry-pipeline stages (ge) vs. pixel shaders (ps).
    */
   if (stage <= MESA_SHADER_GEOMETRY)
      fprintf(f, "  opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
   else
      fprintf(f, "  opt.prefer_mono = %u\n", key->ps.opt.prefer_mono);

   if (stage <= MESA_SHADER_GEOMETRY) {
      if (key->ge.opt.inline_uniforms) {
         fprintf(f, "  opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
                 key->ge.opt.inline_uniforms,
                 key->ge.opt.inlined_uniform_values[0],
                 key->ge.opt.inlined_uniform_values[1],
                 key->ge.opt.inlined_uniform_values[2],
                 key->ge.opt.inlined_uniform_values[3]);
      } else {
         fprintf(f, "  opt.inline_uniforms = 0\n");
      }
   } else {
      if (key->ps.opt.inline_uniforms) {
         fprintf(f, "  opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
                 key->ps.opt.inline_uniforms,
                 key->ps.opt.inlined_uniform_values[0],
                 key->ps.opt.inlined_uniform_values[1],
                 key->ps.opt.inlined_uniform_values[2],
                 key->ps.opt.inlined_uniform_values[3]);
      } else {
         fprintf(f, "  opt.inline_uniforms = 0\n");
      }
   }
}
1587 
/* TODO: convert to nir_shader_instructions_pass */
/* Remove shader outputs that the shader key says are unused downstream:
 * varyings in kill_outputs, PSIZ, LAYER, and individual clip distances.
 * Returns true if any output store was removed.
 */
static bool si_nir_kill_outputs(nir_shader *nir, const union si_shader_key *key)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);
   assert(nir->info.stage <= MESA_SHADER_GEOMETRY);

   /* Early out when there is nothing to kill. LAYER is checked too because
    * it is always stripped as a varying below even without kill flags.
    */
   if (!key->ge.opt.kill_outputs &&
       !key->ge.opt.kill_pointsize &&
       !key->ge.opt.kill_layer &&
       !key->ge.opt.kill_clip_distances &&
       !(nir->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER))) {
      nir_metadata_preserve(impl, nir_metadata_all);
      return false;
   }

   bool progress = false;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         assert(intr->num_components == 1); /* only scalar stores expected */
         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

         /* Kill varyings that the FS doesn't read, per the kill_outputs
          * bitmask indexed by the driver's unique slot index.
          */
         if (nir_slot_is_varying(sem.location) &&
             key->ge.opt.kill_outputs &
             (1ull << si_shader_io_get_unique_index(sem.location)))
            progress |= nir_remove_varying(intr, MESA_SHADER_FRAGMENT);

         /* Sysval outputs are handled per-slot below. */
         switch (sem.location) {
         case VARYING_SLOT_PSIZ:
            if (key->ge.opt.kill_pointsize)
               progress |= nir_remove_sysval_output(intr);
            break;

         case VARYING_SLOT_CLIP_VERTEX:
            /* TODO: We should only kill specific clip planes as required by kill_clip_distance,
             * not whole gl_ClipVertex. Lower ClipVertex in NIR.
             */
            if ((key->ge.opt.kill_clip_distances & SI_USER_CLIP_PLANE_MASK) ==
                SI_USER_CLIP_PLANE_MASK)
               progress |= nir_remove_sysval_output(intr);
            break;

         case VARYING_SLOT_CLIP_DIST0:
         case VARYING_SLOT_CLIP_DIST1:
            if (key->ge.opt.kill_clip_distances) {
               assert(nir_intrinsic_src_type(intr) == nir_type_float32);
               /* Each CLIP_DISTn slot holds 4 scalar distances; derive the
                * flat clip-distance index from slot + component.
                */
               unsigned index = (sem.location - VARYING_SLOT_CLIP_DIST0) * 4 +
                                nir_intrinsic_component(intr);

               if (key->ge.opt.kill_clip_distances & BITFIELD_BIT(index))
                  progress |= nir_remove_sysval_output(intr);
            }
            break;

         case VARYING_SLOT_LAYER:
            /* LAYER is never passed to FS. Instead, we load it there as a system value. */
            progress |= nir_remove_varying(intr, MESA_SHADER_FRAGMENT);

            if (key->ge.opt.kill_layer)
               progress |= nir_remove_sysval_output(intr);
            break;
         }
      }
   }

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_dominance |
                                  nir_metadata_block_index);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}
1674 
/* Remove PS output components from NIR if they are disabled by spi_shader_col_format. */
static bool kill_ps_outputs_cb(struct nir_builder *b, nir_instr *instr, void *_key)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_store_output)
      return false;

   /* No indirect indexing allowed. */
   ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
   assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

   unsigned location = nir_intrinsic_io_semantics(intr).location;
   const union si_shader_key *key = _key;

   switch (location) {
   /* Depth/stencil exports are never affected by the color format. */
   case FRAG_RESULT_DEPTH:
   case FRAG_RESULT_STENCIL:
      return false;

   /* Sample mask export is dropped only when the key requests it. */
   case FRAG_RESULT_SAMPLE_MASK:
      if (key->ps.part.epilog.kill_samplemask) {
         nir_instr_remove(instr);
         return true;
      }
      return false;
   }

   /* Color outputs. */
   unsigned comp_mask = BITFIELD_MASK(intr->num_components);
   assert(nir_intrinsic_component(intr) == 0);
   /* 4 enabled-component bits per color buffer, derived from the epilog's
    * export format.
    */
   unsigned cb_shader_mask = ac_get_cb_shader_mask(key->ps.part.epilog.spi_shader_col_format);

   /* If COLOR is broadcasted to multiple color buffers, combine their masks. */
   if (location == FRAG_RESULT_COLOR) {
      for (unsigned i = 1; i <= key->ps.part.epilog.last_cbuf; i++)
         cb_shader_mask |= (cb_shader_mask >> (i * 4)) & 0xf;
   }

   unsigned index = location == FRAG_RESULT_COLOR ? 0 : location - FRAG_RESULT_DATA0;
   unsigned output_mask = (cb_shader_mask >> (index * 4)) & 0xf;

   /* All stored components are enabled: nothing to do. */
   if ((output_mask & comp_mask) == comp_mask)
      return false;

   /* No stored component is enabled: drop the whole store. */
   if (!(output_mask & comp_mask)) {
      nir_instr_remove(instr);
      return true;
   }

   /* Fill disabled components with undef. */
   b->cursor = nir_before_instr(instr);
   nir_def *new_value = intr->src[0].ssa;
   nir_def *undef = nir_undef(b, 1, new_value->bit_size);

   unsigned kill_mask = ~output_mask & comp_mask;
   u_foreach_bit(i, kill_mask) {
      new_value = nir_vector_insert_imm(b, new_value, undef, i);
   }

   nir_src_rewrite(&intr->src[0], new_value);
   return true;
}
1740 
/* Run kill_ps_outputs_cb over a fragment shader; returns true on progress. */
static bool si_nir_kill_ps_outputs(nir_shader *nir, const union si_shader_key *key)
{
   assert(nir->info.stage == MESA_SHADER_FRAGMENT);

   const nir_metadata preserved = nir_metadata_dominance | nir_metadata_block_index;
   return nir_shader_instructions_pass(nir, kill_ps_outputs_cb, preserved, (void *)key);
}
1748 
/* Wrap front/back color output stores in a conditional fsat, selected at
 * run time by the clamp_vertex_color state. Returns true on change.
 */
static bool clamp_vertex_color_instr(nir_builder *b,
                                     nir_intrinsic_instr *intrin, void *state)
{
   if (intrin->intrinsic != nir_intrinsic_store_output)
      return false;

   /* Only front and back vertex colors are clamped. */
   switch (nir_intrinsic_io_semantics(intrin).location) {
   case VARYING_SLOT_COL0:
   case VARYING_SLOT_COL1:
   case VARYING_SLOT_BFC0:
   case VARYING_SLOT_BFC1:
      break;
   default:
      return false;
   }

   /* no indirect output */
   assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));
   /* only scalar output */
   assert(intrin->src[0].ssa->num_components == 1);

   b->cursor = nir_before_instr(&intrin->instr);

   nir_def *orig_color = intrin->src[0].ssa;
   nir_def *do_clamp = nir_load_clamp_vertex_color_amd(b);
   nir_def *clamped = nir_bcsel(b, do_clamp, nir_fsat(b, orig_color), orig_color);
   nir_src_rewrite(&intrin->src[0], clamped);

   return true;
}
1774 
/* Apply clamp_vertex_color_instr to the shader if it writes any front/back
 * color output; returns true on progress.
 */
static bool si_nir_clamp_vertex_color(nir_shader *nir)
{
   const uint64_t color_mask =
      VARYING_BIT_COL0 | VARYING_BIT_COL1 | VARYING_BIT_BFC0 | VARYING_BIT_BFC1;

   /* Skip the pass entirely when no color output is written. */
   if (!(nir->info.outputs_written & color_mask))
      return false;

   return nir_shader_intrinsics_pass(nir, clamp_vertex_color_instr,
                                     nir_metadata_dominance | nir_metadata_block_index,
                                     NULL);
}
1785 
si_map_io_driver_location(unsigned semantic)1786 static unsigned si_map_io_driver_location(unsigned semantic)
1787 {
1788    if ((semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_TESS_MAX) ||
1789        semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
1790        semantic == VARYING_SLOT_TESS_LEVEL_OUTER)
1791       return ac_shader_io_get_unique_index_patch(semantic);
1792 
1793    return si_shader_io_get_unique_index(semantic);
1794 }
1795 
/* Lower stage IO to LDS/memory for shaders that feed another HW stage
 * (LS->HS, ES->GS, HS in/out, TES in, GS in). Returns true if any lowering
 * pass was run.
 */
static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
                               uint64_t tcs_vgpr_only_inputs)
{
   struct si_shader_selector *sel = shader->selector;
   const union si_shader_key *key = &shader->key;

   switch (nir->info.stage) {
   case MESA_SHADER_VERTEX:
      if (key->ge.as_ls) {
         NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem, si_map_io_driver_location,
                    key->ge.opt.same_patch_vertices, tcs_vgpr_only_inputs);
         return true;
      }
      if (key->ge.as_es) {
         NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
                    sel->screen->info.gfx_level, sel->info.esgs_vertex_stride);
         return true;
      }
      /* A hardware VS writes position/params directly; nothing to lower. */
      return false;

   case MESA_SHADER_TESS_CTRL:
      NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem, si_map_io_driver_location,
                 key->ge.opt.same_patch_vertices);

      /* Used by hs_emit_write_tess_factors() when monolithic shader. */
      nir->info.tess._primitive_mode = key->ge.part.tcs.epilog.prim_mode;

      NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, si_map_io_driver_location,
                 sel->screen->info.gfx_level,
                 /* Used by hs_emit_write_tess_factors() when monolithic shader. */
                 key->ge.part.tcs.epilog.tes_reads_tess_factors,
                 ~0ULL, ~0ULL, /* no TES inputs filter */
                 util_last_bit64(sel->info.outputs_written_before_tes_gs),
                 util_last_bit64(sel->info.patch_outputs_written),
                 shader->wave_size,
                 /* ALL TCS inputs are passed by register. */
                 key->ge.opt.same_patch_vertices &&
                 !(sel->info.base.inputs_read & ~sel->info.tcs_vgpr_only_inputs),
                 sel->info.tessfactors_are_def_in_all_invocs,
                 /* Emit epilog only when monolithic shader. */
                 shader->is_monolithic);
      return true;

   case MESA_SHADER_TESS_EVAL:
      NIR_PASS_V(nir, ac_nir_lower_tes_inputs_to_mem, si_map_io_driver_location);

      if (key->ge.as_es) {
         NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
                    sel->screen->info.gfx_level, sel->info.esgs_vertex_stride);
      }
      return true;

   case MESA_SHADER_GEOMETRY:
      NIR_PASS_V(nir, ac_nir_lower_gs_inputs_to_mem, si_map_io_driver_location,
                 sel->screen->info.gfx_level, key->ge.mono.u.gs_tri_strip_adj_fix);
      return true;

   default:
      return false;
   }
}
1851 
/* Lower a VS/TES/GS to the NGG execution model via the common AMD NIR
 * passes, then clean up the subgroup ops and vector stores they emit.
 */
static void si_lower_ngg(struct si_shader *shader, nir_shader *nir)
{
   struct si_shader_selector *sel = shader->selector;
   const union si_shader_key *key = &shader->key;
   assert(key->ge.as_ngg);

   /* Clip distances minus the ones killed by the key, plus cull distances. */
   uint8_t clip_cull_dist_mask =
      (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) |
      sel->info.culldist_mask;

   /* Options common to the nogs (VS/TES) and gs paths. */
   ac_nir_lower_ngg_options options = {
      .family = sel->screen->info.family,
      .gfx_level = sel->screen->info.gfx_level,
      .max_workgroup_size = si_get_max_workgroup_size(shader),
      .wave_size = shader->wave_size,
      .can_cull = !!key->ge.opt.ngg_culling,
      .disable_streamout = !si_shader_uses_streamout(shader),
      .vs_output_param_offset = shader->info.vs_output_param_offset,
      .has_param_exports = shader->info.nr_param_exports,
      .clip_cull_dist_mask = clip_cull_dist_mask,
      .kill_pointsize = key->ge.opt.kill_pointsize,
      .kill_layer = key->ge.opt.kill_layer,
      .force_vrs = sel->screen->options.vrs2x2,
   };

   if (nir->info.stage == MESA_SHADER_VERTEX ||
       nir->info.stage == MESA_SHADER_TESS_EVAL) {
      /* Per instance inputs, used to remove instance load after culling. */
      unsigned instance_rate_inputs = 0;

      if (nir->info.stage == MESA_SHADER_VERTEX) {
         instance_rate_inputs = key->ge.mono.instance_divisor_is_one |
                                key->ge.mono.instance_divisor_is_fetched;

         /* Manually mark the instance ID used, so the shader can repack it. */
         if (instance_rate_inputs)
            BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
      } else {
         /* Manually mark the primitive ID used, so the shader can repack it. */
         if (key->ge.mono.u.vs_export_prim_id)
            BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
      }

      unsigned clip_plane_enable =
         SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(key->ge.opt.ngg_culling);
      unsigned num_vertices = gfx10_ngg_get_vertices_per_prim(shader);

      /* num_vertices can be 0 here - fall back to triangles. */
      options.num_vertices_per_primitive = num_vertices ? num_vertices : 3;
      options.early_prim_export = gfx10_ngg_export_prim_early(shader);
      options.passthrough = gfx10_is_ngg_passthrough(shader);
      options.use_edgeflags = gfx10_edgeflags_have_effect(shader);
      options.has_gen_prim_query = options.has_xfb_prim_query =
         sel->screen->info.gfx_level >= GFX11 && !sel->info.base.vs.blit_sgprs_amd;
      options.export_primitive_id = key->ge.mono.u.vs_export_prim_id;
      options.instance_rate_inputs = instance_rate_inputs;
      options.user_clip_plane_enable_mask = clip_plane_enable;

      NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options);
   } else {
      assert(nir->info.stage == MESA_SHADER_GEOMETRY);

      options.gs_out_vtx_bytes = sel->info.gsvs_vertex_size;
      options.has_gen_prim_query = options.has_xfb_prim_query =
         sel->screen->info.gfx_level >= GFX11;
      options.has_gs_invocations_query = sel->screen->info.gfx_level < GFX11;
      options.has_gs_primitives_query = true;

      /* For monolithic ES/GS to add vscnt wait when GS export pos0. */
      if (key->ge.part.gs.es)
         nir->info.writes_memory |= key->ge.part.gs.es->info.base.writes_memory;

      NIR_PASS_V(nir, ac_nir_lower_ngg_gs, &options);
   }

   /* may generate some subgroup op like ballot */
   NIR_PASS_V(nir, nir_lower_subgroups, &si_nir_subgroups_options);

   /* may generate some vector output store */
   NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
}
1932 
si_deserialize_shader(struct si_shader_selector * sel)1933 struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel)
1934 {
1935    struct pipe_screen *screen = &sel->screen->b;
1936    const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
1937                                                       pipe_shader_type_from_mesa(sel->stage));
1938 
1939    struct blob_reader blob_reader;
1940    blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
1941    return nir_deserialize(NULL, options, &blob_reader);
1942 }
1943 
/* Walk all output stores and assign a parameter-export index to every
 * varying that doesn't have one yet; resolve duplicated slots through
 * "slot_remap" and refresh nir->info.outputs_written(_16bit).
 */
static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shader,
                                        int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS])
{
   struct si_shader_selector *sel = shader->selector;
   struct si_shader_binary_info *info = &shader->info;

   uint64_t outputs_written = 0;
   uint32_t outputs_written_16bit = 0;

   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         assert(intr->num_components == 1); /* only scalar stores expected */
         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

         /* 16-bit varyings have their own bitmask past VAR0_16BIT. */
         if (sem.location >= VARYING_SLOT_VAR0_16BIT)
            outputs_written_16bit |= BITFIELD_BIT(sem.location - VARYING_SLOT_VAR0_16BIT);
         else
            outputs_written |= BITFIELD64_BIT(sem.location);

         /* Assign the param index if it's unassigned. Only varyings routed
          * to stream 0 and not marked no_varying get an export slot;
          * AC_EXP_PARAM_DEFAULT_VAL_0000 marks "not assigned yet".
          */
         if (nir_slot_is_varying(sem.location) && !sem.no_varying &&
             (sem.gs_streams & 0x3) == 0 &&
             info->vs_output_param_offset[sem.location] == AC_EXP_PARAM_DEFAULT_VAL_0000) {
            /* The semantic and the base should be the same as in si_shader_info. */
            assert(sem.location == sel->info.output_semantic[nir_intrinsic_base(intr)]);
            /* It must not be remapped (duplicated). */
            assert(slot_remap[sem.location] == -1);

            info->vs_output_param_offset[sem.location] = info->nr_param_exports++;
         }
      }
   }

   /* Duplicated outputs are redirected here. */
   for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
      if (slot_remap[i] >= 0)
         info->vs_output_param_offset[i] = info->vs_output_param_offset[slot_remap[i]];
   }

   /* The primitive ID export requested by the key gets the last slot. */
   if (shader->key.ge.mono.u.vs_export_prim_id) {
      info->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = info->nr_param_exports++;
   }

   /* Update outputs written info, we may remove some outputs before. */
   nir->info.outputs_written = outputs_written;
   nir->info.outputs_written_16bit = outputs_written_16bit;
}
2005 
/* Compute the param-export index for every VS/TES/GS output that feeds PS.
 *
 * Resets shader->info.nr_param_exports, marks all slots as
 * AC_EXP_PARAM_DEFAULT_VAL_0000, lets ac_nir_optimize_outputs fold
 * constant outputs (recording duplicates in a remap table), and finally
 * assigns sequential indices to the remaining outputs.
 */
static void si_assign_param_offsets(nir_shader *nir, struct si_shader *shader)
{
   /* Start counting param exports from scratch for this variant. */
   shader->info.nr_param_exports = 0;

   /* vs_output_param_offset entries must be single bytes for the memset. */
   STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
          sizeof(shader->info.vs_output_param_offset));

   /* Remap table for duplicated outputs: one VS output may be mapped to
    * several FS inputs. -1 means "not remapped".
    */
   int8_t output_remap[NUM_TOTAL_VARYING_SLOTS];
   memset(output_remap, -1, sizeof(output_remap));

   /* Replaces constant outputs with DEFAULT_VAL in vs_output_param_offset. */
   /* TODO: This doesn't affect GS. */
   NIR_PASS_V(nir, ac_nir_optimize_outputs, false, output_remap,
              shader->info.vs_output_param_offset);

   /* Number the outputs that survived the optimization. */
   /* TODO: Use this for the GS copy shader too. */
   si_nir_assign_param_offsets(nir, shader, output_remap);
}
2030 
si_get_nr_pos_exports(const struct si_shader_selector * sel,const union si_shader_key * key)2031 static unsigned si_get_nr_pos_exports(const struct si_shader_selector *sel,
2032                                       const union si_shader_key *key)
2033 {
2034    const struct si_shader_info *info = &sel->info;
2035 
2036    /* Must have a position export. */
2037    unsigned nr_pos_exports = 1;
2038 
2039    if ((info->writes_psize && !key->ge.opt.kill_pointsize) ||
2040        (info->writes_edgeflag && !key->ge.as_ngg) ||
2041        (info->writes_layer && !key->ge.opt.kill_layer) ||
2042        info->writes_viewport_index || sel->screen->options.vrs2x2) {
2043       nr_pos_exports++;
2044    }
2045 
2046    unsigned clipdist_mask =
2047       (info->clipdist_mask & ~key->ge.opt.kill_clip_distances) | info->culldist_mask;
2048 
2049    for (int i = 0; i < 2; i++) {
2050       if (clipdist_mask & BITFIELD_RANGE(i * 4, 4))
2051          nr_pos_exports++;
2052    }
2053 
2054    return nr_pos_exports;
2055 }
2056 
lower_ps_load_color_intrinsic(nir_builder * b,nir_instr * instr,void * state)2057 static bool lower_ps_load_color_intrinsic(nir_builder *b, nir_instr *instr, void *state)
2058 {
2059    nir_def **colors = (nir_def **)state;
2060 
2061    if (instr->type != nir_instr_type_intrinsic)
2062       return false;
2063 
2064    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
2065 
2066    if (intrin->intrinsic != nir_intrinsic_load_color0 &&
2067        intrin->intrinsic != nir_intrinsic_load_color1)
2068       return false;
2069 
2070    unsigned index = intrin->intrinsic == nir_intrinsic_load_color0 ? 0 : 1;
2071    assert(colors[index]);
2072 
2073    nir_def_rewrite_uses(&intrin->def, colors[index]);
2074 
2075    nir_instr_remove(&intrin->instr);
2076    return true;
2077 }
2078 
/* Build the final COLOR0/1 values for a monolithic PS at the top of the
 * shader and lower nir_intrinsic_load_color0/1 to use them.
 *
 * For each color that the shader reads, this emits either a flat input
 * load or an interpolated load (with the barycentric mode chosen by the
 * key/info), optionally loads the matching back-face color, and selects
 * between front and back with gl_FrontFacing when two-side coloring is on.
 *
 * Returns true if anything was lowered.
 */
static bool si_nir_lower_ps_color_input(nir_shader *nir, const union si_shader_key *key,
                                        const struct si_shader_info *info)
{
   bool progress = false;
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   /* All color loads are emitted at the very beginning of the entrypoint. */
   nir_builder builder = nir_builder_at(nir_before_impl(impl));
   nir_builder *b = &builder;

   /* Build ready to be used colors at the beginning of the shader. */
   nir_def *colors[2] = {0};
   for (int i = 0; i < 2; i++) {
      /* Skip colors whose 4-bit read mask is empty. */
      if (!(info->colors_read & (0xf << (i * 4))))
         continue;

      unsigned color_base = info->color_attr_index[i];
      /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
       * otherwise it's at offset "num_inputs".
       */
      unsigned back_color_base = info->num_inputs;
      if (i == 1 && (info->colors_read & 0xf))
         back_color_base += 1;

      /* INTERP_MODE_COLOR resolves to flat or smooth depending on the
       * flat-shading state baked into the prolog key.
       */
      enum glsl_interp_mode interp_mode = info->color_interpolate[i];
      if (interp_mode == INTERP_MODE_COLOR) {
         interp_mode = key->ps.part.prolog.flatshade_colors ?
            INTERP_MODE_FLAT : INTERP_MODE_SMOOTH;
      }

      nir_def *back_color = NULL;
      if (interp_mode == INTERP_MODE_FLAT) {
         /* Flat colors are plain (non-interpolated) input loads. */
         colors[i] = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
                                   .base = color_base,
                                   .io_semantics.location = VARYING_SLOT_COL0 + i,
                                   .io_semantics.num_slots = 1);

         if (key->ps.part.prolog.color_two_side) {
            back_color = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
                                        .base = back_color_base,
                                        .io_semantics.location = VARYING_SLOT_BFC0 + i,
                                        .io_semantics.num_slots = 1);
         }
      } else {
         /* Pick the barycentric intrinsic matching the interpolation
          * location from the shader info.
          */
         nir_intrinsic_op op = 0;
         switch (info->color_interpolate_loc[i]) {
         case TGSI_INTERPOLATE_LOC_CENTER:
            op = nir_intrinsic_load_barycentric_pixel;
            break;
         case TGSI_INTERPOLATE_LOC_CENTROID:
            op = nir_intrinsic_load_barycentric_centroid;
            break;
         case TGSI_INTERPOLATE_LOC_SAMPLE:
            op = nir_intrinsic_load_barycentric_sample;
            break;
         default:
            unreachable("invalid color interpolate location");
            break;
         }

         nir_def *barycentric = nir_load_barycentric(b, op, interp_mode);

         colors[i] =
            nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0),
                                        .base = color_base,
                                        .io_semantics.location = VARYING_SLOT_COL0 + i,
                                        .io_semantics.num_slots = 1);

         if (key->ps.part.prolog.color_two_side) {
            back_color =
               nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0),
                                           .base = back_color_base,
                                           .io_semantics.location = VARYING_SLOT_BFC0 + i,
                                           .io_semantics.num_slots = 1);
         }
      }

      /* Two-side: select the back-face color on back-facing fragments. */
      if (back_color) {
         nir_def *is_front_face = nir_load_front_face(b, 1);
         colors[i] = nir_bcsel(b, is_front_face, colors[i], back_color);
      }

      progress = true;
   }

   /* lower nir_load_color0/1 to use the color value. */
   return nir_shader_instructions_pass(nir, lower_ps_load_color_intrinsic,
                                       nir_metadata_block_index | nir_metadata_dominance,
                                       colors) || progress;
}
2168 
/* Emit polygon-stipple code at the top of a monolithic PS.
 *
 * Reads the 32x32 stipple pattern from an internal constant buffer
 * (SI_PS_CONST_POLY_STIPPLE), indexes it with the low 5 bits of the
 * fixed-point fragment position, and discards the fragment when the
 * selected bit is 0.
 */
static void si_nir_emit_polygon_stipple(nir_shader *nir, struct si_shader_args *args)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   /* Insert the stipple test before everything else in the shader. */
   nir_builder builder = nir_builder_at(nir_before_impl(impl));
   nir_builder *b = &builder;

   /* Load the buffer descriptor. */
   nir_def *desc =
      si_nir_load_internal_binding(b, args, SI_PS_CONST_POLY_STIPPLE, 4);

   /* Use the fixed-point gl_FragCoord input.
    * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
    * per coordinate to get the repeating effect.
    */
   nir_def *pos_x = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 0, 5);
   nir_def *pos_y = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 16, 5);

   nir_def *zero = nir_imm_int(b, 0);
   /* The stipple pattern is 32x32, each row has 32 bits. */
   /* Byte offset of the row: y * 4 (one uint32 per row). */
   nir_def *offset = nir_ishl_imm(b, pos_y, 2);
   nir_def *row = nir_load_buffer_amd(b, 1, 32, desc, offset, zero, zero);
   /* Extract the single bit for this fragment's x position. */
   nir_def *bit = nir_ubfe(b, row, pos_x, nir_imm_int(b, 1));

   /* Discard when the stipple bit is 0. */
   nir_def *pass = nir_i2b(b, bit);
   nir_discard_if(b, nir_inot(b, pass));
}
2196 
si_should_clear_lds(struct si_screen * sscreen,const struct nir_shader * shader)2197 bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader)
2198 {
2199    return shader->info.stage == MESA_SHADER_COMPUTE && shader->info.shared_size > 0 && sscreen->options.clear_lds;
2200 }
2201 
/* Produce the NIR for compiling one variant of the given shader.
 *
 * The NIR comes either from the selector directly (sel->nir) or is
 * deserialized from sel->nir_binary (in which case *free_nir is set so the
 * caller knows it owns the copy); returns NULL when neither exists. The
 * shader is then lowered and optimized according to the shader key:
 * output killing, uniform inlining, stage-specific lowering (last VGT
 * stage / legacy GS / monolithic PS), and ABI lowering.
 *
 * NOTE(review): pass ordering here is load-bearing — several comments
 * below state explicit ordering constraints; do not reorder casually.
 */
struct nir_shader *si_get_nir_shader(struct si_shader *shader,
                                     struct si_shader_args *args,
                                     bool *free_nir,
                                     uint64_t tcs_vgpr_only_inputs,
                                     ac_nir_gs_output_info *output_info)
{
   struct si_shader_selector *sel = shader->selector;
   const union si_shader_key *key = &shader->key;

   nir_shader *nir;
   *free_nir = false;

   if (sel->nir) {
      nir = sel->nir;
   } else if (sel->nir_binary) {
      /* Deserialized copies belong to the caller. */
      nir = si_deserialize_shader(sel);
      *free_nir = true;
   } else {
      return NULL;
   }

   /* progress: run si_nir_opts when set; late_opts: run si_nir_late_opts
    * at the end if any optimization round happened.
    */
   bool progress = false;
   bool late_opts = false;

   const char *original_name = NULL;
   if (unlikely(should_print_nir(nir))) {
      /* Modify the shader's name so that each variant gets its own name. */
      original_name = ralloc_strdup(nir, nir->info.name);
      ralloc_asprintf_append((char **)&nir->info.name, "-%08x", _mesa_hash_data(key, sizeof(*key)));

      /* Dummy pass to get the starting point. */
      printf("nir_dummy_pass\n");
      nir_print_shader(nir, stdout);
   }

   /* Kill outputs according to the shader key. */
   if (sel->stage <= MESA_SHADER_GEOMETRY)
      NIR_PASS(progress, nir, si_nir_kill_outputs, key);

   NIR_PASS(progress, nir, ac_nir_lower_tex,
            &(ac_nir_lower_tex_options){
               .gfx_level = sel->screen->info.gfx_level,
               .lower_array_layer_round_even = !sel->screen->info.conformant_trunc_coord,
            });

   if (nir->info.uses_resource_info_query)
      NIR_PASS(progress, nir, ac_nir_lower_resinfo, sel->screen->info.gfx_level);

   bool inline_uniforms = false;
   uint32_t *inlined_uniform_values;
   si_get_inline_uniform_state((union si_shader_key*)key, sel->pipe_shader_type,
                               &inline_uniforms, &inlined_uniform_values);

   if (inline_uniforms) {
      /* Inlining only happens on deserialized (owned) copies. */
      assert(*free_nir);

      /* Most places use shader information from the default variant, not
       * the optimized variant. These are the things that the driver looks at
       * in optimized variants and the list of things that we need to do.
       *
       * The driver takes into account these things if they suddenly disappear
       * from the shader code:
       * - Register usage and code size decrease (obvious)
       * - Eliminated PS system values are disabled by LLVM
       *   (FragCoord, FrontFace, barycentrics)
       * - VS/TES/GS param exports are eliminated if they are undef.
       *   The param space for eliminated outputs is also not allocated.
       * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
       * - TCS output stores are eliminated
       * - Eliminated PS inputs are removed from PS.NUM_INTERP.
       *
       * TODO: These are things the driver ignores in the final shader code
       * and relies on the default shader info.
       * - System values in VS, TCS, TES, GS are not eliminated
       * - uses_discard - if it changed to false
       * - writes_memory - if it changed to false
       * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not
       *   eliminated
       * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS)
       *   GS outputs are eliminated except for the temporary LDS.
       *   Clip distances, gl_PointSize, gl_Layer and PS outputs are eliminated based
       *   on current states, so we don't care about the shader code.
       *
       * TODO: Merged shaders don't inline uniforms for the first stage.
       * VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS.
       * (key == NULL for the first stage here)
       *
       * TODO: Compute shaders don't support inlinable uniforms, because they
       * don't have shader variants.
       *
       * TODO: The driver uses a linear search to find a shader variant. This
       * can be really slow if we get too many variants due to uniform inlining.
       */
      NIR_PASS_V(nir, nir_inline_uniforms, nir->info.num_inlinable_uniforms,
                 inlined_uniform_values, nir->info.inlinable_uniform_dw_offsets);
      progress = true;
   }

   if (sel->stage == MESA_SHADER_FRAGMENT) {
      /* This uses the epilog key, so only monolithic shaders can call this. */
      if (shader->is_monolithic)
         NIR_PASS(progress, nir, si_nir_kill_ps_outputs, key);

      if (key->ps.mono.poly_line_smoothing)
         NIR_PASS(progress, nir, nir_lower_poly_line_smooth, SI_NUM_SMOOTH_AA_SAMPLES);

      if (key->ps.mono.point_smoothing)
         NIR_PASS(progress, nir, nir_lower_point_smooth);
   }

   /* This must be before si_nir_lower_resource. */
   if (!sel->screen->info.has_image_opcodes)
      NIR_PASS(progress, nir, ac_nir_lower_image_opcodes);

   /* LLVM does not work well with this, so is handled in llvm backend waterfall. */
   if (sel->screen->use_aco && sel->info.has_non_uniform_tex_access) {
      nir_lower_non_uniform_access_options options = {
         .types = nir_lower_non_uniform_texture_access,
      };
      NIR_PASS(progress, nir, nir_lower_non_uniform_access, &options);
   }

   NIR_PASS(progress, nir, si_nir_lower_resource, shader, args);

   /* VS/TES (or NGG GS) not feeding another geometry stage is the last
    * stage before rasterization.
    */
   bool is_last_vgt_stage =
      (sel->stage == MESA_SHADER_VERTEX ||
       sel->stage == MESA_SHADER_TESS_EVAL ||
       (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)) &&
      !shader->key.ge.as_ls && !shader->key.ge.as_es;

   /* Legacy GS is not last VGT stage because it has GS copy shader. */
   bool is_legacy_gs = sel->stage == MESA_SHADER_GEOMETRY && !key->ge.as_ngg;

   if (is_last_vgt_stage || is_legacy_gs)
      NIR_PASS(progress, nir, si_nir_clamp_vertex_color);

   if (progress) {
      si_nir_opts(sel->screen, nir, true);
      late_opts = true;
      progress = false;
   }

   /* Lower large variables that are always constant with load_constant intrinsics, which
    * get turned into PC-relative loads from a data section next to the shader.
    *
    * Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
    * this should be done after that.
    *
    * The pass crashes if there are dead temps of lowered IO interface types, so remove
    * them first.
    */
   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
   NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);

   /* Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
    * this should be done after that.
    */
   progress |= ac_nir_lower_indirect_derefs(nir, sel->screen->info.gfx_level);

   if (sel->stage == MESA_SHADER_VERTEX)
      NIR_PASS(progress, nir, si_nir_lower_vs_inputs, shader, args);

   progress |= si_lower_io_to_mem(shader, nir, tcs_vgpr_only_inputs);

   if (is_last_vgt_stage) {
      /* Assign param export indices. */
      si_assign_param_offsets(nir, shader);

      /* Assign num of position exports. */
      shader->info.nr_pos_exports = si_get_nr_pos_exports(sel, key);

      if (key->ge.as_ngg) {
         /* Lower last VGT NGG shader stage. */
         si_lower_ngg(shader, nir);
      } else if (sel->stage == MESA_SHADER_VERTEX || sel->stage == MESA_SHADER_TESS_EVAL) {
         /* Lower last VGT none-NGG VS/TES shader stage. */
         unsigned clip_cull_mask =
            (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) |
            sel->info.culldist_mask;

         NIR_PASS_V(nir, ac_nir_lower_legacy_vs,
                    sel->screen->info.gfx_level,
                    clip_cull_mask,
                    shader->info.vs_output_param_offset,
                    shader->info.nr_param_exports,
                    shader->key.ge.mono.u.vs_export_prim_id,
                    !si_shader_uses_streamout(shader),
                    key->ge.opt.kill_pointsize,
                    key->ge.opt.kill_layer,
                    sel->screen->options.vrs2x2);
      }
      progress = true;
   } else if (is_legacy_gs) {
      NIR_PASS_V(nir, ac_nir_lower_legacy_gs, false, sel->screen->use_ngg, output_info);
      progress = true;
   } else if (sel->stage == MESA_SHADER_FRAGMENT && shader->is_monolithic) {
      /* Uniform inlining can eliminate PS inputs, and colormask can remove PS outputs,
       * which can also cause the elimination of PS inputs. Remove holes after removed PS inputs
       * by renumbering them. This can only happen with monolithic PS. Colors are unaffected
       * because they are still represented by nir_intrinsic_load_color0/1.
       */
      NIR_PASS_V(nir, nir_recompute_io_bases, nir_var_shader_in);

      /* Two-side color selection and interpolation: Get the latest shader info because
       * uniform inlining and colormask can fully eliminate color inputs.
       */
      struct si_shader_info info;
      si_nir_scan_shader(sel->screen, nir, &info);

      if (info.colors_read)
         NIR_PASS(progress, nir, si_nir_lower_ps_color_input, &shader->key, &info);

      /* We need to set this early for lowering nir_intrinsic_load_point_coord_maybe_flipped,
       * which can only occur with monolithic PS.
       */
      shader->info.num_ps_inputs = info.num_inputs;
      shader->info.ps_colors_read = info.colors_read;

      /* Bake the PS prolog/epilog state from the key into the shader. */
      ac_nir_lower_ps_options options = {
         .gfx_level = sel->screen->info.gfx_level,
         .family = sel->screen->info.family,
         .use_aco = sel->screen->use_aco,
         .uses_discard = si_shader_uses_discard(shader),
         .alpha_to_coverage_via_mrtz = key->ps.part.epilog.alpha_to_coverage_via_mrtz,
         .dual_src_blend_swizzle = key->ps.part.epilog.dual_src_blend_swizzle,
         .spi_shader_col_format = key->ps.part.epilog.spi_shader_col_format,
         .color_is_int8 = key->ps.part.epilog.color_is_int8,
         .color_is_int10 = key->ps.part.epilog.color_is_int10,
         .clamp_color = key->ps.part.epilog.clamp_color,
         .alpha_to_one = key->ps.part.epilog.alpha_to_one,
         .alpha_func = key->ps.part.epilog.alpha_func,
         .broadcast_last_cbuf = key->ps.part.epilog.last_cbuf,
         .kill_samplemask = key->ps.part.epilog.kill_samplemask,

         .bc_optimize_for_persp = key->ps.part.prolog.bc_optimize_for_persp,
         .bc_optimize_for_linear = key->ps.part.prolog.bc_optimize_for_linear,
         .force_persp_sample_interp = key->ps.part.prolog.force_persp_sample_interp,
         .force_linear_sample_interp = key->ps.part.prolog.force_linear_sample_interp,
         .force_persp_center_interp = key->ps.part.prolog.force_persp_center_interp,
         .force_linear_center_interp = key->ps.part.prolog.force_linear_center_interp,
         .ps_iter_samples = 1 << key->ps.part.prolog.samplemask_log_ps_iter,
      };

      NIR_PASS_V(nir, ac_nir_lower_ps, &options);

      if (key->ps.part.prolog.poly_stipple)
         NIR_PASS_V(nir, si_nir_emit_polygon_stipple, args);

      progress = true;
   }

   NIR_PASS(progress, nir, nir_opt_idiv_const, 8);
   NIR_PASS(progress, nir, nir_lower_idiv,
            &(nir_lower_idiv_options){
               .allow_fp16 = sel->screen->info.gfx_level >= GFX9,
            });

   if (si_should_clear_lds(sel->screen, nir)) {
      const unsigned chunk_size = 16; /* max single store size */
      const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
      NIR_PASS_V(nir, nir_clear_shared_memory, shared_size, chunk_size);
   }

   NIR_PASS(progress, nir, ac_nir_lower_intrinsics_to_args, sel->screen->info.gfx_level,
            si_select_hw_stage(nir->info.stage, key, sel->screen->info.gfx_level),
            &args->ac);
   NIR_PASS(progress, nir, si_nir_lower_abi, shader, args);

   if (progress) {
      si_nir_opts(sel->screen, nir, false);
      progress = false;
      late_opts = true;
   }

   static const nir_opt_offsets_options offset_options = {
      .uniform_max = 0,
      .buffer_max = ~0,
      .shared_max = ~0,
   };
   NIR_PASS_V(nir, nir_opt_offsets, &offset_options);

   if (late_opts)
      si_nir_late_opts(nir);

   /* aco only accept scalar const, must be done after si_nir_late_opts()
    * which may generate vec const.
    */
   if (sel->screen->use_aco)
      NIR_PASS_V(nir, nir_lower_load_const_to_scalar);

   /* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
    * 200 is tuned for Viewperf. It should be done last.
    */
   NIR_PASS_V(nir, nir_group_loads, nir_group_same_resource_only, 200);

   /* Restore the original name that the debug-print path replaced. */
   if (unlikely(original_name)) {
      ralloc_free((void*)nir->info.name);
      nir->info.name = original_name;
   }

   return nir;
}
2504 
/* Rescan the final NIR and fold the results into shader->info.
 *
 * Updates the VMEM usage flags for all stages; for fragment shaders it
 * also refreshes the PS input list, since uniform inlining may have
 * removed inputs relative to the default variant.
 */
void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir)
{
   struct si_shader_info scan;
   si_nir_scan_shader(shader->selector->screen, nir, &scan);

   shader->info.uses_vmem_load_other |= scan.uses_vmem_load_other;
   shader->info.uses_vmem_sampler_or_bvh |= scan.uses_vmem_sampler_or_bvh;

   if (nir->info.stage != MESA_SHADER_FRAGMENT)
      return;

   /* Since uniform inlining can remove PS inputs, set the latest info about PS inputs here. */
   shader->info.num_ps_inputs = scan.num_inputs;
   shader->info.ps_colors_read = scan.colors_read;

   /* A non-monolithic PS doesn't know if back colors are enabled, so copy 2 more. */
   unsigned copy_count = MIN2(scan.num_inputs + 2, SI_NUM_INTERP);
   memcpy(shader->info.ps_inputs, scan.input, copy_count * sizeof(scan.input[0]));
}
2523 
/* Generate code for the hardware VS shader stage to go with a geometry shader */
/* Builds a GS copy shader NIR with ac_nir_create_gs_copy_shader, lowers
 * its ABI, compiles it (LLVM or ACO), uploads the binary, and returns the
 * resulting si_shader. Returns NULL on allocation or compile failure.
 * Ownership of the returned shader passes to the caller.
 */
static struct si_shader *
si_nir_generate_gs_copy_shader(struct si_screen *sscreen,
                               struct ac_llvm_compiler *compiler,
                               struct si_shader *gs_shader,
                               nir_shader *gs_nir,
                               struct util_debug_callback *debug,
                               ac_nir_gs_output_info *output_info)
{
   struct si_shader *shader;
   struct si_shader_selector *gs_selector = gs_shader->selector;
   struct si_shader_info *gsinfo = &gs_selector->info;
   union si_shader_key *gskey = &gs_shader->key;

   shader = CALLOC_STRUCT(si_shader);
   if (!shader)
      return NULL;

   /* We can leave the fence as permanently signaled because the GS copy
    * shader only becomes visible globally after it has been compiled. */
   util_queue_fence_init(&shader->ready);

   shader->selector = gs_selector;
   shader->is_gs_copy_shader = true;
   shader->wave_size = si_determine_wave_size(sscreen, shader);

   /* Mark all param slots unused before assigning them below. */
   STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
          sizeof(shader->info.vs_output_param_offset));

   for (unsigned i = 0; i < gsinfo->num_outputs; i++) {
      unsigned semantic = gsinfo->output_semantic[i];

      /* Skip if no channel writes to stream 0. */
      /* output_streams packs a 2-bit stream index per component; a nonzero
       * field means that component goes to stream 1-3, so the output is
       * skipped only when all four components avoid stream 0.
       */
      if (!nir_slot_is_varying(semantic) ||
          (gsinfo->output_streams[i] & 0x03 &&
           gsinfo->output_streams[i] & 0x0c &&
           gsinfo->output_streams[i] & 0x30 &&
           gsinfo->output_streams[i] & 0xc0))
         continue;

      shader->info.vs_output_param_offset[semantic] = shader->info.nr_param_exports++;
   }

   shader->info.nr_pos_exports = si_get_nr_pos_exports(gs_selector, gskey);

   /* Clip distances not killed by the key, plus all cull distances. */
   unsigned clip_cull_mask =
      (gsinfo->clipdist_mask & ~gskey->ge.opt.kill_clip_distances) | gsinfo->culldist_mask;

   nir_shader *nir =
      ac_nir_create_gs_copy_shader(gs_nir,
                                   sscreen->info.gfx_level,
                                   clip_cull_mask,
                                   shader->info.vs_output_param_offset,
                                   shader->info.nr_param_exports,
                                   !si_shader_uses_streamout(gs_shader),
                                   gskey->ge.opt.kill_pointsize,
                                   gskey->ge.opt.kill_layer,
                                   sscreen->options.vrs2x2,
                                   output_info);

   struct si_shader_args args;
   si_init_shader_args(shader, &args);

   /* The copy shader runs as a HW vertex shader. */
   NIR_PASS_V(nir, ac_nir_lower_intrinsics_to_args, sscreen->info.gfx_level, AC_HW_VERTEX_SHADER, &args.ac);
   NIR_PASS_V(nir, si_nir_lower_abi, shader, &args);

   si_nir_opts(gs_selector->screen, nir, false);

   /* aco only accept scalar const */
   if (sscreen->use_aco)
      NIR_PASS_V(nir, nir_lower_load_const_to_scalar);

   if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY, SI_DUMP_NIR)) {
      fprintf(stderr, "GS Copy Shader:\n");
      nir_print_shader(nir, stderr);
   }

   /* Compile with LLVM unless ACO is requested (or LLVM is unavailable). */
   bool ok =
#if LLVM_AVAILABLE
      !sscreen->use_aco ? si_llvm_compile_shader(sscreen, compiler, shader, &args, debug, nir) :
#endif
      si_aco_compile_shader(shader, &args, nir, debug);

   if (ok) {
      assert(!shader->config.scratch_bytes_per_wave);
      ok = si_shader_binary_upload(sscreen, shader, 0);
      si_shader_dump(sscreen, shader, debug, stderr, true);
   }
   ralloc_free(nir);

   if (!ok) {
      FREE(shader);
      shader = NULL;
   } else {
      si_fix_resource_usage(sscreen, shader);
   }
   return shader;
}
2624 
/* Backing storage for ac_nir_gs_output_info: per-slot stream assignments
 * and component usage masks, split into 32-bit varying slots (indexed by
 * semantic < VARYING_SLOT_VAR0_16BIT) and the lo/hi halves of 16-bit
 * varying slots (indexed by semantic - VARYING_SLOT_VAR0_16BIT).
 */
struct si_gs_output_info {
   /* 2-bit stream index per component, packed per slot. */
   uint8_t streams[64];
   uint8_t streams_16bit_lo[16];
   uint8_t streams_16bit_hi[16];

   /* Component write mask per slot. */
   uint8_t usage_mask[64];
   uint8_t usage_mask_16bit_lo[16];
   uint8_t usage_mask_16bit_hi[16];

   /* Points into the arrays above (see si_init_gs_output_info). */
   ac_nir_gs_output_info info;
};
2636 
/* Fill a si_gs_output_info from the scanned GS shader info and wire up the
 * embedded ac_nir_gs_output_info pointers to the struct's own arrays.
 *
 * NOTE(review): only slots present in info->output_semantic are written;
 * assumes out_info was zero-initialized by the caller — confirm at call
 * sites.
 */
static void
si_init_gs_output_info(struct si_shader_info *info, struct si_gs_output_info *out_info)
{
   for (int i = 0; i < info->num_outputs; i++) {
      unsigned slot = info->output_semantic[i];
      if (slot < VARYING_SLOT_VAR0_16BIT) {
         out_info->streams[slot] = info->output_streams[i];
         out_info->usage_mask[slot] = info->output_usagemask[i];
      } else {
         unsigned index = slot - VARYING_SLOT_VAR0_16BIT;
         /* TODO: 16bit need separated fields for lo/hi part. */
         out_info->streams_16bit_lo[index] = info->output_streams[i];
         out_info->streams_16bit_hi[index] = info->output_streams[i];
         out_info->usage_mask_16bit_lo[index] = info->output_usagemask[i];
         out_info->usage_mask_16bit_hi[index] = info->output_usagemask[i];
      }
   }

   /* Point the ac-level struct at our backing arrays. */
   ac_nir_gs_output_info *ac_info = &out_info->info;

   ac_info->streams = out_info->streams;
   ac_info->streams_16bit_lo = out_info->streams_16bit_lo;
   ac_info->streams_16bit_hi = out_info->streams_16bit_hi;

   ac_info->usage_mask = out_info->usage_mask;
   ac_info->usage_mask_16bit_lo = out_info->usage_mask_16bit_lo;
   ac_info->usage_mask_16bit_hi = out_info->usage_mask_16bit_hi;

   /* TODO: construct 16bit slot per component store type. */
   ac_info->types_16bit_lo = ac_info->types_16bit_hi = NULL;
}
2668 
/* Patch SPI_PS_INPUT_ENA after compilation to satisfy the shader key and
 * hardware constraints: force per-sample or per-center interpolation when
 * the prolog key demands it, and enforce the hardware rules that POS_W
 * needs a perspective weight and at least one weight pair must be enabled.
 */
static void si_fixup_spi_ps_input_config(struct si_shader *shader)
{
   const union si_shader_key *key = &shader->key;

   /* Enable POS_FIXED_PT if polygon stippling is enabled. */
   if (key->ps.part.prolog.poly_stipple)
      shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);

   /* Set up the enable bits for per-sample shading if needed. */
   /* Replace center/centroid perspective weights with the sample weight. */
   if (key->ps.part.prolog.force_persp_sample_interp &&
       (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
   }
   /* Same replacement for the linear weights. */
   if (key->ps.part.prolog.force_linear_sample_interp &&
       (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
   }
   /* Conversely, force center interpolation when requested. */
   if (key->ps.part.prolog.force_persp_center_interp &&
       (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
   }
   if (key->ps.part.prolog.force_linear_center_interp &&
       (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
        G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
   }

   /* POS_W_FLOAT requires that one of the perspective weights is enabled. */
   if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
       !(shader->config.spi_ps_input_ena & 0xf)) {
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
   }

   /* At least one pair of interpolation weights must be enabled. */
   if (!(shader->config.spi_ps_input_ena & 0x7f))
      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);

   /* Samplemask fixup requires the sample ID. */
   if (key->ps.part.prolog.samplemask_log_ps_iter)
      shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
}
2721 
/* Compute the initial SPI_PS_INPUT_ENA/ADDR config from the shader info and
 * key. For monolithic shaders the fixups are applied immediately; for
 * multi-part shaders the fixups happen later when the parts are combined.
 */
static void
si_set_spi_ps_input_config(struct si_shader *shader)
{
   const struct si_shader_selector *sel = shader->selector;
   const struct si_shader_info *info = &sel->info;
   const union si_shader_key *key = &shader->key;

   /* TODO: This should be determined from the final NIR instead of the input NIR,
    * otherwise LLVM will have a performance advantage here because it determines
    * VGPR inputs for each shader variant after LLVM optimizations.
    */
   shader->config.spi_ps_input_ena =
      S_0286CC_PERSP_CENTER_ENA(info->uses_persp_center) |
      S_0286CC_PERSP_CENTROID_ENA(info->uses_persp_centroid) |
      S_0286CC_PERSP_SAMPLE_ENA(info->uses_persp_sample) |
      S_0286CC_LINEAR_CENTER_ENA(info->uses_linear_center) |
      S_0286CC_LINEAR_CENTROID_ENA(info->uses_linear_centroid) |
      S_0286CC_LINEAR_SAMPLE_ENA(info->uses_linear_sample) |
      S_0286CC_FRONT_FACE_ENA(info->uses_frontface && !key->ps.opt.force_front_face_input) |
      S_0286CC_SAMPLE_COVERAGE_ENA(info->reads_samplemask) |
      S_0286CC_ANCILLARY_ENA(info->uses_sampleid || info->uses_layer_id);

   /* Enable POS_{X,Y,Z,W}_FLOAT per component read via frag coord or sample pos. */
   uint8_t mask = info->reads_frag_coord_mask | info->reads_sample_pos_mask;
   u_foreach_bit(i, mask) {
      shader->config.spi_ps_input_ena |= S_0286CC_POS_X_FLOAT_ENA(1) << i;
   }

   /* Two-side color selection in the prolog needs the front-face input. */
   if (key->ps.part.prolog.color_two_side)
      shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);

   /* INTERP_MODE_COLOR, same as SMOOTH if flat shading is disabled. */
   if (info->uses_interp_color && !key->ps.part.prolog.flatshade_colors) {
      shader->config.spi_ps_input_ena |=
         S_0286CC_PERSP_SAMPLE_ENA(info->uses_persp_sample_color) |
         S_0286CC_PERSP_CENTER_ENA(info->uses_persp_center_color) |
         S_0286CC_PERSP_CENTROID_ENA(info->uses_persp_centroid_color);
   }

   /* nir_lower_poly_line_smooth use nir_load_sample_mask_in */
   if (key->ps.mono.poly_line_smoothing)
      shader->config.spi_ps_input_ena |= S_0286CC_SAMPLE_COVERAGE_ENA(1);

   /* nir_lower_point_smooth use nir_load_point_coord_maybe_flipped which is lowered
    * to nir_load_barycentric_pixel and nir_load_interpolated_input.
    */
   if (key->ps.mono.point_smoothing)
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);

   /* See fetch_framebuffer() for used args when fbfetch output. */
   if (info->base.fs.uses_fbfetch_output) {
      shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);

      if (key->ps.mono.fbfetch_layered || key->ps.mono.fbfetch_msaa)
         shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
   }

   if (shader->is_monolithic) {
      si_fixup_spi_ps_input_config(shader);
      shader->config.spi_ps_input_addr = shader->config.spi_ps_input_ena;
   } else {
      /* Part mode will call si_fixup_spi_ps_input_config() when combining multi
       * shader part in si_shader_select_ps_parts().
       *
       * Reserve register locations for VGPR inputs the PS prolog may need.
       */
      shader->config.spi_ps_input_addr =
         shader->config.spi_ps_input_ena |
         SI_SPI_PS_INPUT_ADDR_FOR_PROLOG;
   }
}
2792 
2793 static void
debug_message_stderr(void * data,unsigned * id,enum util_debug_type ptype,const char * fmt,va_list args)2794 debug_message_stderr(void *data, unsigned *id, enum util_debug_type ptype,
2795                       const char *fmt, va_list args)
2796 {
2797    vfprintf(stderr, fmt, args);
2798    fprintf(stderr, "\n");
2799 }
2800 
/* Compile the main part of a shader variant (and, for legacy GS, its copy
 * shader), then derive register config and input counts from the result.
 *
 * Returns false on compile failure. The NIR obtained from si_get_nir_shader()
 * is freed on all exit paths when free_nir was set.
 */
bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                       struct si_shader *shader, struct util_debug_callback *debug)
{
   bool ret = true;
   struct si_shader_selector *sel = shader->selector;

   /* ACO need spi_ps_input in advance to init args and used in compiler. */
   if (sel->stage == MESA_SHADER_FRAGMENT && sscreen->use_aco)
      si_set_spi_ps_input_config(shader);

   /* We need this info only when legacy GS. */
   struct si_gs_output_info legacy_gs_output_info;
   if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
      memset(&legacy_gs_output_info, 0, sizeof(legacy_gs_output_info));
      si_init_gs_output_info(&sel->info, &legacy_gs_output_info);
   }

   struct si_shader_args args;
   si_init_shader_args(shader, &args);

   /* NOTE(review): &legacy_gs_output_info.info is passed unconditionally;
    * presumably only dereferenced for legacy GS — verify in si_get_nir_shader.
    */
   bool free_nir;
   struct nir_shader *nir =
      si_get_nir_shader(shader, &args, &free_nir, 0, &legacy_gs_output_info.info);

   /* Dump NIR before doing NIR->LLVM conversion in case the
    * conversion fails. */
   if (si_can_dump_shader(sscreen, sel->stage, SI_DUMP_NIR)) {
      nir_print_shader(nir, stderr);

      if (nir->xfb_info)
         nir_print_xfb_info(nir->xfb_info, stderr);
   }

   /* Initialize vs_output_ps_input_cntl to default. */
   for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++)
      shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED;
   shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0;

   si_update_shader_binary_info(shader, nir);

   /* uses_instanceid may be set by si_nir_lower_vs_inputs(). */
   shader->info.uses_instanceid |= sel->info.uses_instanceid;
   shader->info.private_mem_vgprs = DIV_ROUND_UP(nir->scratch_size, 4);

   /* Set the FP ALU behavior. */
   /* By default, we disable denormals for FP32 and enable them for FP16 and FP64
    * for performance and correctness reasons. FP32 denormals can't be enabled because
    * they break output modifiers and v_mad_f32 and are very slow on GFX6-7.
    *
    * float_controls_execution_mode defines the set of valid behaviors. Contradicting flags
    * can be set simultaneously, which means we are allowed to choose, but not really because
    * some options cause GLCTS failures.
    */
   unsigned float_mode = V_00B028_FP_16_64_DENORMS;

   /* RTZ is selected only when RTE is not also requested (RTE wins ties). */
   if (!(nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) &&
       nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
      float_mode |= V_00B028_FP_32_ROUND_TOWARDS_ZERO;

   if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
                                                    FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64)) &&
       nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
                                                  FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
      float_mode |= V_00B028_FP_16_64_ROUND_TOWARDS_ZERO;

   /* FP16/FP64 denorm flush is honored only if preserve was not also requested. */
   if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_PRESERVE_FP16 |
                                                    FLOAT_CONTROLS_DENORM_PRESERVE_FP64)) &&
       nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
                                                  FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64))
      float_mode &= ~V_00B028_FP_16_64_DENORMS;

   /* Compile with LLVM unless ACO was requested (or LLVM isn't built in). */
   ret =
#if LLVM_AVAILABLE
      !sscreen->use_aco ? si_llvm_compile_shader(sscreen, compiler, shader, &args, debug, nir) :
#endif
      si_aco_compile_shader(shader, &args, nir, debug);

   if (!ret)
      goto out;

   shader->config.float_mode = float_mode;

   /* The GS copy shader is compiled next. */
   if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
      shader->gs_copy_shader =
         si_nir_generate_gs_copy_shader(sscreen, compiler, shader, nir, debug,
                                        &legacy_gs_output_info.info);
      if (!shader->gs_copy_shader) {
         fprintf(stderr, "radeonsi: can't create GS copy shader\n");
         ret = false;
         goto out;
      }
   }

   /* Compute vs_output_ps_input_cntl for the last pre-rasterization stage
    * (i.e. not when this part feeds another HW stage via LS/ES).
    */
   if ((sel->stage == MESA_SHADER_VERTEX ||
        sel->stage == MESA_SHADER_TESS_EVAL ||
        sel->stage == MESA_SHADER_GEOMETRY) &&
       !shader->key.ge.as_ls && !shader->key.ge.as_es) {
      uint8_t *vs_output_param_offset = shader->info.vs_output_param_offset;

      /* For legacy GS, the copy shader performs the param exports. */
      if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg)
         vs_output_param_offset = shader->gs_copy_shader->info.vs_output_param_offset;

      /* We must use the original shader info before the removal of duplicated shader outputs. */
      /* VS and TES should also set primitive ID output if it's used. */
      unsigned num_outputs_with_prim_id = sel->info.num_outputs +
                                          shader->key.ge.mono.u.vs_export_prim_id;

      for (unsigned i = 0; i < num_outputs_with_prim_id; i++) {
         unsigned semantic = sel->info.output_semantic[i];
         unsigned offset = vs_output_param_offset[semantic];
         unsigned ps_input_cntl;

         if (offset <= AC_EXP_PARAM_OFFSET_31) {
            /* The input is loaded from parameter memory. */
            ps_input_cntl = S_028644_OFFSET(offset);
         } else {
            /* The input is a DEFAULT_VAL constant. */
            assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
                   offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
            offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;

            /* OFFSET=0x20 means that DEFAULT_VAL is used. */
            ps_input_cntl = S_028644_OFFSET(0x20) |
                            S_028644_DEFAULT_VAL(offset);
         }

         shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl;
      }
   }

   /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */
   if (sel->stage == MESA_SHADER_COMPUTE) {
      unsigned max_vgprs =
         sscreen->info.num_physical_wave64_vgprs_per_simd * (shader->wave_size == 32 ? 2 : 1);
      unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
      unsigned max_sgprs_per_wave = 128;
      unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
      unsigned threads_per_tg = si_get_max_workgroup_size(shader);
      unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, shader->wave_size);
      unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);

      max_vgprs = max_vgprs / waves_per_simd;
      max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);

      if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
         fprintf(stderr,
                 "LLVM failed to compile a shader correctly: "
                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
                 shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);

         /* Just terminate the process, because dependent
          * shaders can hang due to bad input data, but use
          * the env var to allow shader-db to work.
          */
         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
            abort();
      }
   }

   /* Add/remove the scratch offset to/from input SGPRs. */
   if (sel->screen->info.gfx_level < GFX11 &&
       (sel->screen->info.family < CHIP_GFX940 || sel->screen->info.has_graphics) &&
       !si_is_merged_shader(shader)) {
      if (sscreen->use_aco) {
         /* When aco scratch_offset arg is added explicitly at the beginning.
          * After compile if no scratch used, reduce the input sgpr count.
          */
         if (!shader->config.scratch_bytes_per_wave)
            shader->info.num_input_sgprs--;
      } else {
         /* scratch_offset arg is added by llvm implicitly */
         if (shader->info.num_input_sgprs)
            shader->info.num_input_sgprs++;
      }
   }

   /* Calculate the number of fragment input VGPRs. */
   if (sel->stage == MESA_SHADER_FRAGMENT) {
      shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(
         &shader->config, &shader->info.num_fragcoord_components);
   }

   si_calculate_max_simd_waves(shader);

   /* Dump stats either to stderr (when shader dumping is on) or to the
    * caller's debug callback (for shader-db).
    */
   if (si_can_dump_shader(sscreen, sel->stage, SI_DUMP_STATS)) {
      struct util_debug_callback out_stderr = {
         .debug_message = debug_message_stderr,
      };

      si_shader_dump_stats_for_shader_db(sscreen, shader, &out_stderr);
   } else {
      si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
   }

out:
   if (free_nir)
      ralloc_free(nir);

   return ret;
}
3003 
3004 /**
3005  * Create, compile and return a shader part (prolog or epilog).
3006  *
3007  * \param sscreen  screen
3008  * \param list     list of shader parts of the same category
3009  * \param type     shader type
3010  * \param key      shader part key
3011  * \param prolog   whether the part being requested is a prolog
3012  * \param tm       LLVM target machine
3013  * \param debug    debug callback
3014  * \return         non-NULL on success
3015  */
3016 static struct si_shader_part *
si_get_shader_part(struct si_screen * sscreen,struct si_shader_part ** list,gl_shader_stage stage,bool prolog,union si_shader_part_key * key,struct ac_llvm_compiler * compiler,struct util_debug_callback * debug,const char * name)3017 si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
3018                    gl_shader_stage stage, bool prolog, union si_shader_part_key *key,
3019                    struct ac_llvm_compiler *compiler, struct util_debug_callback *debug,
3020                    const char *name)
3021 {
3022    struct si_shader_part *result;
3023 
3024    simple_mtx_lock(&sscreen->shader_parts_mutex);
3025 
3026    /* Find existing. */
3027    for (result = *list; result; result = result->next) {
3028       if (memcmp(&result->key, key, sizeof(*key)) == 0) {
3029          simple_mtx_unlock(&sscreen->shader_parts_mutex);
3030          return result;
3031       }
3032    }
3033 
3034    /* Compile a new one. */
3035    result = CALLOC_STRUCT(si_shader_part);
3036    result->key = *key;
3037 
3038    bool ok =
3039 #if LLVM_AVAILABLE
3040       !sscreen->use_aco ? si_llvm_build_shader_part(sscreen, stage, prolog, compiler, debug, name, result) :
3041 #endif
3042       si_aco_build_shader_part(sscreen, stage, prolog, debug, name, result);
3043 
3044    if (ok) {
3045       result->next = *list;
3046       *list = result;
3047    } else {
3048       FREE(result);
3049       result = NULL;
3050    }
3051 
3052    simple_mtx_unlock(&sscreen->shader_parts_mutex);
3053    return result;
3054 }
3055 
si_get_tcs_epilog_key(struct si_shader * shader,union si_shader_part_key * key)3056 void si_get_tcs_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
3057 {
3058    memset(key, 0, sizeof(*key));
3059    key->tcs_epilog.wave32 = shader->wave_size == 32;
3060    key->tcs_epilog.states = shader->key.ge.part.tcs.epilog;
3061 
3062    /* If output patches are wholly in one wave, we don't need a barrier. */
3063    key->tcs_epilog.noop_s_barrier =
3064       shader->wave_size % shader->selector->info.base.tess.tcs_vertices_out == 0;
3065 }
3066 
3067 /**
3068  * Select and compile (or reuse) TCS parts (epilog).
3069  */
si_shader_select_tcs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3070 static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3071                                        struct si_shader *shader, struct util_debug_callback *debug)
3072 {
3073    if (sscreen->info.gfx_level >= GFX9)
3074       shader->previous_stage = shader->key.ge.part.tcs.ls->main_shader_part_ls;
3075 
3076    /* Get the epilog. */
3077    union si_shader_part_key epilog_key;
3078    si_get_tcs_epilog_key(shader, &epilog_key);
3079 
3080    shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, MESA_SHADER_TESS_CTRL, false,
3081                                        &epilog_key, compiler, debug,
3082                                        "Tessellation Control Shader Epilog");
3083    return shader->epilog != NULL;
3084 }
3085 
3086 /**
3087  * Select and compile (or reuse) GS parts (prolog).
3088  */
si_shader_select_gs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3089 static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3090                                       struct si_shader *shader, struct util_debug_callback *debug)
3091 {
3092    if (sscreen->info.gfx_level >= GFX9) {
3093       if (shader->key.ge.as_ngg)
3094          shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_ngg_es;
3095       else
3096          shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_es;
3097    }
3098 
3099    return true;
3100 }
3101 
3102 /**
3103  * Compute the PS prolog key, which contains all the information needed to
3104  * build the PS prolog function, and set related bits in shader->config.
3105  */
si_get_ps_prolog_key(struct si_shader * shader,union si_shader_part_key * key)3106 void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key)
3107 {
3108    struct si_shader_info *info = &shader->selector->info;
3109 
3110    memset(key, 0, sizeof(*key));
3111    key->ps_prolog.states = shader->key.ps.part.prolog;
3112    key->ps_prolog.wave32 = shader->wave_size == 32;
3113    key->ps_prolog.colors_read = shader->info.ps_colors_read;
3114    key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
3115    key->ps_prolog.wqm =
3116       info->base.fs.needs_quad_helper_invocations &&
3117       (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
3118        key->ps_prolog.states.force_linear_sample_interp ||
3119        key->ps_prolog.states.force_persp_center_interp ||
3120        key->ps_prolog.states.force_linear_center_interp ||
3121        key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear);
3122    key->ps_prolog.num_fragcoord_components = shader->info.num_fragcoord_components;
3123 
3124    if (shader->key.ps.part.prolog.poly_stipple)
3125       shader->info.uses_vmem_load_other = true;
3126 
3127    if (shader->info.ps_colors_read) {
3128       uint8_t *color = shader->selector->info.color_attr_index;
3129 
3130       if (shader->key.ps.part.prolog.color_two_side) {
3131          /* BCOLORs are stored after the last input. */
3132          key->ps_prolog.num_interp_inputs = shader->info.num_ps_inputs;
3133          shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
3134       }
3135 
3136       for (unsigned i = 0; i < 2; i++) {
3137          unsigned interp = info->color_interpolate[i];
3138          unsigned location = info->color_interpolate_loc[i];
3139 
3140          if (!(shader->info.ps_colors_read & (0xf << i * 4)))
3141             continue;
3142 
3143          key->ps_prolog.color_attr_index[i] = color[i];
3144 
3145          if (shader->key.ps.part.prolog.flatshade_colors && interp == INTERP_MODE_COLOR)
3146             interp = INTERP_MODE_FLAT;
3147 
3148          switch (interp) {
3149          case INTERP_MODE_FLAT:
3150             key->ps_prolog.color_interp_vgpr_index[i] = -1;
3151             break;
3152          case INTERP_MODE_SMOOTH:
3153          case INTERP_MODE_COLOR:
3154             /* Force the interpolation location for colors here. */
3155             if (shader->key.ps.part.prolog.force_persp_sample_interp)
3156                location = TGSI_INTERPOLATE_LOC_SAMPLE;
3157             if (shader->key.ps.part.prolog.force_persp_center_interp)
3158                location = TGSI_INTERPOLATE_LOC_CENTER;
3159 
3160             switch (location) {
3161             case TGSI_INTERPOLATE_LOC_SAMPLE:
3162                key->ps_prolog.color_interp_vgpr_index[i] = 0;
3163                shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
3164                break;
3165             case TGSI_INTERPOLATE_LOC_CENTER:
3166                key->ps_prolog.color_interp_vgpr_index[i] = 2;
3167                shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
3168                break;
3169             case TGSI_INTERPOLATE_LOC_CENTROID:
3170                key->ps_prolog.color_interp_vgpr_index[i] = 4;
3171                shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
3172                break;
3173             default:
3174                assert(0);
3175             }
3176             break;
3177          case INTERP_MODE_NOPERSPECTIVE:
3178             /* Force the interpolation location for colors here. */
3179             if (shader->key.ps.part.prolog.force_linear_sample_interp)
3180                location = TGSI_INTERPOLATE_LOC_SAMPLE;
3181             if (shader->key.ps.part.prolog.force_linear_center_interp)
3182                location = TGSI_INTERPOLATE_LOC_CENTER;
3183 
3184             /* The VGPR assignment for non-monolithic shaders
3185              * works because InitialPSInputAddr is set on the
3186              * main shader and PERSP_PULL_MODEL is never used.
3187              */
3188             switch (location) {
3189             case TGSI_INTERPOLATE_LOC_SAMPLE:
3190                key->ps_prolog.color_interp_vgpr_index[i] = 6;
3191                shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
3192                break;
3193             case TGSI_INTERPOLATE_LOC_CENTER:
3194                key->ps_prolog.color_interp_vgpr_index[i] = 8;
3195                shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
3196                break;
3197             case TGSI_INTERPOLATE_LOC_CENTROID:
3198                key->ps_prolog.color_interp_vgpr_index[i] = 10;
3199                shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
3200                break;
3201             default:
3202                assert(0);
3203             }
3204             break;
3205          default:
3206             assert(0);
3207          }
3208       }
3209    }
3210 }
3211 
3212 /**
3213  * Check whether a PS prolog is required based on the key.
3214  */
si_need_ps_prolog(const union si_shader_part_key * key)3215 bool si_need_ps_prolog(const union si_shader_part_key *key)
3216 {
3217    return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
3218           key->ps_prolog.states.force_linear_sample_interp ||
3219           key->ps_prolog.states.force_persp_center_interp ||
3220           key->ps_prolog.states.force_linear_center_interp ||
3221           key->ps_prolog.states.bc_optimize_for_persp ||
3222           key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
3223           key->ps_prolog.states.samplemask_log_ps_iter;
3224 }
3225 
3226 /**
3227  * Compute the PS epilog key, which contains all the information needed to
3228  * build the PS epilog function.
3229  */
si_get_ps_epilog_key(struct si_shader * shader,union si_shader_part_key * key)3230 void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
3231 {
3232    struct si_shader_info *info = &shader->selector->info;
3233    memset(key, 0, sizeof(*key));
3234    key->ps_epilog.wave32 = shader->wave_size == 32;
3235    key->ps_epilog.uses_discard = si_shader_uses_discard(shader);
3236    key->ps_epilog.colors_written = info->colors_written;
3237    key->ps_epilog.color_types = info->output_color_types;
3238    key->ps_epilog.writes_z = info->writes_z;
3239    key->ps_epilog.writes_stencil = info->writes_stencil;
3240    key->ps_epilog.writes_samplemask = info->writes_samplemask &&
3241                                       !shader->key.ps.part.epilog.kill_samplemask;
3242    key->ps_epilog.states = shader->key.ps.part.epilog;
3243 }
3244 
3245 /**
3246  * Select and compile (or reuse) pixel shader parts (prolog & epilog).
3247  */
si_shader_select_ps_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3248 static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3249                                       struct si_shader *shader, struct util_debug_callback *debug)
3250 {
3251    union si_shader_part_key prolog_key;
3252    union si_shader_part_key epilog_key;
3253 
3254    /* Get the prolog. */
3255    si_get_ps_prolog_key(shader, &prolog_key);
3256 
3257    /* The prolog is a no-op if these aren't set. */
3258    if (si_need_ps_prolog(&prolog_key)) {
3259       shader->prolog =
3260          si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key,
3261                             compiler, debug, "Fragment Shader Prolog");
3262       if (!shader->prolog)
3263          return false;
3264    }
3265 
3266    /* Get the epilog. */
3267    si_get_ps_epilog_key(shader, &epilog_key);
3268 
3269    shader->epilog =
3270       si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key,
3271                          compiler, debug, "Fragment Shader Epilog");
3272    if (!shader->epilog)
3273       return false;
3274 
3275    si_fixup_spi_ps_input_config(shader);
3276 
3277    /* Make sure spi_ps_input_addr bits is superset of spi_ps_input_ena. */
3278    unsigned spi_ps_input_ena = shader->config.spi_ps_input_ena;
3279    unsigned spi_ps_input_addr = shader->config.spi_ps_input_addr;
3280    assert((spi_ps_input_ena & spi_ps_input_addr) == spi_ps_input_ena);
3281 
3282    return true;
3283 }
3284 
/* Workaround for an SPI barrier management bug on Bonaire/Kabini.
 *
 * NOTE: the early return below intentionally disables the workaround (the
 * conditions that required it no longer occur); the dead code is kept for
 * reference.
 */
void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
{
   /* If tessellation is all offchip and on-chip GS isn't used, this
    * workaround is not needed.
    */
   return;

   /* SPI barrier management bug:
    *   Make sure we have at least 4k of LDS in use to avoid the bug.
    *   It applies to workgroup sizes of more than one wavefront.
    */
   if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
      *lds_size = MAX2(*lds_size, 8);
}
3299 
si_fix_resource_usage(struct si_screen * sscreen,struct si_shader * shader)3300 static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
3301 {
3302    unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
3303 
3304    shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
3305 
3306    if (shader->selector->stage == MESA_SHADER_COMPUTE &&
3307        si_get_max_workgroup_size(shader) > shader->wave_size) {
3308       si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
3309    }
3310 }
3311 
si_create_shader_variant(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3312 bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3313                               struct si_shader *shader, struct util_debug_callback *debug)
3314 {
3315    struct si_shader_selector *sel = shader->selector;
3316    struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
3317 
3318    if (sel->stage == MESA_SHADER_FRAGMENT) {
3319       shader->ps.writes_samplemask = sel->info.writes_samplemask &&
3320                                      !shader->key.ps.part.epilog.kill_samplemask;
3321    }
3322 
3323    /* LS, ES, VS are compiled on demand if the main part hasn't been
3324     * compiled for that stage.
3325     *
3326     * GS are compiled on demand if the main part hasn't been compiled
3327     * for the chosen NGG-ness.
3328     *
3329     * Vertex shaders are compiled on demand when a vertex fetch
3330     * workaround must be applied.
3331     */
3332    if (shader->is_monolithic) {
3333       /* Monolithic shader (compiled as a whole, has many variants,
3334        * may take a long time to compile).
3335        */
3336       if (!si_compile_shader(sscreen, compiler, shader, debug))
3337          return false;
3338    } else {
3339       /* The shader consists of several parts:
3340        *
3341        * - the middle part is the user shader, it has 1 variant only
3342        *   and it was compiled during the creation of the shader
3343        *   selector
3344        * - the prolog part is inserted at the beginning
3345        * - the epilog part is inserted at the end
3346        *
3347        * The prolog and epilog have many (but simple) variants.
3348        *
3349        * Starting with gfx9, geometry and tessellation control
3350        * shaders also contain the prolog and user shader parts of
3351        * the previous shader stage.
3352        */
3353 
3354       if (!mainp)
3355          return false;
3356 
3357       /* Copy the compiled shader data over. */
3358       shader->is_binary_shared = true;
3359       shader->binary = mainp->binary;
3360       shader->config = mainp->config;
3361       shader->info = mainp->info;
3362 
3363       /* Select prologs and/or epilogs. */
3364       switch (sel->stage) {
3365       case MESA_SHADER_TESS_CTRL:
3366          if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
3367             return false;
3368          break;
3369       case MESA_SHADER_GEOMETRY:
3370          if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
3371             return false;
3372 
3373          /* Clone the GS copy shader for the shader variant.
3374           * We can't just copy the pointer because we change the pm4 state and
3375           * si_shader_selector::gs_copy_shader must be immutable because it's shared
3376           * by multiple contexts.
3377           */
3378          if (!shader->key.ge.as_ngg) {
3379             assert(sel->main_shader_part == mainp);
3380             assert(sel->main_shader_part->gs_copy_shader);
3381             assert(sel->main_shader_part->gs_copy_shader->bo);
3382             assert(!sel->main_shader_part->gs_copy_shader->previous_stage_sel);
3383             assert(!sel->main_shader_part->gs_copy_shader->scratch_bo);
3384 
3385             shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
3386             memcpy(shader->gs_copy_shader, sel->main_shader_part->gs_copy_shader,
3387                    sizeof(*shader->gs_copy_shader));
3388             /* Increase the reference count. */
3389             pipe_reference(NULL, &shader->gs_copy_shader->bo->b.b.reference);
3390             /* Initialize some fields differently. */
3391             shader->gs_copy_shader->shader_log = NULL;
3392             shader->gs_copy_shader->is_binary_shared = true;
3393             util_queue_fence_init(&shader->gs_copy_shader->ready);
3394          }
3395          break;
3396       case MESA_SHADER_FRAGMENT:
3397          if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
3398             return false;
3399 
3400          /* Make sure we have at least as many VGPRs as there
3401           * are allocated inputs.
3402           */
3403          shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
3404          break;
3405       default:;
3406       }
3407 
3408       assert(shader->wave_size == mainp->wave_size);
3409       assert(!shader->previous_stage || shader->wave_size == shader->previous_stage->wave_size);
3410 
3411       /* Update SGPR and VGPR counts. */
3412       if (shader->prolog) {
3413          shader->config.num_sgprs =
3414             MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs);
3415          shader->config.num_vgprs =
3416             MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs);
3417       }
3418       if (shader->previous_stage) {
3419          shader->config.num_sgprs =
3420             MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
3421          shader->config.num_vgprs =
3422             MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
3423          shader->config.spilled_sgprs =
3424             MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
3425          shader->config.spilled_vgprs =
3426             MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
3427          shader->info.private_mem_vgprs =
3428             MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
3429          shader->config.scratch_bytes_per_wave =
3430             MAX2(shader->config.scratch_bytes_per_wave,
3431                  shader->previous_stage->config.scratch_bytes_per_wave);
3432          shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;
3433          shader->info.uses_vmem_load_other |= shader->previous_stage->info.uses_vmem_load_other;
3434          shader->info.uses_vmem_sampler_or_bvh |= shader->previous_stage->info.uses_vmem_sampler_or_bvh;
3435       }
3436       if (shader->epilog) {
3437          shader->config.num_sgprs =
3438             MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs);
3439          shader->config.num_vgprs =
3440             MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs);
3441       }
3442       si_calculate_max_simd_waves(shader);
3443    }
3444 
3445    if (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
3446       assert(!shader->key.ge.as_es && !shader->key.ge.as_ls);
3447       if (!gfx10_ngg_calculate_subgroup_info(shader)) {
3448          fprintf(stderr, "Failed to compute subgroup info\n");
3449          return false;
3450       }
3451    } else if (sscreen->info.gfx_level >= GFX9 && sel->stage == MESA_SHADER_GEOMETRY) {
3452       gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
3453    }
3454 
3455    shader->uses_vs_state_provoking_vertex =
3456       sscreen->use_ngg &&
3457       /* Used to convert triangle strips from GS to triangles. */
3458       ((sel->stage == MESA_SHADER_GEOMETRY &&
3459         util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) ||
3460        (sel->stage == MESA_SHADER_VERTEX &&
3461         /* Used to export PrimitiveID from the correct vertex. */
3462         shader->key.ge.mono.u.vs_export_prim_id));
3463 
3464    shader->uses_gs_state_outprim = sscreen->use_ngg &&
3465                                    /* Only used by streamout and the PrimID export in vertex
3466                                     * shaders. */
3467                                    sel->stage == MESA_SHADER_VERTEX &&
3468                                    (si_shader_uses_streamout(shader) ||
3469                                     shader->uses_vs_state_provoking_vertex);
3470 
3471    if (sel->stage == MESA_SHADER_VERTEX) {
3472       shader->uses_base_instance = sel->info.uses_base_instance ||
3473                                    shader->key.ge.mono.instance_divisor_is_one ||
3474                                    shader->key.ge.mono.instance_divisor_is_fetched;
3475    } else if (sel->stage == MESA_SHADER_TESS_CTRL) {
3476       shader->uses_base_instance = shader->previous_stage_sel &&
3477                                    (shader->previous_stage_sel->info.uses_base_instance ||
3478                                     shader->key.ge.mono.instance_divisor_is_one ||
3479                                     shader->key.ge.mono.instance_divisor_is_fetched);
3480    } else if (sel->stage == MESA_SHADER_GEOMETRY) {
3481       shader->uses_base_instance = shader->previous_stage_sel &&
3482                                    (shader->previous_stage_sel->info.uses_base_instance ||
3483                                     shader->key.ge.mono.instance_divisor_is_one ||
3484                                     shader->key.ge.mono.instance_divisor_is_fetched);
3485    }
3486 
3487    si_fix_resource_usage(sscreen, shader);
3488 
3489    /* Upload. */
3490    bool ok = si_shader_binary_upload(sscreen, shader, 0);
3491    si_shader_dump(sscreen, shader, debug, stderr, true);
3492 
3493    if (!ok)
3494       fprintf(stderr, "LLVM failed to upload shader\n");
3495    return ok;
3496 }
3497 
si_shader_binary_clean(struct si_shader_binary * binary)3498 void si_shader_binary_clean(struct si_shader_binary *binary)
3499 {
3500    free((void *)binary->code_buffer);
3501    binary->code_buffer = NULL;
3502 
3503    free(binary->llvm_ir_string);
3504    binary->llvm_ir_string = NULL;
3505 
3506    free((void *)binary->symbols);
3507    binary->symbols = NULL;
3508 
3509    free(binary->uploaded_code);
3510    binary->uploaded_code = NULL;
3511    binary->uploaded_code_size = 0;
3512 }
3513 
si_shader_destroy(struct si_shader * shader)3514 void si_shader_destroy(struct si_shader *shader)
3515 {
3516    if (shader->scratch_bo)
3517       si_resource_reference(&shader->scratch_bo, NULL);
3518 
3519    si_resource_reference(&shader->bo, NULL);
3520 
3521    if (!shader->is_binary_shared)
3522       si_shader_binary_clean(&shader->binary);
3523 
3524    free(shader->shader_log);
3525 }
3526 
si_get_prev_stage_nir_shader(struct si_shader * shader,struct si_shader * prev_shader,struct si_shader_args * args,bool * free_nir)3527 nir_shader *si_get_prev_stage_nir_shader(struct si_shader *shader,
3528                                          struct si_shader *prev_shader,
3529                                          struct si_shader_args *args,
3530                                          bool *free_nir)
3531 {
3532    const struct si_shader_selector *sel = shader->selector;
3533    const union si_shader_key *key = &shader->key;
3534 
3535    if (sel->stage == MESA_SHADER_TESS_CTRL) {
3536       struct si_shader_selector *ls = key->ge.part.tcs.ls;
3537 
3538       prev_shader->selector = ls;
3539       prev_shader->key.ge.as_ls = 1;
3540    } else {
3541       struct si_shader_selector *es = key->ge.part.gs.es;
3542 
3543       prev_shader->selector = es;
3544       prev_shader->key.ge.as_es = 1;
3545       prev_shader->key.ge.as_ngg = key->ge.as_ngg;
3546    }
3547 
3548    prev_shader->key.ge.mono = key->ge.mono;
3549    prev_shader->key.ge.opt = key->ge.opt;
3550    prev_shader->key.ge.opt.inline_uniforms = false; /* only TCS/GS can inline uniforms */
3551    /* kill_outputs was computed based on second shader's outputs so we can't use it to
3552     * kill first shader's outputs.
3553     */
3554    prev_shader->key.ge.opt.kill_outputs = 0;
3555    prev_shader->is_monolithic = true;
3556 
3557    si_init_shader_args(prev_shader, args);
3558 
3559    nir_shader *nir = si_get_nir_shader(prev_shader, args, free_nir,
3560                                        sel->info.tcs_vgpr_only_inputs, NULL);
3561 
3562    si_update_shader_binary_info(shader, nir);
3563 
3564    shader->info.uses_instanceid |=
3565       prev_shader->selector->info.uses_instanceid || prev_shader->info.uses_instanceid;
3566 
3567    return nir;
3568 }
3569 
si_get_tcs_out_patch_stride(const struct si_shader_info * info)3570 unsigned si_get_tcs_out_patch_stride(const struct si_shader_info *info)
3571 {
3572    unsigned tcs_out_vertices = info->base.tess.tcs_vertices_out;
3573    unsigned vertex_stride = util_last_bit64(info->outputs_written_before_tes_gs) * 4;
3574    unsigned num_patch_outputs = util_last_bit64(info->patch_outputs_written);
3575 
3576    return tcs_out_vertices * vertex_stride + num_patch_outputs * 4;
3577 }
3578 
si_get_tcs_epilog_args(enum amd_gfx_level gfx_level,struct si_shader_args * args,struct ac_arg * rel_patch_id,struct ac_arg * invocation_id,struct ac_arg * tf_lds_offset,struct ac_arg tess_factors[6])3579 void si_get_tcs_epilog_args(enum amd_gfx_level gfx_level,
3580                             struct si_shader_args *args,
3581                             struct ac_arg *rel_patch_id,
3582                             struct ac_arg *invocation_id,
3583                             struct ac_arg *tf_lds_offset,
3584                             struct ac_arg tess_factors[6])
3585 {
3586    memset(args, 0, sizeof(*args));
3587 
3588    if (gfx_level >= GFX9) {
3589       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3590       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3591       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
3592       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */
3593       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
3594       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3595       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3596       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3597       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3598       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3599       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3600       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3601       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3602       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3603       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3604       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3605       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
3606       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
3607    } else {
3608       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3609       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3610       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3611       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3612       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
3613       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
3614       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3615       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
3616       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
3617    }
3618 
3619    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
3620    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
3621    /* patch index within the wave (REL_PATCH_ID) */
3622    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, rel_patch_id);
3623    /* invocation ID within the patch */
3624    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, invocation_id);
3625    /* LDS offset where tess factors should be loaded from */
3626    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, tf_lds_offset);
3627 
3628    for (unsigned i = 0; i < 6; i++)
3629       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &tess_factors[i]);
3630 }
3631 
si_get_ps_prolog_args(struct si_shader_args * args,const union si_shader_part_key * key)3632 void si_get_ps_prolog_args(struct si_shader_args *args,
3633                            const union si_shader_part_key *key)
3634 {
3635    memset(args, 0, sizeof(*args));
3636 
3637    const unsigned num_input_sgprs = key->ps_prolog.num_input_sgprs;
3638 
3639    struct ac_arg input_sgprs[num_input_sgprs];
3640    for (unsigned i = 0; i < num_input_sgprs; i++)
3641       ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, input_sgprs + i);
3642 
3643    args->internal_bindings = input_sgprs[SI_SGPR_INTERNAL_BINDINGS];
3644    /* Use the absolute location of the input. */
3645    args->ac.prim_mask = input_sgprs[SI_PS_NUM_USER_SGPR];
3646 
3647    ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_sample);
3648    ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_center);
3649    ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_centroid);
3650    /* skip PERSP_PULL_MODEL */
3651    ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_sample);
3652    ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_center);
3653    ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_centroid);
3654    /* skip LINE_STIPPLE_TEX */
3655 
3656    /* POS_X|Y|Z|W_FLOAT */
3657    for (unsigned i = 0; i < key->ps_prolog.num_fragcoord_components; i++)
3658       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
3659 
3660    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.front_face);
3661    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.ancillary);
3662    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.sample_coverage);
3663    ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.pos_fixed_pt);
3664 }
3665 
si_get_ps_epilog_args(struct si_shader_args * args,const union si_shader_part_key * key,struct ac_arg colors[MAX_DRAW_BUFFERS],struct ac_arg * depth,struct ac_arg * stencil,struct ac_arg * sample_mask)3666 void si_get_ps_epilog_args(struct si_shader_args *args,
3667                            const union si_shader_part_key *key,
3668                            struct ac_arg colors[MAX_DRAW_BUFFERS],
3669                            struct ac_arg *depth, struct ac_arg *stencil,
3670                            struct ac_arg *sample_mask)
3671 {
3672    memset(args, 0, sizeof(*args));
3673 
3674    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3675    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3676    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3677    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3678    ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &args->alpha_reference);
3679 
3680    u_foreach_bit (i, key->ps_epilog.colors_written) {
3681       ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_FLOAT, colors + i);
3682    }
3683 
3684    if (key->ps_epilog.writes_z)
3685       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, depth);
3686 
3687    if (key->ps_epilog.writes_stencil)
3688       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, stencil);
3689 
3690    if (key->ps_epilog.writes_samplemask)
3691       ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, sample_mask);
3692 }
3693