1 /*
2 * Copyright 2012 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "si_shader.h"
8 #include "ac_nir.h"
9 #include "ac_rtld.h"
10 #include "nir.h"
11 #include "nir_builder.h"
12 #include "nir_serialize.h"
13 #include "nir_xfb_info.h"
14 #include "si_pipe.h"
15 #include "si_shader_internal.h"
16 #include "sid.h"
17 #include "tgsi/tgsi_from_mesa.h"
18 #include "util/u_memory.h"
19 #include "util/mesa-sha1.h"
20 #include "util/ralloc.h"
21 #include "util/u_upload_mgr.h"
22
23 static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";
24
25 static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";
26
27 static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
28 static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader);
29
30 /* Get the number of all interpolated inputs */
si_get_ps_num_interp(struct si_shader * ps)31 unsigned si_get_ps_num_interp(struct si_shader *ps)
32 {
33 unsigned num_colors = !!(ps->info.ps_colors_read & 0x0f) + !!(ps->info.ps_colors_read & 0xf0);
34 unsigned num_interp =
35 ps->info.num_ps_inputs + (ps->key.ps.part.prolog.color_two_side ? num_colors : 0);
36
37 assert(num_interp <= 32);
38 return MIN2(num_interp, 32);
39 }
40
41 /** Whether the shader runs as a combination of multiple API shaders */
si_is_multi_part_shader(struct si_shader * shader)42 bool si_is_multi_part_shader(struct si_shader *shader)
43 {
44 if (shader->selector->screen->info.gfx_level <= GFX8 ||
45 shader->selector->stage > MESA_SHADER_GEOMETRY)
46 return false;
47
48 return shader->key.ge.as_ls || shader->key.ge.as_es ||
49 shader->selector->stage == MESA_SHADER_TESS_CTRL ||
50 shader->selector->stage == MESA_SHADER_GEOMETRY;
51 }
52
53 /** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
si_is_merged_shader(struct si_shader * shader)54 bool si_is_merged_shader(struct si_shader *shader)
55 {
56 if (shader->selector->stage > MESA_SHADER_GEOMETRY || shader->is_gs_copy_shader)
57 return false;
58
59 return shader->key.ge.as_ngg || si_is_multi_part_shader(shader);
60 }
61
62 /**
63 * Returns a unique index for a semantic name and index. The index must be
64 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
65 * calculated.
66 */
si_shader_io_get_unique_index(unsigned semantic)67 unsigned si_shader_io_get_unique_index(unsigned semantic)
68 {
69 switch (semantic) {
70 case VARYING_SLOT_POS:
71 return SI_UNIQUE_SLOT_POS;
72 default:
73 if (semantic >= VARYING_SLOT_VAR0 && semantic <= VARYING_SLOT_VAR31)
74 return SI_UNIQUE_SLOT_VAR0 + (semantic - VARYING_SLOT_VAR0);
75
76 if (semantic >= VARYING_SLOT_VAR0_16BIT && semantic <= VARYING_SLOT_VAR15_16BIT)
77 return SI_UNIQUE_SLOT_VAR0_16BIT + (semantic - VARYING_SLOT_VAR0_16BIT);
78
79 assert(!"invalid generic index");
80 return 0;
81
82 /* Legacy desktop GL varyings. */
83 case VARYING_SLOT_FOGC:
84 return SI_UNIQUE_SLOT_FOGC;
85 case VARYING_SLOT_COL0:
86 return SI_UNIQUE_SLOT_COL0;
87 case VARYING_SLOT_COL1:
88 return SI_UNIQUE_SLOT_COL1;
89 case VARYING_SLOT_BFC0:
90 return SI_UNIQUE_SLOT_BFC0;
91 case VARYING_SLOT_BFC1:
92 return SI_UNIQUE_SLOT_BFC1;
93 case VARYING_SLOT_TEX0:
94 case VARYING_SLOT_TEX1:
95 case VARYING_SLOT_TEX2:
96 case VARYING_SLOT_TEX3:
97 case VARYING_SLOT_TEX4:
98 case VARYING_SLOT_TEX5:
99 case VARYING_SLOT_TEX6:
100 case VARYING_SLOT_TEX7:
101 return SI_UNIQUE_SLOT_TEX0 + (semantic - VARYING_SLOT_TEX0);
102 case VARYING_SLOT_CLIP_VERTEX:
103 return SI_UNIQUE_SLOT_CLIP_VERTEX;
104
105 /* Varyings present in both GLES and desktop GL. */
106 case VARYING_SLOT_CLIP_DIST0:
107 return SI_UNIQUE_SLOT_CLIP_DIST0;
108 case VARYING_SLOT_CLIP_DIST1:
109 return SI_UNIQUE_SLOT_CLIP_DIST1;
110 case VARYING_SLOT_PSIZ:
111 return SI_UNIQUE_SLOT_PSIZ;
112 case VARYING_SLOT_LAYER:
113 return SI_UNIQUE_SLOT_LAYER;
114 case VARYING_SLOT_VIEWPORT:
115 return SI_UNIQUE_SLOT_VIEWPORT;
116 case VARYING_SLOT_PRIMITIVE_ID:
117 return SI_UNIQUE_SLOT_PRIMITIVE_ID;
118 }
119 }
120
declare_streamout_params(struct si_shader_args * args,struct si_shader * shader)121 static void declare_streamout_params(struct si_shader_args *args, struct si_shader *shader)
122 {
123 struct si_shader_selector *sel = shader->selector;
124
125 if (shader->selector->screen->info.gfx_level >= GFX11) {
126 /* NGG streamout. */
127 if (sel->stage == MESA_SHADER_TESS_EVAL)
128 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
129 return;
130 }
131
132 /* Streamout SGPRs. */
133 if (si_shader_uses_streamout(shader)) {
134 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_config);
135 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_write_index);
136
137 /* A streamout buffer offset is loaded if the stride is non-zero. */
138 for (int i = 0; i < 4; i++) {
139 if (!sel->info.base.xfb_stride[i])
140 continue;
141
142 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_offset[i]);
143 }
144 } else if (sel->stage == MESA_SHADER_TESS_EVAL) {
145 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
146 }
147 }
148
si_get_max_workgroup_size(const struct si_shader * shader)149 unsigned si_get_max_workgroup_size(const struct si_shader *shader)
150 {
151 gl_shader_stage stage = shader->is_gs_copy_shader ?
152 MESA_SHADER_VERTEX : shader->selector->stage;
153
154 switch (stage) {
155 case MESA_SHADER_VERTEX:
156 case MESA_SHADER_TESS_EVAL:
157 /* Use the largest workgroup size for streamout */
158 if (shader->key.ge.as_ngg)
159 return si_shader_uses_streamout(shader) ? 256 : 128;
160
161 /* As part of merged shader. */
162 return shader->selector->screen->info.gfx_level >= GFX9 &&
163 (shader->key.ge.as_ls || shader->key.ge.as_es) ? 128 : 0;
164
165 case MESA_SHADER_TESS_CTRL:
166 /* Return this so that LLVM doesn't remove s_barrier
167 * instructions on chips where we use s_barrier. */
168 return shader->selector->screen->info.gfx_level >= GFX7 ? 128 : 0;
169
170 case MESA_SHADER_GEOMETRY:
171 /* GS can always generate up to 256 vertices. */
172 return shader->selector->screen->info.gfx_level >= GFX9 ? 256 : 0;
173
174 case MESA_SHADER_COMPUTE:
175 break; /* see below */
176
177 default:
178 return 0;
179 }
180
181 /* Compile a variable block size using the maximum variable size. */
182 if (shader->selector->info.base.workgroup_size_variable)
183 return SI_MAX_VARIABLE_THREADS_PER_BLOCK;
184
185 uint16_t *local_size = shader->selector->info.base.workgroup_size;
186 unsigned max_work_group_size = (uint32_t)local_size[0] *
187 (uint32_t)local_size[1] *
188 (uint32_t)local_size[2];
189 assert(max_work_group_size);
190 return max_work_group_size;
191 }
192
declare_const_and_shader_buffers(struct si_shader_args * args,struct si_shader * shader,bool assign_params)193 static void declare_const_and_shader_buffers(struct si_shader_args *args,
194 struct si_shader *shader,
195 bool assign_params)
196 {
197 enum ac_arg_type const_shader_buf_type;
198
199 if (shader->selector->info.base.num_ubos == 1 &&
200 shader->selector->info.base.num_ssbos == 0)
201 const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
202 else
203 const_shader_buf_type = AC_ARG_CONST_DESC_PTR;
204
205 ac_add_arg(
206 &args->ac, AC_ARG_SGPR, 1, const_shader_buf_type,
207 assign_params ? &args->const_and_shader_buffers : &args->other_const_and_shader_buffers);
208 }
209
/* Declare the SGPR pointing at the combined sampler+image descriptor array.
 * assign_params selects between the current-stage slot and the other_* slot. */
static void declare_samplers_and_images(struct si_shader_args *args, bool assign_params)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
              assign_params ? &args->samplers_and_images : &args->other_samplers_and_images);
}
215
/* Declare the per-stage descriptor pointers: const/shader buffers first,
 * then samplers+images. assign_params selects whether they land in the
 * current-stage slots or the other_* slots. */
static void declare_per_stage_desc_pointers(struct si_shader_args *args,
                                            struct si_shader *shader,
                                            bool assign_params)
{
   declare_const_and_shader_buffers(args, shader, assign_params);
   declare_samplers_and_images(args, assign_params);
}
223
/* Declare descriptor pointers common to all stages: internal bindings and
 * the bindless sampler/image descriptor array. */
static void declare_global_desc_pointers(struct si_shader_args *args)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->internal_bindings);
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
              &args->bindless_samplers_and_images);
}
230
declare_vb_descriptor_input_sgprs(struct si_shader_args * args,struct si_shader * shader)231 static void declare_vb_descriptor_input_sgprs(struct si_shader_args *args,
232 struct si_shader *shader)
233 {
234 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->ac.vertex_buffers);
235
236 unsigned num_vbos_in_user_sgprs = shader->selector->info.num_vbos_in_user_sgprs;
237 if (num_vbos_in_user_sgprs) {
238 unsigned user_sgprs = args->ac.num_sgprs_used;
239
240 if (si_is_merged_shader(shader))
241 user_sgprs -= 8;
242 assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
243
244 /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
245 for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
246 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
247
248 assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(args->vb_descriptors));
249 for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
250 ac_add_arg(&args->ac, AC_ARG_SGPR, 4, AC_ARG_INT, &args->vb_descriptors[i]);
251 }
252 }
253
/* Declare the fixed-function VS input VGPRs. The layout (which VGPR holds
 * instance_id, rel_patch_id, prim_id, etc.) differs between LS and ES/VS
 * and between gfx levels, but always declares vertex_id first and a fixed
 * total of 4 VGPRs, so the declaration order here must not change. */
static void declare_vs_input_vgprs(struct si_shader_args *args, struct si_shader *shader)
{
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vertex_id);
   if (shader->key.ge.as_ls) {
      if (shader->selector->screen->info.gfx_level >= GFX11) {
         /* GFX11+ LS: instance_id is in the 4th VGPR. */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      } else if (shader->selector->screen->info.gfx_level >= GFX10) {
         /* GFX10 LS: rel_patch_id in VGPR1, instance_id in VGPR3. */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      } else {
         /* GFX9 and older LS: instance_id directly follows rel_patch_id. */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
      }
   } else if (shader->selector->screen->info.gfx_level >= GFX10) {
      /* GFX10+ ES/VS: prim_id is only provided by HW for the legacy
       * (non-NGG) pipeline. */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
                 /* user vgpr or PrimID (legacy) */
                 shader->key.ge.as_ngg ? NULL : &args->ac.vs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
   } else {
      /* GFX9 and older ES/VS. */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
   }
}
283
/* Declare the SGPR inputs of the internal blit vertex shader. The first
 * three SGPRs (rect corner 1, rect corner 2, depth) are always present;
 * blit_sgprs_amd then selects an optional color or texcoord payload.
 * On GFX11+ an attribute ring address SGPR is appended, and blit_sgprs_amd
 * is biased by 1 to account for it. */
static void declare_vs_blit_inputs(struct si_shader *shader, struct si_shader_args *args)
{
   bool has_attribute_ring_address = shader->selector->screen->info.gfx_level >= GFX11;

   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_blit_inputs); /* i16 x1, y1 */
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x2, y2 */
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */

   if (shader->selector->info.base.vs.blit_sgprs_amd ==
       SI_VS_BLIT_SGPRS_POS_COLOR + has_attribute_ring_address) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
      if (has_attribute_ring_address)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* attribute ring address */
   } else if (shader->selector->info.base.vs.blit_sgprs_amd ==
              SI_VS_BLIT_SGPRS_POS_TEXCOORD + has_attribute_ring_address) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
      if (has_attribute_ring_address)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* attribute ring address */
   }
}
312
/* Declare the fixed TES input VGPRs: tess coordinates (u, v), the relative
 * patch id within the threadgroup, and the patch primitive id. */
static void declare_tes_input_vgprs(struct si_shader_args *args)
{
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_u);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_v);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_rel_patch_id);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_patch_id);
}
320
enum
{
   /* Convenient merged shader definitions. Starting the values at
    * MESA_ALL_SHADER_STAGES keeps them from colliding with real
    * gl_shader_stage values, so they can share a switch statement
    * (see si_init_shader_args). */
   SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES,
   SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};
327
/* Add a shader argument and assert that it is assigned the expected argument
 * index (idx). Used where argument positions are part of a fixed ABI, e.g.
 * the SI_PARAM_* fragment shader inputs. */
void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
                        enum ac_arg_type type, struct ac_arg *arg, unsigned idx)
{
   assert(args->arg_count == idx);
   ac_add_arg(args, file, registers, type, arg);
}
334
/* Declare all input SGPRs/VGPRs of the shader's HW entry point in the exact
 * order the hardware ABI provides them, and declare the return values that
 * non-monolithic shader parts pass to the following part (epilog or merged
 * second half). The declaration ORDER in every case below is ABI — it must
 * match the register layout programmed by the driver and expected by the
 * prolog/epilog code. Results are written into *args, and the final SGPR/VGPR
 * input counts are recorded in shader->info.
 */
void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args)
{
   unsigned i, num_returns, num_return_sgprs;
   unsigned num_prolog_vgprs = 0;
   struct si_shader_selector *sel = shader->selector;
   /* The GS copy shader runs as a HW vertex shader. */
   unsigned stage = shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : sel->stage;
   unsigned stage_case = stage;

   memset(args, 0, sizeof(*args));

   /* Set MERGED shaders. */
   if (sel->screen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY) {
      if (shader->key.ge.as_ls || stage == MESA_SHADER_TESS_CTRL)
         stage_case = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
      else if (shader->key.ge.as_es || shader->key.ge.as_ngg || stage == MESA_SHADER_GEOMETRY)
         stage_case = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
   }

   switch (stage_case) {
   case MESA_SHADER_VERTEX:
      declare_global_desc_pointers(args);

      if (sel->info.base.vs.blit_sgprs_amd) {
         /* Internal blit shaders take their inputs directly in SGPRs. */
         declare_vs_blit_inputs(shader, args);
      } else {
         declare_per_stage_desc_pointers(args, shader, true);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);

         if (shader->is_gs_copy_shader) {
            declare_streamout_params(args, shader);
         } else {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
            declare_vb_descriptor_input_sgprs(args, shader);

            if (shader->key.ge.as_es) {
               ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset);
            } else if (shader->key.ge.as_ls) {
               /* no extra parameters */
            } else {
               declare_streamout_params(args, shader);
            }
         }
      }

      /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
      if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* VGPRs */
      declare_vs_input_vgprs(args, shader);

      break;

   case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, true);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);

      /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
      if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* VGPRs */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);

      /* For monolithic shaders, the TCS epilog code is generated by
       * ac_nir_lower_hs_outputs_to_mem.
       */
      if (!shader->is_monolithic) {
         /* param_tcs_offchip_offset and param_tcs_factor_offset are
          * placed after the user SGPRs.
          */
         for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
            ac_add_return(&args->ac, AC_ARG_SGPR);
         for (i = 0; i < 11; i++)
            ac_add_return(&args->ac, AC_ARG_VGPR);
      }
      break;

   case SI_SHADER_MERGED_VERTEX_TESSCTRL:
      /* Merged stages have 8 system SGPRs at the beginning. */
      /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
      /* Gfx11+: SPI_SHADER_PGM_LO/HI_HS */
      declare_per_stage_desc_pointers(args, shader, stage == MESA_SHADER_TESS_CTRL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
      if (sel->screen->info.gfx_level >= GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_wave_id);
      else
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, stage == MESA_SHADER_VERTEX);

      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);

      /* VGPRs (first TCS, then VS) */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);

      if (stage == MESA_SHADER_VERTEX) {
         declare_vs_input_vgprs(args, shader);

         /* Need to keep LS/HS arg index same for shared args when ACO,
          * so this is not able to be before shared VGPRs.
          */
         declare_vb_descriptor_input_sgprs(args, shader);

         /* LS return values are inputs to the TCS main shader part. */
         if (!shader->is_monolithic || shader->key.ge.opt.same_patch_vertices) {
            for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
               ac_add_return(&args->ac, AC_ARG_SGPR);
            for (i = 0; i < 2; i++)
               ac_add_return(&args->ac, AC_ARG_VGPR);

            /* VS outputs passed via VGPRs to TCS. */
            if (shader->key.ge.opt.same_patch_vertices && !sel->screen->use_aco) {
               unsigned num_outputs = util_last_bit64(shader->selector->info.outputs_written_before_tes_gs);
               for (i = 0; i < num_outputs * 4; i++)
                  ac_add_return(&args->ac, AC_ARG_VGPR);
            }
         }
      } else {
         /* TCS inputs are passed via VGPRs from VS. */
         if (shader->key.ge.opt.same_patch_vertices && !sel->screen->use_aco) {
            unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->info.outputs_written_before_tes_gs);
            for (i = 0; i < num_inputs * 4; i++)
               ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
         }

         /* For monolithic shaders, the TCS epilog code is generated by
          * ac_nir_lower_hs_outputs_to_mem.
          */
         if (!shader->is_monolithic) {
            /* TCS return values are inputs to the TCS epilog.
             *
             * param_tcs_offchip_offset, param_tcs_factor_offset,
             * param_tcs_offchip_layout, and internal_bindings
             * should be passed to the epilog.
             */
            for (i = 0; i <= 8 + GFX9_SGPR_TCS_OFFCHIP_ADDR; i++)
               ac_add_return(&args->ac, AC_ARG_SGPR);
            for (i = 0; i < 11; i++)
               ac_add_return(&args->ac, AC_ARG_VGPR);
         }
      }
      break;

   case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
      /* Merged stages have 8 system SGPRs at the beginning. */
      /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
      /* Gfx11+: SPI_SHADER_PGM_LO/HI_GS */
      declare_per_stage_desc_pointers(args, shader, stage == MESA_SHADER_GEOMETRY);

      if (shader->key.ge.as_ngg)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_tg_info);
      else
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset);

      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      if (sel->screen->info.gfx_level >= GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_attr_offset);
      else
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

      declare_global_desc_pointers(args);
      if (stage != MESA_SHADER_VERTEX || !sel->info.base.vs.blit_sgprs_amd) {
         declare_per_stage_desc_pointers(
            args, shader, (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL));
      }

      if (stage == MESA_SHADER_VERTEX && sel->info.base.vs.blit_sgprs_amd) {
         declare_vs_blit_inputs(shader, args);
      } else {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);

         if (stage == MESA_SHADER_VERTEX) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
         } else if (stage == MESA_SHADER_TESS_EVAL) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
         } else {
            /* GS */
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
         }

         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->small_prim_cull_info);
         if (sel->screen->info.gfx_level >= GFX11)
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->gs_attr_address);
         else
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
      }

      /* VGPRs (first GS, then VS/TES) */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);

      if (stage == MESA_SHADER_VERTEX) {
         declare_vs_input_vgprs(args, shader);

         /* Need to keep ES/GS arg index same for shared args when ACO,
          * so this is not able to be before shared VGPRs.
          */
         if (!sel->info.base.vs.blit_sgprs_amd)
            declare_vb_descriptor_input_sgprs(args, shader);
      } else if (stage == MESA_SHADER_TESS_EVAL) {
         declare_tes_input_vgprs(args);
      }

      if (shader->key.ge.as_es && !shader->is_monolithic &&
          (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL)) {
         /* ES return values are inputs to GS. */
         for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
            ac_add_return(&args->ac, AC_ARG_SGPR);
         for (i = 0; i < 5; i++)
            ac_add_return(&args->ac, AC_ARG_VGPR);
      }
      break;

   case MESA_SHADER_TESS_EVAL:
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, true);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);

      if (shader->key.ge.as_es) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset);
      } else {
         declare_streamout_params(args, shader);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      }

      /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
      if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* VGPRs */
      declare_tes_input_vgprs(args);
      break;

   case MESA_SHADER_GEOMETRY:
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, true);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_wave_id);

      /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
      if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* VGPRs */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[3]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[4]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[5]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
      break;

   case MESA_SHADER_FRAGMENT:
      /* PS input positions are a fixed HW ABI, hence the checked adds
       * against the SI_PARAM_* indices. */
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, true);
      si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->alpha_reference,
                         SI_PARAM_ALPHA_REF);
      si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.prim_mask,
                         SI_PARAM_PRIM_MASK);

      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_sample,
                         SI_PARAM_PERSP_SAMPLE);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_center,
                         SI_PARAM_PERSP_CENTER);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_centroid,
                         SI_PARAM_PERSP_CENTROID);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_sample,
                         SI_PARAM_LINEAR_SAMPLE);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_center,
                         SI_PARAM_LINEAR_CENTER);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_centroid,
                         SI_PARAM_LINEAR_CENTROID);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[0],
                         SI_PARAM_POS_X_FLOAT);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[1],
                         SI_PARAM_POS_Y_FLOAT);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[2],
                         SI_PARAM_POS_Z_FLOAT);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[3],
                         SI_PARAM_POS_W_FLOAT);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.front_face,
                         SI_PARAM_FRONT_FACE);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.ancillary,
                         SI_PARAM_ANCILLARY);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.sample_coverage,
                         SI_PARAM_SAMPLE_COVERAGE);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.pos_fixed_pt,
                         SI_PARAM_POS_FIXED_PT);

      if (sel->screen->use_aco) {
         /* Drop the VGPR args that SPI won't actually load for this shader. */
         ac_compact_ps_vgpr_args(&args->ac, shader->config.spi_ps_input_addr);

         /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
         if (sel->screen->info.gfx_level < GFX11)
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
      }

      /* Monolithic PS emit prolog and epilog in NIR directly. */
      if (!shader->is_monolithic) {
         /* Color inputs from the prolog. */
         if (shader->selector->info.colors_read) {
            unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);

            for (i = 0; i < num_color_elements; i++)
               ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, i ? NULL : &args->color_start);

            num_prolog_vgprs += num_color_elements;
         }

         /* Outputs for the epilog. */
         num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
         num_returns =
            num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +
            shader->selector->info.writes_z + shader->selector->info.writes_stencil +
            shader->ps.writes_samplemask + 1 /* SampleMaskIn */;

         for (i = 0; i < num_return_sgprs; i++)
            ac_add_return(&args->ac, AC_ARG_SGPR);
         for (; i < num_returns; i++)
            ac_add_return(&args->ac, AC_ARG_VGPR);
      }
      break;

   case MESA_SHADER_COMPUTE:
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, true);
      if (shader->selector->info.uses_grid_size)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.num_work_groups);
      if (shader->selector->info.uses_variable_block_size)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->block_size);

      unsigned cs_user_data_dwords =
         shader->selector->info.base.cs.user_data_components_amd;
      if (cs_user_data_dwords) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &args->cs_user_data);
      }

      /* Some descriptors can be in user SGPRs. */
      /* Shader buffers in user SGPRs. */
      for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) {
         /* Pad to a 4-SGPR boundary before each inline descriptor. */
         while (args->ac.num_sgprs_used % 4 != 0)
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);

         ac_add_arg(&args->ac, AC_ARG_SGPR, 4, AC_ARG_INT, &args->cs_shaderbuf[i]);
      }
      /* Images in user SGPRs. */
      for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) {
         /* Buffer images use 4-SGPR descriptors, other images 8-SGPR ones. */
         unsigned num_sgprs = BITSET_TEST(shader->selector->info.base.image_buffers, i) ? 4 : 8;

         while (args->ac.num_sgprs_used % num_sgprs != 0)
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);

         ac_add_arg(&args->ac, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &args->cs_image[i]);
      }

      /* Hardware SGPRs. */
      for (i = 0; i < 3; i++) {
         if (shader->selector->info.uses_block_id[i]) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.workgroup_ids[i]);
         }
      }
      if (shader->selector->info.uses_tg_size)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tg_size);

      /* GFX11 set FLAT_SCRATCH directly instead of using this arg. */
      if (sel->screen->use_aco && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* Hardware VGPRs. */
      /* Thread IDs are packed in VGPR0, 10 bits per component or stored in 3 separate VGPRs */
      if (sel->screen->info.gfx_level >= GFX11 ||
          (!sel->screen->info.has_graphics && sel->screen->info.family >= CHIP_MI200))
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_ids);
      else
         ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, &args->ac.local_invocation_ids);
      break;
   default:
      assert(0 && "unimplemented shader");
      return;
   }

   shader->info.num_input_sgprs = args->ac.num_sgprs_used;
   shader->info.num_input_vgprs = args->ac.num_vgprs_used;

   /* Prolog-provided color VGPRs are not counted as main-part inputs. */
   assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
   shader->info.num_input_vgprs -= num_prolog_vgprs;
}
762
get_lds_granularity(struct si_screen * screen,gl_shader_stage stage)763 static unsigned get_lds_granularity(struct si_screen *screen, gl_shader_stage stage)
764 {
765 return screen->info.gfx_level >= GFX11 && stage == MESA_SHADER_FRAGMENT ? 1024 :
766 screen->info.gfx_level >= GFX7 ? 512 : 256;
767 }
768
si_shader_binary_open(struct si_screen * screen,struct si_shader * shader,struct ac_rtld_binary * rtld)769 bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
770 struct ac_rtld_binary *rtld)
771 {
772 const struct si_shader_selector *sel = shader->selector;
773 const char *part_elfs[5];
774 size_t part_sizes[5];
775 unsigned num_parts = 0;
776
777 #define add_part(shader_or_part) \
778 if (shader_or_part) { \
779 part_elfs[num_parts] = (shader_or_part)->binary.code_buffer; \
780 part_sizes[num_parts] = (shader_or_part)->binary.code_size; \
781 num_parts++; \
782 }
783
784 add_part(shader->prolog);
785 add_part(shader->previous_stage);
786 add_part(shader);
787 add_part(shader->epilog);
788
789 #undef add_part
790
791 struct ac_rtld_symbol lds_symbols[2];
792 unsigned num_lds_symbols = 0;
793
794 if (sel && screen->info.gfx_level >= GFX9 && !shader->is_gs_copy_shader &&
795 (sel->stage == MESA_SHADER_GEOMETRY ||
796 (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg))) {
797 struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
798 sym->name = "esgs_ring";
799 sym->size = shader->gs_info.esgs_ring_size * 4;
800 sym->align = 64 * 1024;
801 }
802
803 if (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
804 struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
805 sym->name = "ngg_emit";
806 sym->size = shader->ngg.ngg_emit_size * 4;
807 sym->align = 4;
808 }
809
810 bool ok = ac_rtld_open(
811 rtld, (struct ac_rtld_open_info){.info = &screen->info,
812 .options =
813 {
814 .halt_at_entry = screen->options.halt_shaders,
815 },
816 .shader_type = sel->stage,
817 .wave_size = shader->wave_size,
818 .num_parts = num_parts,
819 .elf_ptrs = part_elfs,
820 .elf_sizes = part_sizes,
821 .num_shared_lds_symbols = num_lds_symbols,
822 .shared_lds_symbols = lds_symbols});
823
824 if (rtld->lds_size > 0) {
825 unsigned alloc_granularity = get_lds_granularity(screen, sel->stage);
826 shader->config.lds_size = DIV_ROUND_UP(rtld->lds_size, alloc_granularity);
827 }
828
829 return ok;
830 }
831
get_shader_binaries(struct si_shader * shader,struct si_shader_binary * bin[4])832 static unsigned get_shader_binaries(struct si_shader *shader, struct si_shader_binary *bin[4])
833 {
834 unsigned num_bin = 0;
835
836 if (shader->prolog)
837 bin[num_bin++] = &shader->prolog->binary;
838
839 if (shader->previous_stage)
840 bin[num_bin++] = &shader->previous_stage->binary;
841
842 bin[num_bin++] = &shader->binary;
843
844 if (shader->epilog)
845 bin[num_bin++] = &shader->epilog->binary;
846
847 return num_bin;
848 }
849
si_get_shader_binary_size(struct si_screen * screen,struct si_shader * shader)850 static unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
851 {
852 if (shader->binary.type == SI_SHADER_BINARY_ELF) {
853 struct ac_rtld_binary rtld;
854 si_shader_binary_open(screen, shader, &rtld);
855 uint64_t size = rtld.exec_size;
856 ac_rtld_close(&rtld);
857 return size;
858 } else {
859 struct si_shader_binary *bin[4];
860 unsigned num_bin = get_shader_binaries(shader, bin);
861
862 unsigned size = 0;
863 for (unsigned i = 0; i < num_bin; i++) {
864 assert(bin[i]->type == SI_SHADER_BINARY_RAW);
865 size += bin[i]->exec_size;
866 }
867 return size;
868 }
869 }
870
si_get_external_symbol(enum amd_gfx_level gfx_level,void * data,const char * name,uint64_t * value)871 bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
872 uint64_t *value)
873 {
874 uint64_t *scratch_va = data;
875
876 if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
877 *value = (uint32_t)*scratch_va;
878 return true;
879 }
880 if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
881 /* Enable scratch coalescing. */
882 *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32);
883
884 if (gfx_level >= GFX11)
885 *value |= S_008F04_SWIZZLE_ENABLE_GFX11(1);
886 else
887 *value |= S_008F04_SWIZZLE_ENABLE_GFX6(1);
888 return true;
889 }
890
891 return false;
892 }
893
pre_upload_binary(struct si_screen * sscreen,struct si_shader * shader,unsigned binary_size,bool dma_upload,struct si_context ** upload_ctx,struct pipe_resource ** staging,unsigned * staging_offset)894 static void *pre_upload_binary(struct si_screen *sscreen, struct si_shader *shader,
895 unsigned binary_size, bool dma_upload,
896 struct si_context **upload_ctx,
897 struct pipe_resource **staging,
898 unsigned *staging_offset)
899 {
900 unsigned aligned_size = ac_align_shader_binary_for_prefetch(&sscreen->info, binary_size);
901
902 si_resource_reference(&shader->bo, NULL);
903 shader->bo = si_aligned_buffer_create(
904 &sscreen->b,
905 SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT |
906 (dma_upload || sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY) |
907 (dma_upload ? PIPE_RESOURCE_FLAG_UNMAPPABLE : 0),
908 PIPE_USAGE_IMMUTABLE, align(aligned_size, SI_CPDMA_ALIGNMENT), 256);
909 if (!shader->bo)
910 return NULL;
911
912 shader->gpu_address = shader->bo->gpu_address;
913
914 if (dma_upload) {
915 /* First upload into a staging buffer. */
916 *upload_ctx = si_get_aux_context(&sscreen->aux_context.shader_upload);
917
918 void *ret;
919 u_upload_alloc((*upload_ctx)->b.stream_uploader, 0, binary_size, 256,
920 staging_offset, staging, &ret);
921 if (!ret)
922 si_put_aux_context_flush(&sscreen->aux_context.shader_upload);
923
924 return ret;
925 } else {
926 return sscreen->ws->buffer_map(sscreen->ws,
927 shader->bo->buf, NULL,
928 PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
929 }
930 }
931
post_upload_binary(struct si_screen * sscreen,struct si_shader * shader,void * code,unsigned code_size,unsigned binary_size,bool dma_upload,struct si_context * upload_ctx,struct pipe_resource * staging,unsigned staging_offset)932 static void post_upload_binary(struct si_screen *sscreen, struct si_shader *shader,
933 void *code, unsigned code_size,
934 unsigned binary_size, bool dma_upload,
935 struct si_context *upload_ctx,
936 struct pipe_resource *staging,
937 unsigned staging_offset)
938 {
939 if (sscreen->debug_flags & DBG(SQTT)) {
940 /* Remember the uploaded code */
941 shader->binary.uploaded_code_size = code_size;
942 shader->binary.uploaded_code = malloc(code_size);
943 memcpy(shader->binary.uploaded_code, code, code_size);
944 }
945
946 if (dma_upload) {
947 /* Then copy from the staging buffer to VRAM.
948 *
949 * We can't use the upload copy in si_buffer_transfer_unmap because that might use
950 * a compute shader, and we can't use shaders in the code that is responsible for making
951 * them available.
952 */
953 si_cp_dma_copy_buffer(upload_ctx, &shader->bo->b.b, staging, 0, staging_offset,
954 binary_size, SI_OP_SYNC_AFTER, SI_COHERENCY_SHADER,
955 sscreen->info.gfx_level >= GFX7 ? L2_LRU : L2_BYPASS);
956 upload_ctx->flags |= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_L2;
957
958 #if 0 /* debug: validate whether the copy was successful */
959 uint32_t *dst_binary = malloc(binary_size);
960 uint32_t *src_binary = (uint32_t*)code;
961 pipe_buffer_read(&upload_ctx->b, &shader->bo->b.b, 0, binary_size, dst_binary);
962 puts("dst_binary == src_binary:");
963 for (unsigned i = 0; i < binary_size / 4; i++) {
964 printf(" %08x == %08x\n", dst_binary[i], src_binary[i]);
965 }
966 free(dst_binary);
967 exit(0);
968 #endif
969
970 si_put_aux_context_flush(&sscreen->aux_context.shader_upload);
971 pipe_resource_reference(&staging, NULL);
972 } else {
973 sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf);
974 }
975 }
976
upload_binary_elf(struct si_screen * sscreen,struct si_shader * shader,uint64_t scratch_va,bool dma_upload)977 static bool upload_binary_elf(struct si_screen *sscreen, struct si_shader *shader,
978 uint64_t scratch_va, bool dma_upload)
979 {
980 struct ac_rtld_binary binary;
981 if (!si_shader_binary_open(sscreen, shader, &binary))
982 return false;
983
984 struct si_context *upload_ctx = NULL;
985 struct pipe_resource *staging = NULL;
986 unsigned staging_offset = 0;
987
988 void *rx_ptr = pre_upload_binary(sscreen, shader, binary.rx_size, dma_upload,
989 &upload_ctx, &staging, &staging_offset);
990 if (!rx_ptr)
991 return false;
992
993 /* Upload. */
994 struct ac_rtld_upload_info u = {};
995 u.binary = &binary;
996 u.get_external_symbol = si_get_external_symbol;
997 u.cb_data = &scratch_va;
998 u.rx_va = shader->bo->gpu_address;
999 u.rx_ptr = rx_ptr;
1000
1001 int size = ac_rtld_upload(&u);
1002
1003 post_upload_binary(sscreen, shader, rx_ptr, size, binary.rx_size, dma_upload,
1004 upload_ctx, staging, staging_offset);
1005
1006 ac_rtld_close(&binary);
1007
1008 return size >= 0;
1009 }
1010
calculate_needed_lds_size(struct si_screen * sscreen,struct si_shader * shader)1011 static void calculate_needed_lds_size(struct si_screen *sscreen, struct si_shader *shader)
1012 {
1013 gl_shader_stage stage =
1014 shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : shader->selector->stage;
1015
1016 if (sscreen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY &&
1017 (stage == MESA_SHADER_GEOMETRY || shader->key.ge.as_ngg)) {
1018 unsigned size_in_dw = shader->gs_info.esgs_ring_size;
1019
1020 if (stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)
1021 size_in_dw += shader->ngg.ngg_emit_size;
1022
1023 if (shader->key.ge.as_ngg) {
1024 unsigned scratch_dw_size = gfx10_ngg_get_scratch_dw_size(shader);
1025 if (scratch_dw_size) {
1026 /* scratch base address needs to be 8 byte aligned */
1027 size_in_dw = ALIGN(size_in_dw, 2);
1028 size_in_dw += scratch_dw_size;
1029 }
1030 }
1031
1032 shader->config.lds_size =
1033 DIV_ROUND_UP(size_in_dw * 4, get_lds_granularity(sscreen, stage));
1034 }
1035 }
1036
upload_binary_raw(struct si_screen * sscreen,struct si_shader * shader,uint64_t scratch_va,bool dma_upload)1037 static bool upload_binary_raw(struct si_screen *sscreen, struct si_shader *shader,
1038 uint64_t scratch_va, bool dma_upload)
1039 {
1040 struct si_shader_binary *bin[4];
1041 unsigned num_bin = get_shader_binaries(shader, bin);
1042
1043 unsigned code_size = 0, exec_size = 0;
1044 for (unsigned i = 0; i < num_bin; i++) {
1045 assert(bin[i]->type == SI_SHADER_BINARY_RAW);
1046 code_size += bin[i]->code_size;
1047 exec_size += bin[i]->exec_size;
1048 }
1049
1050 struct si_context *upload_ctx = NULL;
1051 struct pipe_resource *staging = NULL;
1052 unsigned staging_offset = 0;
1053
1054 void *rx_ptr = pre_upload_binary(sscreen, shader, code_size, dma_upload,
1055 &upload_ctx, &staging, &staging_offset);
1056 if (!rx_ptr)
1057 return false;
1058
1059 unsigned exec_offset = 0, data_offset = exec_size;
1060 for (unsigned i = 0; i < num_bin; i++) {
1061 memcpy(rx_ptr + exec_offset, bin[i]->code_buffer, bin[i]->exec_size);
1062
1063 if (bin[i]->num_symbols) {
1064 /* Offset needed to add to const data symbol because of inserting other
1065 * shader part between exec code and const data.
1066 */
1067 unsigned const_offset = data_offset - exec_offset - bin[i]->exec_size;
1068
1069 /* Prolog and epilog have no symbols. */
1070 struct si_shader *sh = bin[i] == &shader->binary ? shader : shader->previous_stage;
1071 assert(sh && bin[i] == &sh->binary);
1072
1073 si_aco_resolve_symbols(sh, rx_ptr + exec_offset, (const uint32_t *)bin[i]->code_buffer,
1074 scratch_va, const_offset);
1075 }
1076
1077 exec_offset += bin[i]->exec_size;
1078
1079 unsigned data_size = bin[i]->code_size - bin[i]->exec_size;
1080 if (data_size) {
1081 memcpy(rx_ptr + data_offset, bin[i]->code_buffer + bin[i]->exec_size, data_size);
1082 data_offset += data_size;
1083 }
1084 }
1085
1086 post_upload_binary(sscreen, shader, rx_ptr, code_size, code_size, dma_upload,
1087 upload_ctx, staging, staging_offset);
1088
1089 calculate_needed_lds_size(sscreen, shader);
1090 return true;
1091 }
1092
si_shader_binary_upload(struct si_screen * sscreen,struct si_shader * shader,uint64_t scratch_va)1093 bool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
1094 uint64_t scratch_va)
1095 {
1096 bool dma_upload = !(sscreen->debug_flags & DBG(NO_DMA_SHADERS)) &&
1097 sscreen->info.has_dedicated_vram && !sscreen->info.all_vram_visible;
1098
1099 if (shader->binary.type == SI_SHADER_BINARY_ELF) {
1100 return upload_binary_elf(sscreen, shader, scratch_va, dma_upload);
1101 } else {
1102 assert(shader->binary.type == SI_SHADER_BINARY_RAW);
1103 return upload_binary_raw(sscreen, shader, scratch_va, dma_upload);
1104 }
1105 }
1106
print_disassembly(const char * disasm,size_t nbytes,const char * name,FILE * file,struct util_debug_callback * debug)1107 static void print_disassembly(const char *disasm, size_t nbytes,
1108 const char *name, FILE *file,
1109 struct util_debug_callback *debug)
1110 {
1111 if (debug && debug->debug_message) {
1112 /* Very long debug messages are cut off, so send the
1113 * disassembly one line at a time. This causes more
1114 * overhead, but on the plus side it simplifies
1115 * parsing of resulting logs.
1116 */
1117 util_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");
1118
1119 uint64_t line = 0;
1120 while (line < nbytes) {
1121 int count = nbytes - line;
1122 const char *nl = memchr(disasm + line, '\n', nbytes - line);
1123 if (nl)
1124 count = nl - (disasm + line);
1125
1126 if (count) {
1127 util_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
1128 }
1129
1130 line += count + 1;
1131 }
1132
1133 util_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
1134 }
1135
1136 if (file) {
1137 fprintf(file, "Shader %s disassembly:\n", name);
1138 fprintf(file, "%*s", (int)nbytes, disasm);
1139 }
1140 }
1141
si_shader_dump_disassembly(struct si_screen * screen,const struct si_shader_binary * binary,gl_shader_stage stage,unsigned wave_size,struct util_debug_callback * debug,const char * name,FILE * file)1142 static void si_shader_dump_disassembly(struct si_screen *screen,
1143 const struct si_shader_binary *binary,
1144 gl_shader_stage stage, unsigned wave_size,
1145 struct util_debug_callback *debug, const char *name,
1146 FILE *file)
1147 {
1148 if (binary->type == SI_SHADER_BINARY_RAW) {
1149 print_disassembly(binary->disasm_string, binary->disasm_size, name, file, debug);
1150 return;
1151 }
1152
1153 struct ac_rtld_binary rtld_binary;
1154
1155 if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
1156 .info = &screen->info,
1157 .shader_type = stage,
1158 .wave_size = wave_size,
1159 .num_parts = 1,
1160 .elf_ptrs = &binary->code_buffer,
1161 .elf_sizes = &binary->code_size}))
1162 return;
1163
1164 const char *disasm;
1165 size_t nbytes;
1166
1167 if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
1168 goto out;
1169
1170 if (nbytes > INT_MAX)
1171 goto out;
1172
1173 print_disassembly(disasm, nbytes, name, file, debug);
1174
1175 out:
1176 ac_rtld_close(&rtld_binary);
1177 }
1178
si_calculate_max_simd_waves(struct si_shader * shader)1179 static void si_calculate_max_simd_waves(struct si_shader *shader)
1180 {
1181 struct si_screen *sscreen = shader->selector->screen;
1182 struct ac_shader_config *conf = &shader->config;
1183 unsigned lds_increment = get_lds_granularity(sscreen, shader->selector->stage);
1184 unsigned lds_per_wave = 0;
1185 unsigned max_simd_waves;
1186
1187 max_simd_waves = sscreen->info.max_waves_per_simd;
1188
1189 /* Compute LDS usage for PS. */
1190 switch (shader->selector->stage) {
1191 case MESA_SHADER_FRAGMENT:
1192 /* The minimum usage per wave is (num_inputs * 48). The maximum
1193 * usage is (num_inputs * 48 * 16).
1194 * We can get anything in between and it varies between waves.
1195 *
1196 * The 48 bytes per input for a single primitive is equal to
1197 * 4 bytes/component * 4 components/input * 3 points.
1198 *
1199 * Other stages don't know the size at compile time or don't
1200 * allocate LDS per wave, but instead they do it per thread group.
1201 */
1202 lds_per_wave = conf->lds_size * lds_increment +
1203 align(shader->info.num_ps_inputs * 48, lds_increment);
1204 break;
1205 case MESA_SHADER_COMPUTE: {
1206 unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
1207 lds_per_wave = (conf->lds_size * lds_increment) /
1208 DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
1209 }
1210 break;
1211 default:;
1212 }
1213
1214 /* Compute the per-SIMD wave counts. */
1215 if (conf->num_sgprs) {
1216 max_simd_waves =
1217 MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
1218 }
1219
1220 if (conf->num_vgprs) {
1221 /* GFX 10.3 internally:
1222 * - aligns VGPRS to 16 for Wave32 and 8 for Wave64
1223 * - aligns LDS to 1024
1224 *
1225 * For shader-db stats, set num_vgprs that the hw actually uses.
1226 */
1227 unsigned num_vgprs = conf->num_vgprs;
1228 if (sscreen->info.gfx_level >= GFX10_3) {
1229 unsigned real_vgpr_gran = sscreen->info.num_physical_wave64_vgprs_per_simd / 64;
1230 num_vgprs = util_align_npot(num_vgprs, real_vgpr_gran * (shader->wave_size == 32 ? 2 : 1));
1231 } else {
1232 num_vgprs = align(num_vgprs, shader->wave_size == 32 ? 8 : 4);
1233 }
1234
1235 /* Always print wave limits as Wave64, so that we can compare
1236 * Wave32 and Wave64 with shader-db fairly. */
1237 unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
1238 max_simd_waves = MIN2(max_simd_waves, max_vgprs / num_vgprs);
1239 }
1240
1241 unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
1242 if (lds_per_wave)
1243 max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);
1244
1245 shader->info.max_simd_waves = max_simd_waves;
1246 }
1247
si_shader_dump_stats_for_shader_db(struct si_screen * screen,struct si_shader * shader,struct util_debug_callback * debug)1248 void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
1249 struct util_debug_callback *debug)
1250 {
1251 const struct ac_shader_config *conf = &shader->config;
1252 static const char *stages[] = {"VS", "TCS", "TES", "GS", "PS", "CS"};
1253
1254 if (screen->options.debug_disassembly)
1255 si_shader_dump_disassembly(screen, &shader->binary, shader->selector->stage,
1256 shader->wave_size, debug, "main", NULL);
1257
1258 unsigned num_ls_outputs = 0;
1259 unsigned num_hs_outputs = 0;
1260 unsigned num_es_outputs = 0;
1261 unsigned num_gs_outputs = 0;
1262 unsigned num_vs_outputs = 0;
1263 unsigned num_ps_outputs = 0;
1264
1265 if (shader->selector->stage <= MESA_SHADER_GEOMETRY) {
1266 /* This doesn't include pos exports because only param exports are interesting
1267 * for performance and can be optimized.
1268 */
1269 if (shader->key.ge.as_ls)
1270 num_ls_outputs = shader->selector->info.lshs_vertex_stride / 16;
1271 else if (shader->selector->stage == MESA_SHADER_TESS_CTRL)
1272 num_hs_outputs = util_last_bit64(shader->selector->info.outputs_written_before_tes_gs);
1273 else if (shader->key.ge.as_es)
1274 num_es_outputs = shader->selector->info.esgs_vertex_stride / 16;
1275 else if (shader->gs_copy_shader)
1276 num_gs_outputs = shader->gs_copy_shader->info.nr_param_exports;
1277 else if (shader->selector->stage == MESA_SHADER_GEOMETRY)
1278 num_gs_outputs = shader->info.nr_param_exports;
1279 else if (shader->selector->stage == MESA_SHADER_VERTEX ||
1280 shader->selector->stage == MESA_SHADER_TESS_EVAL)
1281 num_vs_outputs = shader->info.nr_param_exports;
1282 else
1283 unreachable("invalid shader key");
1284 } else if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
1285 num_ps_outputs = util_bitcount(shader->selector->info.colors_written) +
1286 (shader->selector->info.writes_z ||
1287 shader->selector->info.writes_stencil ||
1288 shader->ps.writes_samplemask);
1289 }
1290
1291 util_debug_message(debug, SHADER_INFO,
1292 "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
1293 "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
1294 "Spilled VGPRs: %d PrivMem VGPRs: %d LSOutputs: %u HSOutputs: %u "
1295 "HSPatchOuts: %u ESOutputs: %u GSOutputs: %u VSOutputs: %u PSOutputs: %u "
1296 "InlineUniforms: %u DivergentLoop: %u (%s, W%u)",
1297 conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
1298 conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
1299 conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
1300 num_ls_outputs, num_hs_outputs,
1301 util_last_bit64(shader->selector->info.patch_outputs_written),
1302 num_es_outputs, num_gs_outputs, num_vs_outputs, num_ps_outputs,
1303 shader->selector->info.base.num_inlinable_uniforms,
1304 shader->selector->info.has_divergent_loop,
1305 stages[shader->selector->stage], shader->wave_size);
1306 }
1307
si_can_dump_shader(struct si_screen * sscreen,gl_shader_stage stage,enum si_shader_dump_type dump_type)1308 bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage,
1309 enum si_shader_dump_type dump_type)
1310 {
1311 static uint64_t filter[] = {
1312 [SI_DUMP_SHADER_KEY] = DBG(NIR) | DBG(INIT_LLVM) | DBG(LLVM) | DBG(INIT_ACO) | DBG(ACO) | DBG(ASM),
1313 [SI_DUMP_INIT_NIR] = DBG(INIT_NIR),
1314 [SI_DUMP_NIR] = DBG(NIR),
1315 [SI_DUMP_INIT_LLVM_IR] = DBG(INIT_LLVM),
1316 [SI_DUMP_LLVM_IR] = DBG(LLVM),
1317 [SI_DUMP_INIT_ACO_IR] = DBG(INIT_ACO),
1318 [SI_DUMP_ACO_IR] = DBG(ACO),
1319 [SI_DUMP_ASM] = DBG(ASM),
1320 [SI_DUMP_STATS] = DBG(STATS),
1321 [SI_DUMP_ALWAYS] = DBG(VS) | DBG(TCS) | DBG(TES) | DBG(GS) | DBG(PS) | DBG(CS),
1322 };
1323 assert(dump_type < ARRAY_SIZE(filter));
1324
1325 return sscreen->debug_flags & (1 << stage) &&
1326 sscreen->debug_flags & filter[dump_type];
1327 }
1328
si_shader_dump_stats(struct si_screen * sscreen,struct si_shader * shader,FILE * file,bool check_debug_option)1329 static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
1330 bool check_debug_option)
1331 {
1332 const struct ac_shader_config *conf = &shader->config;
1333
1334 if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
1335 fprintf(file,
1336 "*** SHADER CONFIG ***\n"
1337 "SPI_PS_INPUT_ADDR = 0x%04x\n"
1338 "SPI_PS_INPUT_ENA = 0x%04x\n",
1339 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
1340 }
1341
1342 fprintf(file,
1343 "*** SHADER STATS ***\n"
1344 "SGPRS: %d\n"
1345 "VGPRS: %d\n"
1346 "Spilled SGPRs: %d\n"
1347 "Spilled VGPRs: %d\n"
1348 "Private memory VGPRs: %d\n"
1349 "Code Size: %d bytes\n"
1350 "LDS: %d bytes\n"
1351 "Scratch: %d bytes per wave\n"
1352 "Max Waves: %d\n"
1353 "********************\n\n\n",
1354 conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
1355 shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
1356 conf->lds_size * get_lds_granularity(sscreen, shader->selector->stage),
1357 conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
1358 }
1359
si_get_shader_name(const struct si_shader * shader)1360 const char *si_get_shader_name(const struct si_shader *shader)
1361 {
1362 switch (shader->selector->stage) {
1363 case MESA_SHADER_VERTEX:
1364 if (shader->key.ge.as_es)
1365 return "Vertex Shader as ES";
1366 else if (shader->key.ge.as_ls)
1367 return "Vertex Shader as LS";
1368 else if (shader->key.ge.as_ngg)
1369 return "Vertex Shader as ESGS";
1370 else
1371 return "Vertex Shader as VS";
1372 case MESA_SHADER_TESS_CTRL:
1373 return "Tessellation Control Shader";
1374 case MESA_SHADER_TESS_EVAL:
1375 if (shader->key.ge.as_es)
1376 return "Tessellation Evaluation Shader as ES";
1377 else if (shader->key.ge.as_ngg)
1378 return "Tessellation Evaluation Shader as ESGS";
1379 else
1380 return "Tessellation Evaluation Shader as VS";
1381 case MESA_SHADER_GEOMETRY:
1382 if (shader->is_gs_copy_shader)
1383 return "GS Copy Shader as VS";
1384 else
1385 return "Geometry Shader";
1386 case MESA_SHADER_FRAGMENT:
1387 return "Pixel Shader";
1388 case MESA_SHADER_COMPUTE:
1389 return "Compute Shader";
1390 default:
1391 return "Unknown Shader";
1392 }
1393 }
1394
si_shader_dump(struct si_screen * sscreen,struct si_shader * shader,struct util_debug_callback * debug,FILE * file,bool check_debug_option)1395 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
1396 struct util_debug_callback *debug, FILE *file, bool check_debug_option)
1397 {
1398 gl_shader_stage stage = shader->selector->stage;
1399
1400 if (!check_debug_option || si_can_dump_shader(sscreen, stage, SI_DUMP_SHADER_KEY))
1401 si_dump_shader_key(shader, file);
1402
1403 if (!check_debug_option && shader->binary.llvm_ir_string) {
1404 /* This is only used with ddebug. */
1405 if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
1406 fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
1407 fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
1408 }
1409
1410 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
1411 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
1412 }
1413
1414 if (!check_debug_option || (si_can_dump_shader(sscreen, stage, SI_DUMP_ASM))) {
1415 fprintf(file, "\n%s:\n", si_get_shader_name(shader));
1416
1417 if (shader->prolog)
1418 si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, shader->wave_size, debug,
1419 "prolog", file);
1420 if (shader->previous_stage)
1421 si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
1422 shader->wave_size, debug, "previous stage", file);
1423 si_shader_dump_disassembly(sscreen, &shader->binary, stage, shader->wave_size, debug, "main",
1424 file);
1425
1426 if (shader->epilog)
1427 si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, shader->wave_size, debug,
1428 "epilog", file);
1429 fprintf(file, "\n");
1430
1431 si_shader_dump_stats(sscreen, shader, file, check_debug_option);
1432 }
1433 }
1434
si_dump_shader_key_vs(const union si_shader_key * key,FILE * f)1435 static void si_dump_shader_key_vs(const union si_shader_key *key, FILE *f)
1436 {
1437 fprintf(f, " mono.instance_divisor_is_one = %u\n", key->ge.mono.instance_divisor_is_one);
1438 fprintf(f, " mono.instance_divisor_is_fetched = %u\n",
1439 key->ge.mono.instance_divisor_is_fetched);
1440 fprintf(f, " mono.vs.fetch_opencode = %x\n", key->ge.mono.vs_fetch_opencode);
1441 fprintf(f, " mono.vs.fix_fetch = {");
1442 for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
1443 union si_vs_fix_fetch fix = key->ge.mono.vs_fix_fetch[i];
1444 if (i)
1445 fprintf(f, ", ");
1446 if (!fix.bits)
1447 fprintf(f, "0");
1448 else
1449 fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
1450 fix.u.format);
1451 }
1452 fprintf(f, "}\n");
1453 }
1454
si_dump_shader_key(const struct si_shader * shader,FILE * f)1455 static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
1456 {
1457 const union si_shader_key *key = &shader->key;
1458 gl_shader_stage stage = shader->selector->stage;
1459
1460 fprintf(f, "SHADER KEY\n");
1461 fprintf(f, " source_sha1 = {");
1462 _mesa_sha1_print(f, shader->selector->info.base.source_sha1);
1463 fprintf(f, "}\n");
1464
1465 switch (stage) {
1466 case MESA_SHADER_VERTEX:
1467 si_dump_shader_key_vs(key, f);
1468 fprintf(f, " as_es = %u\n", key->ge.as_es);
1469 fprintf(f, " as_ls = %u\n", key->ge.as_ls);
1470 fprintf(f, " as_ngg = %u\n", key->ge.as_ngg);
1471 fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
1472 break;
1473
1474 case MESA_SHADER_TESS_CTRL:
1475 if (shader->selector->screen->info.gfx_level >= GFX9)
1476 si_dump_shader_key_vs(key, f);
1477
1478 fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->ge.part.tcs.epilog.prim_mode);
1479 fprintf(f, " opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
1480 fprintf(f, " opt.same_patch_vertices = %u\n", key->ge.opt.same_patch_vertices);
1481 break;
1482
1483 case MESA_SHADER_TESS_EVAL:
1484 fprintf(f, " as_es = %u\n", key->ge.as_es);
1485 fprintf(f, " as_ngg = %u\n", key->ge.as_ngg);
1486 fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
1487 break;
1488
1489 case MESA_SHADER_GEOMETRY:
1490 if (shader->is_gs_copy_shader)
1491 break;
1492
1493 if (shader->selector->screen->info.gfx_level >= GFX9 &&
1494 key->ge.part.gs.es->stage == MESA_SHADER_VERTEX)
1495 si_dump_shader_key_vs(key, f);
1496
1497 fprintf(f, " mono.u.gs_tri_strip_adj_fix = %u\n", key->ge.mono.u.gs_tri_strip_adj_fix);
1498 fprintf(f, " as_ngg = %u\n", key->ge.as_ngg);
1499 break;
1500
1501 case MESA_SHADER_COMPUTE:
1502 break;
1503
1504 case MESA_SHADER_FRAGMENT:
1505 fprintf(f, " prolog.color_two_side = %u\n", key->ps.part.prolog.color_two_side);
1506 fprintf(f, " prolog.flatshade_colors = %u\n", key->ps.part.prolog.flatshade_colors);
1507 fprintf(f, " prolog.poly_stipple = %u\n", key->ps.part.prolog.poly_stipple);
1508 fprintf(f, " prolog.force_persp_sample_interp = %u\n",
1509 key->ps.part.prolog.force_persp_sample_interp);
1510 fprintf(f, " prolog.force_linear_sample_interp = %u\n",
1511 key->ps.part.prolog.force_linear_sample_interp);
1512 fprintf(f, " prolog.force_persp_center_interp = %u\n",
1513 key->ps.part.prolog.force_persp_center_interp);
1514 fprintf(f, " prolog.force_linear_center_interp = %u\n",
1515 key->ps.part.prolog.force_linear_center_interp);
1516 fprintf(f, " prolog.bc_optimize_for_persp = %u\n",
1517 key->ps.part.prolog.bc_optimize_for_persp);
1518 fprintf(f, " prolog.bc_optimize_for_linear = %u\n",
1519 key->ps.part.prolog.bc_optimize_for_linear);
1520 fprintf(f, " prolog.samplemask_log_ps_iter = %u\n",
1521 key->ps.part.prolog.samplemask_log_ps_iter);
1522 fprintf(f, " epilog.spi_shader_col_format = 0x%x\n",
1523 key->ps.part.epilog.spi_shader_col_format);
1524 fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.part.epilog.color_is_int8);
1525 fprintf(f, " epilog.color_is_int10 = 0x%X\n", key->ps.part.epilog.color_is_int10);
1526 fprintf(f, " epilog.last_cbuf = %u\n", key->ps.part.epilog.last_cbuf);
1527 fprintf(f, " epilog.alpha_func = %u\n", key->ps.part.epilog.alpha_func);
1528 fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.part.epilog.alpha_to_one);
1529 fprintf(f, " epilog.alpha_to_coverage_via_mrtz = %u\n", key->ps.part.epilog.alpha_to_coverage_via_mrtz);
1530 fprintf(f, " epilog.clamp_color = %u\n", key->ps.part.epilog.clamp_color);
1531 fprintf(f, " epilog.dual_src_blend_swizzle = %u\n", key->ps.part.epilog.dual_src_blend_swizzle);
1532 fprintf(f, " epilog.rbplus_depth_only_opt = %u\n", key->ps.part.epilog.rbplus_depth_only_opt);
1533 fprintf(f, " epilog.kill_samplemask = %u\n", key->ps.part.epilog.kill_samplemask);
1534 fprintf(f, " mono.poly_line_smoothing = %u\n", key->ps.mono.poly_line_smoothing);
1535 fprintf(f, " mono.point_smoothing = %u\n", key->ps.mono.point_smoothing);
1536 fprintf(f, " mono.interpolate_at_sample_force_center = %u\n",
1537 key->ps.mono.interpolate_at_sample_force_center);
1538 fprintf(f, " mono.fbfetch_msaa = %u\n", key->ps.mono.fbfetch_msaa);
1539 fprintf(f, " mono.fbfetch_is_1D = %u\n", key->ps.mono.fbfetch_is_1D);
1540 fprintf(f, " mono.fbfetch_layered = %u\n", key->ps.mono.fbfetch_layered);
1541 break;
1542
1543 default:
1544 assert(0);
1545 }
1546
1547 if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||
1548 stage == MESA_SHADER_VERTEX) &&
1549 !key->ge.as_es && !key->ge.as_ls) {
1550 fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs);
1551 fprintf(f, " opt.kill_pointsize = 0x%x\n", key->ge.opt.kill_pointsize);
1552 fprintf(f, " opt.kill_layer = 0x%x\n", key->ge.opt.kill_layer);
1553 fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances);
1554 fprintf(f, " opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling);
1555 fprintf(f, " opt.remove_streamout = 0x%x\n", key->ge.opt.remove_streamout);
1556 }
1557
1558 if (stage <= MESA_SHADER_GEOMETRY)
1559 fprintf(f, " opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
1560 else
1561 fprintf(f, " opt.prefer_mono = %u\n", key->ps.opt.prefer_mono);
1562
1563 if (stage <= MESA_SHADER_GEOMETRY) {
1564 if (key->ge.opt.inline_uniforms) {
1565 fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
1566 key->ge.opt.inline_uniforms,
1567 key->ge.opt.inlined_uniform_values[0],
1568 key->ge.opt.inlined_uniform_values[1],
1569 key->ge.opt.inlined_uniform_values[2],
1570 key->ge.opt.inlined_uniform_values[3]);
1571 } else {
1572 fprintf(f, " opt.inline_uniforms = 0\n");
1573 }
1574 } else {
1575 if (key->ps.opt.inline_uniforms) {
1576 fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
1577 key->ps.opt.inline_uniforms,
1578 key->ps.opt.inlined_uniform_values[0],
1579 key->ps.opt.inlined_uniform_values[1],
1580 key->ps.opt.inlined_uniform_values[2],
1581 key->ps.opt.inlined_uniform_values[3]);
1582 } else {
1583 fprintf(f, " opt.inline_uniforms = 0\n");
1584 }
1585 }
1586 }
1587
/* Remove output stores that are disabled by the shader key: varyings listed in
 * opt.kill_outputs, plus the PSIZ / LAYER / CLIP_DIST / CLIP_VERTEX system-value
 * outputs killed by their dedicated key bits. Returns true on any removal.
 *
 * Only called for pre-rasterization stages (VS/TES/GS, see the assert below).
 *
 * TODO: convert to nir_shader_instructions_pass
 */
static bool si_nir_kill_outputs(nir_shader *nir, const union si_shader_key *key)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
   assert(impl);
   assert(nir->info.stage <= MESA_SHADER_GEOMETRY);

   /* Early out when the key kills nothing. LAYER is also checked because its
    * varying store is unconditionally removed below (the FS reads gl_Layer as
    * a system value instead of a varying).
    */
   if (!key->ge.opt.kill_outputs &&
       !key->ge.opt.kill_pointsize &&
       !key->ge.opt.kill_layer &&
       !key->ge.opt.kill_clip_distances &&
       !(nir->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER))) {
      nir_metadata_preserve(impl, nir_metadata_all);
      return false;
   }

   bool progress = false;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
         if (intr->intrinsic != nir_intrinsic_store_output)
            continue;

         /* No indirect indexing allowed. */
         ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
         assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

         assert(intr->num_components == 1); /* only scalar stores expected */
         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

         /* Kill the varying (FS-consumed) side of outputs selected by the
          * kill_outputs bitmask, indexed by the driver's unique slot index.
          */
         if (nir_slot_is_varying(sem.location) &&
             key->ge.opt.kill_outputs &
             (1ull << si_shader_io_get_unique_index(sem.location)))
            progress |= nir_remove_varying(intr, MESA_SHADER_FRAGMENT);

         /* Kill system-value outputs selected by dedicated key bits. */
         switch (sem.location) {
         case VARYING_SLOT_PSIZ:
            if (key->ge.opt.kill_pointsize)
               progress |= nir_remove_sysval_output(intr);
            break;

         case VARYING_SLOT_CLIP_VERTEX:
            /* TODO: We should only kill specific clip planes as required by kill_clip_distance,
             * not whole gl_ClipVertex. Lower ClipVertex in NIR.
             */
            if ((key->ge.opt.kill_clip_distances & SI_USER_CLIP_PLANE_MASK) ==
                SI_USER_CLIP_PLANE_MASK)
               progress |= nir_remove_sysval_output(intr);
            break;

         case VARYING_SLOT_CLIP_DIST0:
         case VARYING_SLOT_CLIP_DIST1:
            if (key->ge.opt.kill_clip_distances) {
               assert(nir_intrinsic_src_type(intr) == nir_type_float32);
               /* Each CLIP_DISTn slot holds 4 scalar distances; map slot +
                * component to the 0..7 clip-distance index the key uses.
                */
               unsigned index = (sem.location - VARYING_SLOT_CLIP_DIST0) * 4 +
                                nir_intrinsic_component(intr);

               if (key->ge.opt.kill_clip_distances & BITFIELD_BIT(index))
                  progress |= nir_remove_sysval_output(intr);
            }
            break;

         case VARYING_SLOT_LAYER:
            /* LAYER is never passed to FS. Instead, we load it there as a system value. */
            progress |= nir_remove_varying(intr, MESA_SHADER_FRAGMENT);

            if (key->ge.opt.kill_layer)
               progress |= nir_remove_sysval_output(intr);
            break;
         }
      }
   }

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_dominance |
                                  nir_metadata_block_index);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}
1674
/* Remove PS output components from NIR if they are disabled by spi_shader_col_format.
 * Depth/stencil are kept; sample mask is removed only when kill_samplemask is set.
 * Partially enabled color outputs get their dead components replaced with undef.
 */
static bool kill_ps_outputs_cb(struct nir_builder *b, nir_instr *instr, void *_key)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_store_output)
      return false;

   /* No indirect indexing allowed. */
   ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
   assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);

   unsigned location = nir_intrinsic_io_semantics(intr).location;
   const union si_shader_key *key = _key;

   switch (location) {
   case FRAG_RESULT_DEPTH:
   case FRAG_RESULT_STENCIL:
      /* Never killed by the color-format key. */
      return false;

   case FRAG_RESULT_SAMPLE_MASK:
      if (key->ps.part.epilog.kill_samplemask) {
         nir_instr_remove(instr);
         return true;
      }
      return false;
   }

   /* Color outputs. */
   unsigned comp_mask = BITFIELD_MASK(intr->num_components);
   assert(nir_intrinsic_component(intr) == 0);
   /* 4 enable bits per color buffer, derived from the export format. */
   unsigned cb_shader_mask = ac_get_cb_shader_mask(key->ps.part.epilog.spi_shader_col_format);

   /* If COLOR is broadcasted to multiple color buffers, combine their masks. */
   if (location == FRAG_RESULT_COLOR) {
      for (unsigned i = 1; i <= key->ps.part.epilog.last_cbuf; i++)
         cb_shader_mask |= (cb_shader_mask >> (i * 4)) & 0xf;
   }

   /* Select the 4-bit component-enable mask for this color buffer. */
   unsigned index = location == FRAG_RESULT_COLOR ? 0 : location - FRAG_RESULT_DATA0;
   unsigned output_mask = (cb_shader_mask >> (index * 4)) & 0xf;

   /* All stored components enabled: nothing to do. */
   if ((output_mask & comp_mask) == comp_mask)
      return false;

   /* No stored component enabled: drop the whole store. */
   if (!(output_mask & comp_mask)) {
      nir_instr_remove(instr);
      return true;
   }

   /* Fill disabled components with undef. */
   b->cursor = nir_before_instr(instr);
   nir_def *new_value = intr->src[0].ssa;
   nir_def *undef = nir_undef(b, 1, new_value->bit_size);

   unsigned kill_mask = ~output_mask & comp_mask;
   u_foreach_bit(i, kill_mask) {
      new_value = nir_vector_insert_imm(b, new_value, undef, i);
   }

   nir_src_rewrite(&intr->src[0], new_value);
   return true;
}
1740
si_nir_kill_ps_outputs(nir_shader * nir,const union si_shader_key * key)1741 static bool si_nir_kill_ps_outputs(nir_shader *nir, const union si_shader_key *key)
1742 {
1743 assert(nir->info.stage == MESA_SHADER_FRAGMENT);
1744 return nir_shader_instructions_pass(nir, kill_ps_outputs_cb,
1745 nir_metadata_dominance |
1746 nir_metadata_block_index, (void*)key);
1747 }
1748
clamp_vertex_color_instr(nir_builder * b,nir_intrinsic_instr * intrin,void * state)1749 static bool clamp_vertex_color_instr(nir_builder *b,
1750 nir_intrinsic_instr *intrin, void *state)
1751 {
1752 if (intrin->intrinsic != nir_intrinsic_store_output)
1753 return false;
1754
1755 unsigned location = nir_intrinsic_io_semantics(intrin).location;
1756 if (location != VARYING_SLOT_COL0 && location != VARYING_SLOT_COL1 &&
1757 location != VARYING_SLOT_BFC0 && location != VARYING_SLOT_BFC1)
1758 return false;
1759
1760 /* no indirect output */
1761 assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));
1762 /* only scalar output */
1763 assert(intrin->src[0].ssa->num_components == 1);
1764
1765 b->cursor = nir_before_instr(&intrin->instr);
1766
1767 nir_def *color = intrin->src[0].ssa;
1768 nir_def *clamp = nir_load_clamp_vertex_color_amd(b);
1769 nir_def *new_color = nir_bcsel(b, clamp, nir_fsat(b, color), color);
1770 nir_src_rewrite(&intrin->src[0], new_color);
1771
1772 return true;
1773 }
1774
si_nir_clamp_vertex_color(nir_shader * nir)1775 static bool si_nir_clamp_vertex_color(nir_shader *nir)
1776 {
1777 uint64_t mask = VARYING_BIT_COL0 | VARYING_BIT_COL1 | VARYING_BIT_BFC0 | VARYING_BIT_BFC1;
1778 if (!(nir->info.outputs_written & mask))
1779 return false;
1780
1781 return nir_shader_intrinsics_pass(nir, clamp_vertex_color_instr,
1782 nir_metadata_dominance | nir_metadata_block_index,
1783 NULL);
1784 }
1785
si_map_io_driver_location(unsigned semantic)1786 static unsigned si_map_io_driver_location(unsigned semantic)
1787 {
1788 if ((semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_TESS_MAX) ||
1789 semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
1790 semantic == VARYING_SLOT_TESS_LEVEL_OUTER)
1791 return ac_shader_io_get_unique_index_patch(semantic);
1792
1793 return si_shader_io_get_unique_index(semantic);
1794 }
1795
si_lower_io_to_mem(struct si_shader * shader,nir_shader * nir,uint64_t tcs_vgpr_only_inputs)1796 static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir,
1797 uint64_t tcs_vgpr_only_inputs)
1798 {
1799 struct si_shader_selector *sel = shader->selector;
1800 const union si_shader_key *key = &shader->key;
1801
1802 if (nir->info.stage == MESA_SHADER_VERTEX) {
1803 if (key->ge.as_ls) {
1804 NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem, si_map_io_driver_location,
1805 key->ge.opt.same_patch_vertices, tcs_vgpr_only_inputs);
1806 return true;
1807 } else if (key->ge.as_es) {
1808 NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
1809 sel->screen->info.gfx_level, sel->info.esgs_vertex_stride);
1810 return true;
1811 }
1812 } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
1813 NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem, si_map_io_driver_location,
1814 key->ge.opt.same_patch_vertices);
1815
1816 /* Used by hs_emit_write_tess_factors() when monolithic shader. */
1817 nir->info.tess._primitive_mode = key->ge.part.tcs.epilog.prim_mode;
1818
1819 NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, si_map_io_driver_location,
1820 sel->screen->info.gfx_level,
1821 /* Used by hs_emit_write_tess_factors() when monolithic shader. */
1822 key->ge.part.tcs.epilog.tes_reads_tess_factors,
1823 ~0ULL, ~0ULL, /* no TES inputs filter */
1824 util_last_bit64(sel->info.outputs_written_before_tes_gs),
1825 util_last_bit64(sel->info.patch_outputs_written),
1826 shader->wave_size,
1827 /* ALL TCS inputs are passed by register. */
1828 key->ge.opt.same_patch_vertices &&
1829 !(sel->info.base.inputs_read & ~sel->info.tcs_vgpr_only_inputs),
1830 sel->info.tessfactors_are_def_in_all_invocs,
1831 /* Emit epilog only when monolithic shader. */
1832 shader->is_monolithic);
1833 return true;
1834 } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
1835 NIR_PASS_V(nir, ac_nir_lower_tes_inputs_to_mem, si_map_io_driver_location);
1836
1837 if (key->ge.as_es) {
1838 NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
1839 sel->screen->info.gfx_level, sel->info.esgs_vertex_stride);
1840 }
1841
1842 return true;
1843 } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
1844 NIR_PASS_V(nir, ac_nir_lower_gs_inputs_to_mem, si_map_io_driver_location,
1845 sel->screen->info.gfx_level, key->ge.mono.u.gs_tri_strip_adj_fix);
1846 return true;
1847 }
1848
1849 return false;
1850 }
1851
si_lower_ngg(struct si_shader * shader,nir_shader * nir)1852 static void si_lower_ngg(struct si_shader *shader, nir_shader *nir)
1853 {
1854 struct si_shader_selector *sel = shader->selector;
1855 const union si_shader_key *key = &shader->key;
1856 assert(key->ge.as_ngg);
1857
1858 uint8_t clip_cull_dist_mask =
1859 (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) |
1860 sel->info.culldist_mask;
1861
1862 ac_nir_lower_ngg_options options = {
1863 .family = sel->screen->info.family,
1864 .gfx_level = sel->screen->info.gfx_level,
1865 .max_workgroup_size = si_get_max_workgroup_size(shader),
1866 .wave_size = shader->wave_size,
1867 .can_cull = !!key->ge.opt.ngg_culling,
1868 .disable_streamout = !si_shader_uses_streamout(shader),
1869 .vs_output_param_offset = shader->info.vs_output_param_offset,
1870 .has_param_exports = shader->info.nr_param_exports,
1871 .clip_cull_dist_mask = clip_cull_dist_mask,
1872 .kill_pointsize = key->ge.opt.kill_pointsize,
1873 .kill_layer = key->ge.opt.kill_layer,
1874 .force_vrs = sel->screen->options.vrs2x2,
1875 };
1876
1877 if (nir->info.stage == MESA_SHADER_VERTEX ||
1878 nir->info.stage == MESA_SHADER_TESS_EVAL) {
1879 /* Per instance inputs, used to remove instance load after culling. */
1880 unsigned instance_rate_inputs = 0;
1881
1882 if (nir->info.stage == MESA_SHADER_VERTEX) {
1883 instance_rate_inputs = key->ge.mono.instance_divisor_is_one |
1884 key->ge.mono.instance_divisor_is_fetched;
1885
1886 /* Manually mark the instance ID used, so the shader can repack it. */
1887 if (instance_rate_inputs)
1888 BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
1889 } else {
1890 /* Manually mark the primitive ID used, so the shader can repack it. */
1891 if (key->ge.mono.u.vs_export_prim_id)
1892 BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
1893 }
1894
1895 unsigned clip_plane_enable =
1896 SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(key->ge.opt.ngg_culling);
1897 unsigned num_vertices = gfx10_ngg_get_vertices_per_prim(shader);
1898
1899 options.num_vertices_per_primitive = num_vertices ? num_vertices : 3;
1900 options.early_prim_export = gfx10_ngg_export_prim_early(shader);
1901 options.passthrough = gfx10_is_ngg_passthrough(shader);
1902 options.use_edgeflags = gfx10_edgeflags_have_effect(shader);
1903 options.has_gen_prim_query = options.has_xfb_prim_query =
1904 sel->screen->info.gfx_level >= GFX11 && !sel->info.base.vs.blit_sgprs_amd;
1905 options.export_primitive_id = key->ge.mono.u.vs_export_prim_id;
1906 options.instance_rate_inputs = instance_rate_inputs;
1907 options.user_clip_plane_enable_mask = clip_plane_enable;
1908
1909 NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options);
1910 } else {
1911 assert(nir->info.stage == MESA_SHADER_GEOMETRY);
1912
1913 options.gs_out_vtx_bytes = sel->info.gsvs_vertex_size;
1914 options.has_gen_prim_query = options.has_xfb_prim_query =
1915 sel->screen->info.gfx_level >= GFX11;
1916 options.has_gs_invocations_query = sel->screen->info.gfx_level < GFX11;
1917 options.has_gs_primitives_query = true;
1918
1919 /* For monolithic ES/GS to add vscnt wait when GS export pos0. */
1920 if (key->ge.part.gs.es)
1921 nir->info.writes_memory |= key->ge.part.gs.es->info.base.writes_memory;
1922
1923 NIR_PASS_V(nir, ac_nir_lower_ngg_gs, &options);
1924 }
1925
1926 /* may generate some subgroup op like ballot */
1927 NIR_PASS_V(nir, nir_lower_subgroups, &si_nir_subgroups_options);
1928
1929 /* may generate some vector output store */
1930 NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
1931 }
1932
si_deserialize_shader(struct si_shader_selector * sel)1933 struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel)
1934 {
1935 struct pipe_screen *screen = &sel->screen->b;
1936 const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
1937 pipe_shader_type_from_mesa(sel->stage));
1938
1939 struct blob_reader blob_reader;
1940 blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
1941 return nir_deserialize(NULL, options, &blob_reader);
1942 }
1943
si_nir_assign_param_offsets(nir_shader * nir,struct si_shader * shader,int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS])1944 static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shader,
1945 int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS])
1946 {
1947 struct si_shader_selector *sel = shader->selector;
1948 struct si_shader_binary_info *info = &shader->info;
1949
1950 uint64_t outputs_written = 0;
1951 uint32_t outputs_written_16bit = 0;
1952
1953 nir_function_impl *impl = nir_shader_get_entrypoint(nir);
1954 assert(impl);
1955
1956 nir_foreach_block(block, impl) {
1957 nir_foreach_instr_safe(instr, block) {
1958 if (instr->type != nir_instr_type_intrinsic)
1959 continue;
1960
1961 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1962 if (intr->intrinsic != nir_intrinsic_store_output)
1963 continue;
1964
1965 /* No indirect indexing allowed. */
1966 ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
1967 assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);
1968
1969 assert(intr->num_components == 1); /* only scalar stores expected */
1970 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
1971
1972 if (sem.location >= VARYING_SLOT_VAR0_16BIT)
1973 outputs_written_16bit |= BITFIELD_BIT(sem.location - VARYING_SLOT_VAR0_16BIT);
1974 else
1975 outputs_written |= BITFIELD64_BIT(sem.location);
1976
1977 /* Assign the param index if it's unassigned. */
1978 if (nir_slot_is_varying(sem.location) && !sem.no_varying &&
1979 (sem.gs_streams & 0x3) == 0 &&
1980 info->vs_output_param_offset[sem.location] == AC_EXP_PARAM_DEFAULT_VAL_0000) {
1981 /* The semantic and the base should be the same as in si_shader_info. */
1982 assert(sem.location == sel->info.output_semantic[nir_intrinsic_base(intr)]);
1983 /* It must not be remapped (duplicated). */
1984 assert(slot_remap[sem.location] == -1);
1985
1986 info->vs_output_param_offset[sem.location] = info->nr_param_exports++;
1987 }
1988 }
1989 }
1990
1991 /* Duplicated outputs are redirected here. */
1992 for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
1993 if (slot_remap[i] >= 0)
1994 info->vs_output_param_offset[i] = info->vs_output_param_offset[slot_remap[i]];
1995 }
1996
1997 if (shader->key.ge.mono.u.vs_export_prim_id) {
1998 info->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = info->nr_param_exports++;
1999 }
2000
2001 /* Update outputs written info, we may remove some outputs before. */
2002 nir->info.outputs_written = outputs_written;
2003 nir->info.outputs_written_16bit = outputs_written_16bit;
2004 }
2005
si_assign_param_offsets(nir_shader * nir,struct si_shader * shader)2006 static void si_assign_param_offsets(nir_shader *nir, struct si_shader *shader)
2007 {
2008 /* Initialize this first. */
2009 shader->info.nr_param_exports = 0;
2010
2011 STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
2012 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
2013 sizeof(shader->info.vs_output_param_offset));
2014
2015 /* A slot remapping table for duplicated outputs, so that 1 vertex shader output can be
2016 * mapped to multiple fragment shader inputs.
2017 */
2018 int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS];
2019 memset(slot_remap, -1, NUM_TOTAL_VARYING_SLOTS);
2020
2021 /* This sets DEFAULT_VAL for constant outputs in vs_output_param_offset. */
2022 /* TODO: This doesn't affect GS. */
2023 NIR_PASS_V(nir, ac_nir_optimize_outputs, false, slot_remap,
2024 shader->info.vs_output_param_offset);
2025
2026 /* Assign the non-constant outputs. */
2027 /* TODO: Use this for the GS copy shader too. */
2028 si_nir_assign_param_offsets(nir, shader, slot_remap);
2029 }
2030
si_get_nr_pos_exports(const struct si_shader_selector * sel,const union si_shader_key * key)2031 static unsigned si_get_nr_pos_exports(const struct si_shader_selector *sel,
2032 const union si_shader_key *key)
2033 {
2034 const struct si_shader_info *info = &sel->info;
2035
2036 /* Must have a position export. */
2037 unsigned nr_pos_exports = 1;
2038
2039 if ((info->writes_psize && !key->ge.opt.kill_pointsize) ||
2040 (info->writes_edgeflag && !key->ge.as_ngg) ||
2041 (info->writes_layer && !key->ge.opt.kill_layer) ||
2042 info->writes_viewport_index || sel->screen->options.vrs2x2) {
2043 nr_pos_exports++;
2044 }
2045
2046 unsigned clipdist_mask =
2047 (info->clipdist_mask & ~key->ge.opt.kill_clip_distances) | info->culldist_mask;
2048
2049 for (int i = 0; i < 2; i++) {
2050 if (clipdist_mask & BITFIELD_RANGE(i * 4, 4))
2051 nr_pos_exports++;
2052 }
2053
2054 return nr_pos_exports;
2055 }
2056
lower_ps_load_color_intrinsic(nir_builder * b,nir_instr * instr,void * state)2057 static bool lower_ps_load_color_intrinsic(nir_builder *b, nir_instr *instr, void *state)
2058 {
2059 nir_def **colors = (nir_def **)state;
2060
2061 if (instr->type != nir_instr_type_intrinsic)
2062 return false;
2063
2064 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
2065
2066 if (intrin->intrinsic != nir_intrinsic_load_color0 &&
2067 intrin->intrinsic != nir_intrinsic_load_color1)
2068 return false;
2069
2070 unsigned index = intrin->intrinsic == nir_intrinsic_load_color0 ? 0 : 1;
2071 assert(colors[index]);
2072
2073 nir_def_rewrite_uses(&intrin->def, colors[index]);
2074
2075 nir_instr_remove(&intrin->instr);
2076 return true;
2077 }
2078
si_nir_lower_ps_color_input(nir_shader * nir,const union si_shader_key * key,const struct si_shader_info * info)2079 static bool si_nir_lower_ps_color_input(nir_shader *nir, const union si_shader_key *key,
2080 const struct si_shader_info *info)
2081 {
2082 bool progress = false;
2083 nir_function_impl *impl = nir_shader_get_entrypoint(nir);
2084
2085 nir_builder builder = nir_builder_at(nir_before_impl(impl));
2086 nir_builder *b = &builder;
2087
2088 /* Build ready to be used colors at the beginning of the shader. */
2089 nir_def *colors[2] = {0};
2090 for (int i = 0; i < 2; i++) {
2091 if (!(info->colors_read & (0xf << (i * 4))))
2092 continue;
2093
2094 unsigned color_base = info->color_attr_index[i];
2095 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
2096 * otherwise it's at offset "num_inputs".
2097 */
2098 unsigned back_color_base = info->num_inputs;
2099 if (i == 1 && (info->colors_read & 0xf))
2100 back_color_base += 1;
2101
2102 enum glsl_interp_mode interp_mode = info->color_interpolate[i];
2103 if (interp_mode == INTERP_MODE_COLOR) {
2104 interp_mode = key->ps.part.prolog.flatshade_colors ?
2105 INTERP_MODE_FLAT : INTERP_MODE_SMOOTH;
2106 }
2107
2108 nir_def *back_color = NULL;
2109 if (interp_mode == INTERP_MODE_FLAT) {
2110 colors[i] = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
2111 .base = color_base,
2112 .io_semantics.location = VARYING_SLOT_COL0 + i,
2113 .io_semantics.num_slots = 1);
2114
2115 if (key->ps.part.prolog.color_two_side) {
2116 back_color = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
2117 .base = back_color_base,
2118 .io_semantics.location = VARYING_SLOT_BFC0 + i,
2119 .io_semantics.num_slots = 1);
2120 }
2121 } else {
2122 nir_intrinsic_op op = 0;
2123 switch (info->color_interpolate_loc[i]) {
2124 case TGSI_INTERPOLATE_LOC_CENTER:
2125 op = nir_intrinsic_load_barycentric_pixel;
2126 break;
2127 case TGSI_INTERPOLATE_LOC_CENTROID:
2128 op = nir_intrinsic_load_barycentric_centroid;
2129 break;
2130 case TGSI_INTERPOLATE_LOC_SAMPLE:
2131 op = nir_intrinsic_load_barycentric_sample;
2132 break;
2133 default:
2134 unreachable("invalid color interpolate location");
2135 break;
2136 }
2137
2138 nir_def *barycentric = nir_load_barycentric(b, op, interp_mode);
2139
2140 colors[i] =
2141 nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0),
2142 .base = color_base,
2143 .io_semantics.location = VARYING_SLOT_COL0 + i,
2144 .io_semantics.num_slots = 1);
2145
2146 if (key->ps.part.prolog.color_two_side) {
2147 back_color =
2148 nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0),
2149 .base = back_color_base,
2150 .io_semantics.location = VARYING_SLOT_BFC0 + i,
2151 .io_semantics.num_slots = 1);
2152 }
2153 }
2154
2155 if (back_color) {
2156 nir_def *is_front_face = nir_load_front_face(b, 1);
2157 colors[i] = nir_bcsel(b, is_front_face, colors[i], back_color);
2158 }
2159
2160 progress = true;
2161 }
2162
2163 /* lower nir_load_color0/1 to use the color value. */
2164 return nir_shader_instructions_pass(nir, lower_ps_load_color_intrinsic,
2165 nir_metadata_block_index | nir_metadata_dominance,
2166 colors) || progress;
2167 }
2168
si_nir_emit_polygon_stipple(nir_shader * nir,struct si_shader_args * args)2169 static void si_nir_emit_polygon_stipple(nir_shader *nir, struct si_shader_args *args)
2170 {
2171 nir_function_impl *impl = nir_shader_get_entrypoint(nir);
2172
2173 nir_builder builder = nir_builder_at(nir_before_impl(impl));
2174 nir_builder *b = &builder;
2175
2176 /* Load the buffer descriptor. */
2177 nir_def *desc =
2178 si_nir_load_internal_binding(b, args, SI_PS_CONST_POLY_STIPPLE, 4);
2179
2180 /* Use the fixed-point gl_FragCoord input.
2181 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
2182 * per coordinate to get the repeating effect.
2183 */
2184 nir_def *pos_x = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 0, 5);
2185 nir_def *pos_y = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 16, 5);
2186
2187 nir_def *zero = nir_imm_int(b, 0);
2188 /* The stipple pattern is 32x32, each row has 32 bits. */
2189 nir_def *offset = nir_ishl_imm(b, pos_y, 2);
2190 nir_def *row = nir_load_buffer_amd(b, 1, 32, desc, offset, zero, zero);
2191 nir_def *bit = nir_ubfe(b, row, pos_x, nir_imm_int(b, 1));
2192
2193 nir_def *pass = nir_i2b(b, bit);
2194 nir_discard_if(b, nir_inot(b, pass));
2195 }
2196
si_should_clear_lds(struct si_screen * sscreen,const struct nir_shader * shader)2197 bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader)
2198 {
2199 return shader->info.stage == MESA_SHADER_COMPUTE && shader->info.shared_size > 0 && sscreen->options.clear_lds;
2200 }
2201
si_get_nir_shader(struct si_shader * shader,struct si_shader_args * args,bool * free_nir,uint64_t tcs_vgpr_only_inputs,ac_nir_gs_output_info * output_info)2202 struct nir_shader *si_get_nir_shader(struct si_shader *shader,
2203 struct si_shader_args *args,
2204 bool *free_nir,
2205 uint64_t tcs_vgpr_only_inputs,
2206 ac_nir_gs_output_info *output_info)
2207 {
2208 struct si_shader_selector *sel = shader->selector;
2209 const union si_shader_key *key = &shader->key;
2210
2211 nir_shader *nir;
2212 *free_nir = false;
2213
2214 if (sel->nir) {
2215 nir = sel->nir;
2216 } else if (sel->nir_binary) {
2217 nir = si_deserialize_shader(sel);
2218 *free_nir = true;
2219 } else {
2220 return NULL;
2221 }
2222
2223 bool progress = false;
2224 bool late_opts = false;
2225
2226 const char *original_name = NULL;
2227 if (unlikely(should_print_nir(nir))) {
2228 /* Modify the shader's name so that each variant gets its own name. */
2229 original_name = ralloc_strdup(nir, nir->info.name);
2230 ralloc_asprintf_append((char **)&nir->info.name, "-%08x", _mesa_hash_data(key, sizeof(*key)));
2231
2232 /* Dummy pass to get the starting point. */
2233 printf("nir_dummy_pass\n");
2234 nir_print_shader(nir, stdout);
2235 }
2236
2237 /* Kill outputs according to the shader key. */
2238 if (sel->stage <= MESA_SHADER_GEOMETRY)
2239 NIR_PASS(progress, nir, si_nir_kill_outputs, key);
2240
2241 NIR_PASS(progress, nir, ac_nir_lower_tex,
2242 &(ac_nir_lower_tex_options){
2243 .gfx_level = sel->screen->info.gfx_level,
2244 .lower_array_layer_round_even = !sel->screen->info.conformant_trunc_coord,
2245 });
2246
2247 if (nir->info.uses_resource_info_query)
2248 NIR_PASS(progress, nir, ac_nir_lower_resinfo, sel->screen->info.gfx_level);
2249
2250 bool inline_uniforms = false;
2251 uint32_t *inlined_uniform_values;
2252 si_get_inline_uniform_state((union si_shader_key*)key, sel->pipe_shader_type,
2253 &inline_uniforms, &inlined_uniform_values);
2254
2255 if (inline_uniforms) {
2256 assert(*free_nir);
2257
2258 /* Most places use shader information from the default variant, not
2259 * the optimized variant. These are the things that the driver looks at
2260 * in optimized variants and the list of things that we need to do.
2261 *
2262 * The driver takes into account these things if they suddenly disappear
2263 * from the shader code:
2264 * - Register usage and code size decrease (obvious)
2265 * - Eliminated PS system values are disabled by LLVM
2266 * (FragCoord, FrontFace, barycentrics)
2267 * - VS/TES/GS param exports are eliminated if they are undef.
2268 * The param space for eliminated outputs is also not allocated.
2269 * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
2270 * - TCS output stores are eliminated
2271 * - Eliminated PS inputs are removed from PS.NUM_INTERP.
2272 *
2273 * TODO: These are things the driver ignores in the final shader code
2274 * and relies on the default shader info.
2275 * - System values in VS, TCS, TES, GS are not eliminated
2276 * - uses_discard - if it changed to false
2277 * - writes_memory - if it changed to false
2278 * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not
2279 * eliminated
2280 * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS)
2281 * GS outputs are eliminated except for the temporary LDS.
2282 * Clip distances, gl_PointSize, gl_Layer and PS outputs are eliminated based
2283 * on current states, so we don't care about the shader code.
2284 *
2285 * TODO: Merged shaders don't inline uniforms for the first stage.
2286 * VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS.
2287 * (key == NULL for the first stage here)
2288 *
2289 * TODO: Compute shaders don't support inlinable uniforms, because they
2290 * don't have shader variants.
2291 *
2292 * TODO: The driver uses a linear search to find a shader variant. This
2293 * can be really slow if we get too many variants due to uniform inlining.
2294 */
2295 NIR_PASS_V(nir, nir_inline_uniforms, nir->info.num_inlinable_uniforms,
2296 inlined_uniform_values, nir->info.inlinable_uniform_dw_offsets);
2297 progress = true;
2298 }
2299
2300 if (sel->stage == MESA_SHADER_FRAGMENT) {
2301 /* This uses the epilog key, so only monolithic shaders can call this. */
2302 if (shader->is_monolithic)
2303 NIR_PASS(progress, nir, si_nir_kill_ps_outputs, key);
2304
2305 if (key->ps.mono.poly_line_smoothing)
2306 NIR_PASS(progress, nir, nir_lower_poly_line_smooth, SI_NUM_SMOOTH_AA_SAMPLES);
2307
2308 if (key->ps.mono.point_smoothing)
2309 NIR_PASS(progress, nir, nir_lower_point_smooth);
2310 }
2311
2312 /* This must be before si_nir_lower_resource. */
2313 if (!sel->screen->info.has_image_opcodes)
2314 NIR_PASS(progress, nir, ac_nir_lower_image_opcodes);
2315
2316 /* LLVM does not work well with this, so is handled in llvm backend waterfall. */
2317 if (sel->screen->use_aco && sel->info.has_non_uniform_tex_access) {
2318 nir_lower_non_uniform_access_options options = {
2319 .types = nir_lower_non_uniform_texture_access,
2320 };
2321 NIR_PASS(progress, nir, nir_lower_non_uniform_access, &options);
2322 }
2323
2324 NIR_PASS(progress, nir, si_nir_lower_resource, shader, args);
2325
2326 bool is_last_vgt_stage =
2327 (sel->stage == MESA_SHADER_VERTEX ||
2328 sel->stage == MESA_SHADER_TESS_EVAL ||
2329 (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)) &&
2330 !shader->key.ge.as_ls && !shader->key.ge.as_es;
2331
2332 /* Legacy GS is not last VGT stage because it has GS copy shader. */
2333 bool is_legacy_gs = sel->stage == MESA_SHADER_GEOMETRY && !key->ge.as_ngg;
2334
2335 if (is_last_vgt_stage || is_legacy_gs)
2336 NIR_PASS(progress, nir, si_nir_clamp_vertex_color);
2337
2338 if (progress) {
2339 si_nir_opts(sel->screen, nir, true);
2340 late_opts = true;
2341 progress = false;
2342 }
2343
2344 /* Lower large variables that are always constant with load_constant intrinsics, which
2345 * get turned into PC-relative loads from a data section next to the shader.
2346 *
2347 * Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
2348 * this should be done after that.
2349 *
2350 * The pass crashes if there are dead temps of lowered IO interface types, so remove
2351 * them first.
2352 */
2353 NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
2354 NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
2355
2356 /* Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
2357 * this should be done after that.
2358 */
2359 progress |= ac_nir_lower_indirect_derefs(nir, sel->screen->info.gfx_level);
2360
2361 if (sel->stage == MESA_SHADER_VERTEX)
2362 NIR_PASS(progress, nir, si_nir_lower_vs_inputs, shader, args);
2363
2364 progress |= si_lower_io_to_mem(shader, nir, tcs_vgpr_only_inputs);
2365
2366 if (is_last_vgt_stage) {
2367 /* Assign param export indices. */
2368 si_assign_param_offsets(nir, shader);
2369
2370 /* Assign num of position exports. */
2371 shader->info.nr_pos_exports = si_get_nr_pos_exports(sel, key);
2372
2373 if (key->ge.as_ngg) {
2374 /* Lower last VGT NGG shader stage. */
2375 si_lower_ngg(shader, nir);
2376 } else if (sel->stage == MESA_SHADER_VERTEX || sel->stage == MESA_SHADER_TESS_EVAL) {
2377 /* Lower last VGT none-NGG VS/TES shader stage. */
2378 unsigned clip_cull_mask =
2379 (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) |
2380 sel->info.culldist_mask;
2381
2382 NIR_PASS_V(nir, ac_nir_lower_legacy_vs,
2383 sel->screen->info.gfx_level,
2384 clip_cull_mask,
2385 shader->info.vs_output_param_offset,
2386 shader->info.nr_param_exports,
2387 shader->key.ge.mono.u.vs_export_prim_id,
2388 !si_shader_uses_streamout(shader),
2389 key->ge.opt.kill_pointsize,
2390 key->ge.opt.kill_layer,
2391 sel->screen->options.vrs2x2);
2392 }
2393 progress = true;
2394 } else if (is_legacy_gs) {
2395 NIR_PASS_V(nir, ac_nir_lower_legacy_gs, false, sel->screen->use_ngg, output_info);
2396 progress = true;
2397 } else if (sel->stage == MESA_SHADER_FRAGMENT && shader->is_monolithic) {
2398 /* Uniform inlining can eliminate PS inputs, and colormask can remove PS outputs,
2399 * which can also cause the elimination of PS inputs. Remove holes after removed PS inputs
2400 * by renumbering them. This can only happen with monolithic PS. Colors are unaffected
2401 * because they are still represented by nir_intrinsic_load_color0/1.
2402 */
2403 NIR_PASS_V(nir, nir_recompute_io_bases, nir_var_shader_in);
2404
2405 /* Two-side color selection and interpolation: Get the latest shader info because
2406 * uniform inlining and colormask can fully eliminate color inputs.
2407 */
2408 struct si_shader_info info;
2409 si_nir_scan_shader(sel->screen, nir, &info);
2410
2411 if (info.colors_read)
2412 NIR_PASS(progress, nir, si_nir_lower_ps_color_input, &shader->key, &info);
2413
2414 /* We need to set this early for lowering nir_intrinsic_load_point_coord_maybe_flipped,
2415 * which can only occur with monolithic PS.
2416 */
2417 shader->info.num_ps_inputs = info.num_inputs;
2418 shader->info.ps_colors_read = info.colors_read;
2419
2420 ac_nir_lower_ps_options options = {
2421 .gfx_level = sel->screen->info.gfx_level,
2422 .family = sel->screen->info.family,
2423 .use_aco = sel->screen->use_aco,
2424 .uses_discard = si_shader_uses_discard(shader),
2425 .alpha_to_coverage_via_mrtz = key->ps.part.epilog.alpha_to_coverage_via_mrtz,
2426 .dual_src_blend_swizzle = key->ps.part.epilog.dual_src_blend_swizzle,
2427 .spi_shader_col_format = key->ps.part.epilog.spi_shader_col_format,
2428 .color_is_int8 = key->ps.part.epilog.color_is_int8,
2429 .color_is_int10 = key->ps.part.epilog.color_is_int10,
2430 .clamp_color = key->ps.part.epilog.clamp_color,
2431 .alpha_to_one = key->ps.part.epilog.alpha_to_one,
2432 .alpha_func = key->ps.part.epilog.alpha_func,
2433 .broadcast_last_cbuf = key->ps.part.epilog.last_cbuf,
2434 .kill_samplemask = key->ps.part.epilog.kill_samplemask,
2435
2436 .bc_optimize_for_persp = key->ps.part.prolog.bc_optimize_for_persp,
2437 .bc_optimize_for_linear = key->ps.part.prolog.bc_optimize_for_linear,
2438 .force_persp_sample_interp = key->ps.part.prolog.force_persp_sample_interp,
2439 .force_linear_sample_interp = key->ps.part.prolog.force_linear_sample_interp,
2440 .force_persp_center_interp = key->ps.part.prolog.force_persp_center_interp,
2441 .force_linear_center_interp = key->ps.part.prolog.force_linear_center_interp,
2442 .ps_iter_samples = 1 << key->ps.part.prolog.samplemask_log_ps_iter,
2443 };
2444
2445 NIR_PASS_V(nir, ac_nir_lower_ps, &options);
2446
2447 if (key->ps.part.prolog.poly_stipple)
2448 NIR_PASS_V(nir, si_nir_emit_polygon_stipple, args);
2449
2450 progress = true;
2451 }
2452
2453 NIR_PASS(progress, nir, nir_opt_idiv_const, 8);
2454 NIR_PASS(progress, nir, nir_lower_idiv,
2455 &(nir_lower_idiv_options){
2456 .allow_fp16 = sel->screen->info.gfx_level >= GFX9,
2457 });
2458
2459 if (si_should_clear_lds(sel->screen, nir)) {
2460 const unsigned chunk_size = 16; /* max single store size */
2461 const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
2462 NIR_PASS_V(nir, nir_clear_shared_memory, shared_size, chunk_size);
2463 }
2464
2465 NIR_PASS(progress, nir, ac_nir_lower_intrinsics_to_args, sel->screen->info.gfx_level,
2466 si_select_hw_stage(nir->info.stage, key, sel->screen->info.gfx_level),
2467 &args->ac);
2468 NIR_PASS(progress, nir, si_nir_lower_abi, shader, args);
2469
2470 if (progress) {
2471 si_nir_opts(sel->screen, nir, false);
2472 progress = false;
2473 late_opts = true;
2474 }
2475
2476 static const nir_opt_offsets_options offset_options = {
2477 .uniform_max = 0,
2478 .buffer_max = ~0,
2479 .shared_max = ~0,
2480 };
2481 NIR_PASS_V(nir, nir_opt_offsets, &offset_options);
2482
2483 if (late_opts)
2484 si_nir_late_opts(nir);
2485
2486 /* aco only accept scalar const, must be done after si_nir_late_opts()
2487 * which may generate vec const.
2488 */
2489 if (sel->screen->use_aco)
2490 NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
2491
2492 /* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
2493 * 200 is tuned for Viewperf. It should be done last.
2494 */
2495 NIR_PASS_V(nir, nir_group_loads, nir_group_same_resource_only, 200);
2496
2497 if (unlikely(original_name)) {
2498 ralloc_free((void*)nir->info.name);
2499 nir->info.name = original_name;
2500 }
2501
2502 return nir;
2503 }
2504
/* Refresh shader->info fields that depend on the final (post-lowering) NIR.
 * Called after NIR passes may have added/removed VMEM accesses or PS inputs.
 */
void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir)
{
   struct si_shader_info scan;

   /* Re-scan the possibly optimized NIR to get up-to-date usage info. */
   si_nir_scan_shader(shader->selector->screen, nir, &scan);

   shader->info.uses_vmem_load_other |= scan.uses_vmem_load_other;
   shader->info.uses_vmem_sampler_or_bvh |= scan.uses_vmem_sampler_or_bvh;

   if (nir->info.stage != MESA_SHADER_FRAGMENT)
      return;

   /* Uniform inlining can remove PS inputs, so record the latest counts. */
   shader->info.num_ps_inputs = scan.num_inputs;
   shader->info.ps_colors_read = scan.colors_read;

   /* A non-monolithic PS doesn't know whether back colors are enabled,
    * so copy 2 extra interpolants just in case.
    */
   const unsigned num_copied = MIN2(scan.num_inputs + 2, SI_NUM_INTERP);
   memcpy(shader->info.ps_inputs, scan.input, num_copied * sizeof(scan.input[0]));
}
2523
/* Generate code for the hardware VS shader stage to go with a geometry shader.
 *
 * The GS copy shader reads the GS ring output for stream 0 and performs the
 * position/parameter exports that a hardware VS would normally do. It is only
 * needed for legacy (non-NGG) GS. Returns NULL on allocation or compile
 * failure; on success the returned shader is already uploaded.
 */
static struct si_shader *
si_nir_generate_gs_copy_shader(struct si_screen *sscreen,
                               struct ac_llvm_compiler *compiler,
                               struct si_shader *gs_shader,
                               nir_shader *gs_nir,
                               struct util_debug_callback *debug,
                               ac_nir_gs_output_info *output_info)
{
   struct si_shader *shader;
   struct si_shader_selector *gs_selector = gs_shader->selector;
   struct si_shader_info *gsinfo = &gs_selector->info;
   union si_shader_key *gskey = &gs_shader->key;

   shader = CALLOC_STRUCT(si_shader);
   if (!shader)
      return NULL;

   /* We can leave the fence as permanently signaled because the GS copy
    * shader only becomes visible globally after it has been compiled. */
   util_queue_fence_init(&shader->ready);

   shader->selector = gs_selector;
   shader->is_gs_copy_shader = true;
   shader->wave_size = si_determine_wave_size(sscreen, shader);

   /* Default every param export to the constant (0,0,0,0); real offsets are
    * assigned below for outputs that reach stream 0. */
   STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
          sizeof(shader->info.vs_output_param_offset));

   for (unsigned i = 0; i < gsinfo->num_outputs; i++) {
      unsigned semantic = gsinfo->output_semantic[i];

      /* Skip if no channel writes to stream 0.
       * NOTE(review): this assumes output_streams[i] packs a 2-bit stream
       * index per component, so all four 2-bit fields being non-zero means
       * no component targets stream 0 — confirm against si_shader_info.
       */
      if (!nir_slot_is_varying(semantic) ||
          (gsinfo->output_streams[i] & 0x03 &&
           gsinfo->output_streams[i] & 0x0c &&
           gsinfo->output_streams[i] & 0x30 &&
           gsinfo->output_streams[i] & 0xc0))
         continue;

      shader->info.vs_output_param_offset[semantic] = shader->info.nr_param_exports++;
   }

   shader->info.nr_pos_exports = si_get_nr_pos_exports(gs_selector, gskey);

   /* Clip/cull distances that survive the key's kill mask. */
   unsigned clip_cull_mask =
      (gsinfo->clipdist_mask & ~gskey->ge.opt.kill_clip_distances) | gsinfo->culldist_mask;

   nir_shader *nir =
      ac_nir_create_gs_copy_shader(gs_nir,
                                   sscreen->info.gfx_level,
                                   clip_cull_mask,
                                   shader->info.vs_output_param_offset,
                                   shader->info.nr_param_exports,
                                   !si_shader_uses_streamout(gs_shader),
                                   gskey->ge.opt.kill_pointsize,
                                   gskey->ge.opt.kill_layer,
                                   sscreen->options.vrs2x2,
                                   output_info);

   struct si_shader_args args;
   si_init_shader_args(shader, &args);

   /* The copy shader runs on the hardware vertex stage. */
   NIR_PASS_V(nir, ac_nir_lower_intrinsics_to_args, sscreen->info.gfx_level, AC_HW_VERTEX_SHADER, &args.ac);
   NIR_PASS_V(nir, si_nir_lower_abi, shader, &args);

   si_nir_opts(gs_selector->screen, nir, false);

   /* ACO only accepts scalar constants. */
   if (sscreen->use_aco)
      NIR_PASS_V(nir, nir_lower_load_const_to_scalar);

   if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY, SI_DUMP_NIR)) {
      fprintf(stderr, "GS Copy Shader:\n");
      nir_print_shader(nir, stderr);
   }

   bool ok =
#if LLVM_AVAILABLE
      !sscreen->use_aco ? si_llvm_compile_shader(sscreen, compiler, shader, &args, debug, nir) :
#endif
      si_aco_compile_shader(shader, &args, nir, debug);

   if (ok) {
      /* The copy shader never spills, so pass 0 as the scratch offset. */
      assert(!shader->config.scratch_bytes_per_wave);
      ok = si_shader_binary_upload(sscreen, shader, 0);
      si_shader_dump(sscreen, shader, debug, stderr, true);
   }
   ralloc_free(nir);

   if (!ok) {
      FREE(shader);
      shader = NULL;
   } else {
      si_fix_resource_usage(sscreen, shader);
   }
   return shader;
}
2624
/* Backing storage for ac_nir_gs_output_info: per-slot stream and component
 * usage tables for legacy GS, filled by si_init_gs_output_info(). The `info`
 * member holds pointers into the arrays above it.
 */
struct si_gs_output_info {
   /* 2-bit stream index data per 32-bit varying slot. */
   uint8_t streams[64];
   uint8_t streams_16bit_lo[16];
   uint8_t streams_16bit_hi[16];

   /* Component write masks per slot. */
   uint8_t usage_mask[64];
   uint8_t usage_mask_16bit_lo[16];
   uint8_t usage_mask_16bit_hi[16];

   /* The view handed to ac_nir_* helpers; points at the arrays above. */
   ac_nir_gs_output_info info;
};
2636
2637 static void
si_init_gs_output_info(struct si_shader_info * info,struct si_gs_output_info * out_info)2638 si_init_gs_output_info(struct si_shader_info *info, struct si_gs_output_info *out_info)
2639 {
2640 for (int i = 0; i < info->num_outputs; i++) {
2641 unsigned slot = info->output_semantic[i];
2642 if (slot < VARYING_SLOT_VAR0_16BIT) {
2643 out_info->streams[slot] = info->output_streams[i];
2644 out_info->usage_mask[slot] = info->output_usagemask[i];
2645 } else {
2646 unsigned index = slot - VARYING_SLOT_VAR0_16BIT;
2647 /* TODO: 16bit need separated fields for lo/hi part. */
2648 out_info->streams_16bit_lo[index] = info->output_streams[i];
2649 out_info->streams_16bit_hi[index] = info->output_streams[i];
2650 out_info->usage_mask_16bit_lo[index] = info->output_usagemask[i];
2651 out_info->usage_mask_16bit_hi[index] = info->output_usagemask[i];
2652 }
2653 }
2654
2655 ac_nir_gs_output_info *ac_info = &out_info->info;
2656
2657 ac_info->streams = out_info->streams;
2658 ac_info->streams_16bit_lo = out_info->streams_16bit_lo;
2659 ac_info->streams_16bit_hi = out_info->streams_16bit_hi;
2660
2661 ac_info->usage_mask = out_info->usage_mask;
2662 ac_info->usage_mask_16bit_lo = out_info->usage_mask_16bit_lo;
2663 ac_info->usage_mask_16bit_hi = out_info->usage_mask_16bit_hi;
2664
2665 /* TODO: construct 16bit slot per component store type. */
2666 ac_info->types_16bit_lo = ac_info->types_16bit_hi = NULL;
2667 }
2668
si_fixup_spi_ps_input_config(struct si_shader * shader)2669 static void si_fixup_spi_ps_input_config(struct si_shader *shader)
2670 {
2671 const union si_shader_key *key = &shader->key;
2672
2673 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
2674 if (key->ps.part.prolog.poly_stipple)
2675 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
2676
2677 /* Set up the enable bits for per-sample shading if needed. */
2678 if (key->ps.part.prolog.force_persp_sample_interp &&
2679 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2680 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2681 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
2682 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2683 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
2684 }
2685 if (key->ps.part.prolog.force_linear_sample_interp &&
2686 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2687 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2688 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
2689 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2690 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
2691 }
2692 if (key->ps.part.prolog.force_persp_center_interp &&
2693 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2694 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2695 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
2696 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2697 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2698 }
2699 if (key->ps.part.prolog.force_linear_center_interp &&
2700 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2701 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2702 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
2703 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2704 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2705 }
2706
2707 /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
2708 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
2709 !(shader->config.spi_ps_input_ena & 0xf)) {
2710 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2711 }
2712
2713 /* At least one pair of interpolation weights must be enabled. */
2714 if (!(shader->config.spi_ps_input_ena & 0x7f))
2715 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2716
2717 /* Samplemask fixup requires the sample ID. */
2718 if (key->ps.part.prolog.samplemask_log_ps_iter)
2719 shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
2720 }
2721
/* Compute the initial spi_ps_input_ena/addr from the shader info and key
 * before compilation. Monolithic shaders also get the fixup applied here;
 * for part-mode shaders, the fixup happens later when the parts are combined.
 */
static void
si_set_spi_ps_input_config(struct si_shader *shader)
{
   const struct si_shader_selector *sel = shader->selector;
   const struct si_shader_info *info = &sel->info;
   const union si_shader_key *key = &shader->key;

   /* TODO: This should be determined from the final NIR instead of the input NIR,
    * otherwise LLVM will have a performance advantage here because it determines
    * VGPR inputs for each shader variant after LLVM optimizations.
    */
   shader->config.spi_ps_input_ena =
      S_0286CC_PERSP_CENTER_ENA(info->uses_persp_center) |
      S_0286CC_PERSP_CENTROID_ENA(info->uses_persp_centroid) |
      S_0286CC_PERSP_SAMPLE_ENA(info->uses_persp_sample) |
      S_0286CC_LINEAR_CENTER_ENA(info->uses_linear_center) |
      S_0286CC_LINEAR_CENTROID_ENA(info->uses_linear_centroid) |
      S_0286CC_LINEAR_SAMPLE_ENA(info->uses_linear_sample) |
      S_0286CC_FRONT_FACE_ENA(info->uses_frontface && !key->ps.opt.force_front_face_input) |
      S_0286CC_SAMPLE_COVERAGE_ENA(info->reads_samplemask) |
      S_0286CC_ANCILLARY_ENA(info->uses_sampleid || info->uses_layer_id);

   /* Enable one POS_*_FLOAT input per fragcoord/sample-pos component read. */
   uint8_t mask = info->reads_frag_coord_mask | info->reads_sample_pos_mask;
   u_foreach_bit(i, mask) {
      shader->config.spi_ps_input_ena |= S_0286CC_POS_X_FLOAT_ENA(1) << i;
   }

   /* Two-side color selection needs the front-face bit. */
   if (key->ps.part.prolog.color_two_side)
      shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);

   /* INTERP_MODE_COLOR, same as SMOOTH if flat shading is disabled. */
   if (info->uses_interp_color && !key->ps.part.prolog.flatshade_colors) {
      shader->config.spi_ps_input_ena |=
         S_0286CC_PERSP_SAMPLE_ENA(info->uses_persp_sample_color) |
         S_0286CC_PERSP_CENTER_ENA(info->uses_persp_center_color) |
         S_0286CC_PERSP_CENTROID_ENA(info->uses_persp_centroid_color);
   }

   /* nir_lower_poly_line_smooth uses nir_load_sample_mask_in. */
   if (key->ps.mono.poly_line_smoothing)
      shader->config.spi_ps_input_ena |= S_0286CC_SAMPLE_COVERAGE_ENA(1);

   /* nir_lower_point_smooth uses nir_load_point_coord_maybe_flipped, which is
    * lowered to nir_load_barycentric_pixel and nir_load_interpolated_input.
    */
   if (key->ps.mono.point_smoothing)
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);

   /* See fetch_framebuffer() for the args used when fbfetch output is read. */
   if (info->base.fs.uses_fbfetch_output) {
      shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);

      if (key->ps.mono.fbfetch_layered || key->ps.mono.fbfetch_msaa)
         shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
   }

   if (shader->is_monolithic) {
      si_fixup_spi_ps_input_config(shader);
      shader->config.spi_ps_input_addr = shader->config.spi_ps_input_ena;
   } else {
      /* Part mode will call si_fixup_spi_ps_input_config() when combining multi
       * shader part in si_shader_select_ps_parts().
       *
       * Reserve register locations for VGPR inputs the PS prolog may need.
       */
      shader->config.spi_ps_input_addr =
         shader->config.spi_ps_input_ena |
         SI_SPI_PS_INPUT_ADDR_FOR_PROLOG;
   }
}
2792
2793 static void
debug_message_stderr(void * data,unsigned * id,enum util_debug_type ptype,const char * fmt,va_list args)2794 debug_message_stderr(void *data, unsigned *id, enum util_debug_type ptype,
2795 const char *fmt, va_list args)
2796 {
2797 vfprintf(stderr, fmt, args);
2798 fprintf(stderr, "\n");
2799 }
2800
/* Compile one shader variant: lower its NIR, run the backend (LLVM or ACO),
 * and fill shader->config/info. For legacy GS, also compiles the GS copy
 * shader. Returns false on compile failure.
 */
bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                       struct si_shader *shader, struct util_debug_callback *debug)
{
   bool ret = true;
   struct si_shader_selector *sel = shader->selector;

   /* ACO needs spi_ps_input in advance to init args; it's used in the compiler. */
   if (sel->stage == MESA_SHADER_FRAGMENT && sscreen->use_aco)
      si_set_spi_ps_input_config(shader);

   /* We need this info only when legacy (non-NGG) GS. */
   struct si_gs_output_info legacy_gs_output_info;
   if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
      memset(&legacy_gs_output_info, 0, sizeof(legacy_gs_output_info));
      si_init_gs_output_info(&sel->info, &legacy_gs_output_info);
   }

   struct si_shader_args args;
   si_init_shader_args(shader, &args);

   /* NOTE(review): &legacy_gs_output_info.info is passed unconditionally even
    * though the struct is only initialized for legacy GS; presumably it is
    * ignored for other stages — confirm in si_get_nir_shader. */
   bool free_nir;
   struct nir_shader *nir =
      si_get_nir_shader(shader, &args, &free_nir, 0, &legacy_gs_output_info.info);

   /* Dump NIR before doing NIR->LLVM conversion in case the
    * conversion fails. */
   if (si_can_dump_shader(sscreen, sel->stage, SI_DUMP_NIR)) {
      nir_print_shader(nir, stderr);

      if (nir->xfb_info)
         nir_print_xfb_info(nir->xfb_info, stderr);
   }

   /* Initialize vs_output_ps_input_cntl to default. */
   for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++)
      shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED;
   shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0;

   si_update_shader_binary_info(shader, nir);

   /* uses_instanceid may be set by si_nir_lower_vs_inputs(). */
   shader->info.uses_instanceid |= sel->info.uses_instanceid;
   shader->info.private_mem_vgprs = DIV_ROUND_UP(nir->scratch_size, 4);

   /* Set the FP ALU behavior. */
   /* By default, we disable denormals for FP32 and enable them for FP16 and FP64
    * for performance and correctness reasons. FP32 denormals can't be enabled because
    * they break output modifiers and v_mad_f32 and are very slow on GFX6-7.
    *
    * float_controls_execution_mode defines the set of valid behaviors. Contradicting flags
    * can be set simultaneously, which means we are allowed to choose, but not really because
    * some options cause GLCTS failures.
    */
   unsigned float_mode = V_00B028_FP_16_64_DENORMS;

   /* FP32 round-to-zero only when RTE isn't also requested. */
   if (!(nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) &&
       nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
      float_mode |= V_00B028_FP_32_ROUND_TOWARDS_ZERO;

   /* FP16/FP64 share one rounding mode; RTZ only when no RTE flag is set. */
   if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
                                                    FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64)) &&
       nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
                                                  FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
      float_mode |= V_00B028_FP_16_64_ROUND_TOWARDS_ZERO;

   /* Flush FP16/FP64 denormals only when preservation isn't requested. */
   if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_PRESERVE_FP16 |
                                                    FLOAT_CONTROLS_DENORM_PRESERVE_FP64)) &&
       nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
                                                  FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64))
      float_mode &= ~V_00B028_FP_16_64_DENORMS;

   ret =
#if LLVM_AVAILABLE
      !sscreen->use_aco ? si_llvm_compile_shader(sscreen, compiler, shader, &args, debug, nir) :
#endif
      si_aco_compile_shader(shader, &args, nir, debug);

   if (!ret)
      goto out;

   /* Applied after compilation because the backend fills shader->config. */
   shader->config.float_mode = float_mode;

   /* The GS copy shader is compiled next. */
   if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
      shader->gs_copy_shader =
         si_nir_generate_gs_copy_shader(sscreen, compiler, shader, nir, debug,
                                        &legacy_gs_output_info.info);
      if (!shader->gs_copy_shader) {
         fprintf(stderr, "radeonsi: can't create GS copy shader\n");
         ret = false;
         goto out;
      }
   }

   /* Compute vs_output_ps_input_cntl. */
   if ((sel->stage == MESA_SHADER_VERTEX ||
        sel->stage == MESA_SHADER_TESS_EVAL ||
        sel->stage == MESA_SHADER_GEOMETRY) &&
       !shader->key.ge.as_ls && !shader->key.ge.as_es) {
      uint8_t *vs_output_param_offset = shader->info.vs_output_param_offset;

      /* For legacy GS the parameter exports come from the GS copy shader. */
      if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg)
         vs_output_param_offset = shader->gs_copy_shader->info.vs_output_param_offset;

      /* We must use the original shader info before the removal of duplicated shader outputs. */
      /* VS and TES should also set primitive ID output if it's used. */
      unsigned num_outputs_with_prim_id = sel->info.num_outputs +
                                          shader->key.ge.mono.u.vs_export_prim_id;

      for (unsigned i = 0; i < num_outputs_with_prim_id; i++) {
         unsigned semantic = sel->info.output_semantic[i];
         unsigned offset = vs_output_param_offset[semantic];
         unsigned ps_input_cntl;

         if (offset <= AC_EXP_PARAM_OFFSET_31) {
            /* The input is loaded from parameter memory. */
            ps_input_cntl = S_028644_OFFSET(offset);
         } else {
            /* The input is a DEFAULT_VAL constant. */
            assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
                   offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
            offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;

            /* OFFSET=0x20 means that DEFAULT_VAL is used. */
            ps_input_cntl = S_028644_OFFSET(0x20) |
                            S_028644_DEFAULT_VAL(offset);
         }

         shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl;
      }
   }

   /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */
   if (sel->stage == MESA_SHADER_COMPUTE) {
      unsigned max_vgprs =
         sscreen->info.num_physical_wave64_vgprs_per_simd * (shader->wave_size == 32 ? 2 : 1);
      unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
      unsigned max_sgprs_per_wave = 128;
      unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
      unsigned threads_per_tg = si_get_max_workgroup_size(shader);
      unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, shader->wave_size);
      unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);

      max_vgprs = max_vgprs / waves_per_simd;
      max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);

      if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
         fprintf(stderr,
                 "LLVM failed to compile a shader correctly: "
                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
                 shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);

         /* Just terminate the process, because dependent
          * shaders can hang due to bad input data, but use
          * the env var to allow shader-db to work.
          */
         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
            abort();
      }
   }

   /* Add/remove the scratch offset to/from input SGPRs. */
   if (sel->screen->info.gfx_level < GFX11 &&
       (sel->screen->info.family < CHIP_GFX940 || sel->screen->info.has_graphics) &&
       !si_is_merged_shader(shader)) {
      if (sscreen->use_aco) {
         /* When aco scratch_offset arg is added explicitly at the beginning.
          * After compile if no scratch used, reduce the input sgpr count.
          */
         if (!shader->config.scratch_bytes_per_wave)
            shader->info.num_input_sgprs--;
      } else {
         /* scratch_offset arg is added by llvm implicitly */
         if (shader->info.num_input_sgprs)
            shader->info.num_input_sgprs++;
      }
   }

   /* Calculate the number of fragment input VGPRs. */
   if (sel->stage == MESA_SHADER_FRAGMENT) {
      shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(
         &shader->config, &shader->info.num_fragcoord_components);
   }

   si_calculate_max_simd_waves(shader);

   /* With SI_DUMP_STATS, print the stats to stderr instead of the debug callback. */
   if (si_can_dump_shader(sscreen, sel->stage, SI_DUMP_STATS)) {
      struct util_debug_callback out_stderr = {
         .debug_message = debug_message_stderr,
      };

      si_shader_dump_stats_for_shader_db(sscreen, shader, &out_stderr);
   } else {
      si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
   }

out:
   if (free_nir)
      ralloc_free(nir);

   return ret;
}
3003
3004 /**
3005 * Create, compile and return a shader part (prolog or epilog).
3006 *
3007 * \param sscreen screen
3008 * \param list list of shader parts of the same category
 * \param stage shader stage
 * \param key shader part key
 * \param prolog whether the part being requested is a prolog
 * \param compiler LLVM compiler
3013 * \param debug debug callback
3014 * \return non-NULL on success
3015 */
3016 static struct si_shader_part *
si_get_shader_part(struct si_screen * sscreen,struct si_shader_part ** list,gl_shader_stage stage,bool prolog,union si_shader_part_key * key,struct ac_llvm_compiler * compiler,struct util_debug_callback * debug,const char * name)3017 si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
3018 gl_shader_stage stage, bool prolog, union si_shader_part_key *key,
3019 struct ac_llvm_compiler *compiler, struct util_debug_callback *debug,
3020 const char *name)
3021 {
3022 struct si_shader_part *result;
3023
3024 simple_mtx_lock(&sscreen->shader_parts_mutex);
3025
3026 /* Find existing. */
3027 for (result = *list; result; result = result->next) {
3028 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
3029 simple_mtx_unlock(&sscreen->shader_parts_mutex);
3030 return result;
3031 }
3032 }
3033
3034 /* Compile a new one. */
3035 result = CALLOC_STRUCT(si_shader_part);
3036 result->key = *key;
3037
3038 bool ok =
3039 #if LLVM_AVAILABLE
3040 !sscreen->use_aco ? si_llvm_build_shader_part(sscreen, stage, prolog, compiler, debug, name, result) :
3041 #endif
3042 si_aco_build_shader_part(sscreen, stage, prolog, debug, name, result);
3043
3044 if (ok) {
3045 result->next = *list;
3046 *list = result;
3047 } else {
3048 FREE(result);
3049 result = NULL;
3050 }
3051
3052 simple_mtx_unlock(&sscreen->shader_parts_mutex);
3053 return result;
3054 }
3055
/* Fill the TCS epilog part key from the shader's key and wave size. */
void si_get_tcs_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
{
   /* memset (not a partial initializer) so padding is zeroed too: the key is
    * compared bytewise with memcmp() in si_get_shader_part(). */
   memset(key, 0, sizeof(*key));
   key->tcs_epilog.wave32 = shader->wave_size == 32;
   key->tcs_epilog.states = shader->key.ge.part.tcs.epilog;

   /* If output patches are wholly in one wave, we don't need a barrier. */
   key->tcs_epilog.noop_s_barrier =
      shader->wave_size % shader->selector->info.base.tess.tcs_vertices_out == 0;
}
3066
3067 /**
3068 * Select and compile (or reuse) TCS parts (epilog).
3069 */
si_shader_select_tcs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3070 static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3071 struct si_shader *shader, struct util_debug_callback *debug)
3072 {
3073 if (sscreen->info.gfx_level >= GFX9)
3074 shader->previous_stage = shader->key.ge.part.tcs.ls->main_shader_part_ls;
3075
3076 /* Get the epilog. */
3077 union si_shader_part_key epilog_key;
3078 si_get_tcs_epilog_key(shader, &epilog_key);
3079
3080 shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, MESA_SHADER_TESS_CTRL, false,
3081 &epilog_key, compiler, debug,
3082 "Tessellation Control Shader Epilog");
3083 return shader->epilog != NULL;
3084 }
3085
3086 /**
3087 * Select and compile (or reuse) GS parts (prolog).
3088 */
si_shader_select_gs_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3089 static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3090 struct si_shader *shader, struct util_debug_callback *debug)
3091 {
3092 if (sscreen->info.gfx_level >= GFX9) {
3093 if (shader->key.ge.as_ngg)
3094 shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_ngg_es;
3095 else
3096 shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_es;
3097 }
3098
3099 return true;
3100 }
3101
3102 /**
3103 * Compute the PS prolog key, which contains all the information needed to
3104 * build the PS prolog function, and set related bits in shader->config.
3105 */
/* Fill the PS prolog part key from the shader variant state, and set the
 * SPI_PS_INPUT_ENA bits the prolog requires in shader->config.
 *
 * The prolog handles polygon stippling, two-sided colors, forced interpolation
 * locations, and barycentric optimizations, so the key captures all state that
 * affects those. Note this also mutates shader->info/shader->config, not just
 * *key.
 */
void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key)
{
   struct si_shader_info *info = &shader->selector->info;

   memset(key, 0, sizeof(*key));
   key->ps_prolog.states = shader->key.ps.part.prolog;
   key->ps_prolog.wave32 = shader->wave_size == 32;
   key->ps_prolog.colors_read = shader->info.ps_colors_read;
   key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
   /* WQM (whole-quad mode) is only needed when helper invocations are
    * required AND the prolog actually interpolates something (colors or a
    * forced/bc-optimized barycentric set).
    */
   key->ps_prolog.wqm =
      info->base.fs.needs_quad_helper_invocations &&
      (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
       key->ps_prolog.states.force_linear_sample_interp ||
       key->ps_prolog.states.force_persp_center_interp ||
       key->ps_prolog.states.force_linear_center_interp ||
       key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear);
   key->ps_prolog.num_fragcoord_components = shader->info.num_fragcoord_components;

   /* The poly-stipple implementation in the prolog loads the stipple pattern
    * from memory, so flag the VMEM load.
    */
   if (shader->key.ps.part.prolog.poly_stipple)
      shader->info.uses_vmem_load_other = true;

   if (shader->info.ps_colors_read) {
      uint8_t *color = shader->selector->info.color_attr_index;

      if (shader->key.ps.part.prolog.color_two_side) {
         /* BCOLORs are stored after the last input. */
         key->ps_prolog.num_interp_inputs = shader->info.num_ps_inputs;
         shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
      }

      /* Per color input (COLOR0 in bits 0-3, COLOR1 in bits 4-7 of
       * ps_colors_read): pick the barycentric VGPR pair the prolog should
       * interpolate with, and enable the matching SPI input.
       */
      for (unsigned i = 0; i < 2; i++) {
         unsigned interp = info->color_interpolate[i];
         unsigned location = info->color_interpolate_loc[i];

         if (!(shader->info.ps_colors_read & (0xf << i * 4)))
            continue;

         key->ps_prolog.color_attr_index[i] = color[i];

         if (shader->key.ps.part.prolog.flatshade_colors && interp == INTERP_MODE_COLOR)
            interp = INTERP_MODE_FLAT;

         switch (interp) {
         case INTERP_MODE_FLAT:
            /* -1 = flat-shaded: no barycentric VGPRs needed. */
            key->ps_prolog.color_interp_vgpr_index[i] = -1;
            break;
         case INTERP_MODE_SMOOTH:
         case INTERP_MODE_COLOR:
            /* Force the interpolation location for colors here. */
            if (shader->key.ps.part.prolog.force_persp_sample_interp)
               location = TGSI_INTERPOLATE_LOC_SAMPLE;
            if (shader->key.ps.part.prolog.force_persp_center_interp)
               location = TGSI_INTERPOLATE_LOC_CENTER;

            /* Perspective barycentrics occupy VGPR pairs 0/2/4
             * (sample/center/centroid). */
            switch (location) {
            case TGSI_INTERPOLATE_LOC_SAMPLE:
               key->ps_prolog.color_interp_vgpr_index[i] = 0;
               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTER:
               key->ps_prolog.color_interp_vgpr_index[i] = 2;
               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTROID:
               key->ps_prolog.color_interp_vgpr_index[i] = 4;
               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
               break;
            default:
               assert(0);
            }
            break;
         case INTERP_MODE_NOPERSPECTIVE:
            /* Force the interpolation location for colors here. */
            if (shader->key.ps.part.prolog.force_linear_sample_interp)
               location = TGSI_INTERPOLATE_LOC_SAMPLE;
            if (shader->key.ps.part.prolog.force_linear_center_interp)
               location = TGSI_INTERPOLATE_LOC_CENTER;

            /* The VGPR assignment for non-monolithic shaders
             * works because InitialPSInputAddr is set on the
             * main shader and PERSP_PULL_MODEL is never used.
             */
            /* Linear barycentrics occupy VGPR pairs 6/8/10
             * (sample/center/centroid). */
            switch (location) {
            case TGSI_INTERPOLATE_LOC_SAMPLE:
               key->ps_prolog.color_interp_vgpr_index[i] = 6;
               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTER:
               key->ps_prolog.color_interp_vgpr_index[i] = 8;
               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTROID:
               key->ps_prolog.color_interp_vgpr_index[i] = 10;
               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
               break;
            default:
               assert(0);
            }
            break;
         default:
            assert(0);
         }
      }
   }
}
3211
3212 /**
3213 * Check whether a PS prolog is required based on the key.
3214 */
si_need_ps_prolog(const union si_shader_part_key * key)3215 bool si_need_ps_prolog(const union si_shader_part_key *key)
3216 {
3217 return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
3218 key->ps_prolog.states.force_linear_sample_interp ||
3219 key->ps_prolog.states.force_persp_center_interp ||
3220 key->ps_prolog.states.force_linear_center_interp ||
3221 key->ps_prolog.states.bc_optimize_for_persp ||
3222 key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
3223 key->ps_prolog.states.samplemask_log_ps_iter;
3224 }
3225
3226 /**
3227 * Compute the PS epilog key, which contains all the information needed to
3228 * build the PS epilog function.
3229 */
si_get_ps_epilog_key(struct si_shader * shader,union si_shader_part_key * key)3230 void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
3231 {
3232 struct si_shader_info *info = &shader->selector->info;
3233 memset(key, 0, sizeof(*key));
3234 key->ps_epilog.wave32 = shader->wave_size == 32;
3235 key->ps_epilog.uses_discard = si_shader_uses_discard(shader);
3236 key->ps_epilog.colors_written = info->colors_written;
3237 key->ps_epilog.color_types = info->output_color_types;
3238 key->ps_epilog.writes_z = info->writes_z;
3239 key->ps_epilog.writes_stencil = info->writes_stencil;
3240 key->ps_epilog.writes_samplemask = info->writes_samplemask &&
3241 !shader->key.ps.part.epilog.kill_samplemask;
3242 key->ps_epilog.states = shader->key.ps.part.epilog;
3243 }
3244
3245 /**
3246 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
3247 */
si_shader_select_ps_parts(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3248 static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3249 struct si_shader *shader, struct util_debug_callback *debug)
3250 {
3251 union si_shader_part_key prolog_key;
3252 union si_shader_part_key epilog_key;
3253
3254 /* Get the prolog. */
3255 si_get_ps_prolog_key(shader, &prolog_key);
3256
3257 /* The prolog is a no-op if these aren't set. */
3258 if (si_need_ps_prolog(&prolog_key)) {
3259 shader->prolog =
3260 si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key,
3261 compiler, debug, "Fragment Shader Prolog");
3262 if (!shader->prolog)
3263 return false;
3264 }
3265
3266 /* Get the epilog. */
3267 si_get_ps_epilog_key(shader, &epilog_key);
3268
3269 shader->epilog =
3270 si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key,
3271 compiler, debug, "Fragment Shader Epilog");
3272 if (!shader->epilog)
3273 return false;
3274
3275 si_fixup_spi_ps_input_config(shader);
3276
3277 /* Make sure spi_ps_input_addr bits is superset of spi_ps_input_ena. */
3278 unsigned spi_ps_input_ena = shader->config.spi_ps_input_ena;
3279 unsigned spi_ps_input_addr = shader->config.spi_ps_input_addr;
3280 assert((spi_ps_input_ena & spi_ps_input_addr) == spi_ps_input_ena);
3281
3282 return true;
3283 }
3284
/* Workaround for an SPI barrier management hardware bug on Bonaire/Kabini.
 *
 * NOTE(review): the unconditional `return` below intentionally disables the
 * workaround, leaving the Bonaire/Kabini LDS-size bump as dead code — the
 * comment explains why (tessellation is all offchip and on-chip GS isn't
 * used, so the bug can't be hit). The dead code is kept as documentation of
 * what would be needed if that changes.
 */
void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
{
   /* If tessellation is all offchip and on-chip GS isn't used, this
    * workaround is not needed.
    */
   return;

   /* SPI barrier management bug:
    * Make sure we have at least 4k of LDS in use to avoid the bug.
    * It applies to workgroup sizes of more than one wavefront.
    */
   if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
      *lds_size = MAX2(*lds_size, 8);
}
3299
si_fix_resource_usage(struct si_screen * sscreen,struct si_shader * shader)3300 static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
3301 {
3302 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
3303
3304 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
3305
3306 if (shader->selector->stage == MESA_SHADER_COMPUTE &&
3307 si_get_max_workgroup_size(shader) > shader->wave_size) {
3308 si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
3309 }
3310 }
3311
si_create_shader_variant(struct si_screen * sscreen,struct ac_llvm_compiler * compiler,struct si_shader * shader,struct util_debug_callback * debug)3312 bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3313 struct si_shader *shader, struct util_debug_callback *debug)
3314 {
3315 struct si_shader_selector *sel = shader->selector;
3316 struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
3317
3318 if (sel->stage == MESA_SHADER_FRAGMENT) {
3319 shader->ps.writes_samplemask = sel->info.writes_samplemask &&
3320 !shader->key.ps.part.epilog.kill_samplemask;
3321 }
3322
3323 /* LS, ES, VS are compiled on demand if the main part hasn't been
3324 * compiled for that stage.
3325 *
3326 * GS are compiled on demand if the main part hasn't been compiled
3327 * for the chosen NGG-ness.
3328 *
3329 * Vertex shaders are compiled on demand when a vertex fetch
3330 * workaround must be applied.
3331 */
3332 if (shader->is_monolithic) {
3333 /* Monolithic shader (compiled as a whole, has many variants,
3334 * may take a long time to compile).
3335 */
3336 if (!si_compile_shader(sscreen, compiler, shader, debug))
3337 return false;
3338 } else {
3339 /* The shader consists of several parts:
3340 *
3341 * - the middle part is the user shader, it has 1 variant only
3342 * and it was compiled during the creation of the shader
3343 * selector
3344 * - the prolog part is inserted at the beginning
3345 * - the epilog part is inserted at the end
3346 *
3347 * The prolog and epilog have many (but simple) variants.
3348 *
3349 * Starting with gfx9, geometry and tessellation control
3350 * shaders also contain the prolog and user shader parts of
3351 * the previous shader stage.
3352 */
3353
3354 if (!mainp)
3355 return false;
3356
3357 /* Copy the compiled shader data over. */
3358 shader->is_binary_shared = true;
3359 shader->binary = mainp->binary;
3360 shader->config = mainp->config;
3361 shader->info = mainp->info;
3362
3363 /* Select prologs and/or epilogs. */
3364 switch (sel->stage) {
3365 case MESA_SHADER_TESS_CTRL:
3366 if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
3367 return false;
3368 break;
3369 case MESA_SHADER_GEOMETRY:
3370 if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
3371 return false;
3372
3373 /* Clone the GS copy shader for the shader variant.
3374 * We can't just copy the pointer because we change the pm4 state and
3375 * si_shader_selector::gs_copy_shader must be immutable because it's shared
3376 * by multiple contexts.
3377 */
3378 if (!shader->key.ge.as_ngg) {
3379 assert(sel->main_shader_part == mainp);
3380 assert(sel->main_shader_part->gs_copy_shader);
3381 assert(sel->main_shader_part->gs_copy_shader->bo);
3382 assert(!sel->main_shader_part->gs_copy_shader->previous_stage_sel);
3383 assert(!sel->main_shader_part->gs_copy_shader->scratch_bo);
3384
3385 shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
3386 memcpy(shader->gs_copy_shader, sel->main_shader_part->gs_copy_shader,
3387 sizeof(*shader->gs_copy_shader));
3388 /* Increase the reference count. */
3389 pipe_reference(NULL, &shader->gs_copy_shader->bo->b.b.reference);
3390 /* Initialize some fields differently. */
3391 shader->gs_copy_shader->shader_log = NULL;
3392 shader->gs_copy_shader->is_binary_shared = true;
3393 util_queue_fence_init(&shader->gs_copy_shader->ready);
3394 }
3395 break;
3396 case MESA_SHADER_FRAGMENT:
3397 if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
3398 return false;
3399
3400 /* Make sure we have at least as many VGPRs as there
3401 * are allocated inputs.
3402 */
3403 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
3404 break;
3405 default:;
3406 }
3407
3408 assert(shader->wave_size == mainp->wave_size);
3409 assert(!shader->previous_stage || shader->wave_size == shader->previous_stage->wave_size);
3410
3411 /* Update SGPR and VGPR counts. */
3412 if (shader->prolog) {
3413 shader->config.num_sgprs =
3414 MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs);
3415 shader->config.num_vgprs =
3416 MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs);
3417 }
3418 if (shader->previous_stage) {
3419 shader->config.num_sgprs =
3420 MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
3421 shader->config.num_vgprs =
3422 MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
3423 shader->config.spilled_sgprs =
3424 MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
3425 shader->config.spilled_vgprs =
3426 MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
3427 shader->info.private_mem_vgprs =
3428 MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
3429 shader->config.scratch_bytes_per_wave =
3430 MAX2(shader->config.scratch_bytes_per_wave,
3431 shader->previous_stage->config.scratch_bytes_per_wave);
3432 shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;
3433 shader->info.uses_vmem_load_other |= shader->previous_stage->info.uses_vmem_load_other;
3434 shader->info.uses_vmem_sampler_or_bvh |= shader->previous_stage->info.uses_vmem_sampler_or_bvh;
3435 }
3436 if (shader->epilog) {
3437 shader->config.num_sgprs =
3438 MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs);
3439 shader->config.num_vgprs =
3440 MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs);
3441 }
3442 si_calculate_max_simd_waves(shader);
3443 }
3444
3445 if (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
3446 assert(!shader->key.ge.as_es && !shader->key.ge.as_ls);
3447 if (!gfx10_ngg_calculate_subgroup_info(shader)) {
3448 fprintf(stderr, "Failed to compute subgroup info\n");
3449 return false;
3450 }
3451 } else if (sscreen->info.gfx_level >= GFX9 && sel->stage == MESA_SHADER_GEOMETRY) {
3452 gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
3453 }
3454
3455 shader->uses_vs_state_provoking_vertex =
3456 sscreen->use_ngg &&
3457 /* Used to convert triangle strips from GS to triangles. */
3458 ((sel->stage == MESA_SHADER_GEOMETRY &&
3459 util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) ||
3460 (sel->stage == MESA_SHADER_VERTEX &&
3461 /* Used to export PrimitiveID from the correct vertex. */
3462 shader->key.ge.mono.u.vs_export_prim_id));
3463
3464 shader->uses_gs_state_outprim = sscreen->use_ngg &&
3465 /* Only used by streamout and the PrimID export in vertex
3466 * shaders. */
3467 sel->stage == MESA_SHADER_VERTEX &&
3468 (si_shader_uses_streamout(shader) ||
3469 shader->uses_vs_state_provoking_vertex);
3470
3471 if (sel->stage == MESA_SHADER_VERTEX) {
3472 shader->uses_base_instance = sel->info.uses_base_instance ||
3473 shader->key.ge.mono.instance_divisor_is_one ||
3474 shader->key.ge.mono.instance_divisor_is_fetched;
3475 } else if (sel->stage == MESA_SHADER_TESS_CTRL) {
3476 shader->uses_base_instance = shader->previous_stage_sel &&
3477 (shader->previous_stage_sel->info.uses_base_instance ||
3478 shader->key.ge.mono.instance_divisor_is_one ||
3479 shader->key.ge.mono.instance_divisor_is_fetched);
3480 } else if (sel->stage == MESA_SHADER_GEOMETRY) {
3481 shader->uses_base_instance = shader->previous_stage_sel &&
3482 (shader->previous_stage_sel->info.uses_base_instance ||
3483 shader->key.ge.mono.instance_divisor_is_one ||
3484 shader->key.ge.mono.instance_divisor_is_fetched);
3485 }
3486
3487 si_fix_resource_usage(sscreen, shader);
3488
3489 /* Upload. */
3490 bool ok = si_shader_binary_upload(sscreen, shader, 0);
3491 si_shader_dump(sscreen, shader, debug, stderr, true);
3492
3493 if (!ok)
3494 fprintf(stderr, "LLVM failed to upload shader\n");
3495 return ok;
3496 }
3497
si_shader_binary_clean(struct si_shader_binary * binary)3498 void si_shader_binary_clean(struct si_shader_binary *binary)
3499 {
3500 free((void *)binary->code_buffer);
3501 binary->code_buffer = NULL;
3502
3503 free(binary->llvm_ir_string);
3504 binary->llvm_ir_string = NULL;
3505
3506 free((void *)binary->symbols);
3507 binary->symbols = NULL;
3508
3509 free(binary->uploaded_code);
3510 binary->uploaded_code = NULL;
3511 binary->uploaded_code_size = 0;
3512 }
3513
si_shader_destroy(struct si_shader * shader)3514 void si_shader_destroy(struct si_shader *shader)
3515 {
3516 if (shader->scratch_bo)
3517 si_resource_reference(&shader->scratch_bo, NULL);
3518
3519 si_resource_reference(&shader->bo, NULL);
3520
3521 if (!shader->is_binary_shared)
3522 si_shader_binary_clean(&shader->binary);
3523
3524 free(shader->shader_log);
3525 }
3526
/* Build the NIR for the previous API stage of a merged shader (the LS part
 * of a TCS, or the ES part of a GS), filling *prev_shader with a synthetic
 * shader key derived from the merged shader's key.
 *
 * Also propagates binary info and uses_instanceid from the previous stage
 * into `shader`. *free_nir tells the caller whether it owns the returned NIR.
 */
nir_shader *si_get_prev_stage_nir_shader(struct si_shader *shader,
                                         struct si_shader *prev_shader,
                                         struct si_shader_args *args,
                                         bool *free_nir)
{
   const struct si_shader_selector *sel = shader->selector;
   const union si_shader_key *key = &shader->key;

   if (sel->stage == MESA_SHADER_TESS_CTRL) {
      /* Previous stage is the vertex shader running as LS. */
      struct si_shader_selector *ls = key->ge.part.tcs.ls;

      prev_shader->selector = ls;
      prev_shader->key.ge.as_ls = 1;
   } else {
      /* Previous stage is the VS/TES running as ES; inherit NGG-ness. */
      struct si_shader_selector *es = key->ge.part.gs.es;

      prev_shader->selector = es;
      prev_shader->key.ge.as_es = 1;
      prev_shader->key.ge.as_ngg = key->ge.as_ngg;
   }

   prev_shader->key.ge.mono = key->ge.mono;
   prev_shader->key.ge.opt = key->ge.opt;
   prev_shader->key.ge.opt.inline_uniforms = false; /* only TCS/GS can inline uniforms */
   /* kill_outputs was computed based on second shader's outputs so we can't use it to
    * kill first shader's outputs.
    */
   prev_shader->key.ge.opt.kill_outputs = 0;
   prev_shader->is_monolithic = true;

   si_init_shader_args(prev_shader, args);

   nir_shader *nir = si_get_nir_shader(prev_shader, args, free_nir,
                                       sel->info.tcs_vgpr_only_inputs, NULL);

   si_update_shader_binary_info(shader, nir);

   /* The merged shader uses InstanceID if either half does. */
   shader->info.uses_instanceid |=
      prev_shader->selector->info.uses_instanceid || prev_shader->info.uses_instanceid;

   return nir;
}
3569
si_get_tcs_out_patch_stride(const struct si_shader_info * info)3570 unsigned si_get_tcs_out_patch_stride(const struct si_shader_info *info)
3571 {
3572 unsigned tcs_out_vertices = info->base.tess.tcs_vertices_out;
3573 unsigned vertex_stride = util_last_bit64(info->outputs_written_before_tes_gs) * 4;
3574 unsigned num_patch_outputs = util_last_bit64(info->patch_outputs_written);
3575
3576 return tcs_out_vertices * vertex_stride + num_patch_outputs * 4;
3577 }
3578
/* Declare the input arguments of the standalone TCS epilog part.
 *
 * The SGPR declarations must exactly mirror the register layout of the TCS
 * main part (merged LSHS layout on gfx9+, the legacy layout before), since
 * the epilog is appended to it; the NULL-named args are placeholders that
 * only exist to keep subsequent arguments at the right register index.
 */
void si_get_tcs_epilog_args(enum amd_gfx_level gfx_level,
                            struct si_shader_args *args,
                            struct ac_arg *rel_patch_id,
                            struct ac_arg *invocation_id,
                            struct ac_arg *tf_lds_offset,
                            struct ac_arg tess_factors[6])
{
   memset(args, 0, sizeof(*args));

   if (gfx_level >= GFX9) {
      /* Merged LSHS SGPR layout: only the offsets/layout SGPRs the epilog
       * reads get named slots; the rest are padding.
       */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* wave info */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
   } else {
      /* Pre-gfx9 separate-TCS SGPR layout. */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
   }

   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* VGPR gap */
   /* patch index within the wave (REL_PATCH_ID) */
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, rel_patch_id);
   /* invocation ID within the patch */
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, invocation_id);
   /* LDS offset where tess factors should be loaded from */
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, tf_lds_offset);

   /* The six tessellation factors (outer[4] + inner[2] for quads; fewer are
    * used for triangles/isolines).
    */
   for (unsigned i = 0; i < 6; i++)
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &tess_factors[i]);
}
3631
/* Declare the input arguments of the standalone PS prolog part.
 *
 * The SGPR count and the VGPR order must match the hardware's PS input
 * layout (barycentrics, then fragcoord, front-face, ancillary, coverage,
 * fixed-point position), because the prolog is prepended to the main part.
 */
void si_get_ps_prolog_args(struct si_shader_args *args,
                           const union si_shader_part_key *key)
{
   memset(args, 0, sizeof(*args));

   const unsigned num_input_sgprs = key->ps_prolog.num_input_sgprs;

   /* NOTE(review): VLA — num_input_sgprs comes from the part key, which is
    * driver-generated, so the size is small and bounded in practice.
    */
   struct ac_arg input_sgprs[num_input_sgprs];
   for (unsigned i = 0; i < num_input_sgprs; i++)
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, input_sgprs + i);

   args->internal_bindings = input_sgprs[SI_SGPR_INTERNAL_BINDINGS];
   /* Use the absolute location of the input. */
   args->ac.prim_mask = input_sgprs[SI_PS_NUM_USER_SGPR];

   /* Barycentric coordinate VGPR pairs, in hardware order. */
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_sample);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_center);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_centroid);
   /* skip PERSP_PULL_MODEL */
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_sample);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_center);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_centroid);
   /* skip LINE_STIPPLE_TEX */

   /* POS_X|Y|Z|W_FLOAT */
   for (unsigned i = 0; i < key->ps_prolog.num_fragcoord_components; i++)
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);

   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.front_face);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.ancillary);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.sample_coverage);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.pos_fixed_pt);
}
3665
si_get_ps_epilog_args(struct si_shader_args * args,const union si_shader_part_key * key,struct ac_arg colors[MAX_DRAW_BUFFERS],struct ac_arg * depth,struct ac_arg * stencil,struct ac_arg * sample_mask)3666 void si_get_ps_epilog_args(struct si_shader_args *args,
3667 const union si_shader_part_key *key,
3668 struct ac_arg colors[MAX_DRAW_BUFFERS],
3669 struct ac_arg *depth, struct ac_arg *stencil,
3670 struct ac_arg *sample_mask)
3671 {
3672 memset(args, 0, sizeof(*args));
3673
3674 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3675 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3676 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3677 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3678 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &args->alpha_reference);
3679
3680 u_foreach_bit (i, key->ps_epilog.colors_written) {
3681 ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_FLOAT, colors + i);
3682 }
3683
3684 if (key->ps_epilog.writes_z)
3685 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, depth);
3686
3687 if (key->ps_epilog.writes_stencil)
3688 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, stencil);
3689
3690 if (key->ps_epilog.writes_samplemask)
3691 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, sample_mask);
3692 }
3693