/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_shader.h"
#include "ac_nir.h"
#include "ac_rtld.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_serialize.h"
#include "nir_xfb_info.h"
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
#include "tgsi/tgsi_from_mesa.h"
#include "util/u_memory.h"
#include "util/mesa-sha1.h"
#include "util/ralloc.h"
#include "util/u_upload_mgr.h"

#if LLVM_AVAILABLE
#include <llvm/Config/llvm-config.h> /* for LLVM_VERSION_MAJOR */
#else
#define LLVM_VERSION_MAJOR 0
#endif

static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";

static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";

static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader);
/* Get the total number of interpolated inputs. */
unsigned si_get_ps_num_interp(struct si_shader *ps)
{
   unsigned num_interp = ps->info.num_ps_inputs;

   /* Back colors are added by the PS prolog when needed. */
   if (!ps->is_monolithic && ps->key.ps.part.prolog.color_two_side)
      num_interp += !!(ps->info.ps_colors_read & 0x0f) + !!(ps->info.ps_colors_read & 0xf0);

   assert(num_interp <= 32);
   return MIN2(num_interp, 32);
}

/** Whether the shader runs as a combination of multiple API shaders */
bool si_is_multi_part_shader(struct si_shader *shader)
{
   if (shader->selector->screen->info.gfx_level <= GFX8 ||
       shader->selector->stage > MESA_SHADER_GEOMETRY)
      return false;

   return shader->key.ge.as_ls || shader->key.ge.as_es ||
          shader->selector->stage == MESA_SHADER_TESS_CTRL ||
          shader->selector->stage == MESA_SHADER_GEOMETRY;
}

/** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
bool si_is_merged_shader(struct si_shader *shader)
{
   if (shader->selector->stage > MESA_SHADER_GEOMETRY || shader->is_gs_copy_shader)
      return false;

   return shader->key.ge.as_ngg || si_is_multi_part_shader(shader);
}

/**
 * Returns a unique index for a semantic name and index. The index must be
 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 * calculated.
 */
unsigned si_shader_io_get_unique_index(unsigned semantic)
{
   switch (semantic) {
   case VARYING_SLOT_POS:
      return SI_UNIQUE_SLOT_POS;
   default:
      if (semantic >= VARYING_SLOT_VAR0 && semantic <= VARYING_SLOT_VAR31)
         return SI_UNIQUE_SLOT_VAR0 + (semantic - VARYING_SLOT_VAR0);

      if (semantic >= VARYING_SLOT_VAR0_16BIT && semantic <= VARYING_SLOT_VAR15_16BIT)
         return SI_UNIQUE_SLOT_VAR0_16BIT + (semantic - VARYING_SLOT_VAR0_16BIT);

      assert(!"invalid generic index");
      return 0;

   /* Legacy desktop GL varyings. */
   case VARYING_SLOT_FOGC:
      return SI_UNIQUE_SLOT_FOGC;
   case VARYING_SLOT_COL0:
      return SI_UNIQUE_SLOT_COL0;
   case VARYING_SLOT_COL1:
      return SI_UNIQUE_SLOT_COL1;
   case VARYING_SLOT_BFC0:
      return SI_UNIQUE_SLOT_BFC0;
   case VARYING_SLOT_BFC1:
      return SI_UNIQUE_SLOT_BFC1;
   case VARYING_SLOT_TEX0:
   case VARYING_SLOT_TEX1:
   case VARYING_SLOT_TEX2:
   case VARYING_SLOT_TEX3:
   case VARYING_SLOT_TEX4:
   case VARYING_SLOT_TEX5:
   case VARYING_SLOT_TEX6:
   case VARYING_SLOT_TEX7:
      return SI_UNIQUE_SLOT_TEX0 + (semantic - VARYING_SLOT_TEX0);
   case VARYING_SLOT_CLIP_VERTEX:
      return SI_UNIQUE_SLOT_CLIP_VERTEX;

   /* Varyings present in both GLES and desktop GL. */
   case VARYING_SLOT_CLIP_DIST0:
      return SI_UNIQUE_SLOT_CLIP_DIST0;
   case VARYING_SLOT_CLIP_DIST1:
      return SI_UNIQUE_SLOT_CLIP_DIST1;
   case VARYING_SLOT_PSIZ:
      return SI_UNIQUE_SLOT_PSIZ;
   case VARYING_SLOT_LAYER:
      return SI_UNIQUE_SLOT_LAYER;
   case VARYING_SLOT_VIEWPORT:
      return SI_UNIQUE_SLOT_VIEWPORT;
   case VARYING_SLOT_PRIMITIVE_ID:
      return SI_UNIQUE_SLOT_PRIMITIVE_ID;
   }
}
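
/* For illustration (a sketch, not code used in this file): callers can build
 * a 64-bit usage mask from these indices, e.g.
 *
 *    uint64_t outputs_written = 0;
 *    outputs_written |= BITFIELD64_BIT(si_shader_io_get_unique_index(semantic));
 *
 * which is why every returned index must fit in [0, 63].
 */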

static void declare_streamout_params(struct si_shader_args *args, struct si_shader *shader,
                                     const shader_info *info)
{
   if (shader->selector->screen->info.gfx_level >= GFX11) {
      /* NGG streamout. */
      if (info->stage == MESA_SHADER_TESS_EVAL)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      return;
   }

   /* Streamout SGPRs. */
   if (si_shader_uses_streamout(shader)) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_config);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_write_index);

      /* A streamout buffer offset is loaded if the stride is non-zero. */
      for (int i = 0; i < 4; i++) {
         if (!info->xfb_stride[i])
            continue;

         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_offset[i]);
      }
   } else if (info->stage == MESA_SHADER_TESS_EVAL) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   }
}

unsigned si_get_max_workgroup_size(const struct si_shader *shader)
{
   gl_shader_stage stage = shader->is_gs_copy_shader ?
      MESA_SHADER_VERTEX : shader->selector->stage;

   assert(shader->wave_size);

   switch (stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      /* Use the largest workgroup size for streamout. */
      if (shader->key.ge.as_ngg)
         return si_shader_uses_streamout(shader) ? 256 : 128;

      /* As part of merged shader. */
      return shader->selector->screen->info.gfx_level >= GFX9 &&
             (shader->key.ge.as_ls || shader->key.ge.as_es) ? 128 : shader->wave_size;

   case MESA_SHADER_TESS_CTRL:
      /* Return this so that LLVM doesn't remove s_barrier
       * instructions on chips where we use s_barrier. */
      return shader->selector->screen->info.gfx_level >= GFX7 ? 128 : shader->wave_size;

   case MESA_SHADER_GEOMETRY:
      /* GS can always generate up to 256 vertices. */
      return shader->selector->screen->info.gfx_level >= GFX9 ? 256 : shader->wave_size;

   case MESA_SHADER_COMPUTE:
      break; /* see below */

   default:
      return shader->wave_size;
   }

   /* Compile a variable block size using the maximum variable size. */
   if (shader->selector->info.base.workgroup_size_variable)
      return SI_MAX_VARIABLE_THREADS_PER_BLOCK;

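   /* Example: a compute shader declared with a fixed local size of 8x8x1
    * yields 8 * 8 * 1 = 64 threads here.
    */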
   uint16_t *local_size = shader->selector->info.base.workgroup_size;
   unsigned max_work_group_size = (uint32_t)local_size[0] *
                                  (uint32_t)local_size[1] *
                                  (uint32_t)local_size[2];
   assert(max_work_group_size);
   return max_work_group_size;
}

static void declare_const_and_shader_buffers(struct si_shader_args *args, struct si_shader *shader,
                                             const shader_info *info, bool assign_params)
{
   enum ac_arg_type const_shader_buf_type;

   if (info->num_ubos == 1 && info->num_ssbos == 0)
      const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
   else
      const_shader_buf_type = AC_ARG_CONST_DESC_PTR;

   ac_add_arg(
      &args->ac, AC_ARG_SGPR, 1, const_shader_buf_type,
      assign_params ? &args->const_and_shader_buffers : &args->other_const_and_shader_buffers);
}

static void declare_samplers_and_images(struct si_shader_args *args, bool assign_params)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
              assign_params ? &args->samplers_and_images : &args->other_samplers_and_images);
}

static void declare_per_stage_desc_pointers(struct si_shader_args *args, struct si_shader *shader,
                                            const shader_info *info, bool assign_params)
{
   declare_const_and_shader_buffers(args, shader, info, assign_params);
   declare_samplers_and_images(args, assign_params);
}

static void declare_global_desc_pointers(struct si_shader_args *args)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->internal_bindings);
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
              &args->bindless_samplers_and_images);
}

static void declare_vb_descriptor_input_sgprs(struct si_shader_args *args,
                                              struct si_shader *shader)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->ac.vertex_buffers);

   unsigned num_vbos_in_user_sgprs = shader->selector->info.num_vbos_in_user_sgprs;
   if (num_vbos_in_user_sgprs) {
      unsigned user_sgprs = args->ac.num_sgprs_used;

      if (si_is_merged_shader(shader))
         user_sgprs -= 8;
      assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);

      /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
      for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

      assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(args->vb_descriptors));
      for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 4, AC_ARG_INT, &args->vb_descriptors[i]);
   }
}

static void declare_vs_input_vgprs(struct si_shader_args *args, struct si_shader *shader)
{
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vertex_id);

   if (shader->selector->screen->info.gfx_level >= GFX12) {
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
   } else if (shader->key.ge.as_ls) {
      if (shader->selector->screen->info.gfx_level >= GFX11) {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      } else if (shader->selector->screen->info.gfx_level >= GFX10) {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      } else {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
      }
   } else if (shader->selector->screen->info.gfx_level >= GFX10) {
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
                 /* user VGPR or PrimID (legacy) */
                 shader->key.ge.as_ngg ? NULL : &args->ac.vs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
   } else {
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
   }
}

static void declare_vs_blit_inputs(struct si_shader *shader, struct si_shader_args *args,
                                   const shader_info *info)
{
   bool has_attribute_ring_address = shader->selector->screen->info.gfx_level >= GFX11;

   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_blit_inputs); /* i16 x1, y1 */
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);                  /* i16 x2, y2 */
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL);                /* depth */

   if (info->vs.blit_sgprs_amd ==
       SI_VS_BLIT_SGPRS_POS_COLOR + has_attribute_ring_address) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
      if (has_attribute_ring_address)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* attribute ring address */
   } else if (info->vs.blit_sgprs_amd ==
              SI_VS_BLIT_SGPRS_POS_TEXCOORD + has_attribute_ring_address) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
      if (has_attribute_ring_address)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* attribute ring address */
   }
}

static void declare_tes_input_vgprs(struct si_shader_args *args)
{
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_u);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_v);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_rel_patch_id);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_patch_id);
}

enum
{
   /* Convenient merged shader definitions. */
   SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES,
   SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};

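/* Like ac_add_arg, but assert that the argument lands at a fixed, pre-defined
 * index. This is used for the PS argument list below, whose layout must match
 * the SI_PARAM_* indices expected by the hardware.
 */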
static void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
                               enum ac_arg_type type, struct ac_arg *arg, unsigned idx)
{
   assert(args->arg_count == idx);
   ac_add_arg(args, file, registers, type, arg);
}

static void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args,
                                const shader_info *info)
{
   unsigned i, num_returns, num_return_sgprs;
   unsigned num_prolog_vgprs = 0;
   struct si_shader_selector *sel = shader->selector;
   unsigned stage = shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : info->stage;
   unsigned stage_case = stage;

   memset(args, 0, sizeof(*args));

   /* Set MERGED shaders. */
   if (sel->screen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY) {
      if (shader->key.ge.as_ls || stage == MESA_SHADER_TESS_CTRL)
         stage_case = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
      else if (shader->key.ge.as_es || shader->key.ge.as_ngg || stage == MESA_SHADER_GEOMETRY)
         stage_case = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
   }

   switch (stage_case) {
   case MESA_SHADER_VERTEX:
      declare_global_desc_pointers(args);

      if (info->vs.blit_sgprs_amd) {
         declare_vs_blit_inputs(shader, args, info);
      } else {
         declare_per_stage_desc_pointers(args, shader, info, true);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);

         if (shader->is_gs_copy_shader) {
            declare_streamout_params(args, shader, info);
         } else {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
            declare_vb_descriptor_input_sgprs(args, shader);

            if (shader->key.ge.as_es) {
               ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset);
            } else if (shader->key.ge.as_ls) {
               /* no extra parameters */
            } else {
               declare_streamout_params(args, shader, info);
            }
         }
      }

      /* GFX11+ sets FLAT_SCRATCH directly instead of using this arg. */
      if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* VGPRs */
      declare_vs_input_vgprs(args, shader);

      break;

   case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, info, true);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);

      /* GFX11+ sets FLAT_SCRATCH directly instead of using this arg. */
      if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* VGPRs */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);
      break;

   case SI_SHADER_MERGED_VERTEX_TESSCTRL:
      /* Merged stages have 8 system SGPRs at the beginning. */
      /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
      /* Gfx11+:  SPI_SHADER_PGM_LO/HI_HS */
      declare_per_stage_desc_pointers(args, shader, info, stage == MESA_SHADER_TESS_CTRL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
      if (sel->screen->info.gfx_level >= GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_wave_id);
      else
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, info, stage == MESA_SHADER_VERTEX);

      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);

      /* VGPRs (first TCS, then VS) */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);

      if (stage == MESA_SHADER_VERTEX) {
         declare_vs_input_vgprs(args, shader);

         /* The LS/HS indices of shared arguments must stay the same for ACO,
          * so this can't be moved before the shared VGPRs.
          */
         declare_vb_descriptor_input_sgprs(args, shader);

         /* LS return values are inputs to the TCS main shader part. */
         if (!shader->is_monolithic || shader->key.ge.opt.same_patch_vertices) {
            for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
               ac_add_return(&args->ac, AC_ARG_SGPR);
            for (i = 0; i < 2; i++)
               ac_add_return(&args->ac, AC_ARG_VGPR);

            /* VS outputs passed via VGPRs to TCS. */
            if (shader->key.ge.opt.same_patch_vertices && !info->use_aco_amd) {
               unsigned num_outputs = util_last_bit64(shader->selector->info.ls_es_outputs_written);
               for (i = 0; i < num_outputs * 4; i++)
                  ac_add_return(&args->ac, AC_ARG_VGPR);
            }
         }
      } else {
         /* TCS inputs are passed via VGPRs from VS. */
         if (shader->key.ge.opt.same_patch_vertices && !info->use_aco_amd) {
            unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->info.ls_es_outputs_written);
            for (i = 0; i < num_inputs * 4; i++)
               ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
         }
      }
      break;

   case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
      /* Merged stages have 8 system SGPRs at the beginning. */
      /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
      /* Gfx11+:  SPI_SHADER_PGM_LO/HI_GS */
      declare_per_stage_desc_pointers(args, shader, info, stage == MESA_SHADER_GEOMETRY);

      if (shader->key.ge.as_ngg)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_tg_info);
      else
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset);

      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      if (sel->screen->info.gfx_level >= GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_attr_offset);
      else
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

      declare_global_desc_pointers(args);
      if (stage != MESA_SHADER_VERTEX || !info->vs.blit_sgprs_amd) {
         declare_per_stage_desc_pointers(
            args, shader, info, (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL));
      }

      if (stage == MESA_SHADER_VERTEX && info->vs.blit_sgprs_amd) {
         declare_vs_blit_inputs(shader, args, info);
      } else {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);

         if (stage == MESA_SHADER_VERTEX) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
         } else if (stage == MESA_SHADER_TESS_EVAL) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
         } else {
            /* GS */
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
         }

         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->small_prim_cull_info);
         if (sel->screen->info.gfx_level >= GFX11)
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->gs_attr_address);
         else
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
      }

      /* VGPRs (first GS, then VS/TES) */
      if (sel->screen->info.gfx_level >= GFX12) {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
      } else {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);
      }

      if (stage == MESA_SHADER_VERTEX) {
         declare_vs_input_vgprs(args, shader);

         /* The ES/GS indices of shared arguments must stay the same for ACO,
          * so this can't be moved before the shared VGPRs.
          */
         if (!info->vs.blit_sgprs_amd)
            declare_vb_descriptor_input_sgprs(args, shader);
      } else if (stage == MESA_SHADER_TESS_EVAL) {
         declare_tes_input_vgprs(args);
      }

      if (shader->key.ge.as_es && !shader->is_monolithic &&
          (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL)) {
         /* ES return values are inputs to GS. */
         for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
            ac_add_return(&args->ac, AC_ARG_SGPR);
         for (i = 0; i < (sel->screen->info.gfx_level >= GFX12 ? 3 : 5); i++)
            ac_add_return(&args->ac, AC_ARG_VGPR);
      }
      break;

   case MESA_SHADER_TESS_EVAL:
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, info, true);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);

      if (shader->key.ge.as_es) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset);
      } else {
         declare_streamout_params(args, shader, info);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      }

      /* GFX11+ sets FLAT_SCRATCH directly instead of using this arg. */
      if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* VGPRs */
      declare_tes_input_vgprs(args);
      break;

   case MESA_SHADER_GEOMETRY:
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, info, true);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_wave_id);

      /* GFX11+ sets FLAT_SCRATCH directly instead of using this arg. */
      if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* VGPRs */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[3]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[4]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[5]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
      break;

   case MESA_SHADER_FRAGMENT:
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, info, true);
      si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->sample_locs[0],
                         SI_PARAM_SAMPLE_LOCS0);
      si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->sample_locs[1],
                         SI_PARAM_SAMPLE_LOCS1);
      si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->alpha_reference,
                         SI_PARAM_ALPHA_REF);
      si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.prim_mask,
                         SI_PARAM_PRIM_MASK);

      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_sample,
                         SI_PARAM_PERSP_SAMPLE);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_center,
                         SI_PARAM_PERSP_CENTER);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_centroid,
                         SI_PARAM_PERSP_CENTROID);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_sample,
                         SI_PARAM_LINEAR_SAMPLE);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_center,
                         SI_PARAM_LINEAR_CENTER);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_centroid,
                         SI_PARAM_LINEAR_CENTROID);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[0],
                         SI_PARAM_POS_X_FLOAT);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[1],
                         SI_PARAM_POS_Y_FLOAT);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[2],
                         SI_PARAM_POS_Z_FLOAT);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[3],
                         SI_PARAM_POS_W_FLOAT);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.front_face,
                         SI_PARAM_FRONT_FACE);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.ancillary,
                         SI_PARAM_ANCILLARY);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.sample_coverage,
                         SI_PARAM_SAMPLE_COVERAGE);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.pos_fixed_pt,
                         SI_PARAM_POS_FIXED_PT);

      if (info->use_aco_amd) {
         ac_compact_ps_vgpr_args(&args->ac, shader->config.spi_ps_input_addr);

         /* GFX11+ sets FLAT_SCRATCH directly instead of using this arg. */
         if (sel->screen->info.gfx_level < GFX11)
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
      }

      /* A monolithic PS emits the prolog and epilog directly in NIR. */
      if (!shader->is_monolithic) {
         /* Color inputs from the prolog. */
         if (shader->selector->info.colors_read) {
            unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);

            for (i = 0; i < num_color_elements; i++)
               ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, i ? NULL : &args->color_start);

            num_prolog_vgprs += num_color_elements;
         }

         /* Outputs for the epilog. */
         num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
         /* These must always be declared even if Z/stencil/samplemask are killed. */
         num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +
                       sel->info.writes_z + sel->info.writes_stencil + sel->info.writes_samplemask +
                       1 /* SampleMaskIn */;

         for (i = 0; i < num_return_sgprs; i++)
            ac_add_return(&args->ac, AC_ARG_SGPR);
         for (; i < num_returns; i++)
            ac_add_return(&args->ac, AC_ARG_VGPR);
      }
      break;

   case MESA_SHADER_COMPUTE:
   case MESA_SHADER_KERNEL:
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, info, true);
      if (shader->selector->info.uses_grid_size)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.num_work_groups);
      if (shader->selector->info.uses_variable_block_size)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->block_size);

      unsigned cs_user_data_dwords = info->cs.user_data_components_amd;
      if (cs_user_data_dwords) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, MIN2(cs_user_data_dwords, 4), AC_ARG_INT,
                    &args->cs_user_data[0]);
         if (cs_user_data_dwords > 4) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, cs_user_data_dwords - 4, AC_ARG_INT,
                       &args->cs_user_data[1]);
         }
      }

      /* Some descriptors can be in user SGPRs. */
      /* Shader buffers in user SGPRs. */
      for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) {
         while (args->ac.num_sgprs_used % 4 != 0)
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);

         ac_add_arg(&args->ac, AC_ARG_SGPR, 4, AC_ARG_INT, &args->cs_shaderbuf[i]);
      }
      /* Images in user SGPRs. */
      for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) {
         unsigned num_sgprs = BITSET_TEST(info->image_buffers, i) ? 4 : 8;

         while (args->ac.num_sgprs_used % num_sgprs != 0)
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);

         ac_add_arg(&args->ac, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &args->cs_image[i]);
      }

      /* Hardware SGPRs. */
      for (i = 0; i < 3; i++) {
         if (shader->selector->info.uses_block_id[i]) {
            /* GFX12 loads workgroup IDs into ttmp registers, so they are not input SGPRs, but we
             * still need to set this to indicate that they are enabled (for ac_nir_to_llvm).
             */
            if (sel->screen->info.gfx_level >= GFX12)
               args->ac.workgroup_ids[i].used = true;
            else
               ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.workgroup_ids[i]);
         }
      }
      if (shader->selector->info.uses_tg_size)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tg_size);

      /* GFX11+ sets FLAT_SCRATCH directly instead of using this arg. */
      if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* Hardware VGPRs. */
      /* Thread IDs are either packed into VGPR0 (10 bits per component) or
       * stored in 3 separate VGPRs.
       */
      if (sel->screen->info.gfx_level >= GFX11 ||
          (!sel->screen->info.has_graphics && sel->screen->info.family >= CHIP_MI200)) {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_ids_packed);
      } else {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_id_x);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_id_y);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_id_z);
      }
      break;
   default:
      assert(0 && "unimplemented shader");
      return;
   }

   shader->info.num_input_sgprs = args->ac.num_sgprs_used;
   shader->info.num_input_vgprs = args->ac.num_vgprs_used;

   assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
   shader->info.num_input_vgprs -= num_prolog_vgprs;
}

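/* LDS allocation granularity in bytes: 1024 for PS on GFX11+, 512 on GFX7+,
 * 256 on GFX6. LDS sizes computed in bytes are divided by this granularity
 * before being stored in shader->config.lds_size.
 */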
static unsigned get_lds_granularity(struct si_screen *screen, gl_shader_stage stage)
{
   return screen->info.gfx_level >= GFX11 && stage == MESA_SHADER_FRAGMENT ? 1024 :
          screen->info.gfx_level >= GFX7 ? 512 : 256;
}

static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
                                  struct ac_rtld_binary *rtld)
{
   const struct si_shader_selector *sel = shader->selector;
   const char *part_elfs[5];
   size_t part_sizes[5];
   unsigned num_parts = 0;

#define add_part(shader_or_part)                                      \
   if (shader_or_part) {                                              \
      part_elfs[num_parts] = (shader_or_part)->binary.code_buffer;    \
      part_sizes[num_parts] = (shader_or_part)->binary.code_size;     \
      num_parts++;                                                    \
   }

   add_part(shader->prolog);
   add_part(shader->previous_stage);
   add_part(shader);
   add_part(shader->epilog);

#undef add_part

   struct ac_rtld_symbol lds_symbols[2];
   unsigned num_lds_symbols = 0;

   if (sel && screen->info.gfx_level >= GFX9 && !shader->is_gs_copy_shader &&
       (sel->stage == MESA_SHADER_GEOMETRY ||
        (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg))) {
      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
      sym->name = "esgs_ring";
      sym->size = shader->gs_info.esgs_ring_size * 4;
      sym->align = 64 * 1024;
   }

   if (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
      sym->name = "ngg_emit";
      sym->size = shader->ngg.ngg_emit_size * 4;
      sym->align = 4;
   }

   bool ok = ac_rtld_open(
      rtld, (struct ac_rtld_open_info){.info = &screen->info,
                                       .options =
                                          {
                                             .halt_at_entry = screen->options.halt_shaders,
                                             .waitcnt_wa = num_parts > 1 &&
                                                           screen->info.needs_llvm_wait_wa,
                                          },
                                       .shader_type = sel->stage,
                                       .wave_size = shader->wave_size,
                                       .num_parts = num_parts,
                                       .elf_ptrs = part_elfs,
                                       .elf_sizes = part_sizes,
                                       .num_shared_lds_symbols = num_lds_symbols,
                                       .shared_lds_symbols = lds_symbols});

   if (rtld->lds_size > 0) {
      unsigned alloc_granularity = get_lds_granularity(screen, sel->stage);
      shader->config.lds_size = DIV_ROUND_UP(rtld->lds_size, alloc_granularity);
   }

   return ok;
}

static unsigned get_shader_binaries(struct si_shader *shader, struct si_shader_binary *bin[4])
{
   unsigned num_bin = 0;

   if (shader->prolog)
      bin[num_bin++] = &shader->prolog->binary;

   if (shader->previous_stage)
      bin[num_bin++] = &shader->previous_stage->binary;

   bin[num_bin++] = &shader->binary;

   if (shader->epilog)
      bin[num_bin++] = &shader->epilog->binary;

   return num_bin;
}

/* si_get_shader_binary_size should only be called once per shader
 * and the result should be stored in shader->complete_shader_binary_size.
 */
unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
{
   if (shader->binary.type == SI_SHADER_BINARY_ELF) {
      struct ac_rtld_binary rtld;
      si_shader_binary_open(screen, shader, &rtld);
      uint64_t size = rtld.exec_size;
      ac_rtld_close(&rtld);
      return size;
   } else {
      struct si_shader_binary *bin[4];
      unsigned num_bin = get_shader_binaries(shader, bin);

      unsigned size = 0;
      for (unsigned i = 0; i < num_bin; i++) {
         assert(bin[i]->type == SI_SHADER_BINARY_RAW);
         size += bin[i]->exec_size;
      }
      return size;
   }
}

unsigned si_get_shader_prefetch_size(struct si_shader *shader)
{
   struct si_screen *sscreen = shader->selector->screen;
   /* This excludes arrays of constants after instructions. */
   unsigned exec_size =
      ac_align_shader_binary_for_prefetch(&sscreen->info,
                                          shader->complete_shader_binary_size);

   /* INST_PREF_SIZE uses 128B granularity.
    * - GFX11: max 128 * 63 = 8064
    * - GFX12: max 128 * 255 = 32640
    */
   unsigned max_pref_size = shader->selector->screen->info.gfx_level >= GFX12 ? 255 : 63;
   unsigned exec_size_gran128 = DIV_ROUND_UP(exec_size, 128);

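   /* Example: an aligned exec size of 9000 bytes gives
    * DIV_ROUND_UP(9000, 128) = 71 units, clamped to 63 on GFX11.
    */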
   return MIN2(max_pref_size, exec_size_gran128);
}

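/* Callback for ac_rtld_upload: resolves the SCRATCH_RSRC_DWORD0/1 symbols
 * (declared at the top of this file) to the low/high dwords of the scratch
 * buffer descriptor, with swizzling enabled for scratch coalescing.
 */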
static bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
                                   uint64_t *value)
{
   uint64_t *scratch_va = data;

   if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
      *value = (uint32_t)*scratch_va;
      return true;
   }
   if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
      /* Enable scratch coalescing. */
      *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32);

      if (gfx_level >= GFX11)
         *value |= S_008F04_SWIZZLE_ENABLE_GFX11(1);
      else
         *value |= S_008F04_SWIZZLE_ENABLE_GFX6(1);
      return true;
   }

   return false;
}

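/* Allocate (or reuse) the shader BO and return a CPU-writable pointer to it.
 * Two paths: with dma_upload, the bytes go into a staging buffer and are
 * later copied to VRAM with CP DMA in post_upload_binary(); otherwise the BO
 * is mapped directly and written by the CPU.
 */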
static void *pre_upload_binary(struct si_screen *sscreen, struct si_shader *shader,
                               unsigned binary_size, bool dma_upload,
                               struct si_context **upload_ctx,
                               struct pipe_resource **staging,
                               unsigned *staging_offset,
                               int64_t bo_offset)
{
   unsigned aligned_size = ac_align_shader_binary_for_prefetch(&sscreen->info, binary_size);

   if (bo_offset >= 0) {
      /* sqtt needs to upload shaders as a pipeline, where all shaders
       * are contiguous in memory.
       * In this case, bo_offset will be positive and we don't have to
       * allocate a new BO.
       */
      shader->gpu_address = shader->bo->gpu_address + bo_offset;
      dma_upload = false;
   } else {
      si_resource_reference(&shader->bo, NULL);
      shader->bo = si_aligned_buffer_create(
         &sscreen->b,
         SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT |
         (dma_upload ? PIPE_RESOURCE_FLAG_UNMAPPABLE : 0),
         PIPE_USAGE_IMMUTABLE, align(aligned_size, SI_CPDMA_ALIGNMENT), 256);
      if (!shader->bo)
         return NULL;

      shader->gpu_address = shader->bo->gpu_address;
      bo_offset = 0;
   }

   if (dma_upload) {
      /* First upload into a staging buffer. */
      *upload_ctx = si_get_aux_context(&sscreen->aux_context.shader_upload);

      void *ret;
      u_upload_alloc((*upload_ctx)->b.stream_uploader, 0, binary_size, 256,
                     staging_offset, staging, &ret);
      if (!ret)
         si_put_aux_context_flush(&sscreen->aux_context.shader_upload);

      return ret;
   } else {
      void *ptr = sscreen->ws->buffer_map(sscreen->ws,
                                          shader->bo->buf, NULL,
                                          PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
      if (!ptr)
         return NULL;

      return ptr + bo_offset;
   }
}

static void post_upload_binary(struct si_screen *sscreen, struct si_shader *shader,
                               void *code, unsigned code_size,
                               unsigned binary_size, bool dma_upload,
                               struct si_context *upload_ctx,
                               struct pipe_resource *staging,
                               unsigned staging_offset)
{
   if (sscreen->debug_flags & DBG(SQTT)) {
      /* Remember the uploaded code */
      shader->binary.uploaded_code_size = code_size;
      shader->binary.uploaded_code = malloc(code_size);
      memcpy(shader->binary.uploaded_code, code, code_size);
   }

   if (dma_upload) {
      /* Then copy from the staging buffer to VRAM.
       *
       * We can't use the upload copy in si_buffer_transfer_unmap because that might use
       * a compute shader, and we can't use shaders in the code that is responsible for making
       * them available.
       */
      si_cp_dma_copy_buffer(upload_ctx, &shader->bo->b.b, staging, 0, staging_offset,
                            binary_size);
      si_barrier_after_simple_buffer_op(upload_ctx, 0, &shader->bo->b.b, staging);
      upload_ctx->barrier_flags |= SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_L2;

#if 0 /* debug: validate whether the copy was successful */
      uint32_t *dst_binary = malloc(binary_size);
      uint32_t *src_binary = (uint32_t*)code;
      pipe_buffer_read(&upload_ctx->b, &shader->bo->b.b, 0, binary_size, dst_binary);
      puts("dst_binary == src_binary:");
      for (unsigned i = 0; i < binary_size / 4; i++) {
         printf(" %08x == %08x\n", dst_binary[i], src_binary[i]);
      }
      free(dst_binary);
      exit(0);
#endif

      si_put_aux_context_flush(&sscreen->aux_context.shader_upload);
      pipe_resource_reference(&staging, NULL);
   } else {
      sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf);
   }
}

static int upload_binary_elf(struct si_screen *sscreen, struct si_shader *shader,
                             uint64_t scratch_va, bool dma_upload, int64_t bo_offset)
{
   struct ac_rtld_binary binary;
   if (!si_shader_binary_open(sscreen, shader, &binary))
      return -1;

   struct si_context *upload_ctx = NULL;
   struct pipe_resource *staging = NULL;
   unsigned staging_offset = 0;

   void *rx_ptr = pre_upload_binary(sscreen, shader, binary.rx_size, dma_upload,
                                    &upload_ctx, &staging, &staging_offset,
                                    bo_offset);
   if (!rx_ptr)
      return -1;

   /* Upload. */
   struct ac_rtld_upload_info u = {};
   u.binary = &binary;
   u.get_external_symbol = si_get_external_symbol;
   u.cb_data = &scratch_va;
   u.rx_va = shader->gpu_address;
   u.rx_ptr = rx_ptr;

   int size = ac_rtld_upload(&u);

   post_upload_binary(sscreen, shader, rx_ptr, size, binary.rx_size, dma_upload,
                      upload_ctx, staging, staging_offset);

   ac_rtld_close(&binary);

   return size;
}

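/* Compute config.lds_size for merged GS / NGG shaders, whose ESGS ring,
 * NGG emit space, and NGG scratch all live in LDS. Sizes here are in dwords
 * and are converted to the per-stage allocation granularity at the end.
 */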
static void calculate_needed_lds_size(struct si_screen *sscreen, struct si_shader *shader)
{
   gl_shader_stage stage =
      shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : shader->selector->stage;

   if (sscreen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY &&
       (stage == MESA_SHADER_GEOMETRY || shader->key.ge.as_ngg)) {
      unsigned size_in_dw = shader->gs_info.esgs_ring_size;

      if (stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)
         size_in_dw += shader->ngg.ngg_emit_size;

      if (shader->key.ge.as_ngg) {
         unsigned scratch_dw_size = gfx10_ngg_get_scratch_dw_size(shader);
         if (scratch_dw_size) {
            /* The scratch base address needs to be 8-byte aligned. */
            size_in_dw = ALIGN(size_in_dw, 2);
            size_in_dw += scratch_dw_size;
         }
      }

      shader->config.lds_size =
         DIV_ROUND_UP(size_in_dw * 4, get_lds_granularity(sscreen, stage));
   }
}

static int upload_binary_raw(struct si_screen *sscreen, struct si_shader *shader,
                             uint64_t scratch_va, bool dma_upload, int64_t bo_offset)
{
   struct si_shader_binary *bin[4];
   unsigned num_bin = get_shader_binaries(shader, bin);

   unsigned code_size = 0, exec_size = 0;
   for (unsigned i = 0; i < num_bin; i++) {
      assert(bin[i]->type == SI_SHADER_BINARY_RAW);
      code_size += bin[i]->code_size;
      exec_size += bin[i]->exec_size;
   }

   struct si_context *upload_ctx = NULL;
   struct pipe_resource *staging = NULL;
   unsigned staging_offset = 0;

   void *rx_ptr = pre_upload_binary(sscreen, shader, code_size, dma_upload,
                                    &upload_ctx, &staging, &staging_offset,
                                    bo_offset);
   if (!rx_ptr)
      return -1;

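   /* Resulting layout: the exec (instruction) sections of all parts are
    * packed first, followed by the const-data sections of all parts, so that
    * the instructions of consecutive parts stay contiguous.
    */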
   unsigned exec_offset = 0, data_offset = exec_size;
   for (unsigned i = 0; i < num_bin; i++) {
      memcpy(rx_ptr + exec_offset, bin[i]->code_buffer, bin[i]->exec_size);

      if (bin[i]->num_symbols) {
         /* Offset that must be added to the const data symbol because other
          * shader parts are inserted between the exec code and the const data.
          */
         unsigned const_offset = data_offset - exec_offset - bin[i]->exec_size;

         /* Prolog and epilog have no symbols. */
         struct si_shader *sh = bin[i] == &shader->binary ? shader : shader->previous_stage;
         assert(sh && bin[i] == &sh->binary);

         si_aco_resolve_symbols(sh, rx_ptr + exec_offset, (const uint32_t *)bin[i]->code_buffer,
                                scratch_va, const_offset);
      }

      exec_offset += bin[i]->exec_size;

      unsigned data_size = bin[i]->code_size - bin[i]->exec_size;
      if (data_size) {
         memcpy(rx_ptr + data_offset, bin[i]->code_buffer + bin[i]->exec_size, data_size);
         data_offset += data_size;
      }
   }

   post_upload_binary(sscreen, shader, rx_ptr, code_size, code_size, dma_upload,
                      upload_ctx, staging, staging_offset);

   calculate_needed_lds_size(sscreen, shader);
   return code_size;
}

int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shader,
                               uint64_t scratch_va, int64_t bo_offset)
{
   bool dma_upload = !(sscreen->debug_flags & DBG(NO_DMA_SHADERS)) && sscreen->info.has_cp_dma &&
                     sscreen->info.has_dedicated_vram && !sscreen->info.all_vram_visible &&
                     bo_offset < 0;

   if (shader->binary.type == SI_SHADER_BINARY_ELF) {
      return upload_binary_elf(sscreen, shader, scratch_va, dma_upload, bo_offset);
   } else {
      assert(shader->binary.type == SI_SHADER_BINARY_RAW);
      return upload_binary_raw(sscreen, shader, scratch_va, dma_upload, bo_offset);
   }
}

int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
                            uint64_t scratch_va)
{
   return si_shader_binary_upload_at(sscreen, shader, scratch_va, -1);
}

static void print_disassembly(const char *disasm, size_t nbytes,
                              const char *name, FILE *file,
                              struct util_debug_callback *debug)
{
   if (debug && debug->debug_message) {
      /* Very long debug messages are cut off, so send the
       * disassembly one line at a time. This causes more
       * overhead, but on the plus side it simplifies
       * parsing of the resulting logs.
       */
      util_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");

      uint64_t line = 0;
      while (line < nbytes) {
         int count = nbytes - line;
         const char *nl = memchr(disasm + line, '\n', nbytes - line);
         if (nl)
            count = nl - (disasm + line);

         if (count) {
            util_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
         }

         line += count + 1;
      }

      util_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
   }

   if (file) {
      fprintf(file, "Shader %s disassembly:\n", name);
      fprintf(file, "%*s", (int)nbytes, disasm);
   }
}

static void si_shader_dump_disassembly(struct si_screen *screen,
                                       const struct si_shader_binary *binary,
                                       gl_shader_stage stage, unsigned wave_size,
                                       struct util_debug_callback *debug, const char *name,
                                       FILE *file)
{
   if (binary->type == SI_SHADER_BINARY_RAW) {
      print_disassembly(binary->disasm_string, binary->disasm_size, name, file, debug);
      return;
   }

   struct ac_rtld_binary rtld_binary;

   if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
                                      .info = &screen->info,
                                      .shader_type = stage,
                                      .wave_size = wave_size,
                                      .num_parts = 1,
                                      .elf_ptrs = &binary->code_buffer,
                                      .elf_sizes = &binary->code_size}))
      return;

   const char *disasm;
   size_t nbytes;

   if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
      goto out;

   if (nbytes > INT_MAX)
      goto out;

   print_disassembly(disasm, nbytes, name, file, debug);

out:
   ac_rtld_close(&rtld_binary);
}

static void si_calculate_max_simd_waves(struct si_shader *shader)
{
   struct si_screen *sscreen = shader->selector->screen;
   struct ac_shader_config *conf = &shader->config;
   unsigned lds_increment = get_lds_granularity(sscreen, shader->selector->stage);
   unsigned lds_per_wave = 0;
   unsigned max_simd_waves;

   max_simd_waves = sscreen->info.max_waves_per_simd;

   /* Compute LDS usage for PS. */
   switch (shader->selector->stage) {
   case MESA_SHADER_FRAGMENT:
      /* The minimum usage per wave is (num_inputs * 48). The maximum
       * usage is (num_inputs * 48 * 16).
       * We can get anything in between and it varies between waves.
       *
       * The 48 bytes per input for a single primitive is equal to
       * 4 bytes/component * 4 components/input * 3 points.
       *
       * Other stages don't know the size at compile time or don't
       * allocate LDS per wave, but instead they do it per thread group.
       */
      lds_per_wave = conf->lds_size * lds_increment +
                     align(shader->info.num_ps_inputs * 48, lds_increment);
      break;
   case MESA_SHADER_COMPUTE: {
      unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
      lds_per_wave = (conf->lds_size * lds_increment) /
                     DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
   }
   break;
   default:;
   }

   /* Compute the per-SIMD wave counts. */
   if (conf->num_sgprs) {
      max_simd_waves =
         MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
   }

   if (conf->num_vgprs) {
      /* GFX10.3 internally:
       * - aligns VGPRS to 16 for Wave32 and 8 for Wave64
       * - aligns LDS to 1024
       *
       * For shader-db stats, set num_vgprs to what the hw actually uses.
       */
      unsigned num_vgprs = conf->num_vgprs;
      if (sscreen->info.gfx_level >= GFX10_3) {
         unsigned real_vgpr_gran = sscreen->info.num_physical_wave64_vgprs_per_simd / 64;
         num_vgprs = util_align_npot(num_vgprs, real_vgpr_gran * (shader->wave_size == 32 ? 2 : 1));
      } else {
         num_vgprs = align(num_vgprs, shader->wave_size == 32 ? 8 : 4);
      }

      /* Always print wave limits as Wave64, so that we can compare
       * Wave32 and Wave64 with shader-db fairly. */
      unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
      max_simd_waves = MIN2(max_simd_waves, max_vgprs / num_vgprs);
   }
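
   /* For example (assuming a SIMD with 1024 physical Wave64 VGPRs): a shader
    * using 64 aligned VGPRs is limited to 1024 / 64 = 16 waves here, before
    * the SGPR and LDS limits below are applied.
    */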

   unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
   if (lds_per_wave)
      max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);

   shader->info.max_simd_waves = max_simd_waves;
}

void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
                                        struct util_debug_callback *debug)
{
   const struct ac_shader_config *conf = &shader->config;
   static const char *stages[] = {"VS", "TCS", "TES", "GS", "PS", "CS"};

   if (screen->options.debug_disassembly)
      si_shader_dump_disassembly(screen, &shader->binary, shader->selector->stage,
                                 shader->wave_size, debug, "main", NULL);

   unsigned num_ls_outputs = 0;
   unsigned num_hs_outputs = 0;
   unsigned num_es_outputs = 0;
   unsigned num_gs_outputs = 0;
   unsigned num_vs_outputs = 0;
   unsigned num_ps_outputs = 0;

   if (shader->selector->stage <= MESA_SHADER_GEOMETRY) {
      /* This doesn't include pos exports because only param exports are interesting
       * for performance and can be optimized.
       */
      if (shader->key.ge.as_ls)
         num_ls_outputs = si_shader_lshs_vertex_stride(shader) / 16;
      else if (shader->selector->stage == MESA_SHADER_TESS_CTRL)
         num_hs_outputs = util_last_bit64(shader->selector->info.tcs_outputs_written_for_tes);
      else if (shader->key.ge.as_es)
         num_es_outputs = shader->selector->info.esgs_vertex_stride / 16;
      else if (shader->gs_copy_shader)
         num_gs_outputs = shader->gs_copy_shader->info.nr_param_exports;
      else if (shader->selector->stage == MESA_SHADER_GEOMETRY)
         num_gs_outputs = shader->info.nr_param_exports;
      else if (shader->selector->stage == MESA_SHADER_VERTEX ||
               shader->selector->stage == MESA_SHADER_TESS_EVAL)
         num_vs_outputs = shader->info.nr_param_exports;
      else
         unreachable("invalid shader key");
   } else if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
      num_ps_outputs = util_bitcount(shader->selector->info.colors_written) +
                       (shader->ps.writes_z ||
                        shader->ps.writes_stencil ||
                        shader->ps.writes_samplemask);
   }

   util_debug_message(debug, SHADER_INFO,
                      "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
                      "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
                      "Spilled VGPRs: %d PrivMem VGPRs: %d LSOutputs: %u HSOutputs: %u "
                      "HSPatchOuts: %u ESOutputs: %u GSOutputs: %u VSOutputs: %u PSOutputs: %u "
                      "InlineUniforms: %u DivergentLoop: %u (%s, W%u)",
                      conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
                      conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
                      conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
                      num_ls_outputs, num_hs_outputs,
                      util_last_bit(shader->selector->info.patch_outputs_written_for_tes),
                      num_es_outputs, num_gs_outputs, num_vs_outputs, num_ps_outputs,
                      shader->selector->info.base.num_inlinable_uniforms,
                      shader->selector->info.has_divergent_loop,
                      stages[shader->selector->stage], shader->wave_size);
}

1362 bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage,
1363 enum si_shader_dump_type dump_type)
1364 {
1365 static uint64_t filter[] = {
1366 [SI_DUMP_SHADER_KEY] = DBG(NIR) | DBG(INIT_LLVM) | DBG(LLVM) | DBG(INIT_ACO) | DBG(ACO) | DBG(ASM),
1367 [SI_DUMP_INIT_NIR] = DBG(INIT_NIR),
1368 [SI_DUMP_NIR] = DBG(NIR),
1369 [SI_DUMP_INIT_LLVM_IR] = DBG(INIT_LLVM),
1370 [SI_DUMP_LLVM_IR] = DBG(LLVM),
1371 [SI_DUMP_INIT_ACO_IR] = DBG(INIT_ACO),
1372 [SI_DUMP_ACO_IR] = DBG(ACO),
1373 [SI_DUMP_ASM] = DBG(ASM),
1374 [SI_DUMP_STATS] = DBG(STATS),
1375 [SI_DUMP_ALWAYS] = DBG(VS) | DBG(TCS) | DBG(TES) | DBG(GS) | DBG(PS) | DBG(CS),
1376 };
1377 assert(dump_type < ARRAY_SIZE(filter));
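/* Both the per-stage bit and the per-dump-type bit must be set; e.g. something
 * like AMD_DEBUG=ps,asm (assuming radeonsi's usual debug environment variable)
 * would enable SI_DUMP_ASM for fragment shaders only.
 */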
1378
1379 return sscreen->debug_flags & (1 << stage) &&
1380 sscreen->debug_flags & filter[dump_type];
1381 }
1382
1383 static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
1384 bool check_debug_option)
1385 {
1386 const struct ac_shader_config *conf = &shader->config;
1387
1388 if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
1389 fprintf(file,
1390 "*** SHADER CONFIG ***\n"
1391 "SPI_PS_INPUT_ADDR = 0x%04x\n"
1392 "SPI_PS_INPUT_ENA = 0x%04x\n",
1393 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
1394 }
1395
1396 fprintf(file,
1397 "*** SHADER STATS ***\n"
1398 "SGPRS: %d\n"
1399 "VGPRS: %d\n"
1400 "Spilled SGPRs: %d\n"
1401 "Spilled VGPRs: %d\n"
1402 "Private memory VGPRs: %d\n"
1403 "Code Size: %d bytes\n"
1404 "LDS: %d bytes\n"
1405 "Scratch: %d bytes per wave\n"
1406 "Max Waves: %d\n"
1407 "********************\n\n\n",
1408 conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
1409 shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
1410 conf->lds_size * get_lds_granularity(sscreen, shader->selector->stage),
1411 conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
1412 }
1413
1414 const char *si_get_shader_name(const struct si_shader *shader)
1415 {
1416 switch (shader->selector->stage) {
1417 case MESA_SHADER_VERTEX:
1418 if (shader->key.ge.as_es)
1419 return "Vertex Shader as ES";
1420 else if (shader->key.ge.as_ls)
1421 return "Vertex Shader as LS";
1422 else if (shader->key.ge.as_ngg)
1423 return "Vertex Shader as ESGS";
1424 else
1425 return "Vertex Shader as VS";
1426 case MESA_SHADER_TESS_CTRL:
1427 return "Tessellation Control Shader";
1428 case MESA_SHADER_TESS_EVAL:
1429 if (shader->key.ge.as_es)
1430 return "Tessellation Evaluation Shader as ES";
1431 else if (shader->key.ge.as_ngg)
1432 return "Tessellation Evaluation Shader as ESGS";
1433 else
1434 return "Tessellation Evaluation Shader as VS";
1435 case MESA_SHADER_GEOMETRY:
1436 if (shader->is_gs_copy_shader)
1437 return "GS Copy Shader as VS";
1438 else
1439 return "Geometry Shader";
1440 case MESA_SHADER_FRAGMENT:
1441 return "Pixel Shader";
1442 case MESA_SHADER_COMPUTE:
1443 return "Compute Shader";
1444 default:
1445 return "Unknown Shader";
1446 }
1447 }
1448
1449 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
1450 struct util_debug_callback *debug, FILE *file, bool check_debug_option)
1451 {
1452 gl_shader_stage stage = shader->selector->stage;
1453
1454 if (!check_debug_option || si_can_dump_shader(sscreen, stage, SI_DUMP_SHADER_KEY))
1455 si_dump_shader_key(shader, file);
1456
1457 if (!check_debug_option && shader->binary.llvm_ir_string) {
1458 /* This is only used with ddebug. */
1459 if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
1460 fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
1461 fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
1462 }
1463
1464 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
1465 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
1466 }
1467
1468 if (!check_debug_option || (si_can_dump_shader(sscreen, stage, SI_DUMP_ASM))) {
1469 fprintf(file, "\n%s:\n", si_get_shader_name(shader));
1470
1471 if (shader->prolog)
1472 si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, shader->wave_size, debug,
1473 "prolog", file);
1474 if (shader->previous_stage)
1475 si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
1476 shader->wave_size, debug, "previous stage", file);
1477 si_shader_dump_disassembly(sscreen, &shader->binary, stage, shader->wave_size, debug, "main",
1478 file);
1479
1480 if (shader->epilog)
1481 si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, shader->wave_size, debug,
1482 "epilog", file);
1483 fprintf(file, "\n");
1484
1485 si_shader_dump_stats(sscreen, shader, file, check_debug_option);
1486 }
1487 }
1488
1489 static void si_dump_shader_key_vs(const union si_shader_key *key, FILE *f)
1490 {
1491 fprintf(f, " mono.instance_divisor_is_one = %u\n", key->ge.mono.instance_divisor_is_one);
1492 fprintf(f, " mono.instance_divisor_is_fetched = %u\n",
1493 key->ge.mono.instance_divisor_is_fetched);
1494 fprintf(f, " mono.vs.fetch_opencode = %x\n", key->ge.mono.vs_fetch_opencode);
1495 fprintf(f, " mono.vs.fix_fetch = {");
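/* Each entry below prints as reverse.log_size.num_channels_m1.format; e.g.
 * "0.2.3.1" would describe a non-reversed fetch of four 4-byte channels
 * (the format value being a driver-internal enum).
 */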
1496 for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
1497 union si_vs_fix_fetch fix = key->ge.mono.vs_fix_fetch[i];
1498 if (i)
1499 fprintf(f, ", ");
1500 if (!fix.bits)
1501 fprintf(f, "0");
1502 else
1503 fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
1504 fix.u.format);
1505 }
1506 fprintf(f, "}\n");
1507 }
1508
1509 static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
1510 {
1511 const union si_shader_key *key = &shader->key;
1512 gl_shader_stage stage = shader->selector->stage;
1513
1514 fprintf(f, "SHADER KEY\n");
1515 fprintf(f, " source_blake3 = {");
1516 _mesa_blake3_print(f, shader->selector->info.base.source_blake3);
1517 fprintf(f, "}\n");
1518
1519 switch (stage) {
1520 case MESA_SHADER_VERTEX:
1521 si_dump_shader_key_vs(key, f);
1522 fprintf(f, " as_es = %u\n", key->ge.as_es);
1523 fprintf(f, " as_ls = %u\n", key->ge.as_ls);
1524 fprintf(f, " as_ngg = %u\n", key->ge.as_ngg);
1525 fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
1526 break;
1527
1528 case MESA_SHADER_TESS_CTRL:
1529 if (shader->selector->screen->info.gfx_level >= GFX9)
1530 si_dump_shader_key_vs(key, f);
1531
1532 fprintf(f, " opt.tes_prim_mode = %u\n", key->ge.opt.tes_prim_mode);
1533 fprintf(f, " opt.tes_reads_tess_factors = %u\n", key->ge.opt.tes_reads_tess_factors);
1534 fprintf(f, " opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
1535 fprintf(f, " opt.same_patch_vertices = %u\n", key->ge.opt.same_patch_vertices);
1536 break;
1537
1538 case MESA_SHADER_TESS_EVAL:
1539 fprintf(f, " as_es = %u\n", key->ge.as_es);
1540 fprintf(f, " as_ngg = %u\n", key->ge.as_ngg);
1541 fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
1542 break;
1543
1544 case MESA_SHADER_GEOMETRY:
1545 if (shader->is_gs_copy_shader)
1546 break;
1547
1548 if (shader->selector->screen->info.gfx_level >= GFX9 &&
1549 key->ge.part.gs.es->stage == MESA_SHADER_VERTEX)
1550 si_dump_shader_key_vs(key, f);
1551
1552 fprintf(f, " mono.u.gs_tri_strip_adj_fix = %u\n", key->ge.mono.u.gs_tri_strip_adj_fix);
1553 fprintf(f, " as_ngg = %u\n", key->ge.as_ngg);
1554 break;
1555
1556 case MESA_SHADER_COMPUTE:
1557 break;
1558
1559 case MESA_SHADER_FRAGMENT:
1560 fprintf(f, " prolog.color_two_side = %u\n", key->ps.part.prolog.color_two_side);
1561 fprintf(f, " prolog.flatshade_colors = %u\n", key->ps.part.prolog.flatshade_colors);
1562 fprintf(f, " prolog.poly_stipple = %u\n", key->ps.part.prolog.poly_stipple);
1563 fprintf(f, " prolog.force_persp_sample_interp = %u\n",
1564 key->ps.part.prolog.force_persp_sample_interp);
1565 fprintf(f, " prolog.force_linear_sample_interp = %u\n",
1566 key->ps.part.prolog.force_linear_sample_interp);
1567 fprintf(f, " prolog.force_persp_center_interp = %u\n",
1568 key->ps.part.prolog.force_persp_center_interp);
1569 fprintf(f, " prolog.force_linear_center_interp = %u\n",
1570 key->ps.part.prolog.force_linear_center_interp);
1571 fprintf(f, " prolog.bc_optimize_for_persp = %u\n",
1572 key->ps.part.prolog.bc_optimize_for_persp);
1573 fprintf(f, " prolog.bc_optimize_for_linear = %u\n",
1574 key->ps.part.prolog.bc_optimize_for_linear);
1575 fprintf(f, " prolog.samplemask_log_ps_iter = %u\n",
1576 key->ps.part.prolog.samplemask_log_ps_iter);
1577 fprintf(f, " prolog.get_frag_coord_from_pixel_coord = %u\n",
1578 key->ps.part.prolog.get_frag_coord_from_pixel_coord);
1579 fprintf(f, " prolog.force_samplemask_to_helper_invocation = %u\n",
1580 key->ps.part.prolog.force_samplemask_to_helper_invocation);
1581 fprintf(f, " epilog.spi_shader_col_format = 0x%x\n",
1582 key->ps.part.epilog.spi_shader_col_format);
1583 fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.part.epilog.color_is_int8);
1584 fprintf(f, " epilog.color_is_int10 = 0x%X\n", key->ps.part.epilog.color_is_int10);
1585 fprintf(f, " epilog.alpha_func = %u\n", key->ps.part.epilog.alpha_func);
1586 fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.part.epilog.alpha_to_one);
1587 fprintf(f, " epilog.alpha_to_coverage_via_mrtz = %u\n", key->ps.part.epilog.alpha_to_coverage_via_mrtz);
1588 fprintf(f, " epilog.clamp_color = %u\n", key->ps.part.epilog.clamp_color);
1589 fprintf(f, " epilog.dual_src_blend_swizzle = %u\n", key->ps.part.epilog.dual_src_blend_swizzle);
1590 fprintf(f, " epilog.rbplus_depth_only_opt = %u\n", key->ps.part.epilog.rbplus_depth_only_opt);
1591 fprintf(f, " epilog.kill_z = %u\n", key->ps.part.epilog.kill_z);
1592 fprintf(f, " epilog.kill_stencil = %u\n", key->ps.part.epilog.kill_stencil);
1593 fprintf(f, " epilog.kill_samplemask = %u\n", key->ps.part.epilog.kill_samplemask);
1594 fprintf(f, " mono.poly_line_smoothing = %u\n", key->ps.mono.poly_line_smoothing);
1595 fprintf(f, " mono.point_smoothing = %u\n", key->ps.mono.point_smoothing);
1596 fprintf(f, " mono.interpolate_at_sample_force_center = %u\n",
1597 key->ps.mono.interpolate_at_sample_force_center);
1598 fprintf(f, " mono.fbfetch_msaa = %u\n", key->ps.mono.fbfetch_msaa);
1599 fprintf(f, " mono.fbfetch_is_1D = %u\n", key->ps.mono.fbfetch_is_1D);
1600 fprintf(f, " mono.fbfetch_layered = %u\n", key->ps.mono.fbfetch_layered);
1601 break;
1602
1603 default:
1604 assert(0);
1605 }
1606
1607 if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||
1608 stage == MESA_SHADER_VERTEX) &&
1609 !key->ge.as_es && !key->ge.as_ls) {
1610 fprintf(f, " mono.remove_streamout = 0x%x\n", key->ge.mono.remove_streamout);
1611 fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs);
1612 fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances);
1613 fprintf(f, " opt.kill_pointsize = %u\n", key->ge.opt.kill_pointsize);
1614 fprintf(f, " opt.kill_layer = %u\n", key->ge.opt.kill_layer);
1615 fprintf(f, " opt.remove_streamout = %u\n", key->ge.opt.remove_streamout);
1616 fprintf(f, " opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling);
1617 fprintf(f, " opt.ngg_vs_streamout_num_verts_per_prim = %u\n",
1618 key->ge.opt.ngg_vs_streamout_num_verts_per_prim);
1619 }
1620
1621 if (stage <= MESA_SHADER_GEOMETRY)
1622 fprintf(f, " opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
1623 else
1624 fprintf(f, " opt.prefer_mono = %u\n", key->ps.opt.prefer_mono);
1625
1626 if (stage <= MESA_SHADER_GEOMETRY) {
1627 if (key->ge.opt.inline_uniforms) {
1628 fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
1629 key->ge.opt.inline_uniforms,
1630 key->ge.opt.inlined_uniform_values[0],
1631 key->ge.opt.inlined_uniform_values[1],
1632 key->ge.opt.inlined_uniform_values[2],
1633 key->ge.opt.inlined_uniform_values[3]);
1634 } else {
1635 fprintf(f, " opt.inline_uniforms = 0\n");
1636 }
1637 } else {
1638 if (key->ps.opt.inline_uniforms) {
1639 fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
1640 key->ps.opt.inline_uniforms,
1641 key->ps.opt.inlined_uniform_values[0],
1642 key->ps.opt.inlined_uniform_values[1],
1643 key->ps.opt.inlined_uniform_values[2],
1644 key->ps.opt.inlined_uniform_values[3]);
1645 } else {
1646 fprintf(f, " opt.inline_uniforms = 0\n");
1647 }
1648 }
1649 }
1650
1651 /* TODO: convert to nir_shader_instructions_pass */
1652 static bool si_nir_kill_outputs(nir_shader *nir, const union si_shader_key *key)
1653 {
1654 nir_function_impl *impl = nir_shader_get_entrypoint(nir);
1655 assert(impl);
1656 assert(nir->info.stage <= MESA_SHADER_GEOMETRY);
1657
1658 if (!key->ge.opt.kill_outputs &&
1659 !key->ge.opt.kill_pointsize &&
1660 !key->ge.opt.kill_layer &&
1661 !key->ge.opt.kill_clip_distances &&
1662 !(nir->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER))) {
1663 nir_metadata_preserve(impl, nir_metadata_all);
1664 return false;
1665 }
1666
1667 bool progress = false;
1668
1669 nir_foreach_block(block, impl) {
1670 nir_foreach_instr_safe(instr, block) {
1671 if (instr->type != nir_instr_type_intrinsic)
1672 continue;
1673
1674 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1675 if (intr->intrinsic != nir_intrinsic_store_output)
1676 continue;
1677
1678 /* No indirect indexing allowed. */
1679 ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
1680 assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);
1681
1682 assert(intr->num_components == 1); /* only scalar stores expected */
1683 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
1684
1685 if (nir_slot_is_varying(sem.location, MESA_SHADER_FRAGMENT) &&
1686 key->ge.opt.kill_outputs &
1687 (1ull << si_shader_io_get_unique_index(sem.location)))
1688 progress |= nir_remove_varying(intr, MESA_SHADER_FRAGMENT);
1689
1690 switch (sem.location) {
1691 case VARYING_SLOT_PSIZ:
1692 if (key->ge.opt.kill_pointsize)
1693 progress |= nir_remove_sysval_output(intr, MESA_SHADER_FRAGMENT);
1694 break;
1695
1696 case VARYING_SLOT_CLIP_VERTEX:
1697 /* TODO: We should only kill the specific clip planes required by kill_clip_distances,
1698 * not the whole gl_ClipVertex. Lower ClipVertex in NIR.
1699 */
1700 if ((key->ge.opt.kill_clip_distances & SI_USER_CLIP_PLANE_MASK) ==
1701 SI_USER_CLIP_PLANE_MASK)
1702 progress |= nir_remove_sysval_output(intr, MESA_SHADER_FRAGMENT);
1703 break;
1704
1705 case VARYING_SLOT_CLIP_DIST0:
1706 case VARYING_SLOT_CLIP_DIST1:
1707 if (key->ge.opt.kill_clip_distances) {
1708 assert(nir_intrinsic_src_type(intr) == nir_type_float32);
1709 unsigned index = (sem.location - VARYING_SLOT_CLIP_DIST0) * 4 +
1710 nir_intrinsic_component(intr);
1711
1712 if (key->ge.opt.kill_clip_distances & BITFIELD_BIT(index))
1713 progress |= nir_remove_sysval_output(intr, MESA_SHADER_FRAGMENT);
1714 }
1715 break;
1716
1717 case VARYING_SLOT_LAYER:
1718 /* LAYER is never passed to FS. Instead, we load it there as a system value. */
1719 progress |= nir_remove_varying(intr, MESA_SHADER_FRAGMENT);
1720
1721 if (key->ge.opt.kill_layer)
1722 progress |= nir_remove_sysval_output(intr, MESA_SHADER_FRAGMENT);
1723 break;
1724 }
1725 }
1726 }
1727
1728 if (progress) {
1729 nir_metadata_preserve(impl, nir_metadata_control_flow);
1730 } else {
1731 nir_metadata_preserve(impl, nir_metadata_all);
1732 }
1733
1734 return progress;
1735 }
1736
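/* Map an I/O semantic to its driver location: per-patch outputs (including the
 * tess levels) use the separate patch index space, everything else uses the
 * unique per-vertex slot index.
 */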
1737 static unsigned si_map_io_driver_location(unsigned semantic)
1738 {
1739 if ((semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_TESS_MAX) ||
1740 semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
1741 semantic == VARYING_SLOT_TESS_LEVEL_OUTER)
1742 return ac_shader_io_get_unique_index_patch(semantic);
1743
1744 return si_shader_io_get_unique_index(semantic);
1745 }
1746
1747 static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir)
1748 {
1749 struct si_shader_selector *sel = shader->selector;
1750 struct si_shader_selector *next_sel = shader->next_shader ? shader->next_shader->selector : sel;
1751 const union si_shader_key *key = &shader->key;
1752 const bool is_gfx9_mono_tcs = shader->is_monolithic &&
1753 next_sel->stage == MESA_SHADER_TESS_CTRL &&
1754 sel->screen->info.gfx_level >= GFX9;
1755
1756 if (nir->info.stage == MESA_SHADER_VERTEX) {
1757 if (key->ge.as_ls) {
1758 NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem,
1759 is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
1760 sel->screen->info.gfx_level,
1761 key->ge.opt.same_patch_vertices,
1762 is_gfx9_mono_tcs ? next_sel->info.tcs_inputs_via_temp : 0,
1763 is_gfx9_mono_tcs ? next_sel->info.tcs_inputs_via_lds : ~0ull);
1764 return true;
1765 } else if (key->ge.as_es) {
1766 NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
1767 sel->screen->info.gfx_level, sel->info.esgs_vertex_stride, ~0ULL);
1768 return true;
1769 }
1770 } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
1771 NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem,
1772 is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
1773 sel->screen->info.gfx_level, key->ge.opt.same_patch_vertices,
1774 sel->info.tcs_inputs_via_temp, sel->info.tcs_inputs_via_lds);
1775
1776 /* Used by hs_emit_write_tess_factors() when the shader is monolithic. */
1777 if (nir->info.tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED)
1778 nir->info.tess._primitive_mode = key->ge.opt.tes_prim_mode;
1779
1780 nir_tcs_info tcs_info;
1781 nir_gather_tcs_info(nir, &tcs_info, nir->info.tess._primitive_mode,
1782 nir->info.tess.spacing);
1783
1784 NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, &tcs_info, si_map_io_driver_location,
1785 sel->screen->info.gfx_level,
1786 ~0ULL, ~0U, /* no TES inputs filter */
1787 shader->wave_size);
1788 return true;
1789 } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
1790 NIR_PASS_V(nir, ac_nir_lower_tes_inputs_to_mem, si_map_io_driver_location);
1791
1792 if (key->ge.as_es) {
1793 NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
1794 sel->screen->info.gfx_level, sel->info.esgs_vertex_stride, ~0ULL);
1795 }
1796
1797 return true;
1798 } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
1799 NIR_PASS_V(nir, ac_nir_lower_gs_inputs_to_mem, si_map_io_driver_location,
1800 sel->screen->info.gfx_level, key->ge.mono.u.gs_tri_strip_adj_fix);
1801 return true;
1802 }
1803
1804 return false;
1805 }
1806
1807 static void si_lower_ngg(struct si_shader *shader, nir_shader *nir)
1808 {
1809 struct si_shader_selector *sel = shader->selector;
1810 const union si_shader_key *key = &shader->key;
1811 assert(key->ge.as_ngg);
1812
1813 uint8_t clip_cull_dist_mask =
1814 (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) |
1815 sel->info.culldist_mask;
1816
1817 ac_nir_lower_ngg_options options = {
1818 .hw_info = &sel->screen->info,
1819 .max_workgroup_size = si_get_max_workgroup_size(shader),
1820 .wave_size = shader->wave_size,
1821 .can_cull = si_shader_culling_enabled(shader),
1822 .disable_streamout = !si_shader_uses_streamout(shader),
1823 .vs_output_param_offset = shader->info.vs_output_param_offset,
1824 .has_param_exports = shader->info.nr_param_exports,
1825 .clip_cull_dist_mask = clip_cull_dist_mask,
1826 .kill_pointsize = key->ge.opt.kill_pointsize,
1827 .kill_layer = key->ge.opt.kill_layer,
1828 .force_vrs = sel->screen->options.vrs2x2,
1829 .use_gfx12_xfb_intrinsic = !nir->info.use_aco_amd,
1830 };
1831
1832 if (nir->info.stage == MESA_SHADER_VERTEX ||
1833 nir->info.stage == MESA_SHADER_TESS_EVAL) {
1834 /* Per-instance inputs, used to remove the instance ID load after culling. */
1835 unsigned instance_rate_inputs = 0;
1836
1837 if (nir->info.stage == MESA_SHADER_VERTEX) {
1838 instance_rate_inputs = key->ge.mono.instance_divisor_is_one |
1839 key->ge.mono.instance_divisor_is_fetched;
1840
1841 /* Manually mark the instance ID used, so the shader can repack it. */
1842 if (instance_rate_inputs)
1843 BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
1844 } else {
1845 /* Manually mark the primitive ID used, so the shader can repack it. */
1846 if (key->ge.mono.u.vs_export_prim_id)
1847 BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
1848 }
1849
1850 unsigned clip_plane_enable =
1851 SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(key->ge.opt.ngg_culling);
1852 unsigned num_vertices = si_get_num_vertices_per_output_prim(shader);
1853
1854 options.num_vertices_per_primitive = num_vertices ? num_vertices : 3;
1855 options.early_prim_export = gfx10_ngg_export_prim_early(shader);
1856 options.passthrough = gfx10_is_ngg_passthrough(shader);
1857 options.use_edgeflags = gfx10_has_variable_edgeflags(shader);
1858 options.has_gen_prim_query = options.has_xfb_prim_query =
1859 sel->screen->info.gfx_level >= GFX11 && !nir->info.vs.blit_sgprs_amd;
1860 options.export_primitive_id = key->ge.mono.u.vs_export_prim_id;
1861 options.instance_rate_inputs = instance_rate_inputs;
1862 options.user_clip_plane_enable_mask = clip_plane_enable;
1863
1864 NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options);
1865 } else {
1866 assert(nir->info.stage == MESA_SHADER_GEOMETRY);
1867
1868 options.gs_out_vtx_bytes = sel->info.gsvs_vertex_size;
1869 options.has_gen_prim_query = options.has_xfb_prim_query =
1870 sel->screen->info.gfx_level >= GFX11;
1871 options.has_gs_invocations_query = sel->screen->info.gfx_level < GFX11;
1872 options.has_gs_primitives_query = true;
1873
1874 /* For monolithic ES/GS, make sure a vscnt wait is added when the GS exports pos0. */
1875 if (key->ge.part.gs.es)
1876 nir->info.writes_memory |= key->ge.part.gs.es->info.base.writes_memory;
1877
1878 NIR_PASS_V(nir, ac_nir_lower_ngg_gs, &options);
1879 }
1880
1881 /* The NGG lowering may generate vector output stores, so scalarize them. */
1882 NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
1883 }
1884
1885 struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel)
1886 {
1887 struct pipe_screen *screen = &sel->screen->b;
1888 const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR, sel->stage);
1889
1890 struct blob_reader blob_reader;
1891 blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
1892 return nir_deserialize(NULL, options, &blob_reader);
1893 }
1894
1895 static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shader,
1896 int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS])
1897 {
1898 struct si_shader_selector *sel = shader->selector;
1899 struct si_shader_binary_info *info = &shader->info;
1900
1901 uint64_t outputs_written = 0;
1902 uint32_t outputs_written_16bit = 0;
1903
1904 nir_function_impl *impl = nir_shader_get_entrypoint(nir);
1905 assert(impl);
1906
1907 nir_foreach_block(block, impl) {
1908 nir_foreach_instr_safe(instr, block) {
1909 if (instr->type != nir_instr_type_intrinsic)
1910 continue;
1911
1912 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1913 if (intr->intrinsic != nir_intrinsic_store_output)
1914 continue;
1915
1916 /* No indirect indexing allowed. */
1917 ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
1918 assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);
1919
1920 assert(intr->num_components == 1); /* only scalar stores expected */
1921 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
1922
1923 if (sem.location >= VARYING_SLOT_VAR0_16BIT)
1924 outputs_written_16bit |= BITFIELD_BIT(sem.location - VARYING_SLOT_VAR0_16BIT);
1925 else
1926 outputs_written |= BITFIELD64_BIT(sem.location);
1927
1928 /* Assign the param index if it's unassigned. */
1929 if (nir_slot_is_varying(sem.location, MESA_SHADER_FRAGMENT) && !sem.no_varying &&
1930 (sem.gs_streams & 0x3) == 0 &&
1931 info->vs_output_param_offset[sem.location] == AC_EXP_PARAM_DEFAULT_VAL_0000) {
1932 /* The semantic and the base should be the same as in si_shader_info. */
1933 assert(sem.location == sel->info.output_semantic[nir_intrinsic_base(intr)]);
1934 /* It must not be remapped (duplicated). */
1935 assert(slot_remap[sem.location] == -1);
1936
1937 info->vs_output_param_offset[sem.location] = info->nr_param_exports++;
1938 }
1939 }
1940 }
1941
1942 /* Duplicated outputs are redirected here. */
1943 for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
1944 if (slot_remap[i] >= 0)
1945 info->vs_output_param_offset[i] = info->vs_output_param_offset[slot_remap[i]];
1946 }
1947
1948 if (shader->key.ge.mono.u.vs_export_prim_id) {
1949 info->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = info->nr_param_exports++;
1950 }
1951
1952 /* Update the outputs-written info, since some outputs may have been removed earlier. */
1953 nir->info.outputs_written = outputs_written;
1954 nir->info.outputs_written_16bit = outputs_written_16bit;
1955 }
1956
1957 static void si_assign_param_offsets(nir_shader *nir, struct si_shader *shader)
1958 {
1959 /* Initialize this first. */
1960 shader->info.nr_param_exports = 0;
1961
1962 STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
1963 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
1964 sizeof(shader->info.vs_output_param_offset));
1965
1966 /* A slot remapping table for duplicated outputs, so that 1 vertex shader output can be
1967 * mapped to multiple fragment shader inputs.
1968 */
1969 int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS];
1970 memset(slot_remap, -1, NUM_TOTAL_VARYING_SLOTS);
1971
1972 /* This sets DEFAULT_VAL for constant outputs in vs_output_param_offset. */
1973 /* TODO: This doesn't affect GS. */
1974 NIR_PASS_V(nir, ac_nir_optimize_outputs, false, slot_remap,
1975 shader->info.vs_output_param_offset);
1976
1977 /* Assign the non-constant outputs. */
1978 /* TODO: Use this for the GS copy shader too. */
1979 si_nir_assign_param_offsets(nir, shader, slot_remap);
1980 }
1981
1982 static unsigned si_get_nr_pos_exports(const struct si_shader_selector *sel,
1983 const union si_shader_key *key)
1984 {
1985 const struct si_shader_info *info = &sel->info;
1986
1987 /* Must have a position export. */
1988 unsigned nr_pos_exports = 1;
1989
1990 if ((info->writes_psize && !key->ge.opt.kill_pointsize) ||
1991 (info->writes_edgeflag && !key->ge.as_ngg) ||
1992 (info->writes_layer && !key->ge.opt.kill_layer) ||
1993 info->writes_viewport_index || sel->screen->options.vrs2x2) {
1994 nr_pos_exports++;
1995 }
1996
1997 unsigned clipdist_mask =
1998 (info->clipdist_mask & ~key->ge.opt.kill_clip_distances) | info->culldist_mask;
1999
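/* Clip/cull distances are exported in groups of 4 (CLIP_DIST0/CLIP_DIST1), so
 * e.g. a combined mask of 0x1f (5 distances) adds two more position exports.
 */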
2000 for (int i = 0; i < 2; i++) {
2001 if (clipdist_mask & BITFIELD_RANGE(i * 4, 4))
2002 nr_pos_exports++;
2003 }
2004
2005 return nr_pos_exports;
2006 }
2007
2008 static bool lower_ps_load_color_intrinsic(nir_builder *b, nir_instr *instr, void *state)
2009 {
2010 nir_def **colors = (nir_def **)state;
2011
2012 if (instr->type != nir_instr_type_intrinsic)
2013 return false;
2014
2015 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
2016
2017 if (intrin->intrinsic != nir_intrinsic_load_color0 &&
2018 intrin->intrinsic != nir_intrinsic_load_color1)
2019 return false;
2020
2021 unsigned index = intrin->intrinsic == nir_intrinsic_load_color0 ? 0 : 1;
2022 assert(colors[index]);
2023
2024 nir_def_replace(&intrin->def, colors[index]);
2025 return true;
2026 }
2027
2028 static bool si_nir_lower_ps_color_input(nir_shader *nir, const union si_shader_key *key,
2029 const struct si_shader_info *info)
2030 {
2031 bool progress = false;
2032 nir_function_impl *impl = nir_shader_get_entrypoint(nir);
2033
2034 nir_builder builder = nir_builder_at(nir_before_impl(impl));
2035 nir_builder *b = &builder;
2036
2037 /* Build ready-to-use colors at the beginning of the shader. */
2038 nir_def *colors[2] = {0};
2039 for (int i = 0; i < 2; i++) {
2040 if (!(info->colors_read & (0xf << (i * 4))))
2041 continue;
2042
2043 enum glsl_interp_mode interp_mode = info->color_interpolate[i];
2044 if (interp_mode == INTERP_MODE_COLOR) {
2045 interp_mode = key->ps.part.prolog.flatshade_colors ?
2046 INTERP_MODE_FLAT : INTERP_MODE_SMOOTH;
2047 }
2048
2049 nir_def *back_color = NULL;
2050 if (interp_mode == INTERP_MODE_FLAT) {
2051 colors[i] = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
2052 .io_semantics.location = VARYING_SLOT_COL0 + i,
2053 .io_semantics.num_slots = 1);
2054
2055 if (key->ps.part.prolog.color_two_side) {
2056 back_color = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
2057 .io_semantics.location = VARYING_SLOT_BFC0 + i,
2058 .io_semantics.num_slots = 1);
2059 }
2060 } else {
2061 nir_intrinsic_op op = 0;
2062 switch (info->color_interpolate_loc[i]) {
2063 case TGSI_INTERPOLATE_LOC_CENTER:
2064 op = nir_intrinsic_load_barycentric_pixel;
2065 break;
2066 case TGSI_INTERPOLATE_LOC_CENTROID:
2067 op = nir_intrinsic_load_barycentric_centroid;
2068 break;
2069 case TGSI_INTERPOLATE_LOC_SAMPLE:
2070 op = nir_intrinsic_load_barycentric_sample;
2071 break;
2072 default:
2073 unreachable("invalid color interpolate location");
2074 break;
2075 }
2076
2077 nir_def *barycentric = nir_load_barycentric(b, op, interp_mode);
2078
2079 colors[i] =
2080 nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0),
2081 .io_semantics.location = VARYING_SLOT_COL0 + i,
2082 .io_semantics.num_slots = 1);
2083
2084 if (key->ps.part.prolog.color_two_side) {
2085 back_color =
2086 nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0),
2087 .io_semantics.location = VARYING_SLOT_BFC0 + i,
2088 .io_semantics.num_slots = 1);
2089 }
2090 }
2091
2092 if (back_color) {
2093 nir_def *is_front_face = nir_load_front_face(b, 1);
2094 colors[i] = nir_bcsel(b, is_front_face, colors[i], back_color);
2095 }
2096
2097 progress = true;
2098 }
2099
2100 /* lower nir_load_color0/1 to use the color value. */
2101 return nir_shader_instructions_pass(nir, lower_ps_load_color_intrinsic,
2102 nir_metadata_control_flow,
2103 colors) || progress;
2104 }
2105
2106 static bool si_nir_emit_polygon_stipple(nir_shader *nir)
2107 {
2108 nir_function_impl *impl = nir_shader_get_entrypoint(nir);
2109
2110 nir_builder builder = nir_builder_at(nir_before_impl(impl));
2111 nir_builder *b = &builder;
2112
2113 /* Load the buffer descriptor. */
2114 nir_def *desc = nir_load_polygon_stipple_buffer_amd(b);
2115
2116 /* Use the fixed-point gl_FragCoord input.
2117 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
2118 * per coordinate to get the repeating effect.
2119 */
2120 nir_def *pixel_coord = nir_u2u32(b, nir_iand_imm(b, nir_load_pixel_coord(b), 0x1f));
2121
2122 nir_def *zero = nir_imm_int(b, 0);
2123 /* The stipple pattern is 32x32, each row has 32 bits. */
2124 nir_def *offset = nir_ishl_imm(b, nir_channel(b, pixel_coord, 1), 2);
2125 nir_def *row = nir_load_buffer_amd(b, 1, 32, desc, offset, zero, zero);
2126 nir_def *bit = nir_ubfe(b, row, nir_channel(b, pixel_coord, 0), nir_imm_int(b, 1));
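/* Worked example: a fragment at (35, 70) wraps to (3, 6), so we fetch the row
 * dword at byte offset 6 * 4 = 24 and test bit 3 of it.
 */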
2127
2128 nir_def *pass = nir_i2b(b, bit);
2129 nir_discard_if(b, nir_inot(b, pass));
2130
2131 nir_metadata_preserve(impl, nir_metadata_control_flow);
2132 return true;
2133 }
2134
2135 bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader)
2136 {
2137 return gl_shader_stage_is_compute(shader->info.stage) &&
2138 shader->info.shared_size > 0 && sscreen->options.clear_lds;
2139 }
2140
2141 static bool clamp_shadow_comparison_value(nir_builder *b, nir_instr *instr, void *state)
2142 {
2143 if (instr->type != nir_instr_type_tex)
2144 return false;
2145
2146 nir_tex_instr *tex = nir_instr_as_tex(instr);
2147 if (!tex->is_shadow)
2148 return false;
2149
2150 b->cursor = nir_before_instr(instr);
2151
2152 int samp_index = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
2153 int comp_index = nir_tex_instr_src_index(tex, nir_tex_src_comparator);
2154 assert(samp_index >= 0 && comp_index >= 0);
2155
2156 nir_def *sampler = tex->src[samp_index].src.ssa;
2157 nir_def *compare = tex->src[comp_index].src.ssa;
2158 /* Must have been lowered to a descriptor. */
2159 assert(sampler->num_components > 1);
2160
2161 nir_def *upgraded = nir_channel(b, sampler, 3);
2162 upgraded = nir_i2b(b, nir_ubfe_imm(b, upgraded, 29, 1));
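/* Bit 29 of the 4th sampler descriptor dword is presumably where the driver
 * stashes its "upgraded depth" flag for Z16/Z24 formats promoted to Z32_FLOAT.
 */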
2163
2164 nir_def *clamped = nir_fsat(b, compare);
2165 compare = nir_bcsel(b, upgraded, clamped, compare);
2166
2167 nir_src_rewrite(&tex->src[comp_index].src, compare);
2168 return true;
2169 }
2170
2171 static bool si_nir_clamp_shadow_comparison_value(nir_shader *nir)
2172 {
2173 /* Section 8.23.1 (Depth Texture Comparison Mode) of the
2174 * OpenGL 4.5 spec says:
2175 *
2176 * "If the texture’s internal format indicates a fixed-point
2177 * depth texture, then D_t and D_ref are clamped to the
2178 * range [0, 1]; otherwise no clamping is performed."
2179 *
2180 * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
2181 * so the depth comparison value isn't clamped for Z16 and
2182 * Z24 anymore. Do it manually here for GFX8-9; GFX10 has
2183 * an explicitly clamped 32-bit float format.
2184 */
2185 return nir_shader_instructions_pass(nir, clamp_shadow_comparison_value,
2186 nir_metadata_control_flow,
2187 NULL);
2188 }
2189
2190 static void
2191 si_init_gs_output_info(struct si_shader_info *info, struct si_gs_output_info *out_info)
2192 {
2193 for (int i = 0; i < info->num_outputs; i++) {
2194 unsigned slot = info->output_semantic[i];
2195 if (slot < VARYING_SLOT_VAR0_16BIT) {
2196 out_info->streams[slot] = info->output_streams[i];
2197 out_info->usage_mask[slot] = info->output_usagemask[i];
2198 } else {
2199 unsigned index = slot - VARYING_SLOT_VAR0_16BIT;
2200 /* TODO: 16-bit outputs need separate fields for the lo and hi parts. */
2201 out_info->streams_16bit_lo[index] = info->output_streams[i];
2202 out_info->streams_16bit_hi[index] = info->output_streams[i];
2203 out_info->usage_mask_16bit_lo[index] = info->output_usagemask[i];
2204 out_info->usage_mask_16bit_hi[index] = info->output_usagemask[i];
2205 }
2206 }
2207
2208 ac_nir_gs_output_info *ac_info = &out_info->info;
2209
2210 ac_info->streams = out_info->streams;
2211 ac_info->streams_16bit_lo = out_info->streams_16bit_lo;
2212 ac_info->streams_16bit_hi = out_info->streams_16bit_hi;
2213
2214 ac_info->sysval_mask = out_info->usage_mask;
2215 ac_info->varying_mask = out_info->usage_mask;
2216 ac_info->varying_mask_16bit_lo = out_info->usage_mask_16bit_lo;
2217 ac_info->varying_mask_16bit_hi = out_info->usage_mask_16bit_hi;
2218
2219 /* TODO: construct per-component store types for 16-bit slots. */
2220 ac_info->types_16bit_lo = ac_info->types_16bit_hi = NULL;
2221 }
2222
2223 /* Run passes that eliminate code and affect shader_info. These should be run before linking
2224 * and shader_info gathering. Lowering passes can be run here too, but only if they lead to
2225 * better code or lower undesirable representations (like derefs). Lowering passes that prevent
2226 * linking optimizations or destroy shader_info shouldn't be run here.
2227 */
2228 static bool run_pre_link_optimization_passes(struct si_nir_shader_ctx *ctx)
2229 {
2230 struct si_shader *shader = ctx->shader;
2231 struct si_shader_selector *sel = shader->selector;
2232 const union si_shader_key *key = &shader->key;
2233 nir_shader *nir = ctx->nir;
2234 bool progress = false;
2235
2236 /* Kill outputs according to the shader key. */
2237 if (nir->info.stage <= MESA_SHADER_GEOMETRY)
2238 NIR_PASS(progress, nir, si_nir_kill_outputs, key);
2239
2240 bool inline_uniforms = false;
2241 uint32_t *inlined_uniform_values;
2242 si_get_inline_uniform_state((union si_shader_key*)key, nir->info.stage,
2243 &inline_uniforms, &inlined_uniform_values);
2244
2245 if (inline_uniforms) {
2246 /* Most places use shader information from the default variant, not
2247 * the optimized variant. These are the things that the driver looks at
2248 * in optimized variants and the list of things that we need to do.
2249 *
2250 * The driver takes into account these things if they suddenly disappear
2251 * from the shader code:
2252 * - Register usage and code size decrease (obvious)
2253 * - Eliminated PS system values are disabled
2254 * - VS/TES/GS param exports are eliminated if they are undef.
2255 * The param space for eliminated outputs is also not allocated.
2256 * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
2257 * - TCS output stores are eliminated
2258 * - Eliminated PS inputs are removed from PS.NUM_INTERP.
2259 *
2260 * TODO: These are things the driver ignores in the final shader code
2261 * and relies on the default shader info.
2262 * - System values in VS, TCS, TES, GS are not eliminated
2263 * - uses_discard - if it changed to false
2264 * - writes_memory - if it changed to false
2265 * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not
2266 * eliminated
2267 * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS)
2268 * GS outputs are eliminated except for the temporary LDS.
2269 * Clip distances, gl_PointSize, gl_Layer and PS outputs are eliminated based
2270 * on current states, so we don't care about the shader code.
2271 *
2272 * TODO: Merged shaders don't inline uniforms for the first stage.
2273 * VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS.
2274 * (key == NULL for the first stage here)
2275 *
2276 * TODO: Compute shaders don't support inlinable uniforms, because they
2277 * don't have shader variants.
2278 *
2279 * TODO: The driver uses a linear search to find a shader variant. This
2280 * can be really slow if we get too many variants due to uniform inlining.
2281 */
2282 NIR_PASS_V(nir, nir_inline_uniforms, nir->info.num_inlinable_uniforms,
2283 inlined_uniform_values, nir->info.inlinable_uniform_dw_offsets);
2284 progress = true;
2285 }
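/* Illustrative example: with inlined_uniform_values[0] == 4, a loop bound like
 * "i < some_uniform" becomes "i < 4", letting si_nir_opts below unroll the
 * loop and fold the result.
 */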
2286
2287 NIR_PASS(progress, nir, nir_opt_shrink_stores, false);
2288
2289 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
2290 /* This uses the prolog/epilog keys, so only monolithic shaders can call this. */
2291 if (shader->is_monolithic) {
2292 /* This lowers load_color intrinsics to COLn/BFCn input loads and two-side color
2293 * selection.
2294 */
2295 if (sel->info.colors_read)
2296 NIR_PASS(progress, nir, si_nir_lower_ps_color_input, &shader->key, &sel->info);
2297
2298 /* This adds discard and barycentrics. */
2299 if (key->ps.mono.point_smoothing)
2300 NIR_PASS(progress, nir, nir_lower_point_smooth, true);
2301
2302 /* This eliminates system values and unused shader output components. */
2303 ac_nir_lower_ps_early_options early_options = {
2304 .force_center_interp_no_msaa = key->ps.part.prolog.force_persp_center_interp ||
2305 key->ps.part.prolog.force_linear_center_interp ||
2306 key->ps.part.prolog.force_samplemask_to_helper_invocation ||
2307 key->ps.mono.interpolate_at_sample_force_center,
2308 .load_sample_positions_always_loads_current_ones = true,
2309 .force_front_face = key->ps.opt.force_front_face_input,
2310 .optimize_frag_coord = true,
2311 .frag_coord_is_center = true,
2312 /* This does a lot of things. See the description in ac_nir_lower_ps_early_options. */
2313 .ps_iter_samples = key->ps.part.prolog.samplemask_log_ps_iter ?
2314 (1 << key->ps.part.prolog.samplemask_log_ps_iter) :
2315 (key->ps.part.prolog.force_persp_sample_interp ||
2316 key->ps.part.prolog.force_linear_sample_interp ? 2 :
2317 (key->ps.part.prolog.get_frag_coord_from_pixel_coord ? 1 : 0)),
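/* Rough reading of the above: samplemask_log_ps_iter is log2 of the requested
 * ps_iter_samples (e.g. 3 -> 8 samples); otherwise forced per-sample
 * interpolation implies at least 2.
 */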
2318
2319 .fbfetch_is_1D = key->ps.mono.fbfetch_is_1D,
2320 .fbfetch_layered = key->ps.mono.fbfetch_layered,
2321 .fbfetch_msaa = key->ps.mono.fbfetch_msaa,
2322 .fbfetch_apply_fmask = sel->screen->info.gfx_level < GFX11 &&
2323 !(sel->screen->debug_flags & DBG(NO_FMASK)),
2324
2325 .clamp_color = key->ps.part.epilog.clamp_color,
2326 .alpha_test_alpha_to_one = key->ps.part.epilog.alpha_to_one,
2327 .alpha_func = key->ps.part.epilog.alpha_func,
2328 .keep_alpha_for_mrtz = key->ps.part.epilog.alpha_to_coverage_via_mrtz,
2329 .spi_shader_col_format_hint = key->ps.part.epilog.spi_shader_col_format,
2330 .kill_z = key->ps.part.epilog.kill_z,
2331 .kill_stencil = key->ps.part.epilog.kill_stencil,
2332 .kill_samplemask = key->ps.part.epilog.kill_samplemask,
2333 };
2334
2335 NIR_PASS(progress, nir, ac_nir_lower_ps_early, &early_options);
2336
2337 /* This adds gl_SampleMaskIn. It must be after ac_nir_lower_ps_early that lowers
2338 * sample_mask_in to load_helper_invocation because we only want to do that for user
2339 * shaders while keeping the real sample mask for smoothing, which is produced using
2340 * MSAA overrasterization over a single-sample color buffer.
2341 */
2342 if (key->ps.mono.poly_line_smoothing)
2343 NIR_PASS(progress, nir, nir_lower_poly_line_smooth, SI_NUM_SMOOTH_AA_SAMPLES);
2344
2345 /* This adds discard. */
2346 if (key->ps.part.prolog.poly_stipple)
2347 NIR_PASS(progress, nir, si_nir_emit_polygon_stipple);
2348 } else {
2349 ac_nir_lower_ps_early_options early_options = {
2350 .optimize_frag_coord = true,
2351 .frag_coord_is_center = true,
2352 .alpha_func = COMPARE_FUNC_ALWAYS,
2353 .spi_shader_col_format_hint = ~0,
2354 };
2355 NIR_PASS(progress, nir, ac_nir_lower_ps_early, &early_options);
2356 }
2357 }
2358
2359 if (progress) {
2360 si_nir_opts(sel->screen, nir, true);
2361 progress = false;
2362 }
2363
2364 /* Remove dead temps before we lower indirect indexing. */
2365 NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
2366
2367 /* Lower indirect indexing last.
2368 *
2369 * Shader variant optimizations (such as uniform inlining, replacing barycentrics, and IO
2370 * elimination) can help eliminate indirect indexing, so this should be done after that.
2371 *
2372 * Note that the code can still contain tautologies such as "array1[i] == array2[i]" when
2373 * array1 and array2 have provably equal values (NIR doesn't have a pass that can do that),
2374 * which NIR can optimize only after we lower indirect indexing, so it's important that
2375 * we lower it before we gather shader_info.
2376 */
2377
2378 /* Lower indirect indexing of large constant arrays to the load_constant intrinsic, which
2379 * will be turned into PC-relative loads from a data section next to the shader.
2380 */
2381 NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
2382
2383 /* Lower all other indirect indexing to if-else ladders or scratch. */
2384 progress |= ac_nir_lower_indirect_derefs(nir, sel->screen->info.gfx_level);
2385 return progress;
2386 }
2387
2388 /* Late optimization passes and lowering passes. The majority of lowering passes are here.
2389 * These passes should have no impact on linking optimizations and shouldn't affect shader_info
2390 * (those should be run before this) because any changes in shader_info won't be reflected
2391 * in hw registers from now on.
2392 */
2393 static void run_late_optimization_and_lowering_passes(struct si_nir_shader_ctx *ctx,
2394 bool progress)
2395 {
2396 struct si_shader *shader = ctx->shader;
2397 struct si_shader_selector *sel = shader->selector;
2398 const union si_shader_key *key = &shader->key;
2399 nir_shader *nir = ctx->nir;
2400
2401 si_init_shader_args(shader, &ctx->args, &nir->info);
2402
2403 if (nir->info.stage == MESA_SHADER_FRAGMENT)
2404 NIR_PASS(progress, nir, nir_lower_fragcoord_wtrans);
2405
2406 NIR_PASS(progress, nir, ac_nir_lower_tex,
2407 &(ac_nir_lower_tex_options){
2408 .gfx_level = sel->screen->info.gfx_level,
2409 .lower_array_layer_round_even = !sel->screen->info.conformant_trunc_coord,
2410 });
2411
2412 if (nir->info.uses_resource_info_query)
2413 NIR_PASS(progress, nir, ac_nir_lower_resinfo, sel->screen->info.gfx_level);
2414
2415 /* This must be before si_nir_lower_resource. */
2416 if (!sel->screen->info.has_image_opcodes)
2417 NIR_PASS(progress, nir, ac_nir_lower_image_opcodes);
2418
2419 /* LLVM doesn't handle this well, so it's lowered via a waterfall loop in the LLVM backend instead. */
2420 if (nir->info.use_aco_amd && sel->info.has_non_uniform_tex_access) {
2421 nir_lower_non_uniform_access_options options = {
2422 .types = nir_lower_non_uniform_texture_access,
2423 };
2424 NIR_PASS(progress, nir, nir_lower_non_uniform_access, &options);
2425 }
2426
2427 /* Legacy GS is not the last VGT stage because there is also the GS copy shader. */
2428 bool is_last_vgt_stage =
2429 (nir->info.stage == MESA_SHADER_VERTEX ||
2430 nir->info.stage == MESA_SHADER_TESS_EVAL ||
2431 (nir->info.stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)) &&
2432 !shader->key.ge.as_ls && !shader->key.ge.as_es;
2433
2434 if (nir->info.stage == MESA_SHADER_VERTEX)
2435 NIR_PASS(progress, nir, si_nir_lower_vs_inputs, shader, &ctx->args);
2436
2437 progress |= si_lower_io_to_mem(shader, nir);
2438
2439 if (is_last_vgt_stage) {
2440 /* Assign param export indices. */
2441 si_assign_param_offsets(nir, shader);
2442
2443 /* Assign num of position exports. */
2444 shader->info.nr_pos_exports = si_get_nr_pos_exports(sel, key);
2445
2446 if (key->ge.as_ngg) {
2447 /* Lower last VGT NGG shader stage. */
2448 si_lower_ngg(shader, nir);
2449 } else if (nir->info.stage == MESA_SHADER_VERTEX ||
2450 nir->info.stage == MESA_SHADER_TESS_EVAL) {
2451 /* Lower the last VGT non-NGG VS/TES shader stage. */
2452 unsigned clip_cull_mask =
2453 (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) |
2454 sel->info.culldist_mask;
2455
2456 NIR_PASS_V(nir, ac_nir_lower_legacy_vs,
2457 sel->screen->info.gfx_level,
2458 clip_cull_mask,
2459 shader->info.vs_output_param_offset,
2460 shader->info.nr_param_exports,
2461 shader->key.ge.mono.u.vs_export_prim_id,
2462 !si_shader_uses_streamout(shader),
2463 key->ge.opt.kill_pointsize,
2464 key->ge.opt.kill_layer,
2465 sel->screen->options.vrs2x2);
2466 }
2467 progress = true;
2468 } else if (nir->info.stage == MESA_SHADER_GEOMETRY && !key->ge.as_ngg) {
2469 si_init_gs_output_info(&sel->info, &ctx->legacy_gs_output_info);
2470 NIR_PASS_V(nir, ac_nir_lower_legacy_gs, false, sel->screen->use_ngg,
2471 &ctx->legacy_gs_output_info.info);
2472 progress = true;
2473 } else if (nir->info.stage == MESA_SHADER_FRAGMENT && shader->is_monolithic) {
2474 ac_nir_lower_ps_late_options late_options = {
2475 .gfx_level = sel->screen->info.gfx_level,
2476 .family = sel->screen->info.family,
2477 .use_aco = nir->info.use_aco_amd,
2478 .bc_optimize_for_persp = key->ps.part.prolog.bc_optimize_for_persp,
2479 .bc_optimize_for_linear = key->ps.part.prolog.bc_optimize_for_linear,
2480 .uses_discard = si_shader_uses_discard(shader),
2481 .alpha_to_coverage_via_mrtz = key->ps.part.epilog.alpha_to_coverage_via_mrtz,
2482 .dual_src_blend_swizzle = key->ps.part.epilog.dual_src_blend_swizzle,
2483 .spi_shader_col_format = key->ps.part.epilog.spi_shader_col_format,
2484 .color_is_int8 = key->ps.part.epilog.color_is_int8,
2485 .color_is_int10 = key->ps.part.epilog.color_is_int10,
2486 .alpha_to_one = key->ps.part.epilog.alpha_to_one,
2487 };
2488
2489 NIR_PASS(progress, nir, ac_nir_lower_ps_late, &late_options);
2490 }
2491
2492 assert(shader->wave_size == 32 || shader->wave_size == 64);
2493
2494 NIR_PASS(progress, nir, nir_lower_subgroups,
2495 &(struct nir_lower_subgroups_options) {
2496 .subgroup_size = shader->wave_size,
2497 .ballot_bit_size = shader->wave_size,
2498 .ballot_components = 1,
2499 .lower_to_scalar = true,
2500 .lower_subgroup_masks = true,
2501 .lower_relative_shuffle = true,
2502 .lower_rotate_to_shuffle = !nir->info.use_aco_amd,
2503 .lower_shuffle_to_32bit = true,
2504 .lower_vote_eq = true,
2505 .lower_vote_bool_eq = true,
2506 .lower_quad_broadcast_dynamic = true,
2507 .lower_quad_broadcast_dynamic_to_const = sel->screen->info.gfx_level <= GFX7,
2508 .lower_shuffle_to_swizzle_amd = true,
2509 .lower_ballot_bit_count_to_mbcnt_amd = true,
2510 .lower_inverse_ballot = !nir->info.use_aco_amd && LLVM_VERSION_MAJOR < 17,
2511 .lower_boolean_reduce = nir->info.use_aco_amd,
2512 .lower_boolean_shuffle = true,
2513 });
2514
2515 NIR_PASS(progress, nir, nir_lower_pack);
2516 NIR_PASS(progress, nir, nir_opt_idiv_const, 8);
2517 NIR_PASS(progress, nir, nir_lower_idiv,
2518 &(nir_lower_idiv_options){
2519 .allow_fp16 = sel->screen->info.gfx_level >= GFX9,
2520 });
2521
2522 if (si_should_clear_lds(sel->screen, nir)) {
2523 const unsigned chunk_size = 16; /* max single store size */
2524 const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
2525 NIR_PASS_V(nir, nir_clear_shared_memory, shared_size, chunk_size);
2526 }
2527
2528 NIR_PASS_V(nir, nir_divergence_analysis); /* required by ac_nir_flag_smem_for_loads */
2529 NIR_PASS(progress, nir, ac_nir_flag_smem_for_loads, sel->screen->info.gfx_level,
2530 !sel->info.base.use_aco_amd, true);
2531 NIR_PASS(progress, nir, nir_lower_io_to_scalar,
2532 nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_shared | nir_var_mem_global,
2533 ac_nir_scalarize_overfetching_loads_callback, &sel->screen->info.gfx_level);
2534 NIR_PASS(progress, nir, si_nir_lower_resource, shader, &ctx->args);
2535
2536 if (progress) {
2537 si_nir_opts(sel->screen, nir, false);
2538 progress = false;
2539 }
2540
2541 NIR_PASS(progress, nir, nir_opt_load_store_vectorize,
2542 &(nir_load_store_vectorize_options){
2543 .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_shared | nir_var_mem_global |
2544 nir_var_shader_temp,
2545 .callback = ac_nir_mem_vectorize_callback,
2546 .cb_data = &(struct ac_nir_config){sel->screen->info.gfx_level, sel->info.base.use_aco_amd},
2547 /* On GFX6, read2/write2 is out-of-bounds if the offset register is negative, even if
2548 * the final offset is not.
2549 */
2550 .has_shared2_amd = sel->screen->info.gfx_level >= GFX7,
2551 });
2552 NIR_PASS(progress, nir, ac_nir_lower_mem_access_bit_sizes,
2553 sel->screen->info.gfx_level, !nir->info.use_aco_amd);
2554
2555 if (nir->info.stage == MESA_SHADER_KERNEL) {
2556 NIR_PASS(progress, nir, ac_nir_lower_global_access);
2557
2558 if (nir->info.bit_sizes_int & (8 | 16)) {
2559 if (sel->screen->info.gfx_level >= GFX8)
2560 nir_divergence_analysis(nir);
2561
2562 NIR_PASS(progress, nir, nir_lower_bit_size, ac_nir_lower_bit_size_callback,
2563 &sel->screen->info.gfx_level);
2564 }
2565 }
2566
2567 /* This must be after lowering resources to descriptor loads and before lowering intrinsics
2568 * to args and lowering int64.
2569 */
2570 if (nir->info.use_aco_amd)
2571 progress |= ac_nir_optimize_uniform_atomics(nir);
2572
2573 NIR_PASS(progress, nir, nir_lower_int64);
2574 NIR_PASS(progress, nir, si_nir_lower_abi, shader, &ctx->args);
2575 NIR_PASS(progress, nir, ac_nir_lower_intrinsics_to_args, sel->screen->info.gfx_level,
2576 sel->screen->info.has_ls_vgpr_init_bug,
2577 si_select_hw_stage(nir->info.stage, key, sel->screen->info.gfx_level),
2578 shader->wave_size, si_get_max_workgroup_size(shader), &ctx->args.ac);
2579
2580 /* LLVM keeps non-uniform samplers as indices, so this can't be done in NIR there.
2581 * Must be done after si_nir_lower_resource().
2582 */
2583 if (nir->info.use_aco_amd && sel->info.has_shadow_comparison &&
2584 sel->screen->info.gfx_level >= GFX8 && sel->screen->info.gfx_level <= GFX9) {
2585 NIR_PASS(progress, nir, si_nir_clamp_shadow_comparison_value);
2586 }
2587
2588 if (progress) {
2589 si_nir_opts(sel->screen, nir, false);
2590 progress = false;
2591 }
2592
2593 static const nir_opt_offsets_options offset_options = {
2594 .uniform_max = 0,
2595 .buffer_max = ~0,
2596 .shared_max = ~0,
2597 };
2598 NIR_PASS_V(nir, nir_opt_offsets, &offset_options);
2599
2600 si_nir_late_opts(nir);
2601
2602 NIR_PASS(progress, nir, nir_opt_sink,
2603 nir_move_const_undef | nir_move_copies | nir_move_alu | nir_move_comparisons |
2604 nir_move_load_ubo | nir_move_load_ssbo);
2605 NIR_PASS(progress, nir, nir_opt_move,
2606 nir_move_const_undef | nir_move_copies | nir_move_alu | nir_move_comparisons |
2607 nir_move_load_ubo);
2608 /* Run nir_opt_move again to make sure that comparisons are as close as possible to the first
2609 * use to prevent SCC spilling.
2610 */
2611 NIR_PASS(progress, nir, nir_opt_move, nir_move_comparisons);
2612
2613 /* This must be done after si_nir_late_opts() because it may generate vector constants. */
2614 NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
2615
2616 /* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
2617 * 200 is tuned for Viewperf. It should be done last.
2618 */
2619 NIR_PASS_V(nir, nir_group_loads, nir_group_same_resource_only, 200);
2620 }
2621
2622 static void get_input_nir(struct si_shader *shader, struct si_nir_shader_ctx *ctx)
2623 {
2624 struct si_shader_selector *sel = shader->selector;
2625
2626 ctx->shader = shader;
2627 ctx->free_nir = !sel->nir && sel->nir_binary;
2628 ctx->nir = sel->nir ? sel->nir : (sel->nir_binary ? si_deserialize_shader(sel) : NULL);
2629 assert(ctx->nir);
2630
2631 if (unlikely(should_print_nir(ctx->nir))) {
2632 /* Modify the shader's name so that each variant gets its own name. */
2633 ctx->nir->info.name = ralloc_asprintf(ctx->nir, "%s-%08x", ctx->nir->info.name,
2634 _mesa_hash_data(&shader->key, sizeof(shader->key)));
2635
2636 /* Dummy pass to get the starting point. */
2637 printf("nir_dummy_pass\n");
2638 nir_print_shader(ctx->nir, stdout);
2639 }
2640 }
2641
2642 static void get_prev_stage_input_nir(struct si_shader *shader, struct si_linked_shaders *linked)
2643 {
2644 const union si_shader_key *key = &shader->key;
2645
2646 if (shader->selector->stage == MESA_SHADER_TESS_CTRL) {
2647 linked->producer_shader.selector = key->ge.part.tcs.ls;
2648 linked->producer_shader.key.ge.as_ls = 1;
2649 } else {
2650 linked->producer_shader.selector = key->ge.part.gs.es;
2651 linked->producer_shader.key.ge.as_es = 1;
2652 linked->producer_shader.key.ge.as_ngg = key->ge.as_ngg;
2653 }
2654
2655 linked->producer_shader.next_shader = shader;
2656 linked->producer_shader.key.ge.mono = key->ge.mono;
2657 linked->producer_shader.key.ge.opt = key->ge.opt;
2658 linked->producer_shader.key.ge.opt.inline_uniforms = false; /* only TCS/GS can inline uniforms */
2659 /* kill_outputs was computed based on the second shader's outputs, so we can't use it
2660 * to kill the first shader's outputs.
2661 */
2662 linked->producer_shader.key.ge.opt.kill_outputs = 0;
2663 linked->producer_shader.is_monolithic = true;
2664 linked->producer_shader.wave_size = shader->wave_size;
2665
2666 get_input_nir(&linked->producer_shader, &linked->producer);
2667 }
2668
2669 static void si_set_spi_ps_input_config_for_separate_prolog(struct si_shader *shader)
2670 {
2671 const union si_shader_key *key = &shader->key;
2672
2673 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
2674 if (key->ps.part.prolog.poly_stipple)
2675 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
2676
2677 /* Set up the enable bits for per-sample shading if needed. */
2678 if (key->ps.part.prolog.force_persp_sample_interp &&
2679 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2680 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2681 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
2682 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2683 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
2684 }
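/* The three blocks below apply the same replacement pattern for the linear
 * sample, persp center and linear center overrides respectively.
 */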
2685 if (key->ps.part.prolog.force_linear_sample_interp &&
2686 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2687 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2688 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
2689 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2690 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
2691 }
2692 if (key->ps.part.prolog.force_persp_center_interp &&
2693 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2694 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2695 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
2696 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2697 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2698 }
2699 if (key->ps.part.prolog.force_linear_center_interp &&
2700 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2701 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2702 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
2703 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2704 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2705 }
2706
2707 /* The sample mask fixup requires the sample ID. */
2708 if (key->ps.part.prolog.samplemask_log_ps_iter)
2709 shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
2710
2711 if (key->ps.part.prolog.force_samplemask_to_helper_invocation) {
2712 assert(key->ps.part.prolog.samplemask_log_ps_iter == 0);
2713 assert(!key->ps.mono.poly_line_smoothing);
2714 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
2715 }
2716
2717 /* The sample mask fixup has an optimization that replaces the sample mask with the sample ID. */
2718 if (key->ps.part.prolog.samplemask_log_ps_iter == 3)
2719 shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
2720
2721 if (key->ps.part.prolog.get_frag_coord_from_pixel_coord) {
2722 shader->config.spi_ps_input_ena &= C_0286CC_POS_X_FLOAT_ENA;
2723 shader->config.spi_ps_input_ena &= C_0286CC_POS_Y_FLOAT_ENA;
2724 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
2725 }
2726 }
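
/* Example of the override pattern above: a shader that only enabled
 * PERSP_CENTER (spi_ps_input_ena = 0x2) with force_persp_sample_interp set
 * ends up requesting PERSP_SAMPLE instead (spi_ps_input_ena = 0x1). The
 * separate prolog then presumably feeds the per-sample weights into the
 * center/centroid inputs that the main shader part reads.
 */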

static void si_fixup_spi_ps_input_config(struct si_shader *shader)
{
   /* POS_W_FLOAT requires that one of the perspective weights is enabled. */
   if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
       !(shader->config.spi_ps_input_ena & 0xf)) {
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
   }

   /* At least one pair of interpolation weights must be enabled. */
   if (!(shader->config.spi_ps_input_ena & 0x7f))
      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
}
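
/* For reference, the low bits of SPI_PS_INPUT_ENA used by the masks above:
 *
 *    bit 0  PERSP_SAMPLE       bit 4  LINEAR_SAMPLE
 *    bit 1  PERSP_CENTER       bit 5  LINEAR_CENTER
 *    bit 2  PERSP_CENTROID     bit 6  LINEAR_CENTROID
 *    bit 3  PERSP_PULL_MODEL
 *
 * so 0xf tests the perspective weights and 0x7f tests all interpolation
 * weights, which is what the two fixups check.
 */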

static void
si_get_shader_variant_info(struct si_shader *shader, nir_shader *nir)
{
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
   assert(shader->selector->info.base.use_aco_amd == nir->info.use_aco_amd);
   const BITSET_WORD *sysvals = nir->info.system_values_read;

   /* ACO needs spi_ps_input_ena before si_init_shader_args. */
   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      /* Find out which frag coord components are used. */
      uint8_t frag_coord_mask = 0;

      /* Since flat+convergent and non-flat components can occur in the same vec4, start with
       * all PS inputs as flat and change them to smooth when we find a component that's
       * interpolated.
       */
      for (unsigned i = 0; i < ARRAY_SIZE(shader->info.ps_inputs); i++)
         shader->info.ps_inputs[i].interpolate = INTERP_MODE_FLAT;

      nir_foreach_block(block, nir_shader_get_entrypoint(nir)) {
         nir_foreach_instr(instr, block) {
            if (instr->type == nir_instr_type_intrinsic) {
               nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

               switch (intr->intrinsic) {
               case nir_intrinsic_load_frag_coord:
               case nir_intrinsic_load_sample_pos:
                  frag_coord_mask |= nir_def_components_read(&intr->def);
                  break;
               case nir_intrinsic_load_input:
               case nir_intrinsic_load_interpolated_input: {
                  nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
                  unsigned index = nir_intrinsic_base(intr);
                  assert(sem.num_slots == 1);

                  shader->info.num_ps_inputs = MAX2(shader->info.num_ps_inputs, index + 1);
                  shader->info.ps_inputs[index].semantic = sem.location;
                  /* Determine the interpolation mode. This only cares about FLAT/SMOOTH/COLOR.
                   * COLOR is only for nir_intrinsic_load_color0/1.
                   */
                  if (intr->intrinsic == nir_intrinsic_load_interpolated_input) {
                     shader->info.ps_inputs[index].interpolate = INTERP_MODE_SMOOTH;
                     if (intr->def.bit_size == 16)
                        shader->info.ps_inputs[index].fp16_lo_hi_valid |= 0x1 << sem.high_16bits;
                  }
                  break;
               }
               case nir_intrinsic_load_color0:
                  assert(!shader->is_monolithic);
                  shader->info.ps_colors_read |= nir_def_components_read(&intr->def);
                  break;
               case nir_intrinsic_load_color1:
                  assert(!shader->is_monolithic);
                  shader->info.ps_colors_read |= nir_def_components_read(&intr->def) << 4;
                  break;
               default:
                  break;
               }
            }
         }
      }

      /* Add both front and back color inputs. */
      if (!shader->is_monolithic) {
         unsigned index = shader->info.num_ps_inputs;

         for (unsigned back = 0; back < 2; back++) {
            for (unsigned i = 0; i < 2; i++) {
               if ((shader->info.ps_colors_read >> (i * 4)) & 0xf) {
                  assert(index < ARRAY_SIZE(shader->info.ps_inputs));
                  shader->info.ps_inputs[index].semantic =
                     (back ? VARYING_SLOT_BFC0 : VARYING_SLOT_COL0) + i;

                  enum glsl_interp_mode mode = i ? nir->info.fs.color1_interp
                                                 : nir->info.fs.color0_interp;
                  shader->info.ps_inputs[index].interpolate =
                     mode == INTERP_MODE_NONE ? INTERP_MODE_COLOR : mode;
                  index++;

                  /* Back-face colors don't increment num_ps_inputs. si_emit_spi_map will use
                   * back-face colors conditionally only when needed.
                   */
                  if (!back)
                     shader->info.num_ps_inputs++;
               }
            }
         }
      }

      shader->config.spi_ps_input_ena =
         S_0286CC_PERSP_SAMPLE_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_BARYCENTRIC_PERSP_SAMPLE)) |
         S_0286CC_PERSP_CENTER_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL)) |
         S_0286CC_PERSP_CENTROID_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID)) |
         S_0286CC_LINEAR_SAMPLE_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_BARYCENTRIC_LINEAR_SAMPLE)) |
         S_0286CC_LINEAR_CENTER_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL)) |
         S_0286CC_LINEAR_CENTROID_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID)) |
         S_0286CC_POS_X_FLOAT_ENA(!!(frag_coord_mask & 0x1)) |
         S_0286CC_POS_Y_FLOAT_ENA(!!(frag_coord_mask & 0x2)) |
         S_0286CC_POS_Z_FLOAT_ENA(!!(frag_coord_mask & 0x4)) |
         S_0286CC_POS_W_FLOAT_ENA(!!(frag_coord_mask & 0x8)) |
         S_0286CC_FRONT_FACE_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_FRONT_FACE) |
                                 BITSET_TEST(sysvals, SYSTEM_VALUE_FRONT_FACE_FSIGN)) |
         S_0286CC_ANCILLARY_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_SAMPLE_ID) |
                                BITSET_TEST(sysvals, SYSTEM_VALUE_LAYER_ID)) |
         S_0286CC_SAMPLE_COVERAGE_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_SAMPLE_MASK_IN)) |
         S_0286CC_POS_FIXED_PT_ENA(BITSET_TEST(sysvals, SYSTEM_VALUE_PIXEL_COORD));

      if (shader->is_monolithic) {
         si_fixup_spi_ps_input_config(shader);
         shader->config.spi_ps_input_addr = shader->config.spi_ps_input_ena;
      } else {
         /* In part mode, si_fixup_spi_ps_input_config() is called later, when the
          * shader parts are combined in si_shader_select_ps_parts().
          *
          * Reserve register locations for VGPR inputs the PS prolog may need.
          */
         shader->config.spi_ps_input_addr = shader->config.spi_ps_input_ena |
                                            SI_SPI_PS_INPUT_ADDR_FOR_PROLOG;
      }
   }
}
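
/* Note on the two registers set above: SPI_PS_INPUT_ENA controls which input
 * VGPRs the hardware actually loads, while SPI_PS_INPUT_ADDR controls how the
 * input VGPRs are laid out. ADDR must therefore be a superset of ENA (also
 * asserted in si_shader_select_ps_parts), and non-monolithic shaders reserve
 * extra ADDR slots for inputs that only the prolog needs.
 */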

static void get_nir_shaders(struct si_shader *shader, struct si_linked_shaders *linked)
{
   memset(linked, 0, sizeof(*linked));
   get_input_nir(shader, &linked->consumer);

   if (shader->selector->screen->info.gfx_level >= GFX9 && shader->is_monolithic &&
       (shader->selector->stage == MESA_SHADER_TESS_CTRL ||
        shader->selector->stage == MESA_SHADER_GEOMETRY))
      get_prev_stage_input_nir(shader, linked);

   bool progress[SI_NUM_LINKED_SHADERS] = {0};

   for (unsigned i = 0; i < SI_NUM_LINKED_SHADERS; i++) {
      if (linked->shader[i].nir) {
         progress[i] = run_pre_link_optimization_passes(&linked->shader[i]);
      }
   }

   /* TODO: run linking optimizations here if we have LS+HS or ES+GS */

   /* TODO: gather shader_info here */
   if (shader->selector->stage <= MESA_SHADER_GEOMETRY) {
      shader->info.uses_instanceid |=
         shader->key.ge.mono.instance_divisor_is_one ||
         shader->key.ge.mono.instance_divisor_is_fetched;

      if (linked->producer.nir) {
         shader->info.uses_instanceid |=
            linked->producer.shader->selector->info.uses_instanceid ||
            linked->producer.shader->info.uses_instanceid;
      }
   }

   if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
      if (progress[1]) {
         si_nir_opts(shader->selector->screen, linked->consumer.nir, false);
         progress[1] = false;
      }

      /* Remove holes after removed PS inputs by renumbering them. Holes can only occur with
       * monolithic PS.
       */
      if (shader->is_monolithic)
         NIR_PASS_V(linked->consumer.nir, nir_recompute_io_bases, nir_var_shader_in);

      si_get_shader_variant_info(shader, linked->consumer.nir);
   }

   for (unsigned i = 0; i < SI_NUM_LINKED_SHADERS; i++) {
      if (linked->shader[i].nir) {
         run_late_optimization_and_lowering_passes(&linked->shader[i], progress[i]);
      }
   }

   /* TODO: gather this where other shader_info is gathered */
   for (unsigned i = 0; i < SI_NUM_LINKED_SHADERS; i++) {
      if (linked->shader[i].nir) {
         struct si_shader_info info;
         si_nir_scan_shader(shader->selector->screen, linked->shader[i].nir, &info, true);

         shader->info.uses_vmem_load_other |= info.uses_vmem_load_other;
         shader->info.uses_vmem_sampler_or_bvh |= info.uses_vmem_sampler_or_bvh;
      }
   }
}

/* Generate code for the hardware VS shader stage to go with a geometry shader */
static struct si_shader *
si_nir_generate_gs_copy_shader(struct si_screen *sscreen,
                               struct ac_llvm_compiler *compiler,
                               struct si_shader *gs_shader,
                               nir_shader *gs_nir,
                               struct util_debug_callback *debug,
                               ac_nir_gs_output_info *output_info)
{
   struct si_shader *shader;
   struct si_shader_selector *gs_selector = gs_shader->selector;
   struct si_shader_info *gsinfo = &gs_selector->info;
   union si_shader_key *gskey = &gs_shader->key;

   shader = CALLOC_STRUCT(si_shader);
   if (!shader)
      return NULL;

   /* We can leave the fence as permanently signaled because the GS copy
    * shader only becomes visible globally after it has been compiled. */
   util_queue_fence_init(&shader->ready);

   shader->selector = gs_selector;
   shader->is_gs_copy_shader = true;
   shader->wave_size = si_determine_wave_size(sscreen, shader);

   STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
   memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
          sizeof(shader->info.vs_output_param_offset));

   for (unsigned i = 0; i < gsinfo->num_outputs; i++) {
      unsigned semantic = gsinfo->output_semantic[i];

      /* Skip if no channel writes to stream 0. */
      if (!nir_slot_is_varying(semantic, MESA_SHADER_FRAGMENT) ||
          (gsinfo->output_streams[i] & 0x03 &&
           gsinfo->output_streams[i] & 0x0c &&
           gsinfo->output_streams[i] & 0x30 &&
           gsinfo->output_streams[i] & 0xc0))
         continue;

      shader->info.vs_output_param_offset[semantic] = shader->info.nr_param_exports++;
   }
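
   /* output_streams[i] packs one 2-bit stream index per channel:
    *
    *    stream(chan) = (output_streams[i] >> (2 * chan)) & 0x3
    *
    * so "no channel writes to stream 0" means all four 2-bit fields are
    * non-zero, which is exactly what the four mask tests above check.
    */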

   shader->info.nr_pos_exports = si_get_nr_pos_exports(gs_selector, gskey);

   unsigned clip_cull_mask =
      (gsinfo->clipdist_mask & ~gskey->ge.opt.kill_clip_distances) | gsinfo->culldist_mask;

   nir_shader *nir =
      ac_nir_create_gs_copy_shader(gs_nir,
                                   sscreen->info.gfx_level,
                                   clip_cull_mask,
                                   shader->info.vs_output_param_offset,
                                   shader->info.nr_param_exports,
                                   !si_shader_uses_streamout(gs_shader),
                                   gskey->ge.opt.kill_pointsize,
                                   gskey->ge.opt.kill_layer,
                                   sscreen->options.vrs2x2,
                                   output_info);

   struct si_linked_shaders linked;
   memset(&linked, 0, sizeof(linked));
   linked.consumer.nir = nir;

   si_init_shader_args(shader, &linked.consumer.args, &gs_nir->info);

   NIR_PASS_V(nir, si_nir_lower_abi, shader, &linked.consumer.args);
   NIR_PASS_V(nir, ac_nir_lower_intrinsics_to_args, sscreen->info.gfx_level,
              sscreen->info.has_ls_vgpr_init_bug, AC_HW_VERTEX_SHADER, 64, 64,
              &linked.consumer.args.ac);

   si_nir_opts(gs_selector->screen, nir, false);

   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);

   if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY, SI_DUMP_NIR)) {
      fprintf(stderr, "GS Copy Shader:\n");
      nir_print_shader(nir, stderr);
   }

   bool ok =
#if AMD_LLVM_AVAILABLE
      !gs_nir->info.use_aco_amd ? si_llvm_compile_shader(sscreen, compiler, shader, &linked, debug) :
#endif
      si_aco_compile_shader(shader, &linked, debug);

   if (ok) {
      assert(!shader->config.scratch_bytes_per_wave);
      ok = si_shader_binary_upload(sscreen, shader, 0) >= 0;
      si_shader_dump(sscreen, shader, debug, stderr, true);
   }
   ralloc_free(nir);

   if (!ok) {
      FREE(shader);
      shader = NULL;
   } else {
      si_fix_resource_usage(sscreen, shader);
   }
   return shader;
}

static void
debug_message_stderr(void *data, unsigned *id, enum util_debug_type ptype,
                     const char *fmt, va_list args)
{
   vfprintf(stderr, fmt, args);
   fprintf(stderr, "\n");
}

static void
determine_shader_variant_info(struct si_screen *sscreen, struct si_shader *shader)
{
   struct si_shader_selector *sel = shader->selector;

   if (sel->stage == MESA_SHADER_FRAGMENT) {
      shader->ps.writes_z = sel->info.writes_z && !shader->key.ps.part.epilog.kill_z;
      shader->ps.writes_stencil = sel->info.writes_stencil &&
                                  !shader->key.ps.part.epilog.kill_stencil;
      shader->ps.writes_samplemask = sel->info.writes_samplemask &&
                                     !shader->key.ps.part.epilog.kill_samplemask;
   }
}

bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                       struct si_shader *shader, struct util_debug_callback *debug)
{
   bool ret = true;
   struct si_shader_selector *sel = shader->selector;

   determine_shader_variant_info(sscreen, shader);

   struct si_linked_shaders linked;
   get_nir_shaders(shader, &linked);
   nir_shader *nir = linked.consumer.nir;

   /* Dump NIR before doing NIR->LLVM conversion in case the
    * conversion fails. */
   if (si_can_dump_shader(sscreen, nir->info.stage, SI_DUMP_NIR)) {
      nir_print_shader(nir, stderr);

      if (nir->xfb_info)
         nir_print_xfb_info(nir->xfb_info, stderr);
   }

   /* Initialize vs_output_ps_input_cntl to default. */
   for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++)
      shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED;
   shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0;

   /* uses_instanceid may be set by si_nir_lower_vs_inputs(). */
   shader->info.uses_instanceid |= sel->info.uses_instanceid;
   shader->info.private_mem_vgprs = DIV_ROUND_UP(nir->scratch_size, 4);

   /* Set the FP ALU behavior.
    * By default, we disable denormals for FP32 and enable them for FP16 and FP64
    * for performance and correctness reasons. FP32 denormals can't be enabled because
    * they break output modifiers and v_mad_f32 and are very slow on GFX6-7.
    *
    * float_controls_execution_mode defines the set of valid behaviors. Contradicting flags
    * can be set simultaneously, which nominally lets us choose, but in practice some
    * choices cause GLCTS failures.
    */
   unsigned float_mode = V_00B028_FP_16_64_DENORMS;

   if (!(nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) &&
       nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
      float_mode |= V_00B028_FP_32_ROUND_TOWARDS_ZERO;

   if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
                                                    FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64)) &&
       nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
                                                  FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
      float_mode |= V_00B028_FP_16_64_ROUND_TOWARDS_ZERO;

   if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_PRESERVE_FP16 |
                                                    FLOAT_CONTROLS_DENORM_PRESERVE_FP64)) &&
       nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
                                                  FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64))
      float_mode &= ~V_00B028_FP_16_64_DENORMS;

   ret =
#if AMD_LLVM_AVAILABLE
      !nir->info.use_aco_amd ? si_llvm_compile_shader(sscreen, compiler, shader, &linked, debug) :
#endif
      si_aco_compile_shader(shader, &linked, debug);

   if (!ret)
      goto out;

   shader->config.float_mode = float_mode;

   /* The GS copy shader is compiled next. */
   if (nir->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
      shader->gs_copy_shader =
         si_nir_generate_gs_copy_shader(sscreen, compiler, shader, nir, debug,
                                        &linked.consumer.legacy_gs_output_info.info);
      if (!shader->gs_copy_shader) {
         fprintf(stderr, "radeonsi: can't create GS copy shader\n");
         ret = false;
         goto out;
      }
   }

   /* Compute vs_output_ps_input_cntl. */
   if ((nir->info.stage == MESA_SHADER_VERTEX ||
        nir->info.stage == MESA_SHADER_TESS_EVAL ||
        nir->info.stage == MESA_SHADER_GEOMETRY) &&
       !shader->key.ge.as_ls && !shader->key.ge.as_es) {
      uint8_t *vs_output_param_offset = shader->info.vs_output_param_offset;

      if (nir->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg)
         vs_output_param_offset = shader->gs_copy_shader->info.vs_output_param_offset;

      /* We must use the original shader info from before the removal of duplicated
       * shader outputs. VS and TES should also set the primitive ID output if it's used.
       */
      unsigned num_outputs_with_prim_id = sel->info.num_outputs +
                                          shader->key.ge.mono.u.vs_export_prim_id;

      for (unsigned i = 0; i < num_outputs_with_prim_id; i++) {
         unsigned semantic = sel->info.output_semantic[i];
         unsigned offset = vs_output_param_offset[semantic];
         unsigned ps_input_cntl;

         if (offset <= AC_EXP_PARAM_OFFSET_31) {
            /* The input is loaded from parameter memory. */
            ps_input_cntl = S_028644_OFFSET(offset);
         } else {
            /* The input is a DEFAULT_VAL constant. */
            assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
                   offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
            offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;

            /* OFFSET=0x20 means that DEFAULT_VAL is used. */
            ps_input_cntl = S_028644_OFFSET(0x20) |
                            S_028644_DEFAULT_VAL(offset);
         }

         shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl;
      }
   }

   /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */
   if (gl_shader_stage_is_compute(nir->info.stage)) {
      unsigned max_vgprs =
         sscreen->info.num_physical_wave64_vgprs_per_simd * (shader->wave_size == 32 ? 2 : 1);
      unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
      unsigned max_sgprs_per_wave = 128;
      unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
      unsigned threads_per_tg = si_get_max_workgroup_size(shader);
      unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, shader->wave_size);
      unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);

      max_vgprs = max_vgprs / waves_per_simd;
      max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);

      if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
         fprintf(stderr,
                 "LLVM failed to compile a shader correctly: "
                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
                 shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);

         /* Just terminate the process, because dependent
          * shaders can hang due to bad input data, but use
          * the env var to allow shader-db to work.
          */
         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
            abort();
      }
   }
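
   /* Worked example of the occupancy math above (illustrative numbers; the
    * real per-chip values come from sscreen->info): with wave64, a
    * 1024-thread workgroup and 256 physical wave64 VGPRs per SIMD:
    *
    *    waves_per_tg   = DIV_ROUND_UP(1024, 64) = 16
    *    waves_per_simd = DIV_ROUND_UP(16, 4)    = 4
    *    max_vgprs      = 256 / 4                = 64
    *
    * i.e. each wave may use at most 64 VGPRs, or the workgroup can't fit and
    * the compiled shader is considered broken.
    */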

   /* Add/remove the scratch offset to/from input SGPRs. */
   if (!sel->screen->info.has_scratch_base_registers &&
       !si_is_merged_shader(shader)) {
      if (nir->info.use_aco_amd) {
         /* With ACO, the scratch_offset arg is added explicitly at the beginning.
          * After compilation, if no scratch is used, reduce the input SGPR count.
          */
         if (!shader->config.scratch_bytes_per_wave)
            shader->info.num_input_sgprs--;
      } else {
         /* The scratch_offset arg is added implicitly by LLVM. */
         if (shader->info.num_input_sgprs)
            shader->info.num_input_sgprs++;
      }
   }

   /* Calculate the number of fragment input VGPRs. */
   if (nir->info.stage == MESA_SHADER_FRAGMENT)
      shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(&shader->config);

   si_calculate_max_simd_waves(shader);

   if (si_can_dump_shader(sscreen, nir->info.stage, SI_DUMP_STATS)) {
      struct util_debug_callback out_stderr = {
         .debug_message = debug_message_stderr,
      };

      si_shader_dump_stats_for_shader_db(sscreen, shader, &out_stderr);
   } else {
      si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
   }

out:
   for (unsigned i = 0; i < SI_NUM_LINKED_SHADERS; i++) {
      if (linked.shader[i].free_nir)
         ralloc_free(linked.shader[i].nir);
   }

   return ret;
}

/**
 * Create, compile and return a shader part (prolog or epilog).
 *
 * \param sscreen  screen
 * \param list     list of shader parts of the same category
 * \param stage    shader stage
 * \param prolog   whether the part being requested is a prolog
 * \param key      shader part key
 * \param compiler LLVM compiler
 * \param debug    debug callback
 * \param name     part name for debug output
 * \return         non-NULL on success
 */
static struct si_shader_part *
si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
                   gl_shader_stage stage, bool prolog, union si_shader_part_key *key,
                   struct ac_llvm_compiler *compiler, struct util_debug_callback *debug,
                   const char *name)
{
   struct si_shader_part *result;

   simple_mtx_lock(&sscreen->shader_parts_mutex);

   /* Find an existing one. */
   for (result = *list; result; result = result->next) {
      if (memcmp(&result->key, key, sizeof(*key)) == 0) {
         simple_mtx_unlock(&sscreen->shader_parts_mutex);
         return result;
      }
   }

   /* Compile a new one. */
   result = CALLOC_STRUCT(si_shader_part);
   result->key = *key;

   bool ok =
#if AMD_LLVM_AVAILABLE
      !(sscreen->use_aco ||
        (stage == MESA_SHADER_FRAGMENT &&
         ((prolog && key->ps_prolog.use_aco) ||
          (!prolog && key->ps_epilog.use_aco)))) ?
         si_llvm_build_shader_part(sscreen, stage, prolog, compiler, debug, name, result) :
#endif
      si_aco_build_shader_part(sscreen, stage, prolog, debug, name, result);

   if (ok) {
      result->next = *list;
      *list = result;
   } else {
      FREE(result);
      result = NULL;
   }

   simple_mtx_unlock(&sscreen->shader_parts_mutex);
   return result;
}
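
/* Note: memcmp() is a valid key equality test above because every part key is
 * fully memset to zero before its fields are written (see e.g.
 * si_get_ps_prolog_key and si_get_ps_epilog_key), so padding bytes always
 * compare equal.
 */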

/**
 * Select and compile (or reuse) TCS parts (epilog).
 */
static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                                       struct si_shader *shader, struct util_debug_callback *debug)
{
   if (sscreen->info.gfx_level >= GFX9) {
      assert(shader->wave_size == 32 || shader->wave_size == 64);
      unsigned index = shader->wave_size / 32 - 1;
      shader->previous_stage = shader->key.ge.part.tcs.ls->main_shader_part_ls[index];
   }

   return true;
}
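
/* The wave-size indexing used above (and in the NGG case below) maps
 * wave_size 32 -> index 0 and wave_size 64 -> index 1, selecting the
 * precompiled main shader part that matches the variant's wave size.
 */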

/**
 * Select and compile (or reuse) GS parts (prolog).
 */
static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                                      struct si_shader *shader, struct util_debug_callback *debug)
{
   if (sscreen->info.gfx_level >= GFX9) {
      if (shader->key.ge.as_ngg) {
         assert(shader->wave_size == 32 || shader->wave_size == 64);
         unsigned index = shader->wave_size / 32 - 1;
         shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_ngg_es[index];
      } else {
         shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_es;
      }
   }

   return true;
}

/**
 * Compute the PS prolog key, which contains all the information needed to
 * build the PS prolog function, and set related bits in shader->config.
 */
static void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key)
{
   struct si_shader_info *info = &shader->selector->info;

   memset(key, 0, sizeof(*key));
   key->ps_prolog.states = shader->key.ps.part.prolog;
   key->ps_prolog.use_aco = info->base.use_aco_amd;
   key->ps_prolog.wave32 = shader->wave_size == 32;
   key->ps_prolog.colors_read = shader->info.ps_colors_read;
   key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
   key->ps_prolog.wqm =
      info->base.fs.needs_quad_helper_invocations &&
      (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
       key->ps_prolog.states.force_linear_sample_interp ||
       key->ps_prolog.states.force_persp_center_interp ||
       key->ps_prolog.states.force_linear_center_interp ||
       key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear ||
       key->ps_prolog.states.samplemask_log_ps_iter ||
       key->ps_prolog.states.get_frag_coord_from_pixel_coord ||
       key->ps_prolog.states.force_samplemask_to_helper_invocation);
   key->ps_prolog.fragcoord_usage_mask =
      G_0286CC_POS_X_FLOAT_ENA(shader->config.spi_ps_input_ena) |
      (G_0286CC_POS_Y_FLOAT_ENA(shader->config.spi_ps_input_ena) << 1) |
      (G_0286CC_POS_Z_FLOAT_ENA(shader->config.spi_ps_input_ena) << 2) |
      (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) << 3);
   key->ps_prolog.pixel_center_integer = key->ps_prolog.fragcoord_usage_mask &&
                                         shader->selector->info.base.fs.pixel_center_integer;

   if (shader->key.ps.part.prolog.poly_stipple)
      shader->info.uses_vmem_load_other = true;

   if (shader->info.ps_colors_read) {
      uint8_t *color = shader->selector->info.color_attr_index;

      if (shader->key.ps.part.prolog.color_two_side) {
         /* BCOLORs are stored after the last input. */
         key->ps_prolog.num_interp_inputs = shader->info.num_ps_inputs;
         shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
      }

      for (unsigned i = 0; i < 2; i++) {
         unsigned interp = info->color_interpolate[i];
         unsigned location = info->color_interpolate_loc[i];

         if (!(shader->info.ps_colors_read & (0xf << i * 4)))
            continue;

         key->ps_prolog.color_attr_index[i] = color[i];

         if (shader->key.ps.part.prolog.flatshade_colors && interp == INTERP_MODE_COLOR)
            interp = INTERP_MODE_FLAT;

         switch (interp) {
         case INTERP_MODE_FLAT:
            key->ps_prolog.color_interp_vgpr_index[i] = -1;
            break;
         case INTERP_MODE_SMOOTH:
         case INTERP_MODE_COLOR:
            /* Force the interpolation location for colors here. */
            if (shader->key.ps.part.prolog.force_persp_sample_interp)
               location = TGSI_INTERPOLATE_LOC_SAMPLE;
            if (shader->key.ps.part.prolog.force_persp_center_interp)
               location = TGSI_INTERPOLATE_LOC_CENTER;

            switch (location) {
            case TGSI_INTERPOLATE_LOC_SAMPLE:
               key->ps_prolog.color_interp_vgpr_index[i] = 0;
               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTER:
               key->ps_prolog.color_interp_vgpr_index[i] = 2;
               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTROID:
               key->ps_prolog.color_interp_vgpr_index[i] = 4;
               shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
               break;
            default:
               assert(0);
            }
            break;
         case INTERP_MODE_NOPERSPECTIVE:
            /* Force the interpolation location for colors here. */
            if (shader->key.ps.part.prolog.force_linear_sample_interp)
               location = TGSI_INTERPOLATE_LOC_SAMPLE;
            if (shader->key.ps.part.prolog.force_linear_center_interp)
               location = TGSI_INTERPOLATE_LOC_CENTER;

            /* The VGPR assignment for non-monolithic shaders
             * works because InitialPSInputAddr is set on the
             * main shader and PERSP_PULL_MODEL is never used.
             */
            switch (location) {
            case TGSI_INTERPOLATE_LOC_SAMPLE:
               key->ps_prolog.color_interp_vgpr_index[i] = 6;
               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTER:
               key->ps_prolog.color_interp_vgpr_index[i] = 8;
               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
               break;
            case TGSI_INTERPOLATE_LOC_CENTROID:
               key->ps_prolog.color_interp_vgpr_index[i] = 10;
               shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
               break;
            default:
               assert(0);
            }
            break;
         default:
            assert(0);
         }
      }
   }
}
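
/* For reference, color_interp_vgpr_index selects the first VGPR of an (i,j)
 * barycentric pair in the fragment input VGPR order set up in
 * si_get_ps_prolog_args():
 *
 *    persp  sample/center/centroid -> 0 / 2 / 4
 *    linear sample/center/centroid -> 6 / 8 / 10
 *
 * and -1 means the color is flat-shaded and needs no weights.
 */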

/**
 * Check whether a PS prolog is required based on the key.
 */
static bool si_need_ps_prolog(const union si_shader_part_key *key)
{
   return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
          key->ps_prolog.states.force_linear_sample_interp ||
          key->ps_prolog.states.force_persp_center_interp ||
          key->ps_prolog.states.force_linear_center_interp ||
          key->ps_prolog.states.bc_optimize_for_persp ||
          key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
          key->ps_prolog.states.samplemask_log_ps_iter ||
          key->ps_prolog.states.get_frag_coord_from_pixel_coord ||
          key->ps_prolog.states.force_samplemask_to_helper_invocation;
}

/**
 * Compute the PS epilog key, which contains all the information needed to
 * build the PS epilog function.
 */
static void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
{
   struct si_shader_info *info = &shader->selector->info;

   memset(key, 0, sizeof(*key));
   key->ps_epilog.use_aco = info->base.use_aco_amd;
   key->ps_epilog.wave32 = shader->wave_size == 32;
   key->ps_epilog.uses_discard = si_shader_uses_discard(shader);
   key->ps_epilog.colors_written = info->colors_written;
   key->ps_epilog.color_types = info->output_color_types;
   key->ps_epilog.writes_all_cbufs = info->color0_writes_all_cbufs &&
                                     /* Check whether a non-zero color buffer is bound. */
                                     !!(shader->key.ps.part.epilog.spi_shader_col_format & 0xfffffff0);
   key->ps_epilog.writes_z = info->writes_z;
   key->ps_epilog.writes_stencil = info->writes_stencil;
   key->ps_epilog.writes_samplemask = info->writes_samplemask;
   key->ps_epilog.states = shader->key.ps.part.epilog;
}
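
/* spi_shader_col_format holds one 4-bit export format per color buffer
 * (MRT0 in bits 0-3, MRT1 in bits 4-7, ...), so the 0xfffffff0 test above
 * reads as "some color buffer other than MRT0 has a non-NULL export format".
 */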

/**
 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
 */
static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                                      struct si_shader *shader, struct util_debug_callback *debug)
{
   union si_shader_part_key prolog_key;
   union si_shader_part_key epilog_key;

   /* Get the prolog. */
   si_get_ps_prolog_key(shader, &prolog_key);

   /* The prolog is a no-op if these aren't set. */
   if (si_need_ps_prolog(&prolog_key)) {
      shader->prolog =
         si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key,
                            compiler, debug, "Fragment Shader Prolog");
      if (!shader->prolog)
         return false;
   }

   /* Get the epilog. */
   si_get_ps_epilog_key(shader, &epilog_key);

   shader->epilog =
      si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key,
                         compiler, debug, "Fragment Shader Epilog");
   if (!shader->epilog)
      return false;

   si_set_spi_ps_input_config_for_separate_prolog(shader);
   si_fixup_spi_ps_input_config(shader);

   /* Make sure the spi_ps_input_addr bits are a superset of spi_ps_input_ena. */
   unsigned spi_ps_input_ena = shader->config.spi_ps_input_ena;
   unsigned spi_ps_input_addr = shader->config.spi_ps_input_addr;
   assert((spi_ps_input_ena & spi_ps_input_addr) == spi_ps_input_ena);

   return true;
}

void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
{
   /* If tessellation is all offchip and on-chip GS isn't used, this
    * workaround is not needed.
    */
   return;

   /* SPI barrier management bug:
    * Make sure we have at least 4k of LDS in use to avoid the bug.
    * It applies to workgroup sizes of more than one wavefront.
    */
   if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
      *lds_size = MAX2(*lds_size, 8);
}
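
/* Reading of the disabled code above: lds_size appears to be in 512-byte
 * granules on these chips, so MAX2(*lds_size, 8) would guarantee the 4 KB
 * minimum mentioned in the comment. The early return disables the workaround
 * entirely because the preconditions no longer occur.
 */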

static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
{
   unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */

   shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);

   if (shader->selector->stage == MESA_SHADER_COMPUTE &&
       si_get_max_workgroup_size(shader) > shader->wave_size) {
      si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
   }
}

bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
                              struct si_shader *shader, struct util_debug_callback *debug)
{
   struct si_shader_selector *sel = shader->selector;
   struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key, shader->wave_size);

   /* LS, ES, VS are compiled on demand if the main part hasn't been
    * compiled for that stage.
    *
    * GS are compiled on demand if the main part hasn't been compiled
    * for the chosen NGG-ness.
    *
    * Vertex shaders are compiled on demand when a vertex fetch
    * workaround must be applied.
    */
   if (shader->is_monolithic) {
      /* Monolithic shader (compiled as a whole, has many variants,
       * may take a long time to compile).
       */
      if (!si_compile_shader(sscreen, compiler, shader, debug))
         return false;
   } else {
      /* The shader consists of several parts:
       *
       * - the middle part is the user shader, it has 1 variant only
       *   and it was compiled during the creation of the shader
       *   selector
       * - the prolog part is inserted at the beginning
       * - the epilog part is inserted at the end
       *
       * The prolog and epilog have many (but simple) variants.
       *
       * Starting with gfx9, geometry and tessellation control
       * shaders also contain the prolog and user shader parts of
       * the previous shader stage.
       */

      if (!mainp)
         return false;

      determine_shader_variant_info(sscreen, shader);

      /* Copy the compiled shader data over. */
      shader->is_binary_shared = true;
      shader->binary = mainp->binary;
      shader->config = mainp->config;
      shader->info = mainp->info;

      /* Select prologs and/or epilogs. */
      switch (sel->stage) {
      case MESA_SHADER_TESS_CTRL:
         if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
            return false;
         break;
      case MESA_SHADER_GEOMETRY:
         if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
            return false;

         /* Clone the GS copy shader for the shader variant.
          * We can't just copy the pointer because we change the pm4 state and
          * si_shader_selector::gs_copy_shader must be immutable because it's shared
          * by multiple contexts.
          */
         if (!shader->key.ge.as_ngg) {
            assert(mainp->gs_copy_shader);
            assert(mainp->gs_copy_shader->bo);
            assert(!mainp->gs_copy_shader->previous_stage_sel);
            assert(!mainp->gs_copy_shader->scratch_va);

            shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
            memcpy(shader->gs_copy_shader, mainp->gs_copy_shader,
                   sizeof(*shader->gs_copy_shader));
            /* Increase the reference count. */
            pipe_reference(NULL, &shader->gs_copy_shader->bo->b.b.reference);
            /* Initialize some fields differently. */
            shader->gs_copy_shader->shader_log = NULL;
            shader->gs_copy_shader->is_binary_shared = true;
            util_queue_fence_init(&shader->gs_copy_shader->ready);
         }
         break;
      case MESA_SHADER_FRAGMENT:
         if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
            return false;

         /* Make sure we have at least as many VGPRs as there
          * are allocated inputs.
          */
         shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
         break;
      default:;
      }

      assert(shader->wave_size == mainp->wave_size);
      assert(!shader->previous_stage || shader->wave_size == shader->previous_stage->wave_size);

      /* Update SGPR and VGPR counts. */
      if (shader->prolog) {
         shader->config.num_sgprs =
            MAX2(shader->config.num_sgprs, shader->prolog->num_sgprs);
         shader->config.num_vgprs =
            MAX2(shader->config.num_vgprs, shader->prolog->num_vgprs);
      }
      if (shader->previous_stage) {
         shader->config.num_sgprs =
            MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
         shader->config.num_vgprs =
            MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
         shader->config.spilled_sgprs =
            MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
         shader->config.spilled_vgprs =
            MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
         shader->info.private_mem_vgprs =
            MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
         shader->config.scratch_bytes_per_wave =
            MAX2(shader->config.scratch_bytes_per_wave,
                 shader->previous_stage->config.scratch_bytes_per_wave);
         shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;
         shader->info.uses_vmem_load_other |= shader->previous_stage->info.uses_vmem_load_other;
         shader->info.uses_vmem_sampler_or_bvh |= shader->previous_stage->info.uses_vmem_sampler_or_bvh;
      }
      if (shader->epilog) {
         shader->config.num_sgprs =
            MAX2(shader->config.num_sgprs, shader->epilog->num_sgprs);
         shader->config.num_vgprs =
            MAX2(shader->config.num_vgprs, shader->epilog->num_vgprs);
      }
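
      /* Rationale for the MAX2() merging above: the prolog, previous-stage
       * part, main part and epilog execute back-to-back in the same wave, so
       * the wave must be launched with the worst-case register demand of any
       * of its parts.
       */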
      si_calculate_max_simd_waves(shader);
   }

   if (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
      assert(!shader->key.ge.as_es && !shader->key.ge.as_ls);
      if (!gfx10_ngg_calculate_subgroup_info(shader)) {
         fprintf(stderr, "Failed to compute subgroup info\n");
         return false;
      }
   } else if (sscreen->info.gfx_level >= GFX9 && sel->stage == MESA_SHADER_GEOMETRY) {
      gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
   }

   shader->uses_vs_state_provoking_vertex =
      sscreen->use_ngg &&
      /* Used to convert triangle strips from GS to triangles. */
      ((sel->stage == MESA_SHADER_GEOMETRY &&
        util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) ||
       (sel->stage == MESA_SHADER_VERTEX &&
        /* Used to export PrimitiveID from the correct vertex. */
        shader->key.ge.mono.u.vs_export_prim_id));

   shader->uses_gs_state_outprim = sscreen->use_ngg &&
                                   /* Only used by streamout and the PrimID export in vertex
                                    * shaders. */
                                   sel->stage == MESA_SHADER_VERTEX &&
                                   (si_shader_uses_streamout(shader) ||
                                    shader->uses_vs_state_provoking_vertex);

   if (sel->stage == MESA_SHADER_VERTEX) {
      shader->uses_base_instance = sel->info.uses_base_instance ||
                                   shader->key.ge.mono.instance_divisor_is_one ||
                                   shader->key.ge.mono.instance_divisor_is_fetched;
   } else if (sel->stage == MESA_SHADER_TESS_CTRL) {
      shader->uses_base_instance = shader->previous_stage_sel &&
                                   (shader->previous_stage_sel->info.uses_base_instance ||
                                    shader->key.ge.mono.instance_divisor_is_one ||
                                    shader->key.ge.mono.instance_divisor_is_fetched);
   } else if (sel->stage == MESA_SHADER_GEOMETRY) {
      shader->uses_base_instance = shader->previous_stage_sel &&
                                   (shader->previous_stage_sel->info.uses_base_instance ||
                                    shader->key.ge.mono.instance_divisor_is_one ||
                                    shader->key.ge.mono.instance_divisor_is_fetched);
   }

   si_fix_resource_usage(sscreen, shader);

   /* Upload. */
   bool ok = si_shader_binary_upload(sscreen, shader, 0) >= 0;

   shader->complete_shader_binary_size = si_get_shader_binary_size(sscreen, shader);

   si_shader_dump(sscreen, shader, debug, stderr, true);

   if (!ok)
      fprintf(stderr, "LLVM failed to upload shader\n");
   return ok;
}

void si_shader_binary_clean(struct si_shader_binary *binary)
{
   free((void *)binary->code_buffer);
   binary->code_buffer = NULL;

   free(binary->llvm_ir_string);
   binary->llvm_ir_string = NULL;

   free((void *)binary->symbols);
   binary->symbols = NULL;

   free(binary->uploaded_code);
   binary->uploaded_code = NULL;
   binary->uploaded_code_size = 0;
}

void si_shader_destroy(struct si_shader *shader)
{
   si_resource_reference(&shader->bo, NULL);

   if (!shader->is_binary_shared)
      si_shader_binary_clean(&shader->binary);

   free(shader->shader_log);
}

void si_get_ps_prolog_args(struct si_shader_args *args,
                           const union si_shader_part_key *key)
{
   memset(args, 0, sizeof(*args));

   const unsigned num_input_sgprs = key->ps_prolog.num_input_sgprs;

   struct ac_arg input_sgprs[num_input_sgprs];
   for (unsigned i = 0; i < num_input_sgprs; i++)
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, input_sgprs + i);

   args->internal_bindings = input_sgprs[SI_SGPR_INTERNAL_BINDINGS];
   /* Use the absolute location of the input. */
   args->ac.prim_mask = input_sgprs[SI_PS_NUM_USER_SGPR];

   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_sample);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_center);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_centroid);
   /* skip PERSP_PULL_MODEL */
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_sample);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_center);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_centroid);
   /* skip LINE_STIPPLE_TEX */

   /* POS_X|Y|Z|W_FLOAT */
   u_foreach_bit(i, key->ps_prolog.fragcoord_usage_mask)
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[i]);

   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.front_face);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.ancillary);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.sample_coverage);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.pos_fixed_pt);
}
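
/* The VGPR declaration order above mirrors the SPI_PS_INPUT_ADDR bit order
 * (PERSP_* pairs, then LINEAR_* pairs, frag position, front face, ancillary,
 * sample coverage, fixed-point position), skipping PERSP_PULL_MODEL and
 * LINE_STIPPLE_TEX, which the prolog never enables. This is also why the
 * color_interp_vgpr_index values in si_get_ps_prolog_key() are 0/2/4/6/8/10.
 */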

void si_get_ps_epilog_args(struct si_shader_args *args,
                           const union si_shader_part_key *key,
                           struct ac_arg colors[MAX_DRAW_BUFFERS],
                           struct ac_arg *depth, struct ac_arg *stencil,
                           struct ac_arg *sample_mask)
{
   memset(args, 0, sizeof(*args));

   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &args->alpha_reference);

   u_foreach_bit (i, key->ps_epilog.colors_written) {
      ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_FLOAT, colors + i);
   }

   if (key->ps_epilog.writes_z)
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, depth);

   if (key->ps_epilog.writes_stencil)
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, stencil);

   if (key->ps_epilog.writes_samplemask)
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, sample_mask);
}