/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_shader.h"
#include "ac_nir.h"
#include "ac_rtld.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_serialize.h"
#include "nir_xfb_info.h"
#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"
#include "tgsi/tgsi_from_mesa.h"
#include "util/u_memory.h"
#include "util/mesa-sha1.h"
#include "util/ralloc.h"
#include "util/u_upload_mgr.h"

#if LLVM_AVAILABLE
#include <llvm/Config/llvm-config.h> /* for LLVM_VERSION_MAJOR */
#else
#define LLVM_VERSION_MAJOR 0
#endif

static const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";

static const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";

static void si_dump_shader_key(const struct si_shader *shader, FILE *f);
static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader);

/* Get the number of all interpolated inputs */
unsigned si_get_ps_num_interp(struct si_shader *ps)
{
   unsigned num_colors = !!(ps->info.ps_colors_read & 0x0f) + !!(ps->info.ps_colors_read & 0xf0);
   unsigned num_interp =
      ps->info.num_ps_inputs + (ps->key.ps.part.prolog.color_two_side ? num_colors : 0);

   assert(num_interp <= 32);
   return MIN2(num_interp, 32);
}

/** Whether the shader runs as a combination of multiple API shaders */
bool si_is_multi_part_shader(struct si_shader *shader)
{
   if (shader->selector->screen->info.gfx_level <= GFX8 ||
       shader->selector->stage > MESA_SHADER_GEOMETRY)
      return false;

   return shader->key.ge.as_ls || shader->key.ge.as_es ||
          shader->selector->stage == MESA_SHADER_TESS_CTRL ||
          shader->selector->stage == MESA_SHADER_GEOMETRY;
}

/** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
bool si_is_merged_shader(struct si_shader *shader)
{
   if (shader->selector->stage > MESA_SHADER_GEOMETRY || shader->is_gs_copy_shader)
      return false;

   return shader->key.ge.as_ngg || si_is_multi_part_shader(shader);
}

/**
 * Returns a unique index for a semantic name and index. The index must be
 * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
 * calculated.
 */
unsigned si_shader_io_get_unique_index(unsigned semantic)
{
   switch (semantic) {
   case VARYING_SLOT_POS:
      return SI_UNIQUE_SLOT_POS;
   default:
      if (semantic >= VARYING_SLOT_VAR0 && semantic <= VARYING_SLOT_VAR31)
         return SI_UNIQUE_SLOT_VAR0 + (semantic - VARYING_SLOT_VAR0);

      if (semantic >= VARYING_SLOT_VAR0_16BIT && semantic <= VARYING_SLOT_VAR15_16BIT)
         return SI_UNIQUE_SLOT_VAR0_16BIT + (semantic - VARYING_SLOT_VAR0_16BIT);

      assert(!"invalid generic index");
      return 0;

   /* Legacy desktop GL varyings. */
   case VARYING_SLOT_FOGC:
      return SI_UNIQUE_SLOT_FOGC;
   case VARYING_SLOT_COL0:
      return SI_UNIQUE_SLOT_COL0;
   case VARYING_SLOT_COL1:
      return SI_UNIQUE_SLOT_COL1;
   case VARYING_SLOT_BFC0:
      return SI_UNIQUE_SLOT_BFC0;
   case VARYING_SLOT_BFC1:
      return SI_UNIQUE_SLOT_BFC1;
   case VARYING_SLOT_TEX0:
   case VARYING_SLOT_TEX1:
   case VARYING_SLOT_TEX2:
   case VARYING_SLOT_TEX3:
   case VARYING_SLOT_TEX4:
   case VARYING_SLOT_TEX5:
   case VARYING_SLOT_TEX6:
   case VARYING_SLOT_TEX7:
      return SI_UNIQUE_SLOT_TEX0 + (semantic - VARYING_SLOT_TEX0);
   case VARYING_SLOT_CLIP_VERTEX:
      return SI_UNIQUE_SLOT_CLIP_VERTEX;

   /* Varyings present in both GLES and desktop GL. */
   case VARYING_SLOT_CLIP_DIST0:
      return SI_UNIQUE_SLOT_CLIP_DIST0;
   case VARYING_SLOT_CLIP_DIST1:
      return SI_UNIQUE_SLOT_CLIP_DIST1;
   case VARYING_SLOT_PSIZ:
      return SI_UNIQUE_SLOT_PSIZ;
   case VARYING_SLOT_LAYER:
      return SI_UNIQUE_SLOT_LAYER;
   case VARYING_SLOT_VIEWPORT:
      return SI_UNIQUE_SLOT_VIEWPORT;
   case VARYING_SLOT_PRIMITIVE_ID:
      return SI_UNIQUE_SLOT_PRIMITIVE_ID;
   }
}

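/* Declare the streamout-related SGPRs. On GFX11+, streamout is done by the NGG
 * shader itself, so the legacy streamout SGPRs aren't needed; the single
 * placeholder SGPR for TES presumably keeps the following argument positions
 * fixed (an assumption based on how the other arguments are declared).
 */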
static void declare_streamout_params(struct si_shader_args *args, struct si_shader *shader,
                                     const shader_info *info)
{
   if (shader->selector->screen->info.gfx_level >= GFX11) {
      /* NGG streamout. */
      if (info->stage == MESA_SHADER_TESS_EVAL)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
      return;
   }

   /* Streamout SGPRs. */
   if (si_shader_uses_streamout(shader)) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_config);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_write_index);

      /* A streamout buffer offset is loaded if the stride is non-zero. */
      for (int i = 0; i < 4; i++) {
         if (!info->xfb_stride[i])
            continue;

         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_offset[i]);
      }
   } else if (info->stage == MESA_SHADER_TESS_EVAL) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
   }
}

unsigned si_get_max_workgroup_size(const struct si_shader *shader)
{
   gl_shader_stage stage = shader->is_gs_copy_shader ?
      MESA_SHADER_VERTEX : shader->selector->stage;

   assert(shader->wave_size);

   switch (stage) {
   case MESA_SHADER_VERTEX:
   case MESA_SHADER_TESS_EVAL:
      /* Use the largest workgroup size for streamout */
      if (shader->key.ge.as_ngg)
         return si_shader_uses_streamout(shader) ? 256 : 128;

      /* As part of merged shader. */
      return shader->selector->screen->info.gfx_level >= GFX9 &&
             (shader->key.ge.as_ls || shader->key.ge.as_es) ? 128 : shader->wave_size;

   case MESA_SHADER_TESS_CTRL:
      /* Return this so that LLVM doesn't remove s_barrier
       * instructions on chips where we use s_barrier. */
      return shader->selector->screen->info.gfx_level >= GFX7 ? 128 : shader->wave_size;

   case MESA_SHADER_GEOMETRY:
      /* GS can always generate up to 256 vertices. */
      return shader->selector->screen->info.gfx_level >= GFX9 ? 256 : shader->wave_size;

   case MESA_SHADER_COMPUTE:
      break; /* see below */

   default:
      return shader->wave_size;
   }

   /* Compile a variable block size using the maximum variable size. */
   if (shader->selector->info.base.workgroup_size_variable)
      return SI_MAX_VARIABLE_THREADS_PER_BLOCK;

   uint16_t *local_size = shader->selector->info.base.workgroup_size;
   unsigned max_work_group_size = (uint32_t)local_size[0] *
                                  (uint32_t)local_size[1] *
                                  (uint32_t)local_size[2];
   assert(max_work_group_size);
   return max_work_group_size;
}

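/* Constant and shader buffer descriptors share one argument. When the shader
 * uses exactly one UBO and no SSBOs, the argument appears to point directly at
 * the constant data rather than at a descriptor list (inferred from the
 * argument types chosen below).
 */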
static void declare_const_and_shader_buffers(struct si_shader_args *args, struct si_shader *shader,
                                             const shader_info *info, bool assign_params)
{
   enum ac_arg_type const_shader_buf_type;

   if (info->num_ubos == 1 && info->num_ssbos == 0)
      const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
   else
      const_shader_buf_type = AC_ARG_CONST_DESC_PTR;

   ac_add_arg(
      &args->ac, AC_ARG_SGPR, 1, const_shader_buf_type,
      assign_params ? &args->const_and_shader_buffers : &args->other_const_and_shader_buffers);
}

static void declare_samplers_and_images(struct si_shader_args *args, bool assign_params)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
              assign_params ? &args->samplers_and_images : &args->other_samplers_and_images);
}

static void declare_per_stage_desc_pointers(struct si_shader_args *args, struct si_shader *shader,
                                            const shader_info *info, bool assign_params)
{
   declare_const_and_shader_buffers(args, shader, info, assign_params);
   declare_samplers_and_images(args, assign_params);
}

static void declare_global_desc_pointers(struct si_shader_args *args)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->internal_bindings);
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
              &args->bindless_samplers_and_images);
}

static void declare_vb_descriptor_input_sgprs(struct si_shader_args *args,
                                              struct si_shader *shader)
{
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->ac.vertex_buffers);

   unsigned num_vbos_in_user_sgprs = shader->selector->info.num_vbos_in_user_sgprs;
   if (num_vbos_in_user_sgprs) {
      unsigned user_sgprs = args->ac.num_sgprs_used;

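      /* Merged shaders declare 8 system SGPRs before the user SGPRs,
       * which don't count against the user SGPR limit. */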
      if (si_is_merged_shader(shader))
         user_sgprs -= 8;
      assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);

      /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
      for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

      assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(args->vb_descriptors));
      for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 4, AC_ARG_INT, &args->vb_descriptors[i]);
   }
}

static void declare_vs_input_vgprs(struct si_shader_args *args, struct si_shader *shader)
{
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vertex_id);

   if (shader->selector->screen->info.gfx_level >= GFX12) {
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
   } else if (shader->key.ge.as_ls) {
      if (shader->selector->screen->info.gfx_level >= GFX11) {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      } else if (shader->selector->screen->info.gfx_level >= GFX10) {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      } else {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_rel_patch_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
      }
   } else if (shader->selector->screen->info.gfx_level >= GFX10) {
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
                 /* user vgpr or PrimID (legacy) */
                 shader->key.ge.as_ngg ? NULL : &args->ac.vs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
   } else {
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.instance_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.vs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
   }
}

static void declare_vs_blit_inputs(struct si_shader *shader, struct si_shader_args *args,
                                   const shader_info *info)
{
   bool has_attribute_ring_address = shader->selector->screen->info.gfx_level >= GFX11;

   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_blit_inputs); /* i16 x1, y1 */
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x2, y2 */
   ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */

   if (info->vs.blit_sgprs_amd ==
       SI_VS_BLIT_SGPRS_POS_COLOR + has_attribute_ring_address) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
      if (has_attribute_ring_address)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* attribute ring address */
   } else if (info->vs.blit_sgprs_amd ==
              SI_VS_BLIT_SGPRS_POS_TEXCOORD + has_attribute_ring_address) {
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
      if (has_attribute_ring_address)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* attribute ring address */
   }
}

static void declare_tes_input_vgprs(struct si_shader_args *args)
{
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_u);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.tes_v);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_rel_patch_id);
   ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tes_patch_id);
}

enum
{
   /* Convenient merged shader definitions. */
   SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES,
   SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
};

static void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
                               enum ac_arg_type type, struct ac_arg *arg, unsigned idx)
{
   assert(args->arg_count == idx);
   ac_add_arg(args, file, registers, type, arg);
}

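/* Declare all input SGPRs and VGPRs of the shader in the order the hardware
 * provides them, based on the HW stage (including merged stages on GFX9+).
 */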
static void si_init_shader_args(struct si_shader *shader, struct si_shader_args *args,
                                const shader_info *info)
{
   unsigned i, num_returns, num_return_sgprs;
   unsigned num_prolog_vgprs = 0;
   struct si_shader_selector *sel = shader->selector;
   unsigned stage = shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : info->stage;
   unsigned stage_case = stage;

   memset(args, 0, sizeof(*args));

   /* Set MERGED shaders. */
   if (sel->screen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY) {
      if (shader->key.ge.as_ls || stage == MESA_SHADER_TESS_CTRL)
         stage_case = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
      else if (shader->key.ge.as_es || shader->key.ge.as_ngg || stage == MESA_SHADER_GEOMETRY)
         stage_case = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
   }

   switch (stage_case) {
   case MESA_SHADER_VERTEX:
      declare_global_desc_pointers(args);

      if (info->vs.blit_sgprs_amd) {
         declare_vs_blit_inputs(shader, args, info);
      } else {
         declare_per_stage_desc_pointers(args, shader, info, true);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);

         if (shader->is_gs_copy_shader) {
            declare_streamout_params(args, shader, info);
         } else {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
            declare_vb_descriptor_input_sgprs(args, shader);

            if (shader->key.ge.as_es) {
               ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset);
            } else if (shader->key.ge.as_ls) {
               /* no extra parameters */
            } else {
               declare_streamout_params(args, shader, info);
            }
         }
      }

      /* GFX11 sets FLAT_SCRATCH directly instead of using this arg. */
      if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* VGPRs */
      declare_vs_input_vgprs(args, shader);

      break;

   case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, info, true);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);

      /* GFX11 sets FLAT_SCRATCH directly instead of using this arg. */
      if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* VGPRs */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);
      break;

   case SI_SHADER_MERGED_VERTEX_TESSCTRL:
      /* Merged stages have 8 system SGPRs at the beginning. */
      /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
      /* Gfx11+: SPI_SHADER_PGM_LO/HI_HS */
      declare_per_stage_desc_pointers(args, shader, info, stage == MESA_SHADER_TESS_CTRL);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_factor_offset);
      if (sel->screen->info.gfx_level >= GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tcs_wave_id);
      else
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, info, stage == MESA_SHADER_VERTEX);

      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);

      /* VGPRs (first TCS, then VS) */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_patch_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.tcs_rel_ids);

      if (stage == MESA_SHADER_VERTEX) {
         declare_vs_input_vgprs(args, shader);

         /* When using ACO, the LS/HS argument indices of the shared arguments
          * must match, so this can't be declared before the shared VGPRs.
          */
         declare_vb_descriptor_input_sgprs(args, shader);

         /* LS return values are inputs to the TCS main shader part. */
         if (!shader->is_monolithic || shader->key.ge.opt.same_patch_vertices) {
            for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
               ac_add_return(&args->ac, AC_ARG_SGPR);
            for (i = 0; i < 2; i++)
               ac_add_return(&args->ac, AC_ARG_VGPR);

            /* VS outputs passed via VGPRs to TCS. */
            if (shader->key.ge.opt.same_patch_vertices && !info->use_aco_amd) {
               unsigned num_outputs = util_last_bit64(shader->selector->info.ls_es_outputs_written);
               for (i = 0; i < num_outputs * 4; i++)
                  ac_add_return(&args->ac, AC_ARG_VGPR);
            }
         }
      } else {
         /* TCS inputs are passed via VGPRs from VS. */
         if (shader->key.ge.opt.same_patch_vertices && !info->use_aco_amd) {
            unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->info.ls_es_outputs_written);
            for (i = 0; i < num_inputs * 4; i++)
               ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
         }
      }
      break;

   case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
      /* Merged stages have 8 system SGPRs at the beginning. */
      /* Gfx9-10: SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
      /* Gfx11+: SPI_SHADER_PGM_LO/HI_GS */
      declare_per_stage_desc_pointers(args, shader, info, stage == MESA_SHADER_GEOMETRY);

      if (shader->key.ge.as_ngg)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_tg_info);
      else
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset);

      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.merged_wave_info);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      if (sel->screen->info.gfx_level >= GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_attr_offset);
      else
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */

      declare_global_desc_pointers(args);
      if (stage != MESA_SHADER_VERTEX || !info->vs.blit_sgprs_amd) {
         declare_per_stage_desc_pointers(
            args, shader, info, (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL));
      }

      if (stage == MESA_SHADER_VERTEX && info->vs.blit_sgprs_amd) {
         declare_vs_blit_inputs(shader, args, info);
      } else {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);

         if (stage == MESA_SHADER_VERTEX) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.base_vertex);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.draw_id);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.start_instance);
         } else if (stage == MESA_SHADER_TESS_EVAL) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
         } else {
            /* GS */
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
         }

         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &args->small_prim_cull_info);
         if (sel->screen->info.gfx_level >= GFX11)
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->gs_attr_address);
         else
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
      }

      /* VGPRs (first GS, then VS/TES) */
      if (sel->screen->info.gfx_level >= GFX12) {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
      } else {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);
      }

      if (stage == MESA_SHADER_VERTEX) {
         declare_vs_input_vgprs(args, shader);

         /* When using ACO, the ES/GS argument indices of the shared arguments
          * must match, so this can't be declared before the shared VGPRs.
          */
         if (!info->vs.blit_sgprs_amd)
            declare_vb_descriptor_input_sgprs(args, shader);
      } else if (stage == MESA_SHADER_TESS_EVAL) {
         declare_tes_input_vgprs(args);
      }

      if (shader->key.ge.as_es && !shader->is_monolithic &&
          (stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL)) {
         /* ES return values are inputs to GS. */
         for (i = 0; i < 8 + GFX9_GS_NUM_USER_SGPR; i++)
            ac_add_return(&args->ac, AC_ARG_SGPR);
         for (i = 0; i < (sel->screen->info.gfx_level >= GFX12 ? 3 : 5); i++)
            ac_add_return(&args->ac, AC_ARG_VGPR);
      }
      break;

   case MESA_SHADER_TESS_EVAL:
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, info, true);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->vs_state_bits);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tcs_offchip_layout);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->tes_offchip_addr);

      if (shader->key.ge.as_es) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.es2gs_offset);
      } else {
         declare_streamout_params(args, shader, info);
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tess_offchip_offset);
      }

      /* GFX11 sets FLAT_SCRATCH directly instead of using this arg. */
      if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* VGPRs */
      declare_tes_input_vgprs(args);
      break;

   case MESA_SHADER_GEOMETRY:
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, info, true);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs2vs_offset);
      ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.gs_wave_id);

      /* GFX11 sets FLAT_SCRATCH directly instead of using this arg. */
      if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* VGPRs */
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[0]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[1]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_prim_id);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[2]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[3]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[4]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_vtx_offset[5]);
      ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.gs_invocation_id);
      break;

   case MESA_SHADER_FRAGMENT:
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, info, true);
      si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->alpha_reference,
                         SI_PARAM_ALPHA_REF);
      si_add_arg_checked(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.prim_mask,
                         SI_PARAM_PRIM_MASK);

      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_sample,
                         SI_PARAM_PERSP_SAMPLE);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_center,
                         SI_PARAM_PERSP_CENTER);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.persp_centroid,
                         SI_PARAM_PERSP_CENTROID);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_sample,
                         SI_PARAM_LINEAR_SAMPLE);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_center,
                         SI_PARAM_LINEAR_CENTER);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 2, AC_ARG_INT, &args->ac.linear_centroid,
                         SI_PARAM_LINEAR_CENTROID);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[0],
                         SI_PARAM_POS_X_FLOAT);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[1],
                         SI_PARAM_POS_Y_FLOAT);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[2],
                         SI_PARAM_POS_Z_FLOAT);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.frag_pos[3],
                         SI_PARAM_POS_W_FLOAT);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.front_face,
                         SI_PARAM_FRONT_FACE);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.ancillary,
                         SI_PARAM_ANCILLARY);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.sample_coverage,
                         SI_PARAM_SAMPLE_COVERAGE);
      si_add_arg_checked(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.pos_fixed_pt,
                         SI_PARAM_POS_FIXED_PT);

      if (info->use_aco_amd) {
         ac_compact_ps_vgpr_args(&args->ac, shader->config.spi_ps_input_addr);

         /* GFX11 sets FLAT_SCRATCH directly instead of using this arg. */
         if (sel->screen->info.gfx_level < GFX11)
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);
      }

      /* A monolithic PS emits the prolog and epilog directly in NIR. */
      if (!shader->is_monolithic) {
         /* Color inputs from the prolog. */
         if (shader->selector->info.colors_read) {
            unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);

            for (i = 0; i < num_color_elements; i++)
               ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, i ? NULL : &args->color_start);

            num_prolog_vgprs += num_color_elements;
         }

         /* Outputs for the epilog. */
         num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
         /* These must always be declared even if Z/stencil/samplemask are killed. */
         num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +
                       sel->info.writes_z + sel->info.writes_stencil + sel->info.writes_samplemask +
                       1 /* SampleMaskIn */;

         for (i = 0; i < num_return_sgprs; i++)
            ac_add_return(&args->ac, AC_ARG_SGPR);
         for (; i < num_returns; i++)
            ac_add_return(&args->ac, AC_ARG_VGPR);
      }
      break;

   case MESA_SHADER_COMPUTE:
   case MESA_SHADER_KERNEL:
      declare_global_desc_pointers(args);
      declare_per_stage_desc_pointers(args, shader, info, true);
      if (shader->selector->info.uses_grid_size)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 3, AC_ARG_INT, &args->ac.num_work_groups);
      if (shader->selector->info.uses_variable_block_size)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->block_size);

      unsigned cs_user_data_dwords = info->cs.user_data_components_amd;
      if (cs_user_data_dwords) {
         ac_add_arg(&args->ac, AC_ARG_SGPR, MIN2(cs_user_data_dwords, 4), AC_ARG_INT,
                    &args->cs_user_data[0]);
         if (cs_user_data_dwords > 4) {
            ac_add_arg(&args->ac, AC_ARG_SGPR, cs_user_data_dwords - 4, AC_ARG_INT,
                       &args->cs_user_data[1]);
         }
      }

      /* Some descriptors can be in user SGPRs. */
      /* Shader buffers in user SGPRs. */
      for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) {
         while (args->ac.num_sgprs_used % 4 != 0)
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);

         ac_add_arg(&args->ac, AC_ARG_SGPR, 4, AC_ARG_INT, &args->cs_shaderbuf[i]);
      }
      /* Images in user SGPRs. */
      for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) {
         unsigned num_sgprs = BITSET_TEST(info->image_buffers, i) ? 4 : 8;

         while (args->ac.num_sgprs_used % num_sgprs != 0)
            ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);

         ac_add_arg(&args->ac, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &args->cs_image[i]);
      }

      /* Hardware SGPRs. */
      for (i = 0; i < 3; i++) {
         if (shader->selector->info.uses_block_id[i]) {
            /* GFX12 loads workgroup IDs into ttmp registers, so they are not input SGPRs, but we
             * still need to set this to indicate that they are enabled (for ac_nir_to_llvm).
             */
            if (sel->screen->info.gfx_level >= GFX12)
               args->ac.workgroup_ids[i].used = true;
            else
               ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.workgroup_ids[i]);
         }
      }
      if (shader->selector->info.uses_tg_size)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.tg_size);

      /* GFX11 sets FLAT_SCRATCH directly instead of using this arg. */
      if (info->use_aco_amd && sel->screen->info.gfx_level < GFX11)
         ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.scratch_offset);

      /* Hardware VGPRs. */
      /* Thread IDs are either packed into VGPR0 (10 bits per component) or stored in 3 separate VGPRs. */
      if (sel->screen->info.gfx_level >= GFX11 ||
          (!sel->screen->info.has_graphics && sel->screen->info.family >= CHIP_MI200)) {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_ids_packed);
      } else {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_id_x);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_id_y);
         ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.local_invocation_id_z);
      }
      break;
   default:
      assert(0 && "unimplemented shader");
      return;
   }

   shader->info.num_input_sgprs = args->ac.num_sgprs_used;
   shader->info.num_input_vgprs = args->ac.num_vgprs_used;

   assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
   shader->info.num_input_vgprs -= num_prolog_vgprs;
}

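/* Return the LDS allocation granularity in bytes; config.lds_size is
 * expressed in these units. */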
static unsigned get_lds_granularity(struct si_screen *screen, gl_shader_stage stage)
{
   return screen->info.gfx_level >= GFX11 && stage == MESA_SHADER_FRAGMENT ? 1024 :
          screen->info.gfx_level >= GFX7 ? 512 : 256;
}

static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
                                  struct ac_rtld_binary *rtld)
{
   const struct si_shader_selector *sel = shader->selector;
   const char *part_elfs[5];
   size_t part_sizes[5];
   unsigned num_parts = 0;

#define add_part(shader_or_part) \
   if (shader_or_part) { \
      part_elfs[num_parts] = (shader_or_part)->binary.code_buffer; \
      part_sizes[num_parts] = (shader_or_part)->binary.code_size; \
      num_parts++; \
   }

   add_part(shader->prolog);
   add_part(shader->previous_stage);
   add_part(shader);
   add_part(shader->epilog);

#undef add_part

   struct ac_rtld_symbol lds_symbols[2];
   unsigned num_lds_symbols = 0;

   if (sel && screen->info.gfx_level >= GFX9 && !shader->is_gs_copy_shader &&
       (sel->stage == MESA_SHADER_GEOMETRY ||
        (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg))) {
      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
      sym->name = "esgs_ring";
      sym->size = shader->gs_info.esgs_ring_size * 4;
      sym->align = 64 * 1024;
   }

   if (sel->stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
      sym->name = "ngg_emit";
      sym->size = shader->ngg.ngg_emit_size * 4;
      sym->align = 4;
   }

   bool ok = ac_rtld_open(
      rtld, (struct ac_rtld_open_info){.info = &screen->info,
                                       .options =
                                          {
                                             .halt_at_entry = screen->options.halt_shaders,
                                             .waitcnt_wa = num_parts > 1 &&
                                                           screen->info.needs_llvm_wait_wa,
                                          },
                                       .shader_type = sel->stage,
                                       .wave_size = shader->wave_size,
                                       .num_parts = num_parts,
                                       .elf_ptrs = part_elfs,
                                       .elf_sizes = part_sizes,
                                       .num_shared_lds_symbols = num_lds_symbols,
                                       .shared_lds_symbols = lds_symbols});

   if (rtld->lds_size > 0) {
      unsigned alloc_granularity = get_lds_granularity(screen, sel->stage);
      shader->config.lds_size = DIV_ROUND_UP(rtld->lds_size, alloc_granularity);
   }

   return ok;
}

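/* Gather the binaries of all shader parts (prolog, previous stage, main part,
 * epilog) in execution order and return how many there are. */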
static unsigned get_shader_binaries(struct si_shader *shader, struct si_shader_binary *bin[4])
{
   unsigned num_bin = 0;

   if (shader->prolog)
      bin[num_bin++] = &shader->prolog->binary;

   if (shader->previous_stage)
      bin[num_bin++] = &shader->previous_stage->binary;

   bin[num_bin++] = &shader->binary;

   if (shader->epilog)
      bin[num_bin++] = &shader->epilog->binary;

   return num_bin;
}

/* si_get_shader_binary_size should only be called once per shader
 * and the result should be stored in shader->complete_shader_binary_size.
 */
unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
{
   if (shader->binary.type == SI_SHADER_BINARY_ELF) {
      struct ac_rtld_binary rtld;
      si_shader_binary_open(screen, shader, &rtld);
      uint64_t size = rtld.exec_size;
      ac_rtld_close(&rtld);
      return size;
   } else {
      struct si_shader_binary *bin[4];
      unsigned num_bin = get_shader_binaries(shader, bin);

      unsigned size = 0;
      for (unsigned i = 0; i < num_bin; i++) {
         assert(bin[i]->type == SI_SHADER_BINARY_RAW);
         size += bin[i]->exec_size;
      }
      return size;
   }
}

unsigned si_get_shader_prefetch_size(struct si_shader *shader)
{
   struct si_screen *sscreen = shader->selector->screen;
   /* This excludes arrays of constants after instructions. */
   unsigned exec_size =
      ac_align_shader_binary_for_prefetch(&sscreen->info,
                                          shader->complete_shader_binary_size);

   /* INST_PREF_SIZE uses 128B granularity.
    * - GFX11: max 128 * 63 = 8064
    * - GFX12: max 128 * 255 = 32640
    */
   unsigned max_pref_size = shader->selector->screen->info.gfx_level >= GFX12 ? 255 : 63;
   unsigned exec_size_gran128 = DIV_ROUND_UP(exec_size, 128);

   return MIN2(max_pref_size, exec_size_gran128);
}

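/* rtld callback that resolves the scratch buffer resource descriptor symbols
 * (SCRATCH_RSRC_DWORD0/1) to the actual scratch VA at upload time. */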
static bool si_get_external_symbol(enum amd_gfx_level gfx_level, void *data, const char *name,
                                   uint64_t *value)
{
   uint64_t *scratch_va = data;

   if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
      *value = (uint32_t)*scratch_va;
      return true;
   }
   if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
      /* Enable scratch coalescing. */
      *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32);

      if (gfx_level >= GFX11)
         *value |= S_008F04_SWIZZLE_ENABLE_GFX11(1);
      else
         *value |= S_008F04_SWIZZLE_ENABLE_GFX6(1);
      return true;
   }

   return false;
}

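/* Allocate (or reuse) the shader BO and return a CPU pointer that the caller
 * writes the shader code to: either a mapping of the BO itself, or a staging
 * buffer when the code is uploaded through CP DMA. */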
static void *pre_upload_binary(struct si_screen *sscreen, struct si_shader *shader,
                               unsigned binary_size, bool dma_upload,
                               struct si_context **upload_ctx,
                               struct pipe_resource **staging,
                               unsigned *staging_offset,
                               int64_t bo_offset)
{
   unsigned aligned_size = ac_align_shader_binary_for_prefetch(&sscreen->info, binary_size);

   if (bo_offset >= 0) {
      /* sqtt needs to upload shaders as a pipeline, where all shaders
       * are contiguous in memory.
       * In this case, bo_offset is non-negative and we don't have to
       * allocate a new BO.
       */
      shader->gpu_address = shader->bo->gpu_address + bo_offset;
      dma_upload = false;
   } else {
      si_resource_reference(&shader->bo, NULL);
      shader->bo = si_aligned_buffer_create(
         &sscreen->b,
         SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT |
            (dma_upload ? PIPE_RESOURCE_FLAG_UNMAPPABLE : 0),
         PIPE_USAGE_IMMUTABLE, align(aligned_size, SI_CPDMA_ALIGNMENT), 256);
      if (!shader->bo)
         return NULL;

      shader->gpu_address = shader->bo->gpu_address;
      bo_offset = 0;
   }

   if (dma_upload) {
      /* First upload into a staging buffer. */
      *upload_ctx = si_get_aux_context(&sscreen->aux_context.shader_upload);

      void *ret;
      u_upload_alloc((*upload_ctx)->b.stream_uploader, 0, binary_size, 256,
                     staging_offset, staging, &ret);
      if (!ret)
         si_put_aux_context_flush(&sscreen->aux_context.shader_upload);

      return ret;
   } else {
      void *ptr = sscreen->ws->buffer_map(sscreen->ws,
                                          shader->bo->buf, NULL,
                                          PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
      if (!ptr)
         return NULL;

      return ptr + bo_offset;
   }
}

static void post_upload_binary(struct si_screen *sscreen, struct si_shader *shader,
                               void *code, unsigned code_size,
                               unsigned binary_size, bool dma_upload,
                               struct si_context *upload_ctx,
                               struct pipe_resource *staging,
                               unsigned staging_offset)
{
   if (sscreen->debug_flags & DBG(SQTT)) {
      /* Remember the uploaded code */
      shader->binary.uploaded_code_size = code_size;
      shader->binary.uploaded_code = malloc(code_size);
      memcpy(shader->binary.uploaded_code, code, code_size);
   }

   if (dma_upload) {
      /* Then copy from the staging buffer to VRAM.
       *
       * We can't use the upload copy in si_buffer_transfer_unmap because that might use
       * a compute shader, and we can't use shaders in the code that is responsible for making
       * them available.
       */
      si_cp_dma_copy_buffer(upload_ctx, &shader->bo->b.b, staging, 0, staging_offset,
                            binary_size);
      si_barrier_after_simple_buffer_op(upload_ctx, 0, &shader->bo->b.b, staging);
      upload_ctx->barrier_flags |= SI_BARRIER_INV_ICACHE | SI_BARRIER_INV_L2;

#if 0 /* debug: validate whether the copy was successful */
      uint32_t *dst_binary = malloc(binary_size);
      uint32_t *src_binary = (uint32_t*)code;
      pipe_buffer_read(&upload_ctx->b, &shader->bo->b.b, 0, binary_size, dst_binary);
      puts("dst_binary == src_binary:");
      for (unsigned i = 0; i < binary_size / 4; i++) {
         printf(" %08x == %08x\n", dst_binary[i], src_binary[i]);
      }
      free(dst_binary);
      exit(0);
#endif

      si_put_aux_context_flush(&sscreen->aux_context.shader_upload);
      pipe_resource_reference(&staging, NULL);
   } else {
      sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf);
   }
}

static int upload_binary_elf(struct si_screen *sscreen, struct si_shader *shader,
                             uint64_t scratch_va, bool dma_upload, int64_t bo_offset)
{
   struct ac_rtld_binary binary;
   if (!si_shader_binary_open(sscreen, shader, &binary))
      return -1;

   struct si_context *upload_ctx = NULL;
   struct pipe_resource *staging = NULL;
   unsigned staging_offset = 0;

   void *rx_ptr = pre_upload_binary(sscreen, shader, binary.rx_size, dma_upload,
                                    &upload_ctx, &staging, &staging_offset,
                                    bo_offset);
   if (!rx_ptr)
      return -1;

   /* Upload. */
   struct ac_rtld_upload_info u = {};
   u.binary = &binary;
   u.get_external_symbol = si_get_external_symbol;
   u.cb_data = &scratch_va;
   u.rx_va = shader->gpu_address;
   u.rx_ptr = rx_ptr;

   int size = ac_rtld_upload(&u);

   post_upload_binary(sscreen, shader, rx_ptr, size, binary.rx_size, dma_upload,
                      upload_ctx, staging, staging_offset);

   ac_rtld_close(&binary);

   return size;
}

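/* Compute config.lds_size for legacy GS and NGG shaders, where LDS holds the
 * ESGS ring, the NGG emit space, and the NGG scratch area. */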
static void calculate_needed_lds_size(struct si_screen *sscreen, struct si_shader *shader)
{
   gl_shader_stage stage =
      shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : shader->selector->stage;

   if (sscreen->info.gfx_level >= GFX9 && stage <= MESA_SHADER_GEOMETRY &&
       (stage == MESA_SHADER_GEOMETRY || shader->key.ge.as_ngg)) {
      unsigned size_in_dw = shader->gs_info.esgs_ring_size;

      if (stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)
         size_in_dw += shader->ngg.ngg_emit_size;

      if (shader->key.ge.as_ngg) {
         unsigned scratch_dw_size = gfx10_ngg_get_scratch_dw_size(shader);
         if (scratch_dw_size) {
            /* scratch base address needs to be 8 byte aligned */
            size_in_dw = ALIGN(size_in_dw, 2);
            size_in_dw += scratch_dw_size;
         }
      }

      shader->config.lds_size =
         DIV_ROUND_UP(size_in_dw * 4, get_lds_granularity(sscreen, stage));
   }
}

static int upload_binary_raw(struct si_screen *sscreen, struct si_shader *shader,
                             uint64_t scratch_va, bool dma_upload, int64_t bo_offset)
{
   struct si_shader_binary *bin[4];
   unsigned num_bin = get_shader_binaries(shader, bin);

   unsigned code_size = 0, exec_size = 0;
   for (unsigned i = 0; i < num_bin; i++) {
      assert(bin[i]->type == SI_SHADER_BINARY_RAW);
      code_size += bin[i]->code_size;
      exec_size += bin[i]->exec_size;
   }

   struct si_context *upload_ctx = NULL;
   struct pipe_resource *staging = NULL;
   unsigned staging_offset = 0;

   void *rx_ptr = pre_upload_binary(sscreen, shader, code_size, dma_upload,
                                    &upload_ctx, &staging, &staging_offset,
                                    bo_offset);
   if (!rx_ptr)
      return -1;

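   /* Memory layout: the executable code of all parts is placed first,
    * followed by the const data sections of each part. */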
   unsigned exec_offset = 0, data_offset = exec_size;
   for (unsigned i = 0; i < num_bin; i++) {
      memcpy(rx_ptr + exec_offset, bin[i]->code_buffer, bin[i]->exec_size);

      if (bin[i]->num_symbols) {
         /* Offset that must be added to the const data symbols because other
          * shader parts are placed between the exec code and the const data.
          */
         unsigned const_offset = data_offset - exec_offset - bin[i]->exec_size;

         /* Prolog and epilog have no symbols. */
         struct si_shader *sh = bin[i] == &shader->binary ? shader : shader->previous_stage;
         assert(sh && bin[i] == &sh->binary);

         si_aco_resolve_symbols(sh, rx_ptr + exec_offset, (const uint32_t *)bin[i]->code_buffer,
                                scratch_va, const_offset);
      }

      exec_offset += bin[i]->exec_size;

      unsigned data_size = bin[i]->code_size - bin[i]->exec_size;
      if (data_size) {
         memcpy(rx_ptr + data_offset, bin[i]->code_buffer + bin[i]->exec_size, data_size);
         data_offset += data_size;
      }
   }

   post_upload_binary(sscreen, shader, rx_ptr, code_size, code_size, dma_upload,
                      upload_ctx, staging, staging_offset);

   calculate_needed_lds_size(sscreen, shader);
   return code_size;
}

int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shader,
                               uint64_t scratch_va, int64_t bo_offset)
{
   bool dma_upload = !(sscreen->debug_flags & DBG(NO_DMA_SHADERS)) && sscreen->info.has_cp_dma &&
                     sscreen->info.has_dedicated_vram && !sscreen->info.all_vram_visible &&
                     bo_offset < 0;

   if (shader->binary.type == SI_SHADER_BINARY_ELF) {
      return upload_binary_elf(sscreen, shader, scratch_va, dma_upload, bo_offset);
   } else {
      assert(shader->binary.type == SI_SHADER_BINARY_RAW);
      return upload_binary_raw(sscreen, shader, scratch_va, dma_upload, bo_offset);
   }
}

int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
                            uint64_t scratch_va)
{
   return si_shader_binary_upload_at(sscreen, shader, scratch_va, -1);
}

static void print_disassembly(const char *disasm, size_t nbytes,
                              const char *name, FILE *file,
                              struct util_debug_callback *debug)
{
   if (debug && debug->debug_message) {
      /* Very long debug messages are cut off, so send the
       * disassembly one line at a time. This causes more
       * overhead, but on the plus side it simplifies
       * parsing of resulting logs.
       */
      util_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");

      uint64_t line = 0;
      while (line < nbytes) {
         int count = nbytes - line;
         const char *nl = memchr(disasm + line, '\n', nbytes - line);
         if (nl)
            count = nl - (disasm + line);

         if (count) {
            util_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
         }

         line += count + 1;
      }

      util_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
   }

   if (file) {
      fprintf(file, "Shader %s disassembly:\n", name);
      fprintf(file, "%*s", (int)nbytes, disasm);
   }
}

static void si_shader_dump_disassembly(struct si_screen *screen,
                                       const struct si_shader_binary *binary,
                                       gl_shader_stage stage, unsigned wave_size,
                                       struct util_debug_callback *debug, const char *name,
                                       FILE *file)
{
   if (binary->type == SI_SHADER_BINARY_RAW) {
      print_disassembly(binary->disasm_string, binary->disasm_size, name, file, debug);
      return;
   }

   struct ac_rtld_binary rtld_binary;

   if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
                                      .info = &screen->info,
                                      .shader_type = stage,
                                      .wave_size = wave_size,
                                      .num_parts = 1,
                                      .elf_ptrs = &binary->code_buffer,
                                      .elf_sizes = &binary->code_size}))
      return;

   const char *disasm;
   size_t nbytes;

   if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
      goto out;

   if (nbytes > INT_MAX)
      goto out;

   print_disassembly(disasm, nbytes, name, file, debug);

out:
   ac_rtld_close(&rtld_binary);
}

static void si_calculate_max_simd_waves(struct si_shader *shader)
{
   struct si_screen *sscreen = shader->selector->screen;
   struct ac_shader_config *conf = &shader->config;
   unsigned lds_increment = get_lds_granularity(sscreen, shader->selector->stage);
   unsigned lds_per_wave = 0;
   unsigned max_simd_waves;

   max_simd_waves = sscreen->info.max_waves_per_simd;

   /* Compute LDS usage for PS. */
   switch (shader->selector->stage) {
   case MESA_SHADER_FRAGMENT:
      /* The minimum usage per wave is (num_inputs * 48). The maximum
       * usage is (num_inputs * 48 * 16).
       * We can get anything in between and it varies between waves.
       *
       * The 48 bytes per input for a single primitive is equal to
       * 4 bytes/component * 4 components/input * 3 points.
       *
       * Other stages don't know the size at compile time or don't
       * allocate LDS per wave, but instead they do it per thread group.
       */
      lds_per_wave = conf->lds_size * lds_increment +
                     align(shader->info.num_ps_inputs * 48, lds_increment);
      break;
   case MESA_SHADER_COMPUTE: {
      unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
      lds_per_wave = (conf->lds_size * lds_increment) /
                     DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
   }
   break;
   default:;
   }

   /* Compute the per-SIMD wave counts. */
   if (conf->num_sgprs) {
      max_simd_waves =
         MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
   }

   if (conf->num_vgprs) {
      /* GFX 10.3 internally:
       * - aligns VGPRS to 16 for Wave32 and 8 for Wave64
       * - aligns LDS to 1024
       *
       * For shader-db stats, set num_vgprs that the hw actually uses.
       */
      unsigned num_vgprs = conf->num_vgprs;
      if (sscreen->info.gfx_level >= GFX10_3) {
         unsigned real_vgpr_gran = sscreen->info.num_physical_wave64_vgprs_per_simd / 64;
         num_vgprs = util_align_npot(num_vgprs, real_vgpr_gran * (shader->wave_size == 32 ? 2 : 1));
      } else {
         num_vgprs = align(num_vgprs, shader->wave_size == 32 ? 8 : 4);
      }

      /* Always print wave limits as Wave64, so that we can compare
       * Wave32 and Wave64 with shader-db fairly. */
      unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
      max_simd_waves = MIN2(max_simd_waves, max_vgprs / num_vgprs);
   }

   unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
   if (lds_per_wave)
      max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);

   shader->info.max_simd_waves = max_simd_waves;
}

void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
                                        struct util_debug_callback *debug)
{
   const struct ac_shader_config *conf = &shader->config;
   static const char *stages[] = {"VS", "TCS", "TES", "GS", "PS", "CS"};

   if (screen->options.debug_disassembly)
      si_shader_dump_disassembly(screen, &shader->binary, shader->selector->stage,
                                 shader->wave_size, debug, "main", NULL);

   unsigned num_ls_outputs = 0;
   unsigned num_hs_outputs = 0;
   unsigned num_es_outputs = 0;
   unsigned num_gs_outputs = 0;
   unsigned num_vs_outputs = 0;
   unsigned num_ps_outputs = 0;

   if (shader->selector->stage <= MESA_SHADER_GEOMETRY) {
      /* This doesn't include pos exports because only param exports are interesting
       * for performance and can be optimized.
       */
      if (shader->key.ge.as_ls)
         num_ls_outputs = si_shader_lshs_vertex_stride(shader) / 16;
      else if (shader->selector->stage == MESA_SHADER_TESS_CTRL)
         num_hs_outputs = util_last_bit64(shader->selector->info.tcs_outputs_written_for_tes);
      else if (shader->key.ge.as_es)
         num_es_outputs = shader->selector->info.esgs_vertex_stride / 16;
      else if (shader->gs_copy_shader)
         num_gs_outputs = shader->gs_copy_shader->info.nr_param_exports;
      else if (shader->selector->stage == MESA_SHADER_GEOMETRY)
         num_gs_outputs = shader->info.nr_param_exports;
      else if (shader->selector->stage == MESA_SHADER_VERTEX ||
               shader->selector->stage == MESA_SHADER_TESS_EVAL)
         num_vs_outputs = shader->info.nr_param_exports;
      else
         unreachable("invalid shader key");
   } else if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
      num_ps_outputs = util_bitcount(shader->selector->info.colors_written) +
                       (shader->ps.writes_z ||
                        shader->ps.writes_stencil ||
                        shader->ps.writes_samplemask);
   }

   util_debug_message(debug, SHADER_INFO,
                      "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
                      "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
                      "Spilled VGPRs: %d PrivMem VGPRs: %d LSOutputs: %u HSOutputs: %u "
                      "HSPatchOuts: %u ESOutputs: %u GSOutputs: %u VSOutputs: %u PSOutputs: %u "
                      "InlineUniforms: %u DivergentLoop: %u (%s, W%u)",
                      conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
                      conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
                      conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
                      num_ls_outputs, num_hs_outputs,
                      util_last_bit(shader->selector->info.patch_outputs_written_for_tes),
                      num_es_outputs, num_gs_outputs, num_vs_outputs, num_ps_outputs,
                      shader->selector->info.base.num_inlinable_uniforms,
                      shader->selector->info.has_divergent_loop,
                      stages[shader->selector->stage], shader->wave_size);
}

si_can_dump_shader(struct si_screen * sscreen,gl_shader_stage stage,enum si_shader_dump_type dump_type)1356 bool si_can_dump_shader(struct si_screen *sscreen, gl_shader_stage stage,
1357 enum si_shader_dump_type dump_type)
1358 {
1359 static uint64_t filter[] = {
1360 [SI_DUMP_SHADER_KEY] = DBG(NIR) | DBG(INIT_LLVM) | DBG(LLVM) | DBG(INIT_ACO) | DBG(ACO) | DBG(ASM),
1361 [SI_DUMP_INIT_NIR] = DBG(INIT_NIR),
1362 [SI_DUMP_NIR] = DBG(NIR),
1363 [SI_DUMP_INIT_LLVM_IR] = DBG(INIT_LLVM),
1364 [SI_DUMP_LLVM_IR] = DBG(LLVM),
1365 [SI_DUMP_INIT_ACO_IR] = DBG(INIT_ACO),
1366 [SI_DUMP_ACO_IR] = DBG(ACO),
1367 [SI_DUMP_ASM] = DBG(ASM),
1368 [SI_DUMP_STATS] = DBG(STATS),
1369 [SI_DUMP_ALWAYS] = DBG(VS) | DBG(TCS) | DBG(TES) | DBG(GS) | DBG(PS) | DBG(CS),
1370 };
1371 assert(dump_type < ARRAY_SIZE(filter));
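/* Both masks must match: the stage bit (e.g. DBG(PS)) selects which shaders to
 * dump and filter[dump_type] selects which kind of dump, so e.g. something like
 * AMD_DEBUG=ps,asm would enable SI_DUMP_ASM for fragment shaders only. (The env
 * var spelling is illustrative; the bits come from sscreen->debug_flags.)
 */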
1372
1373 return sscreen->debug_flags & (1 << stage) &&
1374 sscreen->debug_flags & filter[dump_type];
1375 }
1376
1377 static void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
1378 bool check_debug_option)
1379 {
1380 const struct ac_shader_config *conf = &shader->config;
1381
1382 if (shader->selector->stage == MESA_SHADER_FRAGMENT) {
1383 fprintf(file,
1384 "*** SHADER CONFIG ***\n"
1385 "SPI_PS_INPUT_ADDR = 0x%04x\n"
1386 "SPI_PS_INPUT_ENA = 0x%04x\n",
1387 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
1388 }
1389
1390 fprintf(file,
1391 "*** SHADER STATS ***\n"
1392 "SGPRS: %d\n"
1393 "VGPRS: %d\n"
1394 "Spilled SGPRs: %d\n"
1395 "Spilled VGPRs: %d\n"
1396 "Private memory VGPRs: %d\n"
1397 "Code Size: %d bytes\n"
1398 "LDS: %d bytes\n"
1399 "Scratch: %d bytes per wave\n"
1400 "Max Waves: %d\n"
1401 "********************\n\n\n",
1402 conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
1403 shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
1404 conf->lds_size * get_lds_granularity(sscreen, shader->selector->stage),
1405 conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
1406 }
1407
1408 const char *si_get_shader_name(const struct si_shader *shader)
1409 {
1410 switch (shader->selector->stage) {
1411 case MESA_SHADER_VERTEX:
1412 if (shader->key.ge.as_es)
1413 return "Vertex Shader as ES";
1414 else if (shader->key.ge.as_ls)
1415 return "Vertex Shader as LS";
1416 else if (shader->key.ge.as_ngg)
1417 return "Vertex Shader as ESGS";
1418 else
1419 return "Vertex Shader as VS";
1420 case MESA_SHADER_TESS_CTRL:
1421 return "Tessellation Control Shader";
1422 case MESA_SHADER_TESS_EVAL:
1423 if (shader->key.ge.as_es)
1424 return "Tessellation Evaluation Shader as ES";
1425 else if (shader->key.ge.as_ngg)
1426 return "Tessellation Evaluation Shader as ESGS";
1427 else
1428 return "Tessellation Evaluation Shader as VS";
1429 case MESA_SHADER_GEOMETRY:
1430 if (shader->is_gs_copy_shader)
1431 return "GS Copy Shader as VS";
1432 else
1433 return "Geometry Shader";
1434 case MESA_SHADER_FRAGMENT:
1435 return "Pixel Shader";
1436 case MESA_SHADER_COMPUTE:
1437 return "Compute Shader";
1438 default:
1439 return "Unknown Shader";
1440 }
1441 }
1442
1443 void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
1444 struct util_debug_callback *debug, FILE *file, bool check_debug_option)
1445 {
1446 gl_shader_stage stage = shader->selector->stage;
1447
1448 if (!check_debug_option || si_can_dump_shader(sscreen, stage, SI_DUMP_SHADER_KEY))
1449 si_dump_shader_key(shader, file);
1450
1451 if (!check_debug_option && shader->binary.llvm_ir_string) {
1452 /* This is only used with ddebug. */
1453 if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
1454 fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
1455 fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
1456 }
1457
1458 fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
1459 fprintf(file, "%s\n", shader->binary.llvm_ir_string);
1460 }
1461
1462 if (!check_debug_option || (si_can_dump_shader(sscreen, stage, SI_DUMP_ASM))) {
1463 fprintf(file, "\n%s:\n", si_get_shader_name(shader));
1464
1465 if (shader->prolog)
1466 si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, shader->wave_size, debug,
1467 "prolog", file);
1468 if (shader->previous_stage)
1469 si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
1470 shader->wave_size, debug, "previous stage", file);
1471 si_shader_dump_disassembly(sscreen, &shader->binary, stage, shader->wave_size, debug, "main",
1472 file);
1473
1474 if (shader->epilog)
1475 si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, shader->wave_size, debug,
1476 "epilog", file);
1477 fprintf(file, "\n");
1478
1479 si_shader_dump_stats(sscreen, shader, file, check_debug_option);
1480 }
1481 }
1482
1483 static void si_dump_shader_key_vs(const union si_shader_key *key, FILE *f)
1484 {
1485 fprintf(f, " mono.instance_divisor_is_one = %u\n", key->ge.mono.instance_divisor_is_one);
1486 fprintf(f, " mono.instance_divisor_is_fetched = %u\n",
1487 key->ge.mono.instance_divisor_is_fetched);
1488 fprintf(f, " mono.vs.fetch_opencode = %x\n", key->ge.mono.vs_fetch_opencode);
1489 fprintf(f, " mono.vs.fix_fetch = {");
1490 for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
1491 union si_vs_fix_fetch fix = key->ge.mono.vs_fix_fetch[i];
1492 if (i)
1493 fprintf(f, ", ");
1494 if (!fix.bits)
1495 fprintf(f, "0");
1496 else
1497 fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
1498 fix.u.format);
1499 }
1500 fprintf(f, "}\n");
1501 }
1502
1503 static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
1504 {
1505 const union si_shader_key *key = &shader->key;
1506 gl_shader_stage stage = shader->selector->stage;
1507
1508 fprintf(f, "SHADER KEY\n");
1509 fprintf(f, " source_blake3 = {");
1510 _mesa_blake3_print(f, shader->selector->info.base.source_blake3);
1511 fprintf(f, "}\n");
1512
1513 switch (stage) {
1514 case MESA_SHADER_VERTEX:
1515 si_dump_shader_key_vs(key, f);
1516 fprintf(f, " as_es = %u\n", key->ge.as_es);
1517 fprintf(f, " as_ls = %u\n", key->ge.as_ls);
1518 fprintf(f, " as_ngg = %u\n", key->ge.as_ngg);
1519 fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
1520 break;
1521
1522 case MESA_SHADER_TESS_CTRL:
1523 if (shader->selector->screen->info.gfx_level >= GFX9)
1524 si_dump_shader_key_vs(key, f);
1525
1526 fprintf(f, " opt.tes_prim_mode = %u\n", key->ge.opt.tes_prim_mode);
1527 fprintf(f, " opt.tes_reads_tess_factors = %u\n", key->ge.opt.tes_reads_tess_factors);
1528 fprintf(f, " opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
1529 fprintf(f, " opt.same_patch_vertices = %u\n", key->ge.opt.same_patch_vertices);
1530 break;
1531
1532 case MESA_SHADER_TESS_EVAL:
1533 fprintf(f, " as_es = %u\n", key->ge.as_es);
1534 fprintf(f, " as_ngg = %u\n", key->ge.as_ngg);
1535 fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->ge.mono.u.vs_export_prim_id);
1536 break;
1537
1538 case MESA_SHADER_GEOMETRY:
1539 if (shader->is_gs_copy_shader)
1540 break;
1541
1542 if (shader->selector->screen->info.gfx_level >= GFX9 &&
1543 key->ge.part.gs.es->stage == MESA_SHADER_VERTEX)
1544 si_dump_shader_key_vs(key, f);
1545
1546 fprintf(f, " mono.u.gs_tri_strip_adj_fix = %u\n", key->ge.mono.u.gs_tri_strip_adj_fix);
1547 fprintf(f, " as_ngg = %u\n", key->ge.as_ngg);
1548 break;
1549
1550 case MESA_SHADER_COMPUTE:
1551 break;
1552
1553 case MESA_SHADER_FRAGMENT:
1554 fprintf(f, " prolog.color_two_side = %u\n", key->ps.part.prolog.color_two_side);
1555 fprintf(f, " prolog.flatshade_colors = %u\n", key->ps.part.prolog.flatshade_colors);
1556 fprintf(f, " prolog.poly_stipple = %u\n", key->ps.part.prolog.poly_stipple);
1557 fprintf(f, " prolog.force_persp_sample_interp = %u\n",
1558 key->ps.part.prolog.force_persp_sample_interp);
1559 fprintf(f, " prolog.force_linear_sample_interp = %u\n",
1560 key->ps.part.prolog.force_linear_sample_interp);
1561 fprintf(f, " prolog.force_persp_center_interp = %u\n",
1562 key->ps.part.prolog.force_persp_center_interp);
1563 fprintf(f, " prolog.force_linear_center_interp = %u\n",
1564 key->ps.part.prolog.force_linear_center_interp);
1565 fprintf(f, " prolog.bc_optimize_for_persp = %u\n",
1566 key->ps.part.prolog.bc_optimize_for_persp);
1567 fprintf(f, " prolog.bc_optimize_for_linear = %u\n",
1568 key->ps.part.prolog.bc_optimize_for_linear);
1569 fprintf(f, " prolog.samplemask_log_ps_iter = %u\n",
1570 key->ps.part.prolog.samplemask_log_ps_iter);
1571 fprintf(f, " epilog.spi_shader_col_format = 0x%x\n",
1572 key->ps.part.epilog.spi_shader_col_format);
1573 fprintf(f, " epilog.color_is_int8 = 0x%X\n", key->ps.part.epilog.color_is_int8);
1574 fprintf(f, " epilog.color_is_int10 = 0x%X\n", key->ps.part.epilog.color_is_int10);
1575 fprintf(f, " epilog.last_cbuf = %u\n", key->ps.part.epilog.last_cbuf);
1576 fprintf(f, " epilog.alpha_func = %u\n", key->ps.part.epilog.alpha_func);
1577 fprintf(f, " epilog.alpha_to_one = %u\n", key->ps.part.epilog.alpha_to_one);
1578 fprintf(f, " epilog.alpha_to_coverage_via_mrtz = %u\n", key->ps.part.epilog.alpha_to_coverage_via_mrtz);
1579 fprintf(f, " epilog.clamp_color = %u\n", key->ps.part.epilog.clamp_color);
1580 fprintf(f, " epilog.dual_src_blend_swizzle = %u\n", key->ps.part.epilog.dual_src_blend_swizzle);
1581 fprintf(f, " epilog.rbplus_depth_only_opt = %u\n", key->ps.part.epilog.rbplus_depth_only_opt);
1582 fprintf(f, " epilog.kill_z = %u\n", key->ps.part.epilog.kill_z);
1583 fprintf(f, " epilog.kill_stencil = %u\n", key->ps.part.epilog.kill_stencil);
1584 fprintf(f, " epilog.kill_samplemask = %u\n", key->ps.part.epilog.kill_samplemask);
1585 fprintf(f, " mono.poly_line_smoothing = %u\n", key->ps.mono.poly_line_smoothing);
1586 fprintf(f, " mono.point_smoothing = %u\n", key->ps.mono.point_smoothing);
1587 fprintf(f, " mono.interpolate_at_sample_force_center = %u\n",
1588 key->ps.mono.interpolate_at_sample_force_center);
1589 fprintf(f, " mono.fbfetch_msaa = %u\n", key->ps.mono.fbfetch_msaa);
1590 fprintf(f, " mono.fbfetch_is_1D = %u\n", key->ps.mono.fbfetch_is_1D);
1591 fprintf(f, " mono.fbfetch_layered = %u\n", key->ps.mono.fbfetch_layered);
1592 break;
1593
1594 default:
1595 assert(0);
1596 }
1597
1598 if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||
1599 stage == MESA_SHADER_VERTEX) &&
1600 !key->ge.as_es && !key->ge.as_ls) {
1601 fprintf(f, " mono.remove_streamout = 0x%x\n", key->ge.mono.remove_streamout);
1602 fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs);
1603 fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances);
1604 fprintf(f, " opt.kill_pointsize = %u\n", key->ge.opt.kill_pointsize);
1605 fprintf(f, " opt.kill_layer = %u\n", key->ge.opt.kill_layer);
1606 fprintf(f, " opt.remove_streamout = %u\n", key->ge.opt.remove_streamout);
1607 fprintf(f, " opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling);
1608 fprintf(f, " opt.ngg_vs_streamout_num_verts_per_prim = %u\n",
1609 key->ge.opt.ngg_vs_streamout_num_verts_per_prim);
1610 }
1611
1612 if (stage <= MESA_SHADER_GEOMETRY)
1613 fprintf(f, " opt.prefer_mono = %u\n", key->ge.opt.prefer_mono);
1614 else
1615 fprintf(f, " opt.prefer_mono = %u\n", key->ps.opt.prefer_mono);
1616
1617 if (stage <= MESA_SHADER_GEOMETRY) {
1618 if (key->ge.opt.inline_uniforms) {
1619 fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
1620 key->ge.opt.inline_uniforms,
1621 key->ge.opt.inlined_uniform_values[0],
1622 key->ge.opt.inlined_uniform_values[1],
1623 key->ge.opt.inlined_uniform_values[2],
1624 key->ge.opt.inlined_uniform_values[3]);
1625 } else {
1626 fprintf(f, " opt.inline_uniforms = 0\n");
1627 }
1628 } else {
1629 if (key->ps.opt.inline_uniforms) {
1630 fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
1631 key->ps.opt.inline_uniforms,
1632 key->ps.opt.inlined_uniform_values[0],
1633 key->ps.opt.inlined_uniform_values[1],
1634 key->ps.opt.inlined_uniform_values[2],
1635 key->ps.opt.inlined_uniform_values[3]);
1636 } else {
1637 fprintf(f, " opt.inline_uniforms = 0\n");
1638 }
1639 }
1640 }
1641
1642 /* TODO: convert to nir_shader_instructions_pass */
1643 static bool si_nir_kill_outputs(nir_shader *nir, const union si_shader_key *key)
1644 {
1645 nir_function_impl *impl = nir_shader_get_entrypoint(nir);
1646 assert(impl);
1647 assert(nir->info.stage <= MESA_SHADER_GEOMETRY);
1648
1649 if (!key->ge.opt.kill_outputs &&
1650 !key->ge.opt.kill_pointsize &&
1651 !key->ge.opt.kill_layer &&
1652 !key->ge.opt.kill_clip_distances &&
1653 !(nir->info.outputs_written & BITFIELD64_BIT(VARYING_SLOT_LAYER))) {
1654 nir_metadata_preserve(impl, nir_metadata_all);
1655 return false;
1656 }
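/* Note: stores to VARYING_SLOT_LAYER are always processed below (their varying
 * store is removed unconditionally), which is why the early-out above also
 * checks outputs_written for LAYER even when no kill flags are set.
 */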
1657
1658 bool progress = false;
1659
1660 nir_foreach_block(block, impl) {
1661 nir_foreach_instr_safe(instr, block) {
1662 if (instr->type != nir_instr_type_intrinsic)
1663 continue;
1664
1665 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1666 if (intr->intrinsic != nir_intrinsic_store_output)
1667 continue;
1668
1669 /* No indirect indexing allowed. */
1670 ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
1671 assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);
1672
1673 assert(intr->num_components == 1); /* only scalar stores expected */
1674 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
1675
1676 if (nir_slot_is_varying(sem.location, MESA_SHADER_FRAGMENT) &&
1677 key->ge.opt.kill_outputs &
1678 (1ull << si_shader_io_get_unique_index(sem.location)))
1679 progress |= nir_remove_varying(intr, MESA_SHADER_FRAGMENT);
1680
1681 switch (sem.location) {
1682 case VARYING_SLOT_PSIZ:
1683 if (key->ge.opt.kill_pointsize)
1684 progress |= nir_remove_sysval_output(intr, MESA_SHADER_FRAGMENT);
1685 break;
1686
1687 case VARYING_SLOT_CLIP_VERTEX:
1688 /* TODO: We should only kill the specific clip planes required by kill_clip_distances,
1689 * not the whole gl_ClipVertex. Lower ClipVertex in NIR.
1690 */
1691 if ((key->ge.opt.kill_clip_distances & SI_USER_CLIP_PLANE_MASK) ==
1692 SI_USER_CLIP_PLANE_MASK)
1693 progress |= nir_remove_sysval_output(intr, MESA_SHADER_FRAGMENT);
1694 break;
1695
1696 case VARYING_SLOT_CLIP_DIST0:
1697 case VARYING_SLOT_CLIP_DIST1:
1698 if (key->ge.opt.kill_clip_distances) {
1699 assert(nir_intrinsic_src_type(intr) == nir_type_float32);
1700 unsigned index = (sem.location - VARYING_SLOT_CLIP_DIST0) * 4 +
1701 nir_intrinsic_component(intr);
1702
1703 if (key->ge.opt.kill_clip_distances & BITFIELD_BIT(index))
1704 progress |= nir_remove_sysval_output(intr, MESA_SHADER_FRAGMENT);
1705 }
1706 break;
1707
1708 case VARYING_SLOT_LAYER:
1709 /* LAYER is never passed to FS. Instead, we load it there as a system value. */
1710 progress |= nir_remove_varying(intr, MESA_SHADER_FRAGMENT);
1711
1712 if (key->ge.opt.kill_layer)
1713 progress |= nir_remove_sysval_output(intr, MESA_SHADER_FRAGMENT);
1714 break;
1715 }
1716 }
1717 }
1718
1719 if (progress) {
1720 nir_metadata_preserve(impl, nir_metadata_control_flow);
1721 } else {
1722 nir_metadata_preserve(impl, nir_metadata_all);
1723 }
1724
1725 return progress;
1726 }
1727
1728 /* Remove PS output components from NIR if they are disabled by spi_shader_col_format. */
1729 static bool kill_ps_outputs_cb(struct nir_builder *b, nir_instr *instr, void *_key)
1730 {
1731 if (instr->type != nir_instr_type_intrinsic)
1732 return false;
1733
1734 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1735 if (intr->intrinsic != nir_intrinsic_store_output)
1736 return false;
1737
1738 /* No indirect indexing allowed. */
1739 ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
1740 assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);
1741
1742 unsigned location = nir_intrinsic_io_semantics(intr).location;
1743 const union si_shader_key *key = _key;
1744
1745 switch (location) {
1746 case FRAG_RESULT_DEPTH:
1747 case FRAG_RESULT_STENCIL:
1748 return false;
1749
1750 case FRAG_RESULT_SAMPLE_MASK:
1751 if (key->ps.part.epilog.kill_samplemask) {
1752 nir_instr_remove(instr);
1753 return true;
1754 }
1755 return false;
1756 }
1757
1758 /* Color outputs. */
1759 unsigned component = nir_intrinsic_component(intr);
1760 unsigned comp_mask = BITFIELD_RANGE(component, intr->num_components);
1761 unsigned cb_shader_mask = ac_get_cb_shader_mask(key->ps.part.epilog.spi_shader_col_format);
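/* cb_shader_mask holds one 4-bit RGBA write mask per color buffer, derived from
 * spi_shader_col_format: e.g. a buffer exporting only R contributes 0x1 in its
 * nibble, and a disabled buffer contributes 0.
 */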
1762
1763 /* Preserve alpha if alpha testing or alpha-to-coverage via MRTZ is enabled. */
1764 if (key->ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS ||
1765 key->ps.part.epilog.alpha_to_coverage_via_mrtz)
1766 cb_shader_mask |= 1 << 3;
1767
1768 /* If COLOR is broadcast to multiple color buffers, combine their masks. */
1769 if (location == FRAG_RESULT_COLOR) {
1770 for (unsigned i = 1; i <= key->ps.part.epilog.last_cbuf; i++)
1771 cb_shader_mask |= (cb_shader_mask >> (i * 4)) & 0xf;
1772 }
1773
1774 unsigned index = location == FRAG_RESULT_COLOR ? 0 : location - FRAG_RESULT_DATA0;
1775 unsigned output_mask = (cb_shader_mask >> (index * 4)) & 0xf;
1776
1777 if ((output_mask & comp_mask) == comp_mask)
1778 return false;
1779
1780 if (!(output_mask & comp_mask)) {
1781 nir_instr_remove(instr);
1782 return true;
1783 }
1784
1785 /* Fill disabled components with undef. */
1786 b->cursor = nir_before_instr(instr);
1787 nir_def *new_value = intr->src[0].ssa;
1788 nir_def *undef = nir_undef(b, 1, new_value->bit_size);
1789
1790 unsigned kill_mask = (~output_mask & comp_mask) >> component;
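/* e.g. output_mask = 0x7 (RGB enabled) with comp_mask = 0xf replaces only the
 * alpha channel of the stored vector with undef.
 */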
1791 u_foreach_bit(i, kill_mask) {
1792 new_value = nir_vector_insert_imm(b, new_value, undef, i);
1793 }
1794
1795 nir_src_rewrite(&intr->src[0], new_value);
1796 return true;
1797 }
1798
1799 static bool si_nir_kill_ps_outputs(nir_shader *nir, const union si_shader_key *key)
1800 {
1801 assert(nir->info.stage == MESA_SHADER_FRAGMENT);
1802 return nir_shader_instructions_pass(nir, kill_ps_outputs_cb,
1803 nir_metadata_control_flow, (void*)key);
1804 }
1805
1806 static bool clamp_vertex_color_instr(nir_builder *b,
1807 nir_intrinsic_instr *intrin, void *state)
1808 {
1809 if (intrin->intrinsic != nir_intrinsic_store_output)
1810 return false;
1811
1812 unsigned location = nir_intrinsic_io_semantics(intrin).location;
1813 if (location != VARYING_SLOT_COL0 && location != VARYING_SLOT_COL1 &&
1814 location != VARYING_SLOT_BFC0 && location != VARYING_SLOT_BFC1)
1815 return false;
1816
1817 /* no indirect output */
1818 assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]));
1819 /* only scalar output */
1820 assert(intrin->src[0].ssa->num_components == 1);
1821
1822 b->cursor = nir_before_instr(&intrin->instr);
1823
1824 nir_def *color = intrin->src[0].ssa;
1825 nir_def *clamp = nir_load_clamp_vertex_color_amd(b);
1826 nir_def *new_color = nir_bcsel(b, clamp, nir_fsat(b, color), color);
1827 nir_src_rewrite(&intrin->src[0], new_color);
1828
1829 return true;
1830 }
1831
1832 static bool si_nir_clamp_vertex_color(nir_shader *nir)
1833 {
1834 uint64_t mask = VARYING_BIT_COL0 | VARYING_BIT_COL1 | VARYING_BIT_BFC0 | VARYING_BIT_BFC1;
1835 if (!(nir->info.outputs_written & mask))
1836 return false;
1837
1838 return nir_shader_intrinsics_pass(nir, clamp_vertex_color_instr,
1839 nir_metadata_control_flow,
1840 NULL);
1841 }
1842
1843 static unsigned si_map_io_driver_location(unsigned semantic)
1844 {
1845 if ((semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_TESS_MAX) ||
1846 semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
1847 semantic == VARYING_SLOT_TESS_LEVEL_OUTER)
1848 return ac_shader_io_get_unique_index_patch(semantic);
1849
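/* Everything else uses the generic per-vertex unique-slot numbering. */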
1850 return si_shader_io_get_unique_index(semantic);
1851 }
1852
1853 static bool si_lower_io_to_mem(struct si_shader *shader, nir_shader *nir)
1854 {
1855 struct si_shader_selector *sel = shader->selector;
1856 struct si_shader_selector *next_sel = shader->next_shader ? shader->next_shader->selector : sel;
1857 const union si_shader_key *key = &shader->key;
1858 const bool is_gfx9_mono_tcs = shader->is_monolithic &&
1859 next_sel->stage == MESA_SHADER_TESS_CTRL &&
1860 sel->screen->info.gfx_level >= GFX9;
1861
1862 if (nir->info.stage == MESA_SHADER_VERTEX) {
1863 if (key->ge.as_ls) {
1864 NIR_PASS_V(nir, ac_nir_lower_ls_outputs_to_mem,
1865 is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
1866 sel->screen->info.gfx_level,
1867 key->ge.opt.same_patch_vertices,
1868 is_gfx9_mono_tcs ? next_sel->info.tcs_inputs_via_temp : 0,
1869 is_gfx9_mono_tcs ? next_sel->info.tcs_inputs_via_lds : ~0ull);
1870 return true;
1871 } else if (key->ge.as_es) {
1872 NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
1873 sel->screen->info.gfx_level, sel->info.esgs_vertex_stride, ~0ULL);
1874 return true;
1875 }
1876 } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
1877 NIR_PASS_V(nir, ac_nir_lower_hs_inputs_to_mem,
1878 is_gfx9_mono_tcs ? NULL : si_map_io_driver_location,
1879 sel->screen->info.gfx_level, key->ge.opt.same_patch_vertices,
1880 sel->info.tcs_inputs_via_temp, sel->info.tcs_inputs_via_lds);
1881
1882 /* Used by hs_emit_write_tess_factors() when the shader is monolithic. */
1883 if (nir->info.tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED)
1884 nir->info.tess._primitive_mode = key->ge.opt.tes_prim_mode;
1885
1886 nir_tcs_info tcs_info;
1887 nir_gather_tcs_info(nir, &tcs_info, nir->info.tess._primitive_mode,
1888 nir->info.tess.spacing);
1889
1890 NIR_PASS_V(nir, ac_nir_lower_hs_outputs_to_mem, &tcs_info, si_map_io_driver_location,
1891 sel->screen->info.gfx_level,
1892 ~0ULL, ~0U, /* no TES inputs filter */
1893 shader->wave_size);
1894 return true;
1895 } else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
1896 NIR_PASS_V(nir, ac_nir_lower_tes_inputs_to_mem, si_map_io_driver_location);
1897
1898 if (key->ge.as_es) {
1899 NIR_PASS_V(nir, ac_nir_lower_es_outputs_to_mem, si_map_io_driver_location,
1900 sel->screen->info.gfx_level, sel->info.esgs_vertex_stride, ~0ULL);
1901 }
1902
1903 return true;
1904 } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
1905 NIR_PASS_V(nir, ac_nir_lower_gs_inputs_to_mem, si_map_io_driver_location,
1906 sel->screen->info.gfx_level, key->ge.mono.u.gs_tri_strip_adj_fix);
1907 return true;
1908 }
1909
1910 return false;
1911 }
1912
1913 static void si_lower_ngg(struct si_shader *shader, nir_shader *nir)
1914 {
1915 struct si_shader_selector *sel = shader->selector;
1916 const union si_shader_key *key = &shader->key;
1917 assert(key->ge.as_ngg);
1918
1919 uint8_t clip_cull_dist_mask =
1920 (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) |
1921 sel->info.culldist_mask;
1922
1923 ac_nir_lower_ngg_options options = {
1924 .family = sel->screen->info.family,
1925 .gfx_level = sel->screen->info.gfx_level,
1926 .max_workgroup_size = si_get_max_workgroup_size(shader),
1927 .wave_size = shader->wave_size,
1928 .can_cull = si_shader_culling_enabled(shader),
1929 .disable_streamout = !si_shader_uses_streamout(shader),
1930 .vs_output_param_offset = shader->info.vs_output_param_offset,
1931 .has_param_exports = shader->info.nr_param_exports,
1932 .clip_cull_dist_mask = clip_cull_dist_mask,
1933 .kill_pointsize = key->ge.opt.kill_pointsize,
1934 .kill_layer = key->ge.opt.kill_layer,
1935 .force_vrs = sel->screen->options.vrs2x2,
1936 .use_gfx12_xfb_intrinsic = !nir->info.use_aco_amd,
1937 };
1938
1939 if (nir->info.stage == MESA_SHADER_VERTEX ||
1940 nir->info.stage == MESA_SHADER_TESS_EVAL) {
1941 /* Per-instance inputs, used to remove instance loads after culling. */
1942 unsigned instance_rate_inputs = 0;
1943
1944 if (nir->info.stage == MESA_SHADER_VERTEX) {
1945 instance_rate_inputs = key->ge.mono.instance_divisor_is_one |
1946 key->ge.mono.instance_divisor_is_fetched;
1947
1948 /* Manually mark the instance ID as used, so the shader can repack it. */
1949 if (instance_rate_inputs)
1950 BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
1951 } else {
1952 /* Manually mark the primitive ID as used, so the shader can repack it. */
1953 if (key->ge.mono.u.vs_export_prim_id)
1954 BITSET_SET(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
1955 }
1956
1957 unsigned clip_plane_enable =
1958 SI_NGG_CULL_GET_CLIP_PLANE_ENABLE(key->ge.opt.ngg_culling);
1959 unsigned num_vertices = si_get_num_vertices_per_output_prim(shader);
1960
1961 options.num_vertices_per_primitive = num_vertices ? num_vertices : 3;
1962 options.early_prim_export = gfx10_ngg_export_prim_early(shader);
1963 options.passthrough = gfx10_is_ngg_passthrough(shader);
1964 options.use_edgeflags = gfx10_has_variable_edgeflags(shader);
1965 options.has_gen_prim_query = options.has_xfb_prim_query =
1966 sel->screen->info.gfx_level >= GFX11 && !nir->info.vs.blit_sgprs_amd;
1967 options.export_primitive_id = key->ge.mono.u.vs_export_prim_id;
1968 options.instance_rate_inputs = instance_rate_inputs;
1969 options.user_clip_plane_enable_mask = clip_plane_enable;
1970
1971 NIR_PASS_V(nir, ac_nir_lower_ngg_nogs, &options);
1972 } else {
1973 assert(nir->info.stage == MESA_SHADER_GEOMETRY);
1974
1975 options.gs_out_vtx_bytes = sel->info.gsvs_vertex_size;
1976 options.has_gen_prim_query = options.has_xfb_prim_query =
1977 sel->screen->info.gfx_level >= GFX11;
1978 options.has_gs_invocations_query = sel->screen->info.gfx_level < GFX11;
1979 options.has_gs_primitives_query = true;
1980
1981 /* For monolithic ES/GS: add a vscnt wait when the GS exports pos0. */
1982 if (key->ge.part.gs.es)
1983 nir->info.writes_memory |= key->ge.part.gs.es->info.base.writes_memory;
1984
1985 NIR_PASS_V(nir, ac_nir_lower_ngg_gs, &options);
1986 }
1987
1988 /* The NGG lowering above may generate vector output stores. */
1989 NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
1990 }
1991
1992 struct nir_shader *si_deserialize_shader(struct si_shader_selector *sel)
1993 {
1994 struct pipe_screen *screen = &sel->screen->b;
1995 const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR, sel->stage);
1996
1997 struct blob_reader blob_reader;
1998 blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
1999 return nir_deserialize(NULL, options, &blob_reader);
2000 }
2001
2002 static void si_nir_assign_param_offsets(nir_shader *nir, struct si_shader *shader,
2003 int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS])
2004 {
2005 struct si_shader_selector *sel = shader->selector;
2006 struct si_shader_binary_info *info = &shader->info;
2007
2008 uint64_t outputs_written = 0;
2009 uint32_t outputs_written_16bit = 0;
2010
2011 nir_function_impl *impl = nir_shader_get_entrypoint(nir);
2012 assert(impl);
2013
2014 nir_foreach_block(block, impl) {
2015 nir_foreach_instr_safe(instr, block) {
2016 if (instr->type != nir_instr_type_intrinsic)
2017 continue;
2018
2019 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
2020 if (intr->intrinsic != nir_intrinsic_store_output)
2021 continue;
2022
2023 /* No indirect indexing allowed. */
2024 ASSERTED nir_src offset = *nir_get_io_offset_src(intr);
2025 assert(nir_src_is_const(offset) && nir_src_as_uint(offset) == 0);
2026
2027 assert(intr->num_components == 1); /* only scalar stores expected */
2028 nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
2029
2030 if (sem.location >= VARYING_SLOT_VAR0_16BIT)
2031 outputs_written_16bit |= BITFIELD_BIT(sem.location - VARYING_SLOT_VAR0_16BIT);
2032 else
2033 outputs_written |= BITFIELD64_BIT(sem.location);
2034
2035 /* Assign the param index if it's unassigned. */
2036 if (nir_slot_is_varying(sem.location, MESA_SHADER_FRAGMENT) && !sem.no_varying &&
2037 (sem.gs_streams & 0x3) == 0 &&
2038 info->vs_output_param_offset[sem.location] == AC_EXP_PARAM_DEFAULT_VAL_0000) {
2039 /* The semantic and the base should be the same as in si_shader_info. */
2040 assert(sem.location == sel->info.output_semantic[nir_intrinsic_base(intr)]);
2041 /* It must not be remapped (duplicated). */
2042 assert(slot_remap[sem.location] == -1);
2043
2044 info->vs_output_param_offset[sem.location] = info->nr_param_exports++;
2045 }
2046 }
2047 }
2048
2049 /* Duplicated outputs are redirected here. */
2050 for (unsigned i = 0; i < NUM_TOTAL_VARYING_SLOTS; i++) {
2051 if (slot_remap[i] >= 0)
2052 info->vs_output_param_offset[i] = info->vs_output_param_offset[slot_remap[i]];
2053 }
2054
2055 if (shader->key.ge.mono.u.vs_export_prim_id) {
2056 info->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = info->nr_param_exports++;
2057 }
2058
2059 /* Update the outputs-written info because some outputs may have been removed above. */
2060 nir->info.outputs_written = outputs_written;
2061 nir->info.outputs_written_16bit = outputs_written_16bit;
2062 }
2063
2064 static void si_assign_param_offsets(nir_shader *nir, struct si_shader *shader)
2065 {
2066 /* Initialize this first. */
2067 shader->info.nr_param_exports = 0;
2068
2069 STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
2070 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
2071 sizeof(shader->info.vs_output_param_offset));
2072
2073 /* A slot remapping table for duplicated outputs, so that 1 vertex shader output can be
2074 * mapped to multiple fragment shader inputs.
2075 */
2076 int8_t slot_remap[NUM_TOTAL_VARYING_SLOTS];
2077 memset(slot_remap, -1, NUM_TOTAL_VARYING_SLOTS);
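/* Illustrative example: if ac_nir_optimize_outputs finds that VAR3 always equals
 * VAR0, it sets slot_remap[VAR3] = VAR0, so both slots end up sharing one param
 * export when offsets are assigned.
 */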
2078
2079 /* This sets DEFAULT_VAL for constant outputs in vs_output_param_offset. */
2080 /* TODO: This doesn't affect GS. */
2081 NIR_PASS_V(nir, ac_nir_optimize_outputs, false, slot_remap,
2082 shader->info.vs_output_param_offset);
2083
2084 /* Assign the non-constant outputs. */
2085 /* TODO: Use this for the GS copy shader too. */
2086 si_nir_assign_param_offsets(nir, shader, slot_remap);
2087 }
2088
2089 static unsigned si_get_nr_pos_exports(const struct si_shader_selector *sel,
2090 const union si_shader_key *key)
2091 {
2092 const struct si_shader_info *info = &sel->info;
2093
2094 /* Must have a position export. */
2095 unsigned nr_pos_exports = 1;
2096
2097 if ((info->writes_psize && !key->ge.opt.kill_pointsize) ||
2098 (info->writes_edgeflag && !key->ge.as_ngg) ||
2099 (info->writes_layer && !key->ge.opt.kill_layer) ||
2100 info->writes_viewport_index || sel->screen->options.vrs2x2) {
2101 nr_pos_exports++;
2102 }
2103
2104 unsigned clipdist_mask =
2105 (info->clipdist_mask & ~key->ge.opt.kill_clip_distances) | info->culldist_mask;
2106
2107 for (int i = 0; i < 2; i++) {
2108 if (clipdist_mask & BITFIELD_RANGE(i * 4, 4))
2109 nr_pos_exports++;
2110 }
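/* e.g. a VS writing gl_PointSize (kept) and 5 clip distances exports 4 position
 * slots: pos, the misc vector, and two clip-distance vectors.
 */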
2111
2112 return nr_pos_exports;
2113 }
2114
2115 static bool lower_ps_load_color_intrinsic(nir_builder *b, nir_instr *instr, void *state)
2116 {
2117 nir_def **colors = (nir_def **)state;
2118
2119 if (instr->type != nir_instr_type_intrinsic)
2120 return false;
2121
2122 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
2123
2124 if (intrin->intrinsic != nir_intrinsic_load_color0 &&
2125 intrin->intrinsic != nir_intrinsic_load_color1)
2126 return false;
2127
2128 unsigned index = intrin->intrinsic == nir_intrinsic_load_color0 ? 0 : 1;
2129 assert(colors[index]);
2130
2131 nir_def_replace(&intrin->def, colors[index]);
2132 return true;
2133 }
2134
2135 static bool si_nir_lower_ps_color_input(nir_shader *nir, const union si_shader_key *key,
2136 const struct si_shader_info *info)
2137 {
2138 bool progress = false;
2139 nir_function_impl *impl = nir_shader_get_entrypoint(nir);
2140
2141 nir_builder builder = nir_builder_at(nir_before_impl(impl));
2142 nir_builder *b = &builder;
2143
2144 /* Build ready-to-use color values at the beginning of the shader. */
2145 nir_def *colors[2] = {0};
2146 for (int i = 0; i < 2; i++) {
2147 if (!(info->colors_read & (0xf << (i * 4))))
2148 continue;
2149
2150 unsigned color_base = info->color_attr_index[i];
2151 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
2152 * otherwise it's at offset "num_inputs".
2153 */
2154 unsigned back_color_base = info->num_inputs;
2155 if (i == 1 && (info->colors_read & 0xf))
2156 back_color_base += 1;
2157
2158 enum glsl_interp_mode interp_mode = info->color_interpolate[i];
2159 if (interp_mode == INTERP_MODE_COLOR) {
2160 interp_mode = key->ps.part.prolog.flatshade_colors ?
2161 INTERP_MODE_FLAT : INTERP_MODE_SMOOTH;
2162 }
2163
2164 nir_def *back_color = NULL;
2165 if (interp_mode == INTERP_MODE_FLAT) {
2166 colors[i] = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
2167 .base = color_base,
2168 .io_semantics.location = VARYING_SLOT_COL0 + i,
2169 .io_semantics.num_slots = 1);
2170
2171 if (key->ps.part.prolog.color_two_side) {
2172 back_color = nir_load_input(b, 4, 32, nir_imm_int(b, 0),
2173 .base = back_color_base,
2174 .io_semantics.location = VARYING_SLOT_BFC0 + i,
2175 .io_semantics.num_slots = 1);
2176 }
2177 } else {
2178 nir_intrinsic_op op = 0;
2179 switch (info->color_interpolate_loc[i]) {
2180 case TGSI_INTERPOLATE_LOC_CENTER:
2181 op = nir_intrinsic_load_barycentric_pixel;
2182 break;
2183 case TGSI_INTERPOLATE_LOC_CENTROID:
2184 op = nir_intrinsic_load_barycentric_centroid;
2185 break;
2186 case TGSI_INTERPOLATE_LOC_SAMPLE:
2187 op = nir_intrinsic_load_barycentric_sample;
2188 break;
2189 default:
2190 unreachable("invalid color interpolate location");
2191 break;
2192 }
2193
2194 nir_def *barycentric = nir_load_barycentric(b, op, interp_mode);
2195
2196 colors[i] =
2197 nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0),
2198 .base = color_base,
2199 .io_semantics.location = VARYING_SLOT_COL0 + i,
2200 .io_semantics.num_slots = 1);
2201
2202 if (key->ps.part.prolog.color_two_side) {
2203 back_color =
2204 nir_load_interpolated_input(b, 4, 32, barycentric, nir_imm_int(b, 0),
2205 .base = back_color_base,
2206 .io_semantics.location = VARYING_SLOT_BFC0 + i,
2207 .io_semantics.num_slots = 1);
2208 }
2209 }
2210
2211 if (back_color) {
2212 nir_def *is_front_face = nir_load_front_face(b, 1);
2213 colors[i] = nir_bcsel(b, is_front_face, colors[i], back_color);
2214 }
2215
2216 progress = true;
2217 }
2218
2219 /* Lower nir_load_color0/1 to use the color values built above. */
2220 return nir_shader_instructions_pass(nir, lower_ps_load_color_intrinsic,
2221 nir_metadata_control_flow,
2222 colors) || progress;
2223 }
2224
2225 static void si_nir_emit_polygon_stipple(nir_shader *nir, struct si_shader_args *args)
2226 {
2227 nir_function_impl *impl = nir_shader_get_entrypoint(nir);
2228
2229 nir_builder builder = nir_builder_at(nir_before_impl(impl));
2230 nir_builder *b = &builder;
2231
2232 /* Load the buffer descriptor. */
2233 nir_def *desc =
2234 si_nir_load_internal_binding(b, args, SI_PS_CONST_POLY_STIPPLE, 4);
2235
2236 /* Use the fixed-point gl_FragCoord input.
2237 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
2238 * per coordinate to get the repeating effect.
2239 */
2240 nir_def *pos_x = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 0, 5);
2241 nir_def *pos_y = ac_nir_unpack_arg(b, &args->ac, args->ac.pos_fixed_pt, 16, 5);
2242
2243 nir_def *zero = nir_imm_int(b, 0);
2244 /* The stipple pattern is 32x32, each row has 32 bits. */
2245 nir_def *offset = nir_ishl_imm(b, pos_y, 2);
2246 nir_def *row = nir_load_buffer_amd(b, 1, 32, desc, offset, zero, zero);
2247 nir_def *bit = nir_ubfe(b, row, pos_x, nir_imm_int(b, 1));
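/* e.g. a fragment at (x=21, y=7) reads the 32-bit row at byte offset 7*4 = 28
 * and extracts bit 21 of that row.
 */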
2248
2249 nir_def *pass = nir_i2b(b, bit);
2250 nir_discard_if(b, nir_inot(b, pass));
2251 }
2252
2253 bool si_should_clear_lds(struct si_screen *sscreen, const struct nir_shader *shader)
2254 {
2255 return gl_shader_stage_is_compute(shader->info.stage) &&
2256 shader->info.shared_size > 0 && sscreen->options.clear_lds;
2257 }
2258
2259 static bool clamp_shadow_comparison_value(nir_builder *b, nir_instr *instr, void *state)
2260 {
2261 if (instr->type != nir_instr_type_tex)
2262 return false;
2263
2264 nir_tex_instr *tex = nir_instr_as_tex(instr);
2265 if (!tex->is_shadow)
2266 return false;
2267
2268 b->cursor = nir_before_instr(instr);
2269
2270 int samp_index = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle);
2271 int comp_index = nir_tex_instr_src_index(tex, nir_tex_src_comparator);
2272 assert(samp_index >= 0 && comp_index >= 0);
2273
2274 nir_def *sampler = tex->src[samp_index].src.ssa;
2275 nir_def *compare = tex->src[comp_index].src.ssa;
2276 /* The sampler must have been lowered to a descriptor. */
2277 assert(sampler->num_components > 1);
2278
2279 nir_def *upgraded = nir_channel(b, sampler, 3);
2280 upgraded = nir_i2b(b, nir_ubfe_imm(b, upgraded, 29, 1));
2281
2282 nir_def *clamped = nir_fsat(b, compare);
2283 compare = nir_bcsel(b, upgraded, clamped, compare);
2284
2285 nir_src_rewrite(&tex->src[comp_index].src, compare);
2286 return true;
2287 }
2288
2289 static bool si_nir_clamp_shadow_comparison_value(nir_shader *nir)
2290 {
2291 /* Section 8.23.1 (Depth Texture Comparison Mode) of the
2292 * OpenGL 4.5 spec says:
2293 *
2294 * "If the texture’s internal format indicates a fixed-point
2295 * depth texture, then D_t and D_ref are clamped to the
2296 * range [0, 1]; otherwise no clamping is performed."
2297 *
2298 * TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
2299 * so the depth comparison value isn't clamped for Z16 and
2300 * Z24 anymore. Do it manually here for GFX8-9; GFX10 has
2301 * an explicitly clamped 32-bit float format.
2302 */
2303 return nir_shader_instructions_pass(nir, clamp_shadow_comparison_value,
2304 nir_metadata_control_flow,
2305 NULL);
2306 }
2307
2308 static struct nir_shader *si_get_nir_shader(struct si_shader *shader, struct si_shader_args *args,
2309 bool *free_nir, ac_nir_gs_output_info *output_info)
2310 {
2311 struct si_shader_selector *sel = shader->selector;
2312 const union si_shader_key *key = &shader->key;
2313
2314 nir_shader *nir;
2315 *free_nir = false;
2316
2317 if (sel->nir) {
2318 nir = sel->nir;
2319 } else if (sel->nir_binary) {
2320 nir = si_deserialize_shader(sel);
2321 *free_nir = true;
2322 } else {
2323 return NULL;
2324 }
2325
2326 bool progress = false;
2327 bool late_opts = false;
2328
2329 const char *original_name = NULL;
2330 if (unlikely(should_print_nir(nir))) {
2331 /* Modify the shader's name so that each variant gets its own name. */
2332 original_name = ralloc_strdup(nir, nir->info.name);
2333 ralloc_asprintf_append((char **)&nir->info.name, "-%08x", _mesa_hash_data(key, sizeof(*key)));
2334
2335 /* Dummy pass to get the starting point. */
2336 printf("nir_dummy_pass\n");
2337 nir_print_shader(nir, stdout);
2338 }
2339
2340 si_init_shader_args(shader, args, &nir->info);
2341
2342 /* Kill outputs according to the shader key. */
2343 if (nir->info.stage <= MESA_SHADER_GEOMETRY)
2344 NIR_PASS(progress, nir, si_nir_kill_outputs, key);
2345
2346 NIR_PASS(progress, nir, ac_nir_lower_tex,
2347 &(ac_nir_lower_tex_options){
2348 .gfx_level = sel->screen->info.gfx_level,
2349 .lower_array_layer_round_even = !sel->screen->info.conformant_trunc_coord,
2350 });
2351
2352 if (nir->info.uses_resource_info_query)
2353 NIR_PASS(progress, nir, ac_nir_lower_resinfo, sel->screen->info.gfx_level);
2354
2355 bool inline_uniforms = false;
2356 uint32_t *inlined_uniform_values;
2357 si_get_inline_uniform_state((union si_shader_key*)key, nir->info.stage,
2358 &inline_uniforms, &inlined_uniform_values);
2359
2360 if (inline_uniforms) {
2361 assert(*free_nir);
2362
2363 /* Most places use shader information from the default variant, not
2364 * the optimized variant. These are the things that the driver looks at
2365 * in optimized variants and the list of things that we need to do.
2366 *
2367 * The driver takes into account these things if they suddenly disappear
2368 * from the shader code:
2369 * - Register usage and code size decrease (obvious)
2370 * - Eliminated PS system values are disabled by LLVM
2371 * (FragCoord, FrontFace, barycentrics)
2372 * - VS/TES/GS param exports are eliminated if they are undef.
2373 * The param space for eliminated outputs is also not allocated.
2374 * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
2375 * - TCS output stores are eliminated
2376 * - Eliminated PS inputs are removed from PS.NUM_INTERP.
2377 *
2378 * TODO: These are things the driver ignores in the final shader code
2379 * and relies on the default shader info.
2380 * - System values in VS, TCS, TES, GS are not eliminated
2381 * - uses_discard - if it changed to false
2382 * - writes_memory - if it changed to false
2383 * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not
2384 * eliminated
2385 * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS)
2386 * GS outputs are eliminated except for the temporary LDS.
2387 * Clip distances, gl_PointSize, gl_Layer and PS outputs are eliminated based
2388 * on current states, so we don't care about the shader code.
2389 *
2390 * TODO: Merged shaders don't inline uniforms for the first stage.
2391 * VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS.
2392 * (key == NULL for the first stage here)
2393 *
2394 * TODO: Compute shaders don't support inlinable uniforms, because they
2395 * don't have shader variants.
2396 *
2397 * TODO: The driver uses a linear search to find a shader variant. This
2398 * can be really slow if we get too many variants due to uniform inlining.
2399 */
2400 NIR_PASS_V(nir, nir_inline_uniforms, nir->info.num_inlinable_uniforms,
2401 inlined_uniform_values, nir->info.inlinable_uniform_dw_offsets);
2402 progress = true;
2403 }
2404
2405 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
2406 /* This uses the epilog key, so only monolithic shaders can call this. */
2407 if (shader->is_monolithic)
2408 NIR_PASS(progress, nir, si_nir_kill_ps_outputs, key);
2409
2410 if (key->ps.mono.poly_line_smoothing)
2411 NIR_PASS(progress, nir, nir_lower_poly_line_smooth, SI_NUM_SMOOTH_AA_SAMPLES);
2412
2413 if (key->ps.mono.point_smoothing)
2414 NIR_PASS(progress, nir, nir_lower_point_smooth, true);
2415
2416 NIR_PASS(progress, nir, nir_lower_fragcoord_wtrans);
2417 }
2418
2419 /* This must be before si_nir_lower_resource. */
2420 if (!sel->screen->info.has_image_opcodes)
2421 NIR_PASS(progress, nir, ac_nir_lower_image_opcodes);
2422
2423 /* LLVM does not handle this well; the LLVM backend lowers it with a waterfall loop instead. */
2424 if (nir->info.use_aco_amd && sel->info.has_non_uniform_tex_access) {
2425 nir_lower_non_uniform_access_options options = {
2426 .types = nir_lower_non_uniform_texture_access,
2427 };
2428 NIR_PASS(progress, nir, nir_lower_non_uniform_access, &options);
2429 }
2430
2431 bool is_last_vgt_stage =
2432 (nir->info.stage == MESA_SHADER_VERTEX ||
2433 nir->info.stage == MESA_SHADER_TESS_EVAL ||
2434 (nir->info.stage == MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg)) &&
2435 !shader->key.ge.as_ls && !shader->key.ge.as_es;
2436
2437 /* Legacy GS is not the last VGT stage because it has a GS copy shader. */
2438 bool is_legacy_gs = nir->info.stage == MESA_SHADER_GEOMETRY && !key->ge.as_ngg;
2439
2440 if (is_last_vgt_stage || is_legacy_gs)
2441 NIR_PASS(progress, nir, si_nir_clamp_vertex_color);
2442
2443 if (progress) {
2444 si_nir_opts(sel->screen, nir, true);
2445 late_opts = true;
2446 progress = false;
2447 }
2448
2449 /* Lower large variables that are always constant with load_constant intrinsics, which
2450 * get turned into PC-relative loads from a data section next to the shader.
2451 *
2452 * Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
2453 * this should be done after that.
2454 *
2455 * The pass crashes if there are dead temps of lowered IO interface types, so remove
2456 * them first.
2457 */
2458 NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
2459 NIR_PASS(progress, nir, nir_opt_large_constants, glsl_get_natural_size_align_bytes, 16);
2460
2461 /* Loop unrolling caused by uniform inlining can help eliminate indirect indexing, so
2462 * this should be done after that.
2463 */
2464 progress |= ac_nir_lower_indirect_derefs(nir, sel->screen->info.gfx_level);
2465
2466 if (nir->info.stage == MESA_SHADER_VERTEX)
2467 NIR_PASS(progress, nir, si_nir_lower_vs_inputs, shader, args);
2468
2469 progress |= si_lower_io_to_mem(shader, nir);
2470
2471 if (is_last_vgt_stage) {
2472 /* Assign param export indices. */
2473 si_assign_param_offsets(nir, shader);
2474
2475 /* Set the number of position exports. */
2476 shader->info.nr_pos_exports = si_get_nr_pos_exports(sel, key);
2477
2478 if (key->ge.as_ngg) {
2479 /* Lower last VGT NGG shader stage. */
2480 si_lower_ngg(shader, nir);
2481 } else if (nir->info.stage == MESA_SHADER_VERTEX ||
2482 nir->info.stage == MESA_SHADER_TESS_EVAL) {
2483 /* Lower the last VGT non-NGG VS/TES shader stage. */
2484 unsigned clip_cull_mask =
2485 (sel->info.clipdist_mask & ~key->ge.opt.kill_clip_distances) |
2486 sel->info.culldist_mask;
2487
2488 NIR_PASS_V(nir, ac_nir_lower_legacy_vs,
2489 sel->screen->info.gfx_level,
2490 clip_cull_mask,
2491 shader->info.vs_output_param_offset,
2492 shader->info.nr_param_exports,
2493 shader->key.ge.mono.u.vs_export_prim_id,
2494 !si_shader_uses_streamout(shader),
2495 key->ge.opt.kill_pointsize,
2496 key->ge.opt.kill_layer,
2497 sel->screen->options.vrs2x2);
2498 }
2499 progress = true;
2500 } else if (is_legacy_gs) {
2501 NIR_PASS_V(nir, ac_nir_lower_legacy_gs, false, sel->screen->use_ngg, output_info);
2502 progress = true;
2503 } else if (nir->info.stage == MESA_SHADER_FRAGMENT && shader->is_monolithic) {
2504 /* Uniform inlining can eliminate PS inputs, and the colormask can remove PS outputs,
2505 * which can also cause the elimination of PS inputs. Renumber the remaining inputs to
2506 * remove the holes. This can only happen with a monolithic PS. Colors are unaffected
2507 * because they are still represented by nir_intrinsic_load_color0/1.
2508 */
2509 NIR_PASS_V(nir, nir_recompute_io_bases, nir_var_shader_in);
2510
2511 /* Two-side color selection and interpolation: Get the latest shader info because
2512 * uniform inlining and colormask can fully eliminate color inputs.
2513 */
2514 struct si_shader_info info;
2515 si_nir_scan_shader(sel->screen, nir, &info);
2516
2517 if (info.colors_read)
2518 NIR_PASS(progress, nir, si_nir_lower_ps_color_input, &shader->key, &info);
2519
2520 /* We need to set this early for lowering nir_intrinsic_load_point_coord_maybe_flipped,
2521 * which can only occur with monolithic PS.
2522 */
2523 shader->info.num_ps_inputs = info.num_inputs;
2524 shader->info.ps_colors_read = info.colors_read;
2525
2526 ac_nir_lower_ps_early_options early_options = {
2527 .force_persp_sample_interp = key->ps.part.prolog.force_persp_sample_interp,
2528 .force_linear_sample_interp = key->ps.part.prolog.force_linear_sample_interp,
2529 .force_persp_center_interp = key->ps.part.prolog.force_persp_center_interp,
2530 .force_linear_center_interp = key->ps.part.prolog.force_linear_center_interp,
2531 .ps_iter_samples = 1 << key->ps.part.prolog.samplemask_log_ps_iter,
2532
2533 .clamp_color = key->ps.part.epilog.clamp_color,
2534 .alpha_test_alpha_to_one = key->ps.part.epilog.alpha_to_one,
2535 .alpha_func = key->ps.part.epilog.alpha_func,
2536 .keep_alpha_for_mrtz = key->ps.part.epilog.alpha_to_coverage_via_mrtz,
2537 .spi_shader_col_format_hint = key->ps.part.epilog.spi_shader_col_format,
2538 .kill_z = key->ps.part.epilog.kill_z,
2539 .kill_stencil = key->ps.part.epilog.kill_stencil,
2540 .kill_samplemask = key->ps.part.epilog.kill_samplemask,
2541 };
2542
2543 NIR_PASS_V(nir, ac_nir_lower_ps_early, &early_options);
2544
2545 ac_nir_lower_ps_late_options late_options = {
2546 .gfx_level = sel->screen->info.gfx_level,
2547 .family = sel->screen->info.family,
2548 .use_aco = nir->info.use_aco_amd,
2549 .bc_optimize_for_persp = key->ps.part.prolog.bc_optimize_for_persp,
2550 .bc_optimize_for_linear = key->ps.part.prolog.bc_optimize_for_linear,
2551 .uses_discard = si_shader_uses_discard(shader),
2552 .alpha_to_coverage_via_mrtz = key->ps.part.epilog.alpha_to_coverage_via_mrtz,
2553 .dual_src_blend_swizzle = key->ps.part.epilog.dual_src_blend_swizzle,
2554 .spi_shader_col_format = key->ps.part.epilog.spi_shader_col_format,
2555 .color_is_int8 = key->ps.part.epilog.color_is_int8,
2556 .color_is_int10 = key->ps.part.epilog.color_is_int10,
2557 .alpha_to_one = key->ps.part.epilog.alpha_to_one,
2558 };
2559
2560 NIR_PASS_V(nir, ac_nir_lower_ps_late, &late_options);
2561
2562 if (key->ps.part.prolog.poly_stipple)
2563 NIR_PASS_V(nir, si_nir_emit_polygon_stipple, args);
2564
2565 progress = true;
2566 }
2567
2568 assert(shader->wave_size == 32 || shader->wave_size == 64);
2569
2570 NIR_PASS(progress, nir, nir_lower_subgroups,
2571 &(struct nir_lower_subgroups_options) {
2572 .subgroup_size = shader->wave_size,
2573 .ballot_bit_size = shader->wave_size,
2574 .ballot_components = 1,
2575 .lower_to_scalar = true,
2576 .lower_subgroup_masks = true,
2577 .lower_relative_shuffle = true,
2578 .lower_rotate_to_shuffle = !nir->info.use_aco_amd,
2579 .lower_shuffle_to_32bit = true,
2580 .lower_vote_eq = true,
2581 .lower_vote_bool_eq = true,
2582 .lower_quad_broadcast_dynamic = true,
2583 .lower_quad_broadcast_dynamic_to_const = sel->screen->info.gfx_level <= GFX7,
2584 .lower_shuffle_to_swizzle_amd = true,
2585 .lower_ballot_bit_count_to_mbcnt_amd = true,
2586 .lower_inverse_ballot = !nir->info.use_aco_amd && LLVM_VERSION_MAJOR < 17,
2587 .lower_boolean_reduce = nir->info.use_aco_amd,
2588 .lower_boolean_shuffle = true,
2589 });
2590
2591 NIR_PASS(progress, nir, nir_lower_pack);
2592 NIR_PASS(progress, nir, nir_opt_idiv_const, 8);
2593 NIR_PASS(progress, nir, nir_lower_idiv,
2594 &(nir_lower_idiv_options){
2595 .allow_fp16 = sel->screen->info.gfx_level >= GFX9,
2596 });
2597
2598 if (si_should_clear_lds(sel->screen, nir)) {
2599 const unsigned chunk_size = 16; /* max single store size */
2600 const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
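/* e.g. 100 bytes of shared memory are padded to 112 and cleared with 16-byte stores. */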
2601 NIR_PASS_V(nir, nir_clear_shared_memory, shared_size, chunk_size);
2602 }
2603
2604 NIR_PASS_V(nir, nir_divergence_analysis); /* required by ac_nir_flag_smem_for_loads */
2605 NIR_PASS(progress, nir, ac_nir_flag_smem_for_loads, sel->screen->info.gfx_level,
2606 !sel->info.base.use_aco_amd, true);
2607 NIR_PASS(progress, nir, nir_lower_io_to_scalar,
2608 nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_shared | nir_var_mem_global,
2609 ac_nir_scalarize_overfetching_loads_callback, &sel->screen->info.gfx_level);
2610 NIR_PASS(progress, nir, si_nir_lower_resource, shader, args);
2611
2612 if (progress) {
2613 si_nir_opts(sel->screen, nir, false);
2614 progress = false;
2615 late_opts = true;
2616 }
2617
2618 NIR_PASS(progress, nir, nir_opt_load_store_vectorize,
2619 &(nir_load_store_vectorize_options){
2620 .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_shared | nir_var_mem_global |
2621 nir_var_shader_temp,
2622 .callback = ac_nir_mem_vectorize_callback,
2623 .cb_data = &(struct ac_nir_config){sel->screen->info.gfx_level, sel->info.base.use_aco_amd},
2624 /* On GFX6, read2/write2 is out-of-bounds if the offset register is negative, even if
2625 * the final offset is not.
2626 */
2627 .has_shared2_amd = sel->screen->info.gfx_level >= GFX7,
2628 });
2629 NIR_PASS(progress, nir, nir_opt_shrink_stores, false);
2630
2631 nir_divergence_analysis(nir);
2632 NIR_PASS(progress, nir, ac_nir_flag_smem_for_loads, sel->screen->info.gfx_level,
2633 !nir->info.use_aco_amd, false);
2634 NIR_PASS(progress, nir, ac_nir_lower_mem_access_bit_sizes,
2635 sel->screen->info.gfx_level, !nir->info.use_aco_amd);
2636
2637 if (nir->info.stage == MESA_SHADER_KERNEL) {
2638 NIR_PASS(progress, nir, ac_nir_lower_global_access);
2639
2640 if (nir->info.bit_sizes_int & (8 | 16)) {
2641 if (sel->screen->info.gfx_level >= GFX8)
2642 nir_divergence_analysis(nir);
2643
2644 NIR_PASS(progress, nir, nir_lower_bit_size, ac_nir_lower_bit_size_callback,
2645 &sel->screen->info.gfx_level);
2646 }
2647 }
2648
2649 /* This must be after lowering resources to descriptor loads and before lowering intrinsics
2650 * to args and lowering int64.
2651 */
2652 if (nir->info.use_aco_amd)
2653 progress |= ac_nir_optimize_uniform_atomics(nir);
2654
2655 NIR_PASS(progress, nir, nir_lower_int64);
2656 NIR_PASS(progress, nir, si_nir_lower_abi, shader, args);
2657 NIR_PASS(progress, nir, ac_nir_lower_intrinsics_to_args, sel->screen->info.gfx_level,
2658 sel->screen->info.has_ls_vgpr_init_bug,
2659 si_select_hw_stage(nir->info.stage, key, sel->screen->info.gfx_level),
2660 shader->wave_size, si_get_max_workgroup_size(shader), &args->ac);
2661
2662 /* LLVM keeps non-uniform samplers as indices, so this can't be done in NIR for LLVM.
2663 * Must be done after si_nir_lower_resource().
2664 */
2665 if (nir->info.use_aco_amd && sel->info.has_shadow_comparison &&
2666 sel->screen->info.gfx_level >= GFX8 && sel->screen->info.gfx_level <= GFX9) {
2667 NIR_PASS(progress, nir, si_nir_clamp_shadow_comparison_value);
2668 }
2669
2670 if (progress) {
2671 si_nir_opts(sel->screen, nir, false);
2672 progress = false;
2673 late_opts = true;
2674 }
2675
2676 static const nir_opt_offsets_options offset_options = {
2677 .uniform_max = 0,
2678 .buffer_max = ~0,
2679 .shared_max = ~0,
2680 };
2681 NIR_PASS_V(nir, nir_opt_offsets, &offset_options);
2682
2683 if (late_opts)
2684 si_nir_late_opts(nir);
2685
2686 NIR_PASS(progress, nir, nir_opt_sink,
2687 nir_move_const_undef | nir_move_copies | nir_move_alu | nir_move_comparisons |
2688 nir_move_load_ubo | nir_move_load_ssbo);
2689 NIR_PASS(progress, nir, nir_opt_move,
2690 nir_move_const_undef | nir_move_copies | nir_move_alu | nir_move_comparisons |
2691 nir_move_load_ubo);
2692 /* Run nir_opt_move again to make sure that comparisons are as close as possible to the first
2693 * use to prevent SCC spilling.
2694 */
2695 NIR_PASS(progress, nir, nir_opt_move, nir_move_comparisons);
2696
2697 /* This must be done after si_nir_late_opts() because it may generate vec const. */
2698 NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
2699
2700 /* This helps LLVM form VMEM clauses and thus get more GPU cache hits.
2701 * 200 is tuned for Viewperf. It should be done last.
2702 */
2703 NIR_PASS_V(nir, nir_group_loads, nir_group_same_resource_only, 200);
2704
2705 if (unlikely(original_name)) {
2706 ralloc_free((void*)nir->info.name);
2707 nir->info.name = original_name;
2708 }
2709
2710 return nir;
2711 }
2712
2713 void si_update_shader_binary_info(struct si_shader *shader, nir_shader *nir)
2714 {
2715 struct si_shader_info info;
2716 si_nir_scan_shader(shader->selector->screen, nir, &info);
2717
2718 shader->info.uses_vmem_load_other |= info.uses_vmem_load_other;
2719 shader->info.uses_vmem_sampler_or_bvh |= info.uses_vmem_sampler_or_bvh;
2720
2721 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
2722 /* Since uniform inlining can remove PS inputs, set the latest info about PS inputs here. */
2723 shader->info.num_ps_inputs = info.num_inputs;
2724 shader->info.ps_colors_read = info.colors_read;
2725
2726 /* A non-monolithic PS doesn't know whether back colors are enabled, so copy 2 extra inputs. */
2727 unsigned max_interp = MIN2(info.num_inputs + 2, SI_NUM_INTERP);
2728 memcpy(shader->info.ps_inputs, info.input, max_interp * sizeof(info.input[0]));
2729 }
2730 }
2731
2732 /* Generate code for the hardware VS shader stage to go with a geometry shader */
2733 static struct si_shader *
2734 si_nir_generate_gs_copy_shader(struct si_screen *sscreen,
2735 struct ac_llvm_compiler *compiler,
2736 struct si_shader *gs_shader,
2737 nir_shader *gs_nir,
2738 struct util_debug_callback *debug,
2739 ac_nir_gs_output_info *output_info)
2740 {
2741 struct si_shader *shader;
2742 struct si_shader_selector *gs_selector = gs_shader->selector;
2743 struct si_shader_info *gsinfo = &gs_selector->info;
2744 union si_shader_key *gskey = &gs_shader->key;
2745
2746 shader = CALLOC_STRUCT(si_shader);
2747 if (!shader)
2748 return NULL;
2749
2750 /* We can leave the fence as permanently signaled because the GS copy
2751 * shader only becomes visible globally after it has been compiled. */
2752 util_queue_fence_init(&shader->ready);
2753
2754 shader->selector = gs_selector;
2755 shader->is_gs_copy_shader = true;
2756 shader->wave_size = si_determine_wave_size(sscreen, shader);
2757
2758 STATIC_ASSERT(sizeof(shader->info.vs_output_param_offset[0]) == 1);
2759 memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_DEFAULT_VAL_0000,
2760 sizeof(shader->info.vs_output_param_offset));
2761
2762 for (unsigned i = 0; i < gsinfo->num_outputs; i++) {
2763 unsigned semantic = gsinfo->output_semantic[i];
2764
2765       /* Skip if no channel writes to stream 0 (output_streams[i] packs a 2-bit stream index per channel; a field of 0 means stream 0). */
2766 if (!nir_slot_is_varying(semantic, MESA_SHADER_FRAGMENT) ||
2767 (gsinfo->output_streams[i] & 0x03 &&
2768 gsinfo->output_streams[i] & 0x0c &&
2769 gsinfo->output_streams[i] & 0x30 &&
2770 gsinfo->output_streams[i] & 0xc0))
2771 continue;
2772
2773 shader->info.vs_output_param_offset[semantic] = shader->info.nr_param_exports++;
2774 }
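   /* Illustrative result (assumed GS outputs): if stream 0 receives POS, VAR0
    * and VAR3, only the varyings get parameter slots, compacted in order:
    *
    *    vs_output_param_offset[VARYING_SLOT_VAR0] = 0;
    *    vs_output_param_offset[VARYING_SLOT_VAR3] = 1;
    *    nr_param_exports = 2;
    *
    * Every other slot keeps the AC_EXP_PARAM_DEFAULT_VAL_0000 it was
    * initialized to above (POS is not a varying, so it is skipped here).
    */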
2775
2776 shader->info.nr_pos_exports = si_get_nr_pos_exports(gs_selector, gskey);
2777
2778 unsigned clip_cull_mask =
2779 (gsinfo->clipdist_mask & ~gskey->ge.opt.kill_clip_distances) | gsinfo->culldist_mask;
2780
2781 nir_shader *nir =
2782 ac_nir_create_gs_copy_shader(gs_nir,
2783 sscreen->info.gfx_level,
2784 clip_cull_mask,
2785 shader->info.vs_output_param_offset,
2786 shader->info.nr_param_exports,
2787 !si_shader_uses_streamout(gs_shader),
2788 gskey->ge.opt.kill_pointsize,
2789 gskey->ge.opt.kill_layer,
2790 sscreen->options.vrs2x2,
2791 output_info);
2792
2793 struct si_shader_args args;
2794 si_init_shader_args(shader, &args, &gs_nir->info);
2795
2796 NIR_PASS_V(nir, si_nir_lower_abi, shader, &args);
2797 NIR_PASS_V(nir, ac_nir_lower_intrinsics_to_args, sscreen->info.gfx_level,
2798 sscreen->info.has_ls_vgpr_init_bug, AC_HW_VERTEX_SHADER, 64, 64, &args.ac);
2799
2800 si_nir_opts(gs_selector->screen, nir, false);
2801
2802 NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
2803
2804 if (si_can_dump_shader(sscreen, MESA_SHADER_GEOMETRY, SI_DUMP_NIR)) {
2805 fprintf(stderr, "GS Copy Shader:\n");
2806 nir_print_shader(nir, stderr);
2807 }
2808
2809 bool ok =
2810 #if AMD_LLVM_AVAILABLE
2811 !gs_nir->info.use_aco_amd ? si_llvm_compile_shader(sscreen, compiler, shader,
2812 &args, debug, nir) :
2813 #endif
2814 si_aco_compile_shader(shader, &args, nir, debug);
2815
2816 if (ok) {
2817 assert(!shader->config.scratch_bytes_per_wave);
2818 ok = si_shader_binary_upload(sscreen, shader, 0) >= 0;
2819 si_shader_dump(sscreen, shader, debug, stderr, true);
2820 }
2821 ralloc_free(nir);
2822
2823 if (!ok) {
2824 FREE(shader);
2825 shader = NULL;
2826 } else {
2827 si_fix_resource_usage(sscreen, shader);
2828 }
2829 return shader;
2830 }
2831
2832 struct si_gs_output_info {
2833 uint8_t streams[64];
2834 uint8_t streams_16bit_lo[16];
2835 uint8_t streams_16bit_hi[16];
2836
2837 uint8_t usage_mask[64];
2838 uint8_t usage_mask_16bit_lo[16];
2839 uint8_t usage_mask_16bit_hi[16];
2840
2841 ac_nir_gs_output_info info;
2842 };
2843
2844 static void
2845 si_init_gs_output_info(struct si_shader_info *info, struct si_gs_output_info *out_info)
2846 {
2847 for (int i = 0; i < info->num_outputs; i++) {
2848 unsigned slot = info->output_semantic[i];
2849 if (slot < VARYING_SLOT_VAR0_16BIT) {
2850 out_info->streams[slot] = info->output_streams[i];
2851 out_info->usage_mask[slot] = info->output_usagemask[i];
2852 } else {
2853 unsigned index = slot - VARYING_SLOT_VAR0_16BIT;
2854          /* TODO: 16-bit outputs need separate fields for the lo/hi parts. */
2855 out_info->streams_16bit_lo[index] = info->output_streams[i];
2856 out_info->streams_16bit_hi[index] = info->output_streams[i];
2857 out_info->usage_mask_16bit_lo[index] = info->output_usagemask[i];
2858 out_info->usage_mask_16bit_hi[index] = info->output_usagemask[i];
2859 }
2860 }
2861
2862 ac_nir_gs_output_info *ac_info = &out_info->info;
2863
2864 ac_info->streams = out_info->streams;
2865 ac_info->streams_16bit_lo = out_info->streams_16bit_lo;
2866 ac_info->streams_16bit_hi = out_info->streams_16bit_hi;
2867
2868 ac_info->sysval_mask = out_info->usage_mask;
2869 ac_info->varying_mask = out_info->usage_mask;
2870 ac_info->varying_mask_16bit_lo = out_info->usage_mask_16bit_lo;
2871 ac_info->varying_mask_16bit_hi = out_info->usage_mask_16bit_hi;
2872
2873    /* TODO: construct per-component store types for 16-bit slots. */
2874 ac_info->types_16bit_lo = ac_info->types_16bit_hi = NULL;
2875 }
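/* Usage sketch (this mirrors the legacy-GS path in si_compile_shader()
 * below):
 *
 *    struct si_gs_output_info out_info;
 *    memset(&out_info, 0, sizeof(out_info));
 *    si_init_gs_output_info(&sel->info, &out_info);
 *    ... pass &out_info.info to ac_nir_create_gs_copy_shader() ...
 */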
2876
2877 static void si_fixup_spi_ps_input_config(struct si_shader *shader)
2878 {
2879 const union si_shader_key *key = &shader->key;
2880
2881 /* Enable POS_FIXED_PT if polygon stippling is enabled. */
2882 if (key->ps.part.prolog.poly_stipple)
2883 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
2884
2885 /* Set up the enable bits for per-sample shading if needed. */
2886 if (key->ps.part.prolog.force_persp_sample_interp &&
2887 (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2888 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2889 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
2890 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2891 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
2892 }
2893 if (key->ps.part.prolog.force_linear_sample_interp &&
2894 (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
2895 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2896 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
2897 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2898 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
2899 }
2900 if (key->ps.part.prolog.force_persp_center_interp &&
2901 (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2902 G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2903 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
2904 shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
2905 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2906 }
2907 if (key->ps.part.prolog.force_linear_center_interp &&
2908 (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
2909 G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
2910 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
2911 shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
2912 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2913 }
2914
2915    /* POS_W_FLOAT requires that one of the perspective weights is enabled. */
2916 if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
2917 !(shader->config.spi_ps_input_ena & 0xf)) {
2918 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2919 }
2920
2921 /* At least one pair of interpolation weights must be enabled. */
2922 if (!(shader->config.spi_ps_input_ena & 0x7f))
2923 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
2924
2925 /* Samplemask fixup requires the sample ID. */
2926 if (key->ps.part.prolog.samplemask_log_ps_iter)
2927 shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
2928 }
2929
2930 static void
2931 si_set_spi_ps_input_config(struct si_shader *shader)
2932 {
2933 const struct si_shader_selector *sel = shader->selector;
2934 const struct si_shader_info *info = &sel->info;
2935 const union si_shader_key *key = &shader->key;
2936
2937 /* TODO: This should be determined from the final NIR instead of the input NIR,
2938 * otherwise LLVM will have a performance advantage here because it determines
2939 * VGPR inputs for each shader variant after LLVM optimizations.
2940 */
2941 shader->config.spi_ps_input_ena =
2942 S_0286CC_PERSP_CENTER_ENA(info->uses_persp_center) |
2943 S_0286CC_PERSP_CENTROID_ENA(info->uses_persp_centroid) |
2944 S_0286CC_PERSP_SAMPLE_ENA(info->uses_persp_sample) |
2945 S_0286CC_LINEAR_CENTER_ENA(info->uses_linear_center) |
2946 S_0286CC_LINEAR_CENTROID_ENA(info->uses_linear_centroid) |
2947 S_0286CC_LINEAR_SAMPLE_ENA(info->uses_linear_sample) |
2948 S_0286CC_FRONT_FACE_ENA(info->uses_frontface && !key->ps.opt.force_front_face_input) |
2949 S_0286CC_SAMPLE_COVERAGE_ENA(info->reads_samplemask) |
2950 S_0286CC_ANCILLARY_ENA(info->uses_sampleid || info->uses_layer_id);
2951
2952 uint8_t mask = info->reads_frag_coord_mask | info->reads_sample_pos_mask;
2953 u_foreach_bit(i, mask) {
2954 shader->config.spi_ps_input_ena |= S_0286CC_POS_X_FLOAT_ENA(1) << i;
2955 }
2956
2957 if (key->ps.part.prolog.color_two_side)
2958 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
2959
2960 /* INTERP_MODE_COLOR, same as SMOOTH if flat shading is disabled. */
2961 if (info->uses_interp_color && !key->ps.part.prolog.flatshade_colors) {
2962 shader->config.spi_ps_input_ena |=
2963 S_0286CC_PERSP_SAMPLE_ENA(info->uses_persp_sample_color) |
2964 S_0286CC_PERSP_CENTER_ENA(info->uses_persp_center_color) |
2965 S_0286CC_PERSP_CENTROID_ENA(info->uses_persp_centroid_color);
2966 }
2967
2968    /* nir_lower_poly_line_smooth uses nir_load_sample_mask_in. */
2969 if (key->ps.mono.poly_line_smoothing)
2970 shader->config.spi_ps_input_ena |= S_0286CC_SAMPLE_COVERAGE_ENA(1);
2971
2972    /* nir_lower_point_smooth uses nir_load_point_coord_maybe_flipped, which is lowered
2973     * to nir_load_barycentric_pixel and nir_load_interpolated_input.
2974     */
2975 if (key->ps.mono.point_smoothing)
2976 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
2977
2978    /* See fetch_framebuffer() for the args used when fetching the fbfetch output. */
2979 if (info->base.fs.uses_fbfetch_output) {
2980 shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
2981
2982 if (key->ps.mono.fbfetch_layered || key->ps.mono.fbfetch_msaa)
2983 shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
2984 }
2985
2986 if (shader->is_monolithic) {
2987 si_fixup_spi_ps_input_config(shader);
2988 shader->config.spi_ps_input_addr = shader->config.spi_ps_input_ena;
2989 } else {
2990       /* In part mode, si_fixup_spi_ps_input_config() is called when combining
2991        * multiple shader parts in si_shader_select_ps_parts().
2992 *
2993 * Reserve register locations for VGPR inputs the PS prolog may need.
2994 */
2995 shader->config.spi_ps_input_addr =
2996 shader->config.spi_ps_input_ena |
2997 SI_SPI_PS_INPUT_ADDR_FOR_PROLOG;
2998 }
2999 }
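/* Worked example (hypothetical shader): a PS that interpolates one smooth
 * varying at the pixel center and reads gl_FragCoord.xy would end up with
 *
 *    spi_ps_input_ena = S_0286CC_PERSP_CENTER_ENA(1) |
 *                       S_0286CC_POS_X_FLOAT_ENA(1) |
 *                       S_0286CC_POS_Y_FLOAT_ENA(1);
 *
 * because each bit of reads_frag_coord_mask / reads_sample_pos_mask
 * contributes one POS_*_FLOAT_ENA bit.
 */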
3000
3001 static void
3002 debug_message_stderr(void *data, unsigned *id, enum util_debug_type ptype,
3003 const char *fmt, va_list args)
3004 {
3005 vfprintf(stderr, fmt, args);
3006 fprintf(stderr, "\n");
3007 }
3008
3009 static void
3010 determine_shader_variant_info(struct si_screen *sscreen, struct si_shader *shader)
3011 {
3012 struct si_shader_selector *sel = shader->selector;
3013
3014 if (sel->stage == MESA_SHADER_FRAGMENT) {
3015 shader->ps.writes_z = sel->info.writes_z && !shader->key.ps.part.epilog.kill_z;
3016 shader->ps.writes_stencil = sel->info.writes_stencil &&
3017 !shader->key.ps.part.epilog.kill_stencil;
3018 shader->ps.writes_samplemask = sel->info.writes_samplemask &&
3019 !shader->key.ps.part.epilog.kill_samplemask;
3020 }
3021 }
3022
3023 bool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3024 struct si_shader *shader, struct util_debug_callback *debug)
3025 {
3026 bool ret = true;
3027 struct si_shader_selector *sel = shader->selector;
3028
3029 determine_shader_variant_info(sscreen, shader);
3030
3031    /* ACO needs spi_ps_input in advance to initialize args; it is also used during compilation. */
3032 if (sel->stage == MESA_SHADER_FRAGMENT && sel->info.base.use_aco_amd)
3033 si_set_spi_ps_input_config(shader);
3034
3035    /* We need this info only for legacy GS. */
3036 struct si_gs_output_info legacy_gs_output_info;
3037 if (sel->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
3038 memset(&legacy_gs_output_info, 0, sizeof(legacy_gs_output_info));
3039 si_init_gs_output_info(&sel->info, &legacy_gs_output_info);
3040 }
3041
3042 struct si_shader_args args;
3043 bool free_nir;
3044 struct nir_shader *nir =
3045 si_get_nir_shader(shader, &args, &free_nir, &legacy_gs_output_info.info);
3046
3047 /* Dump NIR before doing NIR->LLVM conversion in case the
3048 * conversion fails. */
3049 if (si_can_dump_shader(sscreen, nir->info.stage, SI_DUMP_NIR)) {
3050 nir_print_shader(nir, stderr);
3051
3052 if (nir->xfb_info)
3053 nir_print_xfb_info(nir->xfb_info, stderr);
3054 }
3055
3056 /* Initialize vs_output_ps_input_cntl to default. */
3057 for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++)
3058 shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED;
3059 shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0;
3060
3061 si_update_shader_binary_info(shader, nir);
3062
3063 /* uses_instanceid may be set by si_nir_lower_vs_inputs(). */
3064 shader->info.uses_instanceid |= sel->info.uses_instanceid;
3065 shader->info.private_mem_vgprs = DIV_ROUND_UP(nir->scratch_size, 4);
3066
3067 /* Set the FP ALU behavior. */
3068 /* By default, we disable denormals for FP32 and enable them for FP16 and FP64
3069 * for performance and correctness reasons. FP32 denormals can't be enabled because
3070 * they break output modifiers and v_mad_f32 and are very slow on GFX6-7.
3071 *
3072    * float_controls_execution_mode defines the set of valid behaviors. Contradictory
3073    * flags can be set simultaneously, which nominally lets us choose either behavior,
3074    * but in practice some choices cause GLCTS failures.
3075 */
3076 unsigned float_mode = V_00B028_FP_16_64_DENORMS;
3077
3078 if (!(nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) &&
3079 nir->info.float_controls_execution_mode & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
3080 float_mode |= V_00B028_FP_32_ROUND_TOWARDS_ZERO;
3081
3082 if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
3083 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64)) &&
3084 nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
3085 FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
3086 float_mode |= V_00B028_FP_16_64_ROUND_TOWARDS_ZERO;
3087
3088 if (!(nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_PRESERVE_FP16 |
3089 FLOAT_CONTROLS_DENORM_PRESERVE_FP64)) &&
3090 nir->info.float_controls_execution_mode & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
3091 FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64))
3092 float_mode &= ~V_00B028_FP_16_64_DENORMS;
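   /* Example (hypothetical execution mode): a shader declaring only
    * FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 sets no DENORM_PRESERVE bits,
    * so the last test above clears V_00B028_FP_16_64_DENORMS and the shader
    * flushes FP16/FP64 denormals.
    */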
3093
3094 ret =
3095 #if AMD_LLVM_AVAILABLE
3096 !nir->info.use_aco_amd ? si_llvm_compile_shader(sscreen, compiler, shader, &args,
3097 debug, nir) :
3098 #endif
3099 si_aco_compile_shader(shader, &args, nir, debug);
3100
3101 if (!ret)
3102 goto out;
3103
3104 shader->config.float_mode = float_mode;
3105
3106 /* The GS copy shader is compiled next. */
3107 if (nir->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
3108 shader->gs_copy_shader =
3109 si_nir_generate_gs_copy_shader(sscreen, compiler, shader, nir, debug,
3110 &legacy_gs_output_info.info);
3111 if (!shader->gs_copy_shader) {
3112 fprintf(stderr, "radeonsi: can't create GS copy shader\n");
3113 ret = false;
3114 goto out;
3115 }
3116 }
3117
3118 /* Compute vs_output_ps_input_cntl. */
3119 if ((nir->info.stage == MESA_SHADER_VERTEX ||
3120 nir->info.stage == MESA_SHADER_TESS_EVAL ||
3121 nir->info.stage == MESA_SHADER_GEOMETRY) &&
3122 !shader->key.ge.as_ls && !shader->key.ge.as_es) {
3123 uint8_t *vs_output_param_offset = shader->info.vs_output_param_offset;
3124
3125 if (nir->info.stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg)
3126 vs_output_param_offset = shader->gs_copy_shader->info.vs_output_param_offset;
3127
3128 /* We must use the original shader info before the removal of duplicated shader outputs. */
3129 /* VS and TES should also set primitive ID output if it's used. */
3130 unsigned num_outputs_with_prim_id = sel->info.num_outputs +
3131 shader->key.ge.mono.u.vs_export_prim_id;
3132
3133 for (unsigned i = 0; i < num_outputs_with_prim_id; i++) {
3134 unsigned semantic = sel->info.output_semantic[i];
3135 unsigned offset = vs_output_param_offset[semantic];
3136 unsigned ps_input_cntl;
3137
3138 if (offset <= AC_EXP_PARAM_OFFSET_31) {
3139 /* The input is loaded from parameter memory. */
3140 ps_input_cntl = S_028644_OFFSET(offset);
3141 } else {
3142 /* The input is a DEFAULT_VAL constant. */
3143 assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
3144 offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
3145 offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
3146
3147 /* OFFSET=0x20 means that DEFAULT_VAL is used. */
3148 ps_input_cntl = S_028644_OFFSET(0x20) |
3149 S_028644_DEFAULT_VAL(offset);
3150 }
3151
3152 shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl;
3153 }
3154 }
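   /* Illustrative values (hypothetical outputs): a varying exported as
    * param 3 gets ps_input_cntl = S_028644_OFFSET(3), while an output that
    * was optimized to the constant (0, 0, 0, 1), i.e.
    * AC_EXP_PARAM_DEFAULT_VAL_0001, gets
    * S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(1).
    */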
3155
3156 /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */
3157 if (gl_shader_stage_is_compute(nir->info.stage)) {
3158 unsigned max_vgprs =
3159 sscreen->info.num_physical_wave64_vgprs_per_simd * (shader->wave_size == 32 ? 2 : 1);
3160 unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
3161 unsigned max_sgprs_per_wave = 128;
3162 unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
3163 unsigned threads_per_tg = si_get_max_workgroup_size(shader);
3164 unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, shader->wave_size);
3165 unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);
3166
3167 max_vgprs = max_vgprs / waves_per_simd;
3168 max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);
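      /* Example with assumed hardware limits (256 wave64 VGPRs and 512
       * physical SGPRs per SIMD): a wave64 shader with a 1024-thread
       * workgroup gives waves_per_tg = 16 and waves_per_simd = 4, so the
       * check below uses max_vgprs = 256 / 4 = 64 and
       * max_sgprs = MIN2(512 / 4, 128) = 128.
       */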
3169
3170 if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
3171 fprintf(stderr,
3172 "LLVM failed to compile a shader correctly: "
3173 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
3174 shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);
3175
3176 /* Just terminate the process, because dependent
3177 * shaders can hang due to bad input data, but use
3178 * the env var to allow shader-db to work.
3179 */
3180 if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
3181 abort();
3182 }
3183 }
3184
3185 /* Add/remove the scratch offset to/from input SGPRs. */
3186 if (!sel->screen->info.has_scratch_base_registers &&
3187 !si_is_merged_shader(shader)) {
3188 if (nir->info.use_aco_amd) {
3189          /* With ACO, the scratch_offset arg is added explicitly at the beginning.
3190           * After compilation, if no scratch is used, reduce the input SGPR count.
3191 */
3192 if (!shader->config.scratch_bytes_per_wave)
3193 shader->info.num_input_sgprs--;
3194 } else {
3195          /* The scratch_offset arg is added implicitly by LLVM. */
3196 if (shader->info.num_input_sgprs)
3197 shader->info.num_input_sgprs++;
3198 }
3199 }
3200
3201 /* Calculate the number of fragment input VGPRs. */
3202 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
3203 shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(
3204 &shader->config, &shader->info.num_fragcoord_components);
3205 }
3206
3207 si_calculate_max_simd_waves(shader);
3208
3209 if (si_can_dump_shader(sscreen, nir->info.stage, SI_DUMP_STATS)) {
3210 struct util_debug_callback out_stderr = {
3211 .debug_message = debug_message_stderr,
3212 };
3213
3214 si_shader_dump_stats_for_shader_db(sscreen, shader, &out_stderr);
3215 } else {
3216 si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
3217 }
3218
3219 out:
3220 if (free_nir)
3221 ralloc_free(nir);
3222
3223 return ret;
3224 }
3225
3226 /**
3227 * Create, compile and return a shader part (prolog or epilog).
3228 *
3229 * \param sscreen screen
3230 * \param list list of shader parts of the same category
3231  * \param stage shader stage
3232  * \param prolog whether the part being requested is a prolog
3233  * \param key shader part key
3234  * \param compiler LLVM compiler
3235 * \param debug debug callback
3236 * \return non-NULL on success
3237 */
3238 static struct si_shader_part *
3239 si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
3240 gl_shader_stage stage, bool prolog, union si_shader_part_key *key,
3241 struct ac_llvm_compiler *compiler, struct util_debug_callback *debug,
3242 const char *name)
3243 {
3244 struct si_shader_part *result;
3245
3246 simple_mtx_lock(&sscreen->shader_parts_mutex);
3247
3248 /* Find existing. */
3249 for (result = *list; result; result = result->next) {
3250 if (memcmp(&result->key, key, sizeof(*key)) == 0) {
3251 simple_mtx_unlock(&sscreen->shader_parts_mutex);
3252 return result;
3253 }
3254 }
3255
3256 /* Compile a new one. */
3257 result = CALLOC_STRUCT(si_shader_part);
3258 result->key = *key;
3259
3260 bool ok =
3261 #if AMD_LLVM_AVAILABLE
3262 !(sscreen->use_aco ||
3263 (stage == MESA_SHADER_FRAGMENT &&
3264 ((prolog && key->ps_prolog.use_aco) ||
3265 (!prolog && key->ps_epilog.use_aco)))) ?
3266 si_llvm_build_shader_part(sscreen, stage, prolog, compiler, debug, name, result) :
3267 #endif
3268 si_aco_build_shader_part(sscreen, stage, prolog, debug, name, result);
3269
3270 if (ok) {
3271 result->next = *list;
3272 *list = result;
3273 } else {
3274 FREE(result);
3275 result = NULL;
3276 }
3277
3278 simple_mtx_unlock(&sscreen->shader_parts_mutex);
3279 return result;
3280 }
3281
3282
3283 /**
3284 * Select and compile (or reuse) TCS parts (epilog).
3285 */
3286 static bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3287 struct si_shader *shader, struct util_debug_callback *debug)
3288 {
3289 if (sscreen->info.gfx_level >= GFX9) {
3290 assert(shader->wave_size == 32 || shader->wave_size == 64);
3291 unsigned index = shader->wave_size / 32 - 1;
3292 shader->previous_stage = shader->key.ge.part.tcs.ls->main_shader_part_ls[index];
3293 }
3294
3295 return true;
3296 }
3297
3298 /**
3299 * Select and compile (or reuse) GS parts (prolog).
3300 */
3301 static bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3302 struct si_shader *shader, struct util_debug_callback *debug)
3303 {
3304 if (sscreen->info.gfx_level >= GFX9) {
3305 if (shader->key.ge.as_ngg) {
3306 assert(shader->wave_size == 32 || shader->wave_size == 64);
3307 unsigned index = shader->wave_size / 32 - 1;
3308 shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_ngg_es[index];
3309 } else {
3310 shader->previous_stage = shader->key.ge.part.gs.es->main_shader_part_es;
3311 }
3312 }
3313
3314 return true;
3315 }
3316
3317 /**
3318 * Compute the PS prolog key, which contains all the information needed to
3319 * build the PS prolog function, and set related bits in shader->config.
3320 */
3321 static void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key)
3322 {
3323 struct si_shader_info *info = &shader->selector->info;
3324
3325 memset(key, 0, sizeof(*key));
3326 key->ps_prolog.states = shader->key.ps.part.prolog;
3327 key->ps_prolog.use_aco = info->base.use_aco_amd;
3328 key->ps_prolog.wave32 = shader->wave_size == 32;
3329 key->ps_prolog.colors_read = shader->info.ps_colors_read;
3330 key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
3331 key->ps_prolog.wqm =
3332 info->base.fs.needs_quad_helper_invocations &&
3333 (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
3334 key->ps_prolog.states.force_linear_sample_interp ||
3335 key->ps_prolog.states.force_persp_center_interp ||
3336 key->ps_prolog.states.force_linear_center_interp ||
3337 key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear);
3338 key->ps_prolog.num_fragcoord_components = shader->info.num_fragcoord_components;
3339
3340 if (shader->key.ps.part.prolog.poly_stipple)
3341 shader->info.uses_vmem_load_other = true;
3342
3343 if (shader->info.ps_colors_read) {
3344 uint8_t *color = shader->selector->info.color_attr_index;
3345
3346 if (shader->key.ps.part.prolog.color_two_side) {
3347 /* BCOLORs are stored after the last input. */
3348 key->ps_prolog.num_interp_inputs = shader->info.num_ps_inputs;
3349 shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
3350 }
3351
3352 for (unsigned i = 0; i < 2; i++) {
3353 unsigned interp = info->color_interpolate[i];
3354 unsigned location = info->color_interpolate_loc[i];
3355
3356 if (!(shader->info.ps_colors_read & (0xf << i * 4)))
3357 continue;
3358
3359 key->ps_prolog.color_attr_index[i] = color[i];
3360
3361 if (shader->key.ps.part.prolog.flatshade_colors && interp == INTERP_MODE_COLOR)
3362 interp = INTERP_MODE_FLAT;
3363
3364 switch (interp) {
3365 case INTERP_MODE_FLAT:
3366 key->ps_prolog.color_interp_vgpr_index[i] = -1;
3367 break;
3368 case INTERP_MODE_SMOOTH:
3369 case INTERP_MODE_COLOR:
3370 /* Force the interpolation location for colors here. */
3371 if (shader->key.ps.part.prolog.force_persp_sample_interp)
3372 location = TGSI_INTERPOLATE_LOC_SAMPLE;
3373 if (shader->key.ps.part.prolog.force_persp_center_interp)
3374 location = TGSI_INTERPOLATE_LOC_CENTER;
3375
3376 switch (location) {
3377 case TGSI_INTERPOLATE_LOC_SAMPLE:
3378 key->ps_prolog.color_interp_vgpr_index[i] = 0;
3379 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
3380 break;
3381 case TGSI_INTERPOLATE_LOC_CENTER:
3382 key->ps_prolog.color_interp_vgpr_index[i] = 2;
3383 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
3384 break;
3385 case TGSI_INTERPOLATE_LOC_CENTROID:
3386 key->ps_prolog.color_interp_vgpr_index[i] = 4;
3387 shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
3388 break;
3389 default:
3390 assert(0);
3391 }
3392 break;
3393 case INTERP_MODE_NOPERSPECTIVE:
3394 /* Force the interpolation location for colors here. */
3395 if (shader->key.ps.part.prolog.force_linear_sample_interp)
3396 location = TGSI_INTERPOLATE_LOC_SAMPLE;
3397 if (shader->key.ps.part.prolog.force_linear_center_interp)
3398 location = TGSI_INTERPOLATE_LOC_CENTER;
3399
3400 /* The VGPR assignment for non-monolithic shaders
3401 * works because InitialPSInputAddr is set on the
3402 * main shader and PERSP_PULL_MODEL is never used.
3403 */
3404 switch (location) {
3405 case TGSI_INTERPOLATE_LOC_SAMPLE:
3406 key->ps_prolog.color_interp_vgpr_index[i] = 6;
3407 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
3408 break;
3409 case TGSI_INTERPOLATE_LOC_CENTER:
3410 key->ps_prolog.color_interp_vgpr_index[i] = 8;
3411 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
3412 break;
3413 case TGSI_INTERPOLATE_LOC_CENTROID:
3414 key->ps_prolog.color_interp_vgpr_index[i] = 10;
3415 shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
3416 break;
3417 default:
3418 assert(0);
3419 }
3420 break;
3421 default:
3422 assert(0);
3423 }
3424 }
3425 }
3426 }
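/* Reference for the color_interp_vgpr_index values above (a sketch; it
 * matches the argument order in si_get_ps_prolog_args() at the end of this
 * file, with PERSP_PULL_MODEL never enabled): persp_sample = v0-v1,
 * persp_center = v2-v3, persp_centroid = v4-v5, linear_sample = v6-v7,
 * linear_center = v8-v9, linear_centroid = v10-v11, hence the indices
 * 0/2/4 and 6/8/10.
 */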
3427
3428 /**
3429 * Check whether a PS prolog is required based on the key.
3430 */
3431 static bool si_need_ps_prolog(const union si_shader_part_key *key)
3432 {
3433 return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
3434 key->ps_prolog.states.force_linear_sample_interp ||
3435 key->ps_prolog.states.force_persp_center_interp ||
3436 key->ps_prolog.states.force_linear_center_interp ||
3437 key->ps_prolog.states.bc_optimize_for_persp ||
3438 key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
3439 key->ps_prolog.states.samplemask_log_ps_iter;
3440 }
3441
3442 /**
3443 * Compute the PS epilog key, which contains all the information needed to
3444 * build the PS epilog function.
3445 */
3446 static void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
3447 {
3448 struct si_shader_info *info = &shader->selector->info;
3449 memset(key, 0, sizeof(*key));
3450 key->ps_epilog.use_aco = info->base.use_aco_amd;
3451 key->ps_epilog.wave32 = shader->wave_size == 32;
3452 key->ps_epilog.uses_discard = si_shader_uses_discard(shader);
3453 key->ps_epilog.colors_written = info->colors_written;
3454 key->ps_epilog.color_types = info->output_color_types;
3455 key->ps_epilog.writes_z = info->writes_z;
3456 key->ps_epilog.writes_stencil = info->writes_stencil;
3457 key->ps_epilog.writes_samplemask = info->writes_samplemask;
3458 key->ps_epilog.states = shader->key.ps.part.epilog;
3459 }
3460
3461 /**
3462 * Select and compile (or reuse) pixel shader parts (prolog & epilog).
3463 */
3464 static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3465 struct si_shader *shader, struct util_debug_callback *debug)
3466 {
3467 union si_shader_part_key prolog_key;
3468 union si_shader_part_key epilog_key;
3469
3470 /* Get the prolog. */
3471 si_get_ps_prolog_key(shader, &prolog_key);
3472
3473 /* The prolog is a no-op if these aren't set. */
3474 if (si_need_ps_prolog(&prolog_key)) {
3475 shader->prolog =
3476 si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key,
3477 compiler, debug, "Fragment Shader Prolog");
3478 if (!shader->prolog)
3479 return false;
3480 }
3481
3482 /* Get the epilog. */
3483 si_get_ps_epilog_key(shader, &epilog_key);
3484
3485 shader->epilog =
3486 si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key,
3487 compiler, debug, "Fragment Shader Epilog");
3488 if (!shader->epilog)
3489 return false;
3490
3491 si_fixup_spi_ps_input_config(shader);
3492
3493    /* Make sure the spi_ps_input_addr bits are a superset of spi_ps_input_ena. */
3494 unsigned spi_ps_input_ena = shader->config.spi_ps_input_ena;
3495 unsigned spi_ps_input_addr = shader->config.spi_ps_input_addr;
3496 assert((spi_ps_input_ena & spi_ps_input_addr) == spi_ps_input_ena);
3497
3498 return true;
3499 }
3500
3501 void si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
3502 {
3503    /* This workaround is disabled: tessellation is all off-chip and on-chip
3504     * GS isn't used, so the SPI barrier bug described below can't be hit.
3505     */
3506    return;
3507
3508 /* SPI barrier management bug:
3509 * Make sure we have at least 4k of LDS in use to avoid the bug.
3510 * It applies to workgroup sizes of more than one wavefront.
3511 */
3512 if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
3513 *lds_size = MAX2(*lds_size, 8);
3514 }
3515
3516 static void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
3517 {
3518 unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
3519
3520 shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
3521
3522 if (shader->selector->stage == MESA_SHADER_COMPUTE &&
3523 si_get_max_workgroup_size(shader) > shader->wave_size) {
3524 si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
3525 }
3526 }
3527
3528 bool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
3529 struct si_shader *shader, struct util_debug_callback *debug)
3530 {
3531 struct si_shader_selector *sel = shader->selector;
3532 struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key, shader->wave_size);
3533
3534 /* LS, ES, VS are compiled on demand if the main part hasn't been
3535 * compiled for that stage.
3536 *
3537    * GS is compiled on demand if the main part hasn't been compiled
3538 * for the chosen NGG-ness.
3539 *
3540 * Vertex shaders are compiled on demand when a vertex fetch
3541 * workaround must be applied.
3542 */
3543 if (shader->is_monolithic) {
3544 /* Monolithic shader (compiled as a whole, has many variants,
3545 * may take a long time to compile).
3546 */
3547 if (!si_compile_shader(sscreen, compiler, shader, debug))
3548 return false;
3549 } else {
3550 /* The shader consists of several parts:
3551 *
3552 * - the middle part is the user shader, it has 1 variant only
3553 * and it was compiled during the creation of the shader
3554 * selector
3555 * - the prolog part is inserted at the beginning
3556 * - the epilog part is inserted at the end
3557 *
3558 * The prolog and epilog have many (but simple) variants.
3559 *
3560 * Starting with gfx9, geometry and tessellation control
3561 * shaders also contain the prolog and user shader parts of
3562 * the previous shader stage.
3563 */
3564
3565 if (!mainp)
3566 return false;
3567
3568 determine_shader_variant_info(sscreen, shader);
3569
3570 /* Copy the compiled shader data over. */
3571 shader->is_binary_shared = true;
3572 shader->binary = mainp->binary;
3573 shader->config = mainp->config;
3574 shader->info = mainp->info;
3575
3576 /* Select prologs and/or epilogs. */
3577 switch (sel->stage) {
3578 case MESA_SHADER_TESS_CTRL:
3579 if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
3580 return false;
3581 break;
3582 case MESA_SHADER_GEOMETRY:
3583 if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
3584 return false;
3585
3586 /* Clone the GS copy shader for the shader variant.
3587 * We can't just copy the pointer because we change the pm4 state and
3588 * si_shader_selector::gs_copy_shader must be immutable because it's shared
3589 * by multiple contexts.
3590 */
3591 if (!shader->key.ge.as_ngg) {
3592 assert(mainp->gs_copy_shader);
3593 assert(mainp->gs_copy_shader->bo);
3594 assert(!mainp->gs_copy_shader->previous_stage_sel);
3595 assert(!mainp->gs_copy_shader->scratch_va);
3596
3597 shader->gs_copy_shader = CALLOC_STRUCT(si_shader);
3598 memcpy(shader->gs_copy_shader, mainp->gs_copy_shader,
3599 sizeof(*shader->gs_copy_shader));
3600 /* Increase the reference count. */
3601 pipe_reference(NULL, &shader->gs_copy_shader->bo->b.b.reference);
3602 /* Initialize some fields differently. */
3603 shader->gs_copy_shader->shader_log = NULL;
3604 shader->gs_copy_shader->is_binary_shared = true;
3605 util_queue_fence_init(&shader->gs_copy_shader->ready);
3606 }
3607 break;
3608 case MESA_SHADER_FRAGMENT:
3609 if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
3610 return false;
3611
3612 /* Make sure we have at least as many VGPRs as there
3613 * are allocated inputs.
3614 */
3615 shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
3616 break;
3617 default:;
3618 }
3619
3620 assert(shader->wave_size == mainp->wave_size);
3621 assert(!shader->previous_stage || shader->wave_size == shader->previous_stage->wave_size);
3622
3623 /* Update SGPR and VGPR counts. */
3624 if (shader->prolog) {
3625 shader->config.num_sgprs =
3626 MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs);
3627 shader->config.num_vgprs =
3628 MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs);
3629 }
3630 if (shader->previous_stage) {
3631 shader->config.num_sgprs =
3632 MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
3633 shader->config.num_vgprs =
3634 MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
3635 shader->config.spilled_sgprs =
3636 MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
3637 shader->config.spilled_vgprs =
3638 MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
3639 shader->info.private_mem_vgprs =
3640 MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
3641 shader->config.scratch_bytes_per_wave =
3642 MAX2(shader->config.scratch_bytes_per_wave,
3643 shader->previous_stage->config.scratch_bytes_per_wave);
3644 shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;
3645 shader->info.uses_vmem_load_other |= shader->previous_stage->info.uses_vmem_load_other;
3646 shader->info.uses_vmem_sampler_or_bvh |= shader->previous_stage->info.uses_vmem_sampler_or_bvh;
3647 }
3648 if (shader->epilog) {
3649 shader->config.num_sgprs =
3650 MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs);
3651 shader->config.num_vgprs =
3652 MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs);
3653 }
3654 si_calculate_max_simd_waves(shader);
3655 }
3656
3657 if (sel->stage <= MESA_SHADER_GEOMETRY && shader->key.ge.as_ngg) {
3658 assert(!shader->key.ge.as_es && !shader->key.ge.as_ls);
3659 if (!gfx10_ngg_calculate_subgroup_info(shader)) {
3660 fprintf(stderr, "Failed to compute subgroup info\n");
3661 return false;
3662 }
3663 } else if (sscreen->info.gfx_level >= GFX9 && sel->stage == MESA_SHADER_GEOMETRY) {
3664 gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
3665 }
3666
3667 shader->uses_vs_state_provoking_vertex =
3668 sscreen->use_ngg &&
3669 /* Used to convert triangle strips from GS to triangles. */
3670 ((sel->stage == MESA_SHADER_GEOMETRY &&
3671 util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) ||
3672 (sel->stage == MESA_SHADER_VERTEX &&
3673 /* Used to export PrimitiveID from the correct vertex. */
3674 shader->key.ge.mono.u.vs_export_prim_id));
3675
3676 shader->uses_gs_state_outprim = sscreen->use_ngg &&
3677 /* Only used by streamout and the PrimID export in vertex
3678 * shaders. */
3679 sel->stage == MESA_SHADER_VERTEX &&
3680 (si_shader_uses_streamout(shader) ||
3681 shader->uses_vs_state_provoking_vertex);
3682
3683 if (sel->stage == MESA_SHADER_VERTEX) {
3684 shader->uses_base_instance = sel->info.uses_base_instance ||
3685 shader->key.ge.mono.instance_divisor_is_one ||
3686 shader->key.ge.mono.instance_divisor_is_fetched;
3687 } else if (sel->stage == MESA_SHADER_TESS_CTRL) {
3688 shader->uses_base_instance = shader->previous_stage_sel &&
3689 (shader->previous_stage_sel->info.uses_base_instance ||
3690 shader->key.ge.mono.instance_divisor_is_one ||
3691 shader->key.ge.mono.instance_divisor_is_fetched);
3692 } else if (sel->stage == MESA_SHADER_GEOMETRY) {
3693 shader->uses_base_instance = shader->previous_stage_sel &&
3694 (shader->previous_stage_sel->info.uses_base_instance ||
3695 shader->key.ge.mono.instance_divisor_is_one ||
3696 shader->key.ge.mono.instance_divisor_is_fetched);
3697 }
3698
3699 si_fix_resource_usage(sscreen, shader);
3700
3701 /* Upload. */
3702 bool ok = si_shader_binary_upload(sscreen, shader, 0) >= 0;
3703
3704 shader->complete_shader_binary_size = si_get_shader_binary_size(sscreen, shader);
3705
3706 si_shader_dump(sscreen, shader, debug, stderr, true);
3707
3708 if (!ok)
3709 fprintf(stderr, "LLVM failed to upload shader\n");
3710 return ok;
3711 }
3712
3713 void si_shader_binary_clean(struct si_shader_binary *binary)
3714 {
3715 free((void *)binary->code_buffer);
3716 binary->code_buffer = NULL;
3717
3718 free(binary->llvm_ir_string);
3719 binary->llvm_ir_string = NULL;
3720
3721 free((void *)binary->symbols);
3722 binary->symbols = NULL;
3723
3724 free(binary->uploaded_code);
3725 binary->uploaded_code = NULL;
3726 binary->uploaded_code_size = 0;
3727 }
3728
3729 void si_shader_destroy(struct si_shader *shader)
3730 {
3731 si_resource_reference(&shader->bo, NULL);
3732
3733 if (!shader->is_binary_shared)
3734 si_shader_binary_clean(&shader->binary);
3735
3736 free(shader->shader_log);
3737 }
3738
3739 nir_shader *si_get_prev_stage_nir_shader(struct si_shader *shader,
3740 struct si_shader *prev_shader,
3741 struct si_shader_args *args,
3742 bool *free_nir)
3743 {
3744 const struct si_shader_selector *sel = shader->selector;
3745 const union si_shader_key *key = &shader->key;
3746
3747 if (sel->stage == MESA_SHADER_TESS_CTRL) {
3748 struct si_shader_selector *ls = key->ge.part.tcs.ls;
3749
3750 prev_shader->selector = ls;
3751 prev_shader->key.ge.as_ls = 1;
3752 } else {
3753 struct si_shader_selector *es = key->ge.part.gs.es;
3754
3755 prev_shader->selector = es;
3756 prev_shader->key.ge.as_es = 1;
3757 prev_shader->key.ge.as_ngg = key->ge.as_ngg;
3758 }
3759
3760 prev_shader->next_shader = shader;
3761 prev_shader->key.ge.mono = key->ge.mono;
3762 prev_shader->key.ge.opt = key->ge.opt;
3763 prev_shader->key.ge.opt.inline_uniforms = false; /* only TCS/GS can inline uniforms */
3764    /* kill_outputs was computed from the second shader's outputs, so it can't be
3765     * used to kill the first shader's outputs.
3766     */
3767 prev_shader->key.ge.opt.kill_outputs = 0;
3768 prev_shader->is_monolithic = true;
3769 prev_shader->wave_size = shader->wave_size;
3770
3771 nir_shader *nir = si_get_nir_shader(prev_shader, args, free_nir, NULL);
3772 si_update_shader_binary_info(shader, nir);
3773
3774 shader->info.uses_instanceid |=
3775 prev_shader->selector->info.uses_instanceid || prev_shader->info.uses_instanceid;
3776
3777 return nir;
3778 }
3779
3780 void si_get_ps_prolog_args(struct si_shader_args *args,
3781 const union si_shader_part_key *key)
3782 {
3783 memset(args, 0, sizeof(*args));
3784
3785 const unsigned num_input_sgprs = key->ps_prolog.num_input_sgprs;
3786
3787 struct ac_arg input_sgprs[num_input_sgprs];
3788 for (unsigned i = 0; i < num_input_sgprs; i++)
3789 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, input_sgprs + i);
3790
3791 args->internal_bindings = input_sgprs[SI_SGPR_INTERNAL_BINDINGS];
3792 /* Use the absolute location of the input. */
3793 args->ac.prim_mask = input_sgprs[SI_PS_NUM_USER_SGPR];
3794
3795 ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_sample);
3796 ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_center);
3797 ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.persp_centroid);
3798 /* skip PERSP_PULL_MODEL */
3799 ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_sample);
3800 ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_center);
3801 ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_FLOAT, &args->ac.linear_centroid);
3802 /* skip LINE_STIPPLE_TEX */
3803
3804 /* POS_X|Y|Z|W_FLOAT */
3805 for (unsigned i = 0; i < key->ps_prolog.num_fragcoord_components; i++)
3806 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
3807
3808 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.front_face);
3809 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.ancillary);
3810 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.sample_coverage);
3811 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.pos_fixed_pt);
3812 }
3813
3814 void si_get_ps_epilog_args(struct si_shader_args *args,
3815 const union si_shader_part_key *key,
3816 struct ac_arg colors[MAX_DRAW_BUFFERS],
3817 struct ac_arg *depth, struct ac_arg *stencil,
3818 struct ac_arg *sample_mask)
3819 {
3820 memset(args, 0, sizeof(*args));
3821
3822 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3823 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3824 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3825 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
3826 ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_FLOAT, &args->alpha_reference);
3827
3828 u_foreach_bit (i, key->ps_epilog.colors_written) {
3829 ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_FLOAT, colors + i);
3830 }
3831
3832 if (key->ps_epilog.writes_z)
3833 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, depth);
3834
3835 if (key->ps_epilog.writes_stencil)
3836 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, stencil);
3837
3838 if (key->ps_epilog.writes_samplemask)
3839 ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, sample_mask);
3840 }
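/* Illustrative signature (hypothetical key): with colors_written = 0x5,
 * writes_z = true and the stencil/samplemask bits clear, the epilog
 * receives 4 unused SGPRs plus alpha_reference, then a vec4 color VGPR for
 * MRT0, a vec4 for MRT2, and finally a single depth VGPR.
 */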
3841