1 /*
2 * Copyright 2023 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 #include "si_shader_internal.h"
26 #include "si_pipe.h"
27 #include "ac_hw_stage.h"
28 #include "aco_interface.h"
29
30 static void
si_aco_compiler_debug(void * private_data,enum aco_compiler_debug_level level,const char * message)31 si_aco_compiler_debug(void *private_data, enum aco_compiler_debug_level level,
32 const char *message)
33 {
34 struct util_debug_callback *debug = private_data;
35
36 util_debug_message(debug, SHADER_INFO, "%s\n", message);
37 }
38
39 static void
si_fill_aco_options(struct si_screen * screen,gl_shader_stage stage,struct aco_compiler_options * options,struct util_debug_callback * debug)40 si_fill_aco_options(struct si_screen *screen, gl_shader_stage stage,
41 struct aco_compiler_options *options,
42 struct util_debug_callback *debug)
43 {
44 options->dump_shader =
45 si_can_dump_shader(screen, stage, SI_DUMP_ACO_IR) ||
46 si_can_dump_shader(screen, stage, SI_DUMP_ASM);
47 options->dump_preoptir = si_can_dump_shader(screen, stage, SI_DUMP_INIT_ACO_IR);
48 options->record_ir = screen->record_llvm_ir;
49 options->is_opengl = true;
50
51 options->has_ls_vgpr_init_bug = screen->info.has_ls_vgpr_init_bug;
52 options->load_grid_size_from_user_sgpr = true;
53 options->family = screen->info.family;
54 options->gfx_level = screen->info.gfx_level;
55 options->address32_hi = screen->info.address32_hi;
56
57 options->debug.func = si_aco_compiler_debug;
58 options->debug.private_data = debug;
59 }
60
61 static void
si_fill_aco_shader_info(struct si_shader * shader,struct aco_shader_info * info,struct si_shader_args * args)62 si_fill_aco_shader_info(struct si_shader *shader, struct aco_shader_info *info,
63 struct si_shader_args *args)
64 {
65 const struct si_shader_selector *sel = shader->selector;
66 const union si_shader_key *key = &shader->key;
67 const enum amd_gfx_level gfx_level = sel->screen->info.gfx_level;
68 gl_shader_stage stage = shader->is_gs_copy_shader ? MESA_SHADER_VERTEX : sel->stage;
69
70 info->wave_size = shader->wave_size;
71 info->workgroup_size = si_get_max_workgroup_size(shader);
72 /* aco need non-zero value */
73 if (!info->workgroup_size)
74 info->workgroup_size = info->wave_size;
75
76 info->merged_shader_compiled_separately = !shader->is_gs_copy_shader &&
77 si_is_multi_part_shader(shader) && !shader->is_monolithic;
78
79 info->image_2d_view_of_3d = gfx_level == GFX9;
80 info->hw_stage = si_select_hw_stage(stage, key, gfx_level);
81
82 if (stage <= MESA_SHADER_GEOMETRY && key->ge.as_ngg && !key->ge.as_es) {
83 info->has_ngg_culling = key->ge.opt.ngg_culling;
84 info->has_ngg_early_prim_export = gfx10_ngg_export_prim_early(shader);
85 }
86
87 switch (stage) {
88 case MESA_SHADER_TESS_CTRL:
89 info->vs.tcs_in_out_eq = key->ge.opt.same_patch_vertices;
90 info->vs.tcs_temp_only_input_mask = sel->info.tcs_vgpr_only_inputs;
91 info->has_epilog = !shader->is_monolithic;
92 info->tcs.pass_tessfactors_by_reg = sel->info.tessfactors_are_def_in_all_invocs;
93 info->tcs.patch_stride = si_get_tcs_out_patch_stride(&sel->info);
94 info->tcs.tcs_offchip_layout = args->tcs_offchip_layout;
95 info->tcs.tes_offchip_addr = args->tes_offchip_addr;
96 info->tcs.vs_state_bits = args->vs_state_bits;
97 break;
98 case MESA_SHADER_FRAGMENT:
99 info->ps.num_interp = si_get_ps_num_interp(shader);
100 info->ps.spi_ps_input_ena = shader->config.spi_ps_input_ena;
101 info->ps.spi_ps_input_addr = shader->config.spi_ps_input_addr;
102 info->ps.alpha_reference = args->alpha_reference;
103 info->has_epilog = !shader->is_monolithic;
104 break;
105 default:
106 break;
107 }
108 }
109
110 static void
si_aco_build_shader_binary(void ** data,const struct ac_shader_config * config,const char * llvm_ir_str,unsigned llvm_ir_size,const char * disasm_str,unsigned disasm_size,uint32_t * statistics,uint32_t stats_size,uint32_t exec_size,const uint32_t * code,uint32_t code_dw,const struct aco_symbol * symbols,unsigned num_symbols)111 si_aco_build_shader_binary(void **data, const struct ac_shader_config *config,
112 const char *llvm_ir_str, unsigned llvm_ir_size, const char *disasm_str,
113 unsigned disasm_size, uint32_t *statistics, uint32_t stats_size,
114 uint32_t exec_size, const uint32_t *code, uint32_t code_dw,
115 const struct aco_symbol *symbols, unsigned num_symbols)
116 {
117 struct si_shader *shader = (struct si_shader *)data;
118
119 unsigned code_size = code_dw * 4;
120 char *buffer = MALLOC(code_size + disasm_size);
121 memcpy(buffer, code, code_size);
122
123 shader->binary.type = SI_SHADER_BINARY_RAW;
124 shader->binary.code_buffer = buffer;
125 shader->binary.code_size = code_size;
126 shader->binary.exec_size = exec_size;
127
128 if (disasm_size) {
129 memcpy(buffer + code_size, disasm_str, disasm_size);
130 shader->binary.disasm_string = buffer + code_size;
131 shader->binary.disasm_size = disasm_size;
132 }
133
134 if (llvm_ir_size) {
135 shader->binary.llvm_ir_string = MALLOC(llvm_ir_size);
136 memcpy(shader->binary.llvm_ir_string, llvm_ir_str, llvm_ir_size);
137 }
138
139 if (num_symbols) {
140 unsigned symbol_size = num_symbols * sizeof(*symbols);
141 void *data = MALLOC(symbol_size);
142 memcpy(data, symbols, symbol_size);
143 shader->binary.symbols = data;
144 shader->binary.num_symbols = num_symbols;
145 }
146
147 shader->config = *config;
148 }
149
150 bool
si_aco_compile_shader(struct si_shader * shader,struct si_shader_args * args,struct nir_shader * nir,struct util_debug_callback * debug)151 si_aco_compile_shader(struct si_shader *shader,
152 struct si_shader_args *args,
153 struct nir_shader *nir,
154 struct util_debug_callback *debug)
155 {
156 const struct si_shader_selector *sel = shader->selector;
157
158 struct aco_compiler_options options = {0};
159 si_fill_aco_options(sel->screen, sel->stage, &options, debug);
160
161 struct aco_shader_info info = {0};
162 si_fill_aco_shader_info(shader, &info, args);
163
164 nir_shader *shaders[2];
165 unsigned num_shaders = 0;
166
167 bool free_nir = false;
168 struct si_shader prev_shader = {};
169 struct si_shader_args prev_args;
170
171 /* For merged shader stage. */
172 if (shader->is_monolithic && sel->screen->info.gfx_level >= GFX9 &&
173 (sel->stage == MESA_SHADER_TESS_CTRL || sel->stage == MESA_SHADER_GEOMETRY)) {
174
175 shaders[num_shaders++] =
176 si_get_prev_stage_nir_shader(shader, &prev_shader, &prev_args, &free_nir);
177
178 args = &prev_args;
179 }
180
181 shaders[num_shaders++] = nir;
182
183 aco_compile_shader(&options, &info, num_shaders, shaders, &args->ac,
184 si_aco_build_shader_binary, (void **)shader);
185
186 if (free_nir)
187 ralloc_free(shaders[0]);
188
189 return true;
190 }
191
192 void
si_aco_resolve_symbols(struct si_shader * shader,uint32_t * code_for_write,const uint32_t * code_for_read,uint64_t scratch_va,uint32_t const_offset)193 si_aco_resolve_symbols(struct si_shader *shader, uint32_t *code_for_write,
194 const uint32_t *code_for_read, uint64_t scratch_va, uint32_t const_offset)
195 {
196 const struct aco_symbol *symbols = (struct aco_symbol *)shader->binary.symbols;
197 const struct si_shader_selector *sel = shader->selector;
198 const union si_shader_key *key = &shader->key;
199
200 for (int i = 0; i < shader->binary.num_symbols; i++) {
201 uint32_t value = 0;
202
203 switch (symbols[i].id) {
204 case aco_symbol_scratch_addr_lo:
205 value = scratch_va;
206 break;
207 case aco_symbol_scratch_addr_hi:
208 value = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
209
210 if (sel->screen->info.gfx_level >= GFX11)
211 value |= S_008F04_SWIZZLE_ENABLE_GFX11(1);
212 else
213 value |= S_008F04_SWIZZLE_ENABLE_GFX6(1);
214 break;
215 case aco_symbol_lds_ngg_scratch_base:
216 assert(sel->stage <= MESA_SHADER_GEOMETRY && key->ge.as_ngg);
217 value = shader->gs_info.esgs_ring_size * 4;
218 if (sel->stage == MESA_SHADER_GEOMETRY)
219 value += shader->ngg.ngg_emit_size * 4;
220 value = ALIGN(value, 8);
221 break;
222 case aco_symbol_lds_ngg_gs_out_vertex_base:
223 assert(sel->stage == MESA_SHADER_GEOMETRY && key->ge.as_ngg);
224 value = shader->gs_info.esgs_ring_size * 4;
225 break;
226 case aco_symbol_const_data_addr:
227 if (!const_offset)
228 continue;
229 value = code_for_read[symbols[i].offset] + const_offset;
230 break;
231 default:
232 unreachable("invalid aco symbol");
233 break;
234 }
235
236 code_for_write[symbols[i].offset] = value;
237 }
238 }
239
240 static void
si_aco_build_shader_part_binary(void ** priv_ptr,uint32_t num_sgprs,uint32_t num_vgprs,const uint32_t * code,uint32_t code_dw_size,const char * disasm_str,uint32_t disasm_size)241 si_aco_build_shader_part_binary(void** priv_ptr, uint32_t num_sgprs, uint32_t num_vgprs,
242 const uint32_t* code, uint32_t code_dw_size,
243 const char* disasm_str, uint32_t disasm_size)
244 {
245 struct si_shader_part *result = (struct si_shader_part *)priv_ptr;
246 unsigned code_size = code_dw_size * 4;
247
248 char *buffer = MALLOC(code_size + disasm_size);
249 memcpy(buffer, code, code_size);
250
251 result->binary.type = SI_SHADER_BINARY_RAW;
252 result->binary.code_buffer = buffer;
253 result->binary.code_size = code_size;
254 result->binary.exec_size = code_size;
255
256 if (disasm_size) {
257 memcpy(buffer + code_size, disasm_str, disasm_size);
258 result->binary.disasm_string = buffer + code_size;
259 result->binary.disasm_size = disasm_size;
260 }
261
262 result->config.num_sgprs = num_sgprs;
263 result->config.num_vgprs = num_vgprs;
264 }
265
266 static bool
si_aco_build_tcs_epilog(struct si_screen * screen,struct aco_compiler_options * options,struct si_shader_part * result)267 si_aco_build_tcs_epilog(struct si_screen *screen,
268 struct aco_compiler_options *options,
269 struct si_shader_part *result)
270 {
271 const union si_shader_part_key *key = &result->key;
272
273 struct si_shader_args args;
274 struct ac_arg rel_patch_id;
275 struct ac_arg invocation_id;
276 struct ac_arg tcs_out_current_patch_data_offset;
277 struct ac_arg tess_factors[6];
278 si_get_tcs_epilog_args(screen->info.gfx_level, &args, &rel_patch_id, &invocation_id,
279 &tcs_out_current_patch_data_offset, tess_factors);
280
281 struct aco_tcs_epilog_info einfo = {
282 .pass_tessfactors_by_reg = key->tcs_epilog.states.invoc0_tess_factors_are_def,
283 .tcs_out_patch_fits_subgroup = key->tcs_epilog.noop_s_barrier,
284 .primitive_mode = key->tcs_epilog.states.prim_mode,
285 .tess_offchip_ring_size = screen->hs.tess_offchip_ring_size,
286 .tes_reads_tessfactors = key->tcs_epilog.states.tes_reads_tess_factors,
287
288 .rel_patch_id = rel_patch_id,
289 .invocation_id = invocation_id,
290 .tcs_out_current_patch_data_offset = tcs_out_current_patch_data_offset,
291 .tcs_out_lds_layout = args.tes_offchip_addr,
292 .tcs_offchip_layout = args.tcs_offchip_layout,
293 };
294 memcpy(einfo.tess_lvl_out, tess_factors, sizeof(einfo.tess_lvl_out));
295 memcpy(einfo.tess_lvl_in, tess_factors + 4, sizeof(einfo.tess_lvl_in));
296
297 struct aco_shader_info info = {0};
298 info.hw_stage = AC_HW_HULL_SHADER;
299 info.wave_size = key->tcs_epilog.wave32 ? 32 : 64;
300 /* Set to >wave_size to keep p_barrier work. GFX6 has single wave for HS. */
301 info.workgroup_size = screen->info.gfx_level >= GFX7 ? 128 : info.wave_size;
302
303 aco_compile_tcs_epilog(options, &info, &einfo, &args.ac,
304 si_aco_build_shader_part_binary, (void **)result);
305 return true;
306 }
307
308 static bool
si_aco_build_ps_prolog(struct aco_compiler_options * options,struct si_shader_part * result)309 si_aco_build_ps_prolog(struct aco_compiler_options *options,
310 struct si_shader_part *result)
311 {
312 const union si_shader_part_key *key = &result->key;
313
314 struct si_shader_args args;
315 si_get_ps_prolog_args(&args, key);
316
317 struct aco_ps_prolog_info pinfo = {
318 .poly_stipple = key->ps_prolog.states.poly_stipple,
319 .poly_stipple_buf_offset = SI_PS_CONST_POLY_STIPPLE * 16,
320
321 .bc_optimize_for_persp = key->ps_prolog.states.bc_optimize_for_persp,
322 .bc_optimize_for_linear = key->ps_prolog.states.bc_optimize_for_linear,
323 .force_persp_sample_interp = key->ps_prolog.states.force_persp_sample_interp,
324 .force_linear_sample_interp = key->ps_prolog.states.force_linear_sample_interp,
325 .force_persp_center_interp = key->ps_prolog.states.force_persp_center_interp,
326 .force_linear_center_interp = key->ps_prolog.states.force_linear_center_interp,
327
328 .samplemask_log_ps_iter = key->ps_prolog.states.samplemask_log_ps_iter,
329 .num_interp_inputs = key->ps_prolog.num_interp_inputs,
330 .colors_read = key->ps_prolog.colors_read,
331 .color_interp_vgpr_index[0] = key->ps_prolog.color_interp_vgpr_index[0],
332 .color_interp_vgpr_index[1] = key->ps_prolog.color_interp_vgpr_index[1],
333 .color_attr_index[0] = key->ps_prolog.color_attr_index[0],
334 .color_attr_index[1] = key->ps_prolog.color_attr_index[1],
335 .color_two_side = key->ps_prolog.states.color_two_side,
336 .needs_wqm = key->ps_prolog.wqm,
337
338 .internal_bindings = args.internal_bindings,
339 };
340
341 struct aco_shader_info info = {0};
342 info.hw_stage = AC_HW_PIXEL_SHADER;
343 info.workgroup_size = info.wave_size = key->ps_prolog.wave32 ? 32 : 64,
344
345 aco_compile_ps_prolog(options, &info, &pinfo, &args.ac,
346 si_aco_build_shader_part_binary, (void **)result);
347 return true;
348 }
349
350 static bool
si_aco_build_ps_epilog(struct aco_compiler_options * options,struct si_shader_part * result)351 si_aco_build_ps_epilog(struct aco_compiler_options *options,
352 struct si_shader_part *result)
353 {
354 const union si_shader_part_key *key = &result->key;
355
356 struct aco_ps_epilog_info pinfo = {
357 .spi_shader_col_format = key->ps_epilog.states.spi_shader_col_format,
358 .color_is_int8 = key->ps_epilog.states.color_is_int8,
359 .color_is_int10 = key->ps_epilog.states.color_is_int10,
360 .mrt0_is_dual_src = key->ps_epilog.states.dual_src_blend_swizzle,
361 .color_types = key->ps_epilog.color_types,
362 .clamp_color = key->ps_epilog.states.clamp_color,
363 .alpha_to_one = key->ps_epilog.states.alpha_to_one,
364 .alpha_to_coverage_via_mrtz = key->ps_epilog.states.alpha_to_coverage_via_mrtz,
365 .skip_null_export = options->gfx_level >= GFX10 && !key->ps_epilog.uses_discard,
366 .broadcast_last_cbuf = key->ps_epilog.states.last_cbuf,
367 .alpha_func = key->ps_epilog.states.alpha_func,
368 };
369
370 struct si_shader_args args;
371 si_get_ps_epilog_args(&args, key, pinfo.colors, &pinfo.depth, &pinfo.stencil,
372 &pinfo.samplemask);
373 pinfo.alpha_reference = args.alpha_reference;
374
375 struct aco_shader_info info = {0};
376 info.hw_stage = AC_HW_PIXEL_SHADER;
377 info.workgroup_size = info.wave_size = key->ps_epilog.wave32 ? 32 : 64,
378
379 aco_compile_ps_epilog(options, &info, &pinfo, &args.ac,
380 si_aco_build_shader_part_binary, (void **)result);
381 return true;
382 }
383
384 bool
si_aco_build_shader_part(struct si_screen * screen,gl_shader_stage stage,bool prolog,struct util_debug_callback * debug,const char * name,struct si_shader_part * result)385 si_aco_build_shader_part(struct si_screen *screen, gl_shader_stage stage, bool prolog,
386 struct util_debug_callback *debug, const char *name,
387 struct si_shader_part *result)
388 {
389 struct aco_compiler_options options = {0};
390 si_fill_aco_options(screen, stage, &options, debug);
391
392 switch (stage) {
393 case MESA_SHADER_TESS_CTRL:
394 return si_aco_build_tcs_epilog(screen, &options, result);
395 break;
396 case MESA_SHADER_FRAGMENT:
397 if (prolog)
398 return si_aco_build_ps_prolog(&options, result);
399 else
400 return si_aco_build_ps_epilog(&options, result);
401 default:
402 unreachable("bad shader part");
403 }
404
405 return false;
406 }
407