1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * SPDX-License-Identifier: MIT
9 */
10
11 #include "radv_pipeline.h"
12 #include "meta/radv_meta.h"
13 #include "nir/nir.h"
14 #include "nir/nir_builder.h"
15 #include "nir/nir_serialize.h"
16 #include "nir/radv_nir.h"
17 #include "spirv/nir_spirv.h"
18 #include "util/disk_cache.h"
19 #include "util/os_time.h"
20 #include "util/u_atomic.h"
21 #include "radv_cs.h"
22 #include "radv_debug.h"
23 #include "radv_pipeline_rt.h"
24 #include "radv_rmv.h"
25 #include "radv_shader.h"
26 #include "radv_shader_args.h"
27 #include "vk_pipeline.h"
28 #include "vk_render_pass.h"
29 #include "vk_util.h"
30
31 #include "util/u_debug.h"
32 #include "ac_binary.h"
33 #include "ac_nir.h"
34 #include "ac_shader_util.h"
35 #include "aco_interface.h"
36 #include "sid.h"
37 #include "vk_format.h"
38 #include "vk_nir_convert_ycbcr.h"
39 #include "vk_ycbcr_conversion.h"
40 #if AMD_LLVM_AVAILABLE
41 #include "ac_llvm_util.h"
42 #endif
43
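/* Return whether the shader was assigned a user SGPR for the indirect descriptor sets pointer
 * (AC_UD_INDIRECT_DESCRIPTOR_SETS).
 */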
bool
radv_shader_need_indirect_descriptor_sets(const struct radv_shader *shader)
{
47 const struct radv_userdata_info *loc = radv_get_user_sgpr_info(shader, AC_UD_INDIRECT_DESCRIPTOR_SETS);
48 return loc->sgpr_idx != -1;
49 }
50
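/* Return whether shader IR should be captured for this pipeline: either the application requested
 * internal representations, or shader dumping/keeping is enabled in the driver.
 */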
bool
radv_pipeline_capture_shaders(const struct radv_device *device, VkPipelineCreateFlags2 flags)
{
54 const struct radv_physical_device *pdev = radv_device_physical(device);
55 const struct radv_instance *instance = radv_physical_device_instance(pdev);
56
57 return (flags & VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR) ||
58 (instance->debug_flags & RADV_DEBUG_DUMP_SHADERS) || device->keep_shader_info;
59 }
60
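/* Return whether shader statistics should be captured for this pipeline. */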
bool
radv_pipeline_capture_shader_stats(const struct radv_device *device, VkPipelineCreateFlags2 flags)
{
64 const struct radv_physical_device *pdev = radv_device_physical(device);
65 const struct radv_instance *instance = radv_physical_device_instance(pdev);
66
67 /* Capture shader statistics when RGP is enabled to correlate shader hashes with Fossilize. */
68 return (flags & VK_PIPELINE_CREATE_2_CAPTURE_STATISTICS_BIT_KHR) ||
69 (instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) || device->keep_shader_info ||
70 (instance->vk.trace_mode & RADV_TRACE_MODE_RGP);
71 }
72
bool
radv_pipeline_skip_shaders_cache(const struct radv_device *device, const struct radv_pipeline *pipeline)
{
76 const struct radv_physical_device *pdev = radv_device_physical(device);
77 const struct radv_instance *instance = radv_physical_device_instance(pdev);
78
   /* Skip the shaders cache when any of the below are true:
    * - shaders are dumped for debugging (RADV_DEBUG=shaders)
    * - shader IRs are captured (NIR, backend IR and ASM)
    * - binaries are captured (the driver shouldn't store data to an internal cache)
    */
84 return (instance->debug_flags & RADV_DEBUG_DUMP_SHADERS) ||
85 (pipeline->create_flags &
86 (VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR | VK_PIPELINE_CREATE_2_CAPTURE_DATA_BIT_KHR));
87 }
88
void
radv_pipeline_init(struct radv_device *device, struct radv_pipeline *pipeline, enum radv_pipeline_type type)
{
92 vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
93
94 pipeline->type = type;
95 }
96
void
radv_pipeline_destroy(struct radv_device *device, struct radv_pipeline *pipeline,
                      const VkAllocationCallbacks *allocator)
{
101 if (pipeline->cache_object)
102 vk_pipeline_cache_object_unref(&device->vk, pipeline->cache_object);
103
104 switch (pipeline->type) {
105 case RADV_PIPELINE_GRAPHICS:
106 radv_destroy_graphics_pipeline(device, radv_pipeline_to_graphics(pipeline));
107 break;
108 case RADV_PIPELINE_GRAPHICS_LIB:
109 radv_destroy_graphics_lib_pipeline(device, radv_pipeline_to_graphics_lib(pipeline));
110 break;
111 case RADV_PIPELINE_COMPUTE:
112 radv_destroy_compute_pipeline(device, radv_pipeline_to_compute(pipeline));
113 break;
114 case RADV_PIPELINE_RAY_TRACING:
115 radv_destroy_ray_tracing_pipeline(device, radv_pipeline_to_ray_tracing(pipeline));
116 break;
117 default:
118 unreachable("invalid pipeline type");
119 }
120
121 radv_rmv_log_resource_destroy(device, (uint64_t)radv_pipeline_to_handle(pipeline));
122 vk_object_base_finish(&pipeline->base);
123 vk_free2(&device->vk.alloc, allocator, pipeline);
124 }
125
VKAPI_ATTR void VKAPI_CALL
radv_DestroyPipeline(VkDevice _device, VkPipeline _pipeline, const VkAllocationCallbacks *pAllocator)
{
129 VK_FROM_HANDLE(radv_device, device, _device);
130 VK_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
131
132 if (!_pipeline)
133 return;
134
135 radv_pipeline_destroy(device, pipeline, pAllocator);
136 }
137
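/* Build the per-stage shader key from the pipeline create flags, the robustness state, the
 * required subgroup size and the drirc shader version overrides.
 */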
struct radv_shader_stage_key
radv_pipeline_get_shader_key(const struct radv_device *device, const VkPipelineShaderStageCreateInfo *stage,
                             VkPipelineCreateFlags2 flags, const void *pNext)
{
142 const struct radv_physical_device *pdev = radv_device_physical(device);
143 const struct radv_instance *instance = radv_physical_device_instance(pdev);
144 gl_shader_stage s = vk_to_mesa_shader_stage(stage->stage);
145 struct vk_pipeline_robustness_state rs;
146 struct radv_shader_stage_key key = {0};
147
148 key.keep_statistic_info = radv_pipeline_capture_shader_stats(device, flags);
149
150 if (flags & VK_PIPELINE_CREATE_2_DISABLE_OPTIMIZATION_BIT)
151 key.optimisations_disabled = 1;
152
153 if (flags & VK_PIPELINE_CREATE_2_VIEW_INDEX_FROM_DEVICE_INDEX_BIT)
154 key.view_index_from_device_index = 1;
155
156 if (flags & VK_PIPELINE_CREATE_2_INDIRECT_BINDABLE_BIT_EXT)
157 key.indirect_bindable = 1;
158
159 if (stage->stage & RADV_GRAPHICS_STAGE_BITS) {
160 key.version = instance->drirc.override_graphics_shader_version;
161 } else if (stage->stage & RADV_RT_STAGE_BITS) {
162 key.version = instance->drirc.override_ray_tracing_shader_version;
163 } else {
164 assert(stage->stage == VK_SHADER_STAGE_COMPUTE_BIT);
165 key.version = instance->drirc.override_compute_shader_version;
166 }
167
168 vk_pipeline_robustness_state_fill(&device->vk, &rs, pNext, stage->pNext);
169
170 radv_set_stage_key_robustness(&rs, s, &key);
171
172 const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *const subgroup_size =
173 vk_find_struct_const(stage->pNext, PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
174
175 if (subgroup_size) {
176 if (subgroup_size->requiredSubgroupSize == 32)
177 key.subgroup_required_size = RADV_REQUIRED_WAVE32;
178 else if (subgroup_size->requiredSubgroupSize == 64)
179 key.subgroup_required_size = RADV_REQUIRED_WAVE64;
180 else
181 unreachable("Unsupported required subgroup size.");
182 }
183
184 if (stage->flags & VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT) {
185 key.subgroup_require_full = 1;
186 }
187
188 return key;
189 }
190
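/* Initialize a radv_shader_stage from a VkPipelineShaderStageCreateInfo: record the SPIR-V (or
 * internal NIR) source, the shader layout and the per-stage SHA-1.
 */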
void
radv_pipeline_stage_init(VkPipelineCreateFlags2 pipeline_flags, const VkPipelineShaderStageCreateInfo *sinfo,
                         const struct radv_pipeline_layout *pipeline_layout,
                         const struct radv_shader_stage_key *stage_key, struct radv_shader_stage *out_stage)
{
196 const VkShaderModuleCreateInfo *minfo = vk_find_struct_const(sinfo->pNext, SHADER_MODULE_CREATE_INFO);
197 const VkPipelineShaderStageModuleIdentifierCreateInfoEXT *iinfo =
198 vk_find_struct_const(sinfo->pNext, PIPELINE_SHADER_STAGE_MODULE_IDENTIFIER_CREATE_INFO_EXT);
199
200 if (sinfo->module == VK_NULL_HANDLE && !minfo && !iinfo)
201 return;
202
203 memset(out_stage, 0, sizeof(*out_stage));
204
205 out_stage->stage = vk_to_mesa_shader_stage(sinfo->stage);
206 out_stage->next_stage = MESA_SHADER_NONE;
207 out_stage->entrypoint = sinfo->pName;
208 out_stage->spec_info = sinfo->pSpecializationInfo;
209 out_stage->feedback.flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
210 out_stage->key = *stage_key;
211
212 if (sinfo->module != VK_NULL_HANDLE) {
213 struct vk_shader_module *module = vk_shader_module_from_handle(sinfo->module);
214
215 out_stage->spirv.data = module->data;
216 out_stage->spirv.size = module->size;
217 out_stage->spirv.object = &module->base;
218
219 if (module->nir)
220 out_stage->internal_nir = module->nir;
221 } else if (minfo) {
222 out_stage->spirv.data = (const char *)minfo->pCode;
223 out_stage->spirv.size = minfo->codeSize;
224 }
225
226 radv_shader_layout_init(pipeline_layout, out_stage->stage, &out_stage->layout);
227
228 vk_pipeline_hash_shader_stage(pipeline_flags, sinfo, NULL, out_stage->shader_sha1);
229 }
230
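/* Copy the subset of the pipeline layout that is needed to compile one shader stage. */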
void
radv_shader_layout_init(const struct radv_pipeline_layout *pipeline_layout, gl_shader_stage stage,
                        struct radv_shader_layout *layout)
{
235 layout->num_sets = pipeline_layout->num_sets;
236 for (unsigned i = 0; i < pipeline_layout->num_sets; i++) {
237 layout->set[i].layout = pipeline_layout->set[i].layout;
238 layout->set[i].dynamic_offset_start = pipeline_layout->set[i].dynamic_offset_start;
239 }
240
241 layout->push_constant_size = pipeline_layout->push_constant_size;
242 layout->use_dynamic_descriptors = pipeline_layout->dynamic_offset_count &&
243 (pipeline_layout->dynamic_shader_stages & mesa_to_vk_shader_stage(stage));
244 }
245
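/* Callback for nir_vk_lower_ycbcr_tex: return the immutable YCbCr sampler state for the given
 * set/binding/array index, or NULL if the binding has no YCbCr samplers.
 */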
static const struct vk_ycbcr_conversion_state *
ycbcr_conversion_lookup(const void *data, uint32_t set, uint32_t binding, uint32_t array_index)
{
249 const struct radv_shader_layout *layout = data;
250
251 const struct radv_descriptor_set_layout *set_layout = layout->set[set].layout;
252 const struct vk_ycbcr_conversion_state *ycbcr_samplers = radv_immutable_ycbcr_samplers(set_layout, binding);
253
254 if (!ycbcr_samplers)
255 return NULL;
256
257 return ycbcr_samplers + array_index;
258 }
259
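/* Vectorization width callback: 2 for 16-bit ALU instructions that can use packed 16-bit math on
 * GFX9+, otherwise 1 (0 for non-ALU instructions).
 */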
static uint8_t
opt_vectorize_callback(const nir_instr *instr, const void *_)
{
263 if (instr->type != nir_instr_type_alu)
264 return 0;
265
266 const struct radv_device *device = _;
267 const struct radv_physical_device *pdev = radv_device_physical(device);
268 enum amd_gfx_level chip = pdev->info.gfx_level;
269 if (chip < GFX9)
270 return 1;
271
272 const nir_alu_instr *alu = nir_instr_as_alu(instr);
273 const unsigned bit_size = alu->def.bit_size;
274 if (bit_size != 16)
275 return 1;
276
277 return aco_nir_op_supports_packed_math_16bit(alu) ? 2 : 1;
278 }
279
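/* Callback for nir_lower_non_uniform_access: single-component sources only need component 0
 * handled; for other sources, handle component 1, and also component 0 when the binding cannot be
 * chased.
 */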
static nir_component_mask_t
non_uniform_access_callback(const nir_src *src, void *_)
{
283 if (src->ssa->num_components == 1)
284 return 0x1;
285 return nir_chase_binding(*src).success ? 0x2 : 0x3;
286 }
287
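/* Late NIR lowering and optimization shared by all pipeline types. Expects the shader info
 * (notably wave and workgroup size) to already be gathered.
 */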
void
radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_state_key *gfx_state,
                     struct radv_shader_stage *stage)
{
292 const struct radv_physical_device *pdev = radv_device_physical(device);
293 const struct radv_instance *instance = radv_physical_device_instance(pdev);
294 enum amd_gfx_level gfx_level = pdev->info.gfx_level;
295 bool progress;
296
297 /* Wave and workgroup size should already be filled. */
298 assert(stage->info.wave_size && stage->info.workgroup_size);
299
300 if (stage->stage == MESA_SHADER_FRAGMENT) {
301 if (!stage->key.optimisations_disabled) {
302 NIR_PASS(_, stage->nir, nir_opt_cse);
303 }
304 NIR_PASS(_, stage->nir, radv_nir_lower_fs_intrinsics, stage, gfx_state);
305 }
306
307 /* LLVM could support more of these in theory. */
308 bool use_llvm = radv_use_llvm_for_stage(pdev, stage->stage);
309 bool has_inverse_ballot = true;
310 #if AMD_LLVM_AVAILABLE
311 has_inverse_ballot = !use_llvm || LLVM_VERSION_MAJOR >= 17;
312 #endif
313 radv_nir_opt_tid_function_options tid_options = {
314 .use_masked_swizzle_amd = true,
315 .use_dpp16_shift_amd = !use_llvm && gfx_level >= GFX8,
316 .use_clustered_rotate = !use_llvm,
317 .hw_subgroup_size = stage->info.wave_size,
318 .hw_ballot_bit_size = has_inverse_ballot ? stage->info.wave_size : 0,
319 .hw_ballot_num_comp = has_inverse_ballot ? 1 : 0,
320 };
321 NIR_PASS(_, stage->nir, radv_nir_opt_tid_function, &tid_options);
322
323 nir_divergence_analysis(stage->nir);
324 NIR_PASS(_, stage->nir, ac_nir_flag_smem_for_loads, gfx_level, use_llvm, false);
325
326 NIR_PASS(_, stage->nir, nir_lower_memory_model);
327
328 nir_load_store_vectorize_options vectorize_opts = {
329 .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_shared | nir_var_mem_global |
330 nir_var_shader_temp,
331 .callback = ac_nir_mem_vectorize_callback,
332 .cb_data = &(struct ac_nir_config){gfx_level, !use_llvm},
333 .robust_modes = 0,
334 /* On GFX6, read2/write2 is out-of-bounds if the offset register is negative, even if
335 * the final offset is not.
336 */
337 .has_shared2_amd = gfx_level >= GFX7,
338 };
339
340 if (stage->key.uniform_robustness2)
341 vectorize_opts.robust_modes |= nir_var_mem_ubo;
342
343 if (stage->key.storage_robustness2)
344 vectorize_opts.robust_modes |= nir_var_mem_ssbo;
345
346 bool constant_fold_for_push_const = false;
347 if (!stage->key.optimisations_disabled) {
348 progress = false;
349 NIR_PASS(progress, stage->nir, nir_opt_load_store_vectorize, &vectorize_opts);
350 if (progress) {
351 NIR_PASS(_, stage->nir, nir_copy_prop);
352 NIR_PASS(_, stage->nir, nir_opt_shrink_stores, !instance->drirc.disable_shrink_image_store);
353
354 constant_fold_for_push_const = true;
355
356 /* Gather info again, to update whether 8/16-bit are used. */
357 nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir));
358 }
359 }
360
361 enum nir_lower_non_uniform_access_type lower_non_uniform_access_types =
362 nir_lower_non_uniform_ubo_access | nir_lower_non_uniform_ssbo_access | nir_lower_non_uniform_texture_access |
363 nir_lower_non_uniform_image_access;
364
   /* In practice, most shaders do not have non-uniform-qualified accesses (see
    * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17558#note_1475069),
    * so a cheaper check that is likely to fail is run first.
    */
370 if (nir_has_non_uniform_access(stage->nir, lower_non_uniform_access_types)) {
371 if (!stage->key.optimisations_disabled) {
372 NIR_PASS(_, stage->nir, nir_opt_non_uniform_access);
373 }
374
375 if (!radv_use_llvm_for_stage(pdev, stage->stage)) {
376 nir_lower_non_uniform_access_options options = {
377 .types = lower_non_uniform_access_types,
378 .callback = &non_uniform_access_callback,
379 .callback_data = NULL,
380 };
381 NIR_PASS(_, stage->nir, nir_lower_non_uniform_access, &options);
382 }
383 }
384
385 progress = false;
386 NIR_PASS(progress, stage->nir, ac_nir_lower_mem_access_bit_sizes, gfx_level, use_llvm);
387 if (progress)
388 constant_fold_for_push_const = true;
389
390 progress = false;
391 NIR_PASS(progress, stage->nir, nir_vk_lower_ycbcr_tex, ycbcr_conversion_lookup, &stage->layout);
   /* Gather info again in case nir_vk_lower_ycbcr_tex emitted resinfo instructions. */
393 if (progress)
394 nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir));
395
396 bool fix_derivs_in_divergent_cf =
397 stage->stage == MESA_SHADER_FRAGMENT && !radv_use_llvm_for_stage(pdev, stage->stage);
398 if (fix_derivs_in_divergent_cf)
399 nir_divergence_analysis(stage->nir);
400
401 NIR_PASS(_, stage->nir, ac_nir_lower_tex,
402 &(ac_nir_lower_tex_options){
403 .gfx_level = gfx_level,
404 .lower_array_layer_round_even = !pdev->info.conformant_trunc_coord || device->disable_trunc_coord,
405 .fix_derivs_in_divergent_cf = fix_derivs_in_divergent_cf,
406 .max_wqm_vgprs = 64, // TODO: improve spiller and RA support for linear VGPRs
407 });
408
409 if (stage->nir->info.uses_resource_info_query)
410 NIR_PASS(_, stage->nir, ac_nir_lower_resinfo, gfx_level);
411
   /* Ensure split load_push_constant intrinsics still have constant offsets, for radv_nir_apply_pipeline_layout. */
413 if (constant_fold_for_push_const && stage->args.ac.inline_push_const_mask)
414 NIR_PASS(_, stage->nir, nir_opt_constant_folding);
415
416 /* TODO: vectorize loads after this to vectorize loading adjacent descriptors */
417 NIR_PASS_V(stage->nir, radv_nir_apply_pipeline_layout, device, stage);
418
419 if (!stage->key.optimisations_disabled) {
420 NIR_PASS(_, stage->nir, nir_opt_shrink_vectors, true);
421 }
422
423 NIR_PASS(_, stage->nir, nir_lower_alu_width, opt_vectorize_callback, device);
424
425 nir_move_options sink_opts = nir_move_const_undef | nir_move_copies;
426
427 if (!stage->key.optimisations_disabled) {
428 NIR_PASS(_, stage->nir, nir_opt_licm);
429 if (stage->stage != MESA_SHADER_FRAGMENT || !pdev->cache_key.disable_sinking_load_input_fs)
430 sink_opts |= nir_move_load_input;
431
432 NIR_PASS(_, stage->nir, nir_opt_sink, sink_opts);
433 NIR_PASS(_, stage->nir, nir_opt_move, nir_move_load_input | nir_move_const_undef | nir_move_copies);
434 }
435
436 /* Lower VS inputs. We need to do this after nir_opt_sink, because
437 * load_input can be reordered, but buffer loads can't.
438 */
439 if (stage->stage == MESA_SHADER_VERTEX) {
440 NIR_PASS(_, stage->nir, radv_nir_lower_vs_inputs, stage, gfx_state, &pdev->info);
441 }
442
443 /* Lower I/O intrinsics to memory instructions. */
444 bool is_last_vgt_stage = radv_is_last_vgt_stage(stage);
445 bool io_to_mem = radv_nir_lower_io_to_mem(device, stage);
446 bool lowered_ngg = stage->info.is_ngg && is_last_vgt_stage;
447 if (lowered_ngg) {
448 radv_lower_ngg(device, stage, gfx_state);
449 } else if (is_last_vgt_stage) {
450 if (stage->stage != MESA_SHADER_GEOMETRY) {
451 NIR_PASS_V(stage->nir, ac_nir_lower_legacy_vs, gfx_level,
452 stage->info.outinfo.clip_dist_mask | stage->info.outinfo.cull_dist_mask,
453 stage->info.outinfo.vs_output_param_offset, stage->info.outinfo.param_exports,
454 stage->info.outinfo.export_prim_id, false, false, false, stage->info.force_vrs_per_vertex);
455
456 } else {
457 ac_nir_gs_output_info gs_out_info = {
458 .streams = stage->info.gs.output_streams,
459 .sysval_mask = stage->info.gs.output_usage_mask,
460 .varying_mask = stage->info.gs.output_usage_mask,
461 };
462 NIR_PASS_V(stage->nir, ac_nir_lower_legacy_gs, false, false, &gs_out_info);
463 }
464 } else if (stage->stage == MESA_SHADER_FRAGMENT) {
465 ac_nir_lower_ps_late_options late_options = {
466 .gfx_level = gfx_level,
467 .family = pdev->info.family,
468 .use_aco = !radv_use_llvm_for_stage(pdev, stage->stage),
469 .bc_optimize_for_persp = G_0286CC_PERSP_CENTER_ENA(stage->info.ps.spi_ps_input_ena) &&
470 G_0286CC_PERSP_CENTROID_ENA(stage->info.ps.spi_ps_input_ena),
471 .bc_optimize_for_linear = G_0286CC_LINEAR_CENTER_ENA(stage->info.ps.spi_ps_input_ena) &&
472 G_0286CC_LINEAR_CENTROID_ENA(stage->info.ps.spi_ps_input_ena),
473 .uses_discard = true,
474 .no_color_export = stage->info.ps.has_epilog,
         .no_depth_export = stage->info.ps.exports_mrtz_via_epilog,
      };
478
479 if (!late_options.no_color_export) {
480 late_options.dual_src_blend_swizzle = gfx_state->ps.epilog.mrt0_is_dual_src && gfx_level >= GFX11;
481 late_options.color_is_int8 = gfx_state->ps.epilog.color_is_int8;
482 late_options.color_is_int10 = gfx_state->ps.epilog.color_is_int10;
483 late_options.enable_mrt_output_nan_fixup =
484 gfx_state->ps.epilog.enable_mrt_output_nan_fixup && !stage->nir->info.internal;
485 /* Need to filter out unwritten color slots. */
486 late_options.spi_shader_col_format =
487 gfx_state->ps.epilog.spi_shader_col_format & stage->info.ps.colors_written;
488 late_options.alpha_to_one = gfx_state->ps.epilog.alpha_to_one;
489 }
490
491 if (!late_options.no_depth_export) {
         /* Unlike gfx_state.ps.alpha_to_coverage_via_mrtz, radv_shader_info.ps.writes_mrt0_alpha
          * also requires that a depth/stencil/sample_mask export exists. ac_nir_lower_ps() requires
          * this field to reflect whether alpha via MRTZ is actually present.
          */
497 late_options.alpha_to_coverage_via_mrtz = stage->info.ps.writes_mrt0_alpha;
498 }
499
500 NIR_PASS(_, stage->nir, ac_nir_lower_ps_late, &late_options);
501 }
502
503 if (radv_shader_should_clear_lds(device, stage->nir)) {
504 const unsigned chunk_size = 16; /* max single store size */
505 const unsigned shared_size = ALIGN(stage->nir->info.shared_size, chunk_size);
506 NIR_PASS(_, stage->nir, nir_clear_shared_memory, shared_size, chunk_size);
507 }
508
509 /* This must be after lowering resources to descriptor loads and before lowering intrinsics
510 * to args and lowering int64.
511 */
512 if (!radv_use_llvm_for_stage(pdev, stage->stage))
513 ac_nir_optimize_uniform_atomics(stage->nir);
514
515 NIR_PASS(_, stage->nir, nir_lower_int64);
516
517 NIR_PASS(_, stage->nir, nir_opt_idiv_const, 8);
518
519 NIR_PASS(_, stage->nir, nir_lower_idiv,
520 &(nir_lower_idiv_options){
521 .allow_fp16 = gfx_level >= GFX9,
522 });
523
524 NIR_PASS(_, stage->nir, ac_nir_lower_global_access);
525 NIR_PASS_V(stage->nir, ac_nir_lower_intrinsics_to_args, gfx_level,
526 pdev->info.has_ls_vgpr_init_bug && gfx_state && !gfx_state->vs.has_prolog,
527 radv_select_hw_stage(&stage->info, gfx_level), stage->info.wave_size, stage->info.workgroup_size,
528 &stage->args.ac);
529 NIR_PASS_V(stage->nir, radv_nir_lower_abi, gfx_level, stage, gfx_state, pdev->info.address32_hi);
530 radv_optimize_nir_algebraic(
531 stage->nir, io_to_mem || lowered_ngg || stage->stage == MESA_SHADER_COMPUTE || stage->stage == MESA_SHADER_TASK,
532 gfx_level >= GFX8);
533
534 NIR_PASS(_, stage->nir, nir_lower_fp16_casts, nir_lower_fp16_split_fp64);
535
536 if (stage->nir->info.bit_sizes_int & (8 | 16)) {
537 if (gfx_level >= GFX8)
538 nir_divergence_analysis(stage->nir);
539
540 if (nir_lower_bit_size(stage->nir, ac_nir_lower_bit_size_callback, &gfx_level)) {
541 NIR_PASS(_, stage->nir, nir_opt_constant_folding);
542 }
543 }
544 if (gfx_level >= GFX9) {
545 bool separate_g16 = gfx_level >= GFX10;
546 struct nir_opt_tex_srcs_options opt_srcs_options[] = {
547 {
548 .sampler_dims = ~(BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE) | BITFIELD_BIT(GLSL_SAMPLER_DIM_BUF)),
549 .src_types = (1 << nir_tex_src_coord) | (1 << nir_tex_src_lod) | (1 << nir_tex_src_bias) |
550 (1 << nir_tex_src_min_lod) | (1 << nir_tex_src_ms_index) |
551 (separate_g16 ? 0 : (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy)),
552 },
553 {
554 .sampler_dims = ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE),
555 .src_types = (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy),
556 },
557 };
558 struct nir_opt_16bit_tex_image_options opt_16bit_options = {
559 .rounding_mode = nir_rounding_mode_undef,
560 .opt_tex_dest_types = nir_type_float | nir_type_int | nir_type_uint,
561 .opt_image_dest_types = nir_type_float | nir_type_int | nir_type_uint,
562 .integer_dest_saturates = true,
563 .opt_image_store_data = true,
564 .opt_image_srcs = true,
565 .opt_srcs_options_count = separate_g16 ? 2 : 1,
566 .opt_srcs_options = opt_srcs_options,
567 };
568 bool run_copy_prop = false;
569 NIR_PASS(run_copy_prop, stage->nir, nir_opt_16bit_tex_image, &opt_16bit_options);
570
      /* Optimizing 16-bit texture/image dests leaves scalar moves that stop nir_opt_vectorize
       * from vectorizing their ALU uses.
       */
574 if (run_copy_prop) {
575 NIR_PASS(_, stage->nir, nir_copy_prop);
576 NIR_PASS(_, stage->nir, nir_opt_dce);
577 }
578
579 if (!stage->key.optimisations_disabled &&
580 ((stage->nir->info.bit_sizes_int | stage->nir->info.bit_sizes_float) & 16)) {
581 NIR_PASS(_, stage->nir, nir_opt_vectorize, opt_vectorize_callback, device);
582 }
583 }
584
585 /* cleanup passes */
586 NIR_PASS(_, stage->nir, nir_lower_alu_width, opt_vectorize_callback, device);
587
   /* This pass changes the global float control mode to RTZ, so it can't be used with LLVM, which
    * only supports RTNE, or with ray tracing, where the mode needs to match across separately
    * compiled stages.
    */
592 if (!radv_use_llvm_for_stage(pdev, stage->stage) && !gl_shader_stage_is_rt(stage->stage))
593 NIR_PASS(_, stage->nir, ac_nir_opt_pack_half, gfx_level);
594
595 NIR_PASS(_, stage->nir, nir_lower_load_const_to_scalar);
596 NIR_PASS(_, stage->nir, nir_copy_prop);
597 NIR_PASS(_, stage->nir, nir_opt_dce);
598
599 if (!stage->key.optimisations_disabled) {
600 sink_opts |= nir_move_comparisons | nir_move_load_ubo | nir_move_load_ssbo | nir_move_alu;
601 NIR_PASS(_, stage->nir, nir_opt_sink, sink_opts);
602
603 nir_move_options move_opts = nir_move_const_undef | nir_move_load_ubo | nir_move_load_input |
604 nir_move_comparisons | nir_move_copies | nir_move_alu;
605 NIR_PASS(_, stage->nir, nir_opt_move, move_opts);
606
      /* Run nir_opt_move again to make sure that comparisons are as close as possible to their
       * first use, to prevent SCC spilling.
       */
610 NIR_PASS(_, stage->nir, nir_opt_move, nir_move_comparisons);
611 }
612 }
613
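/* Return whether LDS should be cleared on shader start: only for compute/mesh/task shaders that
 * use shared memory, and only when the clear_lds drirc option is enabled.
 */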
bool
radv_shader_should_clear_lds(const struct radv_device *device, const nir_shader *shader)
{
617 const struct radv_physical_device *pdev = radv_device_physical(device);
618 const struct radv_instance *instance = radv_physical_device_instance(pdev);
619
620 return (shader->info.stage == MESA_SHADER_COMPUTE || shader->info.stage == MESA_SHADER_MESH ||
621 shader->info.stage == MESA_SHADER_TASK) &&
622 shader->info.shared_size > 0 && instance->drirc.clear_lds;
623 }
624
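/* Return the number of executables (shaders) exposed through
 * VK_KHR_pipeline_executable_properties, including the GS copy shader and ray tracing stages.
 */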
static uint32_t
radv_get_executable_count(struct radv_pipeline *pipeline)
{
628 uint32_t ret = 0;
629
630 if (pipeline->type == RADV_PIPELINE_RAY_TRACING) {
631 struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
632 for (uint32_t i = 0; i < rt_pipeline->stage_count; i++)
633 ret += rt_pipeline->stages[i].shader ? 1 : 0;
634 }
635
636 for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
637 if (!pipeline->shaders[i])
638 continue;
639
640 ret += 1u;
641 if (i == MESA_SHADER_GEOMETRY && pipeline->gs_copy_shader) {
642 ret += 1u;
643 }
644 }
645
646 return ret;
647 }
648
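/* Map an executable index (see radv_get_executable_count) back to the corresponding shader and
 * its stage.
 */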
static struct radv_shader *
radv_get_shader_from_executable_index(struct radv_pipeline *pipeline, int index, gl_shader_stage *stage)
{
652 if (pipeline->type == RADV_PIPELINE_RAY_TRACING) {
653 struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
654 for (uint32_t i = 0; i < rt_pipeline->stage_count; i++) {
655 struct radv_ray_tracing_stage *rt_stage = &rt_pipeline->stages[i];
656 if (!rt_stage->shader)
657 continue;
658
659 if (!index) {
660 *stage = rt_stage->stage;
661 return rt_stage->shader;
662 }
663
664 index--;
665 }
666 }
667
668 for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
669 if (!pipeline->shaders[i])
670 continue;
671 if (!index) {
672 *stage = i;
673 return pipeline->shaders[i];
674 }
675
676 --index;
677
678 if (i == MESA_SHADER_GEOMETRY && pipeline->gs_copy_shader) {
679 if (!index) {
680 *stage = i;
681 return pipeline->gs_copy_shader;
682 }
683 --index;
684 }
685 }
686
687 *stage = -1;
688 return NULL;
689 }
690
/* Basically strlcpy (which does not exist on Linux), specialized for
 * descriptions. */
static void
desc_copy(char *desc, const char *src)
{
696 int len = strlen(src);
697 assert(len < VK_MAX_DESCRIPTION_SIZE);
698 memcpy(desc, src, len);
699 memset(desc + len, 0, VK_MAX_DESCRIPTION_SIZE - len);
700 }
701
VKAPI_ATTR VkResult VKAPI_CALL
radv_GetPipelineExecutablePropertiesKHR(VkDevice _device, const VkPipelineInfoKHR *pPipelineInfo,
                                        uint32_t *pExecutableCount, VkPipelineExecutablePropertiesKHR *pProperties)
{
706 VK_FROM_HANDLE(radv_pipeline, pipeline, pPipelineInfo->pipeline);
707 const uint32_t total_count = radv_get_executable_count(pipeline);
708
709 if (!pProperties) {
710 *pExecutableCount = total_count;
711 return VK_SUCCESS;
712 }
713
714 const uint32_t count = MIN2(total_count, *pExecutableCount);
715 for (uint32_t executable_idx = 0; executable_idx < count; executable_idx++) {
716 gl_shader_stage stage;
717 struct radv_shader *shader = radv_get_shader_from_executable_index(pipeline, executable_idx, &stage);
718
719 pProperties[executable_idx].stages = mesa_to_vk_shader_stage(stage);
720
721 const char *name = _mesa_shader_stage_to_string(stage);
722 const char *description = NULL;
723 switch (stage) {
724 case MESA_SHADER_VERTEX:
725 description = "Vulkan Vertex Shader";
726 break;
727 case MESA_SHADER_TESS_CTRL:
728 if (!pipeline->shaders[MESA_SHADER_VERTEX]) {
729 pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT;
730 name = "vertex + tessellation control";
731 description = "Combined Vulkan Vertex and Tessellation Control Shaders";
732 } else {
733 description = "Vulkan Tessellation Control Shader";
734 }
735 break;
736 case MESA_SHADER_TESS_EVAL:
737 description = "Vulkan Tessellation Evaluation Shader";
738 break;
739 case MESA_SHADER_GEOMETRY:
740 if (shader->info.type == RADV_SHADER_TYPE_GS_COPY) {
741 name = "geometry copy";
742 description = "Extra shader stage that loads the GS output ringbuffer into the rasterizer";
743 break;
744 }
745
746 if (pipeline->shaders[MESA_SHADER_TESS_CTRL] && !pipeline->shaders[MESA_SHADER_TESS_EVAL]) {
747 pProperties[executable_idx].stages |= VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT;
748 name = "tessellation evaluation + geometry";
749 description = "Combined Vulkan Tessellation Evaluation and Geometry Shaders";
750 } else if (!pipeline->shaders[MESA_SHADER_TESS_CTRL] && !pipeline->shaders[MESA_SHADER_VERTEX]) {
751 pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT;
752 name = "vertex + geometry";
753 description = "Combined Vulkan Vertex and Geometry Shaders";
754 } else {
755 description = "Vulkan Geometry Shader";
756 }
757 break;
758 case MESA_SHADER_FRAGMENT:
759 description = "Vulkan Fragment Shader";
760 break;
761 case MESA_SHADER_COMPUTE:
762 description = "Vulkan Compute Shader";
763 break;
764 case MESA_SHADER_MESH:
765 description = "Vulkan Mesh Shader";
766 break;
767 case MESA_SHADER_TASK:
768 description = "Vulkan Task Shader";
769 break;
770 case MESA_SHADER_RAYGEN:
771 description = "Vulkan Ray Generation Shader";
772 break;
773 case MESA_SHADER_ANY_HIT:
774 description = "Vulkan Any-Hit Shader";
775 break;
776 case MESA_SHADER_CLOSEST_HIT:
777 description = "Vulkan Closest-Hit Shader";
778 break;
779 case MESA_SHADER_MISS:
780 description = "Vulkan Miss Shader";
781 break;
782 case MESA_SHADER_INTERSECTION:
783 description = "Shader responsible for traversing the acceleration structure";
784 break;
785 case MESA_SHADER_CALLABLE:
786 description = "Vulkan Callable Shader";
787 break;
788 default:
789 unreachable("Unsupported shader stage");
790 }
791
792 pProperties[executable_idx].subgroupSize = shader->info.wave_size;
793 desc_copy(pProperties[executable_idx].name, name);
794 desc_copy(pProperties[executable_idx].description, description);
795 }
796
797 VkResult result = *pExecutableCount < total_count ? VK_INCOMPLETE : VK_SUCCESS;
798 *pExecutableCount = count;
799 return result;
800 }
801
VKAPI_ATTR VkResult VKAPI_CALL
radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecutableInfoKHR *pExecutableInfo,
                                        uint32_t *pStatisticCount, VkPipelineExecutableStatisticKHR *pStatistics)
{
806 VK_FROM_HANDLE(radv_device, device, _device);
807 VK_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
808 gl_shader_stage stage;
809 struct radv_shader *shader =
810 radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);
811
812 const struct radv_physical_device *pdev = radv_device_physical(device);
813 const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
814
815 unsigned lds_increment =
816 gfx_level >= GFX11 && stage == MESA_SHADER_FRAGMENT ? 1024 : pdev->info.lds_encode_granularity;
817
818 VkPipelineExecutableStatisticKHR *s = pStatistics;
819 VkPipelineExecutableStatisticKHR *end = s + (pStatistics ? *pStatisticCount : 0);
820 VkResult result = VK_SUCCESS;
821
822 if (s < end) {
823 desc_copy(s->name, "Driver pipeline hash");
824 desc_copy(s->description, "Driver pipeline hash used by RGP");
825 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
826 s->value.u64 = pipeline->pipeline_hash;
827 }
828 ++s;
829
830 if (s < end) {
831 desc_copy(s->name, "SGPRs");
832 desc_copy(s->description, "Number of SGPR registers allocated per subgroup");
833 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
834 s->value.u64 = shader->config.num_sgprs;
835 }
836 ++s;
837
838 if (s < end) {
839 desc_copy(s->name, "VGPRs");
840 desc_copy(s->description, "Number of VGPR registers allocated per subgroup");
841 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
842 s->value.u64 = shader->config.num_vgprs;
843 }
844 ++s;
845
846 if (s < end) {
847 desc_copy(s->name, "Spilled SGPRs");
848 desc_copy(s->description, "Number of SGPR registers spilled per subgroup");
849 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
850 s->value.u64 = shader->config.spilled_sgprs;
851 }
852 ++s;
853
854 if (s < end) {
855 desc_copy(s->name, "Spilled VGPRs");
856 desc_copy(s->description, "Number of VGPR registers spilled per subgroup");
857 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
858 s->value.u64 = shader->config.spilled_vgprs;
859 }
860 ++s;
861
862 if (s < end) {
863 desc_copy(s->name, "Code size");
864 desc_copy(s->description, "Code size in bytes");
865 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
866 s->value.u64 = shader->exec_size;
867 }
868 ++s;
869
870 if (s < end) {
871 desc_copy(s->name, "LDS size");
872 desc_copy(s->description, "LDS size in bytes per workgroup");
873 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
874 s->value.u64 = shader->config.lds_size * lds_increment;
875 }
876 ++s;
877
878 if (s < end) {
879 desc_copy(s->name, "Scratch size");
880 desc_copy(s->description, "Private memory in bytes per subgroup");
881 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
882 s->value.u64 = shader->config.scratch_bytes_per_wave;
883 }
884 ++s;
885
886 if (s < end) {
887 desc_copy(s->name, "Subgroups per SIMD");
888 desc_copy(s->description, "The maximum number of subgroups in flight on a SIMD unit");
889 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
890 s->value.u64 = shader->max_waves;
891 }
892 ++s;
893
894 if (s < end) {
895 desc_copy(s->name, "Combined inputs");
896 desc_copy(s->description, "Number of input slots reserved for the shader (including merged stages)");
897 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
898 s->value.u64 = 0;
899
900 switch (stage) {
901 case MESA_SHADER_VERTEX:
902 if (gfx_level <= GFX8 || (!shader->info.vs.as_es && !shader->info.vs.as_ls)) {
903 /* VS inputs when VS is a separate stage */
904 s->value.u64 += util_bitcount(shader->info.vs.input_slot_usage_mask);
905 }
906 break;
907
908 case MESA_SHADER_TESS_CTRL:
909 if (gfx_level >= GFX9) {
910 /* VS inputs when pipeline has tess */
911 s->value.u64 += util_bitcount(shader->info.vs.input_slot_usage_mask);
912 }
913
914 /* VS -> TCS inputs */
915 s->value.u64 += shader->info.tcs.num_linked_inputs;
916 break;
917
918 case MESA_SHADER_TESS_EVAL:
919 if (gfx_level <= GFX8 || !shader->info.tes.as_es) {
920 /* TCS -> TES inputs when TES is a separate stage */
921 s->value.u64 += shader->info.tes.num_linked_inputs + shader->info.tes.num_linked_patch_inputs;
922 }
923 break;
924
925 case MESA_SHADER_GEOMETRY:
926 /* The IO stats of the GS copy shader are already reflected by GS and FS, so leave it empty. */
927 if (shader->info.type == RADV_SHADER_TYPE_GS_COPY)
928 break;
929
930 if (gfx_level >= GFX9) {
931 if (shader->info.gs.es_type == MESA_SHADER_VERTEX) {
932 /* VS inputs when pipeline has GS but no tess */
933 s->value.u64 += util_bitcount(shader->info.vs.input_slot_usage_mask);
934 } else if (shader->info.gs.es_type == MESA_SHADER_TESS_EVAL) {
935 /* TCS -> TES inputs when pipeline has GS */
936 s->value.u64 += shader->info.tes.num_linked_inputs + shader->info.tes.num_linked_patch_inputs;
937 }
938 }
939
940 /* VS -> GS or TES -> GS inputs */
941 s->value.u64 += shader->info.gs.num_linked_inputs;
942 break;
943
944 case MESA_SHADER_FRAGMENT:
945 s->value.u64 += shader->info.ps.num_inputs;
946 break;
947
948 default:
949 /* Other stages don't have IO or we are not interested in them. */
950 break;
951 }
952 }
953 ++s;
954
955 if (s < end) {
956 desc_copy(s->name, "Combined outputs");
957 desc_copy(s->description, "Number of output slots reserved for the shader (including merged stages)");
958 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
959 s->value.u64 = 0;
960
961 switch (stage) {
962 case MESA_SHADER_VERTEX:
963 if (!shader->info.vs.as_ls && !shader->info.vs.as_es) {
964 /* VS -> FS outputs. */
965 s->value.u64 += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
966 shader->info.outinfo.prim_param_exports;
967 } else if (gfx_level <= GFX8) {
968 /* VS -> TCS, VS -> GS outputs on GFX6-8 */
969 s->value.u64 += shader->info.vs.num_linked_outputs;
970 }
971 break;
972
973 case MESA_SHADER_TESS_CTRL:
974 if (gfx_level >= GFX9) {
975 /* VS -> TCS outputs on GFX9+ */
976 s->value.u64 += shader->info.vs.num_linked_outputs;
977 }
978
979 /* TCS -> TES outputs */
980 s->value.u64 += shader->info.tcs.num_linked_outputs + shader->info.tcs.num_linked_patch_outputs;
981 break;
982
983 case MESA_SHADER_TESS_EVAL:
984 if (!shader->info.tes.as_es) {
985 /* TES -> FS outputs */
986 s->value.u64 += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
987 shader->info.outinfo.prim_param_exports;
988 } else if (gfx_level <= GFX8) {
989 /* TES -> GS outputs on GFX6-8 */
990 s->value.u64 += shader->info.tes.num_linked_outputs;
991 }
992 break;
993
994 case MESA_SHADER_GEOMETRY:
995 /* The IO stats of the GS copy shader are already reflected by GS and FS, so leave it empty. */
996 if (shader->info.type == RADV_SHADER_TYPE_GS_COPY)
997 break;
998
999 if (gfx_level >= GFX9) {
1000 if (shader->info.gs.es_type == MESA_SHADER_VERTEX) {
1001 /* VS -> GS outputs on GFX9+ */
1002 s->value.u64 += shader->info.vs.num_linked_outputs;
1003 } else if (shader->info.gs.es_type == MESA_SHADER_TESS_EVAL) {
1004 /* TES -> GS outputs on GFX9+ */
1005 s->value.u64 += shader->info.tes.num_linked_outputs;
1006 }
1007 }
1008
1009 if (shader->info.is_ngg) {
1010 /* GS -> FS outputs (GFX10+ NGG) */
1011 s->value.u64 += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
1012 shader->info.outinfo.prim_param_exports;
1013 } else {
1014 /* GS -> FS outputs (GFX6-10.3 legacy) */
1015 s->value.u64 += shader->info.gs.gsvs_vertex_size / 16;
1016 }
1017 break;
1018
1019 case MESA_SHADER_MESH:
1020 /* MS -> FS outputs */
1021 s->value.u64 += shader->info.outinfo.pos_exports + shader->info.outinfo.param_exports +
1022 shader->info.outinfo.prim_param_exports;
1023 break;
1024
1025 case MESA_SHADER_FRAGMENT:
1026 s->value.u64 += DIV_ROUND_UP(util_bitcount(shader->info.ps.colors_written), 4) + !!shader->info.ps.writes_z +
1027 !!shader->info.ps.writes_stencil + !!shader->info.ps.writes_sample_mask +
1028 !!shader->info.ps.writes_mrt0_alpha;
1029 break;
1030
1031 default:
1032 /* Other stages don't have IO or we are not interested in them. */
1033 break;
1034 }
1035 }
1036 ++s;
1037
1038 if (shader->statistics) {
1039 for (unsigned i = 0; i < aco_num_statistics; i++) {
1040 const struct aco_compiler_statistic_info *info = &aco_statistic_infos[i];
1041 if (s < end) {
1042 desc_copy(s->name, info->name);
1043 desc_copy(s->description, info->desc);
1044 s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
1045 s->value.u64 = shader->statistics[i];
1046 }
1047 ++s;
1048 }
1049 }
1050
1051 if (!pStatistics)
1052 *pStatisticCount = s - pStatistics;
1053 else if (s > end) {
1054 *pStatisticCount = end - pStatistics;
1055 result = VK_INCOMPLETE;
1056 } else {
1057 *pStatisticCount = s - pStatistics;
1058 }
1059
1060 return result;
1061 }
1062
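/* Copy a NUL-terminated representation string into the user-provided buffer; return VK_INCOMPLETE
 * if it had to be truncated.
 */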
static VkResult
radv_copy_representation(void *data, size_t *data_size, const char *src)
{
1066 size_t total_size = strlen(src) + 1;
1067
1068 if (!data) {
1069 *data_size = total_size;
1070 return VK_SUCCESS;
1071 }
1072
1073 size_t size = MIN2(total_size, *data_size);
1074
1075 memcpy(data, src, size);
1076 if (size)
1077 *((char *)data + size - 1) = 0;
1078 return size < total_size ? VK_INCOMPLETE : VK_SUCCESS;
1079 }
1080
VKAPI_ATTR VkResult VKAPI_CALL
radv_GetPipelineExecutableInternalRepresentationsKHR(
   VkDevice _device, const VkPipelineExecutableInfoKHR *pExecutableInfo, uint32_t *pInternalRepresentationCount,
   VkPipelineExecutableInternalRepresentationKHR *pInternalRepresentations)
{
1086 VK_FROM_HANDLE(radv_device, device, _device);
1087 VK_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
1088 const struct radv_physical_device *pdev = radv_device_physical(device);
1089 gl_shader_stage stage;
1090 struct radv_shader *shader =
1091 radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);
1092
1093 VkPipelineExecutableInternalRepresentationKHR *p = pInternalRepresentations;
1094 VkPipelineExecutableInternalRepresentationKHR *end =
1095 p + (pInternalRepresentations ? *pInternalRepresentationCount : 0);
1096 VkResult result = VK_SUCCESS;
1097 /* optimized NIR */
1098 if (p < end) {
1099 p->isText = true;
1100 desc_copy(p->name, "NIR Shader(s)");
1101 desc_copy(p->description, "The optimized NIR shader(s)");
1102 if (radv_copy_representation(p->pData, &p->dataSize, shader->nir_string) != VK_SUCCESS)
1103 result = VK_INCOMPLETE;
1104 }
1105 ++p;
1106
1107 /* backend IR */
1108 if (p < end) {
1109 p->isText = true;
1110 if (radv_use_llvm_for_stage(pdev, stage)) {
1111 desc_copy(p->name, "LLVM IR");
1112 desc_copy(p->description, "The LLVM IR after some optimizations");
1113 } else {
1114 desc_copy(p->name, "ACO IR");
1115 desc_copy(p->description, "The ACO IR after some optimizations");
1116 }
1117 if (radv_copy_representation(p->pData, &p->dataSize, shader->ir_string) != VK_SUCCESS)
1118 result = VK_INCOMPLETE;
1119 }
1120 ++p;
1121
1122 /* Disassembler */
1123 if (p < end && shader->disasm_string) {
1124 p->isText = true;
1125 desc_copy(p->name, "Assembly");
1126 desc_copy(p->description, "Final Assembly");
1127 if (radv_copy_representation(p->pData, &p->dataSize, shader->disasm_string) != VK_SUCCESS)
1128 result = VK_INCOMPLETE;
1129 }
1130 ++p;
1131
1132 if (!pInternalRepresentations)
1133 *pInternalRepresentationCount = p - pInternalRepresentations;
1134 else if (p > end) {
1135 result = VK_INCOMPLETE;
1136 *pInternalRepresentationCount = end - pInternalRepresentations;
1137 } else {
1138 *pInternalRepresentationCount = p - pInternalRepresentations;
1139 }
1140
1141 return result;
1142 }
1143
static void
vk_shader_module_finish(void *_module)
{
1147 struct vk_shader_module *module = _module;
1148 vk_object_base_finish(&module->base);
1149 }
1150
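/* Deep-copy the shader stage create infos (including shader modules, specialization info and
 * entrypoint names) into mem_ctx.
 */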
VkPipelineShaderStageCreateInfo *
radv_copy_shader_stage_create_info(struct radv_device *device, uint32_t stageCount,
                                   const VkPipelineShaderStageCreateInfo *pStages, void *mem_ctx)
{
1155 VkPipelineShaderStageCreateInfo *new_stages;
1156
1157 size_t size = sizeof(VkPipelineShaderStageCreateInfo) * stageCount;
1158 new_stages = ralloc_size(mem_ctx, size);
1159 if (!new_stages)
1160 return NULL;
1161
1162 if (size)
1163 memcpy(new_stages, pStages, size);
1164
1165 for (uint32_t i = 0; i < stageCount; i++) {
1166 VK_FROM_HANDLE(vk_shader_module, module, new_stages[i].module);
1167
1168 const VkShaderModuleCreateInfo *minfo = vk_find_struct_const(pStages[i].pNext, SHADER_MODULE_CREATE_INFO);
1169
1170 if (module) {
1171 struct vk_shader_module *new_module = ralloc_size(mem_ctx, sizeof(struct vk_shader_module) + module->size);
1172 if (!new_module)
1173 return NULL;
1174
1175 ralloc_set_destructor(new_module, vk_shader_module_finish);
1176 vk_object_base_init(&device->vk, &new_module->base, VK_OBJECT_TYPE_SHADER_MODULE);
1177
1178 new_module->nir = NULL;
1179 memcpy(new_module->hash, module->hash, sizeof(module->hash));
1180 new_module->size = module->size;
1181 memcpy(new_module->data, module->data, module->size);
1182
1183 module = new_module;
1184 } else if (minfo) {
1185 module = ralloc_size(mem_ctx, sizeof(struct vk_shader_module) + minfo->codeSize);
1186 if (!module)
1187 return NULL;
1188
1189 vk_shader_module_init(&device->vk, module, minfo);
1190 }
1191
1192 if (module) {
1193 const VkSpecializationInfo *spec = new_stages[i].pSpecializationInfo;
1194 if (spec) {
1195 VkSpecializationInfo *new_spec = ralloc(mem_ctx, VkSpecializationInfo);
1196 if (!new_spec)
1197 return NULL;
1198
1199 new_spec->mapEntryCount = spec->mapEntryCount;
1200 uint32_t map_entries_size = sizeof(VkSpecializationMapEntry) * spec->mapEntryCount;
1201 new_spec->pMapEntries = ralloc_size(mem_ctx, map_entries_size);
1202 if (!new_spec->pMapEntries)
1203 return NULL;
1204 memcpy((void *)new_spec->pMapEntries, spec->pMapEntries, map_entries_size);
1205
1206 new_spec->dataSize = spec->dataSize;
1207 new_spec->pData = ralloc_size(mem_ctx, spec->dataSize);
1208 if (!new_spec->pData)
1209 return NULL;
1210 memcpy((void *)new_spec->pData, spec->pData, spec->dataSize);
1211
1212 new_stages[i].pSpecializationInfo = new_spec;
1213 }
1214
1215 new_stages[i].module = vk_shader_module_to_handle(module);
1216 new_stages[i].pName = ralloc_strdup(mem_ctx, new_stages[i].pName);
1217 if (!new_stages[i].pName)
1218 return NULL;
1219 new_stages[i].pNext = NULL;
1220 }
1221 }
1222
1223 return new_stages;
1224 }
1225
void
radv_pipeline_hash(const struct radv_device *device, const struct radv_pipeline_layout *pipeline_layout,
                   struct mesa_sha1 *ctx)
{
1230 _mesa_sha1_update(ctx, device->cache_hash, sizeof(device->cache_hash));
1231 if (pipeline_layout)
1232 _mesa_sha1_update(ctx, pipeline_layout->hash, sizeof(pipeline_layout->hash));
1233 }
1234
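/* Hash the shader stage create info and the stage key into the given SHA-1 context. */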
void
radv_pipeline_hash_shader_stage(VkPipelineCreateFlags2 pipeline_flags, const VkPipelineShaderStageCreateInfo *sinfo,
                                const struct radv_shader_stage_key *stage_key, struct mesa_sha1 *ctx)
{
1239 unsigned char shader_sha1[SHA1_DIGEST_LENGTH];
1240
1241 vk_pipeline_hash_shader_stage(pipeline_flags, sinfo, NULL, shader_sha1);
1242
1243 _mesa_sha1_update(ctx, shader_sha1, sizeof(shader_sha1));
1244 _mesa_sha1_update(ctx, stage_key, sizeof(*stage_key));
1245 }
1246