1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * based in part on anv driver which is:
6  * Copyright © 2015 Intel Corporation
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining a
9  * copy of this software and associated documentation files (the "Software"),
10  * to deal in the Software without restriction, including without limitation
11  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12  * and/or sell copies of the Software, and to permit persons to whom the
13  * Software is furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the next
16  * paragraph) shall be included in all copies or substantial portions of the
17  * Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25  * IN THE SOFTWARE.
26  */
27 
28 #include "meta/radv_meta.h"
29 #include "nir/nir.h"
30 #include "nir/nir_builder.h"
31 #include "nir/nir_serialize.h"
32 #include "nir/radv_nir.h"
33 #include "spirv/nir_spirv.h"
34 #include "util/disk_cache.h"
35 #include "util/mesa-sha1.h"
36 #include "util/os_time.h"
37 #include "util/u_atomic.h"
38 #include "radv_cs.h"
39 #include "radv_debug.h"
40 #include "radv_private.h"
41 #include "radv_shader.h"
42 #include "radv_shader_args.h"
43 #include "vk_pipeline.h"
44 #include "vk_render_pass.h"
45 #include "vk_util.h"
46 
47 #include "util/u_debug.h"
48 #include "ac_binary.h"
49 #include "ac_nir.h"
50 #include "ac_shader_util.h"
51 #include "aco_interface.h"
52 #include "sid.h"
53 #include "vk_format.h"
54 #include "vk_nir_convert_ycbcr.h"
55 
56 bool
57 radv_shader_need_indirect_descriptor_sets(const struct radv_shader *shader)
58 {
59    const struct radv_userdata_info *loc = radv_get_user_sgpr(shader, AC_UD_INDIRECT_DESCRIPTOR_SETS);
60    return loc->sgpr_idx != -1;
61 }
62 
63 bool
64 radv_pipeline_capture_shaders(const struct radv_device *device, VkPipelineCreateFlags2KHR flags)
65 {
66    return (flags & VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR) ||
67           (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS) || device->keep_shader_info;
68 }
69 
70 bool
71 radv_pipeline_capture_shader_stats(const struct radv_device *device, VkPipelineCreateFlags2KHR flags)
72 {
73    return (flags & VK_PIPELINE_CREATE_2_CAPTURE_STATISTICS_BIT_KHR) ||
74           (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) || device->keep_shader_info;
75 }
76 
77 void
78 radv_pipeline_init(struct radv_device *device, struct radv_pipeline *pipeline, enum radv_pipeline_type type)
79 {
80    vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
81 
82    pipeline->type = type;
83 }
84 
85 void
86 radv_pipeline_destroy(struct radv_device *device, struct radv_pipeline *pipeline,
87                       const VkAllocationCallbacks *allocator)
88 {
89    if (pipeline->cache_object)
90       vk_pipeline_cache_object_unref(&device->vk, pipeline->cache_object);
91 
92    switch (pipeline->type) {
93    case RADV_PIPELINE_GRAPHICS:
94       radv_destroy_graphics_pipeline(device, radv_pipeline_to_graphics(pipeline));
95       break;
96    case RADV_PIPELINE_GRAPHICS_LIB:
97       radv_destroy_graphics_lib_pipeline(device, radv_pipeline_to_graphics_lib(pipeline));
98       break;
99    case RADV_PIPELINE_COMPUTE:
100       radv_destroy_compute_pipeline(device, radv_pipeline_to_compute(pipeline));
101       break;
102    case RADV_PIPELINE_RAY_TRACING:
103       radv_destroy_ray_tracing_pipeline(device, radv_pipeline_to_ray_tracing(pipeline));
104       break;
105    default:
106       unreachable("invalid pipeline type");
107    }
108 
109    if (pipeline->cs.buf)
110       free(pipeline->cs.buf);
111 
112    radv_rmv_log_resource_destroy(device, (uint64_t)radv_pipeline_to_handle(pipeline));
113    vk_object_base_finish(&pipeline->base);
114    vk_free2(&device->vk.alloc, allocator, pipeline);
115 }
116 
117 VKAPI_ATTR void VKAPI_CALL
118 radv_DestroyPipeline(VkDevice _device, VkPipeline _pipeline, const VkAllocationCallbacks *pAllocator)
119 {
120    RADV_FROM_HANDLE(radv_device, device, _device);
121    RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
122 
123    if (!_pipeline)
124       return;
125 
126    radv_pipeline_destroy(device, pipeline, pAllocator);
127 }
128 
129 static enum radv_buffer_robustness
130 radv_convert_buffer_robustness(const struct radv_device *device, VkPipelineRobustnessBufferBehaviorEXT behaviour)
131 {
132    switch (behaviour) {
133    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DEVICE_DEFAULT_EXT:
134       return device->buffer_robustness;
135    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
136       return RADV_BUFFER_ROBUSTNESS_DISABLED;
137    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
138       return RADV_BUFFER_ROBUSTNESS_1;
139    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
140       return RADV_BUFFER_ROBUSTNESS_2;
141    default:
142       unreachable("Invalid pipeline robustness behavior");
143    }
144 }
145 
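/* Build the per-stage shader key from the pipeline create flags and the
 * pNext chains: statistics/IR capture, disabled optimizations, buffer
 * robustness, and any required subgroup size. The key feeds shader hashing
 * and compilation.
 */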
146 struct radv_shader_stage_key
147 radv_pipeline_get_shader_key(const struct radv_device *device, const VkPipelineShaderStageCreateInfo *stage,
148                              VkPipelineCreateFlags2KHR flags, const void *pNext)
149 {
150    gl_shader_stage s = vk_to_mesa_shader_stage(stage->stage);
151    struct radv_shader_stage_key key = {0};
152 
153    key.keep_statistic_info = radv_pipeline_capture_shader_stats(device, flags);
154 
155    if (flags & VK_PIPELINE_CREATE_2_DISABLE_OPTIMIZATION_BIT_KHR)
156       key.optimisations_disabled = 1;
157 
158    if (stage->stage & RADV_GRAPHICS_STAGE_BITS) {
159       key.version = device->instance->drirc.override_graphics_shader_version;
160    } else if (stage->stage & RADV_RT_STAGE_BITS) {
161       key.version = device->instance->drirc.override_ray_tracing_shader_version;
162    } else {
163       assert(stage->stage == VK_SHADER_STAGE_COMPUTE_BIT);
164       key.version = device->instance->drirc.override_compute_shader_version;
165    }
166 
167    const VkPipelineRobustnessCreateInfoEXT *pipeline_robust_info =
168       vk_find_struct_const(pNext, PIPELINE_ROBUSTNESS_CREATE_INFO_EXT);
169 
170    const VkPipelineRobustnessCreateInfoEXT *stage_robust_info =
171       vk_find_struct_const(stage->pNext, PIPELINE_ROBUSTNESS_CREATE_INFO_EXT);
172 
173    /* map any hit to intersection as these shaders get merged */
174    if (s == MESA_SHADER_ANY_HIT)
175       s = MESA_SHADER_INTERSECTION;
176 
177    enum radv_buffer_robustness storage_robustness = device->buffer_robustness;
178    enum radv_buffer_robustness uniform_robustness = device->buffer_robustness;
179    enum radv_buffer_robustness vertex_robustness = device->buffer_robustness;
180 
181    const VkPipelineRobustnessCreateInfoEXT *robust_info = stage_robust_info ? stage_robust_info : pipeline_robust_info;
182 
183    if (robust_info) {
184       storage_robustness = radv_convert_buffer_robustness(device, robust_info->storageBuffers);
185       uniform_robustness = radv_convert_buffer_robustness(device, robust_info->uniformBuffers);
186       vertex_robustness = radv_convert_buffer_robustness(device, robust_info->vertexInputs);
187    }
188 
189    if (storage_robustness >= RADV_BUFFER_ROBUSTNESS_2)
190       key.storage_robustness2 = 1;
191    if (uniform_robustness >= RADV_BUFFER_ROBUSTNESS_2)
192       key.uniform_robustness2 = 1;
193    if (s == MESA_SHADER_VERTEX && vertex_robustness >= RADV_BUFFER_ROBUSTNESS_1)
194       key.vertex_robustness1 = 1u;
195 
196    const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *const subgroup_size =
197       vk_find_struct_const(stage->pNext, PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
198 
199    if (subgroup_size) {
200       if (subgroup_size->requiredSubgroupSize == 32)
201          key.subgroup_required_size = RADV_REQUIRED_WAVE32;
202       else if (subgroup_size->requiredSubgroupSize == 64)
203          key.subgroup_required_size = RADV_REQUIRED_WAVE64;
204       else
205          unreachable("Unsupported required subgroup size.");
206    }
207 
208    if (stage->flags & VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT) {
209       key.subgroup_require_full = 1;
210    }
211 
212    return key;
213 }
214 
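/* Fill a radv_shader_stage from a VkPipelineShaderStageCreateInfo: the SPIR-V
 * comes either from the shader module or from an inline VkShaderModuleCreateInfo,
 * the pipeline layout is snapshotted for this stage, and the stage is hashed
 * (used for caching).
 */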
215 void
216 radv_pipeline_stage_init(const VkPipelineShaderStageCreateInfo *sinfo,
217                          const struct radv_pipeline_layout *pipeline_layout,
218                          const struct radv_shader_stage_key *stage_key, struct radv_shader_stage *out_stage)
219 {
220    const VkShaderModuleCreateInfo *minfo = vk_find_struct_const(sinfo->pNext, SHADER_MODULE_CREATE_INFO);
221    const VkPipelineShaderStageModuleIdentifierCreateInfoEXT *iinfo =
222       vk_find_struct_const(sinfo->pNext, PIPELINE_SHADER_STAGE_MODULE_IDENTIFIER_CREATE_INFO_EXT);
223 
224    if (sinfo->module == VK_NULL_HANDLE && !minfo && !iinfo)
225       return;
226 
227    memset(out_stage, 0, sizeof(*out_stage));
228 
229    out_stage->stage = vk_to_mesa_shader_stage(sinfo->stage);
230    out_stage->next_stage = MESA_SHADER_NONE;
231    out_stage->entrypoint = sinfo->pName;
232    out_stage->spec_info = sinfo->pSpecializationInfo;
233    out_stage->feedback.flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
234    out_stage->key = *stage_key;
235 
236    if (sinfo->module != VK_NULL_HANDLE) {
237       struct vk_shader_module *module = vk_shader_module_from_handle(sinfo->module);
238 
239       out_stage->spirv.data = module->data;
240       out_stage->spirv.size = module->size;
241       out_stage->spirv.object = &module->base;
242 
243       if (module->nir)
244          out_stage->internal_nir = module->nir;
245    } else if (minfo) {
246       out_stage->spirv.data = (const char *)minfo->pCode;
247       out_stage->spirv.size = minfo->codeSize;
248    }
249 
250    radv_shader_layout_init(pipeline_layout, out_stage->stage, &out_stage->layout);
251 
252    vk_pipeline_hash_shader_stage(sinfo, NULL, out_stage->shader_sha1);
253 }
254 
255 void
256 radv_shader_layout_init(const struct radv_pipeline_layout *pipeline_layout, gl_shader_stage stage,
257                         struct radv_shader_layout *layout)
258 {
259    layout->num_sets = pipeline_layout->num_sets;
260    for (unsigned i = 0; i < pipeline_layout->num_sets; i++) {
261       layout->set[i].layout = pipeline_layout->set[i].layout;
262       layout->set[i].dynamic_offset_start = pipeline_layout->set[i].dynamic_offset_start;
263    }
264 
265    layout->push_constant_size = pipeline_layout->push_constant_size;
266 
267    if (pipeline_layout->dynamic_offset_count &&
268        (pipeline_layout->dynamic_shader_stages & mesa_to_vk_shader_stage(stage))) {
269       layout->use_dynamic_descriptors = true;
270    }
271 }
272 
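/* Callback for nir_vk_lower_ycbcr_tex: return the immutable YCbCr conversion
 * state for the given set/binding/array index, or NULL if the binding has no
 * YCbCr samplers.
 */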
273 static const struct vk_ycbcr_conversion_state *
274 ycbcr_conversion_lookup(const void *data, uint32_t set, uint32_t binding, uint32_t array_index)
275 {
276    const struct radv_shader_layout *layout = data;
277 
278    const struct radv_descriptor_set_layout *set_layout = layout->set[set].layout;
279    const struct vk_ycbcr_conversion_state *ycbcr_samplers = radv_immutable_ycbcr_samplers(set_layout, binding);
280 
281    if (!ycbcr_samplers)
282       return NULL;
283 
284    return ycbcr_samplers + array_index;
285 }
286 
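/* Callback for nir_opt_load_store_vectorize: decide whether two memory accesses
 * may be merged into a single load/store of the given bit size, component count
 * and alignment on this GFX level.
 */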
287 bool
288 radv_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size, unsigned num_components,
289                             nir_intrinsic_instr *low, nir_intrinsic_instr *high, void *data)
290 {
291    if (num_components > 4)
292       return false;
293 
294    bool is_scratch = false;
295    switch (low->intrinsic) {
296    case nir_intrinsic_load_stack:
297    case nir_intrinsic_load_scratch:
298    case nir_intrinsic_store_stack:
299    case nir_intrinsic_store_scratch:
300       is_scratch = true;
301       break;
302    default:
303       break;
304    }
305 
306    /* >128 bit loads are split except with SMEM. On GFX6-8, >32 bit scratch loads are split. */
307    enum amd_gfx_level gfx_level = *(enum amd_gfx_level *)data;
308    if (bit_size * num_components > (is_scratch && gfx_level <= GFX8 ? 32 : 128))
309       return false;
310 
311    uint32_t align;
312    if (align_offset)
313       align = 1 << (ffs(align_offset) - 1);
314    else
315       align = align_mul;
316 
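   /* Worked example: align_mul = 4, align_offset = 2 gives align = 2, so in the
    * buffer cases below at most 16 bits can be merged (two 8-bit components or a
    * single 16-bit value); a 32-bit access is rejected by the
    * align % (bit_size / 8) check.
    */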
317    switch (low->intrinsic) {
318    case nir_intrinsic_load_global:
319    case nir_intrinsic_load_global_constant:
320    case nir_intrinsic_store_global:
321    case nir_intrinsic_store_ssbo:
322    case nir_intrinsic_load_ssbo:
323    case nir_intrinsic_load_ubo:
324    case nir_intrinsic_load_push_constant:
325    case nir_intrinsic_load_stack:
326    case nir_intrinsic_load_scratch:
327    case nir_intrinsic_store_stack:
328    case nir_intrinsic_store_scratch: {
329       unsigned max_components;
330       if (align % 4 == 0)
331          max_components = NIR_MAX_VEC_COMPONENTS;
332       else if (align % 2 == 0)
333          max_components = 16u / bit_size;
334       else
335          max_components = 8u / bit_size;
336       return (align % (bit_size / 8u)) == 0 && num_components <= max_components;
337    }
338    case nir_intrinsic_load_deref:
339    case nir_intrinsic_store_deref:
340       assert(nir_deref_mode_is(nir_src_as_deref(low->src[0]), nir_var_mem_shared));
341       FALLTHROUGH;
342    case nir_intrinsic_load_shared:
343    case nir_intrinsic_store_shared:
344       if (bit_size * num_components == 96) { /* 96 bit loads require 128 bit alignment and are split otherwise */
345          return align % 16 == 0;
346       } else if (bit_size == 16 && (align % 4)) {
347          /* AMD hardware can't do 2-byte aligned f16vec2 loads, but they are useful for ALU
348           * vectorization, because our vectorizer requires the scalar IR to already contain vectors.
349           */
350          return (align % 2 == 0) && num_components <= 2;
351       } else {
352          if (num_components == 3) {
353             /* AMD hardware can't do 3-component loads except for 96-bit loads, handled above. */
354             return false;
355          }
356          unsigned req = bit_size * num_components;
357          if (req == 64 || req == 128) /* 64-bit and 128-bit loads can use ds_read2_b{32,64} */
358             req /= 2u;
359          return align % (req / 8u) == 0;
360       }
361    default:
362       return false;
363    }
364    return false;
365 }
366 
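/* Callback for nir_lower_bit_size: return 32 to widen an 8/16-bit ALU
 * instruction that the hardware cannot (or should not) execute natively,
 * or 0 to keep its original bit size.
 */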
367 static unsigned
368 lower_bit_size_callback(const nir_instr *instr, void *_)
369 {
370    struct radv_device *device = _;
371    enum amd_gfx_level chip = device->physical_device->rad_info.gfx_level;
372 
373    if (instr->type != nir_instr_type_alu)
374       return 0;
375    nir_alu_instr *alu = nir_instr_as_alu(instr);
376 
377    /* If an instruction is not scalarized by this point,
378     * it can be emitted as a packed instruction. */
379    if (alu->def.num_components > 1)
380       return 0;
381 
382    if (alu->def.bit_size & (8 | 16)) {
383       unsigned bit_size = alu->def.bit_size;
384       switch (alu->op) {
385       case nir_op_bitfield_select:
386       case nir_op_imul_high:
387       case nir_op_umul_high:
388       case nir_op_uadd_carry:
389       case nir_op_usub_borrow:
390          return 32;
391       case nir_op_iabs:
392       case nir_op_imax:
393       case nir_op_umax:
394       case nir_op_imin:
395       case nir_op_umin:
396       case nir_op_ishr:
397       case nir_op_ushr:
398       case nir_op_ishl:
399       case nir_op_isign:
400       case nir_op_uadd_sat:
401       case nir_op_usub_sat:
402          return (bit_size == 8 || !(chip >= GFX8 && alu->def.divergent)) ? 32 : 0;
403       case nir_op_iadd_sat:
404       case nir_op_isub_sat:
405          return bit_size == 8 || !alu->def.divergent ? 32 : 0;
406 
407       default:
408          return 0;
409       }
410    }
411 
412    if (nir_src_bit_size(alu->src[0].src) & (8 | 16)) {
413       unsigned bit_size = nir_src_bit_size(alu->src[0].src);
414       switch (alu->op) {
415       case nir_op_bit_count:
416       case nir_op_find_lsb:
417       case nir_op_ufind_msb:
418          return 32;
419       case nir_op_ilt:
420       case nir_op_ige:
421       case nir_op_ieq:
422       case nir_op_ine:
423       case nir_op_ult:
424       case nir_op_uge:
425       case nir_op_bitz:
426       case nir_op_bitnz:
427          return (bit_size == 8 || !(chip >= GFX8 && alu->def.divergent)) ? 32 : 0;
428       default:
429          return 0;
430       }
431    }
432 
433    return 0;
434 }
435 
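/* Vectorization width callback: return 2 for 16-bit ALU ops that ACO can emit
 * as packed (v_pk_*) instructions on GFX9+, otherwise 1 (keep scalar).
 */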
436 static uint8_t
437 opt_vectorize_callback(const nir_instr *instr, const void *_)
438 {
439    if (instr->type != nir_instr_type_alu)
440       return 0;
441 
442    const struct radv_device *device = _;
443    enum amd_gfx_level chip = device->physical_device->rad_info.gfx_level;
444    if (chip < GFX9)
445       return 1;
446 
447    const nir_alu_instr *alu = nir_instr_as_alu(instr);
448    const unsigned bit_size = alu->def.bit_size;
449    if (bit_size != 16)
450       return 1;
451 
452    return aco_nir_op_supports_packed_math_16bit(alu) ? 2 : 1;
453 }
454 
455 static nir_component_mask_t
456 non_uniform_access_callback(const nir_src *src, void *_)
457 {
458    if (src->ssa->num_components == 1)
459       return 0x1;
460    return nir_chase_binding(*src).success ? 0x2 : 0x3;
461 }
462 
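/* Backend-independent NIR lowering that runs after linking and before backend
 * compilation: non-uniform access, load/store vectorization, YCbCr and texture
 * lowering, I/O-to-memory and NGG/legacy/PS lowering, ABI lowering, and final
 * cleanup passes.
 */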
463 void
464 radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_state_key *gfx_state,
465                      struct radv_shader_stage *stage)
466 {
467    enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
468    bool progress;
469 
470    /* Wave and workgroup size should already be filled. */
471    assert(stage->info.wave_size && stage->info.workgroup_size);
472 
473    if (stage->stage == MESA_SHADER_FRAGMENT) {
474       if (!stage->key.optimisations_disabled) {
475          NIR_PASS(_, stage->nir, nir_opt_cse);
476       }
477       NIR_PASS(_, stage->nir, radv_nir_lower_fs_intrinsics, stage, gfx_state);
478    }
479 
480    enum nir_lower_non_uniform_access_type lower_non_uniform_access_types =
481       nir_lower_non_uniform_ubo_access | nir_lower_non_uniform_ssbo_access | nir_lower_non_uniform_texture_access |
482       nir_lower_non_uniform_image_access;
483 
484    /* In practice, most shaders do not have non-uniform-qualified
485     * accesses (see
486     * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17558#note_1475069)
487     * so a cheaper check, which is likely to fail, is run first.
488     */
489    if (nir_has_non_uniform_access(stage->nir, lower_non_uniform_access_types)) {
490       if (!stage->key.optimisations_disabled) {
491          NIR_PASS(_, stage->nir, nir_opt_non_uniform_access);
492       }
493 
494       if (!radv_use_llvm_for_stage(device, stage->stage)) {
495          nir_lower_non_uniform_access_options options = {
496             .types = lower_non_uniform_access_types,
497             .callback = &non_uniform_access_callback,
498             .callback_data = NULL,
499          };
500          NIR_PASS(_, stage->nir, nir_lower_non_uniform_access, &options);
501       }
502    }
503    NIR_PASS(_, stage->nir, nir_lower_memory_model);
504 
505    nir_load_store_vectorize_options vectorize_opts = {
506       .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_push_const | nir_var_mem_shared | nir_var_mem_global |
507                nir_var_shader_temp,
508       .callback = radv_mem_vectorize_callback,
509       .cb_data = &gfx_level,
510       .robust_modes = 0,
511       /* On GFX6, read2/write2 is out-of-bounds if the offset register is negative, even if
512        * the final offset is not.
513        */
514       .has_shared2_amd = gfx_level >= GFX7,
515    };
516 
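   /* With robustBufferAccess2 the bounds check applies to each individual access,
    * so merging accesses could change out-of-bounds behaviour; mark these modes
    * as robust so the vectorizer stays conservative.
    */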
517    if (stage->key.uniform_robustness2)
518       vectorize_opts.robust_modes |= nir_var_mem_ubo;
519 
520    if (stage->key.storage_robustness2)
521       vectorize_opts.robust_modes |= nir_var_mem_ssbo;
522 
523    if (!stage->key.optimisations_disabled) {
524       progress = false;
525       NIR_PASS(progress, stage->nir, nir_opt_load_store_vectorize, &vectorize_opts);
526       if (progress) {
527          NIR_PASS(_, stage->nir, nir_copy_prop);
528          NIR_PASS(_, stage->nir, nir_opt_shrink_stores, !device->instance->drirc.disable_shrink_image_store);
529 
530          /* Gather info again to update whether 8/16-bit types are used. */
531          nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir));
532       }
533    }
534 
535    NIR_PASS(_, stage->nir, ac_nir_lower_subdword_loads,
536             (ac_nir_lower_subdword_options){.modes_1_comp = nir_var_mem_ubo,
537                                             .modes_N_comps = nir_var_mem_ubo | nir_var_mem_ssbo});
538 
539    progress = false;
540    NIR_PASS(progress, stage->nir, nir_vk_lower_ycbcr_tex, ycbcr_conversion_lookup, &stage->layout);
541    /* Gather info again in case nir_vk_lower_ycbcr_tex emitted resinfo instructions. */
542    if (progress)
543       nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir));
544 
545    bool fix_derivs_in_divergent_cf =
546       stage->stage == MESA_SHADER_FRAGMENT && !radv_use_llvm_for_stage(device, stage->stage);
547    if (fix_derivs_in_divergent_cf) {
548       NIR_PASS(_, stage->nir, nir_convert_to_lcssa, true, true);
549       nir_divergence_analysis(stage->nir);
550    }
551    NIR_PASS(_, stage->nir, ac_nir_lower_tex,
552             &(ac_nir_lower_tex_options){
553                .gfx_level = gfx_level,
554                .lower_array_layer_round_even =
555                   !device->physical_device->rad_info.conformant_trunc_coord || device->disable_trunc_coord,
556                .fix_derivs_in_divergent_cf = fix_derivs_in_divergent_cf,
557                .max_wqm_vgprs = 64, // TODO: improve spiller and RA support for linear VGPRs
558             });
559    if (fix_derivs_in_divergent_cf)
560       NIR_PASS(_, stage->nir, nir_opt_remove_phis); /* cleanup LCSSA phis */
561 
562    if (stage->nir->info.uses_resource_info_query)
563       NIR_PASS(_, stage->nir, ac_nir_lower_resinfo, gfx_level);
564 
565    NIR_PASS_V(stage->nir, radv_nir_apply_pipeline_layout, device, stage);
566 
567    if (!stage->key.optimisations_disabled) {
568       NIR_PASS(_, stage->nir, nir_opt_shrink_vectors);
569    }
570 
571    NIR_PASS(_, stage->nir, nir_lower_alu_width, opt_vectorize_callback, device);
572 
573    nir_move_options sink_opts = nir_move_const_undef | nir_move_copies;
574 
575    if (!stage->key.optimisations_disabled) {
576       if (stage->stage != MESA_SHADER_FRAGMENT || !device->physical_device->cache_key.disable_sinking_load_input_fs)
577          sink_opts |= nir_move_load_input;
578 
579       NIR_PASS(_, stage->nir, nir_opt_sink, sink_opts);
580       NIR_PASS(_, stage->nir, nir_opt_move, nir_move_load_input | nir_move_const_undef | nir_move_copies);
581    }
582 
583    /* Lower VS inputs. We need to do this after nir_opt_sink, because
584     * load_input can be reordered, but buffer loads can't.
585     */
586    if (stage->stage == MESA_SHADER_VERTEX) {
587       NIR_PASS(_, stage->nir, radv_nir_lower_vs_inputs, stage, gfx_state, &device->physical_device->rad_info);
588    }
589 
590    /* Lower I/O intrinsics to memory instructions. */
591    bool is_last_vgt_stage = radv_is_last_vgt_stage(stage);
592    bool io_to_mem = radv_nir_lower_io_to_mem(device, stage);
593    bool lowered_ngg = stage->info.is_ngg && is_last_vgt_stage;
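   /* NGG stages get the full NGG lowering; a non-NGG last VGT stage is lowered to
    * the legacy VS/GS export path; fragment shaders get the PS export lowering below.
    */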
594    if (lowered_ngg) {
595       radv_lower_ngg(device, stage, gfx_state);
596    } else if (is_last_vgt_stage) {
597       if (stage->stage != MESA_SHADER_GEOMETRY) {
598          NIR_PASS_V(stage->nir, ac_nir_lower_legacy_vs, gfx_level,
599                     stage->info.outinfo.clip_dist_mask | stage->info.outinfo.cull_dist_mask,
600                     stage->info.outinfo.vs_output_param_offset, stage->info.outinfo.param_exports,
601                     stage->info.outinfo.export_prim_id, false, false, false, stage->info.force_vrs_per_vertex);
602 
603       } else {
604          bool emulate_ngg_gs_query_pipeline_stat = device->physical_device->emulate_ngg_gs_query_pipeline_stat;
605 
606          ac_nir_gs_output_info gs_out_info = {
607             .streams = stage->info.gs.output_streams,
608             .usage_mask = stage->info.gs.output_usage_mask,
609          };
610          NIR_PASS_V(stage->nir, ac_nir_lower_legacy_gs, false, emulate_ngg_gs_query_pipeline_stat, &gs_out_info);
611       }
612    } else if (stage->stage == MESA_SHADER_FRAGMENT) {
613       ac_nir_lower_ps_options options = {
614          .gfx_level = gfx_level,
615          .family = device->physical_device->rad_info.family,
616          .use_aco = !radv_use_llvm_for_stage(device, stage->stage),
617          .uses_discard = true,
618          .alpha_func = COMPARE_FUNC_ALWAYS,
619          .no_color_export = stage->info.has_epilog,
620          .no_depth_export = stage->info.ps.exports_mrtz_via_epilog,
621 
622          .bc_optimize_for_persp = G_0286CC_PERSP_CENTER_ENA(stage->info.ps.spi_ps_input) &&
623                                   G_0286CC_PERSP_CENTROID_ENA(stage->info.ps.spi_ps_input),
624          .bc_optimize_for_linear = G_0286CC_LINEAR_CENTER_ENA(stage->info.ps.spi_ps_input) &&
625                                    G_0286CC_LINEAR_CENTROID_ENA(stage->info.ps.spi_ps_input),
626       };
627 
628       if (!options.no_color_export) {
629          options.dual_src_blend_swizzle = gfx_state->ps.epilog.mrt0_is_dual_src && gfx_level >= GFX11;
630          options.color_is_int8 = gfx_state->ps.epilog.color_is_int8;
631          options.color_is_int10 = gfx_state->ps.epilog.color_is_int10;
632          options.enable_mrt_output_nan_fixup =
633             gfx_state->ps.epilog.enable_mrt_output_nan_fixup && !stage->nir->info.internal;
634          /* Need to filter out unwritten color slots. */
635          options.spi_shader_col_format = gfx_state->ps.epilog.spi_shader_col_format & stage->info.ps.colors_written;
636       }
637 
638       if (!options.no_depth_export) {
639          /* Compared to gfx_state.ps.alpha_to_coverage_via_mrtz,
640        * radv_shader_info.ps.writes_mrt0_alpha additionally requires that a
641        * depth/stencil/sample_mask export exists. ac_nir_lower_ps() requires this
642        * field to reflect whether alpha via MRTZ is really present.
643           */
644          options.alpha_to_coverage_via_mrtz = stage->info.ps.writes_mrt0_alpha;
645       }
646 
647       NIR_PASS_V(stage->nir, ac_nir_lower_ps, &options);
648    }
649 
650    if (radv_shader_should_clear_lds(device, stage->nir)) {
651       const unsigned chunk_size = 16; /* max single store size */
652       const unsigned shared_size = ALIGN(stage->nir->info.shared_size, chunk_size);
653       NIR_PASS(_, stage->nir, nir_clear_shared_memory, shared_size, chunk_size);
654    }
655 
656    NIR_PASS(_, stage->nir, nir_lower_int64);
657 
658    NIR_PASS(_, stage->nir, nir_opt_idiv_const, 8);
659 
660    NIR_PASS(_, stage->nir, nir_lower_idiv,
661             &(nir_lower_idiv_options){
662                .allow_fp16 = gfx_level >= GFX9,
663             });
664 
665    if (radv_use_llvm_for_stage(device, stage->stage))
666       NIR_PASS_V(stage->nir, nir_lower_io_to_scalar, nir_var_mem_global, NULL, NULL);
667 
668    NIR_PASS(_, stage->nir, ac_nir_lower_global_access);
669    NIR_PASS_V(stage->nir, ac_nir_lower_intrinsics_to_args, gfx_level, radv_select_hw_stage(&stage->info, gfx_level),
670               &stage->args.ac);
671    NIR_PASS_V(stage->nir, radv_nir_lower_abi, gfx_level, stage, gfx_state,
672               device->physical_device->rad_info.address32_hi);
673    radv_optimize_nir_algebraic(
674       stage->nir, io_to_mem || lowered_ngg || stage->stage == MESA_SHADER_COMPUTE || stage->stage == MESA_SHADER_TASK);
675 
676    NIR_PASS(_, stage->nir, nir_lower_fp16_casts, nir_lower_fp16_split_fp64);
677 
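   /* The 8/16-bit lowering callback inspects divergence on GFX8+, so wrap the
    * shader in LCSSA and recompute divergence information first.
    */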
678    if (stage->nir->info.bit_sizes_int & (8 | 16)) {
679       if (gfx_level >= GFX8) {
680          NIR_PASS(_, stage->nir, nir_convert_to_lcssa, true, true);
681          nir_divergence_analysis(stage->nir);
682       }
683 
684       if (nir_lower_bit_size(stage->nir, lower_bit_size_callback, device)) {
685          NIR_PASS(_, stage->nir, nir_opt_constant_folding);
686       }
687 
688       if (gfx_level >= GFX8)
689          NIR_PASS(_, stage->nir, nir_opt_remove_phis); /* cleanup LCSSA phis */
690    }
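   /* On GFX9+ fold 16-bit conversions into texture/image instructions so that
    * sources and destinations can stay 16-bit; on GFX10+ derivatives (G16) can use
    * a different bit size than the coordinates and are folded separately.
    */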
691    if (((stage->nir->info.bit_sizes_int | stage->nir->info.bit_sizes_float) & 16) && gfx_level >= GFX9) {
692       bool separate_g16 = gfx_level >= GFX10;
693       struct nir_fold_tex_srcs_options fold_srcs_options[] = {
694          {
695             .sampler_dims = ~(BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE) | BITFIELD_BIT(GLSL_SAMPLER_DIM_BUF)),
696             .src_types = (1 << nir_tex_src_coord) | (1 << nir_tex_src_lod) | (1 << nir_tex_src_bias) |
697                          (1 << nir_tex_src_min_lod) | (1 << nir_tex_src_ms_index) |
698                          (separate_g16 ? 0 : (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy)),
699          },
700          {
701             .sampler_dims = ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE),
702             .src_types = (1 << nir_tex_src_ddx) | (1 << nir_tex_src_ddy),
703          },
704       };
705       struct nir_fold_16bit_tex_image_options fold_16bit_options = {
706          .rounding_mode = nir_rounding_mode_rtz,
707          .fold_tex_dest_types = nir_type_float,
708          .fold_image_dest_types = nir_type_float,
709          .fold_image_store_data = true,
710          .fold_image_srcs = !radv_use_llvm_for_stage(device, stage->stage),
711          .fold_srcs_options_count = separate_g16 ? 2 : 1,
712          .fold_srcs_options = fold_srcs_options,
713       };
714       NIR_PASS(_, stage->nir, nir_fold_16bit_tex_image, &fold_16bit_options);
715 
716       if (!stage->key.optimisations_disabled) {
717          NIR_PASS(_, stage->nir, nir_opt_vectorize, opt_vectorize_callback, device);
718       }
719    }
720 
721    /* cleanup passes */
722    NIR_PASS(_, stage->nir, nir_lower_alu_width, opt_vectorize_callback, device);
723    NIR_PASS(_, stage->nir, nir_lower_load_const_to_scalar);
724    NIR_PASS(_, stage->nir, nir_copy_prop);
725    NIR_PASS(_, stage->nir, nir_opt_dce);
726 
727    if (!stage->key.optimisations_disabled) {
728       sink_opts |= nir_move_comparisons | nir_move_load_ubo | nir_move_load_ssbo;
729       NIR_PASS(_, stage->nir, nir_opt_sink, sink_opts);
730 
731       nir_move_options move_opts =
732          nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | nir_move_comparisons | nir_move_copies;
733       NIR_PASS(_, stage->nir, nir_opt_move, move_opts);
734    }
735 }
736 
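/* LDS is zero-initialized at shader start only for compute, mesh and task
 * shaders that actually use shared memory, and only when the clear_lds drirc
 * option is enabled.
 */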
737 bool
738 radv_shader_should_clear_lds(const struct radv_device *device, const nir_shader *shader)
739 {
740    return (shader->info.stage == MESA_SHADER_COMPUTE || shader->info.stage == MESA_SHADER_MESH ||
741            shader->info.stage == MESA_SHADER_TASK) &&
742           shader->info.shared_size > 0 && device->instance->drirc.clear_lds;
743 }
744 
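/* Count the executables exposed through VK_KHR_pipeline_executable_properties.
 * A non-NGG geometry shader counts twice because its GS copy shader is reported
 * as a separate executable.
 */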
745 static uint32_t
746 radv_get_executable_count(struct radv_pipeline *pipeline)
747 {
748    uint32_t ret = 0;
749 
750    if (pipeline->type == RADV_PIPELINE_RAY_TRACING) {
751       struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
752       for (uint32_t i = 0; i < rt_pipeline->stage_count; i++)
753          ret += rt_pipeline->stages[i].shader ? 1 : 0;
754    }
755 
756    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
757       if (!pipeline->shaders[i])
758          continue;
759 
760       if (i == MESA_SHADER_GEOMETRY && !radv_pipeline_has_ngg(radv_pipeline_to_graphics(pipeline))) {
761          ret += 2u;
762       } else {
763          ret += 1u;
764       }
765    }
766    return ret;
767 }
768 
769 static struct radv_shader *
770 radv_get_shader_from_executable_index(struct radv_pipeline *pipeline, int index, gl_shader_stage *stage)
771 {
772    if (pipeline->type == RADV_PIPELINE_RAY_TRACING) {
773       struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
774       for (uint32_t i = 0; i < rt_pipeline->stage_count; i++) {
775          struct radv_ray_tracing_stage *rt_stage = &rt_pipeline->stages[i];
776          if (!rt_stage->shader)
777             continue;
778 
779          if (!index) {
780             *stage = rt_stage->stage;
781             return rt_stage->shader;
782          }
783 
784          index--;
785       }
786    }
787 
788    for (int i = 0; i < MESA_VULKAN_SHADER_STAGES; ++i) {
789       if (!pipeline->shaders[i])
790          continue;
791       if (!index) {
792          *stage = i;
793          return pipeline->shaders[i];
794       }
795 
796       --index;
797 
798       if (i == MESA_SHADER_GEOMETRY && !radv_pipeline_has_ngg(radv_pipeline_to_graphics(pipeline))) {
799          if (!index) {
800             *stage = i;
801             return pipeline->gs_copy_shader;
802          }
803          --index;
804       }
805    }
806 
807    *stage = -1;
808    return NULL;
809 }
810 
811 /* Basically strlcpy (which does not exist on Linux) specialized for
812  * descriptions. */
813 static void
814 desc_copy(char *desc, const char *src)
815 {
816    int len = strlen(src);
817    assert(len < VK_MAX_DESCRIPTION_SIZE);
818    memcpy(desc, src, len);
819    memset(desc + len, 0, VK_MAX_DESCRIPTION_SIZE - len);
820 }
821 
822 VKAPI_ATTR VkResult VKAPI_CALL
823 radv_GetPipelineExecutablePropertiesKHR(VkDevice _device, const VkPipelineInfoKHR *pPipelineInfo,
824                                         uint32_t *pExecutableCount, VkPipelineExecutablePropertiesKHR *pProperties)
825 {
826    RADV_FROM_HANDLE(radv_pipeline, pipeline, pPipelineInfo->pipeline);
827    const uint32_t total_count = radv_get_executable_count(pipeline);
828 
829    if (!pProperties) {
830       *pExecutableCount = total_count;
831       return VK_SUCCESS;
832    }
833 
834    const uint32_t count = MIN2(total_count, *pExecutableCount);
835    for (uint32_t executable_idx = 0; executable_idx < count; executable_idx++) {
836       gl_shader_stage stage;
837       struct radv_shader *shader = radv_get_shader_from_executable_index(pipeline, executable_idx, &stage);
838 
839       pProperties[executable_idx].stages = mesa_to_vk_shader_stage(stage);
840 
841       const char *name = _mesa_shader_stage_to_string(stage);
842       const char *description = NULL;
843       switch (stage) {
844       case MESA_SHADER_VERTEX:
845          description = "Vulkan Vertex Shader";
846          break;
847       case MESA_SHADER_TESS_CTRL:
848          if (!pipeline->shaders[MESA_SHADER_VERTEX]) {
849             pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT;
850             name = "vertex + tessellation control";
851             description = "Combined Vulkan Vertex and Tessellation Control Shaders";
852          } else {
853             description = "Vulkan Tessellation Control Shader";
854          }
855          break;
856       case MESA_SHADER_TESS_EVAL:
857          description = "Vulkan Tessellation Evaluation Shader";
858          break;
859       case MESA_SHADER_GEOMETRY:
860          if (shader->info.type == RADV_SHADER_TYPE_GS_COPY) {
861             name = "geometry copy";
862             description = "Extra shader stage that loads the GS output ringbuffer into the rasterizer";
863             break;
864          }
865 
866          if (pipeline->shaders[MESA_SHADER_TESS_CTRL] && !pipeline->shaders[MESA_SHADER_TESS_EVAL]) {
867             pProperties[executable_idx].stages |= VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT;
868             name = "tessellation evaluation + geometry";
869             description = "Combined Vulkan Tessellation Evaluation and Geometry Shaders";
870          } else if (!pipeline->shaders[MESA_SHADER_TESS_CTRL] && !pipeline->shaders[MESA_SHADER_VERTEX]) {
871             pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT;
872             name = "vertex + geometry";
873             description = "Combined Vulkan Vertex and Geometry Shaders";
874          } else {
875             description = "Vulkan Geometry Shader";
876          }
877          break;
878       case MESA_SHADER_FRAGMENT:
879          description = "Vulkan Fragment Shader";
880          break;
881       case MESA_SHADER_COMPUTE:
882          description = "Vulkan Compute Shader";
883          break;
884       case MESA_SHADER_MESH:
885          description = "Vulkan Mesh Shader";
886          break;
887       case MESA_SHADER_TASK:
888          description = "Vulkan Task Shader";
889          break;
890       case MESA_SHADER_RAYGEN:
891          description = "Vulkan Ray Generation Shader";
892          break;
893       case MESA_SHADER_ANY_HIT:
894          description = "Vulkan Any-Hit Shader";
895          break;
896       case MESA_SHADER_CLOSEST_HIT:
897          description = "Vulkan Closest-Hit Shader";
898          break;
899       case MESA_SHADER_MISS:
900          description = "Vulkan Miss Shader";
901          break;
902       case MESA_SHADER_INTERSECTION:
903          description = "Shader responsible for traversing the acceleration structure";
904          break;
905       case MESA_SHADER_CALLABLE:
906          description = "Vulkan Callable Shader";
907          break;
908       default:
909          unreachable("Unsupported shader stage");
910       }
911 
912       pProperties[executable_idx].subgroupSize = shader->info.wave_size;
913       desc_copy(pProperties[executable_idx].name, name);
914       desc_copy(pProperties[executable_idx].description, description);
915    }
916 
917    VkResult result = *pExecutableCount < total_count ? VK_INCOMPLETE : VK_SUCCESS;
918    *pExecutableCount = count;
919    return result;
920 }
921 
922 VKAPI_ATTR VkResult VKAPI_CALL
923 radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecutableInfoKHR *pExecutableInfo,
924                                         uint32_t *pStatisticCount, VkPipelineExecutableStatisticKHR *pStatistics)
925 {
926    RADV_FROM_HANDLE(radv_device, device, _device);
927    RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
928    gl_shader_stage stage;
929    struct radv_shader *shader =
930       radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);
931 
932    const struct radv_physical_device *pdevice = device->physical_device;
933 
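   /* config.lds_size is stored in hardware allocation granules; convert it to
    * bytes (GFX11+ fragment shaders use a 1024-byte granule).
    */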
934    unsigned lds_increment = pdevice->rad_info.gfx_level >= GFX11 && stage == MESA_SHADER_FRAGMENT
935                                ? 1024
936                                : pdevice->rad_info.lds_encode_granularity;
937 
938    VkPipelineExecutableStatisticKHR *s = pStatistics;
939    VkPipelineExecutableStatisticKHR *end = s + (pStatistics ? *pStatisticCount : 0);
940    VkResult result = VK_SUCCESS;
941 
942    if (s < end) {
943       desc_copy(s->name, "Driver pipeline hash");
944       desc_copy(s->description, "Driver pipeline hash used by RGP");
945       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
946       s->value.u64 = pipeline->pipeline_hash;
947    }
948    ++s;
949 
950    if (s < end) {
951       desc_copy(s->name, "SGPRs");
952       desc_copy(s->description, "Number of SGPR registers allocated per subgroup");
953       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
954       s->value.u64 = shader->config.num_sgprs;
955    }
956    ++s;
957 
958    if (s < end) {
959       desc_copy(s->name, "VGPRs");
960       desc_copy(s->description, "Number of VGPR registers allocated per subgroup");
961       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
962       s->value.u64 = shader->config.num_vgprs;
963    }
964    ++s;
965 
966    if (s < end) {
967       desc_copy(s->name, "Spilled SGPRs");
968       desc_copy(s->description, "Number of SGPR registers spilled per subgroup");
969       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
970       s->value.u64 = shader->config.spilled_sgprs;
971    }
972    ++s;
973 
974    if (s < end) {
975       desc_copy(s->name, "Spilled VGPRs");
976       desc_copy(s->description, "Number of VGPR registers spilled per subgroup");
977       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
978       s->value.u64 = shader->config.spilled_vgprs;
979    }
980    ++s;
981 
982    if (s < end) {
983       desc_copy(s->name, "Code size");
984       desc_copy(s->description, "Code size in bytes");
985       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
986       s->value.u64 = shader->exec_size;
987    }
988    ++s;
989 
990    if (s < end) {
991       desc_copy(s->name, "LDS size");
992       desc_copy(s->description, "LDS size in bytes per workgroup");
993       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
994       s->value.u64 = shader->config.lds_size * lds_increment;
995    }
996    ++s;
997 
998    if (s < end) {
999       desc_copy(s->name, "Scratch size");
1000       desc_copy(s->description, "Private memory in bytes per subgroup");
1001       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
1002       s->value.u64 = shader->config.scratch_bytes_per_wave;
1003    }
1004    ++s;
1005 
1006    if (s < end) {
1007       desc_copy(s->name, "Subgroups per SIMD");
1008       desc_copy(s->description, "The maximum number of subgroups in flight on a SIMD unit");
1009       s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
1010       s->value.u64 = shader->max_waves;
1011    }
1012    ++s;
1013 
1014    if (shader->statistics) {
1015       for (unsigned i = 0; i < aco_num_statistics; i++) {
1016          const struct aco_compiler_statistic_info *info = &aco_statistic_infos[i];
1017          if (s < end) {
1018             desc_copy(s->name, info->name);
1019             desc_copy(s->description, info->desc);
1020             s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
1021             s->value.u64 = shader->statistics[i];
1022          }
1023          ++s;
1024       }
1025    }
1026 
1027    if (!pStatistics)
1028       *pStatisticCount = s - pStatistics;
1029    else if (s > end) {
1030       *pStatisticCount = end - pStatistics;
1031       result = VK_INCOMPLETE;
1032    } else {
1033       *pStatisticCount = s - pStatistics;
1034    }
1035 
1036    return result;
1037 }
1038 
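/* Copy an IR string into the caller-provided buffer, NUL-terminating and
 * returning VK_INCOMPLETE if it had to be truncated; with a NULL buffer only
 * the required size is reported.
 */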
1039 static VkResult
1040 radv_copy_representation(void *data, size_t *data_size, const char *src)
1041 {
1042    size_t total_size = strlen(src) + 1;
1043 
1044    if (!data) {
1045       *data_size = total_size;
1046       return VK_SUCCESS;
1047    }
1048 
1049    size_t size = MIN2(total_size, *data_size);
1050 
1051    memcpy(data, src, size);
1052    if (size)
1053       *((char *)data + size - 1) = 0;
1054    return size < total_size ? VK_INCOMPLETE : VK_SUCCESS;
1055 }
1056 
1057 VKAPI_ATTR VkResult VKAPI_CALL
1058 radv_GetPipelineExecutableInternalRepresentationsKHR(
1059    VkDevice _device, const VkPipelineExecutableInfoKHR *pExecutableInfo, uint32_t *pInternalRepresentationCount,
1060    VkPipelineExecutableInternalRepresentationKHR *pInternalRepresentations)
1061 {
1062    RADV_FROM_HANDLE(radv_device, device, _device);
1063    RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
1064    gl_shader_stage stage;
1065    struct radv_shader *shader =
1066       radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);
1067 
1068    VkPipelineExecutableInternalRepresentationKHR *p = pInternalRepresentations;
1069    VkPipelineExecutableInternalRepresentationKHR *end =
1070       p + (pInternalRepresentations ? *pInternalRepresentationCount : 0);
1071    VkResult result = VK_SUCCESS;
1072    /* optimized NIR */
1073    if (p < end) {
1074       p->isText = true;
1075       desc_copy(p->name, "NIR Shader(s)");
1076       desc_copy(p->description, "The optimized NIR shader(s)");
1077       if (radv_copy_representation(p->pData, &p->dataSize, shader->nir_string) != VK_SUCCESS)
1078          result = VK_INCOMPLETE;
1079    }
1080    ++p;
1081 
1082    /* backend IR */
1083    if (p < end) {
1084       p->isText = true;
1085       if (radv_use_llvm_for_stage(device, stage)) {
1086          desc_copy(p->name, "LLVM IR");
1087          desc_copy(p->description, "The LLVM IR after some optimizations");
1088       } else {
1089          desc_copy(p->name, "ACO IR");
1090          desc_copy(p->description, "The ACO IR after some optimizations");
1091       }
1092       if (radv_copy_representation(p->pData, &p->dataSize, shader->ir_string) != VK_SUCCESS)
1093          result = VK_INCOMPLETE;
1094    }
1095    ++p;
1096 
1097    /* Disassembler */
1098    if (p < end && shader->disasm_string) {
1099       p->isText = true;
1100       desc_copy(p->name, "Assembly");
1101       desc_copy(p->description, "Final Assembly");
1102       if (radv_copy_representation(p->pData, &p->dataSize, shader->disasm_string) != VK_SUCCESS)
1103          result = VK_INCOMPLETE;
1104    }
1105    ++p;
1106 
1107    if (!pInternalRepresentations)
1108       *pInternalRepresentationCount = p - pInternalRepresentations;
1109    else if (p > end) {
1110       result = VK_INCOMPLETE;
1111       *pInternalRepresentationCount = end - pInternalRepresentations;
1112    } else {
1113       *pInternalRepresentationCount = p - pInternalRepresentations;
1114    }
1115 
1116    return result;
1117 }
1118 
1119 static void
1120 vk_shader_module_finish(void *_module)
1121 {
1122    struct vk_shader_module *module = _module;
1123    vk_object_base_finish(&module->base);
1124 }
1125 
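/* Deep-copy the pStages array into mem_ctx (shader modules, inline SPIR-V,
 * specialization info and entry-point names) so the copied stages can outlive
 * the create call, for instance when stages are kept around for later library
 * linking or deferred compilation.
 */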
1126 VkPipelineShaderStageCreateInfo *
1127 radv_copy_shader_stage_create_info(struct radv_device *device, uint32_t stageCount,
1128                                    const VkPipelineShaderStageCreateInfo *pStages, void *mem_ctx)
1129 {
1130    VkPipelineShaderStageCreateInfo *new_stages;
1131 
1132    size_t size = sizeof(VkPipelineShaderStageCreateInfo) * stageCount;
1133    new_stages = ralloc_size(mem_ctx, size);
1134    if (!new_stages)
1135       return NULL;
1136 
1137    if (size)
1138       memcpy(new_stages, pStages, size);
1139 
1140    for (uint32_t i = 0; i < stageCount; i++) {
1141       RADV_FROM_HANDLE(vk_shader_module, module, new_stages[i].module);
1142 
1143       const VkShaderModuleCreateInfo *minfo = vk_find_struct_const(pStages[i].pNext, SHADER_MODULE_CREATE_INFO);
1144 
1145       if (module) {
1146          struct vk_shader_module *new_module = ralloc_size(mem_ctx, sizeof(struct vk_shader_module) + module->size);
1147          if (!new_module)
1148             return NULL;
1149 
1150          ralloc_set_destructor(new_module, vk_shader_module_finish);
1151          vk_object_base_init(&device->vk, &new_module->base, VK_OBJECT_TYPE_SHADER_MODULE);
1152 
1153          new_module->nir = NULL;
1154          memcpy(new_module->hash, module->hash, sizeof(module->hash));
1155          new_module->size = module->size;
1156          memcpy(new_module->data, module->data, module->size);
1157 
1158          module = new_module;
1159       } else if (minfo) {
1160          module = ralloc_size(mem_ctx, sizeof(struct vk_shader_module) + minfo->codeSize);
1161          if (!module)
1162             return NULL;
1163 
1164          vk_shader_module_init(&device->vk, module, minfo);
1165       }
1166 
1167       if (module) {
1168          const VkSpecializationInfo *spec = new_stages[i].pSpecializationInfo;
1169          if (spec) {
1170             VkSpecializationInfo *new_spec = ralloc(mem_ctx, VkSpecializationInfo);
1171             if (!new_spec)
1172                return NULL;
1173 
1174             new_spec->mapEntryCount = spec->mapEntryCount;
1175             uint32_t map_entries_size = sizeof(VkSpecializationMapEntry) * spec->mapEntryCount;
1176             new_spec->pMapEntries = ralloc_size(mem_ctx, map_entries_size);
1177             if (!new_spec->pMapEntries)
1178                return NULL;
1179             memcpy((void *)new_spec->pMapEntries, spec->pMapEntries, map_entries_size);
1180 
1181             new_spec->dataSize = spec->dataSize;
1182             new_spec->pData = ralloc_size(mem_ctx, spec->dataSize);
1183             if (!new_spec->pData)
1184                return NULL;
1185             memcpy((void *)new_spec->pData, spec->pData, spec->dataSize);
1186 
1187             new_stages[i].pSpecializationInfo = new_spec;
1188          }
1189 
1190          new_stages[i].module = vk_shader_module_to_handle(module);
1191          new_stages[i].pName = ralloc_strdup(mem_ctx, new_stages[i].pName);
1192          if (!new_stages[i].pName)
1193             return NULL;
1194          new_stages[i].pNext = NULL;
1195       }
1196    }
1197 
1198    return new_stages;
1199 }
1200