1 /*
2  * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
3  * SPDX-License-Identifier: MIT
4  */
5 #include "nvk_shader.h"
6 
7 #include "nvk_cmd_buffer.h"
8 #include "nvk_descriptor_set_layout.h"
9 #include "nvk_device.h"
10 #include "nvk_mme.h"
11 #include "nvk_physical_device.h"
12 #include "nvk_sampler.h"
13 #include "nvk_shader.h"
14 
15 #include "vk_nir_convert_ycbcr.h"
16 #include "vk_pipeline.h"
17 #include "vk_pipeline_layout.h"
18 #include "vk_shader_module.h"
19 #include "vk_ycbcr_conversion.h"
20 
21 #include "nak.h"
22 #include "nir.h"
23 #include "nir_builder.h"
24 #include "compiler/spirv/nir_spirv.h"
25 
26 #include "nv50_ir_driver.h"
27 
28 #include "util/mesa-sha1.h"
29 #include "util/u_debug.h"
30 
31 #include "cla097.h"
32 #include "clb097.h"
33 #include "clc597.h"
34 #include "nv_push_cl9097.h"
35 #include "nv_push_clb197.h"
36 #include "nv_push_clc397.h"
37 #include "nv_push_clc797.h"
38 
/* Size/alignment callback used when laying out shared-memory variables.
 *
 * Only vector or scalar types are accepted (asserted).  Booleans take
 * 4 bytes per component; every other type takes its bit size divided by 8.
 * The reported size is the component size times the number of vector
 * elements, and the reported alignment is a single component.
 */
static void
shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
   assert(glsl_type_is_vector_or_scalar(type));

   uint32_t bytes_per_comp;
   if (glsl_type_is_boolean(type))
      bytes_per_comp = 4;
   else
      bytes_per_comp = glsl_get_bit_size(type) / 8;

   const unsigned num_comps = glsl_get_vector_elements(type);

   *size = bytes_per_comp * num_comps;
   *align = bytes_per_comp;
}
48 
49 VkShaderStageFlags
nvk_nak_stages(const struct nv_device_info * info)50 nvk_nak_stages(const struct nv_device_info *info)
51 {
52    const VkShaderStageFlags all =
53       VK_SHADER_STAGE_VERTEX_BIT |
54       VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT |
55       VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT |
56       VK_SHADER_STAGE_GEOMETRY_BIT |
57       VK_SHADER_STAGE_FRAGMENT_BIT |
58       VK_SHADER_STAGE_COMPUTE_BIT;
59 
60    const struct debug_control flags[] = {
61       { "vs", BITFIELD64_BIT(MESA_SHADER_VERTEX) },
62       { "tcs", BITFIELD64_BIT(MESA_SHADER_TESS_CTRL) },
63       { "tes", BITFIELD64_BIT(MESA_SHADER_TESS_EVAL) },
64       { "gs", BITFIELD64_BIT(MESA_SHADER_GEOMETRY) },
65       { "fs", BITFIELD64_BIT(MESA_SHADER_FRAGMENT) },
66       { "cs", BITFIELD64_BIT(MESA_SHADER_COMPUTE) },
67       { "all", all },
68       { NULL, 0 },
69    };
70 
71    const char *env_str = getenv("NVK_USE_NAK");
72    if (env_str == NULL)
73       return info->cls_eng3d >= MAXWELL_A ? all : 0;
74    else
75       return parse_debug_string(env_str, flags);
76 }
77 
78 static bool
use_nak(const struct nvk_physical_device * pdev,gl_shader_stage stage)79 use_nak(const struct nvk_physical_device *pdev, gl_shader_stage stage)
80 {
81    return nvk_nak_stages(&pdev->info) & mesa_to_vk_shader_stage(stage);
82 }
83 
84 uint64_t
nvk_physical_device_compiler_flags(const struct nvk_physical_device * pdev)85 nvk_physical_device_compiler_flags(const struct nvk_physical_device *pdev)
86 {
87    bool no_cbufs = pdev->debug_flags & NVK_DEBUG_NO_CBUF;
88    bool use_edb_buffer_views = nvk_use_edb_buffer_views(pdev);
89    uint64_t prog_debug = nvk_cg_get_prog_debug();
90    uint64_t prog_optimize = nvk_cg_get_prog_optimize();
91    uint64_t nak_stages = nvk_nak_stages(&pdev->info);
92    uint64_t nak_flags = nak_debug_flags(pdev->nak);
93 
94    assert(prog_debug <= UINT8_MAX);
95    assert(prog_optimize < 16);
96    assert(nak_stages <= UINT32_MAX);
97    assert(nak_flags <= UINT16_MAX);
98 
99    return prog_debug
100       | (prog_optimize << 8)
101       | ((uint64_t)no_cbufs << 12)
102       | ((uint64_t)use_edb_buffer_views << 13)
103       | (nak_stages << 16)
104       | (nak_flags << 48);
105 }
106 
107 static const nir_shader_compiler_options *
nvk_get_nir_options(struct vk_physical_device * vk_pdev,gl_shader_stage stage,UNUSED const struct vk_pipeline_robustness_state * rs)108 nvk_get_nir_options(struct vk_physical_device *vk_pdev,
109                     gl_shader_stage stage,
110                     UNUSED const struct vk_pipeline_robustness_state *rs)
111 {
112    const struct nvk_physical_device *pdev =
113       container_of(vk_pdev, struct nvk_physical_device, vk);
114 
115    if (use_nak(pdev, stage))
116       return nak_nir_options(pdev->nak);
117    else
118       return nvk_cg_nir_options(pdev, stage);
119 }
120 
121 nir_address_format
nvk_ubo_addr_format(const struct nvk_physical_device * pdev,const struct vk_pipeline_robustness_state * rs)122 nvk_ubo_addr_format(const struct nvk_physical_device *pdev,
123                     const struct vk_pipeline_robustness_state *rs)
124 {
125    if (nvk_use_bindless_cbuf(&pdev->info)) {
126       return nir_address_format_vec2_index_32bit_offset;
127    } else if (rs->null_uniform_buffer_descriptor) {
128       /* We need bounds checking for null descriptors */
129       return nir_address_format_64bit_bounded_global;
130    } else {
131       switch (rs->uniform_buffers) {
132       case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
133          return nir_address_format_64bit_global_32bit_offset;
134       case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
135       case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
136          return nir_address_format_64bit_bounded_global;
137       default:
138          unreachable("Invalid robust buffer access behavior");
139       }
140    }
141 }
142 
143 nir_address_format
nvk_ssbo_addr_format(const struct nvk_physical_device * pdev,const struct vk_pipeline_robustness_state * rs)144 nvk_ssbo_addr_format(const struct nvk_physical_device *pdev,
145                     const struct vk_pipeline_robustness_state *rs)
146 {
147    if (rs->null_storage_buffer_descriptor) {
148       /* We need bounds checking for null descriptors */
149       return nir_address_format_64bit_bounded_global;
150    } else {
151       switch (rs->storage_buffers) {
152       case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
153          return nir_address_format_64bit_global_32bit_offset;
154       case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
155       case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
156          return nir_address_format_64bit_bounded_global;
157       default:
158          unreachable("Invalid robust buffer access behavior");
159       }
160    }
161 }
162 
163 static struct spirv_to_nir_options
nvk_get_spirv_options(struct vk_physical_device * vk_pdev,UNUSED gl_shader_stage stage,const struct vk_pipeline_robustness_state * rs)164 nvk_get_spirv_options(struct vk_physical_device *vk_pdev,
165                       UNUSED gl_shader_stage stage,
166                       const struct vk_pipeline_robustness_state *rs)
167 {
168    const struct nvk_physical_device *pdev =
169       container_of(vk_pdev, struct nvk_physical_device, vk);
170 
171    return (struct spirv_to_nir_options) {
172       .ssbo_addr_format = nvk_ssbo_addr_format(pdev, rs),
173       .phys_ssbo_addr_format = nir_address_format_64bit_global,
174       .ubo_addr_format = nvk_ubo_addr_format(pdev, rs),
175       .shared_addr_format = nir_address_format_32bit_offset,
176       .min_ssbo_alignment = NVK_MIN_SSBO_ALIGNMENT,
177       .min_ubo_alignment = nvk_min_cbuf_alignment(&pdev->info),
178    };
179 }
180 
181 static void
nvk_preprocess_nir(struct vk_physical_device * vk_pdev,nir_shader * nir)182 nvk_preprocess_nir(struct vk_physical_device *vk_pdev, nir_shader *nir)
183 {
184    const struct nvk_physical_device *pdev =
185       container_of(vk_pdev, struct nvk_physical_device, vk);
186 
187    NIR_PASS_V(nir, nir_lower_io_to_temporaries,
188               nir_shader_get_entrypoint(nir), true, false);
189 
190    if (use_nak(pdev, nir->info.stage))
191       nak_preprocess_nir(nir, pdev->nak);
192    else
193       nvk_cg_preprocess_nir(nir);
194 }
195 
196 static void
nvk_populate_fs_key(struct nak_fs_key * key,const struct vk_graphics_pipeline_state * state)197 nvk_populate_fs_key(struct nak_fs_key *key,
198                     const struct vk_graphics_pipeline_state *state)
199 {
200    memset(key, 0, sizeof(*key));
201 
202    key->sample_info_cb = 0;
203    key->sample_locations_offset = nvk_root_descriptor_offset(draw.sample_locations);
204    key->sample_masks_offset = nvk_root_descriptor_offset(draw.sample_masks);
205 
206    /* Turn underestimate on when no state is availaible or if explicitly set */
207    if (state == NULL || state->rs == NULL ||
208        state->rs->conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT)
209       key->uses_underestimate = true;
210 
211    if (state == NULL)
212       return;
213 
214    if (state->pipeline_flags &
215        VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT)
216       key->zs_self_dep = true;
217 
218    /* We force per-sample interpolation whenever sampleShadingEnable is set
219     * regardless of minSampleShading or rasterizationSamples.
220     *
221     * When sampleShadingEnable is set, few guarantees are made about the
222     * location of interpolation of the inputs.  The only real guarantees are
223     * that the inputs are interpolated within the pixel and that you get at
224     * least `rasterizationSamples * minSampleShading` unique positions.
225     * Importantly, it does not require that when `rasterizationSamples *
226     * minSampleShading <= 1.0` that those positions are at the fragment
227     * center.  Therefore, it's valid to just always do per-sample (which maps
228     * to CENTROID on NVIDIA hardware) all the time and let the hardware sort
229     * it out based on what we set in HYBRID_ANTI_ALIAS_CONTROL::passes.
230     *
231     * Also, we set HYBRID_ANTI_ALIAS_CONTROL::centroid at draw time based on
232     * `rasterizationSamples * minSampleShading` so it should be per-pixel
233     * whenever we're running only a single pass.  However, this would still be
234     * correct even if it got interpolated at some other sample.
235     *
236     * The one caveat here is that we have to be careful about gl_SampleMaskIn.
237     * When `nak_fs_key::force_sample_shading = true` we also turn any reads of
238     * gl_SampleMaskIn into `1 << gl_SampleID` because the hardware sample mask
239     * is actually per-fragment, not per-pass.  We handle this by smashing
240     * minSampleShading to 1.0 whenever gl_SampleMaskIn is read.
241     */
242    const struct vk_multisample_state *ms = state->ms;
243    if (ms != NULL && ms->sample_shading_enable)
244       key->force_sample_shading = true;
245 }
246 
247 static void
nvk_hash_graphics_state(struct vk_physical_device * device,const struct vk_graphics_pipeline_state * state,VkShaderStageFlags stages,blake3_hash blake3_out)248 nvk_hash_graphics_state(struct vk_physical_device *device,
249                         const struct vk_graphics_pipeline_state *state,
250                         VkShaderStageFlags stages,
251                         blake3_hash blake3_out)
252 {
253    struct mesa_blake3 blake3_ctx;
254    _mesa_blake3_init(&blake3_ctx);
255    if (stages & VK_SHADER_STAGE_FRAGMENT_BIT) {
256       struct nak_fs_key key;
257       nvk_populate_fs_key(&key, state);
258       _mesa_blake3_update(&blake3_ctx, &key, sizeof(key));
259 
260       const bool is_multiview = state->rp->view_mask != 0;
261       _mesa_blake3_update(&blake3_ctx, &is_multiview, sizeof(is_multiview));
262 
263       /* This doesn't impact the shader compile but it does go in the
264        * nvk_shader and gets [de]serialized along with the binary so we
265        * need to hash it.
266        */
267       if (state->ms && state->ms->sample_shading_enable) {
268          _mesa_blake3_update(&blake3_ctx, &state->ms->min_sample_shading,
269                              sizeof(state->ms->min_sample_shading));
270       }
271    }
272    _mesa_blake3_final(&blake3_ctx, blake3_out);
273 }
274 
275 static bool
lower_load_intrinsic(nir_builder * b,nir_intrinsic_instr * load,UNUSED void * _data)276 lower_load_intrinsic(nir_builder *b, nir_intrinsic_instr *load,
277                      UNUSED void *_data)
278 {
279    switch (load->intrinsic) {
280    case nir_intrinsic_load_ubo: {
281       b->cursor = nir_before_instr(&load->instr);
282 
283       nir_def *index = load->src[0].ssa;
284       nir_def *offset = load->src[1].ssa;
285       const enum gl_access_qualifier access = nir_intrinsic_access(load);
286       const uint32_t align_mul = nir_intrinsic_align_mul(load);
287       const uint32_t align_offset = nir_intrinsic_align_offset(load);
288 
289       nir_def *val;
290       if (load->src[0].ssa->num_components == 1) {
291          val = nir_ldc_nv(b, load->num_components, load->def.bit_size,
292                            index, offset, .access = access,
293                            .align_mul = align_mul,
294                            .align_offset = align_offset);
295       } else if (load->src[0].ssa->num_components == 2) {
296          nir_def *handle = nir_pack_64_2x32(b, load->src[0].ssa);
297          val = nir_ldcx_nv(b, load->num_components, load->def.bit_size,
298                            handle, offset, .access = access,
299                            .align_mul = align_mul,
300                            .align_offset = align_offset);
301       } else {
302          unreachable("Invalid UBO index");
303       }
304       nir_def_rewrite_uses(&load->def, val);
305       return true;
306    }
307 
308    case nir_intrinsic_load_global_constant_offset:
309    case nir_intrinsic_load_global_constant_bounded: {
310       b->cursor = nir_before_instr(&load->instr);
311 
312       nir_def *base_addr = load->src[0].ssa;
313       nir_def *offset = load->src[1].ssa;
314 
315       nir_def *zero = NULL;
316       if (load->intrinsic == nir_intrinsic_load_global_constant_bounded) {
317          nir_def *bound = load->src[2].ssa;
318 
319          unsigned bit_size = load->def.bit_size;
320          assert(bit_size >= 8 && bit_size % 8 == 0);
321          unsigned byte_size = bit_size / 8;
322 
323          zero = nir_imm_zero(b, load->num_components, bit_size);
324 
325          unsigned load_size = byte_size * load->num_components;
326 
327          nir_def *sat_offset =
328             nir_umin(b, offset, nir_imm_int(b, UINT32_MAX - (load_size - 1)));
329          nir_def *in_bounds =
330             nir_ilt(b, nir_iadd_imm(b, sat_offset, load_size - 1), bound);
331 
332          nir_push_if(b, in_bounds);
333       }
334 
335       nir_def *val =
336          nir_build_load_global_constant(b, load->def.num_components,
337                                         load->def.bit_size,
338                                         nir_iadd(b, base_addr, nir_u2u64(b, offset)),
339                                         .align_mul = nir_intrinsic_align_mul(load),
340                                         .align_offset = nir_intrinsic_align_offset(load));
341 
342       if (load->intrinsic == nir_intrinsic_load_global_constant_bounded) {
343          nir_pop_if(b, NULL);
344          val = nir_if_phi(b, val, zero);
345       }
346 
347       nir_def_rewrite_uses(&load->def, val);
348       return true;
349    }
350 
351    default:
352       return false;
353    }
354 }
355 
/* State handed to lookup_ycbcr_conversion() through its opaque pointer. */
struct lower_ycbcr_state {
   /* Number of entries in set_layouts */
   uint32_t set_layout_count;

   /* Descriptor set layouts, indexed by set number */
   struct vk_descriptor_set_layout *const *set_layouts;
};
360 
361 static const struct vk_ycbcr_conversion_state *
lookup_ycbcr_conversion(const void * _state,uint32_t set,uint32_t binding,uint32_t array_index)362 lookup_ycbcr_conversion(const void *_state, uint32_t set,
363                         uint32_t binding, uint32_t array_index)
364 {
365    const struct lower_ycbcr_state *state = _state;
366    assert(set < state->set_layout_count);
367    assert(state->set_layouts[set] != NULL);
368    const struct nvk_descriptor_set_layout *set_layout =
369       vk_to_nvk_descriptor_set_layout(state->set_layouts[set]);
370    assert(binding < set_layout->binding_count);
371 
372    const struct nvk_descriptor_set_binding_layout *bind_layout =
373       &set_layout->binding[binding];
374 
375    if (bind_layout->immutable_samplers == NULL)
376       return NULL;
377 
378    array_index = MIN2(array_index, bind_layout->array_size - 1);
379 
380    const struct nvk_sampler *sampler =
381       bind_layout->immutable_samplers[array_index];
382 
383    return sampler && sampler->vk.ycbcr_conversion ?
384           &sampler->vk.ycbcr_conversion->state : NULL;
385 }
386 
387 static inline bool
nir_has_image_var(nir_shader * nir)388 nir_has_image_var(nir_shader *nir)
389 {
390    nir_foreach_image_variable(_, nir)
391       return true;
392 
393    return false;
394 }
395 
396 static void
nvk_lower_nir(struct nvk_device * dev,nir_shader * nir,VkShaderCreateFlagsEXT shader_flags,const struct vk_pipeline_robustness_state * rs,bool is_multiview,uint32_t set_layout_count,struct vk_descriptor_set_layout * const * set_layouts,struct nvk_cbuf_map * cbuf_map_out)397 nvk_lower_nir(struct nvk_device *dev, nir_shader *nir,
398               VkShaderCreateFlagsEXT shader_flags,
399               const struct vk_pipeline_robustness_state *rs,
400               bool is_multiview,
401               uint32_t set_layout_count,
402               struct vk_descriptor_set_layout * const *set_layouts,
403               struct nvk_cbuf_map *cbuf_map_out)
404 {
405    struct nvk_physical_device *pdev = nvk_device_physical(dev);
406 
407    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
408       NIR_PASS(_, nir, nir_lower_input_attachments,
409                &(nir_input_attachment_options) {
410                   .use_fragcoord_sysval = use_nak(pdev, nir->info.stage),
411                   .use_layer_id_sysval = use_nak(pdev, nir->info.stage) ||
412                                          is_multiview,
413                   .use_view_id_for_layer = is_multiview,
414                });
415    }
416 
417    if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
418       NIR_PASS(_, nir, nir_lower_patch_vertices,
419                nir->info.tess.tcs_vertices_out, NULL);
420    }
421 
422    const struct lower_ycbcr_state ycbcr_state = {
423       .set_layout_count = set_layout_count,
424       .set_layouts = set_layouts,
425    };
426    NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex,
427             lookup_ycbcr_conversion, &ycbcr_state);
428 
429    nir_lower_compute_system_values_options csv_options = {
430       .has_base_workgroup_id = true,
431    };
432    NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options);
433 
434    /* Lower push constants before lower_descriptors */
435    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const,
436             nir_address_format_32bit_offset);
437 
438    /* Lower non-uniform access before lower_descriptors */
439    enum nir_lower_non_uniform_access_type lower_non_uniform_access_types =
440       nir_lower_non_uniform_ubo_access;
441 
442    if (pdev->info.cls_eng3d < TURING_A) {
443       lower_non_uniform_access_types |= nir_lower_non_uniform_texture_access |
444                                         nir_lower_non_uniform_image_access;
445    }
446 
447    /* In practice, most shaders do not have non-uniform-qualified accesses
448     * thus a cheaper and likely to fail check is run first.
449     */
450    if (nir_has_non_uniform_access(nir, lower_non_uniform_access_types)) {
451       struct nir_lower_non_uniform_access_options opts = {
452          .types = lower_non_uniform_access_types,
453          .callback = NULL,
454       };
455       NIR_PASS(_, nir, nir_opt_non_uniform_access);
456       NIR_PASS(_, nir, nir_lower_non_uniform_access, &opts);
457    }
458 
459    /* TODO: Kepler image lowering requires image params to be loaded from the
460     * descriptor set which we don't currently support.
461     */
462    assert(pdev->info.cls_eng3d >= MAXWELL_A || !nir_has_image_var(nir));
463 
464    struct nvk_cbuf_map *cbuf_map = NULL;
465    if (use_nak(pdev, nir->info.stage) &&
466        !(pdev->debug_flags & NVK_DEBUG_NO_CBUF)) {
467       cbuf_map = cbuf_map_out;
468 
469       /* Large constant support assumes cbufs */
470       NIR_PASS(_, nir, nir_opt_large_constants, NULL, 32);
471    } else {
472       /* Codegen sometimes puts stuff in cbuf 1 and adds 1 to our cbuf indices
473        * so we can't really rely on it for lowering to cbufs and instead place
474        * the root descriptors in both cbuf 0 and cbuf 1.
475        */
476       *cbuf_map_out = (struct nvk_cbuf_map) {
477          .cbuf_count = 2,
478          .cbufs = {
479             { .type = NVK_CBUF_TYPE_ROOT_DESC },
480             { .type = NVK_CBUF_TYPE_ROOT_DESC },
481          }
482       };
483    }
484 
485    NIR_PASS(_, nir, nvk_nir_lower_descriptors, pdev, shader_flags, rs,
486             set_layout_count, set_layouts, cbuf_map);
487    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global,
488             nir_address_format_64bit_global);
489    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo,
490             nvk_ssbo_addr_format(pdev, rs));
491    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
492             nvk_ubo_addr_format(pdev, rs));
493    NIR_PASS(_, nir, nir_shader_intrinsics_pass,
494             lower_load_intrinsic, nir_metadata_none, NULL);
495 
496    if (!nir->info.shared_memory_explicit_layout) {
497       NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
498                nir_var_mem_shared, shared_var_info);
499    }
500    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared,
501             nir_address_format_32bit_offset);
502 
503    if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
504       /* QMD::SHARED_MEMORY_SIZE requires an alignment of 256B so it's safe to
505        * align everything up to 16B so we can write whole vec4s.
506        */
507       nir->info.shared_size = align(nir->info.shared_size, 16);
508       NIR_PASS(_, nir, nir_zero_initialize_shared_memory,
509                nir->info.shared_size, 16);
510 
511       /* We need to call lower_compute_system_values again because
512        * nir_zero_initialize_shared_memory generates load_invocation_id which
513        * has to be lowered to load_invocation_index.
514        */
515       NIR_PASS(_, nir, nir_lower_compute_system_values, NULL);
516    }
517 }
518 
519 #ifndef NDEBUG
520 static void
nvk_shader_dump(struct nvk_shader * shader)521 nvk_shader_dump(struct nvk_shader *shader)
522 {
523    unsigned pos;
524 
525    if (shader->info.stage != MESA_SHADER_COMPUTE) {
526       _debug_printf("dumping HDR for %s shader\n",
527                     _mesa_shader_stage_to_string(shader->info.stage));
528       for (pos = 0; pos < ARRAY_SIZE(shader->info.hdr); ++pos)
529          _debug_printf("HDR[%02"PRIxPTR"] = 0x%08x\n",
530                       pos * sizeof(shader->info.hdr[0]), shader->info.hdr[pos]);
531    }
532    _debug_printf("shader binary code (0x%x bytes):", shader->code_size);
533    for (pos = 0; pos < shader->code_size / 4; ++pos) {
534       if ((pos % 8) == 0)
535          _debug_printf("\n");
536       _debug_printf("%08x ", ((const uint32_t *)shader->code_ptr)[pos]);
537    }
538    _debug_printf("\n");
539 }
540 #endif
541 
542 static VkResult
nvk_compile_nir_with_nak(struct nvk_physical_device * pdev,nir_shader * nir,VkShaderCreateFlagsEXT shader_flags,const struct vk_pipeline_robustness_state * rs,const struct nak_fs_key * fs_key,struct nvk_shader * shader)543 nvk_compile_nir_with_nak(struct nvk_physical_device *pdev,
544                          nir_shader *nir,
545                          VkShaderCreateFlagsEXT shader_flags,
546                          const struct vk_pipeline_robustness_state *rs,
547                          const struct nak_fs_key *fs_key,
548                          struct nvk_shader *shader)
549 {
550    const bool dump_asm =
551       shader_flags & VK_SHADER_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_MESA;
552 
553    nir_variable_mode robust2_modes = 0;
554    if (rs->uniform_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT)
555       robust2_modes |= nir_var_mem_ubo;
556    if (rs->storage_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT)
557       robust2_modes |= nir_var_mem_ssbo;
558 
559    shader->nak = nak_compile_shader(nir, dump_asm, pdev->nak, robust2_modes, fs_key);
560    shader->info = shader->nak->info;
561    shader->code_ptr = shader->nak->code;
562    shader->code_size = shader->nak->code_size;
563 
564    return VK_SUCCESS;
565 }
566 
567 static VkResult
nvk_compile_nir(struct nvk_device * dev,nir_shader * nir,VkShaderCreateFlagsEXT shader_flags,const struct vk_pipeline_robustness_state * rs,const struct nak_fs_key * fs_key,struct nvk_shader * shader)568 nvk_compile_nir(struct nvk_device *dev, nir_shader *nir,
569                 VkShaderCreateFlagsEXT shader_flags,
570                 const struct vk_pipeline_robustness_state *rs,
571                 const struct nak_fs_key *fs_key,
572                 struct nvk_shader *shader)
573 {
574    struct nvk_physical_device *pdev = nvk_device_physical(dev);
575    VkResult result;
576 
577    if (use_nak(pdev, nir->info.stage)) {
578       result = nvk_compile_nir_with_nak(pdev, nir, shader_flags, rs,
579                                        fs_key, shader);
580    } else {
581       result = nvk_cg_compile_nir(pdev, nir, fs_key, shader);
582    }
583    if (result != VK_SUCCESS)
584       return result;
585 
586    if (nir->constant_data_size > 0) {
587       uint32_t data_align = nvk_min_cbuf_alignment(&pdev->info);
588       uint32_t data_size = align(nir->constant_data_size, data_align);
589 
590       void *data = malloc(data_size);
591       if (data == NULL)
592          return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
593 
594       memcpy(data, nir->constant_data, nir->constant_data_size);
595 
596       assert(nir->constant_data_size <= data_size);
597       memset(data + nir->constant_data_size, 0,
598              data_size - nir->constant_data_size);
599 
600       shader->data_ptr = data;
601       shader->data_size = data_size;
602    }
603 
604    return VK_SUCCESS;
605 }
606 
607 static VkResult
nvk_shader_upload(struct nvk_device * dev,struct nvk_shader * shader)608 nvk_shader_upload(struct nvk_device *dev, struct nvk_shader *shader)
609 {
610    struct nvk_physical_device *pdev = nvk_device_physical(dev);
611 
612    uint32_t hdr_size = 0;
613    if (shader->info.stage != MESA_SHADER_COMPUTE) {
614       if (pdev->info.cls_eng3d >= TURING_A)
615          hdr_size = TU102_SHADER_HEADER_SIZE;
616       else
617          hdr_size = GF100_SHADER_HEADER_SIZE;
618    }
619 
620    /* Fermi   needs 0x40 alignment
621     * Kepler+ needs the first instruction to be 0x80 aligned, so we waste 0x30 bytes
622     */
623    int alignment = pdev->info.cls_eng3d >= KEPLER_A ? 0x80 : 0x40;
624 
625    uint32_t total_size = 0;
626    if (pdev->info.cls_eng3d >= KEPLER_A &&
627        pdev->info.cls_eng3d < TURING_A &&
628        hdr_size > 0) {
629       /* The instructions are what has to be aligned so we need to start at a
630        * small offset (0x30 B) into the upload area.
631        */
632       total_size = alignment - hdr_size;
633    }
634 
635    const uint32_t hdr_offset = total_size;
636    total_size += hdr_size;
637 
638    const uint32_t code_offset = total_size;
639    assert(code_offset % alignment == 0);
640    total_size += shader->code_size;
641 
642    uint32_t data_offset = 0;
643    if (shader->data_size > 0) {
644       uint32_t cbuf_alignment = nvk_min_cbuf_alignment(&pdev->info);
645       alignment = MAX2(alignment, cbuf_alignment);
646       total_size = align(total_size, cbuf_alignment);
647       data_offset = total_size;
648       total_size += shader->data_size;
649    }
650 
651    char *data = malloc(total_size);
652    if (data == NULL)
653       return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
654 
655    assert(hdr_size <= sizeof(shader->info.hdr));
656    memcpy(data + hdr_offset, shader->info.hdr, hdr_size);
657    memcpy(data + code_offset, shader->code_ptr, shader->code_size);
658    if (shader->data_size > 0)
659       memcpy(data + data_offset, shader->data_ptr, shader->data_size);
660 
661 #ifndef NDEBUG
662    if (debug_get_bool_option("NV50_PROG_DEBUG", false))
663       nvk_shader_dump(shader);
664 #endif
665 
666    VkResult result = nvk_heap_upload(dev, &dev->shader_heap, data,
667                                      total_size, alignment,
668                                      &shader->upload_addr);
669    if (result == VK_SUCCESS) {
670       shader->upload_size = total_size;
671 
672       shader->hdr_addr = shader->upload_addr + hdr_offset;
673       if (pdev->info.cls_eng3d < VOLTA_A) {
674          const uint64_t heap_base_addr =
675             nvk_heap_contiguous_base_address(&dev->shader_heap);
676          assert(shader->upload_addr - heap_base_addr < UINT32_MAX);
677          shader->hdr_addr -= heap_base_addr;
678       }
679       shader->data_addr = shader->upload_addr + data_offset;
680    }
681    free(data);
682 
683    return result;
684 }
685 
686 uint32_t
mesa_to_nv9097_shader_type(gl_shader_stage stage)687 mesa_to_nv9097_shader_type(gl_shader_stage stage)
688 {
689    static const uint32_t mesa_to_nv9097[] = {
690       [MESA_SHADER_VERTEX]    = NV9097_SET_PIPELINE_SHADER_TYPE_VERTEX,
691       [MESA_SHADER_TESS_CTRL] = NV9097_SET_PIPELINE_SHADER_TYPE_TESSELLATION_INIT,
692       [MESA_SHADER_TESS_EVAL] = NV9097_SET_PIPELINE_SHADER_TYPE_TESSELLATION,
693       [MESA_SHADER_GEOMETRY]  = NV9097_SET_PIPELINE_SHADER_TYPE_GEOMETRY,
694       [MESA_SHADER_FRAGMENT]  = NV9097_SET_PIPELINE_SHADER_TYPE_PIXEL,
695    };
696    assert(stage < ARRAY_SIZE(mesa_to_nv9097));
697    return mesa_to_nv9097[stage];
698 }
699 
700 uint32_t
nvk_pipeline_bind_group(gl_shader_stage stage)701 nvk_pipeline_bind_group(gl_shader_stage stage)
702 {
703    return stage;
704 }
705 
706 uint16_t
nvk_max_shader_push_dw(struct nvk_physical_device * pdev,gl_shader_stage stage,bool last_vtgm)707 nvk_max_shader_push_dw(struct nvk_physical_device *pdev,
708                        gl_shader_stage stage, bool last_vtgm)
709 {
710    if (stage == MESA_SHADER_COMPUTE)
711       return 0;
712 
713    uint16_t max_dw_count = 8;
714 
715    if (stage == MESA_SHADER_TESS_EVAL)
716       max_dw_count += 2;
717 
718    if (stage == MESA_SHADER_FRAGMENT)
719       max_dw_count += 13;
720 
721    if (last_vtgm) {
722       max_dw_count += 8;
723       max_dw_count += 4 * (5 + (128 / 4));
724    }
725 
726    return max_dw_count;
727 }
728 
/*
 * Builds the 3D-engine state methods derived from a compiled graphics shader
 * and stores them on the shader as a CPU-side dword array.
 *
 * The methods are first recorded into a 200-dword stack buffer and then
 * copied into a heap allocation (shader->push_dw) sized to exactly what was
 * emitted.  Two lengths are recorded along the way:
 *
 *  - shader->push_dw_count: dwords emitted before the XFB and clip/cull
 *    state.
 *  - shader->vtgm_push_dw_count: dwords emitted including that state.  It is
 *    only written for stages other than fragment and tessellation control.
 *
 * max_dw_count is bookkeeping that is only read by assert()s: every
 * "max_dw_count += N" is the budget for the methods that follow it, and the
 * running total is checked against nvk_max_shader_push_dw().  Any change to
 * what is emitted here must be mirrored in that function.
 *
 * Callers in this file only invoke this for non-compute stages.
 *
 * Returns VK_SUCCESS, or VK_ERROR_OUT_OF_HOST_MEMORY if the final allocation
 * fails (shader->push_dw is left NULL in that case).
 */
static VkResult
nvk_shader_fill_push(struct nvk_device *dev,
                     struct nvk_shader *shader,
                     const VkAllocationCallbacks* pAllocator)
{
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   /* Only consumed by the assert()s below */
   ASSERTED uint16_t max_dw_count = 0;
   uint32_t push_dw[200];
   struct nv_push push, *p = &push;
   nv_push_init(&push, push_dw, ARRAY_SIZE(push_dw));

   const uint32_t type = mesa_to_nv9097_shader_type(shader->info.stage);

   /* We always map index == type */
   const uint32_t idx = type;

   max_dw_count += 2;
   P_IMMD(p, NV9097, SET_PIPELINE_SHADER(idx), {
      .enable  = ENABLE_TRUE,
      .type    = type,
   });

   /* Program address: Volta and later take a 64-bit address split over two
    * methods, older classes take a single value which therefore has to fit
    * in 32 bits.
    */
   max_dw_count += 3;
   uint64_t addr = shader->hdr_addr;
   if (pdev->info.cls_eng3d >= VOLTA_A) {
      P_MTHD(p, NVC397, SET_PIPELINE_PROGRAM_ADDRESS_A(idx));
      P_NVC397_SET_PIPELINE_PROGRAM_ADDRESS_A(p, idx, addr >> 32);
      P_NVC397_SET_PIPELINE_PROGRAM_ADDRESS_B(p, idx, addr);
   } else {
      assert(addr < 0xffffffff);
      P_IMMD(p, NV9097, SET_PIPELINE_PROGRAM(idx), addr);
   }

   /* Register count and binding group are emitted under one method header */
   max_dw_count += 3;
   P_MTHD(p, NVC397, SET_PIPELINE_REGISTER_COUNT(idx));
   P_NVC397_SET_PIPELINE_REGISTER_COUNT(p, idx, shader->info.num_gprs);
   P_NVC397_SET_PIPELINE_BINDING(p, idx,
      nvk_pipeline_bind_group(shader->info.stage));

   /* Tessellation evaluation shaders also carry the tessellation parameters,
    * passed through an MME macro call.
    */
   if (shader->info.stage == MESA_SHADER_TESS_EVAL) {
      max_dw_count += 2;
      P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_TESS_PARAMS));
      P_INLINE_DATA(p, nvk_mme_tess_params(shader->info.ts.domain,
                                           shader->info.ts.spacing,
                                           shader->info.ts.prims));
   }

   if (shader->info.stage == MESA_SHADER_FRAGMENT) {
      /* Budget for everything in this block, including the methods that are
       * only emitted on newer hardware classes.
       */
      max_dw_count += 13;

      P_MTHD(p, NVC397, SET_SUBTILING_PERF_KNOB_A);
      P_NV9097_SET_SUBTILING_PERF_KNOB_A(p, {
         .fraction_of_spm_register_file_per_subtile         = 0x10,
         .fraction_of_spm_pixel_output_buffer_per_subtile   = 0x40,
         .fraction_of_spm_triangle_ram_per_subtile          = 0x16,
         .fraction_of_max_quads_per_subtile                 = 0x20,
      });
      P_NV9097_SET_SUBTILING_PERF_KNOB_B(p, 0x20);

      P_IMMD(p, NV9097, SET_API_MANDATED_EARLY_Z,
             shader->info.fs.early_fragment_tests);

      /* Post-depth coverage is only programmed on Maxwell B and later; on
       * older classes the shader must not request it.
       */
      if (pdev->info.cls_eng3d >= MAXWELL_B) {
         P_IMMD(p, NVB197, SET_POST_Z_PS_IMASK,
                shader->info.fs.post_depth_coverage);
      } else {
         assert(!shader->info.fs.post_depth_coverage);
      }

      /* Both Z bounds are marked unbounded when the shader writes depth */
      P_IMMD(p, NV9097, SET_ZCULL_BOUNDS, {
         .z_min_unbounded_enable = shader->info.fs.writes_depth,
         .z_max_unbounded_enable = shader->info.fs.writes_depth,
      });

      if (pdev->info.cls_eng3d >= TURING_A) {
         /* From the Vulkan 1.3.297 spec:
          *
          *    "If sample shading is enabled, an implementation must invoke
          *    the fragment shader at least
          *
          *    max( ⌈ minSampleShading × rasterizationSamples ⌉, 1)
          *
          *    times per fragment."
          *
          * The max() here means that, regardless of the actual value of
          * minSampleShading, we need to invoke at least once per pixel,
          * meaning that we need to disable fragment shading rate.  We also
          * need to disable FSR if sample shading is used by the shader.
          */
         P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_SHADING_RATE_CONTROL));
         P_INLINE_DATA(p, nvk_mme_shading_rate_control_sample_shading(
            shader->sample_shading_enable ||
            shader->info.fs.uses_sample_shading));
      }

      /* Minimum sample shading fraction: 1 when the shader itself uses
       * sample shading, otherwise the pipeline's value clamped to [0, 1]
       * when sample shading is enabled, otherwise 0.
       */
      float mss = 0;
      if (shader->info.fs.uses_sample_shading) {
         mss = 1;
      } else if (shader->sample_shading_enable) {
         mss = CLAMP(shader->min_sample_shading, 0, 1);
      } else {
         mss = 0;
      }
      P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_ANTI_ALIAS));
      P_INLINE_DATA(p, nvk_mme_anti_alias_min_sample_shading(mss));
   }

   /* Stash this before we do XFB and clip/cull */
   shader->push_dw_count = nv_push_dw_count(&push);
   assert(max_dw_count ==
          nvk_max_shader_push_dw(pdev, shader->info.stage, false));

   /* Everything below is only emitted for stages other than fragment and
    * tessellation control.
    */
   if (shader->info.stage != MESA_SHADER_FRAGMENT &&
       shader->info.stage != MESA_SHADER_TESS_CTRL) {
      /* Layer select, VPRS table select and the two user clip methods */
      max_dw_count += 8;

      P_IMMD(p, NV9097, SET_RT_LAYER, {
         .v       = 0,
         .control = shader->info.vtg.writes_layer ?
                    CONTROL_GEOMETRY_SHADER_SELECTS_LAYER :
                    CONTROL_V_SELECTS_LAYER,
      });

      if (pdev->info.cls_eng3d >= AMPERE_B) {
         P_IMMD(p, NVC797, SET_VARIABLE_PIXEL_RATE_SHADING_TABLE_SELECT, {
            .source = shader->info.vtg.writes_vprs_table_index ?
                      SOURCE_FROM_VPRS_TABLE_INDEX :
                      SOURCE_FROM_CONSTANT,
            .source_constant_value = 0,
         });
      }

      /* A user clip plane is enabled when it is used for either clipping or
       * culling; SET_USER_CLIP_OP is then fed the cull bit of each plane.
       */
      const uint8_t clip_enable = shader->info.vtg.clip_enable;
      const uint8_t cull_enable = shader->info.vtg.cull_enable;
      P_IMMD(p, NV9097, SET_USER_CLIP_ENABLE, {
         .plane0 = ((clip_enable | cull_enable) >> 0) & 1,
         .plane1 = ((clip_enable | cull_enable) >> 1) & 1,
         .plane2 = ((clip_enable | cull_enable) >> 2) & 1,
         .plane3 = ((clip_enable | cull_enable) >> 3) & 1,
         .plane4 = ((clip_enable | cull_enable) >> 4) & 1,
         .plane5 = ((clip_enable | cull_enable) >> 5) & 1,
         .plane6 = ((clip_enable | cull_enable) >> 6) & 1,
         .plane7 = ((clip_enable | cull_enable) >> 7) & 1,
      });
      P_IMMD(p, NV9097, SET_USER_CLIP_OP, {
         .plane0 = (cull_enable >> 0) & 1,
         .plane1 = (cull_enable >> 1) & 1,
         .plane2 = (cull_enable >> 2) & 1,
         .plane3 = (cull_enable >> 3) & 1,
         .plane4 = (cull_enable >> 4) & 1,
         .plane5 = (cull_enable >> 5) & 1,
         .plane6 = (cull_enable >> 6) & 1,
         .plane7 = (cull_enable >> 7) & 1,
      });

      /* Transform feedback: one control block per entry of xfb->attr_count,
       * followed by the attribute layout when the entry has any attributes.
       */
      struct nak_xfb_info *xfb = &shader->info.vtg.xfb;
      for (uint8_t b = 0; b < ARRAY_SIZE(xfb->attr_count); b++) {
         const uint8_t attr_count = xfb->attr_count[b];

         /* Per-entry budget; nvk_max_shader_push_dw() assumes four entries.
          * NOTE(review): the 128 / 4 term presumably corresponds to the
          * largest possible layout upload -- confirm against nak_xfb_info.
          */
         max_dw_count += 5 + (128 / 4);

         P_MTHD(p, NV9097, SET_STREAM_OUT_CONTROL_STREAM(b));
         P_NV9097_SET_STREAM_OUT_CONTROL_STREAM(p, b, xfb->stream[b]);
         P_NV9097_SET_STREAM_OUT_CONTROL_COMPONENT_COUNT(p, b, attr_count);
         P_NV9097_SET_STREAM_OUT_CONTROL_STRIDE(p, b, xfb->stride[b]);

         if (attr_count > 0) {
            /* upload packed varying indices in multiples of 4 bytes */
            const uint32_t n = DIV_ROUND_UP(attr_count, 4);
            P_MTHD(p, NV9097, SET_STREAM_OUT_LAYOUT_SELECT(b, 0));
            P_INLINE_ARRAY(p, (const uint32_t*)xfb->attr_index[b], n);
         }
      }

      shader->vtgm_push_dw_count = nv_push_dw_count(&push);
      assert(max_dw_count ==
             nvk_max_shader_push_dw(pdev, shader->info.stage, true));
   }

   /* What was actually emitted must fit the budget, and the budget must fit
    * the stack buffer.
    */
   assert(nv_push_dw_count(&push) <= max_dw_count);
   assert(max_dw_count <= ARRAY_SIZE(push_dw));

   /* Copy the recorded dwords into an allocation owned by the shader; it is
    * released in nvk_shader_destroy().
    */
   uint16_t dw_count = nv_push_dw_count(&push);
   shader->push_dw =
      vk_zalloc2(&dev->vk.alloc, pAllocator, dw_count * sizeof(*push_dw),
                 sizeof(*push_dw), VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (shader->push_dw == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   memcpy(shader->push_dw, push_dw, dw_count * sizeof(*push_dw));

   return VK_SUCCESS;
}
923 
/* Forward declaration: the table is defined after the callbacks it points
 * to, but the shader creation paths need its address before that.
 */
static const struct vk_shader_ops nvk_shader_ops;
925 
926 static void
nvk_shader_destroy(struct vk_device * vk_dev,struct vk_shader * vk_shader,const VkAllocationCallbacks * pAllocator)927 nvk_shader_destroy(struct vk_device *vk_dev,
928                    struct vk_shader *vk_shader,
929                    const VkAllocationCallbacks* pAllocator)
930 {
931    struct nvk_device *dev = container_of(vk_dev, struct nvk_device, vk);
932    struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);
933 
934    vk_free2(&dev->vk.alloc, pAllocator, shader->push_dw);
935 
936    if (shader->upload_size > 0) {
937       nvk_heap_free(dev, &dev->shader_heap,
938                     shader->upload_addr,
939                     shader->upload_size);
940    }
941 
942    if (shader->nak) {
943       nak_shader_bin_destroy(shader->nak);
944    } else {
945       /* This came from codegen or deserialize, just free it */
946       free((void *)shader->code_ptr);
947    }
948 
949    free((void *)shader->data_ptr);
950 
951    vk_shader_free(&dev->vk, pAllocator, &shader->vk);
952 }
953 
954 static VkResult
nvk_compile_shader(struct nvk_device * dev,struct vk_shader_compile_info * info,const struct vk_graphics_pipeline_state * state,const VkAllocationCallbacks * pAllocator,struct vk_shader ** shader_out)955 nvk_compile_shader(struct nvk_device *dev,
956                    struct vk_shader_compile_info *info,
957                    const struct vk_graphics_pipeline_state *state,
958                    const VkAllocationCallbacks* pAllocator,
959                    struct vk_shader **shader_out)
960 {
961    struct nvk_shader *shader;
962    VkResult result;
963 
964    /* We consume the NIR, regardless of success or failure */
965    nir_shader *nir = info->nir;
966 
967    shader = vk_shader_zalloc(&dev->vk, &nvk_shader_ops, info->stage,
968                              pAllocator, sizeof(*shader));
969    if (shader == NULL) {
970       ralloc_free(nir);
971       return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
972    }
973 
974    /* TODO: Multiview with ESO */
975    const bool is_multiview = state && state->rp->view_mask != 0;
976 
977    nvk_lower_nir(dev, nir, info->flags, info->robustness, is_multiview,
978                  info->set_layout_count, info->set_layouts,
979                  &shader->cbuf_map);
980 
981    struct nak_fs_key fs_key_tmp, *fs_key = NULL;
982    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
983       nvk_populate_fs_key(&fs_key_tmp, state);
984       fs_key = &fs_key_tmp;
985    }
986 
987    result = nvk_compile_nir(dev, nir, info->flags, info->robustness,
988                             fs_key, shader);
989    ralloc_free(nir);
990    if (result != VK_SUCCESS) {
991       nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
992       return result;
993    }
994 
995    result = nvk_shader_upload(dev, shader);
996    if (result != VK_SUCCESS) {
997       nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
998       return result;
999    }
1000 
1001    if (info->stage == MESA_SHADER_FRAGMENT) {
1002       if (state != NULL && state->ms != NULL) {
1003          shader->sample_shading_enable = state->ms->sample_shading_enable;
1004          if (state->ms->sample_shading_enable)
1005             shader->min_sample_shading = state->ms->min_sample_shading;
1006       }
1007    }
1008 
1009    if (info->stage != MESA_SHADER_COMPUTE) {
1010       result = nvk_shader_fill_push(dev, shader, pAllocator);
1011       if (result != VK_SUCCESS) {
1012          nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
1013          return result;
1014       }
1015    }
1016 
1017    *shader_out = &shader->vk;
1018 
1019    return VK_SUCCESS;
1020 }
1021 
1022 VkResult
nvk_compile_nir_shader(struct nvk_device * dev,nir_shader * nir,const VkAllocationCallbacks * alloc,struct nvk_shader ** shader_out)1023 nvk_compile_nir_shader(struct nvk_device *dev, nir_shader *nir,
1024                        const VkAllocationCallbacks *alloc,
1025                        struct nvk_shader **shader_out)
1026 {
1027    struct nvk_physical_device *pdev = nvk_device_physical(dev);
1028 
1029    const struct vk_pipeline_robustness_state rs_none = {
1030       .uniform_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
1031       .storage_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
1032       .images = VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS_2_EXT,
1033    };
1034 
1035    assert(nir->info.stage == MESA_SHADER_COMPUTE);
1036    if (nir->options == NULL)
1037       nir->options = nvk_get_nir_options(&pdev->vk, nir->info.stage, &rs_none);
1038 
1039    struct vk_shader_compile_info info = {
1040       .stage = nir->info.stage,
1041       .nir = nir,
1042       .robustness = &rs_none,
1043    };
1044 
1045    struct vk_shader *shader = NULL;
1046    VkResult result = nvk_compile_shader(dev, &info, NULL, alloc, &shader);
1047    if (result != VK_SUCCESS)
1048       return result;
1049 
1050    *shader_out = container_of(shader, struct nvk_shader, vk);
1051 
1052    return VK_SUCCESS;
1053 }
1054 
1055 static VkResult
nvk_compile_shaders(struct vk_device * vk_dev,uint32_t shader_count,struct vk_shader_compile_info * infos,const struct vk_graphics_pipeline_state * state,const VkAllocationCallbacks * pAllocator,struct vk_shader ** shaders_out)1056 nvk_compile_shaders(struct vk_device *vk_dev,
1057                     uint32_t shader_count,
1058                     struct vk_shader_compile_info *infos,
1059                     const struct vk_graphics_pipeline_state *state,
1060                     const VkAllocationCallbacks* pAllocator,
1061                     struct vk_shader **shaders_out)
1062 {
1063    struct nvk_device *dev = container_of(vk_dev, struct nvk_device, vk);
1064 
1065    for (uint32_t i = 0; i < shader_count; i++) {
1066       VkResult result = nvk_compile_shader(dev, &infos[i], state,
1067                                            pAllocator, &shaders_out[i]);
1068       if (result != VK_SUCCESS) {
1069          /* Clean up all the shaders before this point */
1070          for (uint32_t j = 0; j < i; j++)
1071             nvk_shader_destroy(&dev->vk, shaders_out[j], pAllocator);
1072 
1073          /* Clean up all the NIR after this point */
1074          for (uint32_t j = i + 1; j < shader_count; j++)
1075             ralloc_free(infos[j].nir);
1076 
1077          /* Memset the output array */
1078          memset(shaders_out, 0, shader_count * sizeof(*shaders_out));
1079 
1080          return result;
1081       }
1082    }
1083 
1084    return VK_SUCCESS;
1085 }
1086 
1087 static VkResult
nvk_deserialize_shader(struct vk_device * vk_dev,struct blob_reader * blob,uint32_t binary_version,const VkAllocationCallbacks * pAllocator,struct vk_shader ** shader_out)1088 nvk_deserialize_shader(struct vk_device *vk_dev,
1089                        struct blob_reader *blob,
1090                        uint32_t binary_version,
1091                        const VkAllocationCallbacks* pAllocator,
1092                        struct vk_shader **shader_out)
1093 {
1094    struct nvk_device *dev = container_of(vk_dev, struct nvk_device, vk);
1095    struct nvk_shader *shader;
1096    VkResult result;
1097 
1098    struct nak_shader_info info;
1099    blob_copy_bytes(blob, &info, sizeof(info));
1100 
1101    struct nvk_cbuf_map cbuf_map;
1102    blob_copy_bytes(blob, &cbuf_map, sizeof(cbuf_map));
1103 
1104    bool sample_shading_enable;
1105    blob_copy_bytes(blob, &sample_shading_enable, sizeof(sample_shading_enable));
1106 
1107    float min_sample_shading;
1108    blob_copy_bytes(blob, &min_sample_shading, sizeof(min_sample_shading));
1109 
1110    const uint32_t code_size = blob_read_uint32(blob);
1111    const uint32_t data_size = blob_read_uint32(blob);
1112    if (blob->overrun)
1113       return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
1114 
1115    shader = vk_shader_zalloc(&dev->vk, &nvk_shader_ops, info.stage,
1116                              pAllocator, sizeof(*shader));
1117    if (shader == NULL)
1118       return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
1119 
1120    shader->info = info;
1121    shader->cbuf_map = cbuf_map;
1122    shader->sample_shading_enable = sample_shading_enable;
1123    shader->min_sample_shading = min_sample_shading;
1124    shader->code_size = code_size;
1125    shader->data_size = data_size;
1126 
1127    shader->code_ptr = malloc(code_size);
1128    if (shader->code_ptr == NULL) {
1129       nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
1130       return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
1131    }
1132 
1133    shader->data_ptr = malloc(data_size);
1134    if (shader->data_ptr == NULL) {
1135       nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
1136       return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
1137    }
1138 
1139    blob_copy_bytes(blob, (void *)shader->code_ptr, shader->code_size);
1140    blob_copy_bytes(blob, (void *)shader->data_ptr, shader->data_size);
1141    if (blob->overrun) {
1142       nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
1143       return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
1144    }
1145 
1146    result = nvk_shader_upload(dev, shader);
1147    if (result != VK_SUCCESS) {
1148       nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
1149       return result;
1150    }
1151 
1152    if (info.stage != MESA_SHADER_COMPUTE) {
1153       result = nvk_shader_fill_push(dev, shader, pAllocator);
1154       if (result != VK_SUCCESS) {
1155          nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
1156          return result;
1157       }
1158    }
1159 
1160    *shader_out = &shader->vk;
1161 
1162    return VK_SUCCESS;
1163 }
1164 
1165 static bool
nvk_shader_serialize(struct vk_device * vk_dev,const struct vk_shader * vk_shader,struct blob * blob)1166 nvk_shader_serialize(struct vk_device *vk_dev,
1167                      const struct vk_shader *vk_shader,
1168                      struct blob *blob)
1169 {
1170    struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);
1171 
1172    /* We can't currently cache assmbly */
1173    if (shader->nak != NULL && shader->nak->asm_str != NULL)
1174       return false;
1175 
1176    blob_write_bytes(blob, &shader->info, sizeof(shader->info));
1177    blob_write_bytes(blob, &shader->cbuf_map, sizeof(shader->cbuf_map));
1178    blob_write_bytes(blob, &shader->sample_shading_enable,
1179                     sizeof(shader->sample_shading_enable));
1180    blob_write_bytes(blob, &shader->min_sample_shading,
1181                     sizeof(shader->min_sample_shading));
1182 
1183    blob_write_uint32(blob, shader->code_size);
1184    blob_write_uint32(blob, shader->data_size);
1185    blob_write_bytes(blob, shader->code_ptr, shader->code_size);
1186    blob_write_bytes(blob, shader->data_ptr, shader->data_size);
1187 
1188    return !blob->out_of_memory;
1189 }
1190 
/*
 * Formats a string into `field`, which must be a char ARRAY (sizeof(field)
 * is used as the buffer size), after zero-filling the whole array.
 *
 * In assert-enabled builds this checks that the result is non-empty and was
 * not truncated.
 *
 * The internal temporary has a deliberately unusual name: a local called `i`
 * would already be in scope while the format arguments are evaluated and
 * would silently shadow a caller's own `i`.
 */
#define WRITE_STR(field, ...) do {                                      \
   memset(field, 0, sizeof(field));                                     \
   UNUSED int _write_str_len =                                          \
      snprintf(field, sizeof(field), __VA_ARGS__);                      \
   assert(_write_str_len > 0 &&                                         \
          (size_t)_write_str_len < sizeof(field));                      \
} while (0)
1196 
1197 static VkResult
nvk_shader_get_executable_properties(UNUSED struct vk_device * device,const struct vk_shader * vk_shader,uint32_t * executable_count,VkPipelineExecutablePropertiesKHR * properties)1198 nvk_shader_get_executable_properties(
1199    UNUSED struct vk_device *device,
1200    const struct vk_shader *vk_shader,
1201    uint32_t *executable_count,
1202    VkPipelineExecutablePropertiesKHR *properties)
1203 {
1204    struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);
1205    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
1206                           properties, executable_count);
1207 
1208    vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
1209       props->stages = mesa_to_vk_shader_stage(shader->info.stage);
1210       props->subgroupSize = 32;
1211       WRITE_STR(props->name, "%s",
1212                 _mesa_shader_stage_to_string(shader->info.stage));
1213       WRITE_STR(props->description, "%s shader",
1214                 _mesa_shader_stage_to_string(shader->info.stage));
1215    }
1216 
1217    return vk_outarray_status(&out);
1218 }
1219 
1220 static VkResult
nvk_shader_get_executable_statistics(UNUSED struct vk_device * device,const struct vk_shader * vk_shader,uint32_t executable_index,uint32_t * statistic_count,VkPipelineExecutableStatisticKHR * statistics)1221 nvk_shader_get_executable_statistics(
1222    UNUSED struct vk_device *device,
1223    const struct vk_shader *vk_shader,
1224    uint32_t executable_index,
1225    uint32_t *statistic_count,
1226    VkPipelineExecutableStatisticKHR *statistics)
1227 {
1228    struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);
1229    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
1230                           statistics, statistic_count);
1231 
1232    assert(executable_index == 0);
1233 
1234    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
1235       WRITE_STR(stat->name, "Instruction count");
1236       WRITE_STR(stat->description, "Number of instructions used by this shader");
1237       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
1238       stat->value.u64 = shader->info.num_instrs;
1239    }
1240 
1241    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
1242       WRITE_STR(stat->name, "Code Size");
1243       WRITE_STR(stat->description,
1244                 "Size of the compiled shader binary, in bytes");
1245       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
1246       stat->value.u64 = shader->code_size;
1247    }
1248 
1249    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
1250       WRITE_STR(stat->name, "Number of GPRs");
1251       WRITE_STR(stat->description, "Number of GPRs used by this pipeline");
1252       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
1253       stat->value.u64 = shader->info.num_gprs;
1254    }
1255 
1256    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
1257       WRITE_STR(stat->name, "SLM Size");
1258       WRITE_STR(stat->description,
1259                 "Size of shader local (scratch) memory, in bytes");
1260       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
1261       stat->value.u64 = shader->info.slm_size;
1262    }
1263 
1264    return vk_outarray_status(&out);
1265 }
1266 
1267 static bool
write_ir_text(VkPipelineExecutableInternalRepresentationKHR * ir,const char * data)1268 write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
1269               const char *data)
1270 {
1271    ir->isText = VK_TRUE;
1272 
1273    size_t data_len = strlen(data) + 1;
1274 
1275    if (ir->pData == NULL) {
1276       ir->dataSize = data_len;
1277       return true;
1278    }
1279 
1280    strncpy(ir->pData, data, ir->dataSize);
1281    if (ir->dataSize < data_len)
1282       return false;
1283 
1284    ir->dataSize = data_len;
1285    return true;
1286 }
1287 
1288 static VkResult
nvk_shader_get_executable_internal_representations(UNUSED struct vk_device * device,const struct vk_shader * vk_shader,uint32_t executable_index,uint32_t * internal_representation_count,VkPipelineExecutableInternalRepresentationKHR * internal_representations)1289 nvk_shader_get_executable_internal_representations(
1290    UNUSED struct vk_device *device,
1291    const struct vk_shader *vk_shader,
1292    uint32_t executable_index,
1293    uint32_t *internal_representation_count,
1294    VkPipelineExecutableInternalRepresentationKHR *internal_representations)
1295 {
1296    struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);
1297    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
1298                           internal_representations,
1299                           internal_representation_count);
1300    bool incomplete_text = false;
1301 
1302    assert(executable_index == 0);
1303 
1304    if (shader->nak != NULL && shader->nak->asm_str != NULL) {
1305       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
1306          WRITE_STR(ir->name, "NAK assembly");
1307          WRITE_STR(ir->description, "NAK assembly");
1308          if (!write_ir_text(ir, shader->nak->asm_str))
1309             incomplete_text = true;
1310       }
1311    }
1312 
1313    return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
1314 }
1315 
1316 static const struct vk_shader_ops nvk_shader_ops = {
1317    .destroy = nvk_shader_destroy,
1318    .serialize = nvk_shader_serialize,
1319    .get_executable_properties = nvk_shader_get_executable_properties,
1320    .get_executable_statistics = nvk_shader_get_executable_statistics,
1321    .get_executable_internal_representations =
1322       nvk_shader_get_executable_internal_representations,
1323 };
1324 
1325 const struct vk_device_shader_ops nvk_device_shader_ops = {
1326    .get_nir_options = nvk_get_nir_options,
1327    .get_spirv_options = nvk_get_spirv_options,
1328    .preprocess_nir = nvk_preprocess_nir,
1329    .hash_graphics_state = nvk_hash_graphics_state,
1330    .compile = nvk_compile_shaders,
1331    .deserialize = nvk_deserialize_shader,
1332    .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
1333    .cmd_bind_shaders = nvk_cmd_bind_shaders,
1334 };
1335