1 /*
2 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
3 * SPDX-License-Identifier: MIT
4 */
5 #include "nvk_shader.h"
6
7 #include "nvk_cmd_buffer.h"
8 #include "nvk_descriptor_set_layout.h"
9 #include "nvk_device.h"
10 #include "nvk_mme.h"
11 #include "nvk_physical_device.h"
12 #include "nvk_sampler.h"
13 #include "nvk_shader.h"
14
15 #include "vk_nir_convert_ycbcr.h"
16 #include "vk_pipeline.h"
17 #include "vk_pipeline_layout.h"
18 #include "vk_shader_module.h"
19 #include "vk_ycbcr_conversion.h"
20
21 #include "nak.h"
22 #include "nir.h"
23 #include "nir_builder.h"
24 #include "compiler/spirv/nir_spirv.h"
25
26 #include "nv50_ir_driver.h"
27
28 #include "util/mesa-sha1.h"
29 #include "util/u_debug.h"
30
31 #include "cla097.h"
32 #include "clb097.h"
33 #include "clc597.h"
34 #include "nv_push_cl9097.h"
35 #include "nv_push_clb197.h"
36 #include "nv_push_clc397.h"
37 #include "nv_push_clc797.h"
38
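/* Size/align callback for nir_lower_vars_to_explicit_types(): shared-memory
 * scalars and vectors are laid out with component-sized alignment, with
 * booleans taking 4 bytes per component.
 */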
39 static void
40 shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align)
41 {
42 assert(glsl_type_is_vector_or_scalar(type));
43
44 uint32_t comp_size = glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
45 unsigned length = glsl_get_vector_elements(type);
46 *size = comp_size * length, *align = comp_size;
47 }
48
49 VkShaderStageFlags
50 nvk_nak_stages(const struct nv_device_info *info)
51 {
52 const VkShaderStageFlags all =
53 VK_SHADER_STAGE_VERTEX_BIT |
54 VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT |
55 VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT |
56 VK_SHADER_STAGE_GEOMETRY_BIT |
57 VK_SHADER_STAGE_FRAGMENT_BIT |
58 VK_SHADER_STAGE_COMPUTE_BIT;
59
60 const struct debug_control flags[] = {
61 { "vs", BITFIELD64_BIT(MESA_SHADER_VERTEX) },
62 { "tcs", BITFIELD64_BIT(MESA_SHADER_TESS_CTRL) },
63 { "tes", BITFIELD64_BIT(MESA_SHADER_TESS_EVAL) },
64 { "gs", BITFIELD64_BIT(MESA_SHADER_GEOMETRY) },
65 { "fs", BITFIELD64_BIT(MESA_SHADER_FRAGMENT) },
66 { "cs", BITFIELD64_BIT(MESA_SHADER_COMPUTE) },
67 { "all", all },
68 { NULL, 0 },
69 };
70
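/* NVK_USE_NAK selects which stages go through NAK: a comma-separated list of
 * the stage names above (e.g. NVK_USE_NAK=vs,fs) or "all". When unset, NAK
 * is used for all stages on Maxwell A and later.
 */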
71 const char *env_str = getenv("NVK_USE_NAK");
72 if (env_str == NULL)
73 return info->cls_eng3d >= MAXWELL_A ? all : 0;
74 else
75 return parse_debug_string(env_str, flags);
76 }
77
78 static bool
79 use_nak(const struct nvk_physical_device *pdev, gl_shader_stage stage)
80 {
81 return nvk_nak_stages(&pdev->info) & mesa_to_vk_shader_stage(stage);
82 }
83
84 uint64_t
85 nvk_physical_device_compiler_flags(const struct nvk_physical_device *pdev)
86 {
87 bool no_cbufs = pdev->debug_flags & NVK_DEBUG_NO_CBUF;
88 bool use_edb_buffer_views = nvk_use_edb_buffer_views(pdev);
89 uint64_t prog_debug = nvk_cg_get_prog_debug();
90 uint64_t prog_optimize = nvk_cg_get_prog_optimize();
91 uint64_t nak_stages = nvk_nak_stages(&pdev->info);
92 uint64_t nak_flags = nak_debug_flags(pdev->nak);
93
94 assert(prog_debug <= UINT8_MAX);
95 assert(prog_optimize < 16);
96 assert(nak_stages <= UINT32_MAX);
97 assert(nak_flags <= UINT16_MAX);
98
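/* Pack everything into a single 64-bit value:
 *   bits  0..7   prog_debug
 *   bits  8..11  prog_optimize
 *   bit   12     no_cbufs
 *   bit   13     use_edb_buffer_views
 *   bits 16..47  nak_stages
 *   bits 48..63  nak_flags
 */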
99 return prog_debug
100 | (prog_optimize << 8)
101 | ((uint64_t)no_cbufs << 12)
102 | ((uint64_t)use_edb_buffer_views << 13)
103 | (nak_stages << 16)
104 | (nak_flags << 48);
105 }
106
107 static const nir_shader_compiler_options *
108 nvk_get_nir_options(struct vk_physical_device *vk_pdev,
109 gl_shader_stage stage,
110 UNUSED const struct vk_pipeline_robustness_state *rs)
111 {
112 const struct nvk_physical_device *pdev =
113 container_of(vk_pdev, struct nvk_physical_device, vk);
114
115 if (use_nak(pdev, stage))
116 return nak_nir_options(pdev->nak);
117 else
118 return nvk_cg_nir_options(pdev, stage);
119 }
120
121 nir_address_format
122 nvk_ubo_addr_format(const struct nvk_physical_device *pdev,
123 const struct vk_pipeline_robustness_state *rs)
124 {
125 if (nvk_use_bindless_cbuf(&pdev->info)) {
126 return nir_address_format_vec2_index_32bit_offset;
127 } else if (rs->null_uniform_buffer_descriptor) {
128 /* We need bounds checking for null descriptors */
129 return nir_address_format_64bit_bounded_global;
130 } else {
131 switch (rs->uniform_buffers) {
132 case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
133 return nir_address_format_64bit_global_32bit_offset;
134 case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
135 case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
136 return nir_address_format_64bit_bounded_global;
137 default:
138 unreachable("Invalid robust buffer access behavior");
139 }
140 }
141 }
142
143 nir_address_format
144 nvk_ssbo_addr_format(const struct nvk_physical_device *pdev,
145 const struct vk_pipeline_robustness_state *rs)
146 {
147 if (rs->null_storage_buffer_descriptor) {
148 /* We need bounds checking for null descriptors */
149 return nir_address_format_64bit_bounded_global;
150 } else {
151 switch (rs->storage_buffers) {
152 case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
153 return nir_address_format_64bit_global_32bit_offset;
154 case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
155 case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
156 return nir_address_format_64bit_bounded_global;
157 default:
158 unreachable("Invalid robust buffer access behavior");
159 }
160 }
161 }
162
163 static struct spirv_to_nir_options
164 nvk_get_spirv_options(struct vk_physical_device *vk_pdev,
165 UNUSED gl_shader_stage stage,
166 const struct vk_pipeline_robustness_state *rs)
167 {
168 const struct nvk_physical_device *pdev =
169 container_of(vk_pdev, struct nvk_physical_device, vk);
170
171 return (struct spirv_to_nir_options) {
172 .ssbo_addr_format = nvk_ssbo_addr_format(pdev, rs),
173 .phys_ssbo_addr_format = nir_address_format_64bit_global,
174 .ubo_addr_format = nvk_ubo_addr_format(pdev, rs),
175 .shared_addr_format = nir_address_format_32bit_offset,
176 .min_ssbo_alignment = NVK_MIN_SSBO_ALIGNMENT,
177 .min_ubo_alignment = nvk_min_cbuf_alignment(&pdev->info),
178 };
179 }
180
181 static void
182 nvk_preprocess_nir(struct vk_physical_device *vk_pdev, nir_shader *nir)
183 {
184 const struct nvk_physical_device *pdev =
185 container_of(vk_pdev, struct nvk_physical_device, vk);
186
187 NIR_PASS_V(nir, nir_lower_io_to_temporaries,
188 nir_shader_get_entrypoint(nir), true, false);
189
190 if (use_nak(pdev, nir->info.stage))
191 nak_preprocess_nir(nir, pdev->nak);
192 else
193 nvk_cg_preprocess_nir(nir);
194 }
195
196 static void
197 nvk_populate_fs_key(struct nak_fs_key *key,
198 const struct vk_graphics_pipeline_state *state)
199 {
200 memset(key, 0, sizeof(*key));
201
202 key->sample_info_cb = 0;
203 key->sample_locations_offset = nvk_root_descriptor_offset(draw.sample_locations);
204 key->sample_masks_offset = nvk_root_descriptor_offset(draw.sample_masks);
205
206 /* Turn underestimate on when no state is available or if explicitly set */
207 if (state == NULL || state->rs == NULL ||
208 state->rs->conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT)
209 key->uses_underestimate = true;
210
211 if (state == NULL)
212 return;
213
214 if (state->pipeline_flags &
215 VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT)
216 key->zs_self_dep = true;
217
218 /* We force per-sample interpolation whenever sampleShadingEnable is set
219 * regardless of minSampleShading or rasterizationSamples.
220 *
221 * When sampleShadingEnable is set, few guarantees are made about the
222 * location of interpolation of the inputs. The only real guarantees are
223 * that the inputs are interpolated within the pixel and that you get at
224 * least `rasterizationSamples * minSampleShading` unique positions.
225 * Importantly, it does not require that when `rasterizationSamples *
226 * minSampleShading <= 1.0` that those positions are at the fragment
227 * center. Therefore, it's valid to just always do per-sample (which maps
228 * to CENTROID on NVIDIA hardware) all the time and let the hardware sort
229 * it out based on what we set in HYBRID_ANTI_ALIAS_CONTROL::passes.
230 *
231 * Also, we set HYBRID_ANTI_ALIAS_CONTROL::centroid at draw time based on
232 * `rasterizationSamples * minSampleShading` so it should be per-pixel
233 * whenever we're running only a single pass. However, this would still be
234 * correct even if it got interpolated at some other sample.
235 *
236 * The one caveat here is that we have to be careful about gl_SampleMaskIn.
237 * When `nak_fs_key::force_sample_shading = true` we also turn any reads of
238 * gl_SampleMaskIn into `1 << gl_SampleID` because the hardware sample mask
239 * is actually per-fragment, not per-pass. We handle this by smashing
240 * minSampleShading to 1.0 whenever gl_SampleMaskIn is read.
241 */
242 const struct vk_multisample_state *ms = state->ms;
243 if (ms != NULL && ms->sample_shading_enable)
244 key->force_sample_shading = true;
245 }
246
247 static void
248 nvk_hash_graphics_state(struct vk_physical_device *device,
249 const struct vk_graphics_pipeline_state *state,
250 VkShaderStageFlags stages,
251 blake3_hash blake3_out)
252 {
253 struct mesa_blake3 blake3_ctx;
254 _mesa_blake3_init(&blake3_ctx);
255 if (stages & VK_SHADER_STAGE_FRAGMENT_BIT) {
256 struct nak_fs_key key;
257 nvk_populate_fs_key(&key, state);
258 _mesa_blake3_update(&blake3_ctx, &key, sizeof(key));
259
260 const bool is_multiview = state->rp->view_mask != 0;
261 _mesa_blake3_update(&blake3_ctx, &is_multiview, sizeof(is_multiview));
262
263 /* This doesn't impact the shader compile but it does go in the
264 * nvk_shader and gets [de]serialized along with the binary so we
265 * need to hash it.
266 */
267 if (state->ms && state->ms->sample_shading_enable) {
268 _mesa_blake3_update(&blake3_ctx, &state->ms->min_sample_shading,
269 sizeof(state->ms->min_sample_shading));
270 }
271 }
272 _mesa_blake3_final(&blake3_ctx, blake3_out);
273 }
274
275 static bool
276 lower_load_intrinsic(nir_builder *b, nir_intrinsic_instr *load,
277 UNUSED void *_data)
278 {
279 switch (load->intrinsic) {
280 case nir_intrinsic_load_ubo: {
281 b->cursor = nir_before_instr(&load->instr);
282
283 nir_def *index = load->src[0].ssa;
284 nir_def *offset = load->src[1].ssa;
285 const enum gl_access_qualifier access = nir_intrinsic_access(load);
286 const uint32_t align_mul = nir_intrinsic_align_mul(load);
287 const uint32_t align_offset = nir_intrinsic_align_offset(load);
288
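/* A one-component index selects a bound cbuf slot (ldc); a two-component
 * index is packed into a 64-bit bindless cbuf handle (ldcx).
 */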
289 nir_def *val;
290 if (load->src[0].ssa->num_components == 1) {
291 val = nir_ldc_nv(b, load->num_components, load->def.bit_size,
292 index, offset, .access = access,
293 .align_mul = align_mul,
294 .align_offset = align_offset);
295 } else if (load->src[0].ssa->num_components == 2) {
296 nir_def *handle = nir_pack_64_2x32(b, load->src[0].ssa);
297 val = nir_ldcx_nv(b, load->num_components, load->def.bit_size,
298 handle, offset, .access = access,
299 .align_mul = align_mul,
300 .align_offset = align_offset);
301 } else {
302 unreachable("Invalid UBO index");
303 }
304 nir_def_rewrite_uses(&load->def, val);
305 return true;
306 }
307
308 case nir_intrinsic_load_global_constant_offset:
309 case nir_intrinsic_load_global_constant_bounded: {
310 b->cursor = nir_before_instr(&load->instr);
311
312 nir_def *base_addr = load->src[0].ssa;
313 nir_def *offset = load->src[1].ssa;
314
315 nir_def *zero = NULL;
316 if (load->intrinsic == nir_intrinsic_load_global_constant_bounded) {
317 nir_def *bound = load->src[2].ssa;
318
319 unsigned bit_size = load->def.bit_size;
320 assert(bit_size >= 8 && bit_size % 8 == 0);
321 unsigned byte_size = bit_size / 8;
322
323 zero = nir_imm_zero(b, load->num_components, bit_size);
324
325 unsigned load_size = byte_size * load->num_components;
326
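/* Clamp the offset so offset + load_size - 1 can't wrap, then check that the
 * last byte of the load lies below the bound. Out-of-bounds loads skip the
 * memory access entirely and produce zero via the phi below.
 */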
327 nir_def *sat_offset =
328 nir_umin(b, offset, nir_imm_int(b, UINT32_MAX - (load_size - 1)));
329 nir_def *in_bounds =
330 nir_ilt(b, nir_iadd_imm(b, sat_offset, load_size - 1), bound);
331
332 nir_push_if(b, in_bounds);
333 }
334
335 nir_def *val =
336 nir_build_load_global_constant(b, load->def.num_components,
337 load->def.bit_size,
338 nir_iadd(b, base_addr, nir_u2u64(b, offset)),
339 .align_mul = nir_intrinsic_align_mul(load),
340 .align_offset = nir_intrinsic_align_offset(load));
341
342 if (load->intrinsic == nir_intrinsic_load_global_constant_bounded) {
343 nir_pop_if(b, NULL);
344 val = nir_if_phi(b, val, zero);
345 }
346
347 nir_def_rewrite_uses(&load->def, val);
348 return true;
349 }
350
351 default:
352 return false;
353 }
354 }
355
356 struct lower_ycbcr_state {
357 uint32_t set_layout_count;
358 struct vk_descriptor_set_layout * const *set_layouts;
359 };
360
361 static const struct vk_ycbcr_conversion_state *
362 lookup_ycbcr_conversion(const void *_state, uint32_t set,
363 uint32_t binding, uint32_t array_index)
364 {
365 const struct lower_ycbcr_state *state = _state;
366 assert(set < state->set_layout_count);
367 assert(state->set_layouts[set] != NULL);
368 const struct nvk_descriptor_set_layout *set_layout =
369 vk_to_nvk_descriptor_set_layout(state->set_layouts[set]);
370 assert(binding < set_layout->binding_count);
371
372 const struct nvk_descriptor_set_binding_layout *bind_layout =
373 &set_layout->binding[binding];
374
375 if (bind_layout->immutable_samplers == NULL)
376 return NULL;
377
378 array_index = MIN2(array_index, bind_layout->array_size - 1);
379
380 const struct nvk_sampler *sampler =
381 bind_layout->immutable_samplers[array_index];
382
383 return sampler && sampler->vk.ycbcr_conversion ?
384 &sampler->vk.ycbcr_conversion->state : NULL;
385 }
386
387 static inline bool
388 nir_has_image_var(nir_shader *nir)
389 {
390 nir_foreach_image_variable(_, nir)
391 return true;
392
393 return false;
394 }
395
396 static void
397 nvk_lower_nir(struct nvk_device *dev, nir_shader *nir,
398 VkShaderCreateFlagsEXT shader_flags,
399 const struct vk_pipeline_robustness_state *rs,
400 bool is_multiview,
401 uint32_t set_layout_count,
402 struct vk_descriptor_set_layout * const *set_layouts,
403 struct nvk_cbuf_map *cbuf_map_out)
404 {
405 struct nvk_physical_device *pdev = nvk_device_physical(dev);
406
407 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
408 NIR_PASS(_, nir, nir_lower_input_attachments,
409 &(nir_input_attachment_options) {
410 .use_fragcoord_sysval = use_nak(pdev, nir->info.stage),
411 .use_layer_id_sysval = use_nak(pdev, nir->info.stage) ||
412 is_multiview,
413 .use_view_id_for_layer = is_multiview,
414 });
415 }
416
417 if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
418 NIR_PASS(_, nir, nir_lower_patch_vertices,
419 nir->info.tess.tcs_vertices_out, NULL);
420 }
421
422 const struct lower_ycbcr_state ycbcr_state = {
423 .set_layout_count = set_layout_count,
424 .set_layouts = set_layouts,
425 };
426 NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex,
427 lookup_ycbcr_conversion, &ycbcr_state);
428
429 nir_lower_compute_system_values_options csv_options = {
430 .has_base_workgroup_id = true,
431 };
432 NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options);
433
434 /* Lower push constants before lower_descriptors */
435 NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const,
436 nir_address_format_32bit_offset);
437
438 /* Lower non-uniform access before lower_descriptors */
439 enum nir_lower_non_uniform_access_type lower_non_uniform_access_types =
440 nir_lower_non_uniform_ubo_access;
441
442 if (pdev->info.cls_eng3d < TURING_A) {
443 lower_non_uniform_access_types |= nir_lower_non_uniform_texture_access |
444 nir_lower_non_uniform_image_access;
445 }
446
447 /* In practice, most shaders have no non-uniform-qualified accesses, so run
448 * the cheaper check first; it usually fails and lets us skip the lowering.
449 */
450 if (nir_has_non_uniform_access(nir, lower_non_uniform_access_types)) {
451 struct nir_lower_non_uniform_access_options opts = {
452 .types = lower_non_uniform_access_types,
453 .callback = NULL,
454 };
455 NIR_PASS(_, nir, nir_opt_non_uniform_access);
456 NIR_PASS(_, nir, nir_lower_non_uniform_access, &opts);
457 }
458
459 /* TODO: Kepler image lowering requires image params to be loaded from the
460 * descriptor set which we don't currently support.
461 */
462 assert(pdev->info.cls_eng3d >= MAXWELL_A || !nir_has_image_var(nir));
463
464 struct nvk_cbuf_map *cbuf_map = NULL;
465 if (use_nak(pdev, nir->info.stage) &&
466 !(pdev->debug_flags & NVK_DEBUG_NO_CBUF)) {
467 cbuf_map = cbuf_map_out;
468
469 /* Large constant support assumes cbufs */
470 NIR_PASS(_, nir, nir_opt_large_constants, NULL, 32);
471 } else {
472 /* Codegen sometimes puts stuff in cbuf 1 and adds 1 to our cbuf indices,
473 * so we can't rely on it for lowering to cbufs. Instead, place the root
474 * descriptors in both cbuf 0 and cbuf 1.
475 */
476 *cbuf_map_out = (struct nvk_cbuf_map) {
477 .cbuf_count = 2,
478 .cbufs = {
479 { .type = NVK_CBUF_TYPE_ROOT_DESC },
480 { .type = NVK_CBUF_TYPE_ROOT_DESC },
481 }
482 };
483 }
484
485 nir_opt_access_options opt_access_options = {
486 .is_vulkan = true,
487 };
488 NIR_PASS(_, nir, nir_opt_access, &opt_access_options);
489 NIR_PASS(_, nir, nvk_nir_lower_descriptors, pdev, shader_flags, rs,
490 set_layout_count, set_layouts, cbuf_map);
491 NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global,
492 nir_address_format_64bit_global);
493 NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo,
494 nvk_ssbo_addr_format(pdev, rs));
495 NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
496 nvk_ubo_addr_format(pdev, rs));
497 NIR_PASS(_, nir, nir_shader_intrinsics_pass,
498 lower_load_intrinsic, nir_metadata_none, NULL);
499
500 if (!nir->info.shared_memory_explicit_layout) {
501 NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
502 nir_var_mem_shared, shared_var_info);
503 }
504 NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared,
505 nir_address_format_32bit_offset);
506
507 if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
508 /* QMD::SHARED_MEMORY_SIZE requires an alignment of 256B, so it's safe to
509 * align everything up to 16B and write whole vec4s.
510 */
511 nir->info.shared_size = align(nir->info.shared_size, 16);
512 NIR_PASS(_, nir, nir_zero_initialize_shared_memory,
513 nir->info.shared_size, 16);
514
515 /* We need to call lower_compute_system_values again because
516 * nir_zero_initialize_shared_memory generates load_invocation_id which
517 * has to be lowered to load_invocation_index.
518 */
519 NIR_PASS(_, nir, nir_lower_compute_system_values, NULL);
520 }
521 }
522
523 #ifndef NDEBUG
524 static void
525 nvk_shader_dump(struct nvk_shader *shader)
526 {
527 unsigned pos;
528
529 if (shader->info.stage != MESA_SHADER_COMPUTE) {
530 _debug_printf("dumping HDR for %s shader\n",
531 _mesa_shader_stage_to_string(shader->info.stage));
532 for (pos = 0; pos < ARRAY_SIZE(shader->info.hdr); ++pos)
533 _debug_printf("HDR[%02"PRIxPTR"] = 0x%08x\n",
534 pos * sizeof(shader->info.hdr[0]), shader->info.hdr[pos]);
535 }
536 _debug_printf("shader binary code (0x%x bytes):", shader->code_size);
537 for (pos = 0; pos < shader->code_size / 4; ++pos) {
538 if ((pos % 8) == 0)
539 _debug_printf("\n");
540 _debug_printf("%08x ", ((const uint32_t *)shader->code_ptr)[pos]);
541 }
542 _debug_printf("\n");
543 }
544 #endif
545
546 static VkResult
547 nvk_compile_nir_with_nak(struct nvk_physical_device *pdev,
548 nir_shader *nir,
549 VkShaderCreateFlagsEXT shader_flags,
550 const struct vk_pipeline_robustness_state *rs,
551 const struct nak_fs_key *fs_key,
552 struct nvk_shader *shader)
553 {
554 const bool dump_asm =
555 shader_flags & VK_SHADER_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_MESA;
556
557 nir_variable_mode robust2_modes = 0;
558 if (rs->uniform_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT)
559 robust2_modes |= nir_var_mem_ubo;
560 if (rs->storage_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT)
561 robust2_modes |= nir_var_mem_ssbo;
562
563 shader->nak = nak_compile_shader(nir, dump_asm, pdev->nak, robust2_modes, fs_key);
564
565 if (!shader->nak)
566 return vk_errorf(pdev, VK_ERROR_UNKNOWN, "Internal compiler error in NAK");
567
568 shader->info = shader->nak->info;
569 shader->code_ptr = shader->nak->code;
570 shader->code_size = shader->nak->code_size;
571
572 return VK_SUCCESS;
573 }
574
575 static VkResult
576 nvk_compile_nir(struct nvk_device *dev, nir_shader *nir,
577 VkShaderCreateFlagsEXT shader_flags,
578 const struct vk_pipeline_robustness_state *rs,
579 const struct nak_fs_key *fs_key,
580 struct nvk_shader *shader)
581 {
582 struct nvk_physical_device *pdev = nvk_device_physical(dev);
583 VkResult result;
584
585 if (use_nak(pdev, nir->info.stage)) {
586 result = nvk_compile_nir_with_nak(pdev, nir, shader_flags, rs,
587 fs_key, shader);
588 } else {
589 result = nvk_cg_compile_nir(pdev, nir, fs_key, shader);
590 }
591 if (result != VK_SUCCESS)
592 return result;
593
594 if (nir->constant_data_size > 0) {
595 uint32_t data_align = nvk_min_cbuf_alignment(&pdev->info);
596 uint32_t data_size = align(nir->constant_data_size, data_align);
597
598 void *data = malloc(data_size);
599 if (data == NULL)
600 return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
601
602 memcpy(data, nir->constant_data, nir->constant_data_size);
603
604 assert(nir->constant_data_size <= data_size);
605 memset(data + nir->constant_data_size, 0,
606 data_size - nir->constant_data_size);
607
608 shader->data_ptr = data;
609 shader->data_size = data_size;
610 }
611
612 return VK_SUCCESS;
613 }
614
615 static VkResult
616 nvk_shader_upload(struct nvk_device *dev, struct nvk_shader *shader)
617 {
618 struct nvk_physical_device *pdev = nvk_device_physical(dev);
619
620 uint32_t hdr_size = 0;
621 if (shader->info.stage != MESA_SHADER_COMPUTE) {
622 if (pdev->info.cls_eng3d >= TURING_A)
623 hdr_size = TU102_SHADER_HEADER_SIZE;
624 else
625 hdr_size = GF100_SHADER_HEADER_SIZE;
626 }
627
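/* Upload layout: [pad][header][code][constant data]. The padding exists so
 * that the code, rather than the header, lands on the required alignment
 * boundary; constant data is aligned up to the cbuf alignment at the end.
 */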
628 /* Fermi needs 0x40 alignment.
629 * Kepler+ needs the first instruction to be 0x80-aligned, so we waste 0x30 bytes.
630 */
631 int alignment = pdev->info.cls_eng3d >= KEPLER_A ? 0x80 : 0x40;
632
633 uint32_t total_size = 0;
634 if (pdev->info.cls_eng3d >= KEPLER_A &&
635 pdev->info.cls_eng3d < TURING_A &&
636 hdr_size > 0) {
637 /* The instructions are what has to be aligned so we need to start at a
638 * small offset (0x30 B) into the upload area.
639 */
640 total_size = alignment - hdr_size;
641 }
642
643 const uint32_t hdr_offset = total_size;
644 total_size += hdr_size;
645
646 const uint32_t code_offset = total_size;
647 assert(code_offset % alignment == 0);
648 total_size += shader->code_size;
649
650 uint32_t data_offset = 0;
651 if (shader->data_size > 0) {
652 uint32_t cbuf_alignment = nvk_min_cbuf_alignment(&pdev->info);
653 alignment = MAX2(alignment, cbuf_alignment);
654 total_size = align(total_size, cbuf_alignment);
655 data_offset = total_size;
656 total_size += shader->data_size;
657 }
658
659 char *data = malloc(total_size);
660 if (data == NULL)
661 return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
662
663 assert(hdr_size <= sizeof(shader->info.hdr));
664 memcpy(data + hdr_offset, shader->info.hdr, hdr_size);
665 memcpy(data + code_offset, shader->code_ptr, shader->code_size);
666 if (shader->data_size > 0)
667 memcpy(data + data_offset, shader->data_ptr, shader->data_size);
668
669 #ifndef NDEBUG
670 if (debug_get_bool_option("NV50_PROG_DEBUG", false))
671 nvk_shader_dump(shader);
672 #endif
673
674 VkResult result = nvk_heap_upload(dev, &dev->shader_heap, data,
675 total_size, alignment,
676 &shader->upload_addr);
677 if (result == VK_SUCCESS) {
678 shader->upload_size = total_size;
679
680 shader->hdr_addr = shader->upload_addr + hdr_offset;
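/* Pre-Volta, shaders are referenced by a 32-bit offset from the base of the
 * shader heap rather than by a full VA, so store a heap-relative offset.
 */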
681 if (pdev->info.cls_eng3d < VOLTA_A) {
682 const uint64_t heap_base_addr =
683 nvk_heap_contiguous_base_address(&dev->shader_heap);
684 assert(shader->upload_addr - heap_base_addr < UINT32_MAX);
685 shader->hdr_addr -= heap_base_addr;
686 }
687 shader->data_addr = shader->upload_addr + data_offset;
688 }
689 free(data);
690
691 return result;
692 }
693
694 uint32_t
695 mesa_to_nv9097_shader_type(gl_shader_stage stage)
696 {
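/* Note the hardware naming: TCS maps to TESSELLATION_INIT and TES maps to
 * TESSELLATION in the class headers.
 */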
697 static const uint32_t mesa_to_nv9097[] = {
698 [MESA_SHADER_VERTEX] = NV9097_SET_PIPELINE_SHADER_TYPE_VERTEX,
699 [MESA_SHADER_TESS_CTRL] = NV9097_SET_PIPELINE_SHADER_TYPE_TESSELLATION_INIT,
700 [MESA_SHADER_TESS_EVAL] = NV9097_SET_PIPELINE_SHADER_TYPE_TESSELLATION,
701 [MESA_SHADER_GEOMETRY] = NV9097_SET_PIPELINE_SHADER_TYPE_GEOMETRY,
702 [MESA_SHADER_FRAGMENT] = NV9097_SET_PIPELINE_SHADER_TYPE_PIXEL,
703 };
704 assert(stage < ARRAY_SIZE(mesa_to_nv9097));
705 return mesa_to_nv9097[stage];
706 }
707
708 uint32_t
709 nvk_pipeline_bind_group(gl_shader_stage stage)
710 {
711 return stage;
712 }
713
714 uint16_t
715 nvk_max_shader_push_dw(struct nvk_physical_device *pdev,
716 gl_shader_stage stage, bool last_vtgm)
717 {
718 if (stage == MESA_SHADER_COMPUTE)
719 return 0;
720
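/* These totals mirror the max_dw_count bookkeeping in nvk_shader_fill_push():
 * 8 DWs for the base pipeline methods, +2 for the tess params macro, +13 for
 * fragment-only state and, for the last VTG stage, +8 plus 4 * (5 + 128/4)
 * for layer/VPRS/clip state and the per-buffer XFB layout arrays.
 */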
721 uint16_t max_dw_count = 8;
722
723 if (stage == MESA_SHADER_TESS_EVAL)
724 max_dw_count += 2;
725
726 if (stage == MESA_SHADER_FRAGMENT)
727 max_dw_count += 13;
728
729 if (last_vtgm) {
730 max_dw_count += 8;
731 max_dw_count += 4 * (5 + (128 / 4));
732 }
733
734 return max_dw_count;
735 }
736
737 static VkResult
738 nvk_shader_fill_push(struct nvk_device *dev,
739 struct nvk_shader *shader,
740 const VkAllocationCallbacks* pAllocator)
741 {
742 struct nvk_physical_device *pdev = nvk_device_physical(dev);
743
744 ASSERTED uint16_t max_dw_count = 0;
745 uint32_t push_dw[200];
746 struct nv_push push, *p = &push;
747 nv_push_init(&push, push_dw, ARRAY_SIZE(push_dw));
748
749 const uint32_t type = mesa_to_nv9097_shader_type(shader->info.stage);
750
751 /* We always map index == type */
752 const uint32_t idx = type;
753
754 max_dw_count += 2;
755 P_IMMD(p, NV9097, SET_PIPELINE_SHADER(idx), {
756 .enable = ENABLE_TRUE,
757 .type = type,
758 });
759
760 max_dw_count += 3;
761 uint64_t addr = shader->hdr_addr;
762 if (pdev->info.cls_eng3d >= VOLTA_A) {
763 P_MTHD(p, NVC397, SET_PIPELINE_PROGRAM_ADDRESS_A(idx));
764 P_NVC397_SET_PIPELINE_PROGRAM_ADDRESS_A(p, idx, addr >> 32);
765 P_NVC397_SET_PIPELINE_PROGRAM_ADDRESS_B(p, idx, addr);
766 } else {
767 assert(addr < 0xffffffff);
768 P_IMMD(p, NV9097, SET_PIPELINE_PROGRAM(idx), addr);
769 }
770
771 max_dw_count += 3;
772 P_MTHD(p, NVC397, SET_PIPELINE_REGISTER_COUNT(idx));
773 P_NVC397_SET_PIPELINE_REGISTER_COUNT(p, idx, shader->info.num_gprs);
774 P_NVC397_SET_PIPELINE_BINDING(p, idx,
775 nvk_pipeline_bind_group(shader->info.stage));
776
777 if (shader->info.stage == MESA_SHADER_TESS_EVAL) {
778 max_dw_count += 2;
779 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_TESS_PARAMS));
780 P_INLINE_DATA(p, nvk_mme_tess_params(shader->info.ts.domain,
781 shader->info.ts.spacing,
782 shader->info.ts.prims));
783 }
784
785 if (shader->info.stage == MESA_SHADER_FRAGMENT) {
786 max_dw_count += 13;
787
788 P_MTHD(p, NVC397, SET_SUBTILING_PERF_KNOB_A);
789 P_NV9097_SET_SUBTILING_PERF_KNOB_A(p, {
790 .fraction_of_spm_register_file_per_subtile = 0x10,
791 .fraction_of_spm_pixel_output_buffer_per_subtile = 0x40,
792 .fraction_of_spm_triangle_ram_per_subtile = 0x16,
793 .fraction_of_max_quads_per_subtile = 0x20,
794 });
795 P_NV9097_SET_SUBTILING_PERF_KNOB_B(p, 0x20);
796
797 P_IMMD(p, NV9097, SET_API_MANDATED_EARLY_Z,
798 shader->info.fs.early_fragment_tests);
799
800 if (pdev->info.cls_eng3d >= MAXWELL_B) {
801 P_IMMD(p, NVB197, SET_POST_Z_PS_IMASK,
802 shader->info.fs.post_depth_coverage);
803 } else {
804 assert(!shader->info.fs.post_depth_coverage);
805 }
806
807 P_IMMD(p, NV9097, SET_ZCULL_BOUNDS, {
808 .z_min_unbounded_enable = shader->info.fs.writes_depth,
809 .z_max_unbounded_enable = shader->info.fs.writes_depth,
810 });
811
812 if (pdev->info.cls_eng3d >= TURING_A) {
813 /* From the Vulkan 1.3.297 spec:
814 *
815 * "If sample shading is enabled, an implementation must invoke
816 * the fragment shader at least
817 *
818 * max( ⌈ minSampleShading × rasterizationSamples ⌉, 1)
819 *
820 * times per fragment."
821 *
822 * The max() here means that, regardless of the actual value of
823 * minSampleShading, we need to invoke at least once per pixel,
824 * meaning that we need to disable fragment shading rate. We also
825 * need to disable FSR if sample shading is used by the shader.
826 */
827 P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_SET_SHADING_RATE_CONTROL));
828 P_INLINE_DATA(p, nvk_mme_shading_rate_control_sample_shading(
829 shader->sample_shading_enable ||
830 shader->info.fs.uses_sample_shading));
831 }
832
833 float mss = 0;
834 if (shader->info.fs.uses_sample_shading) {
835 mss = 1;
836 } else if (shader->sample_shading_enable) {
837 mss = CLAMP(shader->min_sample_shading, 0, 1);
838 } else {
839 mss = 0;
840 }
841 P_1INC(p, NVB197, CALL_MME_MACRO(NVK_MME_SET_ANTI_ALIAS));
842 P_INLINE_DATA(p, nvk_mme_anti_alias_min_sample_shading(mss));
843 }
844
845 /* Stash this before we do XFB and clip/cull */
846 shader->push_dw_count = nv_push_dw_count(&push);
847 assert(max_dw_count ==
848 nvk_max_shader_push_dw(pdev, shader->info.stage, false));
849
850 if (shader->info.stage != MESA_SHADER_FRAGMENT &&
851 shader->info.stage != MESA_SHADER_TESS_CTRL) {
852 max_dw_count += 8;
853
854 P_IMMD(p, NV9097, SET_RT_LAYER, {
855 .v = 0,
856 .control = shader->info.vtg.writes_layer ?
857 CONTROL_GEOMETRY_SHADER_SELECTS_LAYER :
858 CONTROL_V_SELECTS_LAYER,
859 });
860
861 if (pdev->info.cls_eng3d >= AMPERE_B) {
862 P_IMMD(p, NVC797, SET_VARIABLE_PIXEL_RATE_SHADING_TABLE_SELECT, {
863 .source = shader->info.vtg.writes_vprs_table_index ?
864 SOURCE_FROM_VPRS_TABLE_INDEX :
865 SOURCE_FROM_CONSTANT,
866 .source_constant_value = 0,
867 });
868 }
869
870 const uint8_t clip_enable = shader->info.vtg.clip_enable;
871 const uint8_t cull_enable = shader->info.vtg.cull_enable;
872 P_IMMD(p, NV9097, SET_USER_CLIP_ENABLE, {
873 .plane0 = ((clip_enable | cull_enable) >> 0) & 1,
874 .plane1 = ((clip_enable | cull_enable) >> 1) & 1,
875 .plane2 = ((clip_enable | cull_enable) >> 2) & 1,
876 .plane3 = ((clip_enable | cull_enable) >> 3) & 1,
877 .plane4 = ((clip_enable | cull_enable) >> 4) & 1,
878 .plane5 = ((clip_enable | cull_enable) >> 5) & 1,
879 .plane6 = ((clip_enable | cull_enable) >> 6) & 1,
880 .plane7 = ((clip_enable | cull_enable) >> 7) & 1,
881 });
882 P_IMMD(p, NV9097, SET_USER_CLIP_OP, {
883 .plane0 = (cull_enable >> 0) & 1,
884 .plane1 = (cull_enable >> 1) & 1,
885 .plane2 = (cull_enable >> 2) & 1,
886 .plane3 = (cull_enable >> 3) & 1,
887 .plane4 = (cull_enable >> 4) & 1,
888 .plane5 = (cull_enable >> 5) & 1,
889 .plane6 = (cull_enable >> 6) & 1,
890 .plane7 = (cull_enable >> 7) & 1,
891 });
892
893 struct nak_xfb_info *xfb = &shader->info.vtg.xfb;
894 for (uint8_t b = 0; b < ARRAY_SIZE(xfb->attr_count); b++) {
895 const uint8_t attr_count = xfb->attr_count[b];
896
897 max_dw_count += 5 + (128 / 4);
898
899 P_MTHD(p, NV9097, SET_STREAM_OUT_CONTROL_STREAM(b));
900 P_NV9097_SET_STREAM_OUT_CONTROL_STREAM(p, b, xfb->stream[b]);
901 P_NV9097_SET_STREAM_OUT_CONTROL_COMPONENT_COUNT(p, b, attr_count);
902 P_NV9097_SET_STREAM_OUT_CONTROL_STRIDE(p, b, xfb->stride[b]);
903
904 if (attr_count > 0) {
905 /* upload packed varying indices in multiples of 4 bytes */
906 const uint32_t n = DIV_ROUND_UP(attr_count, 4);
907 P_MTHD(p, NV9097, SET_STREAM_OUT_LAYOUT_SELECT(b, 0));
908 P_INLINE_ARRAY(p, (const uint32_t*)xfb->attr_index[b], n);
909 }
910 }
911
912 shader->vtgm_push_dw_count = nv_push_dw_count(&push);
913 assert(max_dw_count ==
914 nvk_max_shader_push_dw(pdev, shader->info.stage, true));
915 }
916
917 assert(nv_push_dw_count(&push) <= max_dw_count);
918 assert(max_dw_count <= ARRAY_SIZE(push_dw));
919
920 uint16_t dw_count = nv_push_dw_count(&push);
921 shader->push_dw =
922 vk_zalloc2(&dev->vk.alloc, pAllocator, dw_count * sizeof(*push_dw),
923 sizeof(*push_dw), VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
924 if (shader->push_dw == NULL)
925 return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
926
927 memcpy(shader->push_dw, push_dw, dw_count * sizeof(*push_dw));
928
929 return VK_SUCCESS;
930 }
931
932 static const struct vk_shader_ops nvk_shader_ops;
933
934 static void
935 nvk_shader_destroy(struct vk_device *vk_dev,
936 struct vk_shader *vk_shader,
937 const VkAllocationCallbacks* pAllocator)
938 {
939 struct nvk_device *dev = container_of(vk_dev, struct nvk_device, vk);
940 struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);
941
942 vk_free2(&dev->vk.alloc, pAllocator, shader->push_dw);
943
944 if (shader->upload_size > 0) {
945 nvk_heap_free(dev, &dev->shader_heap,
946 shader->upload_addr,
947 shader->upload_size);
948 }
949
950 if (shader->nak) {
951 nak_shader_bin_destroy(shader->nak);
952 } else {
953 /* This came from codegen or deserialization; just free it. */
954 free((void *)shader->code_ptr);
955 }
956
957 free((void *)shader->data_ptr);
958
959 vk_shader_free(&dev->vk, pAllocator, &shader->vk);
960 }
961
962 static VkResult
963 nvk_compile_shader(struct nvk_device *dev,
964 struct vk_shader_compile_info *info,
965 const struct vk_graphics_pipeline_state *state,
966 const VkAllocationCallbacks* pAllocator,
967 struct vk_shader **shader_out)
968 {
969 struct nvk_shader *shader;
970 VkResult result;
971
972 /* We consume the NIR, regardless of success or failure */
973 nir_shader *nir = info->nir;
974
975 shader = vk_shader_zalloc(&dev->vk, &nvk_shader_ops, info->stage,
976 pAllocator, sizeof(*shader));
977 if (shader == NULL) {
978 ralloc_free(nir);
979 return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
980 }
981
982 /* TODO: Multiview with ESO */
983 const bool is_multiview = state && state->rp->view_mask != 0;
984
985 nvk_lower_nir(dev, nir, info->flags, info->robustness, is_multiview,
986 info->set_layout_count, info->set_layouts,
987 &shader->cbuf_map);
988
989 struct nak_fs_key fs_key_tmp, *fs_key = NULL;
990 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
991 nvk_populate_fs_key(&fs_key_tmp, state);
992 fs_key = &fs_key_tmp;
993 }
994
995 result = nvk_compile_nir(dev, nir, info->flags, info->robustness,
996 fs_key, shader);
997 ralloc_free(nir);
998 if (result != VK_SUCCESS) {
999 nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
1000 return result;
1001 }
1002
1003 result = nvk_shader_upload(dev, shader);
1004 if (result != VK_SUCCESS) {
1005 nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
1006 return result;
1007 }
1008
1009 if (info->stage == MESA_SHADER_FRAGMENT) {
1010 if (state != NULL && state->ms != NULL) {
1011 shader->sample_shading_enable = state->ms->sample_shading_enable;
1012 if (state->ms->sample_shading_enable)
1013 shader->min_sample_shading = state->ms->min_sample_shading;
1014 }
1015 }
1016
1017 if (info->stage != MESA_SHADER_COMPUTE) {
1018 result = nvk_shader_fill_push(dev, shader, pAllocator);
1019 if (result != VK_SUCCESS) {
1020 nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
1021 return result;
1022 }
1023 }
1024
1025 *shader_out = &shader->vk;
1026
1027 return VK_SUCCESS;
1028 }
1029
1030 VkResult
1031 nvk_compile_nir_shader(struct nvk_device *dev, nir_shader *nir,
1032 const VkAllocationCallbacks *alloc,
1033 struct nvk_shader **shader_out)
1034 {
1035 struct nvk_physical_device *pdev = nvk_device_physical(dev);
1036
1037 const struct vk_pipeline_robustness_state rs_none = {
1038 .uniform_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
1039 .storage_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
1040 .images = VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS_2_EXT,
1041 };
1042
1043 assert(nir->info.stage == MESA_SHADER_COMPUTE);
1044 if (nir->options == NULL)
1045 nir->options = nvk_get_nir_options(&pdev->vk, nir->info.stage, &rs_none);
1046
1047 struct vk_shader_compile_info info = {
1048 .stage = nir->info.stage,
1049 .nir = nir,
1050 .robustness = &rs_none,
1051 };
1052
1053 struct vk_shader *shader = NULL;
1054 VkResult result = nvk_compile_shader(dev, &info, NULL, alloc, &shader);
1055 if (result != VK_SUCCESS)
1056 return result;
1057
1058 *shader_out = container_of(shader, struct nvk_shader, vk);
1059
1060 return VK_SUCCESS;
1061 }
1062
1063 static VkResult
1064 nvk_compile_shaders(struct vk_device *vk_dev,
1065 uint32_t shader_count,
1066 struct vk_shader_compile_info *infos,
1067 const struct vk_graphics_pipeline_state *state,
1068 const VkAllocationCallbacks* pAllocator,
1069 struct vk_shader **shaders_out)
1070 {
1071 struct nvk_device *dev = container_of(vk_dev, struct nvk_device, vk);
1072
1073 for (uint32_t i = 0; i < shader_count; i++) {
1074 VkResult result = nvk_compile_shader(dev, &infos[i], state,
1075 pAllocator, &shaders_out[i]);
1076 if (result != VK_SUCCESS) {
1077 /* Clean up all the shaders before this point */
1078 for (uint32_t j = 0; j < i; j++)
1079 nvk_shader_destroy(&dev->vk, shaders_out[j], pAllocator);
1080
1081 /* Clean up all the NIR after this point */
1082 for (uint32_t j = i + 1; j < shader_count; j++)
1083 ralloc_free(infos[j].nir);
1084
1085 /* Memset the output array */
1086 memset(shaders_out, 0, shader_count * sizeof(*shaders_out));
1087
1088 return result;
1089 }
1090 }
1091
1092 return VK_SUCCESS;
1093 }
1094
1095 static VkResult
1096 nvk_deserialize_shader(struct vk_device *vk_dev,
1097 struct blob_reader *blob,
1098 uint32_t binary_version,
1099 const VkAllocationCallbacks* pAllocator,
1100 struct vk_shader **shader_out)
1101 {
1102 struct nvk_device *dev = container_of(vk_dev, struct nvk_device, vk);
1103 struct nvk_shader *shader;
1104 VkResult result;
1105
1106 struct nak_shader_info info;
1107 blob_copy_bytes(blob, &info, sizeof(info));
1108
1109 struct nvk_cbuf_map cbuf_map;
1110 blob_copy_bytes(blob, &cbuf_map, sizeof(cbuf_map));
1111
1112 bool sample_shading_enable;
1113 blob_copy_bytes(blob, &sample_shading_enable, sizeof(sample_shading_enable));
1114
1115 float min_sample_shading;
1116 blob_copy_bytes(blob, &min_sample_shading, sizeof(min_sample_shading));
1117
1118 const uint32_t code_size = blob_read_uint32(blob);
1119 const uint32_t data_size = blob_read_uint32(blob);
1120 if (blob->overrun)
1121 return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
1122
1123 shader = vk_shader_zalloc(&dev->vk, &nvk_shader_ops, info.stage,
1124 pAllocator, sizeof(*shader));
1125 if (shader == NULL)
1126 return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
1127
1128 shader->info = info;
1129 shader->cbuf_map = cbuf_map;
1130 shader->sample_shading_enable = sample_shading_enable;
1131 shader->min_sample_shading = min_sample_shading;
1132 shader->code_size = code_size;
1133 shader->data_size = data_size;
1134
1135 shader->code_ptr = malloc(code_size);
1136 if (shader->code_ptr == NULL) {
1137 nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
1138 return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
1139 }
1140
1141 shader->data_ptr = malloc(data_size);
1142 if (shader->data_ptr == NULL) {
1143 nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
1144 return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
1145 }
1146
1147 blob_copy_bytes(blob, (void *)shader->code_ptr, shader->code_size);
1148 blob_copy_bytes(blob, (void *)shader->data_ptr, shader->data_size);
1149 if (blob->overrun) {
1150 nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
1151 return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
1152 }
1153
1154 result = nvk_shader_upload(dev, shader);
1155 if (result != VK_SUCCESS) {
1156 nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
1157 return result;
1158 }
1159
1160 if (info.stage != MESA_SHADER_COMPUTE) {
1161 result = nvk_shader_fill_push(dev, shader, pAllocator);
1162 if (result != VK_SUCCESS) {
1163 nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
1164 return result;
1165 }
1166 }
1167
1168 *shader_out = &shader->vk;
1169
1170 return VK_SUCCESS;
1171 }
1172
1173 static bool
1174 nvk_shader_serialize(struct vk_device *vk_dev,
1175 const struct vk_shader *vk_shader,
1176 struct blob *blob)
1177 {
1178 struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);
1179
1180 /* We can't currently cache assembly */
1181 if (shader->nak != NULL && shader->nak->asm_str != NULL)
1182 return false;
1183
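/* This layout must match nvk_deserialize_shader() exactly: info, cbuf_map,
 * sample_shading_enable, min_sample_shading, code_size, data_size, then the
 * raw code and data bytes.
 */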
1184 blob_write_bytes(blob, &shader->info, sizeof(shader->info));
1185 blob_write_bytes(blob, &shader->cbuf_map, sizeof(shader->cbuf_map));
1186 blob_write_bytes(blob, &shader->sample_shading_enable,
1187 sizeof(shader->sample_shading_enable));
1188 blob_write_bytes(blob, &shader->min_sample_shading,
1189 sizeof(shader->min_sample_shading));
1190
1191 blob_write_uint32(blob, shader->code_size);
1192 blob_write_uint32(blob, shader->data_size);
1193 blob_write_bytes(blob, shader->code_ptr, shader->code_size);
1194 blob_write_bytes(blob, shader->data_ptr, shader->data_size);
1195
1196 return !blob->out_of_memory;
1197 }
1198
1199 #define WRITE_STR(field, ...) ({ \
1200 memset(field, 0, sizeof(field)); \
1201 UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__); \
1202 assert(i > 0 && i < sizeof(field)); \
1203 })
1204
1205 static VkResult
1206 nvk_shader_get_executable_properties(
1207 UNUSED struct vk_device *device,
1208 const struct vk_shader *vk_shader,
1209 uint32_t *executable_count,
1210 VkPipelineExecutablePropertiesKHR *properties)
1211 {
1212 struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);
1213 VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
1214 properties, executable_count);
1215
1216 vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
1217 props->stages = mesa_to_vk_shader_stage(shader->info.stage);
1218 props->subgroupSize = 32;
1219 WRITE_STR(props->name, "%s",
1220 _mesa_shader_stage_to_string(shader->info.stage));
1221 WRITE_STR(props->description, "%s shader",
1222 _mesa_shader_stage_to_string(shader->info.stage));
1223 }
1224
1225 return vk_outarray_status(&out);
1226 }
1227
1228 static VkResult
1229 nvk_shader_get_executable_statistics(
1230 UNUSED struct vk_device *device,
1231 const struct vk_shader *vk_shader,
1232 uint32_t executable_index,
1233 uint32_t *statistic_count,
1234 VkPipelineExecutableStatisticKHR *statistics)
1235 {
1236 struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);
1237 VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
1238 statistics, statistic_count);
1239
1240 assert(executable_index == 0);
1241
1242 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
1243 WRITE_STR(stat->name, "Instruction count");
1244 WRITE_STR(stat->description, "Number of instructions used by this shader");
1245 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
1246 stat->value.u64 = shader->info.num_instrs;
1247 }
1248
1249 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
1250 WRITE_STR(stat->name, "Code Size");
1251 WRITE_STR(stat->description,
1252 "Size of the compiled shader binary, in bytes");
1253 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
1254 stat->value.u64 = shader->code_size;
1255 }
1256
1257 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
1258 WRITE_STR(stat->name, "Number of GPRs");
1259 WRITE_STR(stat->description, "Number of GPRs used by this pipeline");
1260 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
1261 stat->value.u64 = shader->info.num_gprs;
1262 }
1263
1264 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
1265 WRITE_STR(stat->name, "SLM Size");
1266 WRITE_STR(stat->description,
1267 "Size of shader local (scratch) memory, in bytes");
1268 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
1269 stat->value.u64 = shader->info.slm_size;
1270 }
1271
1272 return vk_outarray_status(&out);
1273 }
1274
1275 static bool
1276 write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
1277 const char *data)
1278 {
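/* Standard Vulkan out-array behavior: with pData == NULL, just report the
 * required size; otherwise copy as much as fits and return false if the
 * caller's buffer was too small.
 */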
1279 ir->isText = VK_TRUE;
1280
1281 size_t data_len = strlen(data) + 1;
1282
1283 if (ir->pData == NULL) {
1284 ir->dataSize = data_len;
1285 return true;
1286 }
1287
1288 strncpy(ir->pData, data, ir->dataSize);
1289 if (ir->dataSize < data_len)
1290 return false;
1291
1292 ir->dataSize = data_len;
1293 return true;
1294 }
1295
1296 static VkResult
1297 nvk_shader_get_executable_internal_representations(
1298 UNUSED struct vk_device *device,
1299 const struct vk_shader *vk_shader,
1300 uint32_t executable_index,
1301 uint32_t *internal_representation_count,
1302 VkPipelineExecutableInternalRepresentationKHR *internal_representations)
1303 {
1304 struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);
1305 VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
1306 internal_representations,
1307 internal_representation_count);
1308 bool incomplete_text = false;
1309
1310 assert(executable_index == 0);
1311
1312 if (shader->nak != NULL && shader->nak->asm_str != NULL) {
1313 vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
1314 WRITE_STR(ir->name, "NAK assembly");
1315 WRITE_STR(ir->description, "NAK assembly");
1316 if (!write_ir_text(ir, shader->nak->asm_str))
1317 incomplete_text = true;
1318 }
1319 }
1320
1321 return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
1322 }
1323
1324 static const struct vk_shader_ops nvk_shader_ops = {
1325 .destroy = nvk_shader_destroy,
1326 .serialize = nvk_shader_serialize,
1327 .get_executable_properties = nvk_shader_get_executable_properties,
1328 .get_executable_statistics = nvk_shader_get_executable_statistics,
1329 .get_executable_internal_representations =
1330 nvk_shader_get_executable_internal_representations,
1331 };
1332
1333 const struct vk_device_shader_ops nvk_device_shader_ops = {
1334 .get_nir_options = nvk_get_nir_options,
1335 .get_spirv_options = nvk_get_spirv_options,
1336 .preprocess_nir = nvk_preprocess_nir,
1337 .hash_graphics_state = nvk_hash_graphics_state,
1338 .compile = nvk_compile_shaders,
1339 .deserialize = nvk_deserialize_shader,
1340 .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
1341 .cmd_bind_shaders = nvk_cmd_bind_shaders,
1342 };
1343