1 /*
2 * Copyright © 2021 Collabora Ltd.
3 *
4 * Derived from tu_shader.c which is:
5 * Copyright © 2019 Google LLC
6 *
7 * Also derived from anv_pipeline.c which is
8 * Copyright © 2015 Intel Corporation
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a
11 * copy of this software and associated documentation files (the "Software"),
12 * to deal in the Software without restriction, including without limitation
13 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
14 * and/or sell copies of the Software, and to permit persons to whom the
15 * Software is furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice (including the next
18 * paragraph) shall be included in all copies or substantial portions of the
19 * Software.
20 *
21 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 * DEALINGS IN THE SOFTWARE.
28 */
29
30 #include "genxml/gen_macros.h"
31
32 #include "panvk_cmd_buffer.h"
33 #include "panvk_device.h"
34 #include "panvk_instance.h"
35 #include "panvk_mempool.h"
36 #include "panvk_physical_device.h"
37 #include "panvk_shader.h"
38
39 #include "spirv/nir_spirv.h"
40 #include "util/memstream.h"
41 #include "util/mesa-sha1.h"
42 #include "util/u_dynarray.h"
43 #include "nir_builder.h"
44 #include "nir_conversion_builder.h"
45 #include "nir_deref.h"
46
47 #include "vk_graphics_state.h"
48 #include "vk_shader_module.h"
49
50 #include "compiler/bifrost_nir.h"
51 #include "pan_shader.h"
52
53 #include "vk_log.h"
54 #include "vk_pipeline.h"
55 #include "vk_pipeline_layout.h"
56 #include "vk_shader.h"
57 #include "vk_util.h"
58
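/* A short descriptive note (assumption: load_sysval() reads from the driver-managed
 * sysval FAU/push-constant area): lower panvk sysval intrinsics (workgroup info,
 * viewport transform, first vertex, base instance, ...) to such loads. */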
59 static bool
60 panvk_lower_sysvals(nir_builder *b, nir_instr *instr, void *data)
61 {
62 if (instr->type != nir_instr_type_intrinsic)
63 return false;
64
65 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
66 unsigned bit_size = intr->def.bit_size;
67 nir_def *val = NULL;
68 b->cursor = nir_before_instr(instr);
69
70 switch (intr->intrinsic) {
71 case nir_intrinsic_load_base_workgroup_id:
72 val = load_sysval(b, compute, bit_size, base);
73 break;
74 case nir_intrinsic_load_num_workgroups:
75 val = load_sysval(b, compute, bit_size, num_work_groups);
76 break;
77 case nir_intrinsic_load_workgroup_size:
78 val = load_sysval(b, compute, bit_size, local_group_size);
79 break;
80 case nir_intrinsic_load_viewport_scale:
81 val = load_sysval(b, graphics, bit_size, viewport.scale);
82 break;
83 case nir_intrinsic_load_viewport_offset:
84 val = load_sysval(b, graphics, bit_size, viewport.offset);
85 break;
86 case nir_intrinsic_load_first_vertex:
87 val = load_sysval(b, graphics, bit_size, vs.first_vertex);
88 break;
89 case nir_intrinsic_load_base_instance:
90 val = load_sysval(b, graphics, bit_size, vs.base_instance);
91 break;
92 case nir_intrinsic_load_noperspective_varyings_pan:
93 /* TODO: use a VS epilog specialized on constant noperspective_varyings
94 * with VK_EXT_graphics_pipeline_library and VK_EXT_shader_object */
95 assert(b->shader->info.stage == MESA_SHADER_VERTEX);
96 val = load_sysval(b, graphics, bit_size, vs.noperspective_varyings);
97 break;
98
99 #if PAN_ARCH <= 7
100 case nir_intrinsic_load_raw_vertex_offset_pan:
101 val = load_sysval(b, graphics, bit_size, vs.raw_vertex_offset);
102 break;
103 case nir_intrinsic_load_layer_id:
104 assert(b->shader->info.stage == MESA_SHADER_FRAGMENT);
105 val = load_sysval(b, graphics, bit_size, layer_id);
106 break;
107 #endif
108
109 case nir_intrinsic_load_draw_id:
110 /* TODO: We only implement single-draw direct and indirect draws, so this
111 * is sufficient. We'll revisit this when we get around to implementing
112 * multidraw. */
113 assert(b->shader->info.stage == MESA_SHADER_VERTEX);
114 val = nir_imm_int(b, 0);
115 break;
116
117 default:
118 return false;
119 }
120
121 assert(val->num_components == intr->def.num_components);
122
123 b->cursor = nir_after_instr(instr);
124 nir_def_rewrite_uses(&intr->def, val);
125 return true;
126 }
127
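/* Vertex attributes are fetched explicitly: turn load_input into
 * load_attribute_pan, feeding it the vertex/instance IDs the attribute
 * hardware expects on each architecture (raw vertex ID on Bifrost,
 * instance ID biased by the base instance on Valhall). */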
128 static bool
129 panvk_lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin,
130 UNUSED void *data)
131 {
132 if (intrin->intrinsic != nir_intrinsic_load_input)
133 return false;
134
135 b->cursor = nir_before_instr(&intrin->instr);
136 nir_def *ld_attr = nir_load_attribute_pan(
137 b, intrin->def.num_components, intrin->def.bit_size,
138 PAN_ARCH <= 7 ?
139 nir_load_raw_vertex_id_pan(b) :
140 nir_load_vertex_id(b),
141 PAN_ARCH >= 9 ?
142 nir_iadd(b, nir_load_instance_id(b), nir_load_base_instance(b)) :
143 nir_load_instance_id(b),
144 nir_get_io_offset_src(intrin)->ssa,
145 .base = nir_intrinsic_base(intrin),
146 .component = nir_intrinsic_component(intrin),
147 .dest_type = nir_intrinsic_dest_type(intrin));
148 nir_def_replace(&intrin->def, ld_attr);
149
150 return true;
151 }
152
153 #if PAN_ARCH <= 7
154 static bool
155 lower_gl_pos_layer_writes(nir_builder *b, nir_instr *instr, void *data)
156 {
157 if (instr->type != nir_instr_type_intrinsic)
158 return false;
159
160 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
161
162 if (intr->intrinsic != nir_intrinsic_copy_deref)
163 return false;
164
165 nir_variable *dst_var = nir_intrinsic_get_var(intr, 0);
166 nir_variable *src_var = nir_intrinsic_get_var(intr, 1);
167
168 if (!dst_var || dst_var->data.mode != nir_var_shader_out || !src_var ||
169 src_var->data.mode != nir_var_shader_temp)
170 return false;
171
172 if (dst_var->data.location == VARYING_SLOT_LAYER) {
173 /* We don't really write the layer, we just make sure primitives are
174 * discarded if gl_Layer doesn't match the layer passed to the draw.
175 */
176 b->cursor = nir_instr_remove(instr);
177 return true;
178 }
179
180 if (dst_var->data.location == VARYING_SLOT_POS) {
181 nir_variable *temp_layer_var = data;
182 nir_variable *temp_pos_var = src_var;
183
184 b->cursor = nir_before_instr(instr);
185 nir_def *layer = nir_load_var(b, temp_layer_var);
186 nir_def *pos = nir_load_var(b, temp_pos_var);
187 nir_def *inf_pos = nir_imm_vec4(b, INFINITY, INFINITY, INFINITY, 1.0f);
188 nir_def *ref_layer = load_sysval(b, graphics, 32, layer_id);
189
190 nir_store_var(b, temp_pos_var,
191 nir_bcsel(b, nir_ieq(b, layer, ref_layer), pos, inf_pos),
192 0xf);
193 return true;
194 }
195
196 return false;
197 }
198
199 static bool
200 lower_layer_writes(nir_shader *nir)
201 {
202 if (nir->info.stage == MESA_SHADER_FRAGMENT)
203 return false;
204
205 nir_variable *temp_layer_var = NULL;
206 bool has_layer_var = false;
207
208 nir_foreach_variable_with_modes(var, nir,
209 nir_var_shader_out | nir_var_shader_temp) {
210 if (var->data.mode == nir_var_shader_out &&
211 var->data.location == VARYING_SLOT_LAYER)
212 has_layer_var = true;
213
214 if (var->data.mode == nir_var_shader_temp &&
215 var->data.location == VARYING_SLOT_LAYER)
216 temp_layer_var = var;
217 }
218
219 if (!has_layer_var)
220 return false;
221
222 assert(temp_layer_var);
223
224 return nir_shader_instructions_pass(nir, lower_gl_pos_layer_writes,
225 nir_metadata_control_flow,
226 temp_layer_var);
227 }
228 #endif
229
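/* Size/alignment callback for nir_lower_vars_to_explicit_types(): vectors get
 * their natural size, vec3 is padded to vec4 alignment, and booleans are
 * treated as 32-bit. */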
230 static void
231 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
232 {
233 assert(glsl_type_is_vector_or_scalar(type));
234
235 uint32_t comp_size =
236 glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
237 unsigned length = glsl_get_vector_elements(type);
238 *size = comp_size * length, *align = comp_size * (length == 3 ? 4 : length);
239 }
240
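/* On Bifrost (v7 and earlier), buffers are addressed with a single 32-bit
 * index + offset. On Valhall we keep a two-component index around so the
 * descriptor lowering can bounds-check arrayed descriptors; it is folded back
 * into the single index the backend expects by valhall_pack_buf_idx(). */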
241 static inline nir_address_format
242 panvk_buffer_ubo_addr_format(VkPipelineRobustnessBufferBehaviorEXT robustness)
243 {
244 switch (robustness) {
245 case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
246 case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
247 case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
248 return PAN_ARCH <= 7 ? nir_address_format_32bit_index_offset
249 : nir_address_format_vec2_index_32bit_offset;
250 default:
251 unreachable("Invalid robust buffer access behavior");
252 }
253 }
254
255 static inline nir_address_format
256 panvk_buffer_ssbo_addr_format(VkPipelineRobustnessBufferBehaviorEXT robustness)
257 {
258 switch (robustness) {
259 case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
260 return PAN_ARCH <= 7 ? nir_address_format_64bit_global_32bit_offset
261 : nir_address_format_vec2_index_32bit_offset;
262 case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
263 case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
264 return PAN_ARCH <= 7 ? nir_address_format_64bit_bounded_global
265 : nir_address_format_vec2_index_32bit_offset;
266 default:
267 unreachable("Invalid robust buffer access behavior");
268 }
269 }
270
271 static const nir_shader_compiler_options *
272 panvk_get_nir_options(UNUSED struct vk_physical_device *vk_pdev,
273 UNUSED gl_shader_stage stage,
274 UNUSED const struct vk_pipeline_robustness_state *rs)
275 {
276 return GENX(pan_shader_get_compiler_options)();
277 }
278
279 static struct spirv_to_nir_options
280 panvk_get_spirv_options(UNUSED struct vk_physical_device *vk_pdev,
281 UNUSED gl_shader_stage stage,
282 const struct vk_pipeline_robustness_state *rs)
283 {
284 return (struct spirv_to_nir_options){
285 .ubo_addr_format = panvk_buffer_ubo_addr_format(rs->uniform_buffers),
286 .ssbo_addr_format = panvk_buffer_ssbo_addr_format(rs->storage_buffers),
287 .phys_ssbo_addr_format = nir_address_format_64bit_global,
288 };
289 }
290
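/* NIR preprocessing hook: early lowering run right after SPIR-V -> NIR
 * translation, before the panvk-specific lowering in panvk_lower_nir(). */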
291 static void
292 panvk_preprocess_nir(UNUSED struct vk_physical_device *vk_pdev, nir_shader *nir)
293 {
294 /* Group together output variables that share the same location */
295 if (nir->info.stage == MESA_SHADER_FRAGMENT)
296 NIR_PASS(_, nir, nir_lower_io_to_vector, nir_var_shader_out);
297
298 NIR_PASS(_, nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir),
299 true, true);
300
301 #if PAN_ARCH <= 7
302 /* This needs to be done just after the io_to_temporaries pass, because we
303 * rely on in/out temporaries to collect the final layer_id value. */
304 NIR_PASS(_, nir, lower_layer_writes);
305 #endif
306
307 NIR_PASS(_, nir, nir_lower_indirect_derefs,
308 nir_var_shader_in | nir_var_shader_out, UINT32_MAX);
309
310 NIR_PASS(_, nir, nir_opt_copy_prop_vars);
311 NIR_PASS(_, nir, nir_opt_combine_stores, nir_var_all);
312 NIR_PASS(_, nir, nir_opt_loop);
313
314 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
315 struct nir_input_attachment_options lower_input_attach_opts = {
316 .use_fragcoord_sysval = true,
317 .use_layer_id_sysval = true,
318 };
319
320 NIR_PASS(_, nir, nir_lower_input_attachments, &lower_input_attach_opts);
321 }
322
323 /* Do texture lowering here. Yes, it's a duplication of the texture
324 * lowering in bifrost_compile. However, we need to lower texture stuff
325 * now, before we call panvk_per_arch(nir_lower_descriptors)() because some
326 * of the texture lowering generates nir_texop_txs which we handle as part
327 * of descriptor lowering.
328 *
329 * TODO: We really should be doing this in common code, not duplicated in
330 * panvk. In order to do that, we need to rework the panfrost compile
331 * flow to look more like the Intel flow:
332 *
333 * 1. Compile SPIR-V to NIR and maybe do a tiny bit of lowering that needs
334 * to be done really early.
335 *
336 * 2. pan_preprocess_nir: Does common lowering and runs the optimization
337 * loop. Nothing here should be API-specific.
338 *
339 * 3. Do additional lowering in panvk
340 *
341 * 4. pan_postprocess_nir: Does final lowering and runs the optimization
342 * loop again. This can happen as part of the final compile.
343 *
344 * This would give us a better place to do panvk-specific lowering.
345 */
346 nir_lower_tex_options lower_tex_options = {
347 .lower_txs_lod = true,
348 .lower_txp = ~0,
349 .lower_tg4_broadcom_swizzle = true,
350 .lower_txd_cube_map = true,
351 .lower_invalid_implicit_lod = true,
352 };
353 NIR_PASS(_, nir, nir_lower_tex, &lower_tex_options);
354 NIR_PASS(_, nir, nir_lower_system_values);
355
356 nir_lower_compute_system_values_options options = {
357 .has_base_workgroup_id = true,
358 };
359
360 NIR_PASS(_, nir, nir_lower_compute_system_values, &options);
361
362 if (nir->info.stage == MESA_SHADER_FRAGMENT)
363 NIR_PASS(_, nir, nir_lower_wpos_center);
364
365 NIR_PASS(_, nir, nir_split_var_copies);
366 NIR_PASS(_, nir, nir_lower_var_copies);
367 }
368
369 static void
370 panvk_hash_graphics_state(struct vk_physical_device *device,
371 const struct vk_graphics_pipeline_state *state,
372 VkShaderStageFlags stages, blake3_hash blake3_out)
373 {
374 struct mesa_blake3 blake3_ctx;
375 _mesa_blake3_init(&blake3_ctx);
376
377 /* This doesn't impact the shader compile but it does go in the
378 * panvk_shader and gets [de]serialized along with the binary so
379 * we need to hash it.
380 */
381 bool sample_shading_enable = state->ms && state->ms->sample_shading_enable;
382 _mesa_blake3_update(&blake3_ctx, &sample_shading_enable,
383 sizeof(sample_shading_enable));
384
385 _mesa_blake3_update(&blake3_ctx, &state->rp->view_mask,
386 sizeof(state->rp->view_mask));
387
388 _mesa_blake3_final(&blake3_ctx, blake3_out);
389 }
390
391 #if PAN_ARCH >= 9
392 static bool
393 valhall_pack_buf_idx(nir_builder *b, nir_instr *instr, UNUSED void *data)
394 {
395 if (instr->type != nir_instr_type_intrinsic)
396 return false;
397
398 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
399 unsigned index_src;
400
401 switch (intrin->intrinsic) {
402 case nir_intrinsic_load_ubo:
403 case nir_intrinsic_load_ssbo:
404 case nir_intrinsic_ssbo_atomic:
405 case nir_intrinsic_ssbo_atomic_swap:
406 index_src = 0;
407 break;
408
409 case nir_intrinsic_store_ssbo:
410 index_src = 1;
411 break;
412
413 default:
414 return false;
415 }
416
417 nir_def *index = intrin->src[index_src].ssa;
418
419 /* The descriptor lowering pass can add UBO loads, and those already have the
420 * right index format. */
421 if (index->num_components == 1)
422 return false;
423
424 b->cursor = nir_before_instr(&intrin->instr);
425
426 /* The Valhall backend expects nir_address_format_32bit_index_offset,
427 * but we use nir_address_format_vec2_index_32bit_offset as the address
428 * format so we can keep the array size, set and index without losing
429 * information while walking the descriptor deref chain (needed to do a
430 * bounds check on the array index when we reach the end of the chain).
431 * Turn it back into nir_address_format_32bit_index_offset once IO has
432 * been lowered. */
433 nir_def *packed_index =
434 nir_iadd(b, nir_channel(b, index, 0), nir_channel(b, index, 1));
435 nir_src_rewrite(&intrin->src[index_src], packed_index);
436 return true;
437 }
438 #endif
439
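/* Lower get_ssbo_size to a UBO load from the SSBO descriptor itself: the size
 * is stored 4 bytes into the descriptor, which we reach through the resource
 * table encoded in the buffer index. */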
440 static bool
441 valhall_lower_get_ssbo_size(struct nir_builder *b,
442 nir_intrinsic_instr *intr, void *data)
443 {
444 if (intr->intrinsic != nir_intrinsic_get_ssbo_size)
445 return false;
446
447 b->cursor = nir_before_instr(&intr->instr);
448
449 nir_def *table_idx =
450 nir_ushr_imm(b, nir_channel(b, intr->src[0].ssa, 0), 24);
451 nir_def *res_table = nir_ior_imm(b, table_idx, pan_res_handle(62, 0));
452 nir_def *buf_idx = nir_channel(b, intr->src[0].ssa, 1);
453 nir_def *desc_offset = nir_imul_imm(b, buf_idx, PANVK_DESCRIPTOR_SIZE);
454 nir_def *size = nir_load_ubo(
455 b, 1, 32, res_table, nir_iadd_imm(b, desc_offset, 4), .range = ~0u,
456 .align_mul = PANVK_DESCRIPTOR_SIZE, .align_offset = 4);
457
458 nir_def_replace(&intr->def, size);
459 return true;
460 }
461
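/* First pass of the push-constant packing: record which sysval and user
 * push-constant ranges are actually read, so move_push_constant() can remap
 * them to a compact FAU layout afterwards. */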
462 static bool
463 collect_push_constant(struct nir_builder *b, nir_intrinsic_instr *intr,
464 void *data)
465 {
466 if (intr->intrinsic != nir_intrinsic_load_push_constant)
467 return false;
468
469 struct panvk_shader *shader = data;
470 uint32_t base = nir_intrinsic_base(intr);
471 bool is_sysval = base >= SYSVALS_PUSH_CONST_BASE;
472 uint32_t offset, size;
473
474 /* Sysvals should have a constant offset. */
475 assert(!is_sysval || nir_src_is_const(intr->src[0]));
476
477 if (is_sysval)
478 base -= SYSVALS_PUSH_CONST_BASE;
479
480 /* If the offset is dynamic, we need to flag [base:base+range] as used, to
481 * allow global mem access. */
482 if (!nir_src_is_const(intr->src[0])) {
483 offset = base;
484 size = nir_intrinsic_range(intr);
485
486 /* Flag the push_consts sysval as needed if we have an indirect offset. */
487 if (b->shader->info.stage == MESA_SHADER_COMPUTE)
488 shader_use_sysval(shader, compute, push_consts);
489 else
490 shader_use_sysval(shader, graphics, push_consts);
491 } else {
492 offset = base + nir_src_as_uint(intr->src[0]);
493 size = (intr->def.bit_size / 8) * intr->def.num_components;
494 }
495
496 if (is_sysval)
497 shader_use_sysval_range(shader, offset, size);
498 else
499 shader_use_push_const_range(shader, offset, size);
500
501 return true;
502 }
503
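/* Second pass of the push-constant packing: rewrite load_push_constant
 * offsets to the remapped FAU layout (sysvals first, user push constants
 * next). Loads with a dynamic offset are turned into global loads from the
 * push-constant buffer instead. */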
504 static bool
505 move_push_constant(struct nir_builder *b, nir_intrinsic_instr *intr, void *data)
506 {
507 if (intr->intrinsic != nir_intrinsic_load_push_constant)
508 return false;
509
510 struct panvk_shader *shader = data;
511 unsigned base = nir_intrinsic_base(intr);
512 bool is_sysval = base >= SYSVALS_PUSH_CONST_BASE;
513
514 if (is_sysval)
515 base -= SYSVALS_PUSH_CONST_BASE;
516
517 /* Sysvals should have a constant offset. */
518 assert(!is_sysval || nir_src_is_const(intr->src[0]));
519
520 b->cursor = nir_before_instr(&intr->instr);
521
522 if (nir_src_is_const(intr->src[0])) {
523 unsigned offset = base + nir_src_as_uint(intr->src[0]);
524
525 /* We place the sysvals first, followed by the user push constants.
526 * We do that so we always have the blend constants at offset 0 for
527 * blend shaders. */
528 if (is_sysval)
529 offset = shader_remapped_sysval_offset(shader, offset);
530 else
531 offset = shader_remapped_push_const_offset(shader, offset);
532
533 nir_src_rewrite(&intr->src[0], nir_imm_int(b, offset));
534
535 /* We always set the base/range to zero, to make sure no pass relies on
536 * them after this point. */
537 nir_intrinsic_set_base(intr, 0);
538 nir_intrinsic_set_range(intr, 0);
539 } else {
540 /* We don't use load_sysval() on purpose, because it would set
541 * .base=SYSVALS_PUSH_CONST_BASE, and we're supposed to force a base of
542 * zero in this pass. */
543 unsigned push_const_buf_offset = shader_remapped_sysval_offset(
544 shader, b->shader->info.stage == MESA_SHADER_COMPUTE
545 ? sysval_offset(compute, push_consts)
546 : sysval_offset(graphics, push_consts));
547 nir_def *push_const_buf = nir_load_push_constant(
548 b, 1, 64, nir_imm_int(b, push_const_buf_offset));
549 unsigned push_const_offset =
550 shader_remapped_fau_offset(shader, push_consts, base);
551 nir_def *offset = nir_iadd_imm(b, intr->src[0].ssa, push_const_offset);
552 unsigned align = nir_combined_align(nir_intrinsic_align_mul(intr),
553 nir_intrinsic_align_offset(intr));
554
555 /* We assume at most 64-bit alignment for packed push constants. */
556 align = MIN2(align, FAU_WORD_SIZE);
557 nir_def *value =
558 nir_load_global(b, nir_iadd(b, push_const_buf, nir_u2u64(b, offset)),
559 align, intr->def.num_components, intr->def.bit_size);
560
561 nir_def_replace(&intr->def, value);
562 }
563
564 return true;
565 }
566
567 static void
568 lower_load_push_consts(nir_shader *nir, struct panvk_shader *shader)
569 {
570 /* Before we lower load_push_constant()s with a dynamic offset to global
571 * loads, we want to run a few optimization passes to get rid of offset
572 * calculations involving only constant values. */
573 bool progress = false;
574 do {
575 progress = false;
576 NIR_PASS(progress, nir, nir_copy_prop);
577 NIR_PASS(progress, nir, nir_opt_remove_phis);
578 NIR_PASS(progress, nir, nir_opt_dce);
579 NIR_PASS(progress, nir, nir_opt_dead_cf);
580 NIR_PASS(progress, nir, nir_opt_cse);
581 NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
582 NIR_PASS(progress, nir, nir_opt_algebraic);
583 NIR_PASS(progress, nir, nir_opt_constant_folding);
584 } while (progress);
585
586 /* We always reserve the 4 blend constant words for fragment shaders,
587 * because we don't know the blend configuration at this point, and
588 * we might end up with a blend shader reading those blend constants. */
589 if (shader->vk.stage == MESA_SHADER_FRAGMENT) {
590 /* We rely on blend constants being placed first and covering 4 words. */
591 STATIC_ASSERT(
592 offsetof(struct panvk_graphics_sysvals, blend.constants) == 0 &&
593 sizeof(((struct panvk_graphics_sysvals *)NULL)->blend.constants) ==
594 16);
595
596 shader_use_sysval(shader, graphics, blend.constants);
597 }
598
599 progress = false;
600 NIR_PASS(progress, nir, nir_shader_intrinsics_pass, collect_push_constant,
601 nir_metadata_all, shader);
602
603 /* Some load_push_constant instructions might be eliminated after
604 * scalarization+dead-code-elimination. Since these passes happen in
605 * bifrost_compile(), we can't run the push_constant packing after those
606 * optimizations took place, so we keep our own FAU count instead of
607 * using info.push.count, to keep it consistent with the
608 * used_{sysvals,push_consts} bitmaps, even if that sometimes means loading
609 * more than we really need. This also accounts for the fact that
610 * blend constants are never loaded from the fragment shader, but might be
611 * needed in the blend shader. */
612 shader->fau.sysval_count = BITSET_COUNT(shader->fau.used_sysvals);
613 shader->fau.total_count =
614 shader->fau.sysval_count + BITSET_COUNT(shader->fau.used_push_consts);
615
616 if (!progress)
617 return;
618
619 NIR_PASS(_, nir, nir_shader_intrinsics_pass, move_push_constant,
620 nir_metadata_control_flow, shader);
621 }
622
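/* Run all the panvk-specific lowering on the NIR: multiview, descriptor
 * lowering, explicit IO, shared memory, IO location assignment, sysvals and
 * push-constant packing. Expected to run before panvk_compile_nir(). */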
623 static void
624 panvk_lower_nir(struct panvk_device *dev, nir_shader *nir,
625 uint32_t set_layout_count,
626 struct vk_descriptor_set_layout *const *set_layouts,
627 const struct vk_pipeline_robustness_state *rs,
628 uint32_t *noperspective_varyings,
629 const struct panfrost_compile_inputs *compile_input,
630 struct panvk_shader *shader)
631 {
632 struct panvk_instance *instance =
633 to_panvk_instance(dev->vk.physical->instance);
634 gl_shader_stage stage = nir->info.stage;
635
636 #if PAN_ARCH >= 10
637 if (stage == MESA_SHADER_VERTEX && compile_input->view_mask) {
638 nir_lower_multiview_options options = {
639 .view_mask = compile_input->view_mask,
640 .allowed_per_view_outputs = ~0
641 };
642 /* The only case where this should fail is with memory/image writes,
643 * which we don't support in vertex shaders */
644 assert(nir_can_lower_multiview(nir, options));
645 NIR_PASS(_, nir, nir_lower_multiview, options);
646 /* Pull output writes out of the loop and give them constant offsets for
647 * pan_lower_store_components */
648 NIR_PASS(_, nir, nir_lower_io_to_temporaries,
649 nir_shader_get_entrypoint(nir), true, false);
650 }
651 #endif
652
653 /* Lower input intrinsics for fragment shaders early to get the max
654 * number of varying loads, as this number is required during descriptor
655 * lowering for v9+. */
656 if (stage == MESA_SHADER_FRAGMENT) {
657 nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs,
658 stage);
659 #if PAN_ARCH >= 9
660 shader->desc_info.max_varying_loads = nir->num_inputs;
661 #endif
662 }
663
664 panvk_per_arch(nir_lower_descriptors)(nir, dev, rs, set_layout_count,
665 set_layouts, shader);
666
667 NIR_PASS(_, nir, nir_split_var_copies);
668 NIR_PASS(_, nir, nir_lower_var_copies);
669
670 NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
671 panvk_buffer_ubo_addr_format(rs->uniform_buffers));
672 NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo,
673 panvk_buffer_ssbo_addr_format(rs->storage_buffers));
674 NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const,
675 nir_address_format_32bit_offset);
676 NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global,
677 nir_address_format_64bit_global);
678
679 #if PAN_ARCH >= 9
680 NIR_PASS(_, nir, nir_shader_intrinsics_pass, valhall_lower_get_ssbo_size,
681 nir_metadata_control_flow, NULL);
682 NIR_PASS(_, nir, nir_shader_instructions_pass, valhall_pack_buf_idx,
683 nir_metadata_control_flow, NULL);
684 #endif
685
686 if (gl_shader_stage_uses_workgroup(stage)) {
687 if (!nir->info.shared_memory_explicit_layout) {
688 NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared,
689 shared_type_info);
690 }
691
692 NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared,
693 nir_address_format_32bit_offset);
694 }
695
696 if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
697 /* Align everything up to 16 bytes to take advantage of load/store
698 * vectorization. */
699 nir->info.shared_size = align(nir->info.shared_size, 16);
700 NIR_PASS(_, nir, nir_zero_initialize_shared_memory, nir->info.shared_size,
701 16);
702
703 /* We need to call lower_compute_system_values again because
704 * nir_zero_initialize_shared_memory generates load_invocation_id which
705 * has to be lowered to load_invocation_index.
706 */
707 NIR_PASS(_, nir, nir_lower_compute_system_values, NULL);
708 }
709
710 if (stage == MESA_SHADER_VERTEX) {
711 /* We need the driver_location to match the vertex attribute location,
712 * so we can use the attribute layout described by
713 * vk_vertex_input_state even when there are holes in the attribute locations.
714 */
715 nir_foreach_shader_in_variable(var, nir) {
716 assert(var->data.location >= VERT_ATTRIB_GENERIC0 &&
717 var->data.location <= VERT_ATTRIB_GENERIC15);
718 var->data.driver_location = var->data.location - VERT_ATTRIB_GENERIC0;
719 }
720 } else if (stage != MESA_SHADER_FRAGMENT) {
721 /* Input varyings in fragment shader have been lowered early. */
722 nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs,
723 stage);
724 }
725
726 nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs,
727 stage);
728
729 /* Needed to turn shader_temp into function_temp since the backend only
730 * handles the latter for now.
731 */
732 NIR_PASS(_, nir, nir_lower_global_vars_to_local);
733
734 nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
735 if (unlikely(instance->debug_flags & PANVK_DEBUG_NIR)) {
736 fprintf(stderr, "translated nir:\n");
737 nir_print_shader(nir, stderr);
738 }
739
740 pan_shader_preprocess(nir, compile_input->gpu_id);
741
742 if (stage == MESA_SHADER_VERTEX)
743 NIR_PASS(_, nir, nir_shader_intrinsics_pass, panvk_lower_load_vs_input,
744 nir_metadata_control_flow, NULL);
745
746 /* Since Valhall, panvk_per_arch(nir_lower_descriptors)() separates the
747 * driver set from the user sets, so pan_lower_image_index() isn't needed.
748 */
749 if (PAN_ARCH < 9 && stage == MESA_SHADER_VERTEX)
750 NIR_PASS(_, nir, pan_lower_image_index, MAX_VS_ATTRIBS);
751
752 if (noperspective_varyings && stage == MESA_SHADER_VERTEX)
753 NIR_PASS(_, nir, pan_nir_lower_static_noperspective,
754 *noperspective_varyings);
755
756 NIR_PASS(_, nir, nir_shader_instructions_pass, panvk_lower_sysvals,
757 nir_metadata_control_flow, NULL);
758
759 lower_load_push_consts(nir, shader);
760 }
761
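/* Hand the lowered NIR to the backend compiler, keep a copy of the resulting
 * binary, and optionally capture the NIR/assembly dumps used for
 * VK_KHR_pipeline_executable_properties. */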
762 static VkResult
763 panvk_compile_nir(struct panvk_device *dev, nir_shader *nir,
764 VkShaderCreateFlagsEXT shader_flags,
765 struct panfrost_compile_inputs *compile_input,
766 struct panvk_shader *shader)
767 {
768 const bool dump_asm =
769 shader_flags & VK_SHADER_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_MESA;
770
771 struct util_dynarray binary;
772 util_dynarray_init(&binary, NULL);
773 GENX(pan_shader_compile)(nir, compile_input, &binary, &shader->info);
774
775 void *bin_ptr = util_dynarray_element(&binary, uint8_t, 0);
776 unsigned bin_size = util_dynarray_num_elements(&binary, uint8_t);
777
778 shader->bin_size = 0;
779 shader->bin_ptr = NULL;
780
781 if (bin_size) {
782 void *data = malloc(bin_size);
783
784 if (data == NULL)
785 return panvk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
786
787 memcpy(data, bin_ptr, bin_size);
788 shader->bin_size = bin_size;
789 shader->bin_ptr = data;
790 }
791 util_dynarray_fini(&binary);
792
793 if (dump_asm) {
794 shader->nir_str = nir_shader_as_str(nir, NULL);
795
796 char *data = NULL;
797 size_t disasm_size = 0;
798
799 if (shader->bin_size) {
800 struct u_memstream mem;
801 if (u_memstream_open(&mem, &data, &disasm_size)) {
802 FILE *const stream = u_memstream_get(&mem);
803 pan_shader_disassemble(stream, shader->bin_ptr, shader->bin_size,
804 compile_input->gpu_id, false);
805 u_memstream_close(&mem);
806 }
807 }
808
809 char *asm_str = malloc(disasm_size + 1);
810 memcpy(asm_str, data, disasm_size);
811 asm_str[disasm_size] = '\0';
812 free(data);
813
814 shader->asm_str = asm_str;
815 }
816
817 #if PAN_ARCH <= 7
818 /* Patch the descriptor count */
819 shader->info.ubo_count =
820 shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_UBO] +
821 shader->desc_info.dyn_ubos.count;
822 shader->info.texture_count =
823 shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_TEXTURE];
824 shader->info.sampler_count =
825 shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_SAMPLER];
826
827 /* Dummy sampler. */
828 if (!shader->info.sampler_count && shader->info.texture_count)
829 shader->info.sampler_count++;
830
831 if (nir->info.stage == MESA_SHADER_VERTEX) {
832 /* We leave holes in the attribute locations, but pan_shader.c assumes the
833 * opposite. Patch attribute_count accordingly, so
834 * pan_shader_prepare_rsd() does what we expect.
835 */
836 uint32_t gen_attribs =
837 (shader->info.attributes_read & VERT_BIT_GENERIC_ALL) >>
838 VERT_ATTRIB_GENERIC0;
839
840 shader->info.attribute_count = util_last_bit(gen_attribs);
841
842 /* NULL IDVS shaders are not allowed. */
843 if (!bin_size)
844 shader->info.vs.idvs = false;
845 }
846
847 /* Image attributes start at MAX_VS_ATTRIBS in the VS attribute table,
848 * and zero in other stages.
849 */
850 if (shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_IMG] > 0)
851 shader->info.attribute_count =
852 shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_IMG] +
853 (nir->info.stage == MESA_SHADER_VERTEX ? MAX_VS_ATTRIBS : 0);
854 #endif
855
856 shader->local_size.x = nir->info.workgroup_size[0];
857 shader->local_size.y = nir->info.workgroup_size[1];
858 shader->local_size.z = nir->info.workgroup_size[2];
859
860 return VK_SUCCESS;
861 }
862
863 #if PAN_ARCH >= 9
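/* Translate the FP32/FP16 flush-to-zero flags reported by the compiler into
 * the combined flush-to-zero mode expected by the shader program descriptor. */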
864 static enum mali_flush_to_zero_mode
865 shader_ftz_mode(struct panvk_shader *shader)
866 {
867 if (shader->info.ftz_fp32) {
868 if (shader->info.ftz_fp16)
869 return MALI_FLUSH_TO_ZERO_MODE_ALWAYS;
870 else
871 return MALI_FLUSH_TO_ZERO_MODE_DX11;
872 } else {
873 /* We don't have a "flush FP16, preserve FP32" mode, but APIs
874 * should not be able to generate that.
875 */
876 assert(!shader->info.ftz_fp16 && !shader->info.ftz_fp32);
877 return MALI_FLUSH_TO_ZERO_MODE_PRESERVE_SUBNORMALS;
878 }
879 }
880 #endif
881
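/* Upload the shader binary to executable memory and emit the static hardware
 * descriptors: a renderer state descriptor on Bifrost (fragment shaders are
 * skipped here), shader program descriptors on Valhall, with separate
 * position/varying variants for IDVS vertex shaders. */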
882 static VkResult
883 panvk_shader_upload(struct panvk_device *dev, struct panvk_shader *shader,
884 const VkAllocationCallbacks *pAllocator)
885 {
886 shader->code_mem = (struct panvk_priv_mem){0};
887
888 #if PAN_ARCH <= 7
889 shader->rsd = (struct panvk_priv_mem){0};
890 #else
891 shader->spd = (struct panvk_priv_mem){0};
892 #endif
893
894 if (!shader->bin_size)
895 return VK_SUCCESS;
896
897 shader->code_mem = panvk_pool_upload_aligned(
898 &dev->mempools.exec, shader->bin_ptr, shader->bin_size, 128);
899 if (!panvk_priv_mem_dev_addr(shader->code_mem))
900 return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
901
902 #if PAN_ARCH <= 7
903 if (shader->info.stage == MESA_SHADER_FRAGMENT)
904 return VK_SUCCESS;
905
906 shader->rsd = panvk_pool_alloc_desc(&dev->mempools.rw, RENDERER_STATE);
907 if (!panvk_priv_mem_dev_addr(shader->rsd))
908 return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
909
910 pan_cast_and_pack(panvk_priv_mem_host_addr(shader->rsd), RENDERER_STATE,
911 cfg) {
912 pan_shader_prepare_rsd(&shader->info, panvk_shader_get_dev_addr(shader),
913 &cfg);
914 }
915 #else
916 if (shader->info.stage != MESA_SHADER_VERTEX) {
917 shader->spd = panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
918 if (!panvk_priv_mem_dev_addr(shader->spd))
919 return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
920
921 pan_cast_and_pack(panvk_priv_mem_host_addr(shader->spd), SHADER_PROGRAM,
922 cfg) {
923 cfg.stage = pan_shader_stage(&shader->info);
924
925 if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
926 cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
927 else if (cfg.stage == MALI_SHADER_STAGE_VERTEX)
928 cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
929
930 cfg.register_allocation =
931 pan_register_allocation(shader->info.work_reg_count);
932 cfg.binary = panvk_shader_get_dev_addr(shader);
933 cfg.preload.r48_r63 = (shader->info.preload >> 48);
934 cfg.flush_to_zero_mode = shader_ftz_mode(shader);
935
936 if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
937 cfg.requires_helper_threads = shader->info.contains_barrier;
938 }
939 } else {
940 shader->spds.pos_points =
941 panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
942 if (!panvk_priv_mem_dev_addr(shader->spds.pos_points))
943 return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
944
945 pan_cast_and_pack(panvk_priv_mem_host_addr(shader->spds.pos_points),
946 SHADER_PROGRAM, cfg) {
947 cfg.stage = pan_shader_stage(&shader->info);
948 cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
949 cfg.register_allocation =
950 pan_register_allocation(shader->info.work_reg_count);
951 cfg.binary = panvk_shader_get_dev_addr(shader);
952 cfg.preload.r48_r63 = (shader->info.preload >> 48);
953 cfg.flush_to_zero_mode = shader_ftz_mode(shader);
954 }
955
956 shader->spds.pos_triangles =
957 panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
958 if (!panvk_priv_mem_dev_addr(shader->spds.pos_triangles))
959 return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
960
961 pan_cast_and_pack(panvk_priv_mem_host_addr(shader->spds.pos_triangles),
962 SHADER_PROGRAM, cfg) {
963 cfg.stage = pan_shader_stage(&shader->info);
964 cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
965 cfg.register_allocation =
966 pan_register_allocation(shader->info.work_reg_count);
967 cfg.binary =
968 panvk_shader_get_dev_addr(shader) + shader->info.vs.no_psiz_offset;
969 cfg.preload.r48_r63 = (shader->info.preload >> 48);
970 cfg.flush_to_zero_mode = shader_ftz_mode(shader);
971 }
972
973 if (shader->info.vs.secondary_enable) {
974 shader->spds.var =
975 panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
976 if (!panvk_priv_mem_dev_addr(shader->spds.var))
977 return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
978
979 pan_cast_and_pack(panvk_priv_mem_host_addr(shader->spds.var),
980 SHADER_PROGRAM, cfg) {
981 unsigned work_count = shader->info.vs.secondary_work_reg_count;
982
983 cfg.stage = pan_shader_stage(&shader->info);
984 cfg.vertex_warp_limit = MALI_WARP_LIMIT_FULL;
985 cfg.register_allocation = pan_register_allocation(work_count);
986 cfg.binary = panvk_shader_get_dev_addr(shader) +
987 shader->info.vs.secondary_offset;
988 cfg.preload.r48_r63 = (shader->info.vs.secondary_preload >> 48);
989 cfg.flush_to_zero_mode = shader_ftz_mode(shader);
990 }
991 }
992 }
993 #endif
994
995 return VK_SUCCESS;
996 }
997
998 static void
999 panvk_shader_destroy(struct vk_device *vk_dev, struct vk_shader *vk_shader,
1000 const VkAllocationCallbacks *pAllocator)
1001 {
1002 struct panvk_device *dev = to_panvk_device(vk_dev);
1003 struct panvk_shader *shader =
1004 container_of(vk_shader, struct panvk_shader, vk);
1005
1006 free((void *)shader->asm_str);
1007 ralloc_free((void *)shader->nir_str);
1008
1009 panvk_pool_free_mem(&shader->code_mem);
1010
1011 #if PAN_ARCH <= 7
1012 panvk_pool_free_mem(&shader->rsd);
1013 panvk_pool_free_mem(&shader->desc_info.others.map);
1014 #else
1015 if (shader->info.stage != MESA_SHADER_VERTEX) {
1016 panvk_pool_free_mem(&shader->spd);
1017 } else {
1018 panvk_pool_free_mem(&shader->spds.var);
1019 panvk_pool_free_mem(&shader->spds.pos_points);
1020 panvk_pool_free_mem(&shader->spds.pos_triangles);
1021 }
1022 #endif
1023
1024 free((void *)shader->bin_ptr);
1025 vk_shader_free(&dev->vk, pAllocator, &shader->vk);
1026 }
1027
1028 static const struct vk_shader_ops panvk_shader_ops;
1029
1030 static VkResult
1031 panvk_compile_shader(struct panvk_device *dev,
1032 struct vk_shader_compile_info *info,
1033 const struct vk_graphics_pipeline_state *state,
1034 uint32_t *noperspective_varyings,
1035 const VkAllocationCallbacks *pAllocator,
1036 struct vk_shader **shader_out)
1037 {
1038 struct panvk_physical_device *phys_dev =
1039 to_panvk_physical_device(dev->vk.physical);
1040
1041 struct panvk_shader *shader;
1042 VkResult result;
1043
1044 /* We consume the NIR, regardless of success or failure */
1045 nir_shader *nir = info->nir;
1046
1047 shader = vk_shader_zalloc(&dev->vk, &panvk_shader_ops, info->stage,
1048 pAllocator, sizeof(*shader));
1049 if (shader == NULL)
1050 return panvk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
1051
1052 struct panfrost_compile_inputs inputs = {
1053 .gpu_id = phys_dev->kmod.props.gpu_prod_id,
1054 .no_ubo_to_push = true,
1055 .view_mask = (state && state->rp) ? state->rp->view_mask : 0,
1056 };
1057
1058 if (info->stage == MESA_SHADER_FRAGMENT && state != NULL &&
1059 state->ms != NULL && state->ms->sample_shading_enable)
1060 nir->info.fs.uses_sample_shading = true;
1061
1062 panvk_lower_nir(dev, nir, info->set_layout_count, info->set_layouts,
1063 info->robustness, noperspective_varyings, &inputs, shader);
1064
1065 #if PAN_ARCH >= 9
1066 if (info->stage == MESA_SHADER_FRAGMENT)
1067 /* Use LD_VAR_BUF[_IMM] for varyings if possible. */
1068 inputs.valhall.use_ld_var_buf = panvk_use_ld_var_buf(shader);
1069 #endif
1070
1071 result = panvk_compile_nir(dev, nir, info->flags, &inputs, shader);
1072
1073 /* We need to update info.push.count because it's used to initialize the
1074 * RSD in pan_shader_prepare_rsd(). */
1075 shader->info.push.count = shader->fau.total_count * 2;
1076
1077 if (result != VK_SUCCESS) {
1078 panvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
1079 return result;
1080 }
1081
1082 result = panvk_shader_upload(dev, shader, pAllocator);
1083
1084 if (result != VK_SUCCESS) {
1085 panvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
1086 return result;
1087 }
1088
1089 *shader_out = &shader->vk;
1090
1091 return result;
1092 }
1093
1094 static VkResult
1095 panvk_compile_shaders(struct vk_device *vk_dev, uint32_t shader_count,
1096 struct vk_shader_compile_info *infos,
1097 const struct vk_graphics_pipeline_state *state,
1098 const VkAllocationCallbacks *pAllocator,
1099 struct vk_shader **shaders_out)
1100 {
1101 struct panvk_device *dev = to_panvk_device(vk_dev);
1102 bool use_static_noperspective = false;
1103 uint32_t noperspective_varyings = 0;
1104 VkResult result;
1105 int32_t i;
1106
1107 /* Vulkan runtime passes us shaders in stage order, so the FS will always
1108 * be last if it exists. Iterate shaders in reverse order to ensure FS is
1109 * processed before VS. */
1110 for (i = shader_count - 1; i >= 0; i--) {
1111 uint32_t *noperspective_varyings_ptr =
1112 use_static_noperspective ? &noperspective_varyings : NULL;
1113 result = panvk_compile_shader(dev, &infos[i], state,
1114 noperspective_varyings_ptr,
1115 pAllocator,
1116 &shaders_out[i]);
1117
1118 if (result != VK_SUCCESS)
1119 goto err_cleanup;
1120
1121 /* If we are linking VS and FS, we can use the static interpolation
1122 * qualifiers from the FS in the VS. */
1123 if (infos[i].nir->info.stage == MESA_SHADER_FRAGMENT) {
1124 struct panvk_shader *shader =
1125 container_of(shaders_out[i], struct panvk_shader, vk);
1126
1127 use_static_noperspective = true;
1128 noperspective_varyings = shader->info.varyings.noperspective;
1129 }
1130
1131 /* Clean up NIR for the current shader */
1132 ralloc_free(infos[i].nir);
1133 }
1134
1135 /* TODO: If we get multiple shaders here, we can perform part of the link
1136 * logic at compile time. */
1137
1138 return VK_SUCCESS;
1139
1140 err_cleanup:
1141 /* Clean up all the shaders before this point */
1142 for (int32_t j = shader_count - 1; j > i; j--)
1143 panvk_shader_destroy(&dev->vk, shaders_out[j], pAllocator);
1144
1145 /* Clean up all the NIR from this point */
1146 for (int32_t j = i; j >= 0; j--)
1147 ralloc_free(infos[j].nir);
1148
1149 /* Memset the output array */
1150 memset(shaders_out, 0, shader_count * sizeof(*shaders_out));
1151
1152 return result;
1153 }
1154
1155 static VkResult
1156 shader_desc_info_deserialize(struct blob_reader *blob,
1157 struct panvk_shader *shader)
1158 {
1159 shader->desc_info.used_set_mask = blob_read_uint32(blob);
1160
1161 #if PAN_ARCH <= 7
1162 shader->desc_info.dyn_ubos.count = blob_read_uint32(blob);
1163 blob_copy_bytes(blob, shader->desc_info.dyn_ubos.map,
1164 shader->desc_info.dyn_ubos.count);
1165 shader->desc_info.dyn_ssbos.count = blob_read_uint32(blob);
1166 blob_copy_bytes(blob, shader->desc_info.dyn_ssbos.map,
1167 shader->desc_info.dyn_ssbos.count);
1168
1169 uint32_t others_count = 0;
1170 for (unsigned i = 0; i < ARRAY_SIZE(shader->desc_info.others.count); i++) {
1171 shader->desc_info.others.count[i] = blob_read_uint32(blob);
1172 others_count += shader->desc_info.others.count[i];
1173 }
1174
1175 if (others_count) {
1176 struct panvk_device *dev = to_panvk_device(shader->vk.base.device);
1177 struct panvk_pool_alloc_info alloc_info = {
1178 .size = others_count * sizeof(uint32_t),
1179 .alignment = sizeof(uint32_t),
1180 };
1181 shader->desc_info.others.map =
1182 panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
1183 uint32_t *copy_table =
1184 panvk_priv_mem_host_addr(shader->desc_info.others.map);
1185
1186 if (!copy_table)
1187 return panvk_error(shader, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1188
1189 blob_copy_bytes(blob, copy_table, others_count * sizeof(*copy_table));
1190 }
1191 #else
1192 shader->desc_info.dyn_bufs.count = blob_read_uint32(blob);
1193 blob_copy_bytes(blob, shader->desc_info.dyn_bufs.map,
1194 sizeof(*shader->desc_info.dyn_bufs.map) *
1195 shader->desc_info.dyn_bufs.count);
1196 #endif
1197
1198 return VK_SUCCESS;
1199 }
1200
1201 static VkResult
1202 panvk_deserialize_shader(struct vk_device *vk_dev, struct blob_reader *blob,
1203 uint32_t binary_version,
1204 const VkAllocationCallbacks *pAllocator,
1205 struct vk_shader **shader_out)
1206 {
1207 struct panvk_device *device = to_panvk_device(vk_dev);
1208 struct panvk_shader *shader;
1209 VkResult result;
1210
1211 struct pan_shader_info info;
1212 blob_copy_bytes(blob, &info, sizeof(info));
1213
1214 struct panvk_shader_fau_info fau;
1215 blob_copy_bytes(blob, &fau, sizeof(fau));
1216
1217 struct pan_compute_dim local_size;
1218 blob_copy_bytes(blob, &local_size, sizeof(local_size));
1219
1220 const uint32_t bin_size = blob_read_uint32(blob);
1221
1222 if (blob->overrun)
1223 return panvk_error(device, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
1224
1225 shader = vk_shader_zalloc(vk_dev, &panvk_shader_ops, info.stage, pAllocator,
1226 sizeof(*shader));
1227 if (shader == NULL)
1228 return panvk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1229
1230 shader->info = info;
1231 shader->fau = fau;
1232 shader->local_size = local_size;
1233 shader->bin_size = bin_size;
1234
1235 shader->bin_ptr = malloc(bin_size);
1236 if (shader->bin_ptr == NULL) {
1237 panvk_shader_destroy(vk_dev, &shader->vk, pAllocator);
1238 return panvk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1239 }
1240
1241 blob_copy_bytes(blob, (void *)shader->bin_ptr, shader->bin_size);
1242
1243 result = shader_desc_info_deserialize(blob, shader);
1244
1245 if (result != VK_SUCCESS) {
1246 panvk_shader_destroy(vk_dev, &shader->vk, pAllocator);
1247 return panvk_error(device, result);
1248 }
1249
1250 if (blob->overrun) {
1251 panvk_shader_destroy(vk_dev, &shader->vk, pAllocator);
1252 return panvk_error(device, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
1253 }
1254
1255 result = panvk_shader_upload(device, shader, pAllocator);
1256
1257 if (result != VK_SUCCESS) {
1258 panvk_shader_destroy(vk_dev, &shader->vk, pAllocator);
1259 return result;
1260 }
1261
1262 *shader_out = &shader->vk;
1263
1264 return result;
1265 }
1266
1267 static void
1268 shader_desc_info_serialize(struct blob *blob, const struct panvk_shader *shader)
1269 {
1270 blob_write_uint32(blob, shader->desc_info.used_set_mask);
1271
1272 #if PAN_ARCH <= 7
1273 blob_write_uint32(blob, shader->desc_info.dyn_ubos.count);
1274 blob_write_bytes(blob, shader->desc_info.dyn_ubos.map,
1275 sizeof(*shader->desc_info.dyn_ubos.map) *
1276 shader->desc_info.dyn_ubos.count);
1277 blob_write_uint32(blob, shader->desc_info.dyn_ssbos.count);
1278 blob_write_bytes(blob, shader->desc_info.dyn_ssbos.map,
1279 sizeof(*shader->desc_info.dyn_ssbos.map) *
1280 shader->desc_info.dyn_ssbos.count);
1281
1282 unsigned others_count = 0;
1283 for (unsigned i = 0; i < ARRAY_SIZE(shader->desc_info.others.count); i++) {
1284 blob_write_uint32(blob, shader->desc_info.others.count[i]);
1285 others_count += shader->desc_info.others.count[i];
1286 }
1287
1288 blob_write_bytes(blob,
1289 panvk_priv_mem_host_addr(shader->desc_info.others.map),
1290 sizeof(uint32_t) * others_count);
1291 #else
1292 blob_write_uint32(blob, shader->desc_info.dyn_bufs.count);
1293 blob_write_bytes(blob, shader->desc_info.dyn_bufs.map,
1294 sizeof(*shader->desc_info.dyn_bufs.map) *
1295 shader->desc_info.dyn_bufs.count);
1296 #endif
1297 }
1298
1299 static bool
1300 panvk_shader_serialize(struct vk_device *vk_dev,
1301 const struct vk_shader *vk_shader, struct blob *blob)
1302 {
1303 struct panvk_shader *shader =
1304 container_of(vk_shader, struct panvk_shader, vk);
1305
1306 /**
1307 * We can't currently cache assembly.
1308 * TODO: Implement serialization with assembly.
1309 **/
1310 if (shader->nir_str != NULL || shader->asm_str != NULL)
1311 return false;
1312
1313 blob_write_bytes(blob, &shader->info, sizeof(shader->info));
1314 blob_write_bytes(blob, &shader->fau, sizeof(shader->fau));
1315 blob_write_bytes(blob, &shader->local_size, sizeof(shader->local_size));
1316 blob_write_uint32(blob, shader->bin_size);
1317 blob_write_bytes(blob, shader->bin_ptr, shader->bin_size);
1318 shader_desc_info_serialize(blob, shader);
1319
1320 return !blob->out_of_memory;
1321 }
1322
1323 #define WRITE_STR(field, ...) \
1324 ({ \
1325 memset(field, 0, sizeof(field)); \
1326 UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__); \
1327 assert(i > 0 && i < sizeof(field)); \
1328 })
1329
1330 static VkResult
1331 panvk_shader_get_executable_properties(
1332 UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
1333 uint32_t *executable_count, VkPipelineExecutablePropertiesKHR *properties)
1334 {
1335 UNUSED struct panvk_shader *shader =
1336 container_of(vk_shader, struct panvk_shader, vk);
1337
1338 VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, properties,
1339 executable_count);
1340
1341 vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props)
1342 {
1343 props->stages = mesa_to_vk_shader_stage(shader->info.stage);
1344 props->subgroupSize = 8;
1345 WRITE_STR(props->name, "%s",
1346 _mesa_shader_stage_to_string(shader->info.stage));
1347 WRITE_STR(props->description, "%s shader",
1348 _mesa_shader_stage_to_string(shader->info.stage));
1349 }
1350
1351 return vk_outarray_status(&out);
1352 }
1353
1354 static VkResult
1355 panvk_shader_get_executable_statistics(
1356 UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
1357 uint32_t executable_index, uint32_t *statistic_count,
1358 VkPipelineExecutableStatisticKHR *statistics)
1359 {
1360 UNUSED struct panvk_shader *shader =
1361 container_of(vk_shader, struct panvk_shader, vk);
1362
1363 VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, statistics,
1364 statistic_count);
1365
1366 assert(executable_index == 0);
1367
1368 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat)
1369 {
1370 WRITE_STR(stat->name, "Code Size");
1371 WRITE_STR(stat->description,
1372 "Size of the compiled shader binary, in bytes");
1373 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
1374 stat->value.u64 = shader->bin_size;
1375 }
1376
1377 /* TODO: more executable statistics (VK_KHR_pipeline_executable_properties) */
1378
1379 return vk_outarray_status(&out);
1380 }
1381
1382 static bool
1383 write_ir_text(VkPipelineExecutableInternalRepresentationKHR *ir,
1384 const char *data)
1385 {
1386 ir->isText = VK_TRUE;
1387
1388 size_t data_len = strlen(data) + 1;
1389
1390 if (ir->pData == NULL) {
1391 ir->dataSize = data_len;
1392 return true;
1393 }
1394
1395 strncpy(ir->pData, data, ir->dataSize);
1396 if (ir->dataSize < data_len)
1397 return false;
1398
1399 ir->dataSize = data_len;
1400 return true;
1401 }
1402
1403 static VkResult
1404 panvk_shader_get_executable_internal_representations(
1405 UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
1406 uint32_t executable_index, uint32_t *internal_representation_count,
1407 VkPipelineExecutableInternalRepresentationKHR *internal_representations)
1408 {
1409 UNUSED struct panvk_shader *shader =
1410 container_of(vk_shader, struct panvk_shader, vk);
1411 VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
1412 internal_representations,
1413 internal_representation_count);
1414 bool incomplete_text = false;
1415
1416 if (shader->nir_str != NULL) {
1417 vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
1418 &out, ir)
1419 {
1420 WRITE_STR(ir->name, "NIR shader");
1421 WRITE_STR(ir->description,
1422 "NIR shader before sending to the back-end compiler");
1423 if (!write_ir_text(ir, shader->nir_str))
1424 incomplete_text = true;
1425 }
1426 }
1427
1428 if (shader->asm_str != NULL) {
1429 vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
1430 &out, ir)
1431 {
1432 WRITE_STR(ir->name, "Assembly");
1433 WRITE_STR(ir->description, "Final Assembly");
1434 if (!write_ir_text(ir, shader->asm_str))
1435 incomplete_text = true;
1436 }
1437 }
1438
1439 return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
1440 }
1441
1442 #if PAN_ARCH <= 7
1443 static mali_pixel_format
1444 get_varying_format(gl_shader_stage stage, gl_varying_slot loc,
1445 enum pipe_format pfmt)
1446 {
1447 switch (loc) {
1448 case VARYING_SLOT_PNTC:
1449 case VARYING_SLOT_PSIZ:
1450 #if PAN_ARCH <= 6
1451 return (MALI_R16F << 12) | panfrost_get_default_swizzle(1);
1452 #else
1453 return (MALI_R16F << 12) | MALI_RGB_COMPONENT_ORDER_R000;
1454 #endif
1455 case VARYING_SLOT_POS:
1456 #if PAN_ARCH <= 6
1457 return (MALI_SNAP_4 << 12) | panfrost_get_default_swizzle(4);
1458 #else
1459 return (MALI_SNAP_4 << 12) | MALI_RGB_COMPONENT_ORDER_RGBA;
1460 #endif
1461 default:
1462 assert(pfmt != PIPE_FORMAT_NONE);
1463 return GENX(panfrost_format_from_pipe_format)(pfmt)->hw;
1464 }
1465 }
1466
1467 struct varyings_info {
1468 enum pipe_format fmts[VARYING_SLOT_MAX];
1469 BITSET_DECLARE(active, VARYING_SLOT_MAX);
1470 };
1471
1472 static void
1473 collect_varyings_info(const struct pan_shader_varying *varyings,
1474 unsigned varying_count, struct varyings_info *info)
1475 {
1476 for (unsigned i = 0; i < varying_count; i++) {
1477 gl_varying_slot loc = varyings[i].location;
1478
1479 if (varyings[i].format == PIPE_FORMAT_NONE)
1480 continue;
1481
1482 info->fmts[loc] = varyings[i].format;
1483 BITSET_SET(info->active, loc);
1484 }
1485 }
1486
1487 static inline enum panvk_varying_buf_id
1488 varying_buf_id(gl_varying_slot loc)
1489 {
1490 switch (loc) {
1491 case VARYING_SLOT_POS:
1492 return PANVK_VARY_BUF_POSITION;
1493 case VARYING_SLOT_PSIZ:
1494 return PANVK_VARY_BUF_PSIZ;
1495 default:
1496 return PANVK_VARY_BUF_GENERAL;
1497 }
1498 }
1499
1500 static mali_pixel_format
1501 varying_format(gl_varying_slot loc, enum pipe_format pfmt)
1502 {
1503 switch (loc) {
1504 case VARYING_SLOT_PNTC:
1505 case VARYING_SLOT_PSIZ:
1506 #if PAN_ARCH <= 6
1507 return (MALI_R16F << 12) | panfrost_get_default_swizzle(1);
1508 #else
1509 return (MALI_R16F << 12) | MALI_RGB_COMPONENT_ORDER_R000;
1510 #endif
1511 case VARYING_SLOT_POS:
1512 #if PAN_ARCH <= 6
1513 return (MALI_SNAP_4 << 12) | panfrost_get_default_swizzle(4);
1514 #else
1515 return (MALI_SNAP_4 << 12) | MALI_RGB_COMPONENT_ORDER_RGBA;
1516 #endif
1517 default:
1518 return GENX(panfrost_format_from_pipe_format)(pfmt)->hw;
1519 }
1520 }
1521
static VkResult
emit_varying_attrs(struct panvk_pool *desc_pool,
                   const struct pan_shader_varying *varyings,
                   unsigned varying_count, const struct varyings_info *info,
                   unsigned *buf_offsets, struct panvk_priv_mem *mem)
{
   unsigned attr_count = BITSET_COUNT(info->active);

   *mem = panvk_pool_alloc_desc_array(desc_pool, attr_count, ATTRIBUTE);

   if (attr_count && !panvk_priv_mem_dev_addr(*mem))
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   struct mali_attribute_packed *attrs = panvk_priv_mem_host_addr(*mem);
   unsigned attr_idx = 0;

   for (unsigned i = 0; i < varying_count; i++) {
      pan_pack(&attrs[attr_idx++], ATTRIBUTE, cfg) {
         gl_varying_slot loc = varyings[i].location;
         enum pipe_format pfmt = varyings[i].format != PIPE_FORMAT_NONE
                                    ? info->fmts[loc]
                                    : PIPE_FORMAT_NONE;

         if (pfmt == PIPE_FORMAT_NONE) {
#if PAN_ARCH >= 7
            cfg.format = (MALI_CONSTANT << 12) | MALI_RGB_COMPONENT_ORDER_0000;
#else
            cfg.format = (MALI_CONSTANT << 12) | PAN_V6_SWIZZLE(0, 0, 0, 0);
#endif
         } else {
            cfg.buffer_index = varying_buf_id(loc);
            cfg.offset = buf_offsets[loc];
            cfg.format = varying_format(loc, info->fmts[loc]);
         }
         cfg.offset_enable = false;
      }
   }

   return VK_SUCCESS;
}

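/* Link the varyings of a vertex/fragment shader pair: agree on one format
 * per location, compute the per-buffer strides and per-varying offsets, and
 * emit the ATTRIBUTE descriptors consumed by each stage. The fragment shader
 * is optional.
 */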
VkResult
panvk_per_arch(link_shaders)(struct panvk_pool *desc_pool,
                             const struct panvk_shader *vs,
                             const struct panvk_shader *fs,
                             struct panvk_shader_link *link)
{
   BITSET_DECLARE(active_attrs, VARYING_SLOT_MAX) = {0};
   unsigned buf_strides[PANVK_VARY_BUF_MAX] = {0};
   unsigned buf_offsets[VARYING_SLOT_MAX] = {0};
   struct varyings_info out_vars = {0};
   struct varyings_info in_vars = {0};
   unsigned loc;

   assert(vs);
   assert(vs->info.stage == MESA_SHADER_VERTEX);

   collect_varyings_info(vs->info.varyings.output,
                         vs->info.varyings.output_count, &out_vars);

   if (fs) {
      assert(fs->info.stage == MESA_SHADER_FRAGMENT);
      collect_varyings_info(fs->info.varyings.input,
                            fs->info.varyings.input_count, &in_vars);
   }

   BITSET_OR(active_attrs, in_vars.active, out_vars.active);

   /* Handle the position and point size buffers explicitly, as they are
    * passed through separate buffer pointers to the tiler job.
    */
   if (BITSET_TEST(out_vars.active, VARYING_SLOT_POS)) {
      buf_strides[PANVK_VARY_BUF_POSITION] = sizeof(float) * 4;
      BITSET_CLEAR(active_attrs, VARYING_SLOT_POS);
   }

   if (BITSET_TEST(out_vars.active, VARYING_SLOT_PSIZ)) {
      buf_strides[PANVK_VARY_BUF_PSIZ] = sizeof(uint16_t);
      BITSET_CLEAR(active_attrs, VARYING_SLOT_PSIZ);
   }

   BITSET_FOREACH_SET(loc, active_attrs, VARYING_SLOT_MAX) {
      /* We expect the VS to write to all inputs read by the FS, and the
       * FS to read all inputs written by the VS. If that's not the
       * case, we keep PIPE_FORMAT_NONE to reflect the fact we should use a
       * sink attribute (writes are discarded, reads return zeros).
       */
      if (in_vars.fmts[loc] == PIPE_FORMAT_NONE ||
          out_vars.fmts[loc] == PIPE_FORMAT_NONE) {
         in_vars.fmts[loc] = PIPE_FORMAT_NONE;
         out_vars.fmts[loc] = PIPE_FORMAT_NONE;
         continue;
      }

      unsigned out_size = util_format_get_blocksize(out_vars.fmts[loc]);
      unsigned buf_idx = varying_buf_id(loc);

      /* Always trust the FS input format, so we can:
       * - discard components that are never read
       * - use float types for interpolated fragment shader inputs
       * - use fp16 for floats with mediump
       * - make sure components that are not written by the VS read as zero
       */
      out_vars.fmts[loc] = in_vars.fmts[loc];

      /* Special buffers are handled explicitly before this loop; everything
       * else should be laid out in the general varying buffer.
       */
      assert(buf_idx == PANVK_VARY_BUF_GENERAL);

      /* Keep things aligned to a 32-bit component. */
      buf_offsets[loc] = buf_strides[buf_idx];
      buf_strides[buf_idx] += ALIGN_POT(out_size, 4);
   }

   VkResult result = emit_varying_attrs(
      desc_pool, vs->info.varyings.output, vs->info.varyings.output_count,
      &out_vars, buf_offsets, &link->vs.attribs);
   if (result != VK_SUCCESS)
      return result;

   if (fs) {
      result = emit_varying_attrs(desc_pool, fs->info.varyings.input,
                                  fs->info.varyings.input_count, &in_vars,
                                  buf_offsets, &link->fs.attribs);
      if (result != VK_SUCCESS)
         return result;
   }

   memcpy(link->buf_strides, buf_strides, sizeof(link->buf_strides));
   return VK_SUCCESS;
}
#endif

static const struct vk_shader_ops panvk_shader_ops = {
   .destroy = panvk_shader_destroy,
   .serialize = panvk_shader_serialize,
   .get_executable_properties = panvk_shader_get_executable_properties,
   .get_executable_statistics = panvk_shader_get_executable_statistics,
   .get_executable_internal_representations =
      panvk_shader_get_executable_internal_representations,
};

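/* Bind a shader to the command buffer state for the given stage, and flag
 * the stage (and its push uniforms) dirty when the shader actually changed.
 */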
static void
panvk_cmd_bind_shader(struct panvk_cmd_buffer *cmd, const gl_shader_stage stage,
                      struct panvk_shader *shader)
{
   switch (stage) {
   case MESA_SHADER_COMPUTE:
      if (cmd->state.compute.shader != shader) {
         cmd->state.compute.shader = shader;
         compute_state_set_dirty(cmd, CS);
         compute_state_set_dirty(cmd, PUSH_UNIFORMS);
      }
      break;
   case MESA_SHADER_VERTEX:
      if (cmd->state.gfx.vs.shader != shader) {
         cmd->state.gfx.vs.shader = shader;
         gfx_state_set_dirty(cmd, VS);
         gfx_state_set_dirty(cmd, VS_PUSH_UNIFORMS);
      }
      break;
   case MESA_SHADER_FRAGMENT:
      if (cmd->state.gfx.fs.shader != shader) {
         cmd->state.gfx.fs.shader = shader;
         gfx_state_set_dirty(cmd, FS);
         gfx_state_set_dirty(cmd, FS_PUSH_UNIFORMS);
      }
      break;
   default:
      assert(!"Unsupported stage");
      break;
   }
}

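/* vk_device_shader_ops::cmd_bind_shaders implementation. */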
static void
panvk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count,
                       const gl_shader_stage *stages,
                       struct vk_shader **const shaders)
{
   struct panvk_cmd_buffer *cmd =
      container_of(vk_cmd, struct panvk_cmd_buffer, vk);

   for (uint32_t i = 0; i < stage_count; i++) {
      struct panvk_shader *shader =
         container_of(shaders[i], struct panvk_shader, vk);

      panvk_cmd_bind_shader(cmd, stages[i], shader);
   }
}

const struct vk_device_shader_ops panvk_per_arch(device_shader_ops) = {
   .get_nir_options = panvk_get_nir_options,
   .get_spirv_options = panvk_get_spirv_options,
   .preprocess_nir = panvk_preprocess_nir,
   .hash_graphics_state = panvk_hash_graphics_state,
   .compile = panvk_compile_shaders,
   .deserialize = panvk_deserialize_shader,
   .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
   .cmd_bind_shaders = panvk_cmd_bind_shaders,
};

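/* Destroy callback for internal shaders: release the code memory and the
 * per-arch RSD/SPD allocation, then free the vk_shader itself.
 */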
static void
panvk_internal_shader_destroy(struct vk_device *vk_dev,
                              struct vk_shader *vk_shader,
                              const VkAllocationCallbacks *pAllocator)
{
   struct panvk_device *dev = to_panvk_device(vk_dev);
   struct panvk_internal_shader *shader =
      container_of(vk_shader, struct panvk_internal_shader, vk);

   panvk_pool_free_mem(&shader->code_mem);

#if PAN_ARCH <= 7
   panvk_pool_free_mem(&shader->rsd);
#else
   panvk_pool_free_mem(&shader->spd);
#endif

   vk_shader_free(&dev->vk, pAllocator, &shader->vk);
}

static const struct vk_shader_ops panvk_internal_shader_ops = {
   .destroy = panvk_internal_shader_destroy,
};

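/* Compile an internal (non-API) shader from NIR and upload the resulting
 * binary to the device's executable memory pool.
 */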
VkResult
panvk_per_arch(create_internal_shader)(
   struct panvk_device *dev, nir_shader *nir,
   struct panfrost_compile_inputs *compiler_inputs,
   struct panvk_internal_shader **shader_out)
{
   struct panvk_internal_shader *shader =
      vk_shader_zalloc(&dev->vk, &panvk_internal_shader_ops, nir->info.stage,
                       NULL, sizeof(*shader));
   if (shader == NULL)
      return panvk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result;
   struct util_dynarray binary;

   util_dynarray_init(&binary, nir);
   GENX(pan_shader_compile)(nir, compiler_inputs, &binary, &shader->info);

   unsigned bin_size = util_dynarray_num_elements(&binary, uint8_t);
   if (bin_size) {
      shader->code_mem = panvk_pool_upload_aligned(&dev->mempools.exec,
                                                   binary.data, bin_size, 128);
      if (!panvk_priv_mem_dev_addr(shader->code_mem)) {
         result = panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
         goto err_free_shader;
      }
   }

   *shader_out = shader;
   return VK_SUCCESS;

err_free_shader:
   vk_shader_free(&dev->vk, NULL, &shader->vk);
   return result;
}