1 /*
2  * Copyright © 2021 Collabora Ltd.
3  *
4  * Derived from tu_shader.c which is:
5  * Copyright © 2019 Google LLC
6  *
7  * Also derived from anv_pipeline.c which is
8  * Copyright © 2015 Intel Corporation
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a
11  * copy of this software and associated documentation files (the "Software"),
12  * to deal in the Software without restriction, including without limitation
13  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
14  * and/or sell copies of the Software, and to permit persons to whom the
15  * Software is furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice (including the next
18  * paragraph) shall be included in all copies or substantial portions of the
19  * Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
24  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27  * DEALINGS IN THE SOFTWARE.
28  */
29 
30 #include "genxml/gen_macros.h"
31 
32 #include "panvk_cmd_buffer.h"
33 #include "panvk_device.h"
34 #include "panvk_instance.h"
35 #include "panvk_mempool.h"
36 #include "panvk_physical_device.h"
37 #include "panvk_shader.h"
38 
39 #include "spirv/nir_spirv.h"
40 #include "util/memstream.h"
41 #include "util/mesa-sha1.h"
42 #include "util/u_dynarray.h"
43 #include "nir_builder.h"
44 #include "nir_conversion_builder.h"
45 #include "nir_deref.h"
46 
47 #include "vk_graphics_state.h"
48 #include "vk_shader_module.h"
49 
50 #include "compiler/bifrost_nir.h"
51 #include "pan_shader.h"
52 
53 #include "vk_log.h"
54 #include "vk_pipeline.h"
55 #include "vk_pipeline_layout.h"
56 #include "vk_shader.h"
57 #include "vk_util.h"
58 
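/* Instruction-pass callback: replaces system-value intrinsics (workgroup
 * IDs, viewport scale/offset, first vertex, base instance, layer ID, ...)
 * with loads from the driver sysval FAU block via load_sysval(). Draw ID is
 * hardcoded to zero since only single-draw calls are supported. */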
59 static bool
60 panvk_lower_sysvals(nir_builder *b, nir_instr *instr, void *data)
61 {
62    if (instr->type != nir_instr_type_intrinsic)
63       return false;
64 
65    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
66    unsigned bit_size = intr->def.bit_size;
67    nir_def *val = NULL;
68    b->cursor = nir_before_instr(instr);
69 
70    switch (intr->intrinsic) {
71    case nir_intrinsic_load_base_workgroup_id:
72       val = load_sysval(b, compute, bit_size, base);
73       break;
74    case nir_intrinsic_load_num_workgroups:
75       val = load_sysval(b, compute, bit_size, num_work_groups);
76       break;
77    case nir_intrinsic_load_workgroup_size:
78       val = load_sysval(b, compute, bit_size, local_group_size);
79       break;
80    case nir_intrinsic_load_viewport_scale:
81       val = load_sysval(b, graphics, bit_size, viewport.scale);
82       break;
83    case nir_intrinsic_load_viewport_offset:
84       val = load_sysval(b, graphics, bit_size, viewport.offset);
85       break;
86    case nir_intrinsic_load_first_vertex:
87       val = load_sysval(b, graphics, bit_size, vs.first_vertex);
88       break;
89    case nir_intrinsic_load_base_instance:
90       val = load_sysval(b, graphics, bit_size, vs.base_instance);
91       break;
92    case nir_intrinsic_load_noperspective_varyings_pan:
93       /* TODO: use a VS epilog specialized on constant noperspective_varyings
94        * with VK_EXT_graphics_pipeline_library and VK_EXT_shader_object */
95       assert(b->shader->info.stage == MESA_SHADER_VERTEX);
96       val = load_sysval(b, graphics, bit_size, vs.noperspective_varyings);
97       break;
98 
99 #if PAN_ARCH <= 7
100    case nir_intrinsic_load_raw_vertex_offset_pan:
101       val = load_sysval(b, graphics, bit_size, vs.raw_vertex_offset);
102       break;
103    case nir_intrinsic_load_layer_id:
104       assert(b->shader->info.stage == MESA_SHADER_FRAGMENT);
105       val = load_sysval(b, graphics, bit_size, layer_id);
106       break;
107 #endif
108 
109    case nir_intrinsic_load_draw_id:
110       /* TODO: We only implement single-draw direct and indirect draws, so this
111        * is sufficient. We'll revisit this when we get around to implementing
112        * multidraw. */
113       assert(b->shader->info.stage == MESA_SHADER_VERTEX);
114       val = nir_imm_int(b, 0);
115       break;
116 
117    default:
118       return false;
119    }
120 
121    assert(val->num_components == intr->def.num_components);
122 
123    b->cursor = nir_after_instr(instr);
124    nir_def_rewrite_uses(&intr->def, val);
125    return true;
126 }
127 
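/* Turns load_input in vertex shaders into load_attribute_pan, feeding it the
 * vertex/instance IDs the attribute descriptors expect on each architecture
 * (raw vertex ID on v7 and earlier, base-instance-adjusted instance ID on
 * v9+). */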
128 static bool
129 panvk_lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin,
130                            UNUSED void *data)
131 {
132    if (intrin->intrinsic != nir_intrinsic_load_input)
133       return false;
134 
135    b->cursor = nir_before_instr(&intrin->instr);
136    nir_def *ld_attr = nir_load_attribute_pan(
137       b, intrin->def.num_components, intrin->def.bit_size,
138       PAN_ARCH <= 7 ?
139          nir_load_raw_vertex_id_pan(b) :
140          nir_load_vertex_id(b),
141       PAN_ARCH >= 9 ?
142          nir_iadd(b, nir_load_instance_id(b), nir_load_base_instance(b)) :
143          nir_load_instance_id(b),
144       nir_get_io_offset_src(intrin)->ssa,
145       .base = nir_intrinsic_base(intrin),
146       .component = nir_intrinsic_component(intrin),
147       .dest_type = nir_intrinsic_dest_type(intrin));
148    nir_def_replace(&intrin->def, ld_attr);
149 
150    return true;
151 }
152 
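/* On v7 and earlier there is no direct way to write gl_Layer from the vertex
 * shader: positions are instead moved to infinity whenever the written layer
 * doesn't match the layer_id sysval of the current draw, so mismatching
 * primitives get culled. */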
153 #if PAN_ARCH <= 7
154 static bool
155 lower_gl_pos_layer_writes(nir_builder *b, nir_instr *instr, void *data)
156 {
157    if (instr->type != nir_instr_type_intrinsic)
158       return false;
159 
160    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
161 
162    if (intr->intrinsic != nir_intrinsic_copy_deref)
163       return false;
164 
165    nir_variable *dst_var = nir_intrinsic_get_var(intr, 0);
166    nir_variable *src_var = nir_intrinsic_get_var(intr, 1);
167 
168    if (!dst_var || dst_var->data.mode != nir_var_shader_out || !src_var ||
169        src_var->data.mode != nir_var_shader_temp)
170       return false;
171 
172    if (dst_var->data.location == VARYING_SLOT_LAYER) {
173       /* We don't really write the layer; we just make sure primitives are
174        * discarded if gl_Layer doesn't match the layer passed to the draw.
175        */
176       b->cursor = nir_instr_remove(instr);
177       return true;
178    }
179 
180    if (dst_var->data.location == VARYING_SLOT_POS) {
181       nir_variable *temp_layer_var = data;
182       nir_variable *temp_pos_var = src_var;
183 
184       b->cursor = nir_before_instr(instr);
185       nir_def *layer = nir_load_var(b, temp_layer_var);
186       nir_def *pos = nir_load_var(b, temp_pos_var);
187       nir_def *inf_pos = nir_imm_vec4(b, INFINITY, INFINITY, INFINITY, 1.0f);
188       nir_def *ref_layer = load_sysval(b, graphics, 32, layer_id);
189 
190       nir_store_var(b, temp_pos_var,
191                     nir_bcsel(b, nir_ieq(b, layer, ref_layer), pos, inf_pos),
192                     0xf);
193       return true;
194    }
195 
196    return false;
197 }
198 
199 static bool
200 lower_layer_writes(nir_shader *nir)
201 {
202    if (nir->info.stage == MESA_SHADER_FRAGMENT)
203       return false;
204 
205    nir_variable *temp_layer_var = NULL;
206    bool has_layer_var = false;
207 
208    nir_foreach_variable_with_modes(var, nir,
209                                    nir_var_shader_out | nir_var_shader_temp) {
210       if (var->data.mode == nir_var_shader_out &&
211           var->data.location == VARYING_SLOT_LAYER)
212          has_layer_var = true;
213 
214       if (var->data.mode == nir_var_shader_temp &&
215           var->data.location == VARYING_SLOT_LAYER)
216          temp_layer_var = var;
217    }
218 
219    if (!has_layer_var)
220       return false;
221 
222    assert(temp_layer_var);
223 
224    return nir_shader_instructions_pass(nir, lower_gl_pos_layer_writes,
225                                        nir_metadata_control_flow,
226                                        temp_layer_var);
227 }
228 #endif
229 
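/* glsl_type_size callback used for shared-memory layout: booleans take four
 * bytes per component, and vec3s are aligned like vec4s. */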
230 static void
231 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
232 {
233    assert(glsl_type_is_vector_or_scalar(type));
234 
235    uint32_t comp_size =
236       glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
237    unsigned length = glsl_get_vector_elements(type);
238    *size = comp_size * length, *align = comp_size * (length == 3 ? 4 : length);
239 }
240 
241 static inline nir_address_format
242 panvk_buffer_ubo_addr_format(VkPipelineRobustnessBufferBehaviorEXT robustness)
243 {
244    switch (robustness) {
245    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
246    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
247    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
248       return PAN_ARCH <= 7 ? nir_address_format_32bit_index_offset
249                            : nir_address_format_vec2_index_32bit_offset;
250    default:
251       unreachable("Invalid robust buffer access behavior");
252    }
253 }
254 
255 static inline nir_address_format
256 panvk_buffer_ssbo_addr_format(VkPipelineRobustnessBufferBehaviorEXT robustness)
257 {
258    switch (robustness) {
259    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
260       return PAN_ARCH <= 7 ? nir_address_format_64bit_global_32bit_offset
261                            : nir_address_format_vec2_index_32bit_offset;
262    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
263    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
264       return PAN_ARCH <= 7 ? nir_address_format_64bit_bounded_global
265                            : nir_address_format_vec2_index_32bit_offset;
266    default:
267       unreachable("Invalid robust buffer access behavior");
268    }
269 }
270 
271 static const nir_shader_compiler_options *
272 panvk_get_nir_options(UNUSED struct vk_physical_device *vk_pdev,
273                       UNUSED gl_shader_stage stage,
274                       UNUSED const struct vk_pipeline_robustness_state *rs)
275 {
276    return GENX(pan_shader_get_compiler_options)();
277 }
278 
279 static struct spirv_to_nir_options
280 panvk_get_spirv_options(UNUSED struct vk_physical_device *vk_pdev,
281                         UNUSED gl_shader_stage stage,
282                         const struct vk_pipeline_robustness_state *rs)
283 {
284    return (struct spirv_to_nir_options){
285       .ubo_addr_format = panvk_buffer_ubo_addr_format(rs->uniform_buffers),
286       .ssbo_addr_format = panvk_buffer_ssbo_addr_format(rs->storage_buffers),
287       .phys_ssbo_addr_format = nir_address_format_64bit_global,
288    };
289 }
290 
291 static void
292 panvk_preprocess_nir(UNUSED struct vk_physical_device *vk_pdev, nir_shader *nir)
293 {
294    /* Group output variables that share a location into a single vector */
295    if (nir->info.stage == MESA_SHADER_FRAGMENT)
296       NIR_PASS(_, nir, nir_lower_io_to_vector, nir_var_shader_out);
297 
298    NIR_PASS(_, nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir),
299             true, true);
300 
301 #if PAN_ARCH <= 7
302    /* This needs to be done just after the io_to_temporaries pass, because we
303     * rely on in/out temporaries to collect the final layer_id value. */
304    NIR_PASS(_, nir, lower_layer_writes);
305 #endif
306 
307    NIR_PASS(_, nir, nir_lower_indirect_derefs,
308             nir_var_shader_in | nir_var_shader_out, UINT32_MAX);
309 
310    NIR_PASS(_, nir, nir_opt_copy_prop_vars);
311    NIR_PASS(_, nir, nir_opt_combine_stores, nir_var_all);
312    NIR_PASS(_, nir, nir_opt_loop);
313 
314    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
315       struct nir_input_attachment_options lower_input_attach_opts = {
316          .use_fragcoord_sysval = true,
317          .use_layer_id_sysval = true,
318       };
319 
320       NIR_PASS(_, nir, nir_lower_input_attachments, &lower_input_attach_opts);
321    }
322 
323    /* Do texture lowering here.  Yes, it's a duplication of the texture
324     * lowering in bifrost_compile.  However, we need to lower texture stuff
325     * now, before we call panvk_per_arch(nir_lower_descriptors)() because some
326     * of the texture lowering generates nir_texop_txs which we handle as part
327     * of descriptor lowering.
328     *
329     * TODO: We really should be doing this in common code, not duplicated in
330     * panvk.  In order to do that, we need to rework the panfrost compile
331     * flow to look more like the Intel flow:
332     *
333     *  1. Compile SPIR-V to NIR and maybe do a tiny bit of lowering that needs
334     *     to be done really early.
335     *
336     *  2. pan_preprocess_nir: Does common lowering and runs the optimization
337     *     loop.  Nothing here should be API-specific.
338     *
339     *  3. Do additional lowering in panvk
340     *
341     *  4. pan_postprocess_nir: Does final lowering and runs the optimization
342     *     loop again.  This can happen as part of the final compile.
343     *
344     * This would give us a better place to do panvk-specific lowering.
345     */
346    nir_lower_tex_options lower_tex_options = {
347       .lower_txs_lod = true,
348       .lower_txp = ~0,
349       .lower_tg4_broadcom_swizzle = true,
350       .lower_txd_cube_map = true,
351       .lower_invalid_implicit_lod = true,
352    };
353    NIR_PASS(_, nir, nir_lower_tex, &lower_tex_options);
354    NIR_PASS(_, nir, nir_lower_system_values);
355 
356    nir_lower_compute_system_values_options options = {
357       .has_base_workgroup_id = true,
358    };
359 
360    NIR_PASS(_, nir, nir_lower_compute_system_values, &options);
361 
362    if (nir->info.stage == MESA_SHADER_FRAGMENT)
363       NIR_PASS(_, nir, nir_lower_wpos_center);
364 
365    NIR_PASS(_, nir, nir_split_var_copies);
366    NIR_PASS(_, nir, nir_lower_var_copies);
367 }
368 
369 static void
370 panvk_hash_graphics_state(struct vk_physical_device *device,
371                           const struct vk_graphics_pipeline_state *state,
372                           VkShaderStageFlags stages, blake3_hash blake3_out)
373 {
374    struct mesa_blake3 blake3_ctx;
375    _mesa_blake3_init(&blake3_ctx);
376 
377    /* This doesn't impact the shader compile, but it does go in the
378     * panvk_shader and gets [de]serialized along with the binary, so
379     * we need to hash it.
380     */
381    bool sample_shading_enable = state->ms && state->ms->sample_shading_enable;
382    _mesa_blake3_update(&blake3_ctx, &sample_shading_enable,
383                        sizeof(sample_shading_enable));
384 
385    _mesa_blake3_update(&blake3_ctx, &state->rp->view_mask,
386                        sizeof(state->rp->view_mask));
387 
388    _mesa_blake3_final(&blake3_ctx, blake3_out);
389 }
390 
391 #if PAN_ARCH >= 9
392 static bool
393 valhall_pack_buf_idx(nir_builder *b, nir_instr *instr, UNUSED void *data)
394 {
395    if (instr->type != nir_instr_type_intrinsic)
396       return false;
397 
398    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
399    unsigned index_src;
400 
401    switch (intrin->intrinsic) {
402    case nir_intrinsic_load_ubo:
403    case nir_intrinsic_load_ssbo:
404    case nir_intrinsic_ssbo_atomic:
405    case nir_intrinsic_ssbo_atomic_swap:
406       index_src = 0;
407       break;
408 
409    case nir_intrinsic_store_ssbo:
410       index_src = 1;
411       break;
412 
413    default:
414       return false;
415    }
416 
417    nir_def *index = intrin->src[index_src].ssa;
418 
419    /* The descriptor lowering pass can add UBO loads, and those already have the
420     * right index format. */
421    if (index->num_components == 1)
422       return false;
423 
424    b->cursor = nir_before_instr(&intrin->instr);
425 
426    /* The valhall backend expects nir_address_format_32bit_index_offset,
427     * but the address mode is nir_address_format_vec2_index_32bit_offset to allow
428     * us to store the array size, set and index without losing information
429     * while walking the descriptor deref chain (needed to do a bounds check on
430     * the array index when we reach the end of the chain).
431     * Turn it back to nir_address_format_32bit_index_offset after IOs
432     * have been lowered. */
433    nir_def *packed_index =
434       nir_iadd(b, nir_channel(b, index, 0), nir_channel(b, index, 1));
435    nir_src_rewrite(&intrin->src[index_src], packed_index);
436    return true;
437 }
438 #endif
439 
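/* Lowers get_ssbo_size on Valhall by reading the size field back from the
 * SSBO descriptor itself (a UBO load from the resource table, at offset 4
 * inside the descriptor). */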
440 static bool
441 valhall_lower_get_ssbo_size(struct nir_builder *b,
442                             nir_intrinsic_instr *intr, void *data)
443 {
444    if (intr->intrinsic != nir_intrinsic_get_ssbo_size)
445       return false;
446 
447    b->cursor = nir_before_instr(&intr->instr);
448 
449    nir_def *table_idx =
450       nir_ushr_imm(b, nir_channel(b, intr->src[0].ssa, 0), 24);
451    nir_def *res_table = nir_ior_imm(b, table_idx, pan_res_handle(62, 0));
452    nir_def *buf_idx = nir_channel(b, intr->src[0].ssa, 1);
453    nir_def *desc_offset = nir_imul_imm(b, buf_idx, PANVK_DESCRIPTOR_SIZE);
454    nir_def *size = nir_load_ubo(
455       b, 1, 32, res_table, nir_iadd_imm(b, desc_offset, 4), .range = ~0u,
456       .align_mul = PANVK_DESCRIPTOR_SIZE, .align_offset = 4);
457 
458    nir_def_replace(&intr->def, size);
459    return true;
460 }
461 
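/* First push-constant pass: record which sysval and user push-constant ranges
 * are actually read, so only those get packed into FAUs. */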
462 static bool
463 collect_push_constant(struct nir_builder *b, nir_intrinsic_instr *intr,
464                       void *data)
465 {
466    if (intr->intrinsic != nir_intrinsic_load_push_constant)
467       return false;
468 
469    struct panvk_shader *shader = data;
470    uint32_t base = nir_intrinsic_base(intr);
471    bool is_sysval = base >= SYSVALS_PUSH_CONST_BASE;
472    uint32_t offset, size;
473 
474    /* Sysvals should have a constant offset. */
475    assert(!is_sysval || nir_src_is_const(intr->src[0]));
476 
477    if (is_sysval)
478       base -= SYSVALS_PUSH_CONST_BASE;
479 
480    /* If the offset is dynamic, we need to flag [base:base+range] as used, to
481     * allow global mem access. */
482    if (!nir_src_is_const(intr->src[0])) {
483       offset = base;
484       size = nir_intrinsic_range(intr);
485 
486       /* Flag the push_consts sysval as needed if we have an indirect offset. */
487       if (b->shader->info.stage == MESA_SHADER_COMPUTE)
488          shader_use_sysval(shader, compute, push_consts);
489       else
490          shader_use_sysval(shader, graphics, push_consts);
491    } else {
492       offset = base + nir_src_as_uint(intr->src[0]);
493       size = (intr->def.bit_size / 8) * intr->def.num_components;
494    }
495 
496    if (is_sysval)
497       shader_use_sysval_range(shader, offset, size);
498    else
499       shader_use_push_const_range(shader, offset, size);
500 
501    return true;
502 }
503 
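/* Second push-constant pass: rewrite constant-offset loads to their packed
 * FAU location, and turn dynamically-indexed loads into global loads from the
 * push-constant buffer pointed to by the push_consts sysval. */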
504 static bool
505 move_push_constant(struct nir_builder *b, nir_intrinsic_instr *intr, void *data)
506 {
507    if (intr->intrinsic != nir_intrinsic_load_push_constant)
508       return false;
509 
510    struct panvk_shader *shader = data;
511    unsigned base = nir_intrinsic_base(intr);
512    bool is_sysval = base >= SYSVALS_PUSH_CONST_BASE;
513 
514    if (is_sysval)
515       base -= SYSVALS_PUSH_CONST_BASE;
516 
517    /* Sysvals should have a constant offset. */
518    assert(!is_sysval || nir_src_is_const(intr->src[0]));
519 
520    b->cursor = nir_before_instr(&intr->instr);
521 
522    if (nir_src_is_const(intr->src[0])) {
523       unsigned offset = base + nir_src_as_uint(intr->src[0]);
524 
525       /* We place the sysvals first, and then come the user push constants.
526        * We do that so we always have the blend constants at offset 0 for
527        * blend shaders. */
528       if (is_sysval)
529          offset = shader_remapped_sysval_offset(shader, offset);
530       else
531          offset = shader_remapped_push_const_offset(shader, offset);
532 
533       nir_src_rewrite(&intr->src[0], nir_imm_int(b, offset));
534 
535       /* We always set the range/base to zero, to make sure no pass is using it
536        * after that point. */
537       nir_intrinsic_set_base(intr, 0);
538       nir_intrinsic_set_range(intr, 0);
539    } else {
540       /* We don't use load_sysval() on purpose, because it would set
541        * .base=SYSVALS_PUSH_CONST_BASE, and we're supposed to force a base of
542        * zero in this pass. */
543       unsigned push_const_buf_offset = shader_remapped_sysval_offset(
544          shader, b->shader->info.stage == MESA_SHADER_COMPUTE
545                     ? sysval_offset(compute, push_consts)
546                     : sysval_offset(graphics, push_consts));
547       nir_def *push_const_buf = nir_load_push_constant(
548          b, 1, 64, nir_imm_int(b, push_const_buf_offset));
549       unsigned push_const_offset =
550          shader_remapped_fau_offset(shader, push_consts, base);
551       nir_def *offset = nir_iadd_imm(b, intr->src[0].ssa, push_const_offset);
552       unsigned align = nir_combined_align(nir_intrinsic_align_mul(intr),
553                                           nir_intrinsic_align_offset(intr));
554 
555       /* We assume a maximum alignment of 64 bits for packed push constants. */
556       align = MIN2(align, FAU_WORD_SIZE);
557       nir_def *value =
558          nir_load_global(b, nir_iadd(b, push_const_buf, nir_u2u64(b, offset)),
559                          align, intr->def.num_components, intr->def.bit_size);
560 
561       nir_def_replace(&intr->def, value);
562    }
563 
564    return true;
565 }
566 
567 static void
568 lower_load_push_consts(nir_shader *nir, struct panvk_shader *shader)
569 {
570    /* Before we lower load_push_constant()s with a dynamic offset to global
571     * loads, we want to run a few optimization passes to get rid of offset
572     * calculations involving only constant values. */
573    bool progress = false;
574    do {
575       progress = false;
576       NIR_PASS(progress, nir, nir_copy_prop);
577       NIR_PASS(progress, nir, nir_opt_remove_phis);
578       NIR_PASS(progress, nir, nir_opt_dce);
579       NIR_PASS(progress, nir, nir_opt_dead_cf);
580       NIR_PASS(progress, nir, nir_opt_cse);
581       NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
582       NIR_PASS(progress, nir, nir_opt_algebraic);
583       NIR_PASS(progress, nir, nir_opt_constant_folding);
584    } while (progress);
585 
586    /* We always reserve the 4 blend constant words for fragment shaders,
587     * because we don't know the blend configuration at this point, and
588     * we might end up with a blend shader reading those blend constants. */
589    if (shader->vk.stage == MESA_SHADER_FRAGMENT) {
590       /* We rely on blend constants being placed first and covering 4 words. */
591       STATIC_ASSERT(
592          offsetof(struct panvk_graphics_sysvals, blend.constants) == 0 &&
593          sizeof(((struct panvk_graphics_sysvals *)NULL)->blend.constants) ==
594             16);
595 
596       shader_use_sysval(shader, graphics, blend.constants);
597    }
598 
599    progress = false;
600    NIR_PASS(progress, nir, nir_shader_intrinsics_pass, collect_push_constant,
601             nir_metadata_all, shader);
602 
603    /* Some load_push_constant instructions might be eliminated after
604     * scalarization+dead-code-elimination. Since those passes happen in
605     * bifrost_compile(), we can't run the push_constant packing after the
606     * optimizations take place, so let's just have our own FAU count instead
607     * of using info.push.count, to make it consistent with the
608     * used_{sysvals,push_consts} bitmaps, even if it sometimes implies loading
609     * more than we really need. Doing that also takes into account the fact
610     * that blend constants are never loaded from the fragment shader, but
611     * might be needed in the blend shader. */
612    shader->fau.sysval_count = BITSET_COUNT(shader->fau.used_sysvals);
613    shader->fau.total_count =
614       shader->fau.sysval_count + BITSET_COUNT(shader->fau.used_push_consts);
615 
616    if (!progress)
617       return;
618 
619    NIR_PASS(_, nir, nir_shader_intrinsics_pass, move_push_constant,
620             nir_metadata_control_flow, shader);
621 }
622 
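/* Runs all panvk-specific NIR lowering: multiview, descriptor sets, explicit
 * I/O, shared memory, attribute/varying locations, sysvals and push
 * constants, in the order the backend expects. */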
623 static void
624 panvk_lower_nir(struct panvk_device *dev, nir_shader *nir,
625                 uint32_t set_layout_count,
626                 struct vk_descriptor_set_layout *const *set_layouts,
627                 const struct vk_pipeline_robustness_state *rs,
628                 uint32_t *noperspective_varyings,
629                 const struct panfrost_compile_inputs *compile_input,
630                 struct panvk_shader *shader)
631 {
632    struct panvk_instance *instance =
633       to_panvk_instance(dev->vk.physical->instance);
634    gl_shader_stage stage = nir->info.stage;
635 
636 #if PAN_ARCH >= 10
637    if (stage == MESA_SHADER_VERTEX && compile_input->view_mask) {
638       nir_lower_multiview_options options = {
639          .view_mask = compile_input->view_mask,
640          .allowed_per_view_outputs = ~0
641       };
642       /* The only case where this should fail is with memory/image writes,
643        * which we don't support in vertex shaders */
644       assert(nir_can_lower_multiview(nir, options));
645       NIR_PASS(_, nir, nir_lower_multiview, options);
646       /* Pull output writes out of the loop and give them constant offsets for
647        * pan_lower_store_components */
648       NIR_PASS(_, nir, nir_lower_io_to_temporaries,
649                nir_shader_get_entrypoint(nir), true, false);
650    }
651 #endif
652 
653    /* Lower input intrinsics for fragment shaders early to get the max
654     * number of varying loads, as this number is required during descriptor
655     * lowering for v9+. */
656    if (stage == MESA_SHADER_FRAGMENT) {
657       nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs,
658                                   stage);
659 #if PAN_ARCH >= 9
660       shader->desc_info.max_varying_loads = nir->num_inputs;
661 #endif
662    }
663 
664    panvk_per_arch(nir_lower_descriptors)(nir, dev, rs, set_layout_count,
665                                          set_layouts, shader);
666 
667    NIR_PASS(_, nir, nir_split_var_copies);
668    NIR_PASS(_, nir, nir_lower_var_copies);
669 
670    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
671             panvk_buffer_ubo_addr_format(rs->uniform_buffers));
672    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo,
673             panvk_buffer_ssbo_addr_format(rs->storage_buffers));
674    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const,
675             nir_address_format_32bit_offset);
676    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global,
677             nir_address_format_64bit_global);
678 
679 #if PAN_ARCH >= 9
680    NIR_PASS(_, nir, nir_shader_intrinsics_pass, valhall_lower_get_ssbo_size,
681             nir_metadata_control_flow, NULL);
682    NIR_PASS(_, nir, nir_shader_instructions_pass, valhall_pack_buf_idx,
683             nir_metadata_control_flow, NULL);
684 #endif
685 
686    if (gl_shader_stage_uses_workgroup(stage)) {
687       if (!nir->info.shared_memory_explicit_layout) {
688          NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared,
689                   shared_type_info);
690       }
691 
692       NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared,
693                nir_address_format_32bit_offset);
694    }
695 
696    if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
697       /* Align everything up to 16 bytes to take advantage of load store
698        * vectorization. */
699       nir->info.shared_size = align(nir->info.shared_size, 16);
700       NIR_PASS(_, nir, nir_zero_initialize_shared_memory, nir->info.shared_size,
701                16);
702 
703       /* We need to call lower_compute_system_values again because
704        * nir_zero_initialize_shared_memory generates load_invocation_id which
705        * has to be lowered to load_invocation_index.
706        */
707       NIR_PASS(_, nir, nir_lower_compute_system_values, NULL);
708    }
709 
710    if (stage == MESA_SHADER_VERTEX) {
711       /* We need the driver_location to match the vertex attribute location,
712        * so we can use the attribute layout described by
713        * vk_vertex_input_state where there are holes in the attribute locations.
714        */
715       nir_foreach_shader_in_variable(var, nir) {
716          assert(var->data.location >= VERT_ATTRIB_GENERIC0 &&
717                 var->data.location <= VERT_ATTRIB_GENERIC15);
718          var->data.driver_location = var->data.location - VERT_ATTRIB_GENERIC0;
719       }
720    } else if (stage != MESA_SHADER_FRAGMENT) {
721       /* Input varyings in fragment shader have been lowered early. */
722       nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs,
723                                   stage);
724    }
725 
726    nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs,
727                                stage);
728 
729    /* Needed to turn shader_temp into function_temp since the backend only
730     * handles the latter for now.
731     */
732    NIR_PASS(_, nir, nir_lower_global_vars_to_local);
733 
734    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
735    if (unlikely(instance->debug_flags & PANVK_DEBUG_NIR)) {
736       fprintf(stderr, "translated nir:\n");
737       nir_print_shader(nir, stderr);
738    }
739 
740    pan_shader_preprocess(nir, compile_input->gpu_id);
741 
742    if (stage == MESA_SHADER_VERTEX)
743       NIR_PASS(_, nir, nir_shader_intrinsics_pass, panvk_lower_load_vs_input,
744                nir_metadata_control_flow, NULL);
745 
746    /* Since Valhall, panvk_per_arch(nir_lower_descriptors) separates the
747     * driver set from the user sets, and does not need pan_lower_image_index.
748     */
749    if (PAN_ARCH < 9 && stage == MESA_SHADER_VERTEX)
750       NIR_PASS(_, nir, pan_lower_image_index, MAX_VS_ATTRIBS);
751 
752    if (noperspective_varyings && stage == MESA_SHADER_VERTEX)
753       NIR_PASS(_, nir, pan_nir_lower_static_noperspective,
754                *noperspective_varyings);
755 
756    NIR_PASS(_, nir, nir_shader_instructions_pass, panvk_lower_sysvals,
757             nir_metadata_control_flow, NULL);
758 
759    lower_load_push_consts(nir, shader);
760 }
761 
762 static VkResult
763 panvk_compile_nir(struct panvk_device *dev, nir_shader *nir,
764                   VkShaderCreateFlagsEXT shader_flags,
765                   struct panfrost_compile_inputs *compile_input,
766                   struct panvk_shader *shader)
767 {
768    const bool dump_asm =
769       shader_flags & VK_SHADER_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_MESA;
770 
771    struct util_dynarray binary;
772    util_dynarray_init(&binary, NULL);
773    GENX(pan_shader_compile)(nir, compile_input, &binary, &shader->info);
774 
775    void *bin_ptr = util_dynarray_element(&binary, uint8_t, 0);
776    unsigned bin_size = util_dynarray_num_elements(&binary, uint8_t);
777 
778    shader->bin_size = 0;
779    shader->bin_ptr = NULL;
780 
781    if (bin_size) {
782       void *data = malloc(bin_size);
783 
784       if (data == NULL)
785          return panvk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
786 
787       memcpy(data, bin_ptr, bin_size);
788       shader->bin_size = bin_size;
789       shader->bin_ptr = data;
790    }
791    util_dynarray_fini(&binary);
792 
793    if (dump_asm) {
794       shader->nir_str = nir_shader_as_str(nir, NULL);
795 
796       char *data = NULL;
797       size_t disasm_size = 0;
798 
799       if (shader->bin_size) {
800          struct u_memstream mem;
801          if (u_memstream_open(&mem, &data, &disasm_size)) {
802             FILE *const stream = u_memstream_get(&mem);
803             pan_shader_disassemble(stream, shader->bin_ptr, shader->bin_size,
804                                    compile_input->gpu_id, false);
805             u_memstream_close(&mem);
806          }
807       }
808 
809       char *asm_str = malloc(disasm_size + 1);
810       memcpy(asm_str, data, disasm_size);
811       asm_str[disasm_size] = '\0';
812       free(data);
813 
814       shader->asm_str = asm_str;
815    }
816 
817 #if PAN_ARCH <= 7
818    /* Patch the descriptor counts */
819    shader->info.ubo_count =
820       shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_UBO] +
821       shader->desc_info.dyn_ubos.count;
822    shader->info.texture_count =
823       shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_TEXTURE];
824    shader->info.sampler_count =
825       shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_SAMPLER];
826 
827    /* Dummy sampler. */
828    if (!shader->info.sampler_count && shader->info.texture_count)
829       shader->info.sampler_count++;
830 
831    if (nir->info.stage == MESA_SHADER_VERTEX) {
832       /* We leave holes in the attribute locations, but pan_shader.c assumes the
833        * opposite. Patch attribute_count accordingly, so
834        * pan_shader_prepare_rsd() does what we expect.
835        */
836       uint32_t gen_attribs =
837          (shader->info.attributes_read & VERT_BIT_GENERIC_ALL) >>
838          VERT_ATTRIB_GENERIC0;
839 
840       shader->info.attribute_count = util_last_bit(gen_attribs);
841 
842       /* NULL IDVS shaders are not allowed. */
843       if (!bin_size)
844          shader->info.vs.idvs = false;
845    }
846 
847    /* Image attributes start at MAX_VS_ATTRIBS in the VS attribute table,
848     * and at zero in other stages.
849     */
850    if (shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_IMG] > 0)
851       shader->info.attribute_count =
852          shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_IMG] +
853          (nir->info.stage == MESA_SHADER_VERTEX ? MAX_VS_ATTRIBS : 0);
854 #endif
855 
856    shader->local_size.x = nir->info.workgroup_size[0];
857    shader->local_size.y = nir->info.workgroup_size[1];
858    shader->local_size.z = nir->info.workgroup_size[2];
859 
860    return VK_SUCCESS;
861 }
862 
863 #if PAN_ARCH >= 9
864 static enum mali_flush_to_zero_mode
865 shader_ftz_mode(struct panvk_shader *shader)
866 {
867    if (shader->info.ftz_fp32) {
868       if (shader->info.ftz_fp16)
869          return MALI_FLUSH_TO_ZERO_MODE_ALWAYS;
870       else
871          return MALI_FLUSH_TO_ZERO_MODE_DX11;
872    } else {
873       /* We don't have a "flush FP16, preserve FP32" mode, but APIs
874        * should not be able to generate that.
875        */
876       assert(!shader->info.ftz_fp16 && !shader->info.ftz_fp32);
877       return MALI_FLUSH_TO_ZERO_MODE_PRESERVE_SUBNORMALS;
878    }
879 }
880 #endif
881 
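/* Uploads the shader binary to executable memory and emits the per-arch GPU
 * descriptors: a RENDERER_STATE descriptor on v7 and earlier for non-fragment
 * stages, or SHADER_PROGRAM descriptors on v9+ (one per IDVS variant for
 * vertex shaders). */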
882 static VkResult
883 panvk_shader_upload(struct panvk_device *dev, struct panvk_shader *shader,
884                     const VkAllocationCallbacks *pAllocator)
885 {
886    shader->code_mem = (struct panvk_priv_mem){0};
887 
888 #if PAN_ARCH <= 7
889    shader->rsd = (struct panvk_priv_mem){0};
890 #else
891    shader->spd = (struct panvk_priv_mem){0};
892 #endif
893 
894    if (!shader->bin_size)
895       return VK_SUCCESS;
896 
897    shader->code_mem = panvk_pool_upload_aligned(
898       &dev->mempools.exec, shader->bin_ptr, shader->bin_size, 128);
899    if (!panvk_priv_mem_dev_addr(shader->code_mem))
900       return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
901 
902 #if PAN_ARCH <= 7
903    if (shader->info.stage == MESA_SHADER_FRAGMENT)
904       return VK_SUCCESS;
905 
906    shader->rsd = panvk_pool_alloc_desc(&dev->mempools.rw, RENDERER_STATE);
907    if (!panvk_priv_mem_dev_addr(shader->rsd))
908       return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
909 
910    pan_cast_and_pack(panvk_priv_mem_host_addr(shader->rsd), RENDERER_STATE,
911                      cfg) {
912       pan_shader_prepare_rsd(&shader->info, panvk_shader_get_dev_addr(shader),
913                              &cfg);
914    }
915 #else
916    if (shader->info.stage != MESA_SHADER_VERTEX) {
917       shader->spd = panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
918       if (!panvk_priv_mem_dev_addr(shader->spd))
919          return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
920 
921       pan_cast_and_pack(panvk_priv_mem_host_addr(shader->spd), SHADER_PROGRAM,
922                         cfg) {
923          cfg.stage = pan_shader_stage(&shader->info);
924 
925          if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
926             cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
927          else if (cfg.stage == MALI_SHADER_STAGE_VERTEX)
928             cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
929 
930          cfg.register_allocation =
931             pan_register_allocation(shader->info.work_reg_count);
932          cfg.binary = panvk_shader_get_dev_addr(shader);
933          cfg.preload.r48_r63 = (shader->info.preload >> 48);
934          cfg.flush_to_zero_mode = shader_ftz_mode(shader);
935 
936          if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
937             cfg.requires_helper_threads = shader->info.contains_barrier;
938       }
939    } else {
940       shader->spds.pos_points =
941          panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
942       if (!panvk_priv_mem_dev_addr(shader->spds.pos_points))
943          return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
944 
945       pan_cast_and_pack(panvk_priv_mem_host_addr(shader->spds.pos_points),
946                         SHADER_PROGRAM, cfg) {
947          cfg.stage = pan_shader_stage(&shader->info);
948          cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
949          cfg.register_allocation =
950             pan_register_allocation(shader->info.work_reg_count);
951          cfg.binary = panvk_shader_get_dev_addr(shader);
952          cfg.preload.r48_r63 = (shader->info.preload >> 48);
953          cfg.flush_to_zero_mode = shader_ftz_mode(shader);
954       }
955 
956       shader->spds.pos_triangles =
957          panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
958       if (!panvk_priv_mem_dev_addr(shader->spds.pos_triangles))
959          return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
960 
961       pan_cast_and_pack(panvk_priv_mem_host_addr(shader->spds.pos_triangles),
962                         SHADER_PROGRAM, cfg) {
963          cfg.stage = pan_shader_stage(&shader->info);
964          cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
965          cfg.register_allocation =
966             pan_register_allocation(shader->info.work_reg_count);
967          cfg.binary =
968             panvk_shader_get_dev_addr(shader) + shader->info.vs.no_psiz_offset;
969          cfg.preload.r48_r63 = (shader->info.preload >> 48);
970          cfg.flush_to_zero_mode = shader_ftz_mode(shader);
971       }
972 
973       if (shader->info.vs.secondary_enable) {
974          shader->spds.var =
975             panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
976          if (!panvk_priv_mem_dev_addr(shader->spds.var))
977             return panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
978 
979          pan_cast_and_pack(panvk_priv_mem_host_addr(shader->spds.var),
980                            SHADER_PROGRAM, cfg) {
981             unsigned work_count = shader->info.vs.secondary_work_reg_count;
982 
983             cfg.stage = pan_shader_stage(&shader->info);
984             cfg.vertex_warp_limit = MALI_WARP_LIMIT_FULL;
985             cfg.register_allocation = pan_register_allocation(work_count);
986             cfg.binary = panvk_shader_get_dev_addr(shader) +
987                          shader->info.vs.secondary_offset;
988             cfg.preload.r48_r63 = (shader->info.vs.secondary_preload >> 48);
989             cfg.flush_to_zero_mode = shader_ftz_mode(shader);
990          }
991       }
992    }
993 #endif
994 
995    return VK_SUCCESS;
996 }
997 
998 static void
999 panvk_shader_destroy(struct vk_device *vk_dev, struct vk_shader *vk_shader,
1000                      const VkAllocationCallbacks *pAllocator)
1001 {
1002    struct panvk_device *dev = to_panvk_device(vk_dev);
1003    struct panvk_shader *shader =
1004       container_of(vk_shader, struct panvk_shader, vk);
1005 
1006    free((void *)shader->asm_str);
1007    ralloc_free((void *)shader->nir_str);
1008 
1009    panvk_pool_free_mem(&shader->code_mem);
1010 
1011 #if PAN_ARCH <= 7
1012    panvk_pool_free_mem(&shader->rsd);
1013    panvk_pool_free_mem(&shader->desc_info.others.map);
1014 #else
1015    if (shader->info.stage != MESA_SHADER_VERTEX) {
1016       panvk_pool_free_mem(&shader->spd);
1017    } else {
1018       panvk_pool_free_mem(&shader->spds.var);
1019       panvk_pool_free_mem(&shader->spds.pos_points);
1020       panvk_pool_free_mem(&shader->spds.pos_triangles);
1021    }
1022 #endif
1023 
1024    free((void *)shader->bin_ptr);
1025    vk_shader_free(&dev->vk, pAllocator, &shader->vk);
1026 }
1027 
1028 static const struct vk_shader_ops panvk_shader_ops;
1029 
1030 static VkResult
1031 panvk_compile_shader(struct panvk_device *dev,
1032                      struct vk_shader_compile_info *info,
1033                      const struct vk_graphics_pipeline_state *state,
1034                      uint32_t *noperspective_varyings,
1035                      const VkAllocationCallbacks *pAllocator,
1036                      struct vk_shader **shader_out)
1037 {
1038    struct panvk_physical_device *phys_dev =
1039       to_panvk_physical_device(dev->vk.physical);
1040 
1041    struct panvk_shader *shader;
1042    VkResult result;
1043 
1044    /* We consume the NIR, regardless of success or failure */
1045    nir_shader *nir = info->nir;
1046 
1047    shader = vk_shader_zalloc(&dev->vk, &panvk_shader_ops, info->stage,
1048                              pAllocator, sizeof(*shader));
1049    if (shader == NULL)
1050       return panvk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
1051 
1052    struct panfrost_compile_inputs inputs = {
1053       .gpu_id = phys_dev->kmod.props.gpu_prod_id,
1054       .no_ubo_to_push = true,
1055       .view_mask = (state && state->rp) ? state->rp->view_mask : 0,
1056    };
1057 
1058    if (info->stage == MESA_SHADER_FRAGMENT && state != NULL &&
1059        state->ms != NULL && state->ms->sample_shading_enable)
1060       nir->info.fs.uses_sample_shading = true;
1061 
1062    panvk_lower_nir(dev, nir, info->set_layout_count, info->set_layouts,
1063                    info->robustness, noperspective_varyings, &inputs, shader);
1064 
1065 #if PAN_ARCH >= 9
1066    if (info->stage == MESA_SHADER_FRAGMENT)
1067       /* Use LD_VAR_BUF[_IMM] for varyings if possible. */
1068       inputs.valhall.use_ld_var_buf = panvk_use_ld_var_buf(shader);
1069 #endif
1070 
1071    result = panvk_compile_nir(dev, nir, info->flags, &inputs, shader);
1072 
1073    /* We need to update info.push.count because it's used to initialize the
1074     * RSD in pan_shader_prepare_rsd(). */
1075    shader->info.push.count = shader->fau.total_count * 2;
1076 
1077    if (result != VK_SUCCESS) {
1078       panvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
1079       return result;
1080    }
1081 
1082    result = panvk_shader_upload(dev, shader, pAllocator);
1083 
1084    if (result != VK_SUCCESS) {
1085       panvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
1086       return result;
1087    }
1088 
1089    *shader_out = &shader->vk;
1090 
1091    return result;
1092 }
1093 
1094 static VkResult
1095 panvk_compile_shaders(struct vk_device *vk_dev, uint32_t shader_count,
1096                       struct vk_shader_compile_info *infos,
1097                       const struct vk_graphics_pipeline_state *state,
1098                       const VkAllocationCallbacks *pAllocator,
1099                       struct vk_shader **shaders_out)
1100 {
1101    struct panvk_device *dev = to_panvk_device(vk_dev);
1102    bool use_static_noperspective = false;
1103    uint32_t noperspective_varyings = 0;
1104    VkResult result;
1105    int32_t i;
1106 
1107    /* The Vulkan runtime passes us shaders in stage order, so the FS will always
1108     * be last if it exists. Iterate shaders in reverse order to ensure FS is
1109     * processed before VS. */
1110    for (i = shader_count - 1; i >= 0; i--) {
1111       uint32_t *noperspective_varyings_ptr =
1112          use_static_noperspective ? &noperspective_varyings : NULL;
1113       result = panvk_compile_shader(dev, &infos[i], state,
1114                                     noperspective_varyings_ptr,
1115                                     pAllocator,
1116                                     &shaders_out[i]);
1117 
1118       if (result != VK_SUCCESS)
1119          goto err_cleanup;
1120 
1121       /* If we are linking VS and FS, we can use the static interpolation
1122        * qualifiers from the FS in the VS. */
1123       if (infos[i].nir->info.stage == MESA_SHADER_FRAGMENT) {
1124          struct panvk_shader *shader =
1125             container_of(shaders_out[i], struct panvk_shader, vk);
1126 
1127          use_static_noperspective = true;
1128          noperspective_varyings = shader->info.varyings.noperspective;
1129       }
1130 
1131       /* Clean up NIR for the current shader */
1132       ralloc_free(infos[i].nir);
1133    }
1134 
1135    /* TODO: If we get multiple shaders here, we can perform part of the link
1136     * logic at compile time. */
1137 
1138    return VK_SUCCESS;
1139 
1140 err_cleanup:
1141    /* Clean up all the shaders before this point */
1142    for (int32_t j = shader_count - 1; j > i; j--)
1143       panvk_shader_destroy(&dev->vk, shaders_out[j], pAllocator);
1144 
1145    /* Clean up all the NIR from this point */
1146    for (int32_t j = i; j >= 0; j--)
1147       ralloc_free(infos[j].nir);
1148 
1149    /* Memset the output array */
1150    memset(shaders_out, 0, shader_count * sizeof(*shaders_out));
1151 
1152    return result;
1153 }
1154 
1155 static VkResult
1156 shader_desc_info_deserialize(struct blob_reader *blob,
1157                              struct panvk_shader *shader)
1158 {
1159    shader->desc_info.used_set_mask = blob_read_uint32(blob);
1160 
1161 #if PAN_ARCH <= 7
1162    shader->desc_info.dyn_ubos.count = blob_read_uint32(blob);
1163    blob_copy_bytes(blob, shader->desc_info.dyn_ubos.map,
1164                    shader->desc_info.dyn_ubos.count);
1165    shader->desc_info.dyn_ssbos.count = blob_read_uint32(blob);
1166    blob_copy_bytes(blob, shader->desc_info.dyn_ssbos.map,
1167                    shader->desc_info.dyn_ssbos.count);
1168 
1169    uint32_t others_count = 0;
1170    for (unsigned i = 0; i < ARRAY_SIZE(shader->desc_info.others.count); i++) {
1171       shader->desc_info.others.count[i] = blob_read_uint32(blob);
1172       others_count += shader->desc_info.others.count[i];
1173    }
1174 
1175    if (others_count) {
1176       struct panvk_device *dev = to_panvk_device(shader->vk.base.device);
1177       struct panvk_pool_alloc_info alloc_info = {
1178          .size = others_count * sizeof(uint32_t),
1179          .alignment = sizeof(uint32_t),
1180       };
1181       shader->desc_info.others.map =
1182          panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
1183       uint32_t *copy_table =
1184          panvk_priv_mem_host_addr(shader->desc_info.others.map);
1185 
1186       if (!copy_table)
1187          return panvk_error(shader, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1188 
1189       blob_copy_bytes(blob, copy_table, others_count * sizeof(*copy_table));
1190    }
1191 #else
1192    shader->desc_info.dyn_bufs.count = blob_read_uint32(blob);
1193    blob_copy_bytes(blob, shader->desc_info.dyn_bufs.map,
1194                    sizeof(*shader->desc_info.dyn_bufs.map) *
1195                    shader->desc_info.dyn_bufs.count);
1196 #endif
1197 
1198    return VK_SUCCESS;
1199 }
1200 
1201 static VkResult
1202 panvk_deserialize_shader(struct vk_device *vk_dev, struct blob_reader *blob,
1203                          uint32_t binary_version,
1204                          const VkAllocationCallbacks *pAllocator,
1205                          struct vk_shader **shader_out)
1206 {
1207    struct panvk_device *device = to_panvk_device(vk_dev);
1208    struct panvk_shader *shader;
1209    VkResult result;
1210 
1211    struct pan_shader_info info;
1212    blob_copy_bytes(blob, &info, sizeof(info));
1213 
1214    struct panvk_shader_fau_info fau;
1215    blob_copy_bytes(blob, &fau, sizeof(fau));
1216 
1217    struct pan_compute_dim local_size;
1218    blob_copy_bytes(blob, &local_size, sizeof(local_size));
1219 
1220    const uint32_t bin_size = blob_read_uint32(blob);
1221 
1222    if (blob->overrun)
1223       return panvk_error(device, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
1224 
1225    shader = vk_shader_zalloc(vk_dev, &panvk_shader_ops, info.stage, pAllocator,
1226                              sizeof(*shader));
1227    if (shader == NULL)
1228       return panvk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1229 
1230    shader->info = info;
1231    shader->fau = fau;
1232    shader->local_size = local_size;
1233    shader->bin_size = bin_size;
1234 
1235    shader->bin_ptr = malloc(bin_size);
1236    if (shader->bin_ptr == NULL) {
1237       panvk_shader_destroy(vk_dev, &shader->vk, pAllocator);
1238       return panvk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1239    }
1240 
1241    blob_copy_bytes(blob, (void *)shader->bin_ptr, shader->bin_size);
1242 
1243    result = shader_desc_info_deserialize(blob, shader);
1244 
1245    if (result != VK_SUCCESS) {
1246       panvk_shader_destroy(vk_dev, &shader->vk, pAllocator);
1247       return panvk_error(device, result);
1248    }
1249 
1250    if (blob->overrun) {
1251       panvk_shader_destroy(vk_dev, &shader->vk, pAllocator);
1252       return panvk_error(device, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
1253    }
1254 
1255    result = panvk_shader_upload(device, shader, pAllocator);
1256 
1257    if (result != VK_SUCCESS) {
1258       panvk_shader_destroy(vk_dev, &shader->vk, pAllocator);
1259       return result;
1260    }
1261 
1262    *shader_out = &shader->vk;
1263 
1264    return result;
1265 }
1266 
1267 static void
1268 shader_desc_info_serialize(struct blob *blob, const struct panvk_shader *shader)
1269 {
1270    blob_write_uint32(blob, shader->desc_info.used_set_mask);
1271 
1272 #if PAN_ARCH <= 7
1273    blob_write_uint32(blob, shader->desc_info.dyn_ubos.count);
1274    blob_write_bytes(blob, shader->desc_info.dyn_ubos.map,
1275                     sizeof(*shader->desc_info.dyn_ubos.map) *
1276                        shader->desc_info.dyn_ubos.count);
1277    blob_write_uint32(blob, shader->desc_info.dyn_ssbos.count);
1278    blob_write_bytes(blob, shader->desc_info.dyn_ssbos.map,
1279                     sizeof(*shader->desc_info.dyn_ssbos.map) *
1280                        shader->desc_info.dyn_ssbos.count);
1281 
1282    unsigned others_count = 0;
1283    for (unsigned i = 0; i < ARRAY_SIZE(shader->desc_info.others.count); i++) {
1284       blob_write_uint32(blob, shader->desc_info.others.count[i]);
1285       others_count += shader->desc_info.others.count[i];
1286    }
1287 
1288    blob_write_bytes(blob,
1289                     panvk_priv_mem_host_addr(shader->desc_info.others.map),
1290                     sizeof(uint32_t) * others_count);
1291 #else
1292    blob_write_uint32(blob, shader->desc_info.dyn_bufs.count);
1293    blob_write_bytes(blob, shader->desc_info.dyn_bufs.map,
1294                     sizeof(*shader->desc_info.dyn_bufs.map) *
1295                     shader->desc_info.dyn_bufs.count);
1296 #endif
1297 }
1298 
1299 static bool
1300 panvk_shader_serialize(struct vk_device *vk_dev,
1301                        const struct vk_shader *vk_shader, struct blob *blob)
1302 {
1303    struct panvk_shader *shader =
1304       container_of(vk_shader, struct panvk_shader, vk);
1305 
1306    /**
1307     * We can't currently cache assembly.
1308     * TODO: Implement serialization with assembly.
1309     **/
1310    if (shader->nir_str != NULL || shader->asm_str != NULL)
1311       return false;
1312 
1313    blob_write_bytes(blob, &shader->info, sizeof(shader->info));
1314    blob_write_bytes(blob, &shader->fau, sizeof(shader->fau));
1315    blob_write_bytes(blob, &shader->local_size, sizeof(shader->local_size));
1316    blob_write_uint32(blob, shader->bin_size);
1317    blob_write_bytes(blob, shader->bin_ptr, shader->bin_size);
1318    shader_desc_info_serialize(blob, shader);
1319 
1320    return !blob->out_of_memory;
1321 }
1322 
1323 #define WRITE_STR(field, ...)                                                  \
1324    ({                                                                          \
1325       memset(field, 0, sizeof(field));                                         \
1326       UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__);              \
1327       assert(i > 0 && i < sizeof(field));                                      \
1328    })
1329 
1330 static VkResult
1331 panvk_shader_get_executable_properties(
1332    UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
1333    uint32_t *executable_count, VkPipelineExecutablePropertiesKHR *properties)
1334 {
1335    UNUSED struct panvk_shader *shader =
1336       container_of(vk_shader, struct panvk_shader, vk);
1337 
1338    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, properties,
1339                           executable_count);
1340 
1341    vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props)
1342    {
1343       props->stages = mesa_to_vk_shader_stage(shader->info.stage);
1344       props->subgroupSize = 8;
1345       WRITE_STR(props->name, "%s",
1346                 _mesa_shader_stage_to_string(shader->info.stage));
1347       WRITE_STR(props->description, "%s shader",
1348                 _mesa_shader_stage_to_string(shader->info.stage));
1349    }
1350 
1351    return vk_outarray_status(&out);
1352 }
1353 
1354 static VkResult
1355 panvk_shader_get_executable_statistics(
1356    UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
1357    uint32_t executable_index, uint32_t *statistic_count,
1358    VkPipelineExecutableStatisticKHR *statistics)
1359 {
1360    UNUSED struct panvk_shader *shader =
1361       container_of(vk_shader, struct panvk_shader, vk);
1362 
1363    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, statistics,
1364                           statistic_count);
1365 
1366    assert(executable_index == 0);
1367 
1368    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat)
1369    {
1370       WRITE_STR(stat->name, "Code Size");
1371       WRITE_STR(stat->description,
1372                 "Size of the compiled shader binary, in bytes");
1373       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
1374       stat->value.u64 = shader->bin_size;
1375    }
1376 
1377    /* TODO: more executable statistics (VK_KHR_pipeline_executable_properties) */
1378 
1379    return vk_outarray_status(&out);
1380 }
1381 
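/* Copy an IR string into a VkPipelineExecutableInternalRepresentationKHR
 * following the usual Vulkan two-call idiom: with pData == NULL only the
 * required size is reported; otherwise the text is copied and false is
 * returned if it had to be truncated.
 */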
1382 static bool
1383 write_ir_text(VkPipelineExecutableInternalRepresentationKHR *ir,
1384               const char *data)
1385 {
1386    ir->isText = VK_TRUE;
1387 
1388    size_t data_len = strlen(data) + 1;
1389 
1390    if (ir->pData == NULL) {
1391       ir->dataSize = data_len;
1392       return true;
1393    }
1394 
1395    strncpy(ir->pData, data, ir->dataSize);
1396    if (ir->dataSize < data_len)
1397       return false;
1398 
1399    ir->dataSize = data_len;
1400    return true;
1401 }
1402 
1403 static VkResult
1404 panvk_shader_get_executable_internal_representations(
1405    UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
1406    uint32_t executable_index, uint32_t *internal_representation_count,
1407    VkPipelineExecutableInternalRepresentationKHR *internal_representations)
1408 {
1409    UNUSED struct panvk_shader *shader =
1410       container_of(vk_shader, struct panvk_shader, vk);
1411    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
1412                           internal_representations,
1413                           internal_representation_count);
1414    bool incomplete_text = false;
1415 
1416    if (shader->nir_str != NULL) {
1417       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
1418                                &out, ir)
1419       {
1420          WRITE_STR(ir->name, "NIR shader");
1421          WRITE_STR(ir->description,
1422                    "NIR shader before sending to the back-end compiler");
1423          if (!write_ir_text(ir, shader->nir_str))
1424             incomplete_text = true;
1425       }
1426    }
1427 
1428    if (shader->asm_str != NULL) {
1429       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
1430                                &out, ir)
1431       {
1432          WRITE_STR(ir->name, "Assembly");
1433          WRITE_STR(ir->description, "Final Assembly");
1434          if (!write_ir_text(ir, shader->asm_str))
1435             incomplete_text = true;
1436       }
1437    }
1438 
1439    return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
1440 }
1441 
1442 #if PAN_ARCH <= 7
1443 static mali_pixel_format
1444 get_varying_format(gl_shader_stage stage, gl_varying_slot loc,
1445                    enum pipe_format pfmt)
1446 {
1447    switch (loc) {
1448    case VARYING_SLOT_PNTC:
1449    case VARYING_SLOT_PSIZ:
1450 #if PAN_ARCH <= 6
1451       return (MALI_R16F << 12) | panfrost_get_default_swizzle(1);
1452 #else
1453       return (MALI_R16F << 12) | MALI_RGB_COMPONENT_ORDER_R000;
1454 #endif
1455    case VARYING_SLOT_POS:
1456 #if PAN_ARCH <= 6
1457       return (MALI_SNAP_4 << 12) | panfrost_get_default_swizzle(4);
1458 #else
1459       return (MALI_SNAP_4 << 12) | MALI_RGB_COMPONENT_ORDER_RGBA;
1460 #endif
1461    default:
1462       assert(pfmt != PIPE_FORMAT_NONE);
1463       return GENX(panfrost_format_from_pipe_format)(pfmt)->hw;
1464    }
1465 }
1466 
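/* Varying formats collected from one shader stage, plus a bitset of the
 * slots that are actually in use.
 */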
1467 struct varyings_info {
1468    enum pipe_format fmts[VARYING_SLOT_MAX];
1469    BITSET_DECLARE(active, VARYING_SLOT_MAX);
1470 };
1471 
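/* Record the format of each varying reported by the compiler, ignoring slots
 * left at PIPE_FORMAT_NONE.
 */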
1472 static void
1473 collect_varyings_info(const struct pan_shader_varying *varyings,
1474                       unsigned varying_count, struct varyings_info *info)
1475 {
1476    for (unsigned i = 0; i < varying_count; i++) {
1477       gl_varying_slot loc = varyings[i].location;
1478 
1479       if (varyings[i].format == PIPE_FORMAT_NONE)
1480          continue;
1481 
1482       info->fmts[loc] = varyings[i].format;
1483       BITSET_SET(info->active, loc);
1484    }
1485 }
1486 
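/* Position and point size go to dedicated varying buffers (they are consumed
 * directly by the tiler); everything else lands in the general buffer.
 */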
1487 static inline enum panvk_varying_buf_id
1488 varying_buf_id(gl_varying_slot loc)
1489 {
1490    switch (loc) {
1491    case VARYING_SLOT_POS:
1492       return PANVK_VARY_BUF_POSITION;
1493    case VARYING_SLOT_PSIZ:
1494       return PANVK_VARY_BUF_PSIZ;
1495    default:
1496       return PANVK_VARY_BUF_GENERAL;
1497    }
1498 }
1499 
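/* Translate a varying slot/format to the hardware pixel format. Point coord
 * and point size are packed as R16F, gl_Position uses the dedicated
 * MALI_SNAP_4 encoding, and all other slots derive their format from the
 * pipe format.
 */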
1500 static mali_pixel_format
1501 varying_format(gl_varying_slot loc, enum pipe_format pfmt)
1502 {
1503    switch (loc) {
1504    case VARYING_SLOT_PNTC:
1505    case VARYING_SLOT_PSIZ:
1506 #if PAN_ARCH <= 6
1507       return (MALI_R16F << 12) | panfrost_get_default_swizzle(1);
1508 #else
1509       return (MALI_R16F << 12) | MALI_RGB_COMPONENT_ORDER_R000;
1510 #endif
1511    case VARYING_SLOT_POS:
1512 #if PAN_ARCH <= 6
1513       return (MALI_SNAP_4 << 12) | panfrost_get_default_swizzle(4);
1514 #else
1515       return (MALI_SNAP_4 << 12) | MALI_RGB_COMPONENT_ORDER_RGBA;
1516 #endif
1517    default:
1518       return GENX(panfrost_format_from_pipe_format)(pfmt)->hw;
1519    }
1520 }
1521 
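/* Emit one hardware ATTRIBUTE descriptor per varying of the given stage.
 * Slots left with PIPE_FORMAT_NONE become constant (sink) attributes, so
 * writes are discarded and reads return zero; the rest point at their
 * varying buffer with the offsets computed by the linking code.
 */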
1522 static VkResult
1523 emit_varying_attrs(struct panvk_pool *desc_pool,
1524                    const struct pan_shader_varying *varyings,
1525                    unsigned varying_count, const struct varyings_info *info,
1526                    unsigned *buf_offsets, struct panvk_priv_mem *mem)
1527 {
1528    unsigned attr_count = BITSET_COUNT(info->active);
1529 
1530    *mem = panvk_pool_alloc_desc_array(desc_pool, attr_count, ATTRIBUTE);
1531 
1532    if (attr_count && !panvk_priv_mem_dev_addr(*mem))
1533       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1534 
1535    struct mali_attribute_packed *attrs = panvk_priv_mem_host_addr(*mem);
1536    unsigned attr_idx = 0;
1537 
1538    for (unsigned i = 0; i < varying_count; i++) {
1539       pan_pack(&attrs[attr_idx++], ATTRIBUTE, cfg) {
1540          gl_varying_slot loc = varyings[i].location;
1541          enum pipe_format pfmt = varyings[i].format != PIPE_FORMAT_NONE
1542                                     ? info->fmts[loc]
1543                                     : PIPE_FORMAT_NONE;
1544 
1545          if (pfmt == PIPE_FORMAT_NONE) {
1546 #if PAN_ARCH >= 7
1547             cfg.format = (MALI_CONSTANT << 12) | MALI_RGB_COMPONENT_ORDER_0000;
1548 #else
1549             cfg.format = (MALI_CONSTANT << 12) | PAN_V6_SWIZZLE(0, 0, 0, 0);
1550 #endif
1551          } else {
1552             cfg.buffer_index = varying_buf_id(loc);
1553             cfg.offset = buf_offsets[loc];
1554             cfg.format = varying_format(loc, info->fmts[loc]);
1555          }
1556          cfg.offset_enable = false;
1557       }
1558    }
1559 
1560    return VK_SUCCESS;
1561 }
1562 
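/* Link a vertex shader with an optional fragment shader: gather the varyings
 * of both stages, resolve a common format per slot, assign offsets/strides in
 * the varying buffers and emit the attribute descriptors each stage will use
 * at draw time. Position and point size keep their dedicated buffers.
 */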
1563 VkResult
1564 panvk_per_arch(link_shaders)(struct panvk_pool *desc_pool,
1565                              const struct panvk_shader *vs,
1566                              const struct panvk_shader *fs,
1567                              struct panvk_shader_link *link)
1568 {
1569    BITSET_DECLARE(active_attrs, VARYING_SLOT_MAX) = {0};
1570    unsigned buf_strides[PANVK_VARY_BUF_MAX] = {0};
1571    unsigned buf_offsets[VARYING_SLOT_MAX] = {0};
1572    struct varyings_info out_vars = {0};
1573    struct varyings_info in_vars = {0};
1574    unsigned loc;
1575 
1576    assert(vs);
1577    assert(vs->info.stage == MESA_SHADER_VERTEX);
1578 
1579    collect_varyings_info(vs->info.varyings.output,
1580                          vs->info.varyings.output_count, &out_vars);
1581 
1582    if (fs) {
1583       assert(fs->info.stage == MESA_SHADER_FRAGMENT);
1584       collect_varyings_info(fs->info.varyings.input,
1585                             fs->info.varyings.input_count, &in_vars);
1586    }
1587 
1588    BITSET_OR(active_attrs, in_vars.active, out_vars.active);
1589 
1590    /* Handle the position and point size buffers explicitly, as they are
1591     * passed through separate buffer pointers to the tiler job.
1592     */
1593    if (BITSET_TEST(out_vars.active, VARYING_SLOT_POS)) {
1594       buf_strides[PANVK_VARY_BUF_POSITION] = sizeof(float) * 4;
1595       BITSET_CLEAR(active_attrs, VARYING_SLOT_POS);
1596    }
1597 
1598    if (BITSET_TEST(out_vars.active, VARYING_SLOT_PSIZ)) {
1599       buf_strides[PANVK_VARY_BUF_PSIZ] = sizeof(uint16_t);
1600       BITSET_CLEAR(active_attrs, VARYING_SLOT_PSIZ);
1601    }
1602 
1603    BITSET_FOREACH_SET(loc, active_attrs, VARYING_SLOT_MAX) {
1604       /* We expect the VS to write to all inputs read by the FS, and the
1605        * FS to read all inputs written by the VS. If that's not the
1606        * case, we keep PIPE_FORMAT_NONE to reflect the fact we should use a
1607        * sink attribute (writes are discarded, reads return zeros).
1608        */
1609       if (in_vars.fmts[loc] == PIPE_FORMAT_NONE ||
1610           out_vars.fmts[loc] == PIPE_FORMAT_NONE) {
1611          in_vars.fmts[loc] = PIPE_FORMAT_NONE;
1612          out_vars.fmts[loc] = PIPE_FORMAT_NONE;
1613          continue;
1614       }
1615 
1616       unsigned out_size = util_format_get_blocksize(out_vars.fmts[loc]);
1617       unsigned buf_idx = varying_buf_id(loc);
1618 
1619       /* Always trust the FS input format, so we can:
1620        * - discard components that are never read
1621        * - use float types for interpolated fragment shader inputs
1622        * - use fp16 for floats with mediump
1623        * - make sure components that are not written by the VS are set to zero
1624        */
1625       out_vars.fmts[loc] = in_vars.fmts[loc];
1626 
1627       /* Special buffers are handled explicitly before this loop; everything
1628        * else should be laid out in the general varying buffer.
1629        */
1630       assert(buf_idx == PANVK_VARY_BUF_GENERAL);
1631 
1632       /* Keep things aligned to a 32-bit component. */
1633       buf_offsets[loc] = buf_strides[buf_idx];
1634       buf_strides[buf_idx] += ALIGN_POT(out_size, 4);
1635    }
1636 
1637    VkResult result = emit_varying_attrs(
1638       desc_pool, vs->info.varyings.output, vs->info.varyings.output_count,
1639       &out_vars, buf_offsets, &link->vs.attribs);
1640    if (result != VK_SUCCESS)
1641       return result;
1642 
1643    if (fs) {
1644       result = emit_varying_attrs(desc_pool, fs->info.varyings.input,
1645                                   fs->info.varyings.input_count, &in_vars,
1646                                   buf_offsets, &link->fs.attribs);
1647       if (result != VK_SUCCESS)
1648          return result;
1649    }
1650 
1651    memcpy(link->buf_strides, buf_strides, sizeof(link->buf_strides));
1652    return VK_SUCCESS;
1653 }
1654 #endif
1655 
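/* Shader object vtable used by the common vk_shader framework for shaders
 * produced by panvk_compile_shaders()/panvk_deserialize_shader().
 */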
1656 static const struct vk_shader_ops panvk_shader_ops = {
1657    .destroy = panvk_shader_destroy,
1658    .serialize = panvk_shader_serialize,
1659    .get_executable_properties = panvk_shader_get_executable_properties,
1660    .get_executable_statistics = panvk_shader_get_executable_statistics,
1661    .get_executable_internal_representations =
1662       panvk_shader_get_executable_internal_representations,
1663 };
1664 
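/* Bind a shader to the command buffer state for the given stage and mark the
 * stage (and its push uniforms) dirty so it gets re-emitted on the next
 * draw/dispatch.
 */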
1665 static void
1666 panvk_cmd_bind_shader(struct panvk_cmd_buffer *cmd, const gl_shader_stage stage,
1667                       struct panvk_shader *shader)
1668 {
1669    switch (stage) {
1670    case MESA_SHADER_COMPUTE:
1671       if (cmd->state.compute.shader != shader) {
1672          cmd->state.compute.shader = shader;
1673          compute_state_set_dirty(cmd, CS);
1674          compute_state_set_dirty(cmd, PUSH_UNIFORMS);
1675       }
1676       break;
1677    case MESA_SHADER_VERTEX:
1678       if (cmd->state.gfx.vs.shader != shader) {
1679          cmd->state.gfx.vs.shader = shader;
1680          gfx_state_set_dirty(cmd, VS);
1681          gfx_state_set_dirty(cmd, VS_PUSH_UNIFORMS);
1682       }
1683       break;
1684    case MESA_SHADER_FRAGMENT:
1685       if (cmd->state.gfx.fs.shader != shader) {
1686          cmd->state.gfx.fs.shader = shader;
1687          gfx_state_set_dirty(cmd, FS);
1688          gfx_state_set_dirty(cmd, FS_PUSH_UNIFORMS);
1689       }
1690       break;
1691    default:
1692       assert(!"Unsupported stage");
1693       break;
1694    }
1695 }
1696 
1697 static void
1698 panvk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count,
1699                        const gl_shader_stage *stages,
1700                        struct vk_shader **const shaders)
1701 {
1702    struct panvk_cmd_buffer *cmd =
1703       container_of(vk_cmd, struct panvk_cmd_buffer, vk);
1704 
1705    for (uint32_t i = 0; i < stage_count; i++) {
1706       struct panvk_shader *shader =
1707          container_of(shaders[i], struct panvk_shader, vk);
1708 
1709       panvk_cmd_bind_shader(cmd, stages[i], shader);
1710    }
1711 }
1712 
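/* Per-arch entry points consumed by the common Vulkan runtime shader code. */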
1713 const struct vk_device_shader_ops panvk_per_arch(device_shader_ops) = {
1714    .get_nir_options = panvk_get_nir_options,
1715    .get_spirv_options = panvk_get_spirv_options,
1716    .preprocess_nir = panvk_preprocess_nir,
1717    .hash_graphics_state = panvk_hash_graphics_state,
1718    .compile = panvk_compile_shaders,
1719    .deserialize = panvk_deserialize_shader,
1720    .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
1721    .cmd_bind_shaders = panvk_cmd_bind_shaders,
1722 };
1723 
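/* Internal (meta) shaders only own their uploaded code and an rsd
 * (PAN_ARCH <= 7) or spd descriptor, so destruction just releases those pool
 * allocations.
 */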
1724 static void
1725 panvk_internal_shader_destroy(struct vk_device *vk_dev,
1726                               struct vk_shader *vk_shader,
1727                               const VkAllocationCallbacks *pAllocator)
1728 {
1729    struct panvk_device *dev = to_panvk_device(vk_dev);
1730    struct panvk_internal_shader *shader =
1731       container_of(vk_shader, struct panvk_internal_shader, vk);
1732 
1733    panvk_pool_free_mem(&shader->code_mem);
1734 
1735 #if PAN_ARCH <= 7
1736    panvk_pool_free_mem(&shader->rsd);
1737 #else
1738    panvk_pool_free_mem(&shader->spd);
1739 #endif
1740 
1741    vk_shader_free(&dev->vk, pAllocator, &shader->vk);
1742 }
1743 
1744 static const struct vk_shader_ops panvk_internal_shader_ops = {
1745    .destroy = panvk_internal_shader_destroy,
1746 };
1747 
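/* Compile a NIR shader for internal (meta) use and upload the binary to the
 * device's executable pool. The rsd/spd descriptor is not allocated here.
 */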
1748 VkResult
1749 panvk_per_arch(create_internal_shader)(
1750    struct panvk_device *dev, nir_shader *nir,
1751    struct panfrost_compile_inputs *compiler_inputs,
1752    struct panvk_internal_shader **shader_out)
1753 {
1754    struct panvk_internal_shader *shader =
1755       vk_shader_zalloc(&dev->vk, &panvk_internal_shader_ops, nir->info.stage,
1756                        NULL, sizeof(*shader));
1757    if (shader == NULL)
1758       return panvk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
1759 
1760    VkResult result;
1761    struct util_dynarray binary;
1762 
1763    util_dynarray_init(&binary, nir);
1764    GENX(pan_shader_compile)(nir, compiler_inputs, &binary, &shader->info);
1765 
1766    unsigned bin_size = util_dynarray_num_elements(&binary, uint8_t);
1767    if (bin_size) {
1768       shader->code_mem = panvk_pool_upload_aligned(&dev->mempools.exec,
1769                                                    binary.data, bin_size, 128);
1770       if (!panvk_priv_mem_dev_addr(shader->code_mem)) {
1771          result = panvk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1772          goto err_free_shader;
1773       }
1774    }
1775 
1776    *shader_out = shader;
1777    return VK_SUCCESS;
1778 
1779 err_free_shader:
1780    vk_shader_free(&dev->vk, NULL, &shader->vk);
1781    return result;
1782 }
1783