1 /*
2  * Copyright © 2019 Raspberry Pi Ltd
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "vk_util.h"
25 
26 #include "v3dv_debug.h"
27 #include "v3dv_private.h"
28 
29 #include "common/v3d_debug.h"
30 #include "qpu/qpu_disasm.h"
31 
32 #include "compiler/nir/nir_builder.h"
33 #include "nir/nir_serialize.h"
34 
35 #include "util/u_atomic.h"
36 #include "util/u_prim.h"
37 #include "util/os_time.h"
38 
39 #include "vk_pipeline.h"
40 #include "vulkan/util/vk_format.h"
41 
42 static VkResult
43 compute_vpm_config(struct v3dv_pipeline *pipeline);
44 
45 void
46 v3dv_print_v3d_key(struct v3d_key *key,
47                    uint32_t v3d_key_size)
48 {
49    struct mesa_sha1 ctx;
50    unsigned char sha1[20];
51    char sha1buf[41];
52 
53    _mesa_sha1_init(&ctx);
54 
55    _mesa_sha1_update(&ctx, key, v3d_key_size);
56 
57    _mesa_sha1_final(&ctx, sha1);
58    _mesa_sha1_format(sha1buf, sha1);
59 
60    fprintf(stderr, "key %p: %s\n", key, sha1buf);
61 }
62 
63 static void
64 pipeline_compute_sha1_from_nir(struct v3dv_pipeline_stage *p_stage)
65 {
66    VkPipelineShaderStageCreateInfo info = {
67       .module = vk_shader_module_handle_from_nir(p_stage->nir),
68       .pName = p_stage->entrypoint,
69       .stage = mesa_to_vk_shader_stage(p_stage->nir->info.stage),
70    };
71 
72    vk_pipeline_hash_shader_stage(&info, p_stage->shader_sha1);
73 }
74 
75 void
76 v3dv_shader_variant_destroy(struct v3dv_device *device,
77                             struct v3dv_shader_variant *variant)
78 {
79    /* The assembly BO is shared by all variants in the pipeline, so it can't
80     * be freed here and should be freed with the pipeline
81     */
82    if (variant->qpu_insts)
83       free(variant->qpu_insts);
84    ralloc_free(variant->prog_data.base);
85    vk_free(&device->vk.alloc, variant);
86 }
87 
88 static void
89 destroy_pipeline_stage(struct v3dv_device *device,
90                        struct v3dv_pipeline_stage *p_stage,
91                        const VkAllocationCallbacks *pAllocator)
92 {
93    if (!p_stage)
94       return;
95 
96    ralloc_free(p_stage->nir);
97    vk_free2(&device->vk.alloc, pAllocator, p_stage);
98 }
99 
100 static void
101 pipeline_free_stages(struct v3dv_device *device,
102                      struct v3dv_pipeline *pipeline,
103                      const VkAllocationCallbacks *pAllocator)
104 {
105    assert(pipeline);
106 
107    /* FIXME: we can't just loop over the Mesa stages because of the bin
108     * stages; it would be good to find an alternative.
109     */
110    destroy_pipeline_stage(device, pipeline->vs, pAllocator);
111    destroy_pipeline_stage(device, pipeline->vs_bin, pAllocator);
112    destroy_pipeline_stage(device, pipeline->gs, pAllocator);
113    destroy_pipeline_stage(device, pipeline->gs_bin, pAllocator);
114    destroy_pipeline_stage(device, pipeline->fs, pAllocator);
115    destroy_pipeline_stage(device, pipeline->cs, pAllocator);
116 
117    pipeline->vs = NULL;
118    pipeline->vs_bin = NULL;
119    pipeline->gs = NULL;
120    pipeline->gs_bin = NULL;
121    pipeline->fs = NULL;
122    pipeline->cs = NULL;
123 }
124 
125 static void
126 v3dv_destroy_pipeline(struct v3dv_pipeline *pipeline,
127                       struct v3dv_device *device,
128                       const VkAllocationCallbacks *pAllocator)
129 {
130    if (!pipeline)
131       return;
132 
133    pipeline_free_stages(device, pipeline, pAllocator);
134 
135    if (pipeline->shared_data) {
136       v3dv_pipeline_shared_data_unref(device, pipeline->shared_data);
137       pipeline->shared_data = NULL;
138    }
139 
140    if (pipeline->spill.bo) {
141       assert(pipeline->spill.size_per_thread > 0);
142       v3dv_bo_free(device, pipeline->spill.bo);
143    }
144 
145    if (pipeline->default_attribute_values) {
146       v3dv_bo_free(device, pipeline->default_attribute_values);
147       pipeline->default_attribute_values = NULL;
148    }
149 
150    if (pipeline->executables.mem_ctx)
151       ralloc_free(pipeline->executables.mem_ctx);
152 
153    vk_object_free(&device->vk, pAllocator, pipeline);
154 }
155 
156 VKAPI_ATTR void VKAPI_CALL
157 v3dv_DestroyPipeline(VkDevice _device,
158                      VkPipeline _pipeline,
159                      const VkAllocationCallbacks *pAllocator)
160 {
161    V3DV_FROM_HANDLE(v3dv_device, device, _device);
162    V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, _pipeline);
163 
164    if (!pipeline)
165       return;
166 
167    v3dv_destroy_pipeline(pipeline, device, pAllocator);
168 }
169 
170 static const struct spirv_to_nir_options default_spirv_options =  {
171    .caps = {
172       .device_group = true,
173       .float_controls = true,
174       .multiview = true,
175       .storage_8bit = true,
176       .storage_16bit = true,
177       .subgroup_basic = true,
178       .variable_pointers = true,
179       .vk_memory_model = true,
180       .vk_memory_model_device_scope = true,
181       .physical_storage_buffer_address = true,
182     },
183    .ubo_addr_format = nir_address_format_32bit_index_offset,
184    .ssbo_addr_format = nir_address_format_32bit_index_offset,
185    .phys_ssbo_addr_format = nir_address_format_2x32bit_global,
186    .push_const_addr_format = nir_address_format_logical,
187    .shared_addr_format = nir_address_format_32bit_offset,
188 };
189 
190 const nir_shader_compiler_options v3dv_nir_options = {
191    .lower_uadd_sat = true,
192    .lower_usub_sat = true,
193    .lower_iadd_sat = true,
194    .lower_all_io_to_temps = true,
195    .lower_extract_byte = true,
196    .lower_extract_word = true,
197    .lower_insert_byte = true,
198    .lower_insert_word = true,
199    .lower_bitfield_insert_to_shifts = true,
200    .lower_bitfield_extract_to_shifts = true,
201    .lower_bitfield_reverse = true,
202    .lower_bit_count = true,
203    .lower_cs_local_id_to_index = true,
204    .lower_ffract = true,
205    .lower_fmod = true,
206    .lower_pack_unorm_2x16 = true,
207    .lower_pack_snorm_2x16 = true,
208    .lower_unpack_unorm_2x16 = true,
209    .lower_unpack_snorm_2x16 = true,
210    .lower_pack_unorm_4x8 = true,
211    .lower_pack_snorm_4x8 = true,
212    .lower_unpack_unorm_4x8 = true,
213    .lower_unpack_snorm_4x8 = true,
214    .lower_pack_half_2x16 = true,
215    .lower_unpack_half_2x16 = true,
216    .lower_pack_32_2x16 = true,
217    .lower_pack_32_2x16_split = true,
218    .lower_unpack_32_2x16_split = true,
219    .lower_mul_2x32_64 = true,
220    .lower_fdiv = true,
221    .lower_find_lsb = true,
222    .lower_ffma16 = true,
223    .lower_ffma32 = true,
224    .lower_ffma64 = true,
225    .lower_flrp32 = true,
226    .lower_fpow = true,
227    .lower_fsat = true,
228    .lower_fsqrt = true,
229    .lower_ifind_msb = true,
230    .lower_isign = true,
231    .lower_ldexp = true,
232    .lower_mul_high = true,
233    .lower_wpos_pntc = true,
234    .lower_rotate = true,
235    .lower_to_scalar = true,
236    .lower_device_index_to_zero = true,
237    .has_fsub = true,
238    .has_isub = true,
239    .vertex_id_zero_based = false, /* FIXME: to set this to true, the intrinsic
240                                    * needs to be supported */
241    .lower_interpolate_at = true,
242    .max_unroll_iterations = 16,
243    .force_indirect_unrolling = (nir_var_shader_in | nir_var_function_temp),
244    .divergence_analysis_options =
245       nir_divergence_multiple_workgroup_per_compute_subgroup
246 };
247 
248 const nir_shader_compiler_options *
249 v3dv_pipeline_get_nir_options(void)
250 {
251    return &v3dv_nir_options;
252 }
253 
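/* Helper used by nir_optimize() below: runs a single NIR pass on "nir",
 * folds its result into the enclosing "progress" flag, and evaluates to
 * whether this particular pass reported progress.
 */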
254 #define OPT(pass, ...) ({                                  \
255    bool this_progress = false;                             \
256    NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
257    if (this_progress)                                      \
258       progress = true;                                     \
259    this_progress;                                          \
260 })
261 
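/* Frontend NIR optimization loop: keeps re-running the passes below until
 * none of them reports progress.
 */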
262 static void
263 nir_optimize(nir_shader *nir, bool allow_copies)
264 {
265    bool progress;
266 
267    do {
268       progress = false;
269       OPT(nir_split_array_vars, nir_var_function_temp);
270       OPT(nir_shrink_vec_array_vars, nir_var_function_temp);
271       OPT(nir_opt_deref);
272       OPT(nir_lower_vars_to_ssa);
273       if (allow_copies) {
274          /* Only run this pass in the first call to nir_optimize.  Later calls
275           * assume that we've lowered away any copy_deref instructions and we
276           * don't want to introduce any more.
277           */
278          OPT(nir_opt_find_array_copies);
279       }
280 
281       OPT(nir_remove_dead_variables,
282           (nir_variable_mode)(nir_var_function_temp |
283                               nir_var_shader_temp |
284                               nir_var_mem_shared),
285           NULL);
286 
287       OPT(nir_opt_copy_prop_vars);
288       OPT(nir_opt_dead_write_vars);
289       OPT(nir_opt_combine_stores, nir_var_all);
290 
291       OPT(nir_lower_alu_to_scalar, NULL, NULL);
292 
293       OPT(nir_copy_prop);
294       OPT(nir_lower_phis_to_scalar, false);
295 
296       OPT(nir_copy_prop);
297       OPT(nir_opt_dce);
298       OPT(nir_opt_cse);
299       OPT(nir_opt_combine_stores, nir_var_all);
300 
301       /* Passing 0 to the peephole select pass causes it to convert
302        * if-statements that contain only move instructions in the branches
303        * regardless of the count.
304        *
305        * Passing 1 to the peephole select pass causes it to convert
306        * if-statements that contain at most a single ALU instruction (total)
307        * in both branches.
308        */
309       OPT(nir_opt_peephole_select, 0, false, false);
310       OPT(nir_opt_peephole_select, 8, false, true);
311 
312       OPT(nir_opt_intrinsics);
313       OPT(nir_opt_idiv_const, 32);
314       OPT(nir_opt_algebraic);
315       OPT(nir_lower_alu);
316       OPT(nir_opt_constant_folding);
317 
318       OPT(nir_opt_dead_cf);
319       if (nir_opt_trivial_continues(nir)) {
320          progress = true;
321          OPT(nir_copy_prop);
322          OPT(nir_opt_dce);
323       }
324       OPT(nir_opt_conditional_discard);
325 
326       OPT(nir_opt_remove_phis);
327       OPT(nir_opt_gcm, false);
328       OPT(nir_opt_if, nir_opt_if_optimize_phi_true_false);
329       OPT(nir_opt_undef);
330       OPT(nir_lower_pack);
331 
332       /* There are two optimizations that we don't run here and instead rely
333        * on the backend for:
334        *
335        * nir_lower_flrp: it only needs to be called once, as nothing should
336        * rematerialize any flrps. Since the backend compiler already calls it,
337        * we don't call it again here.
338        *
339        * nir_opt_loop_unroll: the backend includes custom compile strategies
340        * to get the lowest possible spills/fills, and some of them disable
341        * loop unrolling.
342        *
343        * FIXME: ideally we would like to just remove this method and use
344        * v3d_optimize_nir instead. But:
345        *
346        *   * Using it leads to some regressions on Vulkan CTS tests, due to
347        *     some of the lowerings used there.
348        *   * We would need to move to the backend some additional
349        *     lowerings/optimizations that are used on the Vulkan frontend,
350        *     which would require checking that we don't introduce any
351        *     regression or performance drop on OpenGL.
352        *
353        * For now we keep this Vulkan frontend nir_optimize.
354        */
355 
356    } while (progress);
357 }
358 
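/* Common NIR preparation applied to every stage right after SPIR-V
 * translation: lowers sysvals, variable initializers, explicit I/O
 * addressing and indirect derefs, and runs the optimization loop above.
 */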
359 static void
360 preprocess_nir(nir_shader *nir)
361 {
362    const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = {
363       .frag_coord = true,
364       .point_coord = true,
365    };
366    NIR_PASS(_, nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings);
367 
368    /* Vulkan uses the separate-shader linking model */
369    nir->info.separate_shader = true;
370 
371    /* Make sure we lower variable initializers on output variables so that
372     * nir_remove_dead_variables below sees the corresponding stores
373     */
374    NIR_PASS(_, nir, nir_lower_variable_initializers, nir_var_shader_out);
375 
376    if (nir->info.stage == MESA_SHADER_FRAGMENT)
377       NIR_PASS(_, nir, nir_lower_io_to_vector, nir_var_shader_out);
378    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
379       NIR_PASS(_, nir, nir_lower_input_attachments,
380                  &(nir_input_attachment_options) {
381                     .use_fragcoord_sysval = false,
382                        });
383    }
384 
385    NIR_PASS_V(nir, nir_lower_io_to_temporaries,
386               nir_shader_get_entrypoint(nir), true, false);
387 
388    NIR_PASS(_, nir, nir_lower_system_values);
389 
390    NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
391 
392    NIR_PASS(_, nir, nir_normalize_cubemap_coords);
393 
394    NIR_PASS(_, nir, nir_lower_global_vars_to_local);
395 
396    NIR_PASS(_, nir, nir_split_var_copies);
397    NIR_PASS(_, nir, nir_split_struct_vars, nir_var_function_temp);
398 
399    nir_optimize(nir, true);
400 
401    NIR_PASS(_, nir, nir_lower_explicit_io,
402             nir_var_mem_push_const,
403             nir_address_format_32bit_offset);
404 
405    NIR_PASS(_, nir, nir_lower_explicit_io,
406             nir_var_mem_ubo | nir_var_mem_ssbo,
407             nir_address_format_32bit_index_offset);
408 
409    NIR_PASS(_, nir, nir_lower_explicit_io,
410             nir_var_mem_global,
411             nir_address_format_2x32bit_global);
412 
413    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
414 
415    /* Lower a bunch of stuff */
416    NIR_PASS(_, nir, nir_lower_var_copies);
417 
418    NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
419 
420    NIR_PASS(_, nir, nir_lower_indirect_derefs,
421             nir_var_function_temp, 2);
422 
423    NIR_PASS(_, nir, nir_lower_array_deref_of_vec,
424             nir_var_mem_ubo | nir_var_mem_ssbo,
425             nir_lower_direct_array_deref_of_vec_load);
426 
427    NIR_PASS(_, nir, nir_lower_frexp);
428 
429    /* Get rid of split copies */
430    nir_optimize(nir, false);
431 }
432 
433 static nir_shader *
434 shader_module_compile_to_nir(struct v3dv_device *device,
435                              struct v3dv_pipeline_stage *stage)
436 {
437    nir_shader *nir;
438    const nir_shader_compiler_options *nir_options = &v3dv_nir_options;
439 
440 
441    if (unlikely(V3D_DEBUG & V3D_DEBUG_DUMP_SPIRV) && stage->module->nir == NULL)
442       v3dv_print_spirv(stage->module->data, stage->module->size, stderr);
443 
444    /* vk_shader_module_to_nir also handles internal shaders, when module->nir
445     * != NULL. It also calls nir_validate_shader in both cases, so we don't
446     * call it again here.
447     */
448    VkResult result = vk_shader_module_to_nir(&device->vk, stage->module,
449                                              broadcom_shader_stage_to_gl(stage->stage),
450                                              stage->entrypoint,
451                                              stage->spec_info,
452                                              &default_spirv_options,
453                                              nir_options,
454                                              NULL, &nir);
455    if (result != VK_SUCCESS)
456       return NULL;
457    assert(nir->info.stage == broadcom_shader_stage_to_gl(stage->stage));
458 
459    if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERDB) && stage->module->nir == NULL) {
460       char sha1buf[41];
461       _mesa_sha1_format(sha1buf, stage->pipeline->sha1);
462       nir->info.name = ralloc_strdup(nir, sha1buf);
463    }
464 
465    if (unlikely(V3D_DEBUG & (V3D_DEBUG_NIR |
466                              v3d_debug_flag_for_shader_stage(
467                                 broadcom_shader_stage_to_gl(stage->stage))))) {
468       fprintf(stderr, "NIR after vk_shader_module_to_nir: %s prog %d NIR:\n",
469               broadcom_shader_stage_name(stage->stage),
470               stage->program_id);
471       nir_print_shader(nir, stderr);
472       fprintf(stderr, "\n");
473    }
474 
475    preprocess_nir(nir);
476 
477    return nir;
478 }
479 
480 static int
481 type_size_vec4(const struct glsl_type *type, bool bindless)
482 {
483    return glsl_count_attribute_slots(type, false);
484 }
485 
486 /* FIXME: the number of parameters for this method is somewhat big. Perhaps
487  * rethink.
488  */
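/* Adds an entry to the descriptor map: if an entry with the same
 * set/binding/array_index already exists (searching from start_index) its
 * slot is reused, bumping return_size to 32 when the requested size differs;
 * otherwise the first unused slot is claimed. Returns the slot index.
 */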
489 static unsigned
490 descriptor_map_add(struct v3dv_descriptor_map *map,
491                    int set,
492                    int binding,
493                    int array_index,
494                    int array_size,
495                    int start_index,
496                    uint8_t return_size)
497 {
498    assert(array_index < array_size);
499    assert(return_size == 16 || return_size == 32);
500 
501    unsigned index = start_index;
502    for (; index < map->num_desc; index++) {
503       if (map->used[index] &&
504           set == map->set[index] &&
505           binding == map->binding[index] &&
506           array_index == map->array_index[index]) {
507          assert(array_size == map->array_size[index]);
508          if (return_size != map->return_size[index]) {
509             /* If the return_size is different it means that the same sampler
510              * was used for operations with different precision
511              * requirements. In this case we need to ensure that we use the
512              * larger one.
513              */
514             map->return_size[index] = 32;
515          }
516          return index;
517       } else if (!map->used[index]) {
518          break;
519       }
520    }
521 
522    assert(index < DESCRIPTOR_MAP_SIZE);
523    assert(!map->used[index]);
524 
525    map->used[index] = true;
526    map->set[index] = set;
527    map->binding[index] = binding;
528    map->array_index[index] = array_index;
529    map->array_size[index] = array_size;
530    map->return_size[index] = return_size;
531    map->num_desc = MAX2(map->num_desc, index + 1);
532 
533    return index;
534 }
535 
536 struct lower_pipeline_layout_state {
537    struct v3dv_pipeline *pipeline;
538    const struct v3dv_pipeline_layout *layout;
539    bool needs_default_sampler_state;
540 };
541 
542 
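/* The driver provides push constant data to the backend through the uniform
 * stream, so lowering a push constant load just retypes it as a uniform
 * load; the offset source is left untouched.
 */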
543 static void
544 lower_load_push_constant(nir_builder *b, nir_intrinsic_instr *instr,
545                          struct lower_pipeline_layout_state *state)
546 {
547    assert(instr->intrinsic == nir_intrinsic_load_push_constant);
548    instr->intrinsic = nir_intrinsic_load_uniform;
549 }
550 
551 static struct v3dv_descriptor_map*
552 pipeline_get_descriptor_map(struct v3dv_pipeline *pipeline,
553                             VkDescriptorType desc_type,
554                             gl_shader_stage gl_stage,
555                             bool is_sampler)
556 {
557    enum broadcom_shader_stage broadcom_stage =
558       gl_shader_stage_to_broadcom(gl_stage);
559 
560    assert(pipeline->shared_data &&
561           pipeline->shared_data->maps[broadcom_stage]);
562 
563    switch(desc_type) {
564    case VK_DESCRIPTOR_TYPE_SAMPLER:
565       return &pipeline->shared_data->maps[broadcom_stage]->sampler_map;
566    case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
567    case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
568    case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
569    case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
570    case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
571       return &pipeline->shared_data->maps[broadcom_stage]->texture_map;
572    case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
573       return is_sampler ?
574          &pipeline->shared_data->maps[broadcom_stage]->sampler_map :
575          &pipeline->shared_data->maps[broadcom_stage]->texture_map;
576    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
577    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
578    case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
579       return &pipeline->shared_data->maps[broadcom_stage]->ubo_map;
580    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
581    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
582       return &pipeline->shared_data->maps[broadcom_stage]->ssbo_map;
583    default:
584       unreachable("Descriptor type unknown or not having a descriptor map");
585    }
586 }
587 
588 /* Gathers info from the intrinsic (set and binding) and then lowers it so it
589  * could be used by the v3d_compiler */
590 static void
591 lower_vulkan_resource_index(nir_builder *b,
592                             nir_intrinsic_instr *instr,
593                             struct lower_pipeline_layout_state *state)
594 {
595    assert(instr->intrinsic == nir_intrinsic_vulkan_resource_index);
596 
597    nir_const_value *const_val = nir_src_as_const_value(instr->src[0]);
598 
599    unsigned set = nir_intrinsic_desc_set(instr);
600    unsigned binding = nir_intrinsic_binding(instr);
601    struct v3dv_descriptor_set_layout *set_layout = state->layout->set[set].layout;
602    struct v3dv_descriptor_set_binding_layout *binding_layout =
603       &set_layout->binding[binding];
604    unsigned index = 0;
605 
606    switch (binding_layout->type) {
607    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
608    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
609    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
610    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
611    case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: {
612       struct v3dv_descriptor_map *descriptor_map =
613          pipeline_get_descriptor_map(state->pipeline, binding_layout->type,
614                                      b->shader->info.stage, false);
615 
616       if (!const_val)
617          unreachable("non-constant vulkan_resource_index array index");
618 
619       /* At compile-time we will need to know if we are processing a UBO load
620        * for an inline or a regular UBO so we can handle inline loads like
621        * push constants. At the NIR level, however, the inline
622        * information is gone, so we rely on the index to make this distinction.
623        * Particularly, we reserve indices 1..MAX_INLINE_UNIFORM_BUFFERS for
624        * inline buffers. This means that at the descriptor map level
625        * we store inline buffers at slots 0..MAX_INLINE_UNIFORM_BUFFERS - 1,
626        * and regular UBOs at indices starting from MAX_INLINE_UNIFORM_BUFFERS.
627        */
628       uint32_t start_index = 0;
629       if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
630           binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) {
631          start_index = MAX_INLINE_UNIFORM_BUFFERS;
632       }
633 
634       index = descriptor_map_add(descriptor_map, set, binding,
635                                  const_val->u32,
636                                  binding_layout->array_size,
637                                  start_index,
638                                  32 /* return_size: doesn't really apply for this case */);
639 
640       /* We always reserve index 0 for push constants */
641       if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
642           binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
643           binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
644          index++;
645       }
646 
647       break;
648    }
649 
650    default:
651       unreachable("unsupported descriptor type for vulkan_resource_index");
652       break;
653    }
654 
655    /* Since we use the deref pass, both vulkan_resource_index and
656     * vulkan_load_descriptor return a vec2 providing an index and
657     * offset. Our backend compiler only cares about the index part.
658     */
659    nir_ssa_def_rewrite_uses(&instr->dest.ssa,
660                             nir_imm_ivec2(b, index, 0));
661    nir_instr_remove(&instr->instr);
662 }
663 
664 /* Returns the return_size so it can be used in the case where there is no
665  * sampler object.
666  */
667 static uint8_t
668 lower_tex_src_to_offset(nir_builder *b,
669                         nir_tex_instr *instr,
670                         unsigned src_idx,
671                         struct lower_pipeline_layout_state *state)
672 {
673    nir_ssa_def *index = NULL;
674    unsigned base_index = 0;
675    unsigned array_elements = 1;
676    nir_tex_src *src = &instr->src[src_idx];
677    bool is_sampler = src->src_type == nir_tex_src_sampler_deref;
678 
679    /* First we compute the offsets */
680    nir_deref_instr *deref = nir_instr_as_deref(src->src.ssa->parent_instr);
681    while (deref->deref_type != nir_deref_type_var) {
682       assert(deref->parent.is_ssa);
683       nir_deref_instr *parent =
684          nir_instr_as_deref(deref->parent.ssa->parent_instr);
685 
686       assert(deref->deref_type == nir_deref_type_array);
687 
688       if (nir_src_is_const(deref->arr.index) && index == NULL) {
689          /* We're still building a direct index */
690          base_index += nir_src_as_uint(deref->arr.index) * array_elements;
691       } else {
692          if (index == NULL) {
693             /* We used to be direct but not anymore */
694             index = nir_imm_int(b, base_index);
695             base_index = 0;
696          }
697 
698          index = nir_iadd(b, index,
699                           nir_imul(b, nir_imm_int(b, array_elements),
700                                    nir_ssa_for_src(b, deref->arr.index, 1)));
701       }
702 
703       array_elements *= glsl_get_length(parent->type);
704 
705       deref = parent;
706    }
707 
708    if (index)
709       index = nir_umin(b, index, nir_imm_int(b, array_elements - 1));
710 
711    /* We have the offsets, we apply them, rewriting the source or removing
712     * instr if needed
713     */
714    if (index) {
715       nir_instr_rewrite_src(&instr->instr, &src->src,
716                             nir_src_for_ssa(index));
717 
718       src->src_type = is_sampler ?
719          nir_tex_src_sampler_offset :
720          nir_tex_src_texture_offset;
721    } else {
722       nir_tex_instr_remove_src(instr, src_idx);
723    }
724 
725    uint32_t set = deref->var->data.descriptor_set;
726    uint32_t binding = deref->var->data.binding;
727    /* FIXME: this is a really simplified check for the precision to be used
728     * for the sampling. Right now we are only checking the variables used
729     * in the operation itself, but there are other cases that we could use to
730     * infer the precision requirement.
731     */
732    bool relaxed_precision = deref->var->data.precision == GLSL_PRECISION_MEDIUM ||
733                             deref->var->data.precision == GLSL_PRECISION_LOW;
734    struct v3dv_descriptor_set_layout *set_layout = state->layout->set[set].layout;
735    struct v3dv_descriptor_set_binding_layout *binding_layout =
736       &set_layout->binding[binding];
737 
738    /* For input attachments, the shader includes the attachment_idx. As we are
739     * treating them as a texture, we only want the base_index
740     */
741    uint32_t array_index = binding_layout->type != VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT ?
742       deref->var->data.index + base_index :
743       base_index;
744 
745    uint8_t return_size;
746    if (unlikely(V3D_DEBUG & V3D_DEBUG_TMU_16BIT))
747       return_size = 16;
748    else  if (unlikely(V3D_DEBUG & V3D_DEBUG_TMU_32BIT))
749       return_size = 32;
750    else
751       return_size = relaxed_precision || instr->is_shadow ? 16 : 32;
752 
753    struct v3dv_descriptor_map *map =
754       pipeline_get_descriptor_map(state->pipeline, binding_layout->type,
755                                   b->shader->info.stage, is_sampler);
756    int desc_index =
757       descriptor_map_add(map,
758                          deref->var->data.descriptor_set,
759                          deref->var->data.binding,
760                          array_index,
761                          binding_layout->array_size,
762                          0,
763                          return_size);
764 
765    if (is_sampler)
766       instr->sampler_index = desc_index;
767    else
768       instr->texture_index = desc_index;
769 
770    return return_size;
771 }
772 
773 static bool
774 lower_sampler(nir_builder *b,
775               nir_tex_instr *instr,
776               struct lower_pipeline_layout_state *state)
777 {
778    uint8_t return_size = 0;
779 
780    int texture_idx =
781       nir_tex_instr_src_index(instr, nir_tex_src_texture_deref);
782 
783    if (texture_idx >= 0)
784       return_size = lower_tex_src_to_offset(b, instr, texture_idx, state);
785 
786    int sampler_idx =
787       nir_tex_instr_src_index(instr, nir_tex_src_sampler_deref);
788 
789    if (sampler_idx >= 0)
790       lower_tex_src_to_offset(b, instr, sampler_idx, state);
791 
792    if (texture_idx < 0 && sampler_idx < 0)
793       return false;
794 
795    /* If we don't have a sampler, we assign it the idx we reserve for this
796     * case, and we ensure that it is using the correct return size.
797     */
798    if (sampler_idx < 0) {
799       state->needs_default_sampler_state = true;
800       instr->sampler_index = return_size == 16 ?
801          V3DV_NO_SAMPLER_16BIT_IDX : V3DV_NO_SAMPLER_32BIT_IDX;
802    }
803 
804    return true;
805 }
806 
807 /* FIXME: really similar to lower_tex_src_to_offset, perhaps refactor? */
808 static void
809 lower_image_deref(nir_builder *b,
810                   nir_intrinsic_instr *instr,
811                   struct lower_pipeline_layout_state *state)
812 {
813    nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
814    nir_ssa_def *index = NULL;
815    unsigned array_elements = 1;
816    unsigned base_index = 0;
817 
818    while (deref->deref_type != nir_deref_type_var) {
819       assert(deref->parent.is_ssa);
820       nir_deref_instr *parent =
821          nir_instr_as_deref(deref->parent.ssa->parent_instr);
822 
823       assert(deref->deref_type == nir_deref_type_array);
824 
825       if (nir_src_is_const(deref->arr.index) && index == NULL) {
826          /* We're still building a direct index */
827          base_index += nir_src_as_uint(deref->arr.index) * array_elements;
828       } else {
829          if (index == NULL) {
830             /* We used to be direct but not anymore */
831             index = nir_imm_int(b, base_index);
832             base_index = 0;
833          }
834 
835          index = nir_iadd(b, index,
836                           nir_imul(b, nir_imm_int(b, array_elements),
837                                    nir_ssa_for_src(b, deref->arr.index, 1)));
838       }
839 
840       array_elements *= glsl_get_length(parent->type);
841 
842       deref = parent;
843    }
844 
845    if (index)
846       index = nir_umin(b, index, nir_imm_int(b, array_elements - 1));
847 
848    uint32_t set = deref->var->data.descriptor_set;
849    uint32_t binding = deref->var->data.binding;
850    struct v3dv_descriptor_set_layout *set_layout = state->layout->set[set].layout;
851    struct v3dv_descriptor_set_binding_layout *binding_layout =
852       &set_layout->binding[binding];
853 
854    uint32_t array_index = deref->var->data.index + base_index;
855 
856    assert(binding_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE ||
857           binding_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER);
858 
859    struct v3dv_descriptor_map *map =
860       pipeline_get_descriptor_map(state->pipeline, binding_layout->type,
861                                   b->shader->info.stage, false);
862 
863    int desc_index =
864       descriptor_map_add(map,
865                          deref->var->data.descriptor_set,
866                          deref->var->data.binding,
867                          array_index,
868                          binding_layout->array_size,
869                          0,
870                          32 /* return_size: doesn't apply for textures */);
871 
872    /* Note: we don't need to do anything here in relation to the precision and
873     * the output size because for images we can infer that info from the image
874     * intrinsic, that includes the image format (see
875     * NIR_INTRINSIC_FORMAT). That is done by the v3d compiler.
876     */
877 
878    index = nir_imm_int(b, desc_index);
879 
880    nir_rewrite_image_intrinsic(instr, index, false);
881 }
882 
883 static bool
884 lower_intrinsic(nir_builder *b,
885                 nir_intrinsic_instr *instr,
886                 struct lower_pipeline_layout_state *state)
887 {
888    switch (instr->intrinsic) {
889    case nir_intrinsic_load_push_constant:
890       lower_load_push_constant(b, instr, state);
891       return true;
892 
893    case nir_intrinsic_vulkan_resource_index:
894       lower_vulkan_resource_index(b, instr, state);
895       return true;
896 
897    case nir_intrinsic_load_vulkan_descriptor: {
898       /* Loading the descriptor happens as part of load/store instructions,
899        * so for us this is a no-op.
900        */
901       nir_ssa_def_rewrite_uses(&instr->dest.ssa, instr->src[0].ssa);
902       nir_instr_remove(&instr->instr);
903       return true;
904    }
905 
906    case nir_intrinsic_image_deref_load:
907    case nir_intrinsic_image_deref_store:
908    case nir_intrinsic_image_deref_atomic_add:
909    case nir_intrinsic_image_deref_atomic_imin:
910    case nir_intrinsic_image_deref_atomic_umin:
911    case nir_intrinsic_image_deref_atomic_imax:
912    case nir_intrinsic_image_deref_atomic_umax:
913    case nir_intrinsic_image_deref_atomic_and:
914    case nir_intrinsic_image_deref_atomic_or:
915    case nir_intrinsic_image_deref_atomic_xor:
916    case nir_intrinsic_image_deref_atomic_exchange:
917    case nir_intrinsic_image_deref_atomic_comp_swap:
918    case nir_intrinsic_image_deref_size:
919    case nir_intrinsic_image_deref_samples:
920       lower_image_deref(b, instr, state);
921       return true;
922 
923    default:
924       return false;
925    }
926 }
927 
928 static bool
929 lower_pipeline_layout_cb(nir_builder *b,
930                          nir_instr *instr,
931                          void *_state)
932 {
933    bool progress = false;
934    struct lower_pipeline_layout_state *state = _state;
935 
936    b->cursor = nir_before_instr(instr);
937    switch (instr->type) {
938    case nir_instr_type_tex:
939       progress |= lower_sampler(b, nir_instr_as_tex(instr), state);
940       break;
941    case nir_instr_type_intrinsic:
942       progress |= lower_intrinsic(b, nir_instr_as_intrinsic(instr), state);
943       break;
944    default:
945       break;
946    }
947 
948    return progress;
949 }
950 
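/* Walks every instruction in the shader, filling the pipeline's descriptor
 * maps as a side effect and rewriting descriptor, texture and image accesses
 * into the flat indices the v3d backend compiler expects.
 */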
951 static bool
952 lower_pipeline_layout_info(nir_shader *shader,
953                            struct v3dv_pipeline *pipeline,
954                            const struct v3dv_pipeline_layout *layout,
955                            bool *needs_default_sampler_state)
956 {
957    bool progress = false;
958 
959    struct lower_pipeline_layout_state state = {
960       .pipeline = pipeline,
961       .layout = layout,
962       .needs_default_sampler_state = false,
963    };
964 
965    progress = nir_shader_instructions_pass(shader, lower_pipeline_layout_cb,
966                                            nir_metadata_block_index |
967                                            nir_metadata_dominance,
968                                            &state);
969 
970    *needs_default_sampler_state = state.needs_default_sampler_state;
971 
972    return progress;
973 }
974 
975 
976 static void
977 lower_fs_io(nir_shader *nir)
978 {
979    /* Our backend doesn't handle array fragment shader outputs */
980    NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false);
981    NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_shader_out, NULL);
982 
983    nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs,
984                                MESA_SHADER_FRAGMENT);
985 
986    nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs,
987                                MESA_SHADER_FRAGMENT);
988 
989    NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
990             type_size_vec4, 0);
991 }
992 
993 static void
994 lower_gs_io(struct nir_shader *nir)
995 {
996    NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false);
997 
998    nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs,
999                                MESA_SHADER_GEOMETRY);
1000 
1001    nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs,
1002                                MESA_SHADER_GEOMETRY);
1003 }
1004 
1005 static void
1006 lower_vs_io(struct nir_shader *nir)
1007 {
1008    NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false);
1009 
1010    nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs,
1011                                MESA_SHADER_VERTEX);
1012 
1013    nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs,
1014                                MESA_SHADER_VERTEX);
1015 
1016    /* FIXME: if we call nir_lower_io, we get a crash later. Likely because it
1017     * overlaps with v3d_nir_lower_io. Need further research though.
1018     */
1019 }
1020 
1021 static void
1022 shader_debug_output(const char *message, void *data)
1023 {
1024    /* FIXME: We probably don't want to debug anything extra here, and in fact
1025     * the compiler doesn't use this callback much, only as an alternative
1026     * way to print the shaderdb stats, which you can already get using
1027     * V3D_DEBUG=shaderdb. Perhaps it would make sense to revisit the v3d
1028     * compiler to remove that callback.
1029     */
1030 }
1031 
1032 static void
1033 pipeline_populate_v3d_key(struct v3d_key *key,
1034                           const struct v3dv_pipeline_stage *p_stage,
1035                           uint32_t ucp_enables,
1036                           bool robust_buffer_access)
1037 {
1038    assert(p_stage->pipeline->shared_data &&
1039           p_stage->pipeline->shared_data->maps[p_stage->stage]);
1040 
1041    /* The following values are defaults used at pipeline creation time. We
1042     * use 32 bit as the default return size.
1043     */
1044    struct v3dv_descriptor_map *sampler_map =
1045       &p_stage->pipeline->shared_data->maps[p_stage->stage]->sampler_map;
1046    struct v3dv_descriptor_map *texture_map =
1047       &p_stage->pipeline->shared_data->maps[p_stage->stage]->texture_map;
1048 
1049    key->num_tex_used = texture_map->num_desc;
1050    assert(key->num_tex_used <= V3D_MAX_TEXTURE_SAMPLERS);
1051    for (uint32_t tex_idx = 0; tex_idx < texture_map->num_desc; tex_idx++) {
1052       key->tex[tex_idx].swizzle[0] = PIPE_SWIZZLE_X;
1053       key->tex[tex_idx].swizzle[1] = PIPE_SWIZZLE_Y;
1054       key->tex[tex_idx].swizzle[2] = PIPE_SWIZZLE_Z;
1055       key->tex[tex_idx].swizzle[3] = PIPE_SWIZZLE_W;
1056    }
1057 
1058    key->num_samplers_used = sampler_map->num_desc;
1059    assert(key->num_samplers_used <= V3D_MAX_TEXTURE_SAMPLERS);
1060    for (uint32_t sampler_idx = 0; sampler_idx < sampler_map->num_desc;
1061         sampler_idx++) {
1062       key->sampler[sampler_idx].return_size =
1063          sampler_map->return_size[sampler_idx];
1064 
1065       key->sampler[sampler_idx].return_channels =
1066          key->sampler[sampler_idx].return_size == 32 ? 4 : 2;
1067    }
1068 
1069    switch (p_stage->stage) {
1070    case BROADCOM_SHADER_VERTEX:
1071    case BROADCOM_SHADER_VERTEX_BIN:
1072       key->is_last_geometry_stage = p_stage->pipeline->gs == NULL;
1073       break;
1074    case BROADCOM_SHADER_GEOMETRY:
1075    case BROADCOM_SHADER_GEOMETRY_BIN:
1076       /* FIXME: this holds only while we don't implement tessellation shaders */
1077       key->is_last_geometry_stage = true;
1078       break;
1079    case BROADCOM_SHADER_FRAGMENT:
1080    case BROADCOM_SHADER_COMPUTE:
1081       key->is_last_geometry_stage = false;
1082       break;
1083    default:
1084       unreachable("unsupported shader stage");
1085    }
1086 
1087    /* Vulkan doesn't have fixed function state for user clip planes. Instead,
1088     * shaders can write to gl_ClipDistance[], in which case the SPIR-V compiler
1089     * takes care of adding a single compact array variable at
1090     * VARYING_SLOT_CLIP_DIST0, so we don't need any user clip plane lowering.
1091     *
1092     * The only lowering we are interested in is specific to the fragment shader,
1093     * where we want to emit discards to honor writes to gl_ClipDistance[] in
1094     * previous stages. This is done via nir_lower_clip_fs() so we only set up
1095     * the ucp enable mask for that stage.
1096     */
1097    key->ucp_enables = ucp_enables;
1098 
1099    key->robust_buffer_access = robust_buffer_access;
1100 
1101    key->environment = V3D_ENVIRONMENT_VULKAN;
1102 }
1103 
1104 /* FIXME: anv maps directly to the hw primitive type and perhaps eventually
1105  * we should do the same. For now we use prim_mode, which v3d already uses.
1106  */
1107 static const enum pipe_prim_type vk_to_pipe_prim_type[] = {
1108    [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = PIPE_PRIM_POINTS,
1109    [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = PIPE_PRIM_LINES,
1110    [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = PIPE_PRIM_LINE_STRIP,
1111    [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = PIPE_PRIM_TRIANGLES,
1112    [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = PIPE_PRIM_TRIANGLE_STRIP,
1113    [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = PIPE_PRIM_TRIANGLE_FAN,
1114    [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = PIPE_PRIM_LINES_ADJACENCY,
1115    [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = PIPE_PRIM_LINE_STRIP_ADJACENCY,
1116    [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = PIPE_PRIM_TRIANGLES_ADJACENCY,
1117    [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY,
1118 };
1119 
1120 static const enum pipe_logicop vk_to_pipe_logicop[] = {
1121    [VK_LOGIC_OP_CLEAR] = PIPE_LOGICOP_CLEAR,
1122    [VK_LOGIC_OP_AND] = PIPE_LOGICOP_AND,
1123    [VK_LOGIC_OP_AND_REVERSE] = PIPE_LOGICOP_AND_REVERSE,
1124    [VK_LOGIC_OP_COPY] = PIPE_LOGICOP_COPY,
1125    [VK_LOGIC_OP_AND_INVERTED] = PIPE_LOGICOP_AND_INVERTED,
1126    [VK_LOGIC_OP_NO_OP] = PIPE_LOGICOP_NOOP,
1127    [VK_LOGIC_OP_XOR] = PIPE_LOGICOP_XOR,
1128    [VK_LOGIC_OP_OR] = PIPE_LOGICOP_OR,
1129    [VK_LOGIC_OP_NOR] = PIPE_LOGICOP_NOR,
1130    [VK_LOGIC_OP_EQUIVALENT] = PIPE_LOGICOP_EQUIV,
1131    [VK_LOGIC_OP_INVERT] = PIPE_LOGICOP_INVERT,
1132    [VK_LOGIC_OP_OR_REVERSE] = PIPE_LOGICOP_OR_REVERSE,
1133    [VK_LOGIC_OP_COPY_INVERTED] = PIPE_LOGICOP_COPY_INVERTED,
1134    [VK_LOGIC_OP_OR_INVERTED] = PIPE_LOGICOP_OR_INVERTED,
1135    [VK_LOGIC_OP_NAND] = PIPE_LOGICOP_NAND,
1136    [VK_LOGIC_OP_SET] = PIPE_LOGICOP_SET,
1137 };
1138 
1139 static void
1140 pipeline_populate_v3d_fs_key(struct v3d_fs_key *key,
1141                              const VkGraphicsPipelineCreateInfo *pCreateInfo,
1142                              const struct v3dv_pipeline_stage *p_stage,
1143                              bool has_geometry_shader,
1144                              uint32_t ucp_enables)
1145 {
1146    assert(p_stage->stage == BROADCOM_SHADER_FRAGMENT);
1147 
1148    memset(key, 0, sizeof(*key));
1149 
1150    const bool rba = p_stage->pipeline->device->features.robustBufferAccess;
1151    pipeline_populate_v3d_key(&key->base, p_stage, ucp_enables, rba);
1152 
1153    const VkPipelineInputAssemblyStateCreateInfo *ia_info =
1154       pCreateInfo->pInputAssemblyState;
1155    uint8_t topology = vk_to_pipe_prim_type[ia_info->topology];
1156 
1157    key->is_points = (topology == PIPE_PRIM_POINTS);
1158    key->is_lines = (topology >= PIPE_PRIM_LINES &&
1159                     topology <= PIPE_PRIM_LINE_STRIP);
1160    key->has_gs = has_geometry_shader;
1161 
1162    const VkPipelineColorBlendStateCreateInfo *cb_info =
1163       !pCreateInfo->pRasterizationState->rasterizerDiscardEnable ?
1164       pCreateInfo->pColorBlendState : NULL;
1165 
1166    key->logicop_func = cb_info && cb_info->logicOpEnable == VK_TRUE ?
1167                        vk_to_pipe_logicop[cb_info->logicOp] :
1168                        PIPE_LOGICOP_COPY;
1169 
1170    const bool raster_enabled =
1171       !pCreateInfo->pRasterizationState->rasterizerDiscardEnable;
1172 
1173    /* Multisample rasterization state must be ignored if rasterization
1174     * is disabled.
1175     */
1176    const VkPipelineMultisampleStateCreateInfo *ms_info =
1177       raster_enabled ? pCreateInfo->pMultisampleState : NULL;
1178    if (ms_info) {
1179       assert(ms_info->rasterizationSamples == VK_SAMPLE_COUNT_1_BIT ||
1180              ms_info->rasterizationSamples == VK_SAMPLE_COUNT_4_BIT);
1181       key->msaa = ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT;
1182 
1183       if (key->msaa) {
1184          key->sample_coverage =
1185             p_stage->pipeline->sample_mask != (1 << V3D_MAX_SAMPLES) - 1;
1186          key->sample_alpha_to_coverage = ms_info->alphaToCoverageEnable;
1187          key->sample_alpha_to_one = ms_info->alphaToOneEnable;
1188       }
1189    }
1190 
1191    /* This is intended for V3D versions before 4.1, otherwise we just use the
1192     * tile buffer load/store swap R/B bit.
1193     */
1194    key->swap_color_rb = 0;
1195 
1196    const struct v3dv_render_pass *pass =
1197       v3dv_render_pass_from_handle(pCreateInfo->renderPass);
1198    const struct v3dv_subpass *subpass = p_stage->pipeline->subpass;
1199    for (uint32_t i = 0; i < subpass->color_count; i++) {
1200       const uint32_t att_idx = subpass->color_attachments[i].attachment;
1201       if (att_idx == VK_ATTACHMENT_UNUSED)
1202          continue;
1203 
1204       key->cbufs |= 1 << i;
1205 
1206       VkFormat fb_format = pass->attachments[att_idx].desc.format;
1207       enum pipe_format fb_pipe_format = vk_format_to_pipe_format(fb_format);
1208 
1209       /* If logic operations are enabled then we might emit color reads and we
1210        * need to know the color buffer format and swizzle for that
1211        */
1212       if (key->logicop_func != PIPE_LOGICOP_COPY) {
1213          key->color_fmt[i].format = fb_pipe_format;
1214          memcpy(key->color_fmt[i].swizzle,
1215                 v3dv_get_format_swizzle(p_stage->pipeline->device, fb_format),
1216                 sizeof(key->color_fmt[i].swizzle));
1217       }
1218 
1219       const struct util_format_description *desc =
1220          vk_format_description(fb_format);
1221 
1222       if (desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
1223           desc->channel[0].size == 32) {
1224          key->f32_color_rb |= 1 << i;
1225       }
1226 
1227       if (p_stage->nir->info.fs.untyped_color_outputs) {
1228          if (util_format_is_pure_uint(fb_pipe_format))
1229             key->uint_color_rb |= 1 << i;
1230          else if (util_format_is_pure_sint(fb_pipe_format))
1231             key->int_color_rb |= 1 << i;
1232       }
1233 
1234       if (key->is_points) {
1235          /* This mask represents state for GL_ARB_point_sprite which is not
1236           * relevant to Vulkan.
1237           */
1238          key->point_sprite_mask = 0;
1239 
1240          /* Vulkan mandates upper left. */
1241          key->point_coord_upper_left = true;
1242       }
1243    }
1244 }
1245 
1246 static void
1247 setup_stage_outputs_from_next_stage_inputs(
1248    uint8_t next_stage_num_inputs,
1249    struct v3d_varying_slot *next_stage_input_slots,
1250    uint8_t *num_used_outputs,
1251    struct v3d_varying_slot *used_output_slots,
1252    uint32_t size_of_used_output_slots)
1253 {
1254    *num_used_outputs = next_stage_num_inputs;
1255    memcpy(used_output_slots, next_stage_input_slots, size_of_used_output_slots);
1256 }
1257 
1258 static void
1259 pipeline_populate_v3d_gs_key(struct v3d_gs_key *key,
1260                              const VkGraphicsPipelineCreateInfo *pCreateInfo,
1261                              const struct v3dv_pipeline_stage *p_stage)
1262 {
1263    assert(p_stage->stage == BROADCOM_SHADER_GEOMETRY ||
1264           p_stage->stage == BROADCOM_SHADER_GEOMETRY_BIN);
1265 
1266    memset(key, 0, sizeof(*key));
1267 
1268    const bool rba = p_stage->pipeline->device->features.robustBufferAccess;
1269    pipeline_populate_v3d_key(&key->base, p_stage, 0, rba);
1270 
1271    struct v3dv_pipeline *pipeline = p_stage->pipeline;
1272 
1273    key->per_vertex_point_size =
1274       p_stage->nir->info.outputs_written & (1ull << VARYING_SLOT_PSIZ);
1275 
1276    key->is_coord = broadcom_shader_stage_is_binning(p_stage->stage);
1277 
1278    assert(key->base.is_last_geometry_stage);
1279    if (key->is_coord) {
1280       /* Output varyings in the last binning shader are only used for transform
1281        * feedback. Set to 0 as VK_EXT_transform_feedback is not supported.
1282        */
1283       key->num_used_outputs = 0;
1284    } else {
1285       struct v3dv_shader_variant *fs_variant =
1286          pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
1287 
1288       STATIC_ASSERT(sizeof(key->used_outputs) ==
1289                     sizeof(fs_variant->prog_data.fs->input_slots));
1290 
1291       setup_stage_outputs_from_next_stage_inputs(
1292          fs_variant->prog_data.fs->num_inputs,
1293          fs_variant->prog_data.fs->input_slots,
1294          &key->num_used_outputs,
1295          key->used_outputs,
1296          sizeof(key->used_outputs));
1297    }
1298 }
1299 
1300 static void
1301 pipeline_populate_v3d_vs_key(struct v3d_vs_key *key,
1302                              const VkGraphicsPipelineCreateInfo *pCreateInfo,
1303                              const struct v3dv_pipeline_stage *p_stage)
1304 {
1305    assert(p_stage->stage == BROADCOM_SHADER_VERTEX ||
1306           p_stage->stage == BROADCOM_SHADER_VERTEX_BIN);
1307 
1308    memset(key, 0, sizeof(*key));
1309 
1310    const bool rba = p_stage->pipeline->device->features.robustBufferAccess;
1311    pipeline_populate_v3d_key(&key->base, p_stage, 0, rba);
1312 
1313    struct v3dv_pipeline *pipeline = p_stage->pipeline;
1314 
1315    /* Vulkan specifies a point size per vertex, so this is true if the
1316     * primitives are points, like on ES2.
1317     */
1318    const VkPipelineInputAssemblyStateCreateInfo *ia_info =
1319       pCreateInfo->pInputAssemblyState;
1320    uint8_t topology = vk_to_pipe_prim_type[ia_info->topology];
1321 
1322    /* FIXME: PRIM_POINTS is not enough, in gallium the full check is
1323     * PIPE_PRIM_POINTS && v3d->rasterizer->base.point_size_per_vertex */
1324    key->per_vertex_point_size = (topology == PIPE_PRIM_POINTS);
1325 
1326    key->is_coord = broadcom_shader_stage_is_binning(p_stage->stage);
1327 
1328    if (key->is_coord) { /* Binning VS*/
1329       if (key->base.is_last_geometry_stage) {
1330          /* Output varyings in the last binning shader are only used for
1331           * transform feedback. Set to 0 as VK_EXT_transform_feedback is not
1332           * supported.
1333           */
1334          key->num_used_outputs = 0;
1335       } else {
1336          /* Linking against GS binning program */
1337          assert(pipeline->gs);
1338          struct v3dv_shader_variant *gs_bin_variant =
1339             pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
1340 
1341          STATIC_ASSERT(sizeof(key->used_outputs) ==
1342                        sizeof(gs_bin_variant->prog_data.gs->input_slots));
1343 
1344          setup_stage_outputs_from_next_stage_inputs(
1345             gs_bin_variant->prog_data.gs->num_inputs,
1346             gs_bin_variant->prog_data.gs->input_slots,
1347             &key->num_used_outputs,
1348             key->used_outputs,
1349             sizeof(key->used_outputs));
1350       }
1351    } else { /* Render VS */
1352       if (pipeline->gs) {
1353          /* Linking against GS render program */
1354          struct v3dv_shader_variant *gs_variant =
1355             pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY];
1356 
1357          STATIC_ASSERT(sizeof(key->used_outputs) ==
1358                        sizeof(gs_variant->prog_data.gs->input_slots));
1359 
1360          setup_stage_outputs_from_next_stage_inputs(
1361             gs_variant->prog_data.gs->num_inputs,
1362             gs_variant->prog_data.gs->input_slots,
1363             &key->num_used_outputs,
1364             key->used_outputs,
1365             sizeof(key->used_outputs));
1366       } else {
1367          /* Linking against FS program */
1368          struct v3dv_shader_variant *fs_variant =
1369             pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
1370 
1371          STATIC_ASSERT(sizeof(key->used_outputs) ==
1372                        sizeof(fs_variant->prog_data.fs->input_slots));
1373 
1374          setup_stage_outputs_from_next_stage_inputs(
1375             fs_variant->prog_data.fs->num_inputs,
1376             fs_variant->prog_data.fs->input_slots,
1377             &key->num_used_outputs,
1378             key->used_outputs,
1379             sizeof(key->used_outputs));
1380       }
1381    }
1382 
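   /* Mark BGRA vertex attributes so the compiled code swaps the R/B channels
    * at attribute fetch time. For example, a VK_FORMAT_B8G8R8A8_UNORM
    * attribute at location 2 sets bit (VERT_ATTRIB_GENERIC0 + 2) in
    * va_swap_rb_mask.
    */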
1383    const VkPipelineVertexInputStateCreateInfo *vi_info =
1384       pCreateInfo->pVertexInputState;
1385    for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) {
1386       const VkVertexInputAttributeDescription *desc =
1387          &vi_info->pVertexAttributeDescriptions[i];
1388       assert(desc->location < MAX_VERTEX_ATTRIBS);
1389       if (desc->format == VK_FORMAT_B8G8R8A8_UNORM)
1390          key->va_swap_rb_mask |= 1 << (VERT_ATTRIB_GENERIC0 + desc->location);
1391    }
1392 }
1393 
1394 /**
1395  * Creates the initial form of the pipeline stage for a binning shader by
1396  * cloning the render shader and flagging it as a coordinate shader.
1397  *
1398  * Returns NULL if it was not able to allocate the object, so it should be
1399  * handled as a VK_ERROR_OUT_OF_HOST_MEMORY error.
1400  */
1401 static struct v3dv_pipeline_stage *
1402 pipeline_stage_create_binning(const struct v3dv_pipeline_stage *src,
1403                               const VkAllocationCallbacks *pAllocator)
1404 {
1405    struct v3dv_device *device = src->pipeline->device;
1406 
1407    struct v3dv_pipeline_stage *p_stage =
1408       vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*p_stage), 8,
1409                  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1410 
1411    if (p_stage == NULL)
1412       return NULL;
1413 
1414    assert(src->stage == BROADCOM_SHADER_VERTEX ||
1415           src->stage == BROADCOM_SHADER_GEOMETRY);
1416 
1417    enum broadcom_shader_stage bin_stage =
1418       src->stage == BROADCOM_SHADER_VERTEX ?
1419          BROADCOM_SHADER_VERTEX_BIN :
1420          BROADCOM_SHADER_GEOMETRY_BIN;
1421 
1422    p_stage->pipeline = src->pipeline;
1423    p_stage->stage = bin_stage;
1424    p_stage->entrypoint = src->entrypoint;
1425    p_stage->module = src->module;
1426    /* For binning shaders we will clone the NIR code from the corresponding
1427     * render shader later, when we call pipeline_compile_xxx_shader. This way
1428     * we only have to run the relevant NIR lowerings once for render shaders.
1429     */
1430    p_stage->nir = NULL;
1431    p_stage->spec_info = src->spec_info;
1432    p_stage->feedback = (VkPipelineCreationFeedback) { 0 };
1433    memcpy(p_stage->shader_sha1, src->shader_sha1, 20);
1434 
1435    return p_stage;
1436 }
1437 
1438 /**
1439  * Returns false if it was not able to allocate or map the assembly bo memory.
1440  */
1441 static bool
1442 upload_assembly(struct v3dv_pipeline *pipeline)
1443 {
1444    uint32_t total_size = 0;
1445    for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
1446       struct v3dv_shader_variant *variant =
1447          pipeline->shared_data->variants[stage];
1448 
1449       if (variant != NULL)
1450          total_size += variant->qpu_insts_size;
1451    }
1452 
1453    struct v3dv_bo *bo = v3dv_bo_alloc(pipeline->device, total_size,
1454                                       "pipeline shader assembly", true);
1455    if (!bo) {
1456       fprintf(stderr, "failed to allocate memory for shader\n");
1457       return false;
1458    }
1459 
1460    bool ok = v3dv_bo_map(pipeline->device, bo, total_size);
1461    if (!ok) {
1462       fprintf(stderr, "failed to map source shader buffer\n");
1463       return false;
1464    }
1465 
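   /* Pack each variant's QPU code back to back in the shared BO and record
    * each variant's offset into it.
    */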
1466    uint32_t offset = 0;
1467    for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
1468       struct v3dv_shader_variant *variant =
1469          pipeline->shared_data->variants[stage];
1470 
1471       if (variant != NULL) {
1472          variant->assembly_offset = offset;
1473 
1474          memcpy(bo->map + offset, variant->qpu_insts, variant->qpu_insts_size);
1475          offset += variant->qpu_insts_size;
1476 
1477          /* We don't need qpu_insts anymore. */
1478          free(variant->qpu_insts);
1479          variant->qpu_insts = NULL;
1480       }
1481    }
1482    assert(total_size == offset);
1483 
1484    pipeline->shared_data->assembly_bo = bo;
1485 
1486    return true;
1487 }
1488 
1489 static void
1490 pipeline_hash_graphics(const struct v3dv_pipeline *pipeline,
1491                        struct v3dv_pipeline_key *key,
1492                        unsigned char *sha1_out)
1493 {
1494    struct mesa_sha1 ctx;
1495    _mesa_sha1_init(&ctx);
1496 
1497    if (pipeline->layout) {
1498       _mesa_sha1_update(&ctx, &pipeline->layout->sha1,
1499                         sizeof(pipeline->layout->sha1));
1500    }
1501 
1502    /* We need to include all shader stages in the sha1 key as linking may modify
1503     * the shader code in any stage. An alternative would be to use the
1504     * serialized NIR, but that seems like overkill.
1505     */
1506    _mesa_sha1_update(&ctx, pipeline->vs->shader_sha1,
1507                      sizeof(pipeline->vs->shader_sha1));
1508 
1509    if (pipeline->gs) {
1510       _mesa_sha1_update(&ctx, pipeline->gs->shader_sha1,
1511                         sizeof(pipeline->gs->shader_sha1));
1512    }
1513 
1514    _mesa_sha1_update(&ctx, pipeline->fs->shader_sha1,
1515                      sizeof(pipeline->fs->shader_sha1));
1516 
1517    _mesa_sha1_update(&ctx, key, sizeof(struct v3dv_pipeline_key));
1518 
1519    _mesa_sha1_final(&ctx, sha1_out);
1520 }
1521 
1522 static void
1523 pipeline_hash_compute(const struct v3dv_pipeline *pipeline,
1524                       struct v3dv_pipeline_key *key,
1525                       unsigned char *sha1_out)
1526 {
1527    struct mesa_sha1 ctx;
1528    _mesa_sha1_init(&ctx);
1529 
1530    if (pipeline->layout) {
1531       _mesa_sha1_update(&ctx, &pipeline->layout->sha1,
1532                         sizeof(pipeline->layout->sha1));
1533    }
1534 
1535    _mesa_sha1_update(&ctx, pipeline->cs->shader_sha1,
1536                      sizeof(pipeline->cs->shader_sha1));
1537 
1538    _mesa_sha1_update(&ctx, key, sizeof(struct v3dv_pipeline_key));
1539 
1540    _mesa_sha1_final(&ctx, sha1_out);
1541 }
1542 
1543 /* Checks that the pipeline has enough spill size to use for any of its
1544  * variants.
1545  */
1546 static void
1547 pipeline_check_spill_size(struct v3dv_pipeline *pipeline)
1548 {
1549    uint32_t max_spill_size = 0;
1550 
1551    for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
1552       struct v3dv_shader_variant *variant =
1553          pipeline->shared_data->variants[stage];
1554 
1555       if (variant != NULL) {
1556          max_spill_size = MAX2(variant->prog_data.base->spill_size,
1557                                max_spill_size);
1558       }
1559    }
1560 
1561    if (max_spill_size > 0) {
1562       struct v3dv_device *device = pipeline->device;
1563 
1564       /* The TIDX register we use for choosing the area to access
1565        * for scratch space is: (core << 6) | (qpu << 2) | thread.
1566        * Even at minimum threadcount in a particular shader, that
1567        * means we still multiply the number of QPUs by 4.
1568        */
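      /* For example (numbers are illustrative only, qpu_count is
       * device-specific): with 12 QPUs and a 512-byte per-thread
       * max_spill_size this reserves 4 * 12 * 512 = 24576 bytes of scratch.
       */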
1569       const uint32_t total_spill_size =
1570          4 * device->devinfo.qpu_count * max_spill_size;
1571       if (pipeline->spill.bo) {
1572          assert(pipeline->spill.size_per_thread > 0);
1573          v3dv_bo_free(device, pipeline->spill.bo);
1574       }
1575       pipeline->spill.bo =
1576          v3dv_bo_alloc(device, total_spill_size, "spill", true);
1577       pipeline->spill.size_per_thread = max_spill_size;
1578    }
1579 }
1580 
1581 /**
1582  * Creates a new shader variant. Note that prog_data is not const, so it is
1583  * assumed that the caller provides a pointer that the shader_variant will
1584  * own.
1585  *
1586  * Creation doesn't include allocating a BO to store the contents of
1587  * qpu_insts, as we will try to share the same BO for several shader
1588  * variants. Also note that qpu_insts being NULL is valid, for example when
1589  * we are creating the shader_variants from the cache, so that we can upload
1590  * the assembly of all the shader stages at once.
1591  */
1592 struct v3dv_shader_variant *
1593 v3dv_shader_variant_create(struct v3dv_device *device,
1594                            enum broadcom_shader_stage stage,
1595                            struct v3d_prog_data *prog_data,
1596                            uint32_t prog_data_size,
1597                            uint32_t assembly_offset,
1598                            uint64_t *qpu_insts,
1599                            uint32_t qpu_insts_size,
1600                            VkResult *out_vk_result)
1601 {
1602    struct v3dv_shader_variant *variant =
1603       vk_zalloc(&device->vk.alloc, sizeof(*variant), 8,
1604                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1605 
1606    if (variant == NULL) {
1607       *out_vk_result = VK_ERROR_OUT_OF_HOST_MEMORY;
1608       return NULL;
1609    }
1610 
1611    variant->stage = stage;
1612    variant->prog_data_size = prog_data_size;
1613    variant->prog_data.base = prog_data;
1614 
1615    variant->assembly_offset = assembly_offset;
1616    variant->qpu_insts_size = qpu_insts_size;
1617    variant->qpu_insts = qpu_insts;
1618 
1619    *out_vk_result = VK_SUCCESS;
1620 
1621    return variant;
1622 }
1623 
1624 /* For a given key, returns the compiled version of the shader. Returns a
1625  * new reference to the shader_variant to the caller, or NULL.
1626  *
1627  * If the method returns NULL it means that something went wrong:
1628  *   * Not enough memory: this is one of the possible outcomes defined by
1629  *     vkCreateXXXPipelines. out_vk_result will return the proper OOM error.
1630  *   * Compilation error: hypothetically this shouldn't happen, as the spec
1631  *     states that vkShaderModule needs to be created with valid SPIR-V, so
1632  *     any compilation failure is a driver bug. In practice, something as
1633  *     common as failing to register allocate can lead to a compilation
1634  *     failure. In that case the only option (for any driver) is
1635  *     VK_ERROR_UNKNOWN, even if we know that the problem was a compiler
1636  *     error.
1637  */
1638 static struct v3dv_shader_variant *
1639 pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage,
1640                                 struct v3d_key *key,
1641                                 size_t key_size,
1642                                 const VkAllocationCallbacks *pAllocator,
1643                                 VkResult *out_vk_result)
1644 {
1645    int64_t stage_start = os_time_get_nano();
1646 
1647    struct v3dv_pipeline *pipeline = p_stage->pipeline;
1648    struct v3dv_physical_device *physical_device =
1649       &pipeline->device->instance->physicalDevice;
1650    const struct v3d_compiler *compiler = physical_device->compiler;
1651 
1652    if (unlikely(V3D_DEBUG & (V3D_DEBUG_NIR |
1653                              v3d_debug_flag_for_shader_stage
1654                              (broadcom_shader_stage_to_gl(p_stage->stage))))) {
1655       fprintf(stderr, "Just before v3d_compile: %s prog %d NIR:\n",
1656               broadcom_shader_stage_name(p_stage->stage),
1657               p_stage->program_id);
1658       nir_print_shader(p_stage->nir, stderr);
1659       fprintf(stderr, "\n");
1660    }
1661 
1662    uint64_t *qpu_insts;
1663    uint32_t qpu_insts_size;
1664    struct v3d_prog_data *prog_data;
1665    uint32_t prog_data_size =
1666       v3d_prog_data_size(broadcom_shader_stage_to_gl(p_stage->stage));
1667 
1668    qpu_insts = v3d_compile(compiler,
1669                            key, &prog_data,
1670                            p_stage->nir,
1671                            shader_debug_output, NULL,
1672                            p_stage->program_id, 0,
1673                            &qpu_insts_size);
1674 
1675    struct v3dv_shader_variant *variant = NULL;
1676 
1677    if (!qpu_insts) {
1678       fprintf(stderr, "Failed to compile %s prog %d NIR to VIR\n",
1679               broadcom_shader_stage_name(p_stage->stage),
1680               p_stage->program_id);
1681       *out_vk_result = VK_ERROR_UNKNOWN;
1682    } else {
1683       variant =
1684          v3dv_shader_variant_create(pipeline->device, p_stage->stage,
1685                                     prog_data, prog_data_size,
1686                                     0, /* assembly_offset, no final value yet */
1687                                     qpu_insts, qpu_insts_size,
1688                                     out_vk_result);
1689    }
1690    /* At this point we don't need the NIR shader anymore, but we free all the
1691     * temporary p_stage structs used during pipeline creation when we finish
1692     * it, so let's not worry about freeing the NIR here.
1693     */
1694 
1695    p_stage->feedback.duration += os_time_get_nano() - stage_start;
1696 
1697    return variant;
1698 }
1699 
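/* Cross-stage NIR linking: scalarize and split I/O where needed, drop
 * varyings that the consumer never reads, and re-run the optimization loop on
 * both shaders so the producer/consumer interfaces stay in sync.
 */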
1700 static void
1701 link_shaders(nir_shader *producer, nir_shader *consumer)
1702 {
1703    assert(producer);
1704    assert(consumer);
1705 
1706    if (producer->options->lower_to_scalar) {
1707       NIR_PASS(_, producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
1708       NIR_PASS(_, consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);
1709    }
1710 
1711    nir_lower_io_arrays_to_elements(producer, consumer);
1712 
1713    nir_optimize(producer, false);
1714    nir_optimize(consumer, false);
1715 
1716    if (nir_link_opt_varyings(producer, consumer))
1717       nir_optimize(consumer, false);
1718 
1719    NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
1720    NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
1721 
1722    if (nir_remove_unused_varyings(producer, consumer)) {
1723       NIR_PASS(_, producer, nir_lower_global_vars_to_local);
1724       NIR_PASS(_, consumer, nir_lower_global_vars_to_local);
1725 
1726       nir_optimize(producer, false);
1727       nir_optimize(consumer, false);
1728 
1729       /* Optimizations can cause varyings to become unused.
1730        * nir_compact_varyings() depends on all dead varyings being removed so
1731        * we need to call nir_remove_dead_variables() again here.
1732        */
1733       NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
1734       NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
1735    }
1736 }
1737 
1738 static void
1739 pipeline_lower_nir(struct v3dv_pipeline *pipeline,
1740                    struct v3dv_pipeline_stage *p_stage,
1741                    struct v3dv_pipeline_layout *layout)
1742 {
1743    int64_t stage_start = os_time_get_nano();
1744 
1745    assert(pipeline->shared_data &&
1746           pipeline->shared_data->maps[p_stage->stage]);
1747 
1748    nir_shader_gather_info(p_stage->nir, nir_shader_get_entrypoint(p_stage->nir));
1749 
1750    /* We add this because we need a valid sampler for nir_lower_tex to do
1751     * unpacking of the texture operation result, even for the case where there
1752     * is no sampler state.
1753     *
1754     * We add two of those: one for the case where we need a 16-bit return
1755     * size, and another for the case where we need a 32-bit return size.
1756     */
1757    struct v3dv_descriptor_maps *maps =
1758       pipeline->shared_data->maps[p_stage->stage];
1759 
1760    UNUSED unsigned index;
1761    index = descriptor_map_add(&maps->sampler_map, -1, -1, -1, 0, 0, 16);
1762    assert(index == V3DV_NO_SAMPLER_16BIT_IDX);
1763 
1764    index = descriptor_map_add(&maps->sampler_map, -2, -2, -2, 0, 0, 32);
1765    assert(index == V3DV_NO_SAMPLER_32BIT_IDX);
1766 
1767    /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */
1768    bool needs_default_sampler_state = false;
1769    NIR_PASS(_, p_stage->nir, lower_pipeline_layout_info, pipeline, layout,
1770             &needs_default_sampler_state);
1771 
1772    /* If in the end we didn't need to use the default sampler states and the
1773     * shader doesn't need any other samplers, get rid of them so we can
1774     * recognize that this program doesn't use any samplers at all.
1775     */
1776    if (!needs_default_sampler_state && maps->sampler_map.num_desc == 2)
1777       maps->sampler_map.num_desc = 0;
1778 
1779    p_stage->feedback.duration += os_time_get_nano() - stage_start;
1780 }
1781 
1782 /**
1783  * The SPIR-V compiler will insert a sized compact array for
1784  * VARYING_SLOT_CLIP_DIST0 if the vertex shader writes to gl_ClipDistance[],
1785  * where the size of the array determines the number of active clip planes.
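 * For example, a vertex shader declaring gl_ClipDistance[3] yields a mask of
 * 0b111 (user clip planes 0, 1 and 2 enabled).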
1786  */
1787 static uint32_t
1788 get_ucp_enable_mask(struct v3dv_pipeline_stage *p_stage)
1789 {
1790    assert(p_stage->stage == BROADCOM_SHADER_VERTEX);
1791    const nir_shader *shader = p_stage->nir;
1792    assert(shader);
1793 
1794    nir_foreach_variable_with_modes(var, shader, nir_var_shader_out) {
1795       if (var->data.location == VARYING_SLOT_CLIP_DIST0) {
1796          assert(var->data.compact);
1797          return (1 << glsl_get_length(var->type)) - 1;
1798       }
1799    }
1800    return 0;
1801 }
1802 
1803 static nir_shader *
1804 pipeline_stage_get_nir(struct v3dv_pipeline_stage *p_stage,
1805                        struct v3dv_pipeline *pipeline,
1806                        struct v3dv_pipeline_cache *cache)
1807 {
1808    int64_t stage_start = os_time_get_nano();
1809 
1810    nir_shader *nir = NULL;
1811 
1812    nir = v3dv_pipeline_cache_search_for_nir(pipeline, cache,
1813                                             &v3dv_nir_options,
1814                                             p_stage->shader_sha1);
1815 
1816    if (nir) {
1817       assert(nir->info.stage == broadcom_shader_stage_to_gl(p_stage->stage));
1818 
1819       /* A NIR cache hit doesn't avoid the large majority of pipeline stage
1820        * creation, so the cache hit is not recorded in the pipeline feedback
1821        * flags.
1822        */
1823 
1824       p_stage->feedback.duration += os_time_get_nano() - stage_start;
1825 
1826       return nir;
1827    }
1828 
1829    nir = shader_module_compile_to_nir(pipeline->device, p_stage);
1830 
1831    if (nir) {
1832       struct v3dv_pipeline_cache *default_cache =
1833          &pipeline->device->default_pipeline_cache;
1834 
1835       v3dv_pipeline_cache_upload_nir(pipeline, cache, nir,
1836                                      p_stage->shader_sha1);
1837 
1838       /* Ensure that the variant is in the default cache, as the cmd_buffer
1839        * could need to change the current variant.
1840        */
1841       if (default_cache != cache) {
1842          v3dv_pipeline_cache_upload_nir(pipeline, default_cache, nir,
1843                                         p_stage->shader_sha1);
1844       }
1845 
1846       p_stage->feedback.duration += os_time_get_nano() - stage_start;
1847 
1848       return nir;
1849    }
1850 
1851    /* FIXME: this shouldn't happen, raise error? */
1852    return NULL;
1853 }
1854 
1855 static VkResult
1856 pipeline_compile_vertex_shader(struct v3dv_pipeline *pipeline,
1857                                const VkAllocationCallbacks *pAllocator,
1858                                const VkGraphicsPipelineCreateInfo *pCreateInfo)
1859 {
1860    assert(pipeline->vs_bin != NULL);
1861    if (pipeline->vs_bin->nir == NULL) {
1862       assert(pipeline->vs->nir);
1863       pipeline->vs_bin->nir = nir_shader_clone(NULL, pipeline->vs->nir);
1864    }
1865 
1866    VkResult vk_result;
1867    struct v3d_vs_key key;
1868    pipeline_populate_v3d_vs_key(&key, pCreateInfo, pipeline->vs);
1869    pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX] =
1870       pipeline_compile_shader_variant(pipeline->vs, &key.base, sizeof(key),
1871                                       pAllocator, &vk_result);
1872    if (vk_result != VK_SUCCESS)
1873       return vk_result;
1874 
1875    pipeline_populate_v3d_vs_key(&key, pCreateInfo, pipeline->vs_bin);
1876    pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN] =
1877       pipeline_compile_shader_variant(pipeline->vs_bin, &key.base, sizeof(key),
1878                                       pAllocator, &vk_result);
1879 
1880    return vk_result;
1881 }
1882 
1883 static VkResult
1884 pipeline_compile_geometry_shader(struct v3dv_pipeline *pipeline,
1885                                  const VkAllocationCallbacks *pAllocator,
1886                                  const VkGraphicsPipelineCreateInfo *pCreateInfo)
1887 {
1888    assert(pipeline->gs);
1889 
1890    assert(pipeline->gs_bin != NULL);
1891    if (pipeline->gs_bin->nir == NULL) {
1892       assert(pipeline->gs->nir);
1893       pipeline->gs_bin->nir = nir_shader_clone(NULL, pipeline->gs->nir);
1894    }
1895 
1896    VkResult vk_result;
1897    struct v3d_gs_key key;
1898    pipeline_populate_v3d_gs_key(&key, pCreateInfo, pipeline->gs);
1899    pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY] =
1900       pipeline_compile_shader_variant(pipeline->gs, &key.base, sizeof(key),
1901                                       pAllocator, &vk_result);
1902    if (vk_result != VK_SUCCESS)
1903       return vk_result;
1904 
1905    pipeline_populate_v3d_gs_key(&key, pCreateInfo, pipeline->gs_bin);
1906    pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN] =
1907       pipeline_compile_shader_variant(pipeline->gs_bin, &key.base, sizeof(key),
1908                                       pAllocator, &vk_result);
1909 
1910    return vk_result;
1911 }
1912 
1913 static VkResult
1914 pipeline_compile_fragment_shader(struct v3dv_pipeline *pipeline,
1915                                  const VkAllocationCallbacks *pAllocator,
1916                                  const VkGraphicsPipelineCreateInfo *pCreateInfo)
1917 {
1918    struct v3dv_pipeline_stage *p_stage = pipeline->fs;
1919 
1921 
1922    struct v3d_fs_key key;
1923 
1924    pipeline_populate_v3d_fs_key(&key, pCreateInfo, p_stage,
1925                                 pipeline->gs != NULL,
1926                                 get_ucp_enable_mask(pipeline->vs));
1927 
1928    VkResult vk_result;
1929    pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT] =
1930       pipeline_compile_shader_variant(p_stage, &key.base, sizeof(key),
1931                                       pAllocator, &vk_result);
1932 
1933    return vk_result;
1934 }
1935 
1936 static void
1937 pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline,
1938                                struct v3dv_pipeline_key *key,
1939                                const VkGraphicsPipelineCreateInfo *pCreateInfo)
1940 {
1941    memset(key, 0, sizeof(*key));
1942    key->robust_buffer_access =
1943       pipeline->device->features.robustBufferAccess;
1944 
1945    const bool raster_enabled =
1946       !pCreateInfo->pRasterizationState->rasterizerDiscardEnable;
1947 
1948    const VkPipelineInputAssemblyStateCreateInfo *ia_info =
1949       pCreateInfo->pInputAssemblyState;
1950    key->topology = vk_to_pipe_prim_type[ia_info->topology];
1951 
1952    const VkPipelineColorBlendStateCreateInfo *cb_info =
1953       raster_enabled ? pCreateInfo->pColorBlendState : NULL;
1954 
1955    key->logicop_func = cb_info && cb_info->logicOpEnable == VK_TRUE ?
1956       vk_to_pipe_logicop[cb_info->logicOp] :
1957       PIPE_LOGICOP_COPY;
1958 
1959    /* Multisample rasterization state must be ignored if rasterization
1960     * is disabled.
1961     */
1962    const VkPipelineMultisampleStateCreateInfo *ms_info =
1963       raster_enabled ? pCreateInfo->pMultisampleState : NULL;
1964    if (ms_info) {
1965       assert(ms_info->rasterizationSamples == VK_SAMPLE_COUNT_1_BIT ||
1966              ms_info->rasterizationSamples == VK_SAMPLE_COUNT_4_BIT);
1967       key->msaa = ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT;
1968 
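      /* sample_coverage is set when the pipeline's static sample mask
       * disables any of the (at most 4) samples, since that affects the
       * compiled code.
       */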
1969       if (key->msaa) {
1970          key->sample_coverage =
1971             pipeline->sample_mask != (1 << V3D_MAX_SAMPLES) - 1;
1972          key->sample_alpha_to_coverage = ms_info->alphaToCoverageEnable;
1973          key->sample_alpha_to_one = ms_info->alphaToOneEnable;
1974       }
1975    }
1976 
1977    const struct v3dv_render_pass *pass =
1978       v3dv_render_pass_from_handle(pCreateInfo->renderPass);
1979    const struct v3dv_subpass *subpass = pipeline->subpass;
1980    for (uint32_t i = 0; i < subpass->color_count; i++) {
1981       const uint32_t att_idx = subpass->color_attachments[i].attachment;
1982       if (att_idx == VK_ATTACHMENT_UNUSED)
1983          continue;
1984 
1985       key->cbufs |= 1 << i;
1986 
1987       VkFormat fb_format = pass->attachments[att_idx].desc.format;
1988       enum pipe_format fb_pipe_format = vk_format_to_pipe_format(fb_format);
1989 
1990       /* If logic operations are enabled then we might emit color reads and we
1991        * need to know the color buffer format and swizzle for that
1992        */
1993       if (key->logicop_func != PIPE_LOGICOP_COPY) {
1994          key->color_fmt[i].format = fb_pipe_format;
1995          memcpy(key->color_fmt[i].swizzle,
1996                 v3dv_get_format_swizzle(pipeline->device, fb_format),
1997                 sizeof(key->color_fmt[i].swizzle));
1998       }
1999 
2000       const struct util_format_description *desc =
2001          vk_format_description(fb_format);
2002 
2003       if (desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
2004           desc->channel[0].size == 32) {
2005          key->f32_color_rb |= 1 << i;
2006       }
2007    }
2008 
2009    const VkPipelineVertexInputStateCreateInfo *vi_info =
2010       pCreateInfo->pVertexInputState;
2011    for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) {
2012       const VkVertexInputAttributeDescription *desc =
2013          &vi_info->pVertexAttributeDescriptions[i];
2014       assert(desc->location < MAX_VERTEX_ATTRIBS);
2015       if (desc->format == VK_FORMAT_B8G8R8A8_UNORM)
2016          key->va_swap_rb_mask |= 1 << (VERT_ATTRIB_GENERIC0 + desc->location);
2017    }
2018 
2019    assert(pipeline->subpass);
2020    key->has_multiview = pipeline->subpass->view_mask != 0;
2021 }
2022 
2023 static void
2024 pipeline_populate_compute_key(struct v3dv_pipeline *pipeline,
2025                               struct v3dv_pipeline_key *key,
2026                               const VkComputePipelineCreateInfo *pCreateInfo)
2027 {
2028    /* We use the same pipeline key for graphics and compute, but we don't need
2029     * to add a field to flag compute keys, because this key is not used alone
2030     * to search the cache; we also use, for example, the SPIR-V or the
2031     * serialized NIR, which already identifies compute shaders.
2032     */
2033    memset(key, 0, sizeof(*key));
2034    key->robust_buffer_access =
2035       pipeline->device->features.robustBufferAccess;
2036 }
2037 
2038 static struct v3dv_pipeline_shared_data *
2039 v3dv_pipeline_shared_data_new_empty(const unsigned char sha1_key[20],
2040                                     struct v3dv_pipeline *pipeline,
2041                                     bool is_graphics_pipeline)
2042 {
2043    /* We create new_entry using the device alloc. Right now shared_data is
2044     * referenced and unreferenced by both the pipeline and the pipeline cache,
2045     * so we can't ensure that the cache or pipeline alloc will be available on
2046     * the last unref.
2047     */
2048    struct v3dv_pipeline_shared_data *new_entry =
2049       vk_zalloc2(&pipeline->device->vk.alloc, NULL,
2050                  sizeof(struct v3dv_pipeline_shared_data), 8,
2051                  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2052 
2053    if (new_entry == NULL)
2054       return NULL;
2055 
2056    for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
2057       /* We don't need specific descriptor maps for binning stages; we use the
2058        * map for the render stage.
2059        */
2060       if (broadcom_shader_stage_is_binning(stage))
2061          continue;
2062 
2063       if ((is_graphics_pipeline && stage == BROADCOM_SHADER_COMPUTE) ||
2064           (!is_graphics_pipeline && stage != BROADCOM_SHADER_COMPUTE)) {
2065          continue;
2066       }
2067 
2068       if (stage == BROADCOM_SHADER_GEOMETRY && !pipeline->gs) {
2069          /* We always inject a custom GS if we have multiview */
2070          if (!pipeline->subpass->view_mask)
2071             continue;
2072       }
2073 
2074       struct v3dv_descriptor_maps *new_maps =
2075          vk_zalloc2(&pipeline->device->vk.alloc, NULL,
2076                     sizeof(struct v3dv_descriptor_maps), 8,
2077                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2078 
2079       if (new_maps == NULL)
2080          goto fail;
2081 
2082       new_entry->maps[stage] = new_maps;
2083    }
2084 
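   /* Binning stages share the descriptor maps of their render counterparts. */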
2085    new_entry->maps[BROADCOM_SHADER_VERTEX_BIN] =
2086       new_entry->maps[BROADCOM_SHADER_VERTEX];
2087 
2088    new_entry->maps[BROADCOM_SHADER_GEOMETRY_BIN] =
2089       new_entry->maps[BROADCOM_SHADER_GEOMETRY];
2090 
2091    new_entry->ref_cnt = 1;
2092    memcpy(new_entry->sha1_key, sha1_key, 20);
2093 
2094    return new_entry;
2095 
2096 fail:
2097    if (new_entry != NULL) {
2098       for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
2099          if (new_entry->maps[stage] != NULL)
2100             vk_free(&pipeline->device->vk.alloc, new_entry->maps[stage]);
2101       }
2102    }
2103 
2104    vk_free(&pipeline->device->vk.alloc, new_entry);
2105 
2106    return NULL;
2107 }
2108 
2109 static void
2110 write_creation_feedback(struct v3dv_pipeline *pipeline,
2111                         const void *next,
2112                         const VkPipelineCreationFeedback *pipeline_feedback,
2113                         uint32_t stage_count,
2114                         const VkPipelineShaderStageCreateInfo *stages)
2115 {
2116    const VkPipelineCreationFeedbackCreateInfo *create_feedback =
2117       vk_find_struct_const(next, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
2118 
2119    if (create_feedback) {
2120       typed_memcpy(create_feedback->pPipelineCreationFeedback,
2121              pipeline_feedback,
2122              1);
2123 
2124       assert(stage_count == create_feedback->pipelineStageCreationFeedbackCount);
2125 
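      /* Binning shaders are internal to the driver, so their compile time is
       * folded into the feedback entry of the matching render stage.
       */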
2126       for (uint32_t i = 0; i < stage_count; i++) {
2127          gl_shader_stage s = vk_to_mesa_shader_stage(stages[i].stage);
2128          switch (s) {
2129          case MESA_SHADER_VERTEX:
2130             create_feedback->pPipelineStageCreationFeedbacks[i] =
2131                pipeline->vs->feedback;
2132 
2133             create_feedback->pPipelineStageCreationFeedbacks[i].duration +=
2134                pipeline->vs_bin->feedback.duration;
2135             break;
2136 
2137          case MESA_SHADER_GEOMETRY:
2138             create_feedback->pPipelineStageCreationFeedbacks[i] =
2139                pipeline->gs->feedback;
2140 
2141             create_feedback->pPipelineStageCreationFeedbacks[i].duration +=
2142                pipeline->gs_bin->feedback.duration;
2143             break;
2144 
2145          case MESA_SHADER_FRAGMENT:
2146             create_feedback->pPipelineStageCreationFeedbacks[i] =
2147                pipeline->fs->feedback;
2148             break;
2149 
2150          case MESA_SHADER_COMPUTE:
2151             create_feedback->pPipelineStageCreationFeedbacks[i] =
2152                pipeline->cs->feedback;
2153             break;
2154 
2155          default:
2156             unreachable("not supported shader stage");
2157          }
2158       }
2159    }
2160 }
2161 
2162 static enum shader_prim
2163 multiview_gs_input_primitive_from_pipeline(struct v3dv_pipeline *pipeline)
2164 {
2165    switch (pipeline->topology) {
2166    case PIPE_PRIM_POINTS:
2167       return SHADER_PRIM_POINTS;
2168    case PIPE_PRIM_LINES:
2169    case PIPE_PRIM_LINE_STRIP:
2170       return SHADER_PRIM_LINES;
2171    case PIPE_PRIM_TRIANGLES:
2172    case PIPE_PRIM_TRIANGLE_STRIP:
2173    case PIPE_PRIM_TRIANGLE_FAN:
2174       return SHADER_PRIM_TRIANGLES;
2175    default:
2176       /* Since we don't allow GS with multiview, we can only see non-adjacency
2177        * primitives.
2178        */
2179       unreachable("Unexpected pipeline primitive type");
2180    }
2181 }
2182 
2183 static enum shader_prim
2184 multiview_gs_output_primitive_from_pipeline(struct v3dv_pipeline *pipeline)
2185 {
2186    switch (pipeline->topology) {
2187    case PIPE_PRIM_POINTS:
2188       return SHADER_PRIM_POINTS;
2189    case PIPE_PRIM_LINES:
2190    case PIPE_PRIM_LINE_STRIP:
2191       return SHADER_PRIM_LINE_STRIP;
2192    case PIPE_PRIM_TRIANGLES:
2193    case PIPE_PRIM_TRIANGLE_STRIP:
2194    case PIPE_PRIM_TRIANGLE_FAN:
2195       return SHADER_PRIM_TRIANGLE_STRIP;
2196    default:
2197       /* Since we don't allow GS with multiview, we can only see non-adjacency
2198        * primitives.
2199        */
2200       unreachable("Unexpected pipeline primitive type");
2201    }
2202 }
2203 
2204 static bool
2205 pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline,
2206                           struct v3dv_pipeline_cache *cache,
2207                           const VkAllocationCallbacks *pAllocator)
2208 {
2209    /* Create the passthrough GS from the VS output interface */
2210    pipeline->vs->nir = pipeline_stage_get_nir(pipeline->vs, pipeline, cache);
2211    nir_shader *vs_nir = pipeline->vs->nir;
2212 
2213    const nir_shader_compiler_options *options = v3dv_pipeline_get_nir_options();
2214    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_GEOMETRY, options,
2215                                                   "multiview broadcast gs");
2216    nir_shader *nir = b.shader;
2217    nir->info.inputs_read = vs_nir->info.outputs_written;
2218    nir->info.outputs_written = vs_nir->info.outputs_written |
2219                                (1ull << VARYING_SLOT_LAYER);
2220 
2221    uint32_t vertex_count = u_vertices_per_prim(pipeline->topology);
2222    nir->info.gs.input_primitive =
2223       multiview_gs_input_primitive_from_pipeline(pipeline);
2224    nir->info.gs.output_primitive =
2225       multiview_gs_output_primitive_from_pipeline(pipeline);
2226    nir->info.gs.vertices_in = vertex_count;
2227    nir->info.gs.vertices_out = nir->info.gs.vertices_in;
2228    nir->info.gs.invocations = 1;
2229    nir->info.gs.active_stream_mask = 0x1;
2230 
2231    /* Make a list of GS input/output variables from the VS outputs */
2232    nir_variable *in_vars[100];
2233    nir_variable *out_vars[100];
2234    uint32_t var_count = 0;
2235    nir_foreach_shader_out_variable(out_vs_var, vs_nir) {
2236       char name[8];
2237       snprintf(name, ARRAY_SIZE(name), "in_%d", var_count);
2238 
2239       in_vars[var_count] =
2240          nir_variable_create(nir, nir_var_shader_in,
2241                              glsl_array_type(out_vs_var->type, vertex_count, 0),
2242                              name);
2243       in_vars[var_count]->data.location = out_vs_var->data.location;
2244       in_vars[var_count]->data.location_frac = out_vs_var->data.location_frac;
2245       in_vars[var_count]->data.interpolation = out_vs_var->data.interpolation;
2246 
2247       snprintf(name, ARRAY_SIZE(name), "out_%d", var_count);
2248       out_vars[var_count] =
2249          nir_variable_create(nir, nir_var_shader_out, out_vs_var->type, name);
2250       out_vars[var_count]->data.location = out_vs_var->data.location;
2251       out_vars[var_count]->data.interpolation = out_vs_var->data.interpolation;
2252 
2253       var_count++;
2254    }
2255 
2256    /* Add the gl_Layer output variable */
2257    nir_variable *out_layer =
2258       nir_variable_create(nir, nir_var_shader_out, glsl_int_type(),
2259                           "out_Layer");
2260    out_layer->data.location = VARYING_SLOT_LAYER;
2261 
2262    /* Get the view index value that we will write to gl_Layer */
2263    nir_ssa_def *layer =
2264       nir_load_system_value(&b, nir_intrinsic_load_view_index, 0, 1, 32);
2265 
2266    /* Emit all output vertices */
2267    for (uint32_t vi = 0; vi < vertex_count; vi++) {
2268       /* Emit all output varyings */
2269       for (uint32_t i = 0; i < var_count; i++) {
2270          nir_deref_instr *in_value =
2271             nir_build_deref_array_imm(&b, nir_build_deref_var(&b, in_vars[i]), vi);
2272          nir_copy_deref(&b, nir_build_deref_var(&b, out_vars[i]), in_value);
2273       }
2274 
2275       /* Emit gl_Layer write */
2276       nir_store_var(&b, out_layer, layer, 0x1);
2277 
2278       nir_emit_vertex(&b, 0);
2279    }
2280    nir_end_primitive(&b, 0);
2281 
2282    /* Make sure we run our pre-process NIR passes so we produce NIR compatible
2283     * with what we expect from SPIR-V modules.
2284     */
2285    preprocess_nir(nir);
2286 
2287    /* Attach the geometry shader to the pipeline */
2288    struct v3dv_device *device = pipeline->device;
2289    struct v3dv_physical_device *physical_device =
2290       &device->instance->physicalDevice;
2291 
2292    struct v3dv_pipeline_stage *p_stage =
2293       vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*p_stage), 8,
2294                  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2295 
2296    if (p_stage == NULL) {
2297       ralloc_free(nir);
2298       return false;
2299    }
2300 
2301    p_stage->pipeline = pipeline;
2302    p_stage->stage = BROADCOM_SHADER_GEOMETRY;
2303    p_stage->entrypoint = "main";
2304    p_stage->module = 0;
2305    p_stage->nir = nir;
2306    pipeline_compute_sha1_from_nir(p_stage);
2307    p_stage->program_id = p_atomic_inc_return(&physical_device->next_program_id);
2308 
2309    pipeline->has_gs = true;
2310    pipeline->gs = p_stage;
2311    pipeline->active_stages |= VK_SHADER_STAGE_GEOMETRY_BIT;
2312 
2313    pipeline->gs_bin =
2314       pipeline_stage_create_binning(pipeline->gs, pAllocator);
2315    if (pipeline->gs_bin == NULL)
2316       return false;
2317 
2318    return true;
2319 }
2320 
2321 static void
2322 pipeline_check_buffer_device_address(struct v3dv_pipeline *pipeline)
2323 {
2324    for (int i = BROADCOM_SHADER_VERTEX; i < BROADCOM_SHADER_STAGES; i++) {
2325       struct v3dv_shader_variant *variant = pipeline->shared_data->variants[i];
2326       if (variant && variant->prog_data.base->has_global_address) {
2327          pipeline->uses_buffer_device_address = true;
2328          return;
2329       }
2330    }
2331 
2332    pipeline->uses_buffer_device_address = false;
2333 }
2334 
2335 /*
2336  * Compiles a pipeline. Note that it also allocates internal objects, but if
2337  * some allocations succeed while others fail, the method does not free the
2338  * successful ones.
2339  *
2340  * This is done to simplify the code: in that case we just call the pipeline
2341  * destroy method, which handles freeing the internal objects that were
2342  * allocated. We just need to be careful to set to NULL the objects that were
2343  * not allocated.
2344  */
2345 static VkResult
2346 pipeline_compile_graphics(struct v3dv_pipeline *pipeline,
2347                           struct v3dv_pipeline_cache *cache,
2348                           const VkGraphicsPipelineCreateInfo *pCreateInfo,
2349                           const VkAllocationCallbacks *pAllocator)
2350 {
2351    VkPipelineCreationFeedback pipeline_feedback = {
2352       .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
2353    };
2354    int64_t pipeline_start = os_time_get_nano();
2355 
2356    struct v3dv_device *device = pipeline->device;
2357    struct v3dv_physical_device *physical_device =
2358       &device->instance->physicalDevice;
2359 
2360    /* First pass to get some common info from the shader, and create the
2361     * individual pipeline_stage objects
2362     */
2363    for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
2364       const VkPipelineShaderStageCreateInfo *sinfo = &pCreateInfo->pStages[i];
2365       gl_shader_stage stage = vk_to_mesa_shader_stage(sinfo->stage);
2366 
2367       struct v3dv_pipeline_stage *p_stage =
2368          vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*p_stage), 8,
2369                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2370 
2371       if (p_stage == NULL)
2372          return VK_ERROR_OUT_OF_HOST_MEMORY;
2373 
2374       /* Note that we are assigning program_id slightly differently than
2375        * v3d. Here we are assigning one per pipeline stage, so vs and vs_bin
2376        * would have different program_ids, while v3d would have the same for
2377        * both. For the case of v3dv, it is more natural to have an id this way,
2378        * as right now we are using it for debugging, not for shader-db.
2379        */
2380       p_stage->program_id =
2381          p_atomic_inc_return(&physical_device->next_program_id);
2382 
2383       p_stage->pipeline = pipeline;
2384       p_stage->stage = gl_shader_stage_to_broadcom(stage);
2385       p_stage->entrypoint = sinfo->pName;
2386       p_stage->module = vk_shader_module_from_handle(sinfo->module);
2387       p_stage->spec_info = sinfo->pSpecializationInfo;
2388 
2389       vk_pipeline_hash_shader_stage(&pCreateInfo->pStages[i], p_stage->shader_sha1);
2390 
2391       pipeline->active_stages |= sinfo->stage;
2392 
2393       /* We will try to get the compiled shader variant directly, so let's not
2394        * worry about getting the NIR shader for now.
2395        */
2396       p_stage->nir = NULL;
2397 
2398       switch(stage) {
2399       case MESA_SHADER_VERTEX:
2400          pipeline->vs = p_stage;
2401          pipeline->vs_bin =
2402             pipeline_stage_create_binning(pipeline->vs, pAllocator);
2403          if (pipeline->vs_bin == NULL)
2404             return VK_ERROR_OUT_OF_HOST_MEMORY;
2405          break;
2406 
2407       case MESA_SHADER_GEOMETRY:
2408          pipeline->has_gs = true;
2409          pipeline->gs = p_stage;
2410          pipeline->gs_bin =
2411             pipeline_stage_create_binning(pipeline->gs, pAllocator);
2412          if (pipeline->gs_bin == NULL)
2413             return VK_ERROR_OUT_OF_HOST_MEMORY;
2414          break;
2415 
2416       case MESA_SHADER_FRAGMENT:
2417          pipeline->fs = p_stage;
2418          break;
2419 
2420       default:
2421          unreachable("not supported shader stage");
2422       }
2423    }
2424 
2425    /* Add a no-op fragment shader if needed */
2426    if (!pipeline->fs) {
2427       nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
2428                                                      &v3dv_nir_options,
2429                                                      "noop_fs");
2430 
2431       struct v3dv_pipeline_stage *p_stage =
2432          vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*p_stage), 8,
2433                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2434 
2435       if (p_stage == NULL)
2436          return VK_ERROR_OUT_OF_HOST_MEMORY;
2437 
2438       p_stage->pipeline = pipeline;
2439       p_stage->stage = BROADCOM_SHADER_FRAGMENT;
2440       p_stage->entrypoint = "main";
2441       p_stage->module = 0;
2442       p_stage->nir = b.shader;
2443       pipeline_compute_sha1_from_nir(p_stage);
2444       p_stage->program_id =
2445          p_atomic_inc_return(&physical_device->next_program_id);
2446 
2447       pipeline->fs = p_stage;
2448       pipeline->active_stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
2449    }
2450 
2451    /* If multiview is enabled, we inject a custom passthrough geometry shader
2452     * to broadcast draw calls to the appropriate views.
2453     */
2454    assert(!pipeline->subpass->view_mask || (!pipeline->has_gs && !pipeline->gs));
2455    if (pipeline->subpass->view_mask) {
2456       if (!pipeline_add_multiview_gs(pipeline, cache, pAllocator))
2457          return VK_ERROR_OUT_OF_HOST_MEMORY;
2458    }
2459 
2460    /* First we try to get the variants from the pipeline cache (unless we are
2461     * required to capture internal representations, since in that case we need
2462     * to compile).
2463     */
2464    bool needs_executable_info =
2465       pCreateInfo->flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
2466    if (!needs_executable_info) {
2467       struct v3dv_pipeline_key pipeline_key;
2468       pipeline_populate_graphics_key(pipeline, &pipeline_key, pCreateInfo);
2469       pipeline_hash_graphics(pipeline, &pipeline_key, pipeline->sha1);
2470 
2471       bool cache_hit = false;
2472 
2473       pipeline->shared_data =
2474          v3dv_pipeline_cache_search_for_pipeline(cache,
2475                                                  pipeline->sha1,
2476                                                  &cache_hit);
2477 
2478       if (pipeline->shared_data != NULL) {
2479          /* A correct pipeline must have at least a VS and FS */
2480          assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]);
2481          assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]);
2482          assert(pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]);
2483          assert(!pipeline->gs ||
2484                 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]);
2485          assert(!pipeline->gs ||
2486                 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]);
2487 
2488          if (cache_hit && cache != &pipeline->device->default_pipeline_cache)
2489             pipeline_feedback.flags |=
2490                VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
2491 
2492          goto success;
2493       }
2494    }
2495 
2496    if (pCreateInfo->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT)
2497       return VK_PIPELINE_COMPILE_REQUIRED;
2498 
2499    /* Otherwise we try to get the NIR shaders (either from the original SPIR-V
2500     * shader or the pipeline cache) and compile.
2501     */
2502    pipeline->shared_data =
2503       v3dv_pipeline_shared_data_new_empty(pipeline->sha1, pipeline, true);
2504    if (!pipeline->shared_data)
2505       return VK_ERROR_OUT_OF_HOST_MEMORY;
2506 
2507    pipeline->vs->feedback.flags |=
2508       VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
2509    if (pipeline->gs)
2510       pipeline->gs->feedback.flags |=
2511          VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
2512    pipeline->fs->feedback.flags |=
2513       VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
2514 
2515    if (!pipeline->vs->nir)
2516       pipeline->vs->nir = pipeline_stage_get_nir(pipeline->vs, pipeline, cache);
2517    if (pipeline->gs && !pipeline->gs->nir)
2518       pipeline->gs->nir = pipeline_stage_get_nir(pipeline->gs, pipeline, cache);
2519    if (!pipeline->fs->nir)
2520       pipeline->fs->nir = pipeline_stage_get_nir(pipeline->fs, pipeline, cache);
2521 
2522    /* Linking + pipeline lowerings */
2523    if (pipeline->gs) {
2524       link_shaders(pipeline->gs->nir, pipeline->fs->nir);
2525       link_shaders(pipeline->vs->nir, pipeline->gs->nir);
2526    } else {
2527       link_shaders(pipeline->vs->nir, pipeline->fs->nir);
2528    }
2529 
2530    pipeline_lower_nir(pipeline, pipeline->fs, pipeline->layout);
2531    lower_fs_io(pipeline->fs->nir);
2532 
2533    if (pipeline->gs) {
2534       pipeline_lower_nir(pipeline, pipeline->gs, pipeline->layout);
2535       lower_gs_io(pipeline->gs->nir);
2536    }
2537 
2538    pipeline_lower_nir(pipeline, pipeline->vs, pipeline->layout);
2539    lower_vs_io(pipeline->vs->nir);
2540 
2541    /* Compiling to vir */
2542    VkResult vk_result;
2543 
2544    /* We should have gotten either all the variants or none from the cache */
2545    assert(!pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]);
2546    vk_result = pipeline_compile_fragment_shader(pipeline, pAllocator, pCreateInfo);
2547    if (vk_result != VK_SUCCESS)
2548       return vk_result;
2549 
2550    assert(!pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY] &&
2551           !pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]);
2552 
2553    if (pipeline->gs) {
2554       vk_result =
2555          pipeline_compile_geometry_shader(pipeline, pAllocator, pCreateInfo);
2556       if (vk_result != VK_SUCCESS)
2557          return vk_result;
2558    }
2559 
2560    assert(!pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX] &&
2561           !pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]);
2562 
2563    vk_result = pipeline_compile_vertex_shader(pipeline, pAllocator, pCreateInfo);
2564    if (vk_result != VK_SUCCESS)
2565       return vk_result;
2566 
2567    if (!upload_assembly(pipeline))
2568       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2569 
2570    v3dv_pipeline_cache_upload_pipeline(pipeline, cache);
2571 
2572  success:
2573 
2574    pipeline_check_buffer_device_address(pipeline);
2575 
2576    pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
2577    write_creation_feedback(pipeline,
2578                            pCreateInfo->pNext,
2579                            &pipeline_feedback,
2580                            pCreateInfo->stageCount,
2581                            pCreateInfo->pStages);
2582 
2583    /* Since we have the variants in the pipeline shared data we can now free
2584     * the pipeline stages.
2585     */
2586    if (!needs_executable_info)
2587       pipeline_free_stages(device, pipeline, pAllocator);
2588 
2589    pipeline_check_spill_size(pipeline);
2590 
2591    return compute_vpm_config(pipeline);
2592 }
2593 
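/* Computes the VPM (Vertex Pipe Memory) configuration for both the binning
 * and render pipelines from the compiled VS/GS program data. Failure is
 * reported as VK_ERROR_OUT_OF_DEVICE_MEMORY.
 */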
2594 static VkResult
2595 compute_vpm_config(struct v3dv_pipeline *pipeline)
2596 {
2597    struct v3dv_shader_variant *vs_variant =
2598       pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
2599    struct v3dv_shader_variant *vs_bin_variant =
2600       pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
2601    struct v3d_vs_prog_data *vs = vs_variant->prog_data.vs;
2602    struct v3d_vs_prog_data *vs_bin = vs_bin_variant->prog_data.vs;
2603 
2604    struct v3d_gs_prog_data *gs = NULL;
2605    struct v3d_gs_prog_data *gs_bin = NULL;
2606    if (pipeline->has_gs) {
2607       struct v3dv_shader_variant *gs_variant =
2608          pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY];
2609       struct v3dv_shader_variant *gs_bin_variant =
2610          pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
2611       gs = gs_variant->prog_data.gs;
2612       gs_bin = gs_bin_variant->prog_data.gs;
2613    }
2614 
2615    if (!v3d_compute_vpm_config(&pipeline->device->devinfo,
2616                                vs_bin, vs, gs_bin, gs,
2617                                &pipeline->vpm_cfg_bin,
2618                                &pipeline->vpm_cfg)) {
2619       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2620    }
2621 
2622    return VK_SUCCESS;
2623 }
2624 
2625 static unsigned
2626 v3dv_dynamic_state_mask(VkDynamicState state)
2627 {
2628    switch(state) {
2629    case VK_DYNAMIC_STATE_VIEWPORT:
2630       return V3DV_DYNAMIC_VIEWPORT;
2631    case VK_DYNAMIC_STATE_SCISSOR:
2632       return V3DV_DYNAMIC_SCISSOR;
2633    case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK:
2634       return V3DV_DYNAMIC_STENCIL_COMPARE_MASK;
2635    case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK:
2636       return V3DV_DYNAMIC_STENCIL_WRITE_MASK;
2637    case VK_DYNAMIC_STATE_STENCIL_REFERENCE:
2638       return V3DV_DYNAMIC_STENCIL_REFERENCE;
2639    case VK_DYNAMIC_STATE_BLEND_CONSTANTS:
2640       return V3DV_DYNAMIC_BLEND_CONSTANTS;
2641    case VK_DYNAMIC_STATE_DEPTH_BIAS:
2642       return V3DV_DYNAMIC_DEPTH_BIAS;
2643    case VK_DYNAMIC_STATE_LINE_WIDTH:
2644       return V3DV_DYNAMIC_LINE_WIDTH;
2645    case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT:
2646       return V3DV_DYNAMIC_COLOR_WRITE_ENABLE;
2647 
2648    /* Depth bounds testing is not available in V3D 4.2, so here we are just
2649     * ignoring this dynamic state. We are already asserting at pipeline creation
2650     * time that depth bounds testing is not enabled.
2651     */
2652    case VK_DYNAMIC_STATE_DEPTH_BOUNDS:
2653       return 0;
2654 
2655    default:
2656       unreachable("Unhandled dynamic state");
2657    }
2658 }
2659 
2660 static void
2661 pipeline_init_dynamic_state(
2662    struct v3dv_pipeline *pipeline,
2663    const VkPipelineDynamicStateCreateInfo *pDynamicState,
2664    const VkPipelineViewportStateCreateInfo *pViewportState,
2665    const VkPipelineDepthStencilStateCreateInfo *pDepthStencilState,
2666    const VkPipelineColorBlendStateCreateInfo *pColorBlendState,
2667    const VkPipelineRasterizationStateCreateInfo *pRasterizationState,
2668    const VkPipelineColorWriteCreateInfoEXT *pColorWriteState)
2669 {
2670    /* Initialize to default values */
2671    struct v3dv_dynamic_state *dynamic = &pipeline->dynamic_state;
2672    memset(dynamic, 0, sizeof(*dynamic));
2673    dynamic->stencil_compare_mask.front = ~0;
2674    dynamic->stencil_compare_mask.back = ~0;
2675    dynamic->stencil_write_mask.front = ~0;
2676    dynamic->stencil_write_mask.back = ~0;
2677    dynamic->line_width = 1.0f;
2678    dynamic->color_write_enable = (1ull << (4 * V3D_MAX_DRAW_BUFFERS)) - 1;
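   /* NOTE (editor): color_write_enable packs a 4-bit mask per color
    * attachment (presumably one bit per RGBA channel), which is why the
    * default above sets 4 bits for each of the V3D_MAX_DRAW_BUFFERS
    * attachments and the VK_EXT_color_write_enable path below ORs in
    * 0xf << (i * 4) for every enabled attachment.
    */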
2679 
2680    /* Create a mask of enabled dynamic states */
2681    uint32_t dynamic_states = 0;
2682    if (pDynamicState) {
2683       uint32_t count = pDynamicState->dynamicStateCount;
2684       for (uint32_t s = 0; s < count; s++) {
2685          dynamic_states |=
2686             v3dv_dynamic_state_mask(pDynamicState->pDynamicStates[s]);
2687       }
2688    }
2689 
2690    /* For any pipeline states that are not dynamic, set the dynamic state
2691     * from the static pipeline state.
2692     */
2693    if (pViewportState) {
2694       if (!(dynamic_states & V3DV_DYNAMIC_VIEWPORT)) {
2695          dynamic->viewport.count = pViewportState->viewportCount;
2696          typed_memcpy(dynamic->viewport.viewports, pViewportState->pViewports,
2697                       pViewportState->viewportCount);
2698 
2699          for (uint32_t i = 0; i < dynamic->viewport.count; i++) {
2700             v3dv_viewport_compute_xform(&dynamic->viewport.viewports[i],
2701                                         dynamic->viewport.scale[i],
2702                                         dynamic->viewport.translate[i]);
2703          }
2704       }
2705 
2706       if (!(dynamic_states & V3DV_DYNAMIC_SCISSOR)) {
2707          dynamic->scissor.count = pViewportState->scissorCount;
2708          typed_memcpy(dynamic->scissor.scissors, pViewportState->pScissors,
2709                       pViewportState->scissorCount);
2710       }
2711    }
2712 
2713    if (pDepthStencilState) {
2714       if (!(dynamic_states & V3DV_DYNAMIC_STENCIL_COMPARE_MASK)) {
2715          dynamic->stencil_compare_mask.front =
2716             pDepthStencilState->front.compareMask;
2717          dynamic->stencil_compare_mask.back =
2718             pDepthStencilState->back.compareMask;
2719       }
2720 
2721       if (!(dynamic_states & V3DV_DYNAMIC_STENCIL_WRITE_MASK)) {
2722          dynamic->stencil_write_mask.front = pDepthStencilState->front.writeMask;
2723          dynamic->stencil_write_mask.back = pDepthStencilState->back.writeMask;
2724       }
2725 
2726       if (!(dynamic_states & V3DV_DYNAMIC_STENCIL_REFERENCE)) {
2727          dynamic->stencil_reference.front = pDepthStencilState->front.reference;
2728          dynamic->stencil_reference.back = pDepthStencilState->back.reference;
2729       }
2730    }
2731 
2732    if (pColorBlendState && !(dynamic_states & V3DV_DYNAMIC_BLEND_CONSTANTS)) {
2733       memcpy(dynamic->blend_constants, pColorBlendState->blendConstants,
2734              sizeof(dynamic->blend_constants));
2735    }
2736 
2737    if (pRasterizationState) {
2738       if (pRasterizationState->depthBiasEnable &&
2739           !(dynamic_states & V3DV_DYNAMIC_DEPTH_BIAS)) {
2740          dynamic->depth_bias.constant_factor =
2741             pRasterizationState->depthBiasConstantFactor;
2742          dynamic->depth_bias.depth_bias_clamp =
2743             pRasterizationState->depthBiasClamp;
2744          dynamic->depth_bias.slope_factor =
2745             pRasterizationState->depthBiasSlopeFactor;
2746       }
2747       if (!(dynamic_states & V3DV_DYNAMIC_LINE_WIDTH))
2748          dynamic->line_width = pRasterizationState->lineWidth;
2749    }
2750 
2751    if (pColorWriteState && !(dynamic_states & V3DV_DYNAMIC_COLOR_WRITE_ENABLE)) {
2752       dynamic->color_write_enable = 0;
2753       for (uint32_t i = 0; i < pColorWriteState->attachmentCount; i++)
2754          dynamic->color_write_enable |= pColorWriteState->pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
2755    }
2756 
2757    pipeline->dynamic_state.mask = dynamic_states;
2758 }
2759 
2760 static bool
2761 stencil_op_is_no_op(const VkStencilOpState *stencil)
2762 {
2763    return stencil->depthFailOp == VK_STENCIL_OP_KEEP &&
2764           stencil->compareOp == VK_COMPARE_OP_ALWAYS;
2765 }
2766 
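/* NOTE (editor): this helper only records whether depth bias is enabled and
 * whether the depth attachment is VK_FORMAT_D16_UNORM; the is_z16 flag is
 * presumably consumed later when emitting depth bias state, since the units
 * of depthBiasConstantFactor depend on the precision of the depth buffer.
 */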
2767 static void
2768 enable_depth_bias(struct v3dv_pipeline *pipeline,
2769                   const VkPipelineRasterizationStateCreateInfo *rs_info)
2770 {
2771    pipeline->depth_bias.enabled = false;
2772    pipeline->depth_bias.is_z16 = false;
2773 
2774    if (!rs_info || !rs_info->depthBiasEnable)
2775       return;
2776 
2777    /* Check the depth/stencil attachment description for the subpass used with
2778     * this pipeline.
2779     */
2780    assert(pipeline->pass && pipeline->subpass);
2781    struct v3dv_render_pass *pass = pipeline->pass;
2782    struct v3dv_subpass *subpass = pipeline->subpass;
2783 
2784    if (subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED)
2785       return;
2786 
2787    assert(subpass->ds_attachment.attachment < pass->attachment_count);
2788    struct v3dv_render_pass_attachment *att =
2789       &pass->attachments[subpass->ds_attachment.attachment];
2790 
2791    if (att->desc.format == VK_FORMAT_D16_UNORM)
2792       pipeline->depth_bias.is_z16 = true;
2793 
2794    pipeline->depth_bias.enabled = true;
2795 }
2796 
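/* NOTE (editor): "EZ" is the early-Z test. It can only be used when the
 * depth compare direction is consistent (LT/LE or GT/GE), so the switch
 * below picks a direction, leaves NEVER/EQUAL as V3D_EZ_UNDECIDED, and
 * disables EZ altogether when stencil ops or fragment shader depth writes
 * could make early depth results invalid.
 */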
2797 static void
2798 pipeline_set_ez_state(struct v3dv_pipeline *pipeline,
2799                       const VkPipelineDepthStencilStateCreateInfo *ds_info)
2800 {
2801    if (!ds_info || !ds_info->depthTestEnable) {
2802       pipeline->ez_state = V3D_EZ_DISABLED;
2803       return;
2804    }
2805 
2806    switch (ds_info->depthCompareOp) {
2807    case VK_COMPARE_OP_LESS:
2808    case VK_COMPARE_OP_LESS_OR_EQUAL:
2809       pipeline->ez_state = V3D_EZ_LT_LE;
2810       break;
2811    case VK_COMPARE_OP_GREATER:
2812    case VK_COMPARE_OP_GREATER_OR_EQUAL:
2813       pipeline->ez_state = V3D_EZ_GT_GE;
2814       break;
2815    case VK_COMPARE_OP_NEVER:
2816    case VK_COMPARE_OP_EQUAL:
2817       pipeline->ez_state = V3D_EZ_UNDECIDED;
2818       break;
2819    default:
2820       pipeline->ez_state = V3D_EZ_DISABLED;
2821       pipeline->incompatible_ez_test = true;
2822       break;
2823    }
2824 
2825    /* If stencil is enabled and is not a no-op, we need to disable EZ */
2826    if (ds_info->stencilTestEnable &&
2827        (!stencil_op_is_no_op(&ds_info->front) ||
2828         !stencil_op_is_no_op(&ds_info->back))) {
2829          pipeline->ez_state = V3D_EZ_DISABLED;
2830    }
2831 
2832    /* If the FS writes Z, its output may not respect the chosen EZ direction, so disable EZ */
2833    struct v3dv_shader_variant *fs_variant =
2834       pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
2835    if (fs_variant && fs_variant->prog_data.fs->writes_z &&
2836        !fs_variant->prog_data.fs->writes_z_from_fep) {
2837       pipeline->ez_state = V3D_EZ_DISABLED;
2838    }
2839 }
2840 
2841 static bool
2842 pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
2843 {
2844    for (uint8_t i = 0; i < pipeline->va_count; i++) {
2845       if (vk_format_is_int(pipeline->va[i].vk_format))
2846          return true;
2847    }
2848    return false;
2849 }
2850 
2851 /* @pipeline can be NULL. We assume in that case that all the attributes have
2852  * a float format (we only create an all-float BO once and we reuse it with
2853  * all float pipelines), otherwise we look at the actual type of each
2854  * attribute used with the specific pipeline passed in.
2855  */
2856 struct v3dv_bo *
2857 v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device,
2858                                               struct v3dv_pipeline *pipeline)
2859 {
2860    uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
2861    struct v3dv_bo *bo;
2862 
2863    bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true);
2864 
2865    if (!bo) {
2866       fprintf(stderr, "failed to allocate memory for the default "
2867               "attribute values\n");
2868       return NULL;
2869    }
2870 
2871    bool ok = v3dv_bo_map(device, bo, size);
2872    if (!ok) {
2873       fprintf(stderr, "failed to map default attribute values buffer\n");
2874       return NULL;
2875    }
2876 
2877    uint32_t *attrs = bo->map;
2878    uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0;
2879    for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
2880       attrs[i * 4 + 0] = 0;
2881       attrs[i * 4 + 1] = 0;
2882       attrs[i * 4 + 2] = 0;
2883       VkFormat attr_format =
2884          pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED;
2885       if (i < va_count && vk_format_is_int(attr_format)) {
2886          attrs[i * 4 + 3] = 1;
2887       } else {
2888          attrs[i * 4 + 3] = fui(1.0);
2889       }
2890    }
2891 
2892    v3dv_bo_unmap(device, bo);
2893 
2894    return bo;
2895 }
2896 
2897 static void
2898 pipeline_set_sample_mask(struct v3dv_pipeline *pipeline,
2899                          const VkPipelineMultisampleStateCreateInfo *ms_info)
2900 {
2901    pipeline->sample_mask = (1 << V3D_MAX_SAMPLES) - 1;
2902 
2903    /* Ignore pSampleMask if we are not enabling multisampling. The hardware
2904     * requires this to be 0xf or 0x0 if using a single sample.
2905     */
2906    if (ms_info && ms_info->pSampleMask &&
2907        ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT) {
2908       pipeline->sample_mask &= ms_info->pSampleMask[0];
2909    }
2910 }
2911 
2912 static void
2913 pipeline_set_sample_rate_shading(struct v3dv_pipeline *pipeline,
2914                                  const VkPipelineMultisampleStateCreateInfo *ms_info)
2915 {
2916    pipeline->sample_rate_shading =
2917       ms_info && ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT &&
2918       ms_info->sampleShadingEnable;
2919 }
2920 
2921 static VkResult
2922 pipeline_init(struct v3dv_pipeline *pipeline,
2923               struct v3dv_device *device,
2924               struct v3dv_pipeline_cache *cache,
2925               const VkGraphicsPipelineCreateInfo *pCreateInfo,
2926               const VkAllocationCallbacks *pAllocator)
2927 {
2928    VkResult result = VK_SUCCESS;
2929 
2930    pipeline->device = device;
2931 
2932    V3DV_FROM_HANDLE(v3dv_pipeline_layout, layout, pCreateInfo->layout);
2933    pipeline->layout = layout;
2934 
2935    V3DV_FROM_HANDLE(v3dv_render_pass, render_pass, pCreateInfo->renderPass);
2936    assert(pCreateInfo->subpass < render_pass->subpass_count);
2937    pipeline->pass = render_pass;
2938    pipeline->subpass = &render_pass->subpasses[pCreateInfo->subpass];
2939 
2940    const VkPipelineInputAssemblyStateCreateInfo *ia_info =
2941       pCreateInfo->pInputAssemblyState;
2942    pipeline->topology = vk_to_pipe_prim_type[ia_info->topology];
2943 
2944    /* If rasterization is not enabled, various CreateInfo structs must be
2945     * ignored.
2946     */
2947    const bool raster_enabled =
2948       !pCreateInfo->pRasterizationState->rasterizerDiscardEnable;
2949 
2950    const VkPipelineViewportStateCreateInfo *vp_info =
2951       raster_enabled ? pCreateInfo->pViewportState : NULL;
2952 
2953    const VkPipelineDepthStencilStateCreateInfo *ds_info =
2954       raster_enabled ? pCreateInfo->pDepthStencilState : NULL;
2955 
2956    const VkPipelineRasterizationStateCreateInfo *rs_info =
2957       raster_enabled ? pCreateInfo->pRasterizationState : NULL;
2958 
2959    const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_info =
2960       rs_info ? vk_find_struct_const(
2961          rs_info->pNext,
2962          PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT) :
2963             NULL;
2964 
2965    const VkPipelineRasterizationLineStateCreateInfoEXT *ls_info =
2966       rs_info ? vk_find_struct_const(
2967          rs_info->pNext,
2968          PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT) :
2969             NULL;
2970 
2971    const VkPipelineColorBlendStateCreateInfo *cb_info =
2972       raster_enabled ? pCreateInfo->pColorBlendState : NULL;
2973 
2974    const VkPipelineMultisampleStateCreateInfo *ms_info =
2975       raster_enabled ? pCreateInfo->pMultisampleState : NULL;
2976 
2977    const VkPipelineColorWriteCreateInfoEXT *cw_info =
2978       cb_info ? vk_find_struct_const(cb_info->pNext,
2979                                      PIPELINE_COLOR_WRITE_CREATE_INFO_EXT) :
2980                 NULL;
2981 
2982    pipeline_init_dynamic_state(pipeline,
2983                                pCreateInfo->pDynamicState,
2984                                vp_info, ds_info, cb_info, rs_info, cw_info);
2985 
2986    /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that
2987     * feature and it shouldn't be used by any pipeline.
2988     */
2989    assert(!ds_info || !ds_info->depthBoundsTestEnable);
2990 
2991    v3dv_X(device, pipeline_pack_state)(pipeline, cb_info, ds_info,
2992                                        rs_info, pv_info, ls_info,
2993                                        ms_info);
2994 
2995    enable_depth_bias(pipeline, rs_info);
2996    pipeline_set_sample_mask(pipeline, ms_info);
2997    pipeline_set_sample_rate_shading(pipeline, ms_info);
2998 
2999    pipeline->primitive_restart =
3000       pCreateInfo->pInputAssemblyState->primitiveRestartEnable;
3001 
3002    result = pipeline_compile_graphics(pipeline, cache, pCreateInfo, pAllocator);
3003 
3004    if (result != VK_SUCCESS) {
3005       /* The caller will destroy the pipeline and we didn't allocate any
3006        * extra info, so there is nothing else to clean up here.
3007        */
3008       return result;
3009    }
3010 
3011    const VkPipelineVertexInputStateCreateInfo *vi_info =
3012       pCreateInfo->pVertexInputState;
3013 
3014    const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info =
3015       vk_find_struct_const(vi_info->pNext,
3016                            PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);
3017 
3018    v3dv_X(device, pipeline_pack_compile_state)(pipeline, vi_info, vd_info);
3019 
3020    if (pipeline_has_integer_vertex_attrib(pipeline)) {
3021       pipeline->default_attribute_values =
3022          v3dv_pipeline_create_default_attribute_values(pipeline->device, pipeline);
3023       if (!pipeline->default_attribute_values)
3024          return VK_ERROR_OUT_OF_DEVICE_MEMORY;
3025    } else {
3026       pipeline->default_attribute_values = NULL;
3027    }
3028 
3029    /* This must be done after the pipeline has been compiled */
3030    pipeline_set_ez_state(pipeline, ds_info);
3031 
3032    return result;
3033 }
3034 
3035 static VkResult
3036 graphics_pipeline_create(VkDevice _device,
3037                          VkPipelineCache _cache,
3038                          const VkGraphicsPipelineCreateInfo *pCreateInfo,
3039                          const VkAllocationCallbacks *pAllocator,
3040                          VkPipeline *pPipeline)
3041 {
3042    V3DV_FROM_HANDLE(v3dv_device, device, _device);
3043    V3DV_FROM_HANDLE(v3dv_pipeline_cache, cache, _cache);
3044 
3045    struct v3dv_pipeline *pipeline;
3046    VkResult result;
3047 
3048    /* Use the default pipeline cache if none is specified */
3049    if (cache == NULL && device->instance->default_pipeline_cache_enabled)
3050       cache = &device->default_pipeline_cache;
3051 
3052    pipeline = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pipeline),
3053                                VK_OBJECT_TYPE_PIPELINE);
3054 
3055    if (pipeline == NULL)
3056       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
3057 
3058    result = pipeline_init(pipeline, device, cache,
3059                           pCreateInfo,
3060                           pAllocator);
3061 
3062    if (result != VK_SUCCESS) {
3063       v3dv_destroy_pipeline(pipeline, device, pAllocator);
3064       if (result == VK_PIPELINE_COMPILE_REQUIRED)
3065          *pPipeline = VK_NULL_HANDLE;
3066       return result;
3067    }
3068 
3069    *pPipeline = v3dv_pipeline_to_handle(pipeline);
3070 
3071    return VK_SUCCESS;
3072 }
3073 
3074 VKAPI_ATTR VkResult VKAPI_CALL
3075 v3dv_CreateGraphicsPipelines(VkDevice _device,
3076                              VkPipelineCache pipelineCache,
3077                              uint32_t count,
3078                              const VkGraphicsPipelineCreateInfo *pCreateInfos,
3079                              const VkAllocationCallbacks *pAllocator,
3080                              VkPipeline *pPipelines)
3081 {
3082    V3DV_FROM_HANDLE(v3dv_device, device, _device);
3083    VkResult result = VK_SUCCESS;
3084 
3085    if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS))
3086       mtx_lock(&device->pdevice->mutex);
3087 
3088    uint32_t i = 0;
3089    for (; i < count; i++) {
3090       VkResult local_result;
3091 
3092       local_result = graphics_pipeline_create(_device,
3093                                               pipelineCache,
3094                                               &pCreateInfos[i],
3095                                               pAllocator,
3096                                               &pPipelines[i]);
3097 
3098       if (local_result != VK_SUCCESS) {
3099          result = local_result;
3100          pPipelines[i] = VK_NULL_HANDLE;
3101 
3102          if (pCreateInfos[i].flags &
3103              VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT)
3104             break;
3105       }
3106    }
3107 
3108    for (; i < count; i++)
3109       pPipelines[i] = VK_NULL_HANDLE;
3110 
3111    if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS))
3112       mtx_unlock(&device->pdevice->mutex);
3113 
3114    return result;
3115 }
3116 
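/* NOTE (editor): reports the size/alignment used for compute shared-memory
 * variables: vectors take comp_size * length bytes, and 3-component vectors
 * are aligned as if they had 4 components (std430-style alignment).
 */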
3117 static void
3118 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
3119 {
3120    assert(glsl_type_is_vector_or_scalar(type));
3121 
3122    uint32_t comp_size = glsl_type_is_boolean(type)
3123       ? 4 : glsl_get_bit_size(type) / 8;
3124    unsigned length = glsl_get_vector_elements(type);
3125    *size = comp_size * length;
3126    *align = comp_size * (length == 3 ? 4 : length);
3127 }
3128 
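/* NOTE (editor): shared (workgroup) variables are lowered to explicit types
 * and then to explicit I/O with a 32-bit byte-offset address format, so the
 * backend can address shared memory as a flat buffer of byte offsets.
 */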
3129 static void
3130 lower_cs_shared(struct nir_shader *nir)
3131 {
3132    NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
3133             nir_var_mem_shared, shared_type_info);
3134    NIR_PASS(_, nir, nir_lower_explicit_io,
3135             nir_var_mem_shared, nir_address_format_32bit_offset);
3136 }
3137 
3138 static VkResult
3139 pipeline_compile_compute(struct v3dv_pipeline *pipeline,
3140                          struct v3dv_pipeline_cache *cache,
3141                          const VkComputePipelineCreateInfo *info,
3142                          const VkAllocationCallbacks *alloc)
3143 {
3144    VkPipelineCreationFeedback pipeline_feedback = {
3145       .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
3146    };
3147    int64_t pipeline_start = os_time_get_nano();
3148 
3149    struct v3dv_device *device = pipeline->device;
3150    struct v3dv_physical_device *physical_device =
3151       &device->instance->physicalDevice;
3152 
3153    const VkPipelineShaderStageCreateInfo *sinfo = &info->stage;
3154    gl_shader_stage stage = vk_to_mesa_shader_stage(sinfo->stage);
3155 
3156    struct v3dv_pipeline_stage *p_stage =
3157       vk_zalloc2(&device->vk.alloc, alloc, sizeof(*p_stage), 8,
3158                  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
3159    if (!p_stage)
3160       return VK_ERROR_OUT_OF_HOST_MEMORY;
3161 
3162    p_stage->program_id = p_atomic_inc_return(&physical_device->next_program_id);
3163    p_stage->pipeline = pipeline;
3164    p_stage->stage = gl_shader_stage_to_broadcom(stage);
3165    p_stage->entrypoint = sinfo->pName;
3166    p_stage->module = vk_shader_module_from_handle(sinfo->module);
3167    p_stage->spec_info = sinfo->pSpecializationInfo;
3168    p_stage->feedback = (VkPipelineCreationFeedback) { 0 };
3169 
3170    vk_pipeline_hash_shader_stage(&info->stage, p_stage->shader_sha1);
3171 
3172    p_stage->nir = NULL;
3173 
3174    pipeline->cs = p_stage;
3175    pipeline->active_stages |= sinfo->stage;
3176 
3177    /* First we try to get the variants from the pipeline cache (unless we are
3178     * required to capture internal representations, since in that case we need
3179     * to compile).
3180     */
3181    bool needs_executable_info =
3182       info->flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
3183    if (!needs_executable_info) {
3184       struct v3dv_pipeline_key pipeline_key;
3185       pipeline_populate_compute_key(pipeline, &pipeline_key, info);
3186       pipeline_hash_compute(pipeline, &pipeline_key, pipeline->sha1);
3187 
3188       bool cache_hit = false;
3189       pipeline->shared_data =
3190          v3dv_pipeline_cache_search_for_pipeline(cache, pipeline->sha1, &cache_hit);
3191 
3192       if (pipeline->shared_data != NULL) {
3193          assert(pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);
3194          if (cache_hit && cache != &pipeline->device->default_pipeline_cache)
3195             pipeline_feedback.flags |=
3196                VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
3197 
3198          goto success;
3199       }
3200    }
3201 
3202    if (info->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT)
3203       return VK_PIPELINE_COMPILE_REQUIRED;
3204 
3205    pipeline->shared_data = v3dv_pipeline_shared_data_new_empty(pipeline->sha1,
3206                                                                pipeline,
3207                                                                false);
3208    if (!pipeline->shared_data)
3209       return VK_ERROR_OUT_OF_HOST_MEMORY;
3210 
3211    p_stage->feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
3212 
3213    /* If not found in the cache, compile it */
3214    p_stage->nir = pipeline_stage_get_nir(p_stage, pipeline, cache);
3215    assert(p_stage->nir);
3216 
3217    nir_optimize(p_stage->nir, false);
3218    pipeline_lower_nir(pipeline, p_stage, pipeline->layout);
3219    lower_cs_shared(p_stage->nir);
3220 
3221    VkResult result = VK_SUCCESS;
3222 
3223    struct v3d_key key;
3224    memset(&key, 0, sizeof(key));
3225    pipeline_populate_v3d_key(&key, p_stage, 0,
3226                              pipeline->device->features.robustBufferAccess);
3227    pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE] =
3228       pipeline_compile_shader_variant(p_stage, &key, sizeof(key),
3229                                       alloc, &result);
3230 
3231    if (result != VK_SUCCESS)
3232       return result;
3233 
3234    if (!upload_assembly(pipeline))
3235       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
3236 
3237    v3dv_pipeline_cache_upload_pipeline(pipeline, cache);
3238 
3239 success:
3240 
3241    pipeline_check_buffer_device_address(pipeline);
3242 
3243    pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
3244    write_creation_feedback(pipeline,
3245                            info->pNext,
3246                            &pipeline_feedback,
3247                            1,
3248                            &info->stage);
3249 
3250    /* Since the variants are stored in pipeline->shared_data, we no longer
3251     * need the pipeline_stages after compiling.
3252     */
3253    if (!needs_executable_info)
3254       pipeline_free_stages(device, pipeline, alloc);
3255 
3256    pipeline_check_spill_size(pipeline);
3257 
3258    return VK_SUCCESS;
3259 }
3260 
3261 static VkResult
3262 compute_pipeline_init(struct v3dv_pipeline *pipeline,
3263                       struct v3dv_device *device,
3264                       struct v3dv_pipeline_cache *cache,
3265                       const VkComputePipelineCreateInfo *info,
3266                       const VkAllocationCallbacks *alloc)
3267 {
3268    V3DV_FROM_HANDLE(v3dv_pipeline_layout, layout, info->layout);
3269 
3270    pipeline->device = device;
3271    pipeline->layout = layout;
3272 
3273    VkResult result = pipeline_compile_compute(pipeline, cache, info, alloc);
3274 
3275    return result;
3276 }
3277 
3278 static VkResult
3279 compute_pipeline_create(VkDevice _device,
3280                          VkPipelineCache _cache,
3281                          const VkComputePipelineCreateInfo *pCreateInfo,
3282                          const VkAllocationCallbacks *pAllocator,
3283                          VkPipeline *pPipeline)
3284 {
3285    V3DV_FROM_HANDLE(v3dv_device, device, _device);
3286    V3DV_FROM_HANDLE(v3dv_pipeline_cache, cache, _cache);
3287 
3288    struct v3dv_pipeline *pipeline;
3289    VkResult result;
3290 
3291    /* Use the default pipeline cache if none is specified */
3292    if (cache == NULL && device->instance->default_pipeline_cache_enabled)
3293       cache = &device->default_pipeline_cache;
3294 
3295    pipeline = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pipeline),
3296                                VK_OBJECT_TYPE_PIPELINE);
3297    if (pipeline == NULL)
3298       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
3299 
3300    result = compute_pipeline_init(pipeline, device, cache,
3301                                   pCreateInfo, pAllocator);
3302    if (result != VK_SUCCESS) {
3303       v3dv_destroy_pipeline(pipeline, device, pAllocator);
3304       if (result == VK_PIPELINE_COMPILE_REQUIRED)
3305          *pPipeline = VK_NULL_HANDLE;
3306       return result;
3307    }
3308 
3309    *pPipeline = v3dv_pipeline_to_handle(pipeline);
3310 
3311    return VK_SUCCESS;
3312 }
3313 
3314 VKAPI_ATTR VkResult VKAPI_CALL
3315 v3dv_CreateComputePipelines(VkDevice _device,
3316                             VkPipelineCache pipelineCache,
3317                             uint32_t createInfoCount,
3318                             const VkComputePipelineCreateInfo *pCreateInfos,
3319                             const VkAllocationCallbacks *pAllocator,
3320                             VkPipeline *pPipelines)
3321 {
3322    V3DV_FROM_HANDLE(v3dv_device, device, _device);
3323    VkResult result = VK_SUCCESS;
3324 
3325    if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS))
3326       mtx_lock(&device->pdevice->mutex);
3327 
3328    uint32_t i = 0;
3329    for (; i < createInfoCount; i++) {
3330       VkResult local_result;
3331       local_result = compute_pipeline_create(_device,
3332                                               pipelineCache,
3333                                               &pCreateInfos[i],
3334                                               pAllocator,
3335                                               &pPipelines[i]);
3336 
3337       if (local_result != VK_SUCCESS) {
3338          result = local_result;
3339          pPipelines[i] = VK_NULL_HANDLE;
3340 
3341          if (pCreateInfos[i].flags &
3342              VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT)
3343             break;
3344       }
3345    }
3346 
3347    for (; i < createInfoCount; i++)
3348       pPipelines[i] = VK_NULL_HANDLE;
3349 
3350    if (unlikely(V3D_DEBUG & V3D_DEBUG_SHADERS))
3351       mtx_unlock(&device->pdevice->mutex);
3352 
3353    return result;
3354 }
3355 
3356 static nir_shader *
3357 pipeline_get_nir(struct v3dv_pipeline *pipeline,
3358                  enum broadcom_shader_stage stage)
3359 {
3360    switch (stage) {
3361    case BROADCOM_SHADER_VERTEX:
3362       if (pipeline->vs)
3363          return pipeline->vs->nir;
3364       break;
3365    case BROADCOM_SHADER_VERTEX_BIN:
3366       if (pipeline->vs_bin)
3367          return pipeline->vs_bin->nir;
3368       break;
3369    case BROADCOM_SHADER_GEOMETRY:
3370       if (pipeline->gs)
3371          return pipeline->gs->nir;
3372       break;
3373    case BROADCOM_SHADER_GEOMETRY_BIN:
3374       if (pipeline->gs_bin)
3375          return pipeline->gs_bin->nir;
3376       break;
3377    case BROADCOM_SHADER_FRAGMENT:
3378       if (pipeline->fs)
3379          return pipeline->fs->nir;
3380       break;
3381    case BROADCOM_SHADER_COMPUTE:
3382       if (pipeline->cs)
3383          return pipeline->cs->nir;
3384       break;
3385    default:
3386       unreachable("Unsupported shader stage");
3387    }
3388 
3389    return NULL;
3390 }
3391 
3392 static struct v3d_prog_data *
3393 pipeline_get_prog_data(struct v3dv_pipeline *pipeline,
3394                        enum broadcom_shader_stage stage)
3395 {
3396    if (pipeline->shared_data->variants[stage])
3397       return pipeline->shared_data->variants[stage]->prog_data.base;
3398    return NULL;
3399 }
3400 
3401 static uint64_t *
3402 pipeline_get_qpu(struct v3dv_pipeline *pipeline,
3403                  enum broadcom_shader_stage stage,
3404                  uint32_t *qpu_size)
3405 {
3406    struct v3dv_shader_variant *variant =
3407       pipeline->shared_data->variants[stage];
3408    if (!variant) {
3409       *qpu_size = 0;
3410       return NULL;
3411    }
3412 
3413    /* We expect the QPU BO to have been mapped before calling here */
3414    struct v3dv_bo *qpu_bo = pipeline->shared_data->assembly_bo;
3415    assert(qpu_bo && qpu_bo->map_size >= variant->assembly_offset +
3416                                         variant->qpu_insts_size);
3417 
3418    *qpu_size = variant->qpu_insts_size;
3419    uint64_t *qpu = (uint64_t *)
3420       (((uint8_t *) qpu_bo->map) + variant->assembly_offset);
3421    return qpu;
3422 }
3423 
3424 /* FIXME: we use the same macro in various drivers, maybe move it to
3425  * the common vk_util.h?
3426  */
3427 #define WRITE_STR(field, ...) ({                                \
3428    memset(field, 0, sizeof(field));                             \
3429    UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \
3430    assert(_i > 0 && _i < sizeof(field));                        \
3431 })
3432 
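/* NOTE (editor): this follows the usual Vulkan two-call idiom for
 * VkPipelineExecutableInternalRepresentationKHR: if pData is NULL we only
 * report the required dataSize; otherwise we copy as much as fits and return
 * false on truncation so the caller can report VK_INCOMPLETE.
 */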
3433 static bool
3434 write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
3435               const char *data)
3436 {
3437    ir->isText = VK_TRUE;
3438 
3439    size_t data_len = strlen(data) + 1;
3440 
3441    if (ir->pData == NULL) {
3442       ir->dataSize = data_len;
3443       return true;
3444    }
3445 
3446    strncpy(ir->pData, data, ir->dataSize);
3447    if (ir->dataSize < data_len)
3448       return false;
3449 
3450    ir->dataSize = data_len;
3451    return true;
3452 }
3453 
3454 static void
3455 append(char **str, size_t *offset, const char *fmt, ...)
3456 {
3457    va_list args;
3458    va_start(args, fmt);
3459    ralloc_vasprintf_rewrite_tail(str, offset, fmt, args);
3460    va_end(args);
3461 }
3462 
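/* NOTE (editor): executable data is collected lazily on the first
 * VK_KHR_pipeline_executable_properties query: for each active stage we keep
 * the final NIR as text plus a disassembly of the QPU code read back from
 * the mapped assembly BO, all allocated from a per-pipeline ralloc context.
 */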
3463 static void
3464 pipeline_collect_executable_data(struct v3dv_pipeline *pipeline)
3465 {
3466    if (pipeline->executables.mem_ctx)
3467       return;
3468 
3469    pipeline->executables.mem_ctx = ralloc_context(NULL);
3470    util_dynarray_init(&pipeline->executables.data,
3471                       pipeline->executables.mem_ctx);
3472 
3473    /* Don't crash for failed/bogus pipelines */
3474    if (!pipeline->shared_data || !pipeline->shared_data->assembly_bo)
3475       return;
3476 
3477    /* Map the assembly BO so we can read the pipeline's QPU code */
3478    struct v3dv_bo *qpu_bo = pipeline->shared_data->assembly_bo;
3479 
3480    if (!v3dv_bo_map(pipeline->device, qpu_bo, qpu_bo->size)) {
3481       fprintf(stderr, "failed to map QPU buffer\n");
3482       return;
3483    }
3484 
3485    for (int s = BROADCOM_SHADER_VERTEX; s <= BROADCOM_SHADER_COMPUTE; s++) {
3486       VkShaderStageFlags vk_stage =
3487          mesa_to_vk_shader_stage(broadcom_shader_stage_to_gl(s));
3488       if (!(vk_stage & pipeline->active_stages))
3489          continue;
3490 
3491       nir_shader *nir = pipeline_get_nir(pipeline, s);
3492       char *nir_str = nir ?
3493          nir_shader_as_str(nir, pipeline->executables.mem_ctx) : NULL;
3494 
3495       char *qpu_str = NULL;
3496       uint32_t qpu_size;
3497       uint64_t *qpu = pipeline_get_qpu(pipeline, s, &qpu_size);
3498       if (qpu) {
3499          uint32_t qpu_inst_count = qpu_size / sizeof(uint64_t);
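         /* NOTE (editor): ~96 bytes per instruction appears to be a generous
          * initial estimate for one line of disassembly; append() can still
          * grow the buffer if a line turns out longer.
          */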
3500          qpu_str = rzalloc_size(pipeline->executables.mem_ctx,
3501                                 qpu_inst_count * 96);
3502          size_t offset = 0;
3503          for (int i = 0; i < qpu_inst_count; i++) {
3504             const char *str = v3d_qpu_disasm(&pipeline->device->devinfo, qpu[i]);
3505             append(&qpu_str, &offset, "%s\n", str);
3506             ralloc_free((void *)str);
3507          }
3508       }
3509 
3510       struct v3dv_pipeline_executable_data data = {
3511          .stage = s,
3512          .nir_str = nir_str,
3513          .qpu_str = qpu_str,
3514       };
3515       util_dynarray_append(&pipeline->executables.data,
3516                            struct v3dv_pipeline_executable_data, data);
3517    }
3518 
3519    v3dv_bo_unmap(pipeline->device, qpu_bo);
3520 }
3521 
3522 static const struct v3dv_pipeline_executable_data *
3523 pipeline_get_executable(struct v3dv_pipeline *pipeline, uint32_t index)
3524 {
3525    assert(index < util_dynarray_num_elements(&pipeline->executables.data,
3526                                              struct v3dv_pipeline_executable_data));
3527    return util_dynarray_element(&pipeline->executables.data,
3528                                 struct v3dv_pipeline_executable_data,
3529                                 index);
3530 }
3531 
3532 VKAPI_ATTR VkResult VKAPI_CALL
3533 v3dv_GetPipelineExecutableInternalRepresentationsKHR(
3534    VkDevice device,
3535    const VkPipelineExecutableInfoKHR *pExecutableInfo,
3536    uint32_t *pInternalRepresentationCount,
3537    VkPipelineExecutableInternalRepresentationKHR *pInternalRepresentations)
3538 {
3539    V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, pExecutableInfo->pipeline);
3540 
3541    pipeline_collect_executable_data(pipeline);
3542 
3543    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
3544                           pInternalRepresentations, pInternalRepresentationCount);
3545 
3546    bool incomplete = false;
3547    const struct v3dv_pipeline_executable_data *exe =
3548       pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
3549 
3550    if (exe->nir_str) {
3551       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
3552                                &out, ir) {
3553          WRITE_STR(ir->name, "NIR (%s)", broadcom_shader_stage_name(exe->stage));
3554          WRITE_STR(ir->description, "Final NIR form");
3555          if (!write_ir_text(ir, exe->nir_str))
3556             incomplete = true;
3557       }
3558    }
3559 
3560    if (exe->qpu_str) {
3561       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
3562                                &out, ir) {
3563          WRITE_STR(ir->name, "QPU (%s)", broadcom_shader_stage_name(exe->stage));
3564          WRITE_STR(ir->description, "Final QPU assembly");
3565          if (!write_ir_text(ir, exe->qpu_str))
3566             incomplete = true;
3567       }
3568    }
3569 
3570    return incomplete ? VK_INCOMPLETE : vk_outarray_status(&out);
3571 }
3572 
3573 VKAPI_ATTR VkResult VKAPI_CALL
3574 v3dv_GetPipelineExecutablePropertiesKHR(
3575    VkDevice device,
3576    const VkPipelineInfoKHR *pPipelineInfo,
3577    uint32_t *pExecutableCount,
3578    VkPipelineExecutablePropertiesKHR *pProperties)
3579 {
3580    V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, pPipelineInfo->pipeline);
3581 
3582    pipeline_collect_executable_data(pipeline);
3583 
3584    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
3585                           pProperties, pExecutableCount);
3586 
3587    util_dynarray_foreach(&pipeline->executables.data,
3588                          struct v3dv_pipeline_executable_data, exe) {
3589       vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
3590          gl_shader_stage mesa_stage = broadcom_shader_stage_to_gl(exe->stage);
3591          props->stages = mesa_to_vk_shader_stage(mesa_stage);
3592 
3593          WRITE_STR(props->name, "%s (%s)",
3594                    _mesa_shader_stage_to_abbrev(mesa_stage),
3595                    broadcom_shader_stage_is_binning(exe->stage) ?
3596                      "Binning" : "Render");
3597 
3598          WRITE_STR(props->description, "%s",
3599                    _mesa_shader_stage_to_string(mesa_stage));
3600 
3601          props->subgroupSize = V3D_CHANNELS;
3602       }
3603    }
3604 
3605    return vk_outarray_status(&out);
3606 }
3607 
3608 VKAPI_ATTR VkResult VKAPI_CALL
3609 v3dv_GetPipelineExecutableStatisticsKHR(
3610    VkDevice device,
3611    const VkPipelineExecutableInfoKHR *pExecutableInfo,
3612    uint32_t *pStatisticCount,
3613    VkPipelineExecutableStatisticKHR *pStatistics)
3614 {
3615    V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, pExecutableInfo->pipeline);
3616 
3617    pipeline_collect_executable_data(pipeline);
3618 
3619    const struct v3dv_pipeline_executable_data *exe =
3620       pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
3621 
3622    struct v3d_prog_data *prog_data =
3623       pipeline_get_prog_data(pipeline, exe->stage);
3624 
3625    struct v3dv_shader_variant *variant =
3626       pipeline->shared_data->variants[exe->stage];
3627    uint32_t qpu_inst_count = variant->qpu_insts_size / sizeof(uint64_t);
3628 
3629    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
3630                           pStatistics, pStatisticCount);
3631 
3632    if (qpu_inst_count > 0) {
3633       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
3634          WRITE_STR(stat->name, "Compile Strategy");
3635          WRITE_STR(stat->description, "Chosen compile strategy index");
3636          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3637          stat->value.u64 = prog_data->compile_strategy_idx;
3638       }
3639 
3640       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
3641          WRITE_STR(stat->name, "Instruction Count");
3642          WRITE_STR(stat->description, "Number of QPU instructions");
3643          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3644          stat->value.u64 = qpu_inst_count;
3645       }
3646 
3647       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
3648          WRITE_STR(stat->name, "Thread Count");
3649          WRITE_STR(stat->description, "Number of QPU threads dispatched");
3650          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3651          stat->value.u64 = prog_data->threads;
3652       }
3653 
3654       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
3655          WRITE_STR(stat->name, "Spill Size");
3656          WRITE_STR(stat->description, "Size of the spill buffer in bytes");
3657          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3658          stat->value.u64 = prog_data->spill_size;
3659       }
3660 
3661       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
3662          WRITE_STR(stat->name, "TMU Spills");
3663          WRITE_STR(stat->description, "Number of times a register was spilled "
3664                                       "to memory");
3665          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3666          stat->value.u64 = prog_data->tmu_spills;
3667       }
3668 
3669       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
3670          WRITE_STR(stat->name, "TMU Fills");
3671          WRITE_STR(stat->description, "Number of times a register was filled "
3672                                       "from memory");
3673          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3674          stat->value.u64 = prog_data->tmu_fills;
3675       }
3676 
3677       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
3678          WRITE_STR(stat->name, "QPU Read Stalls");
3679          WRITE_STR(stat->description, "Number of cycles the QPU stalls for a "
3680                                       "register read dependency");
3681          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3682          stat->value.u64 = prog_data->qpu_read_stalls;
3683       }
3684    }
3685 
3686    return vk_outarray_status(&out);
3687 }
3688