1 /*
2  * Copyright © 2019 Raspberry Pi Ltd
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "vk_util.h"
25 
26 #include "v3dv_private.h"
27 
28 #include "common/v3d_debug.h"
29 #include "qpu/qpu_disasm.h"
30 
31 #include "compiler/nir/nir_builder.h"
32 #include "nir/nir_serialize.h"
33 
34 #include "util/u_atomic.h"
35 #include "util/os_time.h"
36 #include "util/perf/cpu_trace.h"
37 
38 #include "vk_format.h"
39 #include "vk_nir_convert_ycbcr.h"
40 #include "vk_pipeline.h"
41 
42 static VkResult
43 compute_vpm_config(struct v3dv_pipeline *pipeline);
44 
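/* Hashes the stage (its NIR-backed module handle, entrypoint and shader
 * stage) into p_stage->shader_sha1 using the common
 * vk_pipeline_hash_shader_stage() helper.
 */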
45 static void
46 pipeline_compute_sha1_from_nir(struct v3dv_pipeline_stage *p_stage)
47 {
48    VkPipelineShaderStageCreateInfo info = {
49       .module = vk_shader_module_handle_from_nir(p_stage->nir),
50       .pName = p_stage->entrypoint,
51       .stage = mesa_to_vk_shader_stage(p_stage->nir->info.stage),
52    };
53 
54    vk_pipeline_hash_shader_stage(0, &info, NULL, p_stage->shader_sha1);
55 }
56 
57 void
58 v3dv_shader_variant_destroy(struct v3dv_device *device,
59                             struct v3dv_shader_variant *variant)
60 {
61    /* The assembly BO is shared by all variants in the pipeline, so it can't
62     * be freed here and should be freed with the pipeline
63     */
64    if (variant->qpu_insts) {
65       free(variant->qpu_insts);
66       variant->qpu_insts = NULL;
67    }
68    ralloc_free(variant->prog_data.base);
69    vk_free(&device->vk.alloc, variant);
70 }
71 
72 static void
73 destroy_pipeline_stage(struct v3dv_device *device,
74                        struct v3dv_pipeline_stage *p_stage,
75                        const VkAllocationCallbacks *pAllocator)
76 {
77    if (!p_stage)
78       return;
79 
80    ralloc_free(p_stage->nir);
81    vk_free2(&device->vk.alloc, pAllocator, p_stage);
82 }
83 
84 static void
85 pipeline_free_stages(struct v3dv_device *device,
86                      struct v3dv_pipeline *pipeline,
87                      const VkAllocationCallbacks *pAllocator)
88 {
89    assert(pipeline);
90 
91    for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
92       destroy_pipeline_stage(device, pipeline->stages[stage], pAllocator);
93       pipeline->stages[stage] = NULL;
94    }
95 }
96 
97 static void
98 v3dv_destroy_pipeline(struct v3dv_pipeline *pipeline,
99                       struct v3dv_device *device,
100                       const VkAllocationCallbacks *pAllocator)
101 {
102    if (!pipeline)
103       return;
104 
105    pipeline_free_stages(device, pipeline, pAllocator);
106 
107    if (pipeline->shared_data) {
108       v3dv_pipeline_shared_data_unref(device, pipeline->shared_data);
109       pipeline->shared_data = NULL;
110    }
111 
112    if (pipeline->spill.bo) {
113       assert(pipeline->spill.size_per_thread > 0);
114       v3dv_bo_free(device, pipeline->spill.bo);
115    }
116 
117    if (pipeline->default_attribute_values) {
118       v3dv_bo_free(device, pipeline->default_attribute_values);
119       pipeline->default_attribute_values = NULL;
120    }
121 
122    if (pipeline->executables.mem_ctx)
123       ralloc_free(pipeline->executables.mem_ctx);
124 
125    if (pipeline->layout)
126       v3dv_pipeline_layout_unref(device, pipeline->layout, pAllocator);
127 
128    vk_object_free(&device->vk, pAllocator, pipeline);
129 }
130 
131 VKAPI_ATTR void VKAPI_CALL
132 v3dv_DestroyPipeline(VkDevice _device,
133                      VkPipeline _pipeline,
134                      const VkAllocationCallbacks *pAllocator)
135 {
136    V3DV_FROM_HANDLE(v3dv_device, device, _device);
137    V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, _pipeline);
138 
139    if (!pipeline)
140       return;
141 
142    v3dv_destroy_pipeline(pipeline, device, pAllocator);
143 }
144 
145 static const struct spirv_to_nir_options default_spirv_options =  {
146    .ubo_addr_format = nir_address_format_32bit_index_offset,
147    .ssbo_addr_format = nir_address_format_32bit_index_offset,
148    .phys_ssbo_addr_format = nir_address_format_2x32bit_global,
149    .push_const_addr_format = nir_address_format_logical,
150    .shared_addr_format = nir_address_format_32bit_offset,
151 };
152 
153 const nir_shader_compiler_options *
154 v3dv_pipeline_get_nir_options(const struct v3d_device_info *devinfo)
155 {
156    static bool initialized = false;
157    static nir_shader_compiler_options options = {
158       .lower_uadd_sat = true,
159       .lower_usub_sat = true,
160       .lower_iadd_sat = true,
161       .lower_all_io_to_temps = true,
162       .lower_extract_byte = true,
163       .lower_extract_word = true,
164       .lower_insert_byte = true,
165       .lower_insert_word = true,
166       .lower_bitfield_insert = true,
167       .lower_bitfield_extract = true,
168       .lower_bitfield_reverse = true,
169       .lower_bit_count = true,
170       .lower_cs_local_id_to_index = true,
171       .lower_ffract = true,
172       .lower_fmod = true,
173       .lower_pack_unorm_2x16 = true,
174       .lower_pack_snorm_2x16 = true,
175       .lower_unpack_unorm_2x16 = true,
176       .lower_unpack_snorm_2x16 = true,
177       .lower_pack_unorm_4x8 = true,
178       .lower_pack_snorm_4x8 = true,
179       .lower_unpack_unorm_4x8 = true,
180       .lower_unpack_snorm_4x8 = true,
181       .lower_pack_half_2x16 = true,
182       .lower_unpack_half_2x16 = true,
183       .lower_pack_32_2x16 = true,
184       .lower_pack_32_2x16_split = true,
185       .lower_unpack_32_2x16_split = true,
186       .lower_mul_2x32_64 = true,
187       .lower_fdiv = true,
188       .lower_find_lsb = true,
189       .lower_ffma16 = true,
190       .lower_ffma32 = true,
191       .lower_ffma64 = true,
192       .lower_flrp32 = true,
193       .lower_fpow = true,
194       .lower_fsqrt = true,
195       .lower_ifind_msb = true,
196       .lower_isign = true,
197       .lower_ldexp = true,
198       .lower_mul_high = true,
199       .lower_wpos_pntc = false,
200       .lower_to_scalar = true,
201       .lower_device_index_to_zero = true,
202       .lower_fquantize2f16 = true,
203       .lower_ufind_msb = true,
204       .has_fsub = true,
205       .has_isub = true,
206       .has_uclz = true,
207       .vertex_id_zero_based = false, /* FIXME: to set this to true, the intrinsic
208                                       * needs to be supported */
209       .lower_interpolate_at = true,
210       .max_unroll_iterations = 16,
211       .force_indirect_unrolling = (nir_var_shader_in | nir_var_function_temp),
212       .divergence_analysis_options =
213          nir_divergence_multiple_workgroup_per_compute_subgroup,
214       .discard_is_demote = true,
215       .scalarize_ddx = true,
216    };
217 
218    if (!initialized) {
219       options.lower_fsat = devinfo->ver < 71;
220       initialized = true;
221    }
222 
223    return &options;
224 }
225 
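/* Returns the YCbCr conversion state of the immutable sampler at
 * (set, binding, array_index) in the given pipeline layout, or NULL if the
 * binding has no immutable samplers or no conversion attached.
 */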
226 static const struct vk_ycbcr_conversion_state *
227 lookup_ycbcr_conversion(const void *_pipeline_layout, uint32_t set,
228                         uint32_t binding, uint32_t array_index)
229 {
230    struct v3dv_pipeline_layout *pipeline_layout =
231       (struct v3dv_pipeline_layout *) _pipeline_layout;
232 
233    assert(set < pipeline_layout->num_sets);
234    struct v3dv_descriptor_set_layout *set_layout =
235       pipeline_layout->set[set].layout;
236 
237    assert(binding < set_layout->binding_count);
238    struct v3dv_descriptor_set_binding_layout *bind_layout =
239       &set_layout->binding[binding];
240 
241    if (bind_layout->immutable_samplers_offset) {
242       const struct v3dv_sampler *immutable_samplers =
243          v3dv_immutable_samplers(set_layout, bind_layout);
244       const struct v3dv_sampler *sampler = &immutable_samplers[array_index];
245       return sampler->conversion ? &sampler->conversion->state : NULL;
246    } else {
247       return NULL;
248    }
249 }
250 
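/* Stage-independent NIR lowering and optimization run on every shader right
 * after SPIR-V translation, before any pipeline-specific lowering.
 */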
251 static void
252 preprocess_nir(nir_shader *nir)
253 {
254    const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = {
255       .frag_coord = true,
256       .point_coord = true,
257    };
258    NIR_PASS(_, nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings);
259 
260    /* Vulkan uses the separate-shader linking model */
261    nir->info.separate_shader = true;
262 
263    /* Make sure we lower variable initializers on output variables so that
264     * nir_remove_dead_variables below sees the corresponding stores
265     */
266    NIR_PASS(_, nir, nir_lower_variable_initializers, nir_var_shader_out);
267 
268    if (nir->info.stage == MESA_SHADER_FRAGMENT)
269       NIR_PASS(_, nir, nir_lower_io_to_vector, nir_var_shader_out);
270    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
271       NIR_PASS(_, nir, nir_lower_input_attachments,
272                  &(nir_input_attachment_options) {
273                     .use_fragcoord_sysval = false,
274                        });
275    }
276 
277    NIR_PASS_V(nir, nir_lower_io_to_temporaries,
278               nir_shader_get_entrypoint(nir), true, false);
279 
280    NIR_PASS(_, nir, nir_lower_system_values);
281 
282    NIR_PASS(_, nir, nir_lower_alu_to_scalar, NULL, NULL);
283 
284    NIR_PASS(_, nir, nir_normalize_cubemap_coords);
285 
286    NIR_PASS(_, nir, nir_lower_global_vars_to_local);
287 
288    NIR_PASS(_, nir, nir_split_var_copies);
289    NIR_PASS(_, nir, nir_split_struct_vars, nir_var_function_temp);
290 
291    v3d_optimize_nir(NULL, nir);
292 
293    NIR_PASS(_, nir, nir_lower_explicit_io,
294             nir_var_mem_push_const,
295             nir_address_format_32bit_offset);
296 
297    NIR_PASS(_, nir, nir_lower_explicit_io,
298             nir_var_mem_ubo | nir_var_mem_ssbo,
299             nir_address_format_32bit_index_offset);
300 
301    NIR_PASS(_, nir, nir_lower_explicit_io,
302             nir_var_mem_global,
303             nir_address_format_2x32bit_global);
304 
305    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
306 
307    /* Lower a bunch of stuff */
308    NIR_PASS(_, nir, nir_lower_var_copies);
309 
310    NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_shader_in, UINT32_MAX);
311 
312    NIR_PASS(_, nir, nir_lower_indirect_derefs,
313             nir_var_function_temp, 2);
314 
315    NIR_PASS(_, nir, nir_lower_array_deref_of_vec,
316             nir_var_mem_ubo | nir_var_mem_ssbo, NULL,
317             nir_lower_direct_array_deref_of_vec_load);
318 
319    NIR_PASS(_, nir, nir_lower_frexp);
320 
321    /* Get rid of split copies */
322    v3d_optimize_nir(NULL, nir);
323 }
324 
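/* Translates the stage's SPIR-V module (or internal NIR shader) into NIR
 * through vk_pipeline_shader_stage_to_nir() and runs preprocess_nir() on the
 * result. Returns NULL on failure.
 */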
325 static nir_shader *
326 shader_module_compile_to_nir(struct v3dv_device *device,
327                              struct v3dv_pipeline_stage *stage)
328 {
329    assert(stage->module || stage->module_info);
330 
331    nir_shader *nir;
332    const nir_shader_compiler_options *nir_options =
333       v3dv_pipeline_get_nir_options(&device->devinfo);
334 
335    gl_shader_stage gl_stage = broadcom_shader_stage_to_gl(stage->stage);
336 
337    const VkPipelineShaderStageCreateInfo stage_info = {
338       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
339       .pNext = !stage->module ? stage->module_info : NULL,
340       .stage = mesa_to_vk_shader_stage(gl_stage),
341       .module = vk_shader_module_to_handle((struct vk_shader_module *)stage->module),
342       .pName = stage->entrypoint,
343       .pSpecializationInfo = stage->spec_info,
344    };
345 
346    /* vk_pipeline_shader_stage_to_nir also handles internal shaders when
347     * module->nir != NULL. It also calls nir_validate_shader in both cases
348     * so we don't have to call it here.
349     */
350    VkResult result = vk_pipeline_shader_stage_to_nir(&device->vk,
351                                                      stage->pipeline->flags,
352                                                      &stage_info,
353                                                      &default_spirv_options,
354                                                      nir_options,
355                                                      NULL, &nir);
356    if (result != VK_SUCCESS)
357       return NULL;
358    assert(nir->info.stage == gl_stage);
359 
360    if (V3D_DBG(SHADERDB) && (!stage->module || stage->module->nir == NULL)) {
361       char sha1buf[41];
362       _mesa_sha1_format(sha1buf, stage->pipeline->sha1);
363       nir->info.name = ralloc_strdup(nir, sha1buf);
364    }
365 
366    if (V3D_DBG(NIR) || v3d_debug_flag_for_shader_stage(gl_stage)) {
367       fprintf(stderr, "NIR after vk_pipeline_shader_stage_to_nir: %s prog %d NIR:\n",
368               broadcom_shader_stage_name(stage->stage),
369               stage->program_id);
370       nir_print_shader(nir, stderr);
371       fprintf(stderr, "\n");
372    }
373 
374    preprocess_nir(nir);
375 
376    return nir;
377 }
378 
379 static int
380 type_size_vec4(const struct glsl_type *type, bool bindless)
381 {
382    return glsl_count_attribute_slots(type, false);
383 }
384 
385 /* FIXME: the number of parameters for this method is somewhat big. Perhaps
386  * rethink.
387  */
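/* Returns the map slot for (set, binding, array_index, plane), reusing an
 * existing entry when possible (promoting its return_size to 32 if callers
 * disagree) or claiming the first free slot at or after start_index.
 */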
388 static unsigned
389 descriptor_map_add(struct v3dv_descriptor_map *map,
390                    int set,
391                    int binding,
392                    int array_index,
393                    int array_size,
394                    int start_index,
395                    uint8_t return_size,
396                    uint8_t plane)
397 {
398    assert(array_index < array_size);
399    assert(return_size == 16 || return_size == 32);
400 
401    unsigned index = start_index;
402    for (; index < map->num_desc; index++) {
403       if (map->used[index] &&
404           set == map->set[index] &&
405           binding == map->binding[index] &&
406           array_index == map->array_index[index] &&
407           plane == map->plane[index]) {
408          assert(array_size == map->array_size[index]);
409          if (return_size != map->return_size[index]) {
410             /* If the return_size is different it means that the same sampler
411              * was used for operations with different precision
412              * requirements. In this case we need to ensure that we use the
413              * larger one.
414              */
415             map->return_size[index] = 32;
416          }
417          return index;
418       } else if (!map->used[index]) {
419          break;
420       }
421    }
422 
423    assert(index < DESCRIPTOR_MAP_SIZE);
424    assert(!map->used[index]);
425 
426    map->used[index] = true;
427    map->set[index] = set;
428    map->binding[index] = binding;
429    map->array_index[index] = array_index;
430    map->array_size[index] = array_size;
431    map->return_size[index] = return_size;
432    map->plane[index] = plane;
433    map->num_desc = MAX2(map->num_desc, index + 1);
434 
435    return index;
436 }
437 
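/* State shared by the pipeline layout lowering pass below: the pipeline and
 * layout being lowered, plus a flag recording whether any texturing operation
 * without a sampler was found and therefore needs a default sampler state.
 */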
438 struct lower_pipeline_layout_state {
439    struct v3dv_pipeline *pipeline;
440    const struct v3dv_pipeline_layout *layout;
441    bool needs_default_sampler_state;
442 };
443 
444 
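/* Push constant loads are rewritten as plain uniform loads: the backend
 * sources push constant data from the uniform stream like any other uniform.
 */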
445 static void
446 lower_load_push_constant(nir_builder *b, nir_intrinsic_instr *instr,
447                          struct lower_pipeline_layout_state *state)
448 {
449    assert(instr->intrinsic == nir_intrinsic_load_push_constant);
450    instr->intrinsic = nir_intrinsic_load_uniform;
451 }
452 
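/* Selects which of the stage's descriptor maps (sampler, texture, UBO or
 * SSBO) backs a given descriptor type. Combined image/samplers use is_sampler
 * to pick between the sampler and texture maps.
 */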
453 static struct v3dv_descriptor_map*
454 pipeline_get_descriptor_map(struct v3dv_pipeline *pipeline,
455                             VkDescriptorType desc_type,
456                             gl_shader_stage gl_stage,
457                             bool is_sampler)
458 {
459    enum broadcom_shader_stage broadcom_stage =
460       gl_shader_stage_to_broadcom(gl_stage);
461 
462    assert(pipeline->shared_data &&
463           pipeline->shared_data->maps[broadcom_stage]);
464 
465    switch(desc_type) {
466    case VK_DESCRIPTOR_TYPE_SAMPLER:
467       return &pipeline->shared_data->maps[broadcom_stage]->sampler_map;
468    case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
469    case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
470    case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
471    case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
472    case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
473       return &pipeline->shared_data->maps[broadcom_stage]->texture_map;
474    case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
475       return is_sampler ?
476          &pipeline->shared_data->maps[broadcom_stage]->sampler_map :
477          &pipeline->shared_data->maps[broadcom_stage]->texture_map;
478    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
479    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
480    case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
481       return &pipeline->shared_data->maps[broadcom_stage]->ubo_map;
482    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
483    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
484       return &pipeline->shared_data->maps[broadcom_stage]->ssbo_map;
485    default:
486       unreachable("Descriptor type unknown or not having a descriptor map");
487    }
488 }
489 
490 /* Gathers info from the intrinsic (set and binding) and then lowers it so it
491  * can be used by the v3d_compiler */
492 static void
493 lower_vulkan_resource_index(nir_builder *b,
494                             nir_intrinsic_instr *instr,
495                             struct lower_pipeline_layout_state *state)
496 {
497    assert(instr->intrinsic == nir_intrinsic_vulkan_resource_index);
498 
499    nir_const_value *const_val = nir_src_as_const_value(instr->src[0]);
500 
501    unsigned set = nir_intrinsic_desc_set(instr);
502    unsigned binding = nir_intrinsic_binding(instr);
503    struct v3dv_descriptor_set_layout *set_layout = state->layout->set[set].layout;
504    struct v3dv_descriptor_set_binding_layout *binding_layout =
505       &set_layout->binding[binding];
506    unsigned index = 0;
507 
508    switch (binding_layout->type) {
509    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
510    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
511    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
512    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
513    case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: {
514       struct v3dv_descriptor_map *descriptor_map =
515          pipeline_get_descriptor_map(state->pipeline, binding_layout->type,
516                                      b->shader->info.stage, false);
517 
518       if (!const_val)
519          unreachable("non-constant vulkan_resource_index array index");
520 
521       /* At compile-time we will need to know if we are processing a UBO load
522        * for an inline or a regular UBO so we can handle inline loads like
523        * push constants. At the NIR level, however, the inline
524        * information is gone, so we rely on the index to make this distinction.
525        * Particularly, we reserve indices 1..MAX_INLINE_UNIFORM_BUFFERS for
526        * inline buffers. This means that at the descriptor map level
527        * we store inline buffers at slots 0..MAX_INLINE_UNIFORM_BUFFERS - 1,
528        * and regular UBOs at indices starting from MAX_INLINE_UNIFORM_BUFFERS.
529        */
530       uint32_t start_index = 0;
531       if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
532           binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) {
533          start_index += MAX_INLINE_UNIFORM_BUFFERS;
534       }
535 
536       index = descriptor_map_add(descriptor_map, set, binding,
537                                  const_val->u32,
538                                  binding_layout->array_size,
539                                  start_index,
540                                  32 /* return_size: doesn't really apply for this case */,
541                                  0);
542       break;
543    }
544 
545    default:
546       unreachable("unsupported descriptor type for vulkan_resource_index");
547       break;
548    }
549 
550    /* Since we use the deref pass, both vulkan_resource_index and
551     * vulkan_load_descriptor return a vec2 providing an index and
552     * offset. Our backend compiler only cares about the index part.
553     */
554    nir_def_replace(&instr->def, nir_imm_ivec2(b, index, 0));
555 }
556 
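/* YCbCr lowering may add a plane source to texture instructions. Read it
 * (defaulting to 0 if absent) and strip it from the instruction, since the
 * plane is recorded in the descriptor map entry instead.
 */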
557 static uint8_t
558 tex_instr_get_and_remove_plane_src(nir_tex_instr *tex)
559 {
560    int plane_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_plane);
561    if (plane_src_idx < 0)
562        return 0;
563 
564    uint8_t plane = nir_src_as_uint(tex->src[plane_src_idx].src);
565    nir_tex_instr_remove_src(tex, plane_src_idx);
566    return plane;
567 }
568 
569 /* Returns the return_size, so it can be used in the case where there is no
570  * sampler object
571  */
572 static uint8_t
573 lower_tex_src(nir_builder *b,
574               nir_tex_instr *instr,
575               unsigned src_idx,
576               struct lower_pipeline_layout_state *state)
577 {
578    nir_def *index = NULL;
579    unsigned base_index = 0;
580    unsigned array_elements = 1;
581    nir_tex_src *src = &instr->src[src_idx];
582    bool is_sampler = src->src_type == nir_tex_src_sampler_deref;
583 
584    uint8_t plane = tex_instr_get_and_remove_plane_src(instr);
585 
586    /* We first compute the offsets */
587    nir_deref_instr *deref = nir_instr_as_deref(src->src.ssa->parent_instr);
588    while (deref->deref_type != nir_deref_type_var) {
589       nir_deref_instr *parent =
590          nir_instr_as_deref(deref->parent.ssa->parent_instr);
591 
592       assert(deref->deref_type == nir_deref_type_array);
593 
594       if (nir_src_is_const(deref->arr.index) && index == NULL) {
595          /* We're still building a direct index */
596          base_index += nir_src_as_uint(deref->arr.index) * array_elements;
597       } else {
598          if (index == NULL) {
599             /* We used to be direct but not anymore */
600             index = nir_imm_int(b, base_index);
601             base_index = 0;
602          }
603 
604          index = nir_iadd(b, index,
605                           nir_imul_imm(b, deref->arr.index.ssa,
606                                        array_elements));
607       }
608 
609       array_elements *= glsl_get_length(parent->type);
610 
611       deref = parent;
612    }
613 
614    if (index)
615       index = nir_umin(b, index, nir_imm_int(b, array_elements - 1));
616 
617    /* We have the offsets, we apply them, rewriting the source or removing
618     * instr if needed
619     */
620    if (index) {
621       nir_src_rewrite(&src->src, index);
622 
623       src->src_type = is_sampler ?
624          nir_tex_src_sampler_offset :
625          nir_tex_src_texture_offset;
626    } else {
627       nir_tex_instr_remove_src(instr, src_idx);
628    }
629 
630    uint32_t set = deref->var->data.descriptor_set;
631    uint32_t binding = deref->var->data.binding;
632    /* FIXME: this is a really simplified check for the precision to be used
633     * for the sampling. Right now we are only checking the variables used
634     * in the operation itself, but there are other cases that we could use to
635     * infer the precision requirement.
636     */
637    bool relaxed_precision = deref->var->data.precision == GLSL_PRECISION_MEDIUM ||
638                             deref->var->data.precision == GLSL_PRECISION_LOW;
639    struct v3dv_descriptor_set_layout *set_layout = state->layout->set[set].layout;
640    struct v3dv_descriptor_set_binding_layout *binding_layout =
641       &set_layout->binding[binding];
642 
643    uint8_t return_size;
644    if (V3D_DBG(TMU_16BIT))
645       return_size = 16;
646    else if (V3D_DBG(TMU_32BIT))
647       return_size = 32;
648    else
649       return_size = relaxed_precision ? 16 : 32;
650 
651    struct v3dv_descriptor_map *map =
652       pipeline_get_descriptor_map(state->pipeline, binding_layout->type,
653                                   b->shader->info.stage, is_sampler);
654    int desc_index =
655       descriptor_map_add(map,
656                          deref->var->data.descriptor_set,
657                          deref->var->data.binding,
658                          base_index,
659                          binding_layout->array_size,
660                          0,
661                          return_size,
662                          plane);
663 
664    if (is_sampler)
665       instr->sampler_index = desc_index;
666    else
667       instr->texture_index = desc_index;
668 
669    return return_size;
670 }
671 
672 static bool
673 lower_sampler(nir_builder *b,
674               nir_tex_instr *instr,
675               struct lower_pipeline_layout_state *state)
676 {
677    uint8_t return_size = 0;
678 
679    int texture_idx =
680       nir_tex_instr_src_index(instr, nir_tex_src_texture_deref);
681 
682    if (texture_idx >= 0)
683       return_size = lower_tex_src(b, instr, texture_idx, state);
684 
685    int sampler_idx =
686       nir_tex_instr_src_index(instr, nir_tex_src_sampler_deref);
687 
688    if (sampler_idx >= 0) {
689       assert(nir_tex_instr_need_sampler(instr));
690       lower_tex_src(b, instr, sampler_idx, state);
691    }
692 
693    if (texture_idx < 0 && sampler_idx < 0)
694       return false;
695 
696    /* If the instruction doesn't have a sampler (e.g. txf) we use backend_flags
697     * to bind a default sampler state to configure precision.
698     */
699    if (sampler_idx < 0) {
700       state->needs_default_sampler_state = true;
701       instr->backend_flags = return_size == 16 ?
702          V3DV_NO_SAMPLER_16BIT_IDX : V3DV_NO_SAMPLER_32BIT_IDX;
703    }
704 
705    return true;
706 }
707 
708 /* FIXME: really similar to lower_tex_src, perhaps refactor? */
709 static void
710 lower_image_deref(nir_builder *b,
711                   nir_intrinsic_instr *instr,
712                   struct lower_pipeline_layout_state *state)
713 {
714    nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
715    nir_def *index = NULL;
716    unsigned array_elements = 1;
717    unsigned base_index = 0;
718 
719    while (deref->deref_type != nir_deref_type_var) {
720       nir_deref_instr *parent =
721          nir_instr_as_deref(deref->parent.ssa->parent_instr);
722 
723       assert(deref->deref_type == nir_deref_type_array);
724 
725       if (nir_src_is_const(deref->arr.index) && index == NULL) {
726          /* We're still building a direct index */
727          base_index += nir_src_as_uint(deref->arr.index) * array_elements;
728       } else {
729          if (index == NULL) {
730             /* We used to be direct but not anymore */
731             index = nir_imm_int(b, base_index);
732             base_index = 0;
733          }
734 
735          index = nir_iadd(b, index,
736                           nir_imul_imm(b, deref->arr.index.ssa,
737                                        array_elements));
738       }
739 
740       array_elements *= glsl_get_length(parent->type);
741 
742       deref = parent;
743    }
744 
745    if (index)
746       nir_umin(b, index, nir_imm_int(b, array_elements - 1));
747 
748    uint32_t set = deref->var->data.descriptor_set;
749    uint32_t binding = deref->var->data.binding;
750    struct v3dv_descriptor_set_layout *set_layout = state->layout->set[set].layout;
751    struct v3dv_descriptor_set_binding_layout *binding_layout =
752       &set_layout->binding[binding];
753 
754    assert(binding_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE ||
755           binding_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER);
756 
757    struct v3dv_descriptor_map *map =
758       pipeline_get_descriptor_map(state->pipeline, binding_layout->type,
759                                   b->shader->info.stage, false);
760 
761    int desc_index =
762       descriptor_map_add(map,
763                          deref->var->data.descriptor_set,
764                          deref->var->data.binding,
765                          base_index,
766                          binding_layout->array_size,
767                          0,
768                          32 /* return_size: doesn't apply for textures */,
769                          0);
770 
771    /* Note: we don't need to do anything here in relation to the precision and
772     * the output size because for images we can infer that info from the image
773     * intrinsic, which includes the image format (see
774     * NIR_INTRINSIC_FORMAT). That is done by the v3d compiler.
775     */
776 
777    index = nir_imm_int(b, desc_index);
778 
779    nir_rewrite_image_intrinsic(instr, index, false);
780 }
781 
782 static bool
783 lower_intrinsic(nir_builder *b,
784                 nir_intrinsic_instr *instr,
785                 struct lower_pipeline_layout_state *state)
786 {
787    switch (instr->intrinsic) {
788    case nir_intrinsic_load_push_constant:
789       lower_load_push_constant(b, instr, state);
790       return true;
791 
792    case nir_intrinsic_vulkan_resource_index:
793       lower_vulkan_resource_index(b, instr, state);
794       return true;
795 
796    case nir_intrinsic_load_vulkan_descriptor: {
797       /* Loading the descriptor happens as part of load/store instructions,
798        * so for us this is a no-op.
799        */
800       nir_def_replace(&instr->def, instr->src[0].ssa);
801       return true;
802    }
803 
804    case nir_intrinsic_image_deref_load:
805    case nir_intrinsic_image_deref_store:
806    case nir_intrinsic_image_deref_atomic:
807    case nir_intrinsic_image_deref_atomic_swap:
808    case nir_intrinsic_image_deref_size:
809    case nir_intrinsic_image_deref_samples:
810       lower_image_deref(b, instr, state);
811       return true;
812 
813    default:
814       return false;
815    }
816 }
817 
818 static bool
819 lower_pipeline_layout_cb(nir_builder *b,
820                          nir_instr *instr,
821                          void *_state)
822 {
823    bool progress = false;
824    struct lower_pipeline_layout_state *state = _state;
825 
826    b->cursor = nir_before_instr(instr);
827    switch (instr->type) {
828    case nir_instr_type_tex:
829       progress |= lower_sampler(b, nir_instr_as_tex(instr), state);
830       break;
831    case nir_instr_type_intrinsic:
832       progress |= lower_intrinsic(b, nir_instr_as_intrinsic(instr), state);
833       break;
834    default:
835       break;
836    }
837 
838    return progress;
839 }
840 
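/* Walks every instruction in the shader rewriting descriptor-based texture
 * and intrinsic accesses into the flat indices the v3d backend expects, and
 * reports whether a default sampler state will be needed.
 */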
841 static bool
842 lower_pipeline_layout_info(nir_shader *shader,
843                            struct v3dv_pipeline *pipeline,
844                            const struct v3dv_pipeline_layout *layout,
845                            bool *needs_default_sampler_state)
846 {
847    bool progress = false;
848 
849    struct lower_pipeline_layout_state state = {
850       .pipeline = pipeline,
851       .layout = layout,
852       .needs_default_sampler_state = false,
853    };
854 
855    progress = nir_shader_instructions_pass(shader, lower_pipeline_layout_cb,
856                                            nir_metadata_control_flow,
857                                            &state);
858 
859    *needs_default_sampler_state = state.needs_default_sampler_state;
860 
861    return progress;
862 }
863 
864 /* This flips gl_PointCoord.y to match Vulkan requirements */
865 static bool
866 lower_point_coord_cb(nir_builder *b, nir_intrinsic_instr *intr, void *_state)
867 {
868    if (intr->intrinsic != nir_intrinsic_load_input)
869       return false;
870 
871    if (nir_intrinsic_io_semantics(intr).location != VARYING_SLOT_PNTC)
872       return false;
873 
874    b->cursor = nir_after_instr(&intr->instr);
875    nir_def *result = &intr->def;
876    result =
877       nir_vector_insert_imm(b, result,
878                             nir_fsub_imm(b, 1.0, nir_channel(b, result, 1)), 1);
879    nir_def_rewrite_uses_after(&intr->def,
880                                   result, result->parent_instr);
881    return true;
882 }
883 
884 static bool
885 v3d_nir_lower_point_coord(nir_shader *s)
886 {
887    assert(s->info.stage == MESA_SHADER_FRAGMENT);
888    return nir_shader_intrinsics_pass(s, lower_point_coord_cb,
889                                        nir_metadata_control_flow, NULL);
890 }
891 
892 static void
893 lower_fs_io(nir_shader *nir)
894 {
895    /* Our backend doesn't handle array fragment shader outputs */
896    NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false);
897    NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_shader_out, NULL);
898 
899    nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs,
900                                MESA_SHADER_FRAGMENT);
901 
902    nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs,
903                                MESA_SHADER_FRAGMENT);
904 
905    NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
906             type_size_vec4, 0);
907 }
908 
909 static void
910 lower_gs_io(struct nir_shader *nir)
911 {
912    NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false);
913 
914    nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs,
915                                MESA_SHADER_GEOMETRY);
916 
917    nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs,
918                                MESA_SHADER_GEOMETRY);
919 }
920 
921 static void
922 lower_vs_io(struct nir_shader *nir)
923 {
924    NIR_PASS_V(nir, nir_lower_io_arrays_to_elements_no_indirects, false);
925 
926    nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs,
927                                MESA_SHADER_VERTEX);
928 
929    nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs,
930                                MESA_SHADER_VERTEX);
931 
932    /* FIXME: if we call nir_lower_io, we get a crash later. Likely because it
933     * overlaps with v3d_nir_lower_io. Need further research though.
934     */
935 }
936 
937 static void
938 shader_debug_output(const char *message, void *data)
939 {
940    /* FIXME: We probably don't want to debug anything extra here, and in fact
941     * the compiler doesn't use this callback much, only as an alternative
942     * way to dump the shaderdb stats, which you can already get using
943     * V3D_DEBUG=shaderdb. Perhaps it would make sense to revisit the v3d
944     * compiler to remove that callback.
945     */
946 }
947 
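/* Fills the key fields common to all stages: texture/sampler counts, default
 * swizzles and return sizes from the descriptor maps, whether this is the
 * last geometry stage, user clip plane enables and robustness settings.
 */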
948 static void
949 pipeline_populate_v3d_key(struct v3d_key *key,
950                           const struct v3dv_pipeline_stage *p_stage,
951                           uint32_t ucp_enables)
952 {
953    assert(p_stage->pipeline->shared_data &&
954           p_stage->pipeline->shared_data->maps[p_stage->stage]);
955 
956    /* The following values are default values used at pipeline create time.
957     * We use 32 bit as the default return size.
958     */
959    struct v3dv_descriptor_map *sampler_map =
960       &p_stage->pipeline->shared_data->maps[p_stage->stage]->sampler_map;
961    struct v3dv_descriptor_map *texture_map =
962       &p_stage->pipeline->shared_data->maps[p_stage->stage]->texture_map;
963 
964    key->num_tex_used = texture_map->num_desc;
965    assert(key->num_tex_used <= V3D_MAX_TEXTURE_SAMPLERS);
966    for (uint32_t tex_idx = 0; tex_idx < texture_map->num_desc; tex_idx++) {
967       key->tex[tex_idx].swizzle[0] = PIPE_SWIZZLE_X;
968       key->tex[tex_idx].swizzle[1] = PIPE_SWIZZLE_Y;
969       key->tex[tex_idx].swizzle[2] = PIPE_SWIZZLE_Z;
970       key->tex[tex_idx].swizzle[3] = PIPE_SWIZZLE_W;
971    }
972 
973    key->num_samplers_used = sampler_map->num_desc;
974    assert(key->num_samplers_used <= V3D_MAX_TEXTURE_SAMPLERS);
975    for (uint32_t sampler_idx = 0; sampler_idx < sampler_map->num_desc;
976         sampler_idx++) {
977       key->sampler[sampler_idx].return_size =
978          sampler_map->return_size[sampler_idx];
979 
980       key->sampler[sampler_idx].return_channels =
981          key->sampler[sampler_idx].return_size == 32 ? 4 : 2;
982    }
983 
984    switch (p_stage->stage) {
985    case BROADCOM_SHADER_VERTEX:
986    case BROADCOM_SHADER_VERTEX_BIN:
987       key->is_last_geometry_stage =
988          p_stage->pipeline->stages[BROADCOM_SHADER_GEOMETRY] == NULL;
989       break;
990    case BROADCOM_SHADER_GEOMETRY:
991    case BROADCOM_SHADER_GEOMETRY_BIN:
992       /* FIXME: while we don't implement tessellation shaders */
993       key->is_last_geometry_stage = true;
994       break;
995    case BROADCOM_SHADER_FRAGMENT:
996    case BROADCOM_SHADER_COMPUTE:
997       key->is_last_geometry_stage = false;
998       break;
999    default:
1000       unreachable("unsupported shader stage");
1001    }
1002 
1003    /* Vulkan doesn't have fixed function state for user clip planes. Instead,
1004     * shaders can write to gl_ClipDistance[], in which case the SPIR-V compiler
1005     * takes care of adding a single compact array variable at
1006     * VARYING_SLOT_CLIP_DIST0, so we don't need any user clip plane lowering.
1007     *
1008     * The only lowering we are interested in is specific to the fragment shader,
1009     * where we want to emit discards to honor writes to gl_ClipDistance[] in
1010     * previous stages. This is done via nir_lower_clip_fs() so we only set up
1011     * the ucp enable mask for that stage.
1012     */
1013    key->ucp_enables = ucp_enables;
1014 
1015    const VkPipelineRobustnessBufferBehaviorEXT robust_buffer_enabled =
1016       VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT;
1017 
1018    const VkPipelineRobustnessImageBehaviorEXT robust_image_enabled =
1019       VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS_EXT;
1020 
1021    key->robust_uniform_access =
1022       p_stage->robustness.uniform_buffers == robust_buffer_enabled;
1023    key->robust_storage_access =
1024       p_stage->robustness.storage_buffers == robust_buffer_enabled;
1025    key->robust_image_access =
1026       p_stage->robustness.images == robust_image_enabled;
1027 }
1028 
1029 /* FIXME: anv maps to hw primitive type. Perhaps eventually we would do the
1030  * same. For now we use prim_mode, which is the one already used on v3d.
1031  */
1032 static const enum mesa_prim vk_to_mesa_prim[] = {
1033    [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = MESA_PRIM_POINTS,
1034    [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = MESA_PRIM_LINES,
1035    [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = MESA_PRIM_LINE_STRIP,
1036    [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = MESA_PRIM_TRIANGLES,
1037    [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = MESA_PRIM_TRIANGLE_STRIP,
1038    [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = MESA_PRIM_TRIANGLE_FAN,
1039    [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = MESA_PRIM_LINES_ADJACENCY,
1040    [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = MESA_PRIM_LINE_STRIP_ADJACENCY,
1041    [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = MESA_PRIM_TRIANGLES_ADJACENCY,
1042    [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = MESA_PRIM_TRIANGLE_STRIP_ADJACENCY,
1043 };
1044 
1045 uint32_t
1046 v3dv_pipeline_primitive(VkPrimitiveTopology vk_prim)
1047 {
1048    return v3d_hw_prim_type(vk_to_mesa_prim[vk_prim]);
1049 }
1050 
1051 static const enum pipe_logicop vk_to_pipe_logicop[] = {
1052    [VK_LOGIC_OP_CLEAR] = PIPE_LOGICOP_CLEAR,
1053    [VK_LOGIC_OP_AND] = PIPE_LOGICOP_AND,
1054    [VK_LOGIC_OP_AND_REVERSE] = PIPE_LOGICOP_AND_REVERSE,
1055    [VK_LOGIC_OP_COPY] = PIPE_LOGICOP_COPY,
1056    [VK_LOGIC_OP_AND_INVERTED] = PIPE_LOGICOP_AND_INVERTED,
1057    [VK_LOGIC_OP_NO_OP] = PIPE_LOGICOP_NOOP,
1058    [VK_LOGIC_OP_XOR] = PIPE_LOGICOP_XOR,
1059    [VK_LOGIC_OP_OR] = PIPE_LOGICOP_OR,
1060    [VK_LOGIC_OP_NOR] = PIPE_LOGICOP_NOR,
1061    [VK_LOGIC_OP_EQUIVALENT] = PIPE_LOGICOP_EQUIV,
1062    [VK_LOGIC_OP_INVERT] = PIPE_LOGICOP_INVERT,
1063    [VK_LOGIC_OP_OR_REVERSE] = PIPE_LOGICOP_OR_REVERSE,
1064    [VK_LOGIC_OP_COPY_INVERTED] = PIPE_LOGICOP_COPY_INVERTED,
1065    [VK_LOGIC_OP_OR_INVERTED] = PIPE_LOGICOP_OR_INVERTED,
1066    [VK_LOGIC_OP_NAND] = PIPE_LOGICOP_NAND,
1067    [VK_LOGIC_OP_SET] = PIPE_LOGICOP_SET,
1068 };
1069 
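/* Returns whether smooth line rasterization applies: rasterization must be
 * enabled, the topology must be a line class, and
 * VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR must be requested.
 */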
1070 static bool
1071 enable_line_smooth(struct v3dv_pipeline *pipeline,
1072                    const VkPipelineRasterizationStateCreateInfo *rs_info)
1073 {
1074    if (!pipeline->rasterization_enabled)
1075       return false;
1076 
1077    const VkPipelineRasterizationLineStateCreateInfoKHR *ls_info =
1078       vk_find_struct_const(rs_info->pNext,
1079                            PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_KHR);
1080 
1081    if (!ls_info)
1082       return false;
1083 
1084    /* Although topology is dynamic now, the topology class can't change
1085     * because we don't support dynamicPrimitiveTopologyUnrestricted, so we can
1086     * use the static topology from the pipeline for this.
1087     */
1088    switch(pipeline->topology) {
1089    case MESA_PRIM_LINES:
1090    case MESA_PRIM_LINE_LOOP:
1091    case MESA_PRIM_LINE_STRIP:
1092    case MESA_PRIM_LINES_ADJACENCY:
1093    case MESA_PRIM_LINE_STRIP_ADJACENCY:
1094       return ls_info->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR;
1095    default:
1096       return false;
1097    }
1098 }
1099 
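/* Records per-render-target state in the FS key: the color format and
 * swizzle (needed when logic ops may read the destination) and whether the
 * attachment is 32-bit float, pure unsigned or pure signed integer.
 */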
1100 static void
1101 v3d_fs_key_set_color_attachment(struct v3d_fs_key *key,
1102                                 const struct v3dv_pipeline_stage *p_stage,
1103                                 uint32_t index,
1104                                 VkFormat fb_format)
1105 {
1106    key->cbufs |= 1 << index;
1107 
1108    enum pipe_format fb_pipe_format = vk_format_to_pipe_format(fb_format);
1109 
1110    /* If logic operations are enabled then we might emit color reads and we
1111     * need to know the color buffer format and swizzle for that
1112     */
1113    if (key->logicop_func != PIPE_LOGICOP_COPY) {
1114       /* Framebuffer formats should be single plane */
1115       assert(vk_format_get_plane_count(fb_format) == 1);
1116       key->color_fmt[index].format = fb_pipe_format;
1117       memcpy(key->color_fmt[index].swizzle,
1118              v3dv_get_format_swizzle(p_stage->pipeline->device, fb_format, 0),
1119              sizeof(key->color_fmt[index].swizzle));
1120    }
1121 
1122    const struct util_format_description *desc =
1123       vk_format_description(fb_format);
1124 
1125    if (desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
1126        desc->channel[0].size == 32) {
1127       key->f32_color_rb |= 1 << index;
1128    }
1129 
1130    if (p_stage->nir->info.fs.untyped_color_outputs) {
1131       if (util_format_is_pure_uint(fb_pipe_format))
1132          key->uint_color_rb |= 1 << index;
1133       else if (util_format_is_pure_sint(fb_pipe_format))
1134          key->int_color_rb |= 1 << index;
1135    }
1136 }
1137 
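/* Builds the fragment shader compiler key from the pipeline state: primitive
 * class, logic op, multisample state, line smoothing and the color attachment
 * formats.
 */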
1138 static void
1139 pipeline_populate_v3d_fs_key(struct v3d_fs_key *key,
1140                              const VkGraphicsPipelineCreateInfo *pCreateInfo,
1141                              const struct vk_render_pass_state *rendering_info,
1142                              const struct v3dv_pipeline_stage *p_stage,
1143                              bool has_geometry_shader,
1144                              uint32_t ucp_enables)
1145 {
1146    assert(p_stage->stage == BROADCOM_SHADER_FRAGMENT);
1147 
1148    memset(key, 0, sizeof(*key));
1149 
1150    struct v3dv_device *device = p_stage->pipeline->device;
1151    assert(device);
1152 
1153    pipeline_populate_v3d_key(&key->base, p_stage, ucp_enables);
1154 
1155    const VkPipelineInputAssemblyStateCreateInfo *ia_info =
1156       pCreateInfo->pInputAssemblyState;
1157    uint8_t topology = vk_to_mesa_prim[ia_info->topology];
1158 
1159    key->is_points = (topology == MESA_PRIM_POINTS);
1160    key->is_lines = (topology >= MESA_PRIM_LINES &&
1161                     topology <= MESA_PRIM_LINE_STRIP);
1162 
1163    if (key->is_points) {
1164       /* This mask represents state for GL_ARB_point_sprite which is not
1165        * relevant to Vulkan.
1166        */
1167       key->point_sprite_mask = 0;
1168 
1169       /* Vulkan mandates upper left. */
1170       key->point_coord_upper_left = true;
1171    }
1172 
1173    key->has_gs = has_geometry_shader;
1174 
1175    const VkPipelineColorBlendStateCreateInfo *cb_info =
1176       p_stage->pipeline->rasterization_enabled ?
1177       pCreateInfo->pColorBlendState : NULL;
1178 
1179    key->logicop_func = cb_info && cb_info->logicOpEnable == VK_TRUE ?
1180                        vk_to_pipe_logicop[cb_info->logicOp] :
1181                        PIPE_LOGICOP_COPY;
1182 
1183    /* Multisample rasterization state must be ignored if rasterization
1184     * is disabled.
1185     */
1186    const VkPipelineMultisampleStateCreateInfo *ms_info =
1187       p_stage->pipeline->rasterization_enabled ? pCreateInfo->pMultisampleState : NULL;
1188    if (ms_info) {
1189       assert(ms_info->rasterizationSamples == VK_SAMPLE_COUNT_1_BIT ||
1190              ms_info->rasterizationSamples == VK_SAMPLE_COUNT_4_BIT);
1191       key->msaa = ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT;
1192 
1193       if (key->msaa)
1194          key->sample_alpha_to_coverage = ms_info->alphaToCoverageEnable;
1195 
1196       key->sample_alpha_to_one = ms_info->alphaToOneEnable;
1197    }
1198 
1199    key->line_smoothing = enable_line_smooth(p_stage->pipeline,
1200                                             pCreateInfo->pRasterizationState);
1201 
1202    /* This is intended for V3D versions before 4.1, otherwise we just use the
1203     * tile buffer load/store swap R/B bit.
1204     */
1205    key->swap_color_rb = 0;
1206 
1207    for (uint32_t i = 0; i < rendering_info->color_attachment_count; i++) {
1208       if (rendering_info->color_attachment_formats[i] == VK_FORMAT_UNDEFINED)
1209          continue;
1210       v3d_fs_key_set_color_attachment(key, p_stage, i,
1211                                       rendering_info->color_attachment_formats[i]);
1212    }
1213 }
1214 
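/* When linking two stages, the outputs a stage needs to emit are exactly the
 * inputs consumed by the next stage, so just copy the next stage's input
 * slots over.
 */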
1215 static void
1216 setup_stage_outputs_from_next_stage_inputs(
1217    uint8_t next_stage_num_inputs,
1218    struct v3d_varying_slot *next_stage_input_slots,
1219    uint8_t *num_used_outputs,
1220    struct v3d_varying_slot *used_output_slots,
1221    uint32_t size_of_used_output_slots)
1222 {
1223    *num_used_outputs = next_stage_num_inputs;
1224    memcpy(used_output_slots, next_stage_input_slots, size_of_used_output_slots);
1225 }
1226 
1227 static void
1228 pipeline_populate_v3d_gs_key(struct v3d_gs_key *key,
1229                              const VkGraphicsPipelineCreateInfo *pCreateInfo,
1230                              const struct v3dv_pipeline_stage *p_stage)
1231 {
1232    assert(p_stage->stage == BROADCOM_SHADER_GEOMETRY ||
1233           p_stage->stage == BROADCOM_SHADER_GEOMETRY_BIN);
1234 
1235    struct v3dv_device *device = p_stage->pipeline->device;
1236    assert(device);
1237 
1238    memset(key, 0, sizeof(*key));
1239 
1240    pipeline_populate_v3d_key(&key->base, p_stage, 0);
1241 
1242    struct v3dv_pipeline *pipeline = p_stage->pipeline;
1243 
1244    key->per_vertex_point_size =
1245       p_stage->nir->info.outputs_written & (1ull << VARYING_SLOT_PSIZ);
1246 
1247    key->is_coord = broadcom_shader_stage_is_binning(p_stage->stage);
1248 
1249    assert(key->base.is_last_geometry_stage);
1250    if (key->is_coord) {
1251       /* Output varyings in the last binning shader are only used for transform
1252        * feedback. Set to 0 as VK_EXT_transform_feedback is not supported.
1253        */
1254       key->num_used_outputs = 0;
1255    } else {
1256       struct v3dv_shader_variant *fs_variant =
1257          pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
1258 
1259       STATIC_ASSERT(sizeof(key->used_outputs) ==
1260                     sizeof(fs_variant->prog_data.fs->input_slots));
1261 
1262       setup_stage_outputs_from_next_stage_inputs(
1263          fs_variant->prog_data.fs->num_inputs,
1264          fs_variant->prog_data.fs->input_slots,
1265          &key->num_used_outputs,
1266          key->used_outputs,
1267          sizeof(key->used_outputs));
1268    }
1269 }
1270 
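/* Builds the vertex shader compiler key. Outputs are linked against the
 * consuming stage: the binning or render GS when a geometry shader is
 * present, the FS otherwise. The last binning stage emits no outputs since
 * transform feedback is not supported. Also flags vertex attribute formats
 * that need R/B swapping.
 */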
1271 static void
1272 pipeline_populate_v3d_vs_key(struct v3d_vs_key *key,
1273                              const VkGraphicsPipelineCreateInfo *pCreateInfo,
1274                              const struct v3dv_pipeline_stage *p_stage)
1275 {
1276    assert(p_stage->stage == BROADCOM_SHADER_VERTEX ||
1277           p_stage->stage == BROADCOM_SHADER_VERTEX_BIN);
1278 
1279    struct v3dv_device *device = p_stage->pipeline->device;
1280    assert(device);
1281 
1282    memset(key, 0, sizeof(*key));
1283    pipeline_populate_v3d_key(&key->base, p_stage, 0);
1284 
1285    struct v3dv_pipeline *pipeline = p_stage->pipeline;
1286 
1287    key->per_vertex_point_size =
1288       p_stage->nir->info.outputs_written & (1ull << VARYING_SLOT_PSIZ);
1289 
1290    key->is_coord = broadcom_shader_stage_is_binning(p_stage->stage);
1291 
1292    if (key->is_coord) { /* Binning VS*/
1293       if (key->base.is_last_geometry_stage) {
1294          /* Output varyings in the last binning shader are only used for
1295           * transform feedback. Set to 0 as VK_EXT_transform_feedback is not
1296           * supported.
1297           */
1298          key->num_used_outputs = 0;
1299       } else {
1300          /* Linking against GS binning program */
1301          assert(pipeline->stages[BROADCOM_SHADER_GEOMETRY]);
1302          struct v3dv_shader_variant *gs_bin_variant =
1303             pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
1304 
1305          STATIC_ASSERT(sizeof(key->used_outputs) ==
1306                        sizeof(gs_bin_variant->prog_data.gs->input_slots));
1307 
1308          setup_stage_outputs_from_next_stage_inputs(
1309             gs_bin_variant->prog_data.gs->num_inputs,
1310             gs_bin_variant->prog_data.gs->input_slots,
1311             &key->num_used_outputs,
1312             key->used_outputs,
1313             sizeof(key->used_outputs));
1314       }
1315    } else { /* Render VS */
1316       if (pipeline->stages[BROADCOM_SHADER_GEOMETRY]) {
1317          /* Linking against GS render program */
1318          struct v3dv_shader_variant *gs_variant =
1319             pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY];
1320 
1321          STATIC_ASSERT(sizeof(key->used_outputs) ==
1322                        sizeof(gs_variant->prog_data.gs->input_slots));
1323 
1324          setup_stage_outputs_from_next_stage_inputs(
1325             gs_variant->prog_data.gs->num_inputs,
1326             gs_variant->prog_data.gs->input_slots,
1327             &key->num_used_outputs,
1328             key->used_outputs,
1329             sizeof(key->used_outputs));
1330       } else {
1331          /* Linking against FS program */
1332          struct v3dv_shader_variant *fs_variant =
1333             pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
1334 
1335          STATIC_ASSERT(sizeof(key->used_outputs) ==
1336                        sizeof(fs_variant->prog_data.fs->input_slots));
1337 
1338          setup_stage_outputs_from_next_stage_inputs(
1339             fs_variant->prog_data.fs->num_inputs,
1340             fs_variant->prog_data.fs->input_slots,
1341             &key->num_used_outputs,
1342             key->used_outputs,
1343             sizeof(key->used_outputs));
1344       }
1345    }
1346 
1347    const VkPipelineVertexInputStateCreateInfo *vi_info =
1348       pCreateInfo->pVertexInputState;
1349    for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) {
1350       const VkVertexInputAttributeDescription *desc =
1351          &vi_info->pVertexAttributeDescriptions[i];
1352       assert(desc->location < MAX_VERTEX_ATTRIBS);
1353       if (desc->format == VK_FORMAT_B8G8R8A8_UNORM ||
1354           desc->format == VK_FORMAT_A2R10G10B10_UNORM_PACK32) {
1355          key->va_swap_rb_mask |= 1 << (VERT_ATTRIB_GENERIC0 + desc->location);
1356       }
1357    }
1358 }
1359 
1360 /**
1361  * Creates the initial form of the pipeline stage for a binning shader by
1362  * cloning the render shader and flagging it as a coordinate shader.
1363  *
1364  * Returns NULL if it was not able to allocate the object, so it should be
1365  * handled as a VK_ERROR_OUT_OF_HOST_MEMORY error.
1366  */
1367 static struct v3dv_pipeline_stage *
1368 pipeline_stage_create_binning(const struct v3dv_pipeline_stage *src,
1369                               const VkAllocationCallbacks *pAllocator)
1370 {
1371    struct v3dv_device *device = src->pipeline->device;
1372 
1373    struct v3dv_pipeline_stage *p_stage =
1374       vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*p_stage), 8,
1375                  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1376 
1377    if (p_stage == NULL)
1378       return NULL;
1379 
1380    assert(src->stage == BROADCOM_SHADER_VERTEX ||
1381           src->stage == BROADCOM_SHADER_GEOMETRY);
1382 
1383    enum broadcom_shader_stage bin_stage =
1384       src->stage == BROADCOM_SHADER_VERTEX ?
1385          BROADCOM_SHADER_VERTEX_BIN :
1386          BROADCOM_SHADER_GEOMETRY_BIN;
1387 
1388    p_stage->pipeline = src->pipeline;
1389    p_stage->stage = bin_stage;
1390    p_stage->entrypoint = src->entrypoint;
1391    p_stage->module = src->module;
1392    p_stage->module_info = src->module_info;
1393 
1394    /* For binning shaders we will clone the NIR code from the corresponding
1395     * render shader later, when we call pipeline_compile_xxx_shader. This way
1396     * we only have to run the relevant NIR lowerings once, on the render shaders.
1397     */
1398    p_stage->nir = NULL;
1399    p_stage->program_id = src->program_id;
1400    p_stage->spec_info = src->spec_info;
1401    p_stage->feedback = (VkPipelineCreationFeedback) { 0 };
1402    p_stage->robustness = src->robustness;
1403    memcpy(p_stage->shader_sha1, src->shader_sha1, 20);
1404 
1405    return p_stage;
1406 }
1407 
1408 /*
1409  * Based on some creation flags we assume that the QPU instructions will be
1410  * needed later to gather further info. In that case we keep the qpu_insts
1411  * around instead of mapping/unmapping the BO later.
1412  */
1413 static bool
1414 pipeline_keep_qpu(struct v3dv_pipeline *pipeline)
1415 {
1416    return pipeline->flags &
1417       (VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR |
1418        VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR);
1419 }
1420 
1421 /**
1422  * Returns false if it was not able to allocate or map the assembly bo memory.
1423  */
1424 static bool
1425 upload_assembly(struct v3dv_pipeline *pipeline)
1426 {
1427    uint32_t total_size = 0;
1428    for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
1429       struct v3dv_shader_variant *variant =
1430          pipeline->shared_data->variants[stage];
1431 
1432       if (variant != NULL)
1433          total_size += variant->qpu_insts_size;
1434    }
1435 
1436    struct v3dv_bo *bo = v3dv_bo_alloc(pipeline->device, total_size,
1437                                       "pipeline shader assembly", true);
1438    if (!bo) {
1439       mesa_loge("failed to allocate memory for shader\n");
1440       return false;
1441    }
1442 
1443    bool ok = v3dv_bo_map(pipeline->device, bo, total_size);
1444    if (!ok) {
1445       mesa_loge("failed to map source shader buffer\n");
1446       return false;
1447    }
1448 
1449    uint32_t offset = 0;
1450    for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
1451       struct v3dv_shader_variant *variant =
1452          pipeline->shared_data->variants[stage];
1453 
1454       if (variant != NULL) {
1455          variant->assembly_offset = offset;
1456 
1457          memcpy(bo->map + offset, variant->qpu_insts, variant->qpu_insts_size);
1458          offset += variant->qpu_insts_size;
1459 
1460          if (!pipeline_keep_qpu(pipeline)) {
1461             free(variant->qpu_insts);
1462             variant->qpu_insts = NULL;
1463          }
1464       }
1465    }
1466    assert(total_size == offset);
1467 
1468    pipeline->shared_data->assembly_bo = bo;
1469 
1470    return true;
1471 }
1472 
1473 static void
1474 pipeline_hash_graphics(const struct v3dv_pipeline *pipeline,
1475                        struct v3dv_pipeline_key *key,
1476                        unsigned char *sha1_out)
1477 {
1478    struct mesa_sha1 ctx;
1479    _mesa_sha1_init(&ctx);
1480 
1481    if (pipeline->layout) {
1482       _mesa_sha1_update(&ctx, &pipeline->layout->sha1,
1483                         sizeof(pipeline->layout->sha1));
1484    }
1485 
1486    /* We need to include all shader stages in the sha1 key as linking may
1487     * modify the shader code in any stage. An alternative would be to use the
1488     * serialized NIR, but that seems like overkill.
1489     */
1490    for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
1491       if (broadcom_shader_stage_is_binning(stage))
1492          continue;
1493 
1494       struct v3dv_pipeline_stage *p_stage = pipeline->stages[stage];
1495       if (p_stage == NULL)
1496          continue;
1497 
1498       assert(stage != BROADCOM_SHADER_COMPUTE);
1499 
1500       _mesa_sha1_update(&ctx, p_stage->shader_sha1, sizeof(p_stage->shader_sha1));
1501    }
1502 
1503    _mesa_sha1_update(&ctx, key, sizeof(struct v3dv_pipeline_key));
1504 
1505    _mesa_sha1_final(&ctx, sha1_out);
1506 }
1507 
1508 static void
1509 pipeline_hash_compute(const struct v3dv_pipeline *pipeline,
1510                       struct v3dv_pipeline_key *key,
1511                       unsigned char *sha1_out)
1512 {
1513    struct mesa_sha1 ctx;
1514    _mesa_sha1_init(&ctx);
1515 
1516    if (pipeline->layout) {
1517       _mesa_sha1_update(&ctx, &pipeline->layout->sha1,
1518                         sizeof(pipeline->layout->sha1));
1519    }
1520 
1521    struct v3dv_pipeline_stage *p_stage =
1522       pipeline->stages[BROADCOM_SHADER_COMPUTE];
1523 
1524    _mesa_sha1_update(&ctx, p_stage->shader_sha1, sizeof(p_stage->shader_sha1));
1525 
1526    _mesa_sha1_update(&ctx, key, sizeof(struct v3dv_pipeline_key));
1527 
1528    _mesa_sha1_final(&ctx, sha1_out);
1529 }
1530 
1531 /* Checks that the pipeline has enough spill size for any of its
1532  * variants.
1533  */
1534 static void
1535 pipeline_check_spill_size(struct v3dv_pipeline *pipeline)
1536 {
1537    uint32_t max_spill_size = 0;
1538 
1539    for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
1540       struct v3dv_shader_variant *variant =
1541          pipeline->shared_data->variants[stage];
1542 
1543       if (variant != NULL) {
1544          max_spill_size = MAX2(variant->prog_data.base->spill_size,
1545                                max_spill_size);
1546       }
1547    }
1548 
1549    if (max_spill_size > 0) {
1550       struct v3dv_device *device = pipeline->device;
1551 
1552       /* The TIDX register we use for choosing the area to access
1553        * for scratch space is: (core << 6) | (qpu << 2) | thread.
1554        * Even at minimum threadcount in a particular shader, that
1555        * means we still multiply the number of QPUs by 4.
1556        */
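      /* For example, with a hypothetical qpu_count of 8 and a max_spill_size
       * of 256 bytes, total_spill_size below would be 4 * 8 * 256 = 8192
       * bytes.
       */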
1557       const uint32_t total_spill_size =
1558          4 * device->devinfo.qpu_count * max_spill_size;
1559       if (pipeline->spill.bo) {
1560          assert(pipeline->spill.size_per_thread > 0);
1561          v3dv_bo_free(device, pipeline->spill.bo);
1562       }
1563       pipeline->spill.bo =
1564          v3dv_bo_alloc(device, total_spill_size, "spill", true);
1565       pipeline->spill.size_per_thread = max_spill_size;
1566    }
1567 }
1568 
1569 /**
1570  * Creates a new shader_variant. Note that prog_data is not const, as it is
1571  * assumed that the caller will provide a pointer that the shader_variant
1572  * will own.
1573  *
1574  * Creation doesn't include allocating a BO to store the contents of
1575  * qpu_insts, as we will try to share the same BO for several shader
1576  * variants. Also note that qpu_insts being NULL is valid, for example if we
1577  * are creating the shader_variants from the cache, so we can just upload the
1578  * assembly of all the shader stages at once.
1579  */
1580 struct v3dv_shader_variant *
1581 v3dv_shader_variant_create(struct v3dv_device *device,
1582                            enum broadcom_shader_stage stage,
1583                            struct v3d_prog_data *prog_data,
1584                            uint32_t prog_data_size,
1585                            uint32_t assembly_offset,
1586                            uint64_t *qpu_insts,
1587                            uint32_t qpu_insts_size,
1588                            VkResult *out_vk_result)
1589 {
1590    struct v3dv_shader_variant *variant =
1591       vk_zalloc(&device->vk.alloc, sizeof(*variant), 8,
1592                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1593 
1594    if (variant == NULL) {
1595       *out_vk_result = VK_ERROR_OUT_OF_HOST_MEMORY;
1596       return NULL;
1597    }
1598 
1599    variant->stage = stage;
1600    variant->prog_data_size = prog_data_size;
1601    variant->prog_data.base = prog_data;
1602 
1603    variant->assembly_offset = assembly_offset;
1604    variant->qpu_insts_size = qpu_insts_size;
1605    variant->qpu_insts = qpu_insts;
1606 
1607    *out_vk_result = VK_SUCCESS;
1608 
1609    return variant;
1610 }
1611 
1612 /* For a given key, returns the compiled version of the shader. Returns a
1613  * new reference to the shader_variant to the caller, or NULL.
1614  *
1615  * If the method returns NULL it means that something went wrong:
1616  *   * Not enough memory: this is one of the possible outcomes defined by
1617  *     vkCreateXXXPipelines. out_vk_result will return the proper OOM error.
1618  *   * Compilation error: hypothetically this shouldn't happen, as the spec
1619  *     states that a vkShaderModule needs to be created with valid SPIR-V, so
1620  *     any compilation failure is a driver bug. In practice, something as
1621  *     common as failing to register allocate can lead to a compilation
1622  *     failure. In that case the only option (for any driver) is
1623  *     VK_ERROR_UNKNOWN, even if we know that the problem was a compiler
1624  *     error.
1625  */
1626 static struct v3dv_shader_variant *
1627 pipeline_compile_shader_variant(struct v3dv_pipeline_stage *p_stage,
1628                                 struct v3d_key *key,
1629                                 size_t key_size,
1630                                 const VkAllocationCallbacks *pAllocator,
1631                                 VkResult *out_vk_result)
1632 {
1633    int64_t stage_start = os_time_get_nano();
1634 
1635    struct v3dv_pipeline *pipeline = p_stage->pipeline;
1636    struct v3dv_physical_device *physical_device = pipeline->device->pdevice;
1637    const struct v3d_compiler *compiler = physical_device->compiler;
1638    gl_shader_stage gl_stage = broadcom_shader_stage_to_gl(p_stage->stage);
1639 
1640    if (V3D_DBG(NIR) || v3d_debug_flag_for_shader_stage(gl_stage)) {
1641       fprintf(stderr, "Just before v3d_compile: %s prog %d NIR:\n",
1642               broadcom_shader_stage_name(p_stage->stage),
1643               p_stage->program_id);
1644       nir_print_shader(p_stage->nir, stderr);
1645       fprintf(stderr, "\n");
1646    }
1647 
1648    uint64_t *qpu_insts;
1649    uint32_t qpu_insts_size;
1650    struct v3d_prog_data *prog_data;
1651    uint32_t prog_data_size = v3d_prog_data_size(gl_stage);
1652 
1653    qpu_insts = v3d_compile(compiler,
1654                            key, &prog_data,
1655                            p_stage->nir,
1656                            shader_debug_output, NULL,
1657                            p_stage->program_id, 0,
1658                            &qpu_insts_size);
1659 
1660    struct v3dv_shader_variant *variant = NULL;
1661 
1662    if (!qpu_insts) {
1663       mesa_loge("Failed to compile %s prog %d NIR to VIR\n",
1664                 broadcom_shader_stage_name(p_stage->stage),
1665                 p_stage->program_id);
1666       *out_vk_result = VK_ERROR_UNKNOWN;
1667    } else {
1668       variant =
1669          v3dv_shader_variant_create(pipeline->device, p_stage->stage,
1670                                     prog_data, prog_data_size,
1671                                     0, /* assembly_offset, no final value yet */
1672                                     qpu_insts, qpu_insts_size,
1673                                     out_vk_result);
1674    }
1675    /* At this point we no longer need the NIR shader, but we free all the
1676     * temporary p_stage structs used during pipeline creation when we finish
1677     * it, so let's not worry about freeing the NIR here.
1678     */
1679 
1680    p_stage->feedback.duration += os_time_get_nano() - stage_start;
1681 
1682    return variant;
1683 }
1684 
1685 static void
1686 link_shaders(nir_shader *producer, nir_shader *consumer)
1687 {
1688    assert(producer);
1689    assert(consumer);
1690 
1691    if (producer->options->lower_to_scalar) {
1692       NIR_PASS(_, producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
1693       NIR_PASS(_, consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);
1694    }
1695 
1696    nir_lower_io_arrays_to_elements(producer, consumer);
1697 
1698    v3d_optimize_nir(NULL, producer);
1699    v3d_optimize_nir(NULL, consumer);
1700 
1701    if (nir_link_opt_varyings(producer, consumer))
1702       v3d_optimize_nir(NULL, consumer);
1703 
1704    NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
1705    NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
1706 
1707    if (nir_remove_unused_varyings(producer, consumer)) {
1708       NIR_PASS(_, producer, nir_lower_global_vars_to_local);
1709       NIR_PASS(_, consumer, nir_lower_global_vars_to_local);
1710 
1711       v3d_optimize_nir(NULL, producer);
1712       v3d_optimize_nir(NULL, consumer);
1713 
1714       /* Optimizations can cause varyings to become unused.
1715        * nir_compact_varyings() depends on all dead varyings being removed so
1716        * we need to call nir_remove_dead_variables() again here.
1717        */
1718       NIR_PASS(_, producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
1719       NIR_PASS(_, consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
1720    }
1721 }
1722 
1723 static void
1724 pipeline_lower_nir(struct v3dv_pipeline *pipeline,
1725                    struct v3dv_pipeline_stage *p_stage,
1726                    struct v3dv_pipeline_layout *layout)
1727 {
1728    int64_t stage_start = os_time_get_nano();
1729 
1730    assert(pipeline->shared_data &&
1731           pipeline->shared_data->maps[p_stage->stage]);
1732 
1733    NIR_PASS_V(p_stage->nir, nir_vk_lower_ycbcr_tex,
1734               lookup_ycbcr_conversion, layout);
1735 
1736    nir_shader_gather_info(p_stage->nir, nir_shader_get_entrypoint(p_stage->nir));
1737 
1738    /* We add this because we need a valid sampler for nir_lower_tex to do
1739     * unpacking of the texture operation result, even for the case where there
1740     * is no sampler state.
1741     *
1742     * We add two of those, one for the case where we need a 16-bit
1743     * return_size, and another for the case where we need a 32-bit return size.
1744     */
1745    struct v3dv_descriptor_maps *maps =
1746       pipeline->shared_data->maps[p_stage->stage];
1747 
1748    UNUSED unsigned index;
1749    index = descriptor_map_add(&maps->sampler_map, -1, -1, -1, 0, 0, 16, 0);
1750    assert(index == V3DV_NO_SAMPLER_16BIT_IDX);
1751 
1752    index = descriptor_map_add(&maps->sampler_map, -2, -2, -2, 0, 0, 32, 0);
1753    assert(index == V3DV_NO_SAMPLER_32BIT_IDX);
1754 
1755    /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */
1756    bool needs_default_sampler_state = false;
1757    NIR_PASS(_, p_stage->nir, lower_pipeline_layout_info, pipeline, layout,
1758             &needs_default_sampler_state);
1759 
1760    /* If in the end we didn't need to use the default sampler states and the
1761     * shader doesn't need any other samplers, get rid of them so we can
1762     * recognize that this program doesn't use any samplers at all.
1763     */
1764    if (!needs_default_sampler_state && maps->sampler_map.num_desc == 2)
1765       maps->sampler_map.num_desc = 0;
1766 
1767    p_stage->feedback.duration += os_time_get_nano() - stage_start;
1768 }
1769 
1770 /**
1771  * The SPIR-V compiler will insert a sized compact array for
1772  * VARYING_SLOT_CLIP_DIST0 if the vertex shader writes to gl_ClipDistance[],
1773  * where the size of the array determines the number of active clip planes.
1774  */
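/* For example, a vertex shader declaring "out float gl_ClipDistance[3]" gets
 * a compact VARYING_SLOT_CLIP_DIST0 array of length 3, so the mask returned
 * below is (1 << 3) - 1 = 0b111.
 */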
1775 static uint32_t
1776 get_ucp_enable_mask(struct v3dv_pipeline_stage *p_stage)
1777 {
1778    assert(p_stage->stage == BROADCOM_SHADER_VERTEX);
1779    const nir_shader *shader = p_stage->nir;
1780    assert(shader);
1781 
1782    nir_foreach_variable_with_modes(var, shader, nir_var_shader_out) {
1783       if (var->data.location == VARYING_SLOT_CLIP_DIST0) {
1784          assert(var->data.compact);
1785          return (1 << glsl_get_length(var->type)) - 1;
1786       }
1787    }
1788    return 0;
1789 }
1790 
1791 static nir_shader *
1792 pipeline_stage_get_nir(struct v3dv_pipeline_stage *p_stage,
1793                        struct v3dv_pipeline *pipeline,
1794                        struct v3dv_pipeline_cache *cache)
1795 {
1796    int64_t stage_start = os_time_get_nano();
1797 
1798    nir_shader *nir = NULL;
1799    const nir_shader_compiler_options *nir_options =
1800       v3dv_pipeline_get_nir_options(&pipeline->device->devinfo);
1801 
1802    nir = v3dv_pipeline_cache_search_for_nir(pipeline, cache,
1803                                             nir_options,
1804                                             p_stage->shader_sha1);
1805 
1806    if (nir) {
1807       assert(nir->info.stage == broadcom_shader_stage_to_gl(p_stage->stage));
1808 
1809       /* A NIR cache hit doesn't avoid the large majority of pipeline stage
1810        * creation, so the cache hit is not recorded in the pipeline feedback
1811        * flags.
1812        */
1813 
1814       p_stage->feedback.duration += os_time_get_nano() - stage_start;
1815 
1816       return nir;
1817    }
1818 
1819    nir = shader_module_compile_to_nir(pipeline->device, p_stage);
1820 
1821    if (nir) {
1822       struct v3dv_pipeline_cache *default_cache =
1823          &pipeline->device->default_pipeline_cache;
1824 
1825       v3dv_pipeline_cache_upload_nir(pipeline, cache, nir,
1826                                      p_stage->shader_sha1);
1827 
1828       /* Ensure that the variant is in the default cache, as the cmd_buffer
1829        * could need to change the current variant.
1830        */
1831       if (default_cache != cache) {
1832          v3dv_pipeline_cache_upload_nir(pipeline, default_cache, nir,
1833                                         p_stage->shader_sha1);
1834       }
1835 
1836       p_stage->feedback.duration += os_time_get_nano() - stage_start;
1837 
1838       return nir;
1839    }
1840 
1841    /* FIXME: this shouldn't happen, raise error? */
1842    return NULL;
1843 }
1844 
1845 static VkResult
1846 pipeline_compile_vertex_shader(struct v3dv_pipeline *pipeline,
1847                                const VkAllocationCallbacks *pAllocator,
1848                                const VkGraphicsPipelineCreateInfo *pCreateInfo)
1849 {
1850    struct v3dv_pipeline_stage *p_stage_vs =
1851       pipeline->stages[BROADCOM_SHADER_VERTEX];
1852    struct v3dv_pipeline_stage *p_stage_vs_bin =
1853       pipeline->stages[BROADCOM_SHADER_VERTEX_BIN];
1854 
1855    assert(p_stage_vs_bin != NULL);
1856    if (p_stage_vs_bin->nir == NULL) {
1857       assert(p_stage_vs->nir);
1858       p_stage_vs_bin->nir = nir_shader_clone(NULL, p_stage_vs->nir);
1859    }
1860 
1861    VkResult vk_result;
1862    struct v3d_vs_key key;
1863    pipeline_populate_v3d_vs_key(&key, pCreateInfo, p_stage_vs);
1864    pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX] =
1865       pipeline_compile_shader_variant(p_stage_vs, &key.base, sizeof(key),
1866                                       pAllocator, &vk_result);
1867    if (vk_result != VK_SUCCESS)
1868       return vk_result;
1869 
1870    pipeline_populate_v3d_vs_key(&key, pCreateInfo, p_stage_vs_bin);
1871    pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN] =
1872       pipeline_compile_shader_variant(p_stage_vs_bin, &key.base, sizeof(key),
1873                                       pAllocator, &vk_result);
1874 
1875    return vk_result;
1876 }
1877 
1878 static VkResult
1879 pipeline_compile_geometry_shader(struct v3dv_pipeline *pipeline,
1880                                  const VkAllocationCallbacks *pAllocator,
1881                                  const VkGraphicsPipelineCreateInfo *pCreateInfo)
1882 {
1883    struct v3dv_pipeline_stage *p_stage_gs =
1884       pipeline->stages[BROADCOM_SHADER_GEOMETRY];
1885    struct v3dv_pipeline_stage *p_stage_gs_bin =
1886       pipeline->stages[BROADCOM_SHADER_GEOMETRY_BIN];
1887 
1888    assert(p_stage_gs);
1889    assert(p_stage_gs_bin != NULL);
1890    if (p_stage_gs_bin->nir == NULL) {
1891       assert(p_stage_gs->nir);
1892       p_stage_gs_bin->nir = nir_shader_clone(NULL, p_stage_gs->nir);
1893    }
1894 
1895    VkResult vk_result;
1896    struct v3d_gs_key key;
1897    pipeline_populate_v3d_gs_key(&key, pCreateInfo, p_stage_gs);
1898    pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY] =
1899       pipeline_compile_shader_variant(p_stage_gs, &key.base, sizeof(key),
1900                                       pAllocator, &vk_result);
1901    if (vk_result != VK_SUCCESS)
1902       return vk_result;
1903 
1904    pipeline_populate_v3d_gs_key(&key, pCreateInfo, p_stage_gs_bin);
1905    pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN] =
1906       pipeline_compile_shader_variant(p_stage_gs_bin, &key.base, sizeof(key),
1907                                       pAllocator, &vk_result);
1908 
1909    return vk_result;
1910 }
1911 
1912 static VkResult
1913 pipeline_compile_fragment_shader(struct v3dv_pipeline *pipeline,
1914                                  const VkAllocationCallbacks *pAllocator,
1915                                  const VkGraphicsPipelineCreateInfo *pCreateInfo)
1916 {
1917    struct v3dv_pipeline_stage *p_stage_vs =
1918       pipeline->stages[BROADCOM_SHADER_VERTEX];
1919    struct v3dv_pipeline_stage *p_stage_fs =
1920       pipeline->stages[BROADCOM_SHADER_FRAGMENT];
1921    struct v3dv_pipeline_stage *p_stage_gs =
1922       pipeline->stages[BROADCOM_SHADER_GEOMETRY];
1923 
1924    struct v3d_fs_key key;
1925    pipeline_populate_v3d_fs_key(&key, pCreateInfo, &pipeline->rendering_info,
1926                                 p_stage_fs, p_stage_gs != NULL,
1927                                 get_ucp_enable_mask(p_stage_vs));
1928 
1929    if (key.is_points) {
1930       assert(key.point_coord_upper_left);
1931       NIR_PASS(_, p_stage_fs->nir, v3d_nir_lower_point_coord);
1932    }
1933 
1934    VkResult vk_result;
1935    pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT] =
1936       pipeline_compile_shader_variant(p_stage_fs, &key.base, sizeof(key),
1937                                       pAllocator, &vk_result);
1938 
1939    return vk_result;
1940 }
1941 
1942 static void
1943 pipeline_populate_graphics_key(struct v3dv_pipeline *pipeline,
1944                                struct v3dv_pipeline_key *key,
1945                                const VkGraphicsPipelineCreateInfo *pCreateInfo)
1946 {
1947    struct v3dv_device *device = pipeline->device;
1948    assert(device);
1949 
1950    memset(key, 0, sizeof(*key));
1951 
1952    key->line_smooth = pipeline->line_smooth;
1953 
1954    const VkPipelineInputAssemblyStateCreateInfo *ia_info =
1955       pCreateInfo->pInputAssemblyState;
1956    key->topology = vk_to_mesa_prim[ia_info->topology];
1957 
1958    const VkPipelineColorBlendStateCreateInfo *cb_info =
1959       pipeline->rasterization_enabled ? pCreateInfo->pColorBlendState : NULL;
1960 
1961    key->logicop_func = cb_info && cb_info->logicOpEnable == VK_TRUE ?
1962       vk_to_pipe_logicop[cb_info->logicOp] :
1963       PIPE_LOGICOP_COPY;
1964 
1965    /* Multisample rasterization state must be ignored if rasterization
1966     * is disabled.
1967     */
1968    const VkPipelineMultisampleStateCreateInfo *ms_info =
1969       pipeline->rasterization_enabled ? pCreateInfo->pMultisampleState : NULL;
1970    if (ms_info) {
1971       assert(ms_info->rasterizationSamples == VK_SAMPLE_COUNT_1_BIT ||
1972              ms_info->rasterizationSamples == VK_SAMPLE_COUNT_4_BIT);
1973       key->msaa = ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT;
1974 
1975       if (key->msaa)
1976          key->sample_alpha_to_coverage = ms_info->alphaToCoverageEnable;
1977 
1978       key->sample_alpha_to_one = ms_info->alphaToOneEnable;
1979    }
1980 
1981    struct vk_render_pass_state *ri = &pipeline->rendering_info;
1982    for (uint32_t i = 0; i < ri->color_attachment_count; i++) {
1983       if (ri->color_attachment_formats[i] == VK_FORMAT_UNDEFINED)
1984          continue;
1985 
1986       key->cbufs |= 1 << i;
1987 
1988       VkFormat fb_format = ri->color_attachment_formats[i];
1989       enum pipe_format fb_pipe_format = vk_format_to_pipe_format(fb_format);
1990 
1991       /* If logic operations are enabled then we might emit color reads and we
1992        * need to know the color buffer format and swizzle for that
1993        */
1994       if (key->logicop_func != PIPE_LOGICOP_COPY) {
1995          /* Framebuffer formats should be single plane */
1996          assert(vk_format_get_plane_count(fb_format) == 1);
1997          key->color_fmt[i].format = fb_pipe_format;
1998          memcpy(key->color_fmt[i].swizzle,
1999                 v3dv_get_format_swizzle(pipeline->device, fb_format, 0),
2000                 sizeof(key->color_fmt[i].swizzle));
2001       }
2002 
2003       const struct util_format_description *desc =
2004          vk_format_description(fb_format);
2005 
2006       if (desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
2007           desc->channel[0].size == 32) {
2008          key->f32_color_rb |= 1 << i;
2009       }
2010    }
2011 
2012    const VkPipelineVertexInputStateCreateInfo *vi_info =
2013       pCreateInfo->pVertexInputState;
2014    for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) {
2015       const VkVertexInputAttributeDescription *desc =
2016          &vi_info->pVertexAttributeDescriptions[i];
2017       assert(desc->location < MAX_VERTEX_ATTRIBS);
2018       if (desc->format == VK_FORMAT_B8G8R8A8_UNORM ||
2019           desc->format == VK_FORMAT_A2R10G10B10_UNORM_PACK32) {
2020          key->va_swap_rb_mask |= 1 << (VERT_ATTRIB_GENERIC0 + desc->location);
2021       }
2022    }
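   /* For example, a VK_FORMAT_B8G8R8A8_UNORM attribute at location 1 sets bit
    * (VERT_ATTRIB_GENERIC0 + 1) in va_swap_rb_mask above, so the compiler can
    * swap the red and blue channels when fetching that attribute.
    */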
2023 
2024    key->has_multiview = ri->view_mask != 0;
2025 }
2026 
2027 static void
2028 pipeline_populate_compute_key(struct v3dv_pipeline *pipeline,
2029                               struct v3dv_pipeline_key *key,
2030                               const VkComputePipelineCreateInfo *pCreateInfo)
2031 {
2032    struct v3dv_device *device = pipeline->device;
2033    assert(device);
2034 
2035    /* We use the same pipeline key for graphics and compute, but we don't need
2036     * to add a field to flag compute keys because this key is not used alone
2037     * to search the cache; we also use the SPIR-V or the serialized NIR, for
2038     * example, which already flags compute shaders.
2039     */
2040    memset(key, 0, sizeof(*key));
2041 }
2042 
2043 static struct v3dv_pipeline_shared_data *
2044 v3dv_pipeline_shared_data_new_empty(const unsigned char sha1_key[20],
2045                                     struct v3dv_pipeline *pipeline,
2046                                     bool is_graphics_pipeline)
2047 {
2048    /* We create new_entry using the device alloc. Right now shared_data is
2049     * referenced and unreferenced by both the pipeline and the pipeline cache,
2050     * so we can't ensure that the cache or pipeline alloc will be available on
2051     * the last unref.
2052     */
2053    struct v3dv_pipeline_shared_data *new_entry =
2054       vk_zalloc2(&pipeline->device->vk.alloc, NULL,
2055                  sizeof(struct v3dv_pipeline_shared_data), 8,
2056                  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2057 
2058    if (new_entry == NULL)
2059       return NULL;
2060 
2061    for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
2062       /* We don't need specific descriptor maps for binning stages; we use
2063        * the map for the render stage.
2064        */
2065       if (broadcom_shader_stage_is_binning(stage))
2066          continue;
2067 
2068       if ((is_graphics_pipeline && stage == BROADCOM_SHADER_COMPUTE) ||
2069           (!is_graphics_pipeline && stage != BROADCOM_SHADER_COMPUTE)) {
2070          continue;
2071       }
2072 
2073       if (stage == BROADCOM_SHADER_GEOMETRY &&
2074           !pipeline->stages[BROADCOM_SHADER_GEOMETRY]) {
2075          /* We always inject a custom GS if we have multiview */
2076          if (!pipeline->rendering_info.view_mask)
2077             continue;
2078       }
2079 
2080       struct v3dv_descriptor_maps *new_maps =
2081          vk_zalloc2(&pipeline->device->vk.alloc, NULL,
2082                     sizeof(struct v3dv_descriptor_maps), 8,
2083                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2084 
2085       if (new_maps == NULL)
2086          goto fail;
2087 
2088       new_entry->maps[stage] = new_maps;
2089    }
2090 
2091    new_entry->maps[BROADCOM_SHADER_VERTEX_BIN] =
2092       new_entry->maps[BROADCOM_SHADER_VERTEX];
2093 
2094    new_entry->maps[BROADCOM_SHADER_GEOMETRY_BIN] =
2095       new_entry->maps[BROADCOM_SHADER_GEOMETRY];
2096 
2097    new_entry->ref_cnt = 1;
2098    memcpy(new_entry->sha1_key, sha1_key, 20);
2099 
2100    return new_entry;
2101 
2102 fail:
2103    if (new_entry != NULL) {
2104       for (uint8_t stage = 0; stage < BROADCOM_SHADER_STAGES; stage++) {
2105          if (new_entry->maps[stage] != NULL)
2106             vk_free(&pipeline->device->vk.alloc, new_entry->maps[stage]);
2107       }
2108    }
2109 
2110    vk_free(&pipeline->device->vk.alloc, new_entry);
2111 
2112    return NULL;
2113 }
2114 
2115 static void
2116 write_creation_feedback(struct v3dv_pipeline *pipeline,
2117                         const void *next,
2118                         const VkPipelineCreationFeedback *pipeline_feedback,
2119                         uint32_t stage_count,
2120                         const VkPipelineShaderStageCreateInfo *stages)
2121 {
2122    const VkPipelineCreationFeedbackCreateInfo *create_feedback =
2123       vk_find_struct_const(next, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
2124 
2125    if (create_feedback) {
2126       typed_memcpy(create_feedback->pPipelineCreationFeedback,
2127              pipeline_feedback,
2128              1);
2129 
2130       const uint32_t feedback_stage_count =
2131          create_feedback->pipelineStageCreationFeedbackCount;
2132       assert(feedback_stage_count <= stage_count);
2133 
2134       for (uint32_t i = 0; i < feedback_stage_count; i++) {
2135          gl_shader_stage s = vk_to_mesa_shader_stage(stages[i].stage);
2136          enum broadcom_shader_stage bs = gl_shader_stage_to_broadcom(s);
2137 
2138          create_feedback->pPipelineStageCreationFeedbacks[i] =
2139             pipeline->stages[bs]->feedback;
2140 
2141          if (broadcom_shader_stage_is_render_with_binning(bs)) {
2142             enum broadcom_shader_stage bs_bin =
2143                broadcom_binning_shader_stage_for_render_stage(bs);
2144             create_feedback->pPipelineStageCreationFeedbacks[i].duration +=
2145                pipeline->stages[bs_bin]->feedback.duration;
2146          }
2147       }
2148    }
2149 }
2150 
2151 /* Note that although PrimitiveTopology is now dynamic, it is still safe to
2152  * compute the gs_input/output_primitive from the topology saved at the
2153  * pipeline, as the topology class will not change, because we don't support
2154  * dynamicPrimitiveTopologyUnrestricted
2155  */
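/* For example, a pipeline created with a TRIANGLE_STRIP topology maps to
 * MESA_PRIM_TRIANGLES as the GS input primitive and MESA_PRIM_TRIANGLE_STRIP
 * as the GS output primitive in the helpers below.
 */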
2156 static enum mesa_prim
2157 multiview_gs_input_primitive_from_pipeline(struct v3dv_pipeline *pipeline)
2158 {
2159    switch (pipeline->topology) {
2160    case MESA_PRIM_POINTS:
2161       return MESA_PRIM_POINTS;
2162    case MESA_PRIM_LINES:
2163    case MESA_PRIM_LINE_STRIP:
2164       return MESA_PRIM_LINES;
2165    case MESA_PRIM_TRIANGLES:
2166    case MESA_PRIM_TRIANGLE_STRIP:
2167    case MESA_PRIM_TRIANGLE_FAN:
2168       return MESA_PRIM_TRIANGLES;
2169    default:
2170       /* Since we don't allow GS with multiview, we can only see non-adjacency
2171        * primitives.
2172        */
2173       unreachable("Unexpected pipeline primitive type");
2174    }
2175 }
2176 
2177 static enum mesa_prim
2178 multiview_gs_output_primitive_from_pipeline(struct v3dv_pipeline *pipeline)
2179 {
2180    switch (pipeline->topology) {
2181    case MESA_PRIM_POINTS:
2182       return MESA_PRIM_POINTS;
2183    case MESA_PRIM_LINES:
2184    case MESA_PRIM_LINE_STRIP:
2185       return MESA_PRIM_LINE_STRIP;
2186    case MESA_PRIM_TRIANGLES:
2187    case MESA_PRIM_TRIANGLE_STRIP:
2188    case MESA_PRIM_TRIANGLE_FAN:
2189       return MESA_PRIM_TRIANGLE_STRIP;
2190    default:
2191       /* Since we don't allow GS with multiview, we can only see non-adjacency
2192        * primitives.
2193        */
2194       unreachable("Unexpected pipeline primitive type");
2195    }
2196 }
2197 
2198 static bool
2199 pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline,
2200                           struct v3dv_pipeline_cache *cache,
2201                           const VkAllocationCallbacks *pAllocator)
2202 {
2203    /* Create the passthrough GS from the VS output interface */
2204    struct v3dv_pipeline_stage *p_stage_vs = pipeline->stages[BROADCOM_SHADER_VERTEX];
2205    p_stage_vs->nir = pipeline_stage_get_nir(p_stage_vs, pipeline, cache);
2206    nir_shader *vs_nir = p_stage_vs->nir;
2207 
2208    const nir_shader_compiler_options *options =
2209       v3dv_pipeline_get_nir_options(&pipeline->device->devinfo);
2210    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_GEOMETRY, options,
2211                                                   "multiview broadcast gs");
2212    nir_shader *nir = b.shader;
2213    nir->info.inputs_read = vs_nir->info.outputs_written;
2214    nir->info.outputs_written = vs_nir->info.outputs_written |
2215                                (1ull << VARYING_SLOT_LAYER);
2216 
2217    uint32_t vertex_count = mesa_vertices_per_prim(pipeline->topology);
2218    nir->info.gs.input_primitive =
2219       multiview_gs_input_primitive_from_pipeline(pipeline);
2220    nir->info.gs.output_primitive =
2221       multiview_gs_output_primitive_from_pipeline(pipeline);
2222    nir->info.gs.vertices_in = vertex_count;
2223    nir->info.gs.vertices_out = nir->info.gs.vertices_in;
2224    nir->info.gs.invocations = 1;
2225    nir->info.gs.active_stream_mask = 0x1;
2226 
2227    /* Make a list of GS input/output variables from the VS outputs */
2228    nir_variable *in_vars[100];
2229    nir_variable *out_vars[100];
2230    uint32_t var_count = 0;
2231    nir_foreach_shader_out_variable(out_vs_var, vs_nir) {
2232       char name[8];
2233       snprintf(name, ARRAY_SIZE(name), "in_%d", var_count);
2234 
2235       in_vars[var_count] =
2236          nir_variable_create(nir, nir_var_shader_in,
2237                              glsl_array_type(out_vs_var->type, vertex_count, 0),
2238                              name);
2239       in_vars[var_count]->data.location = out_vs_var->data.location;
2240       in_vars[var_count]->data.location_frac = out_vs_var->data.location_frac;
2241       in_vars[var_count]->data.interpolation = out_vs_var->data.interpolation;
2242 
2243       snprintf(name, ARRAY_SIZE(name), "out_%d", var_count);
2244       out_vars[var_count] =
2245          nir_variable_create(nir, nir_var_shader_out, out_vs_var->type, name);
2246       out_vars[var_count]->data.location = out_vs_var->data.location;
2247       out_vars[var_count]->data.interpolation = out_vs_var->data.interpolation;
2248 
2249       var_count++;
2250    }
2251 
2252    /* Add the gl_Layer output variable */
2253    nir_variable *out_layer =
2254       nir_variable_create(nir, nir_var_shader_out, glsl_int_type(),
2255                           "out_Layer");
2256    out_layer->data.location = VARYING_SLOT_LAYER;
2257 
2258    /* Get the view index value that we will write to gl_Layer */
2259    nir_def *layer =
2260       nir_load_system_value(&b, nir_intrinsic_load_view_index, 0, 1, 32);
2261 
2262    /* Emit all output vertices */
2263    for (uint32_t vi = 0; vi < vertex_count; vi++) {
2264       /* Emit all output varyings */
2265       for (uint32_t i = 0; i < var_count; i++) {
2266          nir_deref_instr *in_value =
2267             nir_build_deref_array_imm(&b, nir_build_deref_var(&b, in_vars[i]), vi);
2268          nir_copy_deref(&b, nir_build_deref_var(&b, out_vars[i]), in_value);
2269       }
2270 
2271       /* Emit gl_Layer write */
2272       nir_store_var(&b, out_layer, layer, 0x1);
2273 
2274       nir_emit_vertex(&b, 0);
2275    }
2276    nir_end_primitive(&b, 0);
2277 
2278    /* Make sure we run our pre-process NIR passes so we produce NIR compatible
2279     * with what we expect from SPIR-V modules.
2280     */
2281    preprocess_nir(nir);
2282 
2283    /* Attach the geometry shader to the pipeline */
2284    struct v3dv_device *device = pipeline->device;
2285    struct v3dv_physical_device *physical_device = device->pdevice;
2286 
2287    struct v3dv_pipeline_stage *p_stage =
2288       vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*p_stage), 8,
2289                  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2290 
2291    if (p_stage == NULL) {
2292       ralloc_free(nir);
2293       return false;
2294    }
2295 
2296    p_stage->pipeline = pipeline;
2297    p_stage->stage = BROADCOM_SHADER_GEOMETRY;
2298    p_stage->entrypoint = "main";
2299    p_stage->module = NULL;
2300    p_stage->module_info = NULL;
2301    p_stage->nir = nir;
2302    pipeline_compute_sha1_from_nir(p_stage);
2303    p_stage->program_id = p_atomic_inc_return(&physical_device->next_program_id);
2304    p_stage->robustness = pipeline->stages[BROADCOM_SHADER_VERTEX]->robustness;
2305 
2306    pipeline->has_gs = true;
2307    pipeline->stages[BROADCOM_SHADER_GEOMETRY] = p_stage;
2308    pipeline->active_stages |= MESA_SHADER_GEOMETRY;
2309 
2310    pipeline->stages[BROADCOM_SHADER_GEOMETRY_BIN] =
2311       pipeline_stage_create_binning(p_stage, pAllocator);
2312    if (pipeline->stages[BROADCOM_SHADER_GEOMETRY_BIN] == NULL)
2313       return false;
2314 
2315    return true;
2316 }
2317 
2318 static void
2319 pipeline_check_buffer_device_address(struct v3dv_pipeline *pipeline)
2320 {
2321    for (int i = BROADCOM_SHADER_VERTEX; i < BROADCOM_SHADER_STAGES; i++) {
2322       struct v3dv_shader_variant *variant = pipeline->shared_data->variants[i];
2323       if (variant && variant->prog_data.base->has_global_address) {
2324          pipeline->uses_buffer_device_address = true;
2325          return;
2326       }
2327    }
2328 
2329    pipeline->uses_buffer_device_address = false;
2330 }
2331 
2332 /*
2333  * Compiles a pipeline. Note that it also allocates internal objects, but if
2334  * some allocations succeed while others fail, the method does not free the
2335  * successful ones.
2336  *
2337  * This is done to simplify the code: in that case we just call the pipeline
2338  * destroy method, which handles freeing the internal objects that were
2339  * allocated. We just need to be careful to set to NULL the objects that
2340  * were not allocated.
2341  */
2342 static VkResult
2343 pipeline_compile_graphics(struct v3dv_pipeline *pipeline,
2344                           struct v3dv_pipeline_cache *cache,
2345                           const VkGraphicsPipelineCreateInfo *pCreateInfo,
2346                           const VkAllocationCallbacks *pAllocator)
2347 {
2348    VkPipelineCreationFeedback pipeline_feedback = {
2349       .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
2350    };
2351    int64_t pipeline_start = os_time_get_nano();
2352 
2353    struct v3dv_device *device = pipeline->device;
2354    struct v3dv_physical_device *physical_device = device->pdevice;
2355 
2356    /* First pass to get some common info from the shader, and create the
2357     * individual pipeline_stage objects
2358     */
2359    for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
2360       const VkPipelineShaderStageCreateInfo *sinfo = &pCreateInfo->pStages[i];
2361       gl_shader_stage stage = vk_to_mesa_shader_stage(sinfo->stage);
2362 
2363       struct v3dv_pipeline_stage *p_stage =
2364          vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*p_stage), 8,
2365                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2366 
2367       if (p_stage == NULL)
2368          return VK_ERROR_OUT_OF_HOST_MEMORY;
2369 
2370       p_stage->program_id =
2371          p_atomic_inc_return(&physical_device->next_program_id);
2372 
2373       enum broadcom_shader_stage broadcom_stage =
2374          gl_shader_stage_to_broadcom(stage);
2375 
2376       p_stage->pipeline = pipeline;
2377       p_stage->stage = broadcom_stage;
2378       p_stage->entrypoint = sinfo->pName;
2379       p_stage->module = vk_shader_module_from_handle(sinfo->module);
2380       p_stage->spec_info = sinfo->pSpecializationInfo;
2381       if (!p_stage->module) {
2382          p_stage->module_info =
2383             vk_find_struct_const(sinfo->pNext, SHADER_MODULE_CREATE_INFO);
2384       }
2385 
2386       vk_pipeline_robustness_state_fill(&device->vk, &p_stage->robustness,
2387                                         pCreateInfo->pNext, sinfo->pNext);
2388 
2389       vk_pipeline_hash_shader_stage(pipeline->flags,
2390                                     &pCreateInfo->pStages[i],
2391                                     &p_stage->robustness,
2392                                     p_stage->shader_sha1);
2393 
2394       pipeline->active_stages |= sinfo->stage;
2395 
2396       /* We will try to get the compiled shader variant directly, so let's
2397        * not worry about getting the nir shader for now.
2398        */
2399       p_stage->nir = NULL;
2400       pipeline->stages[broadcom_stage] = p_stage;
2401       if (broadcom_stage == BROADCOM_SHADER_GEOMETRY)
2402          pipeline->has_gs = true;
2403 
2404       if (broadcom_shader_stage_is_render_with_binning(broadcom_stage)) {
2405          enum broadcom_shader_stage broadcom_stage_bin =
2406             broadcom_binning_shader_stage_for_render_stage(broadcom_stage);
2407 
2408          pipeline->stages[broadcom_stage_bin] =
2409             pipeline_stage_create_binning(p_stage, pAllocator);
2410 
2411          if (pipeline->stages[broadcom_stage_bin] == NULL)
2412             return VK_ERROR_OUT_OF_HOST_MEMORY;
2413       }
2414    }
2415 
2416    /* Add a no-op fragment shader if needed */
2417    if (!pipeline->stages[BROADCOM_SHADER_FRAGMENT]) {
2418       const nir_shader_compiler_options *compiler_options =
2419          v3dv_pipeline_get_nir_options(&pipeline->device->devinfo);
2420       nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
2421                                                      compiler_options,
2422                                                      "noop_fs");
2423 
2424       struct v3dv_pipeline_stage *p_stage =
2425          vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*p_stage), 8,
2426                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2427 
2428       if (p_stage == NULL)
2429          return VK_ERROR_OUT_OF_HOST_MEMORY;
2430 
2431       p_stage->pipeline = pipeline;
2432       p_stage->stage = BROADCOM_SHADER_FRAGMENT;
2433       p_stage->entrypoint = "main";
2434       p_stage->module = NULL;
2435       p_stage->module_info = NULL;
2436       p_stage->nir = b.shader;
2437       vk_pipeline_robustness_state_fill(&device->vk, &p_stage->robustness,
2438                                         NULL, NULL);
2439       pipeline_compute_sha1_from_nir(p_stage);
2440       p_stage->program_id =
2441          p_atomic_inc_return(&physical_device->next_program_id);
2442 
2443       pipeline->stages[BROADCOM_SHADER_FRAGMENT] = p_stage;
2444       pipeline->active_stages |= MESA_SHADER_FRAGMENT;
2445    }
2446 
2447    /* If multiview is enabled, we inject a custom passthrough geometry shader
2448     * to broadcast draw calls to the appropriate views.
2449     */
2450    const uint32_t view_mask = pipeline->rendering_info.view_mask;
2451    assert(!view_mask ||
2452           (!pipeline->has_gs && !pipeline->stages[BROADCOM_SHADER_GEOMETRY]));
2453    if (view_mask) {
2454       if (!pipeline_add_multiview_gs(pipeline, cache, pAllocator))
2455          return VK_ERROR_OUT_OF_HOST_MEMORY;
2456    }
2457 
2458    /* First we try to get the variants from the pipeline cache (unless we are
2459     * required to capture internal representations, since in that case we need
2460     * to compile).
2461     */
2462    bool needs_executable_info =
2463       pipeline->flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
2464    if (!needs_executable_info) {
2465       struct v3dv_pipeline_key pipeline_key;
2466       pipeline_populate_graphics_key(pipeline, &pipeline_key, pCreateInfo);
2467       pipeline_hash_graphics(pipeline, &pipeline_key, pipeline->sha1);
2468 
2469       bool cache_hit = false;
2470 
2471       pipeline->shared_data =
2472          v3dv_pipeline_cache_search_for_pipeline(cache,
2473                                                  pipeline->sha1,
2474                                                  &cache_hit);
2475 
2476       if (pipeline->shared_data != NULL) {
2477          /* A correct pipeline must have at least a VS and FS */
2478          assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX]);
2479          assert(pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]);
2480          assert(pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]);
2481          assert(!pipeline->stages[BROADCOM_SHADER_GEOMETRY] ||
2482                 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY]);
2483          assert(!pipeline->stages[BROADCOM_SHADER_GEOMETRY] ||
2484                 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]);
2485 
2486          if (cache_hit && cache != &pipeline->device->default_pipeline_cache)
2487             pipeline_feedback.flags |=
2488                VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
2489 
2490          goto success;
2491       }
2492    }
2493 
2494    if (pipeline->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT)
2495       return VK_PIPELINE_COMPILE_REQUIRED;
2496 
2497    /* Otherwise we try to get the NIR shaders (either from the original SPIR-V
2498     * shader or the pipeline cache) and compile.
2499     */
2500    pipeline->shared_data =
2501       v3dv_pipeline_shared_data_new_empty(pipeline->sha1, pipeline, true);
2502    if (!pipeline->shared_data)
2503       return VK_ERROR_OUT_OF_HOST_MEMORY;
2504 
2505    struct v3dv_pipeline_stage *p_stage_vs = pipeline->stages[BROADCOM_SHADER_VERTEX];
2506    struct v3dv_pipeline_stage *p_stage_fs = pipeline->stages[BROADCOM_SHADER_FRAGMENT];
2507    struct v3dv_pipeline_stage *p_stage_gs = pipeline->stages[BROADCOM_SHADER_GEOMETRY];
2508 
2509    p_stage_vs->feedback.flags |=
2510       VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
2511    if (p_stage_gs)
2512       p_stage_gs->feedback.flags |=
2513          VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
2514    p_stage_fs->feedback.flags |=
2515       VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
2516 
2517    if (!p_stage_vs->nir)
2518       p_stage_vs->nir = pipeline_stage_get_nir(p_stage_vs, pipeline, cache);
2519    if (p_stage_gs && !p_stage_gs->nir)
2520       p_stage_gs->nir = pipeline_stage_get_nir(p_stage_gs, pipeline, cache);
2521    if (!p_stage_fs->nir)
2522       p_stage_fs->nir = pipeline_stage_get_nir(p_stage_fs, pipeline, cache);
2523 
2524    /* Linking + pipeline lowerings */
2525    if (p_stage_gs) {
2526       link_shaders(p_stage_gs->nir, p_stage_fs->nir);
2527       link_shaders(p_stage_vs->nir, p_stage_gs->nir);
2528    } else {
2529       link_shaders(p_stage_vs->nir, p_stage_fs->nir);
2530    }
2531 
2532    pipeline_lower_nir(pipeline, p_stage_fs, pipeline->layout);
2533    lower_fs_io(p_stage_fs->nir);
2534 
2535    if (p_stage_gs) {
2536       pipeline_lower_nir(pipeline, p_stage_gs, pipeline->layout);
2537       lower_gs_io(p_stage_gs->nir);
2538    }
2539 
2540    pipeline_lower_nir(pipeline, p_stage_vs, pipeline->layout);
2541    lower_vs_io(p_stage_vs->nir);
2542 
2543    /* Compiling to vir */
2544    VkResult vk_result;
2545 
2546    /* We should have gotten either all the variants or none from the cache */
2547    assert(!pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]);
2548    vk_result = pipeline_compile_fragment_shader(pipeline, pAllocator,
2549                                                 pCreateInfo);
2550    if (vk_result != VK_SUCCESS)
2551       return vk_result;
2552 
2553    assert(!pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY] &&
2554           !pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN]);
2555 
2556    if (p_stage_gs) {
2557       vk_result =
2558          pipeline_compile_geometry_shader(pipeline, pAllocator, pCreateInfo);
2559       if (vk_result != VK_SUCCESS)
2560          return vk_result;
2561    }
2562 
2563    assert(!pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX] &&
2564           !pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]);
2565 
2566    vk_result = pipeline_compile_vertex_shader(pipeline, pAllocator, pCreateInfo);
2567    if (vk_result != VK_SUCCESS)
2568       return vk_result;
2569 
2570    if (!upload_assembly(pipeline))
2571       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2572 
2573    v3dv_pipeline_cache_upload_pipeline(pipeline, cache);
2574 
2575  success:
2576 
2577    pipeline_check_buffer_device_address(pipeline);
2578 
2579    pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
2580    write_creation_feedback(pipeline,
2581                            pCreateInfo->pNext,
2582                            &pipeline_feedback,
2583                            pCreateInfo->stageCount,
2584                            pCreateInfo->pStages);
2585 
2586    /* Since we have the variants in the pipeline shared data we can now free
2587     * the pipeline stages.
2588     */
2589    if (!needs_executable_info)
2590       pipeline_free_stages(device, pipeline, pAllocator);
2591 
2592    pipeline_check_spill_size(pipeline);
2593 
2594    return compute_vpm_config(pipeline);
2595 }
2596 
2597 static VkResult
2598 compute_vpm_config(struct v3dv_pipeline *pipeline)
2599 {
2600    struct v3dv_shader_variant *vs_variant =
2601       pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
2602    struct v3dv_shader_variant *vs_bin_variant =
2603       pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
2604    struct v3d_vs_prog_data *vs = vs_variant->prog_data.vs;
2605    struct v3d_vs_prog_data *vs_bin = vs_bin_variant->prog_data.vs;
2606 
2607    struct v3d_gs_prog_data *gs = NULL;
2608    struct v3d_gs_prog_data *gs_bin = NULL;
2609    if (pipeline->has_gs) {
2610       struct v3dv_shader_variant *gs_variant =
2611          pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY];
2612       struct v3dv_shader_variant *gs_bin_variant =
2613          pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
2614       gs = gs_variant->prog_data.gs;
2615       gs_bin = gs_bin_variant->prog_data.gs;
2616    }
2617 
2618    if (!v3d_compute_vpm_config(&pipeline->device->devinfo,
2619                                vs_bin, vs, gs_bin, gs,
2620                                &pipeline->vpm_cfg_bin,
2621                                &pipeline->vpm_cfg)) {
2622       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2623    }
2624 
2625    return VK_SUCCESS;
2626 }
2627 
2628 static bool
2629 stencil_op_is_no_op(struct vk_stencil_test_face_state *stencil)
2630 {
2631    return stencil->op.depth_fail == VK_STENCIL_OP_KEEP &&
2632           stencil->op.compare == VK_COMPARE_OP_ALWAYS;
2633 }
2634 
2635 /* Computes the ez_state based on a given vk_dynamic_graphics_state.  Note
2636  * that the parameter dyn doesn't need to be pipeline->dynamic_graphics_state,
2637  * as this method can be used by the cmd_buffer too.
2638  */
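/* For example, a LESS or LESS_OR_EQUAL depth test with stencil disabled and
 * no FS depth writes results in V3D_EZ_LT_LE, while an EQUAL or NEVER test
 * leaves the direction undecided (V3D_EZ_UNDECIDED).
 */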
2639 void
2640 v3dv_compute_ez_state(struct vk_dynamic_graphics_state *dyn,
2641                       struct v3dv_pipeline *pipeline,
2642                       enum v3dv_ez_state *ez_state,
2643                       bool *incompatible_ez_test)
2644 {
2645    if (!dyn->ds.depth.test_enable)  {
2646       *ez_state = V3D_EZ_DISABLED;
2647       return;
2648    }
2649 
2650    switch (dyn->ds.depth.compare_op) {
2651    case VK_COMPARE_OP_LESS:
2652    case VK_COMPARE_OP_LESS_OR_EQUAL:
2653       *ez_state = V3D_EZ_LT_LE;
2654       break;
2655    case VK_COMPARE_OP_GREATER:
2656    case VK_COMPARE_OP_GREATER_OR_EQUAL:
2657       *ez_state = V3D_EZ_GT_GE;
2658       break;
2659    case VK_COMPARE_OP_NEVER:
2660    case VK_COMPARE_OP_EQUAL:
2661       *ez_state = V3D_EZ_UNDECIDED;
2662       break;
2663    default:
2664       *ez_state = V3D_EZ_DISABLED;
2665       *incompatible_ez_test = true;
2666       break;
2667    }
2668 
2669    /* If stencil is enabled and is not a no-op, we need to disable EZ */
2670    if (dyn->ds.stencil.test_enable &&
2671        (!stencil_op_is_no_op(&dyn->ds.stencil.front) ||
2672         !stencil_op_is_no_op(&dyn->ds.stencil.back))) {
2673       *ez_state = V3D_EZ_DISABLED;
2674    }
2675 
2676    /* If the FS writes Z, then it may update against the chosen EZ direction */
2677    struct v3dv_shader_variant *fs_variant =
2678       pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
2679    if (fs_variant && fs_variant->prog_data.fs->writes_z &&
2680        !fs_variant->prog_data.fs->writes_z_from_fep) {
2681       *ez_state = V3D_EZ_DISABLED;
2682    }
2683 }
2684 
2685 
2686 static void
2687 pipeline_set_sample_mask(struct v3dv_pipeline *pipeline,
2688                          const VkPipelineMultisampleStateCreateInfo *ms_info)
2689 {
2690    pipeline->sample_mask = (1 << V3D_MAX_SAMPLES) - 1;
2691 
2692    /* Ignore pSampleMask if we are not enabling multisampling. The hardware
2693     * requires this to be 0xf or 0x0 if using a single sample.
2694     */
2695    if (ms_info && ms_info->pSampleMask &&
2696        ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT) {
2697       pipeline->sample_mask &= ms_info->pSampleMask[0];
2698    }
2699 }
2700 
2701 static void
2702 pipeline_set_sample_rate_shading(struct v3dv_pipeline *pipeline,
2703                                  const VkPipelineMultisampleStateCreateInfo *ms_info)
2704 {
2705    pipeline->sample_rate_shading =
2706       ms_info && ms_info->rasterizationSamples > VK_SAMPLE_COUNT_1_BIT &&
2707       ms_info->sampleShadingEnable;
2708 }
2709 
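/* Fills pipeline->rendering_info either from the render pass / subpass the
 * pipeline was created against, or from a chained
 * VkPipelineRenderingCreateInfo (dynamic rendering). If neither is present,
 * we use the defaults mandated by the spec (no attachments, view mask 0).
 */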
2710 static void
2711 pipeline_setup_rendering_info(struct v3dv_device *device,
2712                               struct v3dv_pipeline *pipeline,
2713                               const VkGraphicsPipelineCreateInfo *pCreateInfo,
2714                               const VkAllocationCallbacks *alloc)
2715 {
2716    struct vk_render_pass_state *rp = &pipeline->rendering_info;
2717 
2718    if (pipeline->pass) {
2719       assert(pipeline->subpass);
2720       struct v3dv_render_pass *pass = pipeline->pass;
2721       struct v3dv_subpass *subpass = pipeline->subpass;
2722       const uint32_t attachment_idx = subpass->ds_attachment.attachment;
2723 
2724       rp->view_mask = subpass->view_mask;
2725 
2726       rp->depth_attachment_format = VK_FORMAT_UNDEFINED;
2727       rp->stencil_attachment_format = VK_FORMAT_UNDEFINED;
2728       rp->attachments = MESA_VK_RP_ATTACHMENT_NONE;
2729       if (attachment_idx != VK_ATTACHMENT_UNUSED) {
2730          VkFormat ds_format = pass->attachments[attachment_idx].desc.format;
2731          if (vk_format_has_depth(ds_format)) {
2732             rp->depth_attachment_format = ds_format;
2733             rp->attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
2734          }
2735          if (vk_format_has_stencil(ds_format)) {
2736             rp->stencil_attachment_format = ds_format;
2737             rp->attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
2738          }
2739       }
2740 
2741       rp->color_attachment_count = subpass->color_count;
2742       for (uint32_t i = 0; i < subpass->color_count; i++) {
2743          const uint32_t attachment_idx = subpass->color_attachments[i].attachment;
2744          if (attachment_idx == VK_ATTACHMENT_UNUSED) {
2745             rp->color_attachment_formats[i] = VK_FORMAT_UNDEFINED;
2746             continue;
2747          }
2748          rp->color_attachment_formats[i] =
2749             pass->attachments[attachment_idx].desc.format;
2750          rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
2751       }
2752       return;
2753    }
2754 
2755    const VkPipelineRenderingCreateInfo *ri =
2756       vk_find_struct_const(pCreateInfo->pNext,
2757                            PIPELINE_RENDERING_CREATE_INFO);
2758    if (ri) {
2759       rp->view_mask = ri->viewMask;
2760 
2761       rp->color_attachment_count = ri->colorAttachmentCount;
2762       for (int i = 0; i < ri->colorAttachmentCount; i++) {
2763          rp->color_attachment_formats[i] = ri->pColorAttachmentFormats[i];
2764          if (rp->color_attachment_formats[i] != VK_FORMAT_UNDEFINED) {
2765             rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
2766          }
2767       }
2768 
2769       rp->depth_attachment_format = ri->depthAttachmentFormat;
2770       if (ri->depthAttachmentFormat != VK_FORMAT_UNDEFINED)
2771          rp->attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
2772 
2773       rp->stencil_attachment_format = ri->stencilAttachmentFormat;
2774       if (ri->stencilAttachmentFormat != VK_FORMAT_UNDEFINED)
2775          rp->attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
2776 
2777       return;
2778    }
2779 
2780    /* From the Vulkan spec for VkPipelineRenderingCreateInfo:
2781     *
2782     *    "if this structure is not specified, and the pipeline does not include
2783     *     a VkRenderPass, viewMask and colorAttachmentCount are 0, and
2784     *     depthAttachmentFormat and stencilAttachmentFormat are
2785     *     VK_FORMAT_UNDEFINED."
2786     */
2787    pipeline->rendering_info = (struct vk_render_pass_state) {
2788       .view_mask = 0,
2789       .attachments = 0,
2790       .color_attachment_count = 0,
2791       .depth_attachment_format = VK_FORMAT_UNDEFINED,
2792       .stencil_attachment_format = VK_FORMAT_UNDEFINED,
2793    };
2794 }
2795 
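/* Gathers static and dynamic state into vk_graphics_pipeline_state and
 * vk_dynamic_graphics_state, precomputes the viewport transform for
 * viewport 0 (we don't support multiViewport) and initializes the color
 * write enable mask, using 4 bits (RGBA) per render target.
 */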
2796 static VkResult
2797 pipeline_init_dynamic_state(struct v3dv_device *device,
2798                             struct v3dv_pipeline *pipeline,
2799                             struct vk_graphics_pipeline_all_state *pipeline_all_state,
2800                             struct vk_graphics_pipeline_state *pipeline_state,
2801                             const VkGraphicsPipelineCreateInfo *pCreateInfo)
2802 {
2803    VkResult result = VK_SUCCESS;
2804    result = vk_graphics_pipeline_state_fill(&pipeline->device->vk, pipeline_state,
2805                                             pCreateInfo, &pipeline->rendering_info, 0,
2806                                             pipeline_all_state, NULL, 0, NULL);
2807    if (result != VK_SUCCESS)
2808       return result;
2809 
2810    vk_dynamic_graphics_state_fill(&pipeline->dynamic_graphics_state, pipeline_state);
2811 
2812    struct v3dv_dynamic_state *v3dv_dyn = &pipeline->dynamic;
2813    struct vk_dynamic_graphics_state *dyn = &pipeline->dynamic_graphics_state;
2814 
2815    if (BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
2816        BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_VP_SCISSORS)) {
2817       /* FIXME: right now we don't support multiViewport, so viewports[0] is
2818        * enough, but this would need to change if we allow multiple viewports.
2819        */
2820       v3d_X((&device->devinfo), viewport_compute_xform)(&dyn->vp.viewports[0],
2821                                              v3dv_dyn->viewport.scale[0],
2822                                              v3dv_dyn->viewport.translate[0]);
2823 
2824    }
2825 
2826    v3dv_dyn->color_write_enable =
2827       (1ull << (4 * V3D_MAX_RENDER_TARGETS(device->devinfo.ver))) - 1;
2828    if (pipeline_state->cb) {
2829       const uint8_t color_writes = pipeline_state->cb->color_write_enables;
2830       v3dv_dyn->color_write_enable = 0;
2831       for (uint32_t i = 0; i < pipeline_state->cb->attachment_count; i++) {
2832          v3dv_dyn->color_write_enable |=
2833             (color_writes & BITFIELD_BIT(i)) ? (0xfu << (i * 4)) : 0;
2834       }
2835    }
2836 
2837    return result;
2838 }
2839 
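/* Graphics pipeline initialization: sets up the layout, render pass or
 * dynamic rendering info and dynamic state, packs the fixed-function state
 * for the target hardware version, compiles the shader stages and finally
 * derives state that depends on the compiled shaders, such as the default
 * attribute values and the early-Z configuration.
 */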
2840 static VkResult
2841 pipeline_init(struct v3dv_pipeline *pipeline,
2842               struct v3dv_device *device,
2843               struct v3dv_pipeline_cache *cache,
2844               const VkGraphicsPipelineCreateInfo *pCreateInfo,
2845               const VkAllocationCallbacks *pAllocator)
2846 {
2847    VkResult result = VK_SUCCESS;
2848 
2849    pipeline->device = device;
2850 
2851    V3DV_FROM_HANDLE(v3dv_pipeline_layout, layout, pCreateInfo->layout);
2852    pipeline->layout = layout;
2853    v3dv_pipeline_layout_ref(pipeline->layout);
2854 
2855    V3DV_FROM_HANDLE(v3dv_render_pass, render_pass, pCreateInfo->renderPass);
2856    if (render_pass) {
2857       assert(pCreateInfo->subpass < render_pass->subpass_count);
2858       pipeline->pass = render_pass;
2859       pipeline->subpass = &render_pass->subpasses[pCreateInfo->subpass];
2860    }
2861 
2862    pipeline_setup_rendering_info(device, pipeline, pCreateInfo, pAllocator);
2863 
2864    const VkPipelineInputAssemblyStateCreateInfo *ia_info =
2865       pCreateInfo->pInputAssemblyState;
2866    pipeline->topology = vk_to_mesa_prim[ia_info->topology];
2867 
2868    struct vk_graphics_pipeline_all_state all;
2869    struct vk_graphics_pipeline_state pipeline_state = { };
2870    result = pipeline_init_dynamic_state(device, pipeline, &all, &pipeline_state,
2871                                         pCreateInfo);
2872 
2873    if (result != VK_SUCCESS) {
2874       /* The caller will destroy the pipeline, and we didn't allocate any
2875        * extra info, so there is nothing else to clean up here.
2876        */
2877       return result;
2878    }
2879 
2880    /* If rasterization is disabled, we just disable it through the CFG_BITS
2881     * packet, so for building the pipeline we always assume it is enabled
2882     */
2883    const bool raster_enabled =
2884       (pipeline_state.rs && !pipeline_state.rs->rasterizer_discard_enable) ||
2885       BITSET_TEST(pipeline_state.dynamic, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE);
2886 
2887    pipeline->rasterization_enabled = raster_enabled;
2888 
2889    const VkPipelineViewportStateCreateInfo *vp_info =
2890       raster_enabled ? pCreateInfo->pViewportState : NULL;
2891 
2892    const VkPipelineDepthStencilStateCreateInfo *ds_info =
2893       raster_enabled ? pCreateInfo->pDepthStencilState : NULL;
2894 
2895    const VkPipelineRasterizationStateCreateInfo *rs_info =
2896       raster_enabled ? pCreateInfo->pRasterizationState : NULL;
2897 
2898    const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *pv_info =
2899       raster_enabled ? vk_find_struct_const(
2900          rs_info->pNext,
2901          PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT) :
2902             NULL;
2903 
2904    const VkPipelineRasterizationLineStateCreateInfoEXT *ls_info =
2905       raster_enabled ? vk_find_struct_const(
2906          rs_info->pNext,
2907          PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT) :
2908             NULL;
2909 
2910    const VkPipelineColorBlendStateCreateInfo *cb_info =
2911       raster_enabled ? pCreateInfo->pColorBlendState : NULL;
2912 
2913    const VkPipelineMultisampleStateCreateInfo *ms_info =
2914       raster_enabled ? pCreateInfo->pMultisampleState : NULL;
2915 
2916    const VkPipelineViewportDepthClipControlCreateInfoEXT *depth_clip_control =
2917       vp_info ? vk_find_struct_const(vp_info->pNext,
2918                                      PIPELINE_VIEWPORT_DEPTH_CLIP_CONTROL_CREATE_INFO_EXT) :
2919                 NULL;
2920 
2921    if (depth_clip_control)
2922       pipeline->negative_one_to_one = depth_clip_control->negativeOneToOne;
2923 
2924    v3d_X((&device->devinfo), pipeline_pack_state)(pipeline, cb_info, ds_info,
2925                                        rs_info, pv_info, ls_info,
2926                                        ms_info,
2927                                        &pipeline_state);
2928 
2929    pipeline_set_sample_mask(pipeline, ms_info);
2930    pipeline_set_sample_rate_shading(pipeline, ms_info);
2931    pipeline->line_smooth = enable_line_smooth(pipeline, rs_info);
2932 
2933    result = pipeline_compile_graphics(pipeline, cache, pCreateInfo, pAllocator);
2934 
2935    if (result != VK_SUCCESS) {
2936       /* The caller will destroy the pipeline, and we didn't allocate any
2937        * extra info, so there is nothing else to clean up here.
2938        */
2939       return result;
2940    }
2941 
2942    const VkPipelineVertexInputStateCreateInfo *vi_info =
2943       pCreateInfo->pVertexInputState;
2944 
2945    const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info =
2946       vk_find_struct_const(vi_info->pNext,
2947                            PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);
2948 
2949    v3d_X((&device->devinfo), pipeline_pack_compile_state)(pipeline, vi_info, vd_info);
2950 
2951    if (v3d_X((&device->devinfo), pipeline_needs_default_attribute_values)(pipeline)) {
2952       pipeline->default_attribute_values =
2953          v3d_X((&pipeline->device->devinfo), create_default_attribute_values)(pipeline->device, pipeline);
2954 
2955       if (!pipeline->default_attribute_values)
2956          return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2957    } else {
2958       pipeline->default_attribute_values = NULL;
2959    }
2960 
2961    /* This must be done after the pipeline has been compiled */
2962    v3dv_compute_ez_state(&pipeline->dynamic_graphics_state,
2963                          pipeline,
2964                          &pipeline->ez_state,
2965                          &pipeline->incompatible_ez_test);
2966 
2967    return result;
2968 }
2969 
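/* If a VkPipelineCreateFlags2CreateInfoKHR (VK_KHR_maintenance5) is chained,
 * its 64-bit flags take precedence over the legacy flags field.
 */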
2970 static VkPipelineCreateFlagBits2KHR
2971 pipeline_create_info_get_flags(VkPipelineCreateFlags flags, const void *pNext)
2972 {
2973    const VkPipelineCreateFlags2CreateInfoKHR *flags2 =
2974       vk_find_struct_const(pNext, PIPELINE_CREATE_FLAGS_2_CREATE_INFO_KHR);
2975    if (flags2)
2976       return flags2->flags;
2977    else
2978       return flags;
2979 }
2980 
2981 static VkResult
2982 graphics_pipeline_create(VkDevice _device,
2983                          VkPipelineCache _cache,
2984                          const VkGraphicsPipelineCreateInfo *pCreateInfo,
2985                          const VkAllocationCallbacks *pAllocator,
2986                          VkPipeline *pPipeline,
2987                          VkPipelineCreateFlagBits2KHR *flags)
2988 {
2989    V3DV_FROM_HANDLE(v3dv_device, device, _device);
2990    V3DV_FROM_HANDLE(v3dv_pipeline_cache, cache, _cache);
2991 
2992    struct v3dv_pipeline *pipeline;
2993    VkResult result;
2994 
2995    *flags = pipeline_create_info_get_flags(pCreateInfo->flags,
2996                                            pCreateInfo->pNext);
2997 
2998    /* Use the default pipeline cache if none is specified */
2999    if (cache == NULL && device->instance->default_pipeline_cache_enabled)
3000       cache = &device->default_pipeline_cache;
3001 
3002    pipeline = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pipeline),
3003                                VK_OBJECT_TYPE_PIPELINE);
3004 
3005    if (pipeline == NULL)
3006       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
3007 
3008    pipeline->flags = *flags;
3009    result = pipeline_init(pipeline, device, cache, pCreateInfo, pAllocator);
3010 
3011    if (result != VK_SUCCESS) {
3012       v3dv_destroy_pipeline(pipeline, device, pAllocator);
3013       if (result == VK_PIPELINE_COMPILE_REQUIRED)
3014          *pPipeline = VK_NULL_HANDLE;
3015       return result;
3016    }
3017 
3018    *pPipeline = v3dv_pipeline_to_handle(pipeline);
3019 
3020    return VK_SUCCESS;
3021 }
3022 
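/* When shader debugging is enabled we serialize pipeline creation with the
 * physical device mutex, presumably so that shader dumps from different
 * threads don't interleave in the output.
 */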
3023 VKAPI_ATTR VkResult VKAPI_CALL
3024 v3dv_CreateGraphicsPipelines(VkDevice _device,
3025                              VkPipelineCache pipelineCache,
3026                              uint32_t count,
3027                              const VkGraphicsPipelineCreateInfo *pCreateInfos,
3028                              const VkAllocationCallbacks *pAllocator,
3029                              VkPipeline *pPipelines)
3030 {
3031    MESA_TRACE_FUNC();
3032    V3DV_FROM_HANDLE(v3dv_device, device, _device);
3033    VkResult result = VK_SUCCESS;
3034 
3035    if (V3D_DBG(SHADERS))
3036       mtx_lock(&device->pdevice->mutex);
3037 
3038    uint32_t i = 0;
3039    for (; i < count; i++) {
3040       VkResult local_result;
3041 
3042       VkPipelineCreateFlagBits2KHR flags;
3043       local_result = graphics_pipeline_create(_device,
3044                                               pipelineCache,
3045                                               &pCreateInfos[i],
3046                                               pAllocator,
3047                                               &pPipelines[i],
3048                                               &flags);
3049 
3050       if (local_result != VK_SUCCESS) {
3051          result = local_result;
3052          pPipelines[i] = VK_NULL_HANDLE;
3053          if (flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT)
3054             break;
3055       }
3056    }
3057 
3058    for (; i < count; i++)
3059       pPipelines[i] = VK_NULL_HANDLE;
3060 
3061    if (V3D_DBG(SHADERS))
3062       mtx_unlock(&device->pdevice->mutex);
3063 
3064    return result;
3065 }
3066 
3067 static void
3068 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
3069 {
3070    assert(glsl_type_is_vector_or_scalar(type));
3071 
3072    uint32_t comp_size = glsl_type_is_boolean(type)
3073       ? 4 : glsl_get_bit_size(type) / 8;
3074    unsigned length = glsl_get_vector_elements(type);
3075    *size = comp_size * length;
3076    *align = comp_size * (length == 3 ? 4 : length);
3077 }
3078 
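/* Compute-specific NIR lowering: assigns explicit offsets to shared-memory
 * variables (unless the shader already provides an explicit layout), lowers
 * shared-memory access to 32-bit offset addressing and lowers compute system
 * values, advertising support for a base workgroup id.
 */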
3079 static void
3080 lower_compute(struct nir_shader *nir)
3081 {
3082    if (!nir->info.shared_memory_explicit_layout) {
3083       NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
3084                nir_var_mem_shared, shared_type_info);
3085    }
3086 
3087    NIR_PASS(_, nir, nir_lower_explicit_io,
3088             nir_var_mem_shared, nir_address_format_32bit_offset);
3089 
3090    struct nir_lower_compute_system_values_options sysval_options = {
3091       .has_base_workgroup_id = true,
3092    };
3093    NIR_PASS_V(nir, nir_lower_compute_system_values, &sysval_options);
3094 }
3095 
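/* Compiles the single compute stage. We first look the shader up in the
 * pipeline cache using a hash of the stage and pipeline state, and only fall
 * back to building the NIR and compiling QPU code when there is no cache hit
 * (or when executable info capture forces a full compile).
 */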
3096 static VkResult
3097 pipeline_compile_compute(struct v3dv_pipeline *pipeline,
3098                          struct v3dv_pipeline_cache *cache,
3099                          const VkComputePipelineCreateInfo *info,
3100                          const VkAllocationCallbacks *alloc)
3101 {
3102    VkPipelineCreationFeedback pipeline_feedback = {
3103       .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
3104    };
3105    int64_t pipeline_start = os_time_get_nano();
3106 
3107    struct v3dv_device *device = pipeline->device;
3108    struct v3dv_physical_device *physical_device = device->pdevice;
3109 
3110    const VkPipelineShaderStageCreateInfo *sinfo = &info->stage;
3111    gl_shader_stage stage = vk_to_mesa_shader_stage(sinfo->stage);
3112 
3113    struct v3dv_pipeline_stage *p_stage =
3114       vk_zalloc2(&device->vk.alloc, alloc, sizeof(*p_stage), 8,
3115                  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
3116    if (!p_stage)
3117       return VK_ERROR_OUT_OF_HOST_MEMORY;
3118 
3119    p_stage->program_id = p_atomic_inc_return(&physical_device->next_program_id);
3120    p_stage->pipeline = pipeline;
3121    p_stage->stage = gl_shader_stage_to_broadcom(stage);
3122    p_stage->entrypoint = sinfo->pName;
3123    p_stage->module = vk_shader_module_from_handle(sinfo->module);
3124    p_stage->spec_info = sinfo->pSpecializationInfo;
3125    p_stage->feedback = (VkPipelineCreationFeedback) { 0 };
3126    if (!p_stage->module) {
3127       p_stage->module_info =
3128          vk_find_struct_const(sinfo->pNext, SHADER_MODULE_CREATE_INFO);
3129    }
3130 
3131    vk_pipeline_robustness_state_fill(&device->vk, &p_stage->robustness,
3132                                      info->pNext, sinfo->pNext);
3133 
3134    vk_pipeline_hash_shader_stage(pipeline->flags,
3135                                  &info->stage,
3136                                  &p_stage->robustness,
3137                                  p_stage->shader_sha1);
3138 
3139    p_stage->nir = NULL;
3140 
3141    pipeline->stages[BROADCOM_SHADER_COMPUTE] = p_stage;
3142    pipeline->active_stages |= sinfo->stage;
3143 
3144    /* First we try to get the variants from the pipeline cache (unless we are
3145     * required to capture internal representations, since in that case we need
3146     * to compile).
3147     */
3148    bool needs_executable_info =
3149       pipeline->flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
3150    if (!needs_executable_info) {
3151       struct v3dv_pipeline_key pipeline_key;
3152       pipeline_populate_compute_key(pipeline, &pipeline_key, info);
3153       pipeline_hash_compute(pipeline, &pipeline_key, pipeline->sha1);
3154 
3155       bool cache_hit = false;
3156       pipeline->shared_data =
3157          v3dv_pipeline_cache_search_for_pipeline(cache, pipeline->sha1, &cache_hit);
3158 
3159       if (pipeline->shared_data != NULL) {
3160          assert(pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);
3161          if (cache_hit && cache != &pipeline->device->default_pipeline_cache)
3162             pipeline_feedback.flags |=
3163                VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
3164 
3165          goto success;
3166       }
3167    }
3168 
3169    if (pipeline->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT)
3170       return VK_PIPELINE_COMPILE_REQUIRED;
3171 
3172    pipeline->shared_data = v3dv_pipeline_shared_data_new_empty(pipeline->sha1,
3173                                                                pipeline,
3174                                                                false);
3175    if (!pipeline->shared_data)
3176       return VK_ERROR_OUT_OF_HOST_MEMORY;
3177 
3178    p_stage->feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
3179 
3180    /* If not found on cache, compile it */
3181    p_stage->nir = pipeline_stage_get_nir(p_stage, pipeline, cache);
3182    assert(p_stage->nir);
3183 
3184    v3d_optimize_nir(NULL, p_stage->nir);
3185    pipeline_lower_nir(pipeline, p_stage, pipeline->layout);
3186    lower_compute(p_stage->nir);
3187 
3188    VkResult result = VK_SUCCESS;
3189 
3190    struct v3d_key key;
3191    memset(&key, 0, sizeof(key));
3192    pipeline_populate_v3d_key(&key, p_stage, 0);
3193    pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE] =
3194       pipeline_compile_shader_variant(p_stage, &key, sizeof(key),
3195                                       alloc, &result);
3196 
3197    if (result != VK_SUCCESS)
3198       return result;
3199 
3200    if (!upload_assembly(pipeline))
3201       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
3202 
3203    v3dv_pipeline_cache_upload_pipeline(pipeline, cache);
3204 
3205 success:
3206 
3207    pipeline_check_buffer_device_address(pipeline);
3208 
3209    pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
3210    write_creation_feedback(pipeline,
3211                            info->pNext,
3212                            &pipeline_feedback,
3213                            1,
3214                            &info->stage);
3215 
3216    /* Now that the variants are in pipeline->shared_data we no longer need
3217     * the pipeline stages after compiling.
3218     */
3219    if (!needs_executable_info)
3220       pipeline_free_stages(device, pipeline, alloc);
3221 
3222    pipeline_check_spill_size(pipeline);
3223 
3224    return VK_SUCCESS;
3225 }
3226 
3227 static VkResult
3228 compute_pipeline_init(struct v3dv_pipeline *pipeline,
3229                       struct v3dv_device *device,
3230                       struct v3dv_pipeline_cache *cache,
3231                       const VkComputePipelineCreateInfo *info,
3232                       const VkAllocationCallbacks *alloc)
3233 {
3234    V3DV_FROM_HANDLE(v3dv_pipeline_layout, layout, info->layout);
3235 
3236    pipeline->device = device;
3237    pipeline->layout = layout;
3238    v3dv_pipeline_layout_ref(pipeline->layout);
3239 
3240    VkResult result = pipeline_compile_compute(pipeline, cache, info, alloc);
3241    if (result != VK_SUCCESS)
3242       return result;
3243 
3244    return result;
3245 }
3246 
3247 static VkResult
3248 compute_pipeline_create(VkDevice _device,
3249                          VkPipelineCache _cache,
3250                          const VkComputePipelineCreateInfo *pCreateInfo,
3251                          const VkAllocationCallbacks *pAllocator,
3252                          VkPipeline *pPipeline,
3253                          VkPipelineCreateFlagBits2KHR *flags)
3254 {
3255    V3DV_FROM_HANDLE(v3dv_device, device, _device);
3256    V3DV_FROM_HANDLE(v3dv_pipeline_cache, cache, _cache);
3257 
3258    struct v3dv_pipeline *pipeline;
3259    VkResult result;
3260 
3261    *flags = pipeline_create_info_get_flags(pCreateInfo->flags,
3262                                            pCreateInfo->pNext);
3263 
3264    /* Use the default pipeline cache if none is specified */
3265    if (cache == NULL && device->instance->default_pipeline_cache_enabled)
3266       cache = &device->default_pipeline_cache;
3267 
3268    pipeline = vk_object_zalloc(&device->vk, pAllocator, sizeof(*pipeline),
3269                                VK_OBJECT_TYPE_PIPELINE);
3270    if (pipeline == NULL)
3271       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
3272 
3273    pipeline->flags = *flags;
3274    result = compute_pipeline_init(pipeline, device, cache,
3275                                   pCreateInfo, pAllocator);
3276    if (result != VK_SUCCESS) {
3277       v3dv_destroy_pipeline(pipeline, device, pAllocator);
3278       if (result == VK_PIPELINE_COMPILE_REQUIRED)
3279          *pPipeline = VK_NULL_HANDLE;
3280       return result;
3281    }
3282 
3283    *pPipeline = v3dv_pipeline_to_handle(pipeline);
3284 
3285    return VK_SUCCESS;
3286 }
3287 
3288 VKAPI_ATTR VkResult VKAPI_CALL
3289 v3dv_CreateComputePipelines(VkDevice _device,
3290                             VkPipelineCache pipelineCache,
3291                             uint32_t createInfoCount,
3292                             const VkComputePipelineCreateInfo *pCreateInfos,
3293                             const VkAllocationCallbacks *pAllocator,
3294                             VkPipeline *pPipelines)
3295 {
3296    MESA_TRACE_FUNC();
3297    V3DV_FROM_HANDLE(v3dv_device, device, _device);
3298    VkResult result = VK_SUCCESS;
3299 
3300    if (V3D_DBG(SHADERS))
3301       mtx_lock(&device->pdevice->mutex);
3302 
3303    uint32_t i = 0;
3304    for (; i < createInfoCount; i++) {
3305       VkResult local_result;
3306       VkPipelineCreateFlagBits2KHR flags;
3307       local_result = compute_pipeline_create(_device,
3308                                               pipelineCache,
3309                                               &pCreateInfos[i],
3310                                               pAllocator,
3311                                               &pPipelines[i],
3312                                               &flags);
3313 
3314       if (local_result != VK_SUCCESS) {
3315          result = local_result;
3316          pPipelines[i] = VK_NULL_HANDLE;
3317          if (flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT)
3318             break;
3319       }
3320    }
3321 
3322    for (; i < createInfoCount; i++)
3323       pPipelines[i] = VK_NULL_HANDLE;
3324 
3325    if (V3D_DBG(SHADERS))
3326       mtx_unlock(&device->pdevice->mutex);
3327 
3328    return result;
3329 }
3330 
3331 static nir_shader *
3332 pipeline_get_nir(struct v3dv_pipeline *pipeline,
3333                  enum broadcom_shader_stage stage)
3334 {
3335    assert(stage >= 0 && stage < BROADCOM_SHADER_STAGES);
3336    if (pipeline->stages[stage])
3337       return pipeline->stages[stage]->nir;
3338 
3339    return NULL;
3340 }
3341 
3342 static struct v3d_prog_data *
3343 pipeline_get_prog_data(struct v3dv_pipeline *pipeline,
3344                        enum broadcom_shader_stage stage)
3345 {
3346    if (pipeline->shared_data->variants[stage])
3347       return pipeline->shared_data->variants[stage]->prog_data.base;
3348    return NULL;
3349 }
3350 
3351 static uint64_t *
3352 pipeline_get_qpu(struct v3dv_pipeline *pipeline,
3353                  enum broadcom_shader_stage stage,
3354                  uint32_t *qpu_size)
3355 {
3356    struct v3dv_shader_variant *variant =
3357       pipeline->shared_data->variants[stage];
3358    if (!variant) {
3359       *qpu_size = 0;
3360       return NULL;
3361    }
3362 
3363    *qpu_size = variant->qpu_insts_size;
3364    return variant->qpu_insts;
3365 }
3366 
3367 /* FIXME: we use the same macro in various drivers, maybe move it to
3368  * the common vk_util.h?
3369  */
3370 #define WRITE_STR(field, ...) ({                                \
3371    memset(field, 0, sizeof(field));                             \
3372    UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \
3373    assert(_i > 0 && _i < sizeof(field));                        \
3374 })
3375 
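/* Copies a NUL-terminated IR string into a
 * VkPipelineExecutableInternalRepresentationKHR following the usual Vulkan
 * two-call idiom: with pData == NULL we just report the required size;
 * otherwise we copy as much as fits and return false if the caller's buffer
 * was too small.
 */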
3376 static bool
3377 write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
3378               const char *data)
3379 {
3380    ir->isText = VK_TRUE;
3381 
3382    size_t data_len = strlen(data) + 1;
3383 
3384    if (ir->pData == NULL) {
3385       ir->dataSize = data_len;
3386       return true;
3387    }
3388 
3389    strncpy(ir->pData, data, ir->dataSize);
3390    if (ir->dataSize < data_len)
3391       return false;
3392 
3393    ir->dataSize = data_len;
3394    return true;
3395 }
3396 
3397 static void
3398 append(char **str, size_t *offset, const char *fmt, ...)
3399 {
3400    va_list args;
3401    va_start(args, fmt);
3402    ralloc_vasprintf_rewrite_tail(str, offset, fmt, args);
3403    va_end(args);
3404 }
3405 
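/* Lazily builds the per-stage executable data (final NIR and disassembled
 * QPU text) used by the VK_KHR_pipeline_executable_properties queries. The
 * NIR/QPU strings are only generated when pipeline_keep_qpu() says so.
 */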
3406 static void
3407 pipeline_collect_executable_data(struct v3dv_pipeline *pipeline)
3408 {
3409    if (pipeline->executables.mem_ctx)
3410       return;
3411 
3412    pipeline->executables.mem_ctx = ralloc_context(NULL);
3413    util_dynarray_init(&pipeline->executables.data,
3414                       pipeline->executables.mem_ctx);
3415 
3416    /* Don't crash for failed/bogus pipelines */
3417    if (!pipeline->shared_data)
3418       return;
3419 
3420    for (int s = BROADCOM_SHADER_VERTEX; s <= BROADCOM_SHADER_COMPUTE; s++) {
3421       VkShaderStageFlags vk_stage =
3422          mesa_to_vk_shader_stage(broadcom_shader_stage_to_gl(s));
3423       if (!(vk_stage & pipeline->active_stages))
3424          continue;
3425 
3426       char *nir_str = NULL;
3427       char *qpu_str = NULL;
3428 
3429       if (pipeline_keep_qpu(pipeline)) {
3430          nir_shader *nir = pipeline_get_nir(pipeline, s);
3431          nir_str = nir ?
3432             nir_shader_as_str(nir, pipeline->executables.mem_ctx) : NULL;
3433 
3434          uint32_t qpu_size;
3435          uint64_t *qpu = pipeline_get_qpu(pipeline, s, &qpu_size);
3436          if (qpu) {
3437             uint32_t qpu_inst_count = qpu_size / sizeof(uint64_t);
3438             qpu_str = rzalloc_size(pipeline->executables.mem_ctx,
3439                                    qpu_inst_count * 96);
3440             size_t offset = 0;
3441             for (int i = 0; i < qpu_inst_count; i++) {
3442                const char *str = v3d_qpu_disasm(&pipeline->device->devinfo, qpu[i]);
3443                append(&qpu_str, &offset, "%s\n", str);
3444                ralloc_free((void *)str);
3445             }
3446          }
3447       }
3448 
3449       struct v3dv_pipeline_executable_data data = {
3450          .stage = s,
3451          .nir_str = nir_str,
3452          .qpu_str = qpu_str,
3453       };
3454       util_dynarray_append(&pipeline->executables.data,
3455                            struct v3dv_pipeline_executable_data, data);
3456    }
3457 }
3458 
3459 static const struct v3dv_pipeline_executable_data *
3460 pipeline_get_executable(struct v3dv_pipeline *pipeline, uint32_t index)
3461 {
3462    assert(index < util_dynarray_num_elements(&pipeline->executables.data,
3463                                              struct v3dv_pipeline_executable_data));
3464    return util_dynarray_element(&pipeline->executables.data,
3465                                 struct v3dv_pipeline_executable_data,
3466                                 index);
3467 }
3468 
3469 VKAPI_ATTR VkResult VKAPI_CALL
3470 v3dv_GetPipelineExecutableInternalRepresentationsKHR(
3471    VkDevice device,
3472    const VkPipelineExecutableInfoKHR *pExecutableInfo,
3473    uint32_t *pInternalRepresentationCount,
3474    VkPipelineExecutableInternalRepresentationKHR *pInternalRepresentations)
3475 {
3476    V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, pExecutableInfo->pipeline);
3477 
3478    pipeline_collect_executable_data(pipeline);
3479 
3480    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
3481                           pInternalRepresentations, pInternalRepresentationCount);
3482 
3483    bool incomplete = false;
3484    const struct v3dv_pipeline_executable_data *exe =
3485       pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
3486 
3487    if (exe->nir_str) {
3488       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
3489                                &out, ir) {
3490          WRITE_STR(ir->name, "NIR (%s)", broadcom_shader_stage_name(exe->stage));
3491          WRITE_STR(ir->description, "Final NIR form");
3492          if (!write_ir_text(ir, exe->nir_str))
3493             incomplete = true;
3494       }
3495    }
3496 
3497    if (exe->qpu_str) {
3498       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
3499                                &out, ir) {
3500          WRITE_STR(ir->name, "QPU (%s)", broadcom_shader_stage_name(exe->stage));
3501          WRITE_STR(ir->description, "Final QPU assembly");
3502          if (!write_ir_text(ir, exe->qpu_str))
3503             incomplete = true;
3504       }
3505    }
3506 
3507    return incomplete ? VK_INCOMPLETE : vk_outarray_status(&out);
3508 }
3509 
3510 VKAPI_ATTR VkResult VKAPI_CALL
3511 v3dv_GetPipelineExecutablePropertiesKHR(
3512    VkDevice device,
3513    const VkPipelineInfoKHR *pPipelineInfo,
3514    uint32_t *pExecutableCount,
3515    VkPipelineExecutablePropertiesKHR *pProperties)
3516 {
3517    V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, pPipelineInfo->pipeline);
3518 
3519    pipeline_collect_executable_data(pipeline);
3520 
3521    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
3522                           pProperties, pExecutableCount);
3523 
3524    util_dynarray_foreach(&pipeline->executables.data,
3525                          struct v3dv_pipeline_executable_data, exe) {
3526       vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
3527          gl_shader_stage mesa_stage = broadcom_shader_stage_to_gl(exe->stage);
3528          props->stages = mesa_to_vk_shader_stage(mesa_stage);
3529 
3530          WRITE_STR(props->name, "%s (%s)",
3531                    _mesa_shader_stage_to_abbrev(mesa_stage),
3532                    broadcom_shader_stage_is_binning(exe->stage) ?
3533                      "Binning" : "Render");
3534 
3535          WRITE_STR(props->description, "%s",
3536                    _mesa_shader_stage_to_string(mesa_stage));
3537 
3538          props->subgroupSize = V3D_CHANNELS;
3539       }
3540    }
3541 
3542    return vk_outarray_status(&out);
3543 }
3544 
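/* Reports per-executable compile statistics (compile strategy, instruction
 * count, thread count, spill buffer size, TMU spills/fills and QPU read
 * stalls) gathered in v3d_prog_data at compile time.
 */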
3545 VKAPI_ATTR VkResult VKAPI_CALL
3546 v3dv_GetPipelineExecutableStatisticsKHR(
3547    VkDevice device,
3548    const VkPipelineExecutableInfoKHR *pExecutableInfo,
3549    uint32_t *pStatisticCount,
3550    VkPipelineExecutableStatisticKHR *pStatistics)
3551 {
3552    V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, pExecutableInfo->pipeline);
3553 
3554    pipeline_collect_executable_data(pipeline);
3555 
3556    const struct v3dv_pipeline_executable_data *exe =
3557       pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
3558 
3559    struct v3d_prog_data *prog_data =
3560       pipeline_get_prog_data(pipeline, exe->stage);
3561 
3562    struct v3dv_shader_variant *variant =
3563       pipeline->shared_data->variants[exe->stage];
3564    uint32_t qpu_inst_count = variant->qpu_insts_size / sizeof(uint64_t);
3565 
3566    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
3567                           pStatistics, pStatisticCount);
3568 
3569    if (qpu_inst_count > 0) {
3570       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
3571          WRITE_STR(stat->name, "Compile Strategy");
3572          WRITE_STR(stat->description, "Chosen compile strategy index");
3573          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3574          stat->value.u64 = prog_data->compile_strategy_idx;
3575       }
3576 
3577       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
3578          WRITE_STR(stat->name, "Instruction Count");
3579          WRITE_STR(stat->description, "Number of QPU instructions");
3580          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3581          stat->value.u64 = qpu_inst_count;
3582       }
3583 
3584       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
3585          WRITE_STR(stat->name, "Thread Count");
3586          WRITE_STR(stat->description, "Number of QPU threads dispatched");
3587          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3588          stat->value.u64 = prog_data->threads;
3589       }
3590 
3591       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
3592          WRITE_STR(stat->name, "Spill Size");
3593          WRITE_STR(stat->description, "Size of the spill buffer in bytes");
3594          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3595          stat->value.u64 = prog_data->spill_size;
3596       }
3597 
3598       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
3599          WRITE_STR(stat->name, "TMU Spills");
3600          WRITE_STR(stat->description, "Number of times a register was spilled "
3601                                       "to memory");
3602          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3603          stat->value.u64 = prog_data->tmu_spills;
3604       }
3605 
3606       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
3607          WRITE_STR(stat->name, "TMU Fills");
3608          WRITE_STR(stat->description, "Number of times a register was filled "
3609                                       "from memory");
3610          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3611          stat->value.u64 = prog_data->tmu_fills;
3612       }
3613 
3614       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
3615          WRITE_STR(stat->name, "QPU Read Stalls");
3616          WRITE_STR(stat->description, "Number of cycles the QPU stalls for a "
3617                                       "register read dependency");
3618          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
3619          stat->value.u64 = prog_data->qpu_read_stalls;
3620       }
3621    }
3622 
3623    return vk_outarray_status(&out);
3624 }
3625