1 /*
2  * Copyright © 2019 Google LLC
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "tu_shader.h"
7 
8 #include "spirv/nir_spirv.h"
9 #include "util/mesa-sha1.h"
10 #include "nir/nir_xfb_info.h"
11 #include "vk_nir.h"
12 #include "vk_nir_convert_ycbcr.h"
13 #include "vk_pipeline.h"
14 #include "vk_util.h"
15 
16 #include "ir3/ir3_compiler.h"
17 #include "ir3/ir3_nir.h"
18 
19 #include "tu_device.h"
20 #include "tu_descriptor_set.h"
21 #include "tu_pipeline.h"
22 #include "tu_lrz.h"
23 
24 #include <initializer_list>
25 
26 nir_shader *
27 tu_spirv_to_nir(struct tu_device *dev,
28                 void *mem_ctx,
29                 const VkPipelineShaderStageCreateInfo *stage_info,
30                 gl_shader_stage stage)
31 {
32    /* TODO these are made-up */
33    const struct spirv_to_nir_options spirv_options = {
34       /* ViewID is a sysval in geometry stages and an input in the FS */
35       .view_index_is_input = stage == MESA_SHADER_FRAGMENT,
36 
37       /* Use 16-bit math for RelaxedPrecision ALU ops */
38       .mediump_16bit_alu = true,
39 
40       .caps = {
41          .demote_to_helper_invocation = true,
42          .descriptor_array_dynamic_indexing = true,
43          .descriptor_array_non_uniform_indexing = true,
44          .descriptor_indexing = true,
45          .device_group = true,
46          .draw_parameters = true,
47          .float_controls = true,
48          .float16 = true,
49          .fragment_density = true,
50          .geometry_streams = true,
51          .image_read_without_format = true,
52          .image_write_without_format = true,
53          .int16 = true,
54          .multiview = true,
55          .physical_storage_buffer_address = true,
56          .post_depth_coverage = true,
57          .runtime_descriptor_array = true,
58          .shader_viewport_index_layer = true,
59          .stencil_export = true,
60          .storage_16bit = dev->physical_device->info->a6xx.storage_16bit,
61          .subgroup_arithmetic = true,
62          .subgroup_ballot = true,
63          .subgroup_basic = true,
64          .subgroup_quad = true,
65          .subgroup_shuffle = true,
66          .subgroup_vote = true,
67          .tessellation = true,
68          .transform_feedback = true,
69          .variable_pointers = true,
70          .vk_memory_model_device_scope = true,
71          .vk_memory_model = true,
72       },
73 
74       .ubo_addr_format = nir_address_format_vec2_index_32bit_offset,
75       .ssbo_addr_format = nir_address_format_vec2_index_32bit_offset,
76 
77       /* Accessed via stg/ldg */
78       .phys_ssbo_addr_format = nir_address_format_64bit_global,
79 
80       /* Accessed via the const register file */
81       .push_const_addr_format = nir_address_format_logical,
82 
83       /* Accessed via ldl/stl */
84       .shared_addr_format = nir_address_format_32bit_offset,
85 
86       /* Accessed via stg/ldg (not used with Vulkan?) */
87       .global_addr_format = nir_address_format_64bit_global,
88    };
89 
90    const nir_shader_compiler_options *nir_options =
91       ir3_get_compiler_options(dev->compiler);
92 
93    nir_shader *nir;
94    VkResult result =
95       vk_pipeline_shader_stage_to_nir(&dev->vk, stage_info, &spirv_options,
96                                       nir_options, mem_ctx, &nir);
97    if (result != VK_SUCCESS)
98       return NULL;
99 
100    /* ir3 uses num_ubos and num_ssbos to track the number of *bindful*
101     * UBOs/SSBOs, but spirv_to_nir sets them to the total number of objects,
102     * which is useless for us, so reset them here.
103     */
104    nir->info.num_ubos = 0;
105    nir->info.num_ssbos = 0;
106 
107    if (TU_DEBUG(NIR)) {
108       fprintf(stderr, "translated nir:\n");
109       nir_print_shader(nir, stderr);
110    }
111 
112    const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = {
113       .point_coord = true,
114    };
115    NIR_PASS_V(nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings);
116 
117    NIR_PASS_V(nir, nir_lower_global_vars_to_local);
118 
119    /* Older glslang missing bf6efd0316d8 ("SPV: Fix #2293: keep relaxed
120     * precision on arg passed to relaxed param") will pass function args through
121     * a highp temporary, so we need nir_opt_find_array_copies() and a copy
122     * prop before we lower mediump vars, or you'll be unable to optimize out
123     * array copies after lowering.  We do this before splitting copies, since
124     * that works against nir_opt_find_array_copies().
125     */
126    NIR_PASS_V(nir, nir_opt_find_array_copies);
127    NIR_PASS_V(nir, nir_opt_copy_prop_vars);
128    NIR_PASS_V(nir, nir_opt_dce);
129 
130    NIR_PASS_V(nir, nir_split_var_copies);
131    NIR_PASS_V(nir, nir_lower_var_copies);
132 
133    NIR_PASS_V(nir, nir_lower_mediump_vars, nir_var_function_temp | nir_var_shader_temp | nir_var_mem_shared);
134    NIR_PASS_V(nir, nir_opt_copy_prop_vars);
135    NIR_PASS_V(nir, nir_opt_combine_stores, nir_var_all);
136 
137    NIR_PASS_V(nir, nir_lower_system_values);
138    NIR_PASS_V(nir, nir_lower_is_helper_invocation);
139 
140    ir3_optimize_loop(dev->compiler, nir);
141 
142    NIR_PASS_V(nir, nir_opt_conditional_discard);
143 
144    return nir;
145 }
146 
147 static void
148 lower_load_push_constant(struct tu_device *dev,
149                          nir_builder *b,
150                          nir_intrinsic_instr *instr,
151                          struct tu_shader *shader,
152                          const struct tu_pipeline_layout *layout)
153 {
154    uint32_t base = nir_intrinsic_base(instr);
155    assert(base % 4 == 0);
156 
157    if (tu6_shared_constants_enable(layout, dev->compiler)) {
158       /* All stages share the same range.  We could potentially add
159        * push_constant_offset to layout and apply it, but this is good for
160        * now.
161        */
162       base += dev->compiler->shared_consts_base_offset * 4;
163    } else {
164       assert(base >= shader->const_state.push_consts.lo * 4);
165       base -= shader->const_state.push_consts.lo * 4;
166    }
167 
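   /* Push constants live in the const file (either the shared-consts region
    * or this shader's reserved range), so turn the load into a load_uniform;
    * the ushr below converts the byte offset into dwords.
    */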
168    nir_def *load =
169       nir_load_uniform(b, instr->num_components,
170             instr->def.bit_size,
171             nir_ushr_imm(b, instr->src[0].ssa, 2),
172             .base = base);
173 
174    nir_def_rewrite_uses(&instr->def, load);
175 
176    nir_instr_remove(&instr->instr);
177 }
178 
179 static void
180 lower_vulkan_resource_index(struct tu_device *dev, nir_builder *b,
181                             nir_intrinsic_instr *instr,
182                             struct tu_shader *shader,
183                             const struct tu_pipeline_layout *layout)
184 {
185    struct ir3_compiler *compiler = dev->compiler;
186    nir_def *vulkan_idx = instr->src[0].ssa;
187 
188    unsigned set = nir_intrinsic_desc_set(instr);
189    unsigned binding = nir_intrinsic_binding(instr);
190    struct tu_descriptor_set_layout *set_layout = layout->set[set].layout;
191    struct tu_descriptor_set_binding_layout *binding_layout =
192       &set_layout->binding[binding];
193    nir_def *base;
194 
195    if (binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
196       return;
197 
198    shader->active_desc_sets |= 1u << set;
199 
200    switch (binding_layout->type) {
201    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
202    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
203       int offset = 0;
204       for (unsigned i = 0; i < set; i++) {
205          if (shader->dynamic_descriptor_sizes[i] >= 0) {
206             offset += shader->dynamic_descriptor_sizes[i];
207          } else {
208             offset = -1;
209             break;
210          }
211       }
212 
213       if (offset < 0) {
214          /* With independent sets, we don't know
215          * layout->set[set].dynamic_offset_start until after link time, which
216          * with fast linking means after the shader is compiled. We have to
217           * get it from the const file instead.
218           */
219          base = nir_imm_int(b, binding_layout->dynamic_offset_offset / (4 * A6XX_TEX_CONST_DWORDS));
220          nir_def *dynamic_offset_start;
221          if (compiler->load_shader_consts_via_preamble) {
222             dynamic_offset_start =
223                ir3_load_driver_ubo(b, 1, &shader->const_state.dynamic_offsets_ubo, set);
224          } else {
225             dynamic_offset_start =
226                nir_load_uniform(b, 1, 32, nir_imm_int(b, 0),
227                                 .base = shader->const_state.dynamic_offset_loc + set);
228          }
229          base = nir_iadd(b, base, dynamic_offset_start);
230       } else {
231          base = nir_imm_int(b, (offset +
232             binding_layout->dynamic_offset_offset) / (4 * A6XX_TEX_CONST_DWORDS));
233       }
234       assert(dev->physical_device->reserved_set_idx >= 0);
235       set = dev->physical_device->reserved_set_idx;
236       break;
237    }
238    default:
239       base = nir_imm_int(b, binding_layout->offset / (4 * A6XX_TEX_CONST_DWORDS));
240       break;
241    }
242 
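   /* Offsets above are in units of whole descriptors (4 * A6XX_TEX_CONST_DWORDS
    * bytes each). Encode the result as (set, base + (index << shift), shift) so
    * that vulkan_resource_reindex below can scale a reindex delta by the
    * binding stride without consulting the layout again.
    */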
243    unsigned stride = binding_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
244    assert(util_is_power_of_two_nonzero(stride));
245    nir_def *shift = nir_imm_int(b, util_logbase2(stride));
246 
247    nir_def *def = nir_vec3(b, nir_imm_int(b, set),
248                                nir_iadd(b, base,
249                                         nir_ishl(b, vulkan_idx, shift)),
250                                shift);
251 
252    nir_def_rewrite_uses(&instr->def, def);
253    nir_instr_remove(&instr->instr);
254 }
255 
256 static void
257 lower_vulkan_resource_reindex(nir_builder *b, nir_intrinsic_instr *instr)
258 {
259    nir_def *old_index = instr->src[0].ssa;
260    nir_def *delta = instr->src[1].ssa;
261    nir_def *shift = nir_channel(b, old_index, 2);
262 
263    nir_def *new_index =
264       nir_vec3(b, nir_channel(b, old_index, 0),
265                nir_iadd(b, nir_channel(b, old_index, 1),
266                         nir_ishl(b, delta, shift)),
267                shift);
268 
269    nir_def_rewrite_uses(&instr->def, new_index);
270    nir_instr_remove(&instr->instr);
271 }
272 
273 static void
274 lower_load_vulkan_descriptor(nir_builder *b, nir_intrinsic_instr *intrin)
275 {
276    nir_def *old_index = intrin->src[0].ssa;
277    /* Loading the descriptor happens as part of the load/store instruction so
278     * this is a no-op. We just need to turn the shift into an offset of 0.
279     */
280    nir_def *new_index =
281       nir_vec3(b, nir_channel(b, old_index, 0),
282                nir_channel(b, old_index, 1),
283                nir_imm_int(b, 0));
284    nir_def_rewrite_uses(&intrin->def, new_index);
285    nir_instr_remove(&intrin->instr);
286 }
287 
288 static bool
289 lower_ssbo_ubo_intrinsic(struct tu_device *dev,
290                          nir_builder *b, nir_intrinsic_instr *intrin)
291 {
292    const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic];
293 
294    /* The bindless base is part of the instruction, which means that part of
295     * the "pointer" has to be constant. We solve this in the same way the blob
296     * does, by generating a bunch of if-statements. (In the usual case where
297     * the descriptor set is constant we can skip that, though.)
298     */
299 
300    unsigned buffer_src;
301    if (intrin->intrinsic == nir_intrinsic_store_ssbo) {
302       /* store_ssbo has the value to store as src[0], so the buffer is src[1] */
303       buffer_src = 1;
304    } else {
305       buffer_src = 0;
306    }
307 
308    /* Don't lower non-bindless UBO loads of driver params */
309    if (intrin->src[buffer_src].ssa->num_components == 1)
310       return false;
311 
312    nir_scalar scalar_idx = nir_scalar_resolved(intrin->src[buffer_src].ssa, 0);
313    nir_def *descriptor_idx = nir_channel(b, intrin->src[buffer_src].ssa, 1);
314 
315    if (intrin->intrinsic == nir_intrinsic_load_ubo &&
316        dev->instance->allow_oob_indirect_ubo_loads) {
317       nir_scalar offset = nir_scalar_resolved(intrin->src[1].ssa, 0);
318       if (!nir_scalar_is_const(offset)) {
319          nir_intrinsic_set_range(intrin, ~0);
320       }
321    }
322 
323    /* For isam, we need to use the appropriate descriptor if 16-bit storage is
324     * enabled. Descriptor 0 is the 16-bit one, descriptor 1 is the 32-bit one.
325     */
326    if (dev->physical_device->info->a6xx.storage_16bit &&
327        intrin->intrinsic == nir_intrinsic_load_ssbo &&
328        (nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
329        intrin->def.bit_size > 16) {
330       descriptor_idx = nir_iadd_imm(b, descriptor_idx, 1);
331    }
332 
333    nir_def *results[MAX_SETS] = { NULL };
334 
335    if (nir_scalar_is_const(scalar_idx)) {
336       nir_def *bindless =
337          nir_bindless_resource_ir3(b, 32, descriptor_idx, .desc_set = nir_scalar_as_uint(scalar_idx));
338       nir_src_rewrite(&intrin->src[buffer_src], bindless);
339       return true;
340    }
341 
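   /* Non-constant set index: emit one copy of the intrinsic per possible
    * descriptor set under an if-ladder and merge the results with phis below.
    */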
342    nir_def *base_idx = nir_channel(b, scalar_idx.def, scalar_idx.comp);
343    for (unsigned i = 0; i < dev->physical_device->info->a6xx.max_sets; i++) {
344       /* if (base_idx == i) { ... */
345       nir_if *nif = nir_push_if(b, nir_ieq_imm(b, base_idx, i));
346 
347       nir_def *bindless =
348          nir_bindless_resource_ir3(b, 32, descriptor_idx, .desc_set = i);
349 
350       nir_intrinsic_instr *copy =
351          nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
352 
353       copy->num_components = intrin->num_components;
354 
355       for (unsigned src = 0; src < info->num_srcs; src++) {
356          if (src == buffer_src)
357             copy->src[src] = nir_src_for_ssa(bindless);
358          else
359             copy->src[src] = nir_src_for_ssa(intrin->src[src].ssa);
360       }
361 
362       for (unsigned idx = 0; idx < info->num_indices; idx++) {
363          copy->const_index[idx] = intrin->const_index[idx];
364       }
365 
366       if (info->has_dest) {
367          nir_def_init(&copy->instr, &copy->def,
368                       intrin->def.num_components,
369                       intrin->def.bit_size);
370          results[i] = &copy->def;
371       }
372 
373       nir_builder_instr_insert(b, &copy->instr);
374 
375       /* } else { ... */
376       nir_push_else(b, nif);
377    }
378 
379    nir_def *result =
380       nir_undef(b, intrin->def.num_components, intrin->def.bit_size);
381    for (int i = dev->physical_device->info->a6xx.max_sets - 1; i >= 0; i--) {
382       nir_pop_if(b, NULL);
383       if (info->has_dest)
384          result = nir_if_phi(b, results[i], result);
385    }
386 
387    if (info->has_dest)
388       nir_def_rewrite_uses(&intrin->def, result);
389    nir_instr_remove(&intrin->instr);
390    return true;
391 }
392 
393 static nir_def *
394 build_bindless(struct tu_device *dev, nir_builder *b,
395                nir_deref_instr *deref, bool is_sampler,
396                struct tu_shader *shader,
397                const struct tu_pipeline_layout *layout)
398 {
399    nir_variable *var = nir_deref_instr_get_variable(deref);
400 
401    unsigned set = var->data.descriptor_set;
402    unsigned binding = var->data.binding;
403    const struct tu_descriptor_set_binding_layout *bind_layout =
404       &layout->set[set].layout->binding[binding];
405 
406    /* input attachments use the non-bindless workaround */
407    if (bind_layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT &&
408        !TU_DEBUG(DYNAMIC)) {
409       const struct glsl_type *glsl_type = glsl_without_array(var->type);
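      /* Each input attachment occupies two texture slots; the odd slot holds
       * the uint view selected by the D24S8 workaround below.
       */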
410       uint32_t idx = var->data.index * 2;
411 
412       BITSET_SET_RANGE_INSIDE_WORD(b->shader->info.textures_used, idx, (idx + bind_layout->array_size * 2) - 1);
413 
414       /* D24S8 workaround: stencil of D24S8 will be sampled as uint */
415       if (glsl_get_sampler_result_type(glsl_type) == GLSL_TYPE_UINT)
416          idx += 1;
417 
418       if (deref->deref_type == nir_deref_type_var)
419          return nir_imm_int(b, idx);
420 
421       nir_def *arr_index = deref->arr.index.ssa;
422       return nir_iadd_imm(b, nir_imul_imm(b, arr_index, 2), idx);
423    }
424 
425    shader->active_desc_sets |= 1u << set;
426 
427    nir_def *desc_offset;
428    unsigned descriptor_stride;
429    unsigned offset = 0;
430    /* Samplers come second in combined image/sampler descriptors, see
431     * write_combined_image_sampler_descriptor().
432     */
433    if (is_sampler && bind_layout->type ==
434          VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
435       offset = 1;
436    }
437    desc_offset =
438       nir_imm_int(b, (bind_layout->offset / (4 * A6XX_TEX_CONST_DWORDS)) +
439                   offset);
440    descriptor_stride = bind_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
441 
442    if (deref->deref_type != nir_deref_type_var) {
443       assert(deref->deref_type == nir_deref_type_array);
444 
445       nir_def *arr_index = deref->arr.index.ssa;
446       desc_offset = nir_iadd(b, desc_offset,
447                              nir_imul_imm(b, arr_index, descriptor_stride));
448    }
449 
450    return nir_bindless_resource_ir3(b, 32, desc_offset, .desc_set = set);
451 }
452 
453 static void
454 lower_image_deref(struct tu_device *dev, nir_builder *b,
455                   nir_intrinsic_instr *instr, struct tu_shader *shader,
456                   const struct tu_pipeline_layout *layout)
457 {
458    nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
459    nir_def *bindless = build_bindless(dev, b, deref, false, shader, layout);
460    nir_rewrite_image_intrinsic(instr, bindless, true);
461 }
462 
463 static bool
464 lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
465                 struct tu_device *dev,
466                 struct tu_shader *shader,
467                 const struct tu_pipeline_layout *layout)
468 {
469    switch (instr->intrinsic) {
470    case nir_intrinsic_load_push_constant:
471       lower_load_push_constant(dev, b, instr, shader, layout);
472       return true;
473 
474    case nir_intrinsic_load_vulkan_descriptor:
475       lower_load_vulkan_descriptor(b, instr);
476       return true;
477 
478    case nir_intrinsic_vulkan_resource_index:
479       lower_vulkan_resource_index(dev, b, instr, shader, layout);
480       return true;
481    case nir_intrinsic_vulkan_resource_reindex:
482       lower_vulkan_resource_reindex(b, instr);
483       return true;
484 
485    case nir_intrinsic_load_ubo:
486    case nir_intrinsic_load_ssbo:
487    case nir_intrinsic_store_ssbo:
488    case nir_intrinsic_ssbo_atomic:
489    case nir_intrinsic_ssbo_atomic_swap:
490    case nir_intrinsic_get_ssbo_size:
491       return lower_ssbo_ubo_intrinsic(dev, b, instr);
492 
493    case nir_intrinsic_image_deref_load:
494    case nir_intrinsic_image_deref_store:
495    case nir_intrinsic_image_deref_atomic:
496    case nir_intrinsic_image_deref_atomic_swap:
497    case nir_intrinsic_image_deref_size:
498    case nir_intrinsic_image_deref_samples:
499       lower_image_deref(dev, b, instr, shader, layout);
500       return true;
501 
502    case nir_intrinsic_load_frag_size_ir3:
503    case nir_intrinsic_load_frag_offset_ir3: {
504       if (!dev->compiler->load_shader_consts_via_preamble)
505          return false;
506 
507       enum ir3_driver_param param =
508          instr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
509          IR3_DP_FS_FRAG_SIZE : IR3_DP_FS_FRAG_OFFSET;
510 
511       nir_def *view = instr->src[0].ssa;
512       nir_def *result =
513          ir3_load_driver_ubo_indirect(b, 2, &shader->const_state.fdm_ubo,
514                                       param, view, nir_intrinsic_range(instr));
515 
516       nir_def_rewrite_uses(&instr->def, result);
517       nir_instr_remove(&instr->instr);
518       return true;
519    }
520    case nir_intrinsic_load_frag_invocation_count: {
521       if (!dev->compiler->load_shader_consts_via_preamble)
522          return false;
523 
524       nir_def *result =
525          ir3_load_driver_ubo(b, 1, &shader->const_state.fdm_ubo,
526                              IR3_DP_FS_FRAG_INVOCATION_COUNT);
527 
528       nir_def_rewrite_uses(&instr->def, result);
529       nir_instr_remove(&instr->instr);
530       return true;
531    }
532 
533    default:
534       return false;
535    }
536 }
537 
538 static void
539 lower_tex_ycbcr(const struct tu_pipeline_layout *layout,
540                 nir_builder *builder,
541                 nir_tex_instr *tex)
542 {
543    int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
544    assert(deref_src_idx >= 0);
545    nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src);
546 
547    nir_variable *var = nir_deref_instr_get_variable(deref);
548    const struct tu_descriptor_set_layout *set_layout =
549       layout->set[var->data.descriptor_set].layout;
550    const struct tu_descriptor_set_binding_layout *binding =
551       &set_layout->binding[var->data.binding];
552    const struct tu_sampler_ycbcr_conversion *ycbcr_samplers =
553       tu_immutable_ycbcr_samplers(set_layout, binding);
554 
555    if (!ycbcr_samplers)
556       return;
557 
558    /* For the following instructions, we don't apply any change */
559    if (tex->op == nir_texop_txs ||
560        tex->op == nir_texop_query_levels ||
561        tex->op == nir_texop_lod)
562       return;
563 
564    assert(tex->texture_index == 0);
565    unsigned array_index = 0;
566    if (deref->deref_type != nir_deref_type_var) {
567       assert(deref->deref_type == nir_deref_type_array);
568       if (!nir_src_is_const(deref->arr.index))
569          return;
570       array_index = nir_src_as_uint(deref->arr.index);
571       array_index = MIN2(array_index, binding->array_size - 1);
572    }
573    const struct tu_sampler_ycbcr_conversion *ycbcr_sampler = ycbcr_samplers + array_index;
574 
575    if (ycbcr_sampler->ycbcr_model == VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY)
576       return;
577 
578    builder->cursor = nir_after_instr(&tex->instr);
579 
580    uint8_t bits = vk_format_get_component_bits(ycbcr_sampler->format,
581                                                UTIL_FORMAT_COLORSPACE_RGB,
582                                                PIPE_SWIZZLE_X);
583 
584    switch (ycbcr_sampler->format) {
585    case VK_FORMAT_G8B8G8R8_422_UNORM:
586    case VK_FORMAT_B8G8R8G8_422_UNORM:
587    case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
588    case VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM:
589       /* util_format_get_component_bits doesn't return what we want */
590       bits = 8;
591       break;
592    default:
593       break;
594    }
595 
596    uint32_t bpcs[3] = {bits, bits, bits}; /* TODO: use the right bpc for each channel? */
597    nir_def *result = nir_convert_ycbcr_to_rgb(builder,
598                                                   ycbcr_sampler->ycbcr_model,
599                                                   ycbcr_sampler->ycbcr_range,
600                                                   &tex->def,
601                                                   bpcs);
602    nir_def_rewrite_uses_after(&tex->def, result,
603                                   result->parent_instr);
604 
605    builder->cursor = nir_before_instr(&tex->instr);
606 }
607 
608 static bool
609 lower_tex(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
610           struct tu_shader *shader, const struct tu_pipeline_layout *layout)
611 {
612    lower_tex_ycbcr(layout, b, tex);
613 
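   /* Rewrite sampler/texture derefs into bindless handles pointing at their
    * descriptor sets (or a plain texture offset for input attachments).
    */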
614    int sampler_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
615    if (sampler_src_idx >= 0) {
616       nir_deref_instr *deref = nir_src_as_deref(tex->src[sampler_src_idx].src);
617       nir_def *bindless = build_bindless(dev, b, deref, true, shader, layout);
618       nir_src_rewrite(&tex->src[sampler_src_idx].src, bindless);
619       tex->src[sampler_src_idx].src_type = nir_tex_src_sampler_handle;
620    }
621 
622    int tex_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
623    if (tex_src_idx >= 0) {
624       nir_deref_instr *deref = nir_src_as_deref(tex->src[tex_src_idx].src);
625       nir_def *bindless = build_bindless(dev, b, deref, false, shader, layout);
626       nir_src_rewrite(&tex->src[tex_src_idx].src, bindless);
627       tex->src[tex_src_idx].src_type = nir_tex_src_texture_handle;
628 
629       /* for the input attachment case: */
630       if (bindless->parent_instr->type != nir_instr_type_intrinsic)
631          tex->src[tex_src_idx].src_type = nir_tex_src_texture_offset;
632    }
633 
634    return true;
635 }
636 
637 struct lower_instr_params {
638    struct tu_device *dev;
639    struct tu_shader *shader;
640    const struct tu_pipeline_layout *layout;
641 };
642 
643 static bool
644 lower_instr(nir_builder *b, nir_instr *instr, void *cb_data)
645 {
646    struct lower_instr_params *params = (struct lower_instr_params *) cb_data;
647    b->cursor = nir_before_instr(instr);
648    switch (instr->type) {
649    case nir_instr_type_tex:
650       return lower_tex(b, nir_instr_as_tex(instr), params->dev, params->shader, params->layout);
651    case nir_instr_type_intrinsic:
652       return lower_intrinsic(b, nir_instr_as_intrinsic(instr), params->dev, params->shader, params->layout);
653    default:
654       return false;
655    }
656 }
657 
658 /* Since we always push inline uniforms into constant memory, lower loads of
659  * them to load_uniform, which turns into constant memory loads.
660  */
661 static bool
662 lower_inline_ubo(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data)
663 {
664    if (intrin->intrinsic != nir_intrinsic_load_ubo)
665       return false;
666 
667    struct lower_instr_params *params = (struct lower_instr_params *) cb_data;
668    struct tu_shader *shader = params->shader;
669    const struct tu_pipeline_layout *layout = params->layout;
670 
671    nir_binding binding = nir_chase_binding(intrin->src[0]);
672 
673    if (!binding.success)
674       return false;
675 
676    struct tu_descriptor_set_layout *set_layout = layout->set[binding.desc_set].layout;
677    struct tu_descriptor_set_binding_layout *binding_layout =
678       &set_layout->binding[binding.binding];
679 
680    if (binding_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
681       return false;
682 
683    /* look up the const offset of the inline UBO */
684    struct tu_const_state *const_state = &shader->const_state;
685 
686    unsigned base = UINT_MAX;
687    unsigned range;
688    bool use_load = false;
689    bool use_ldg_k =
690       params->dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
691 
692    for (unsigned i = 0; i < const_state->num_inline_ubos; i++) {
693       if (const_state->ubos[i].base == binding.desc_set &&
694           const_state->ubos[i].offset == binding_layout->offset) {
695          range = const_state->ubos[i].size_vec4 * 4;
696          if (use_ldg_k) {
697             base = i * 2;
698          } else {
699             use_load = const_state->ubos[i].push_address;
700             base = const_state->ubos[i].const_offset_vec4 * 4;
701          }
702          break;
703       }
704    }
705 
706    if (base == UINT_MAX) {
707       /* Assume we're loading out-of-bounds from a 0-sized inline uniform
708        * block that was filtered out below.
709        */
710       nir_def_rewrite_uses(&intrin->def,
711                                nir_undef(b, intrin->num_components,
712                                              intrin->def.bit_size));
713       return true;
714    }
715 
716    nir_def *offset = intrin->src[1].ssa;
717 
718    b->cursor = nir_before_instr(&intrin->instr);
719    nir_def *val;
720 
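   /* Either load through a 64-bit base address (pushed address or the driver
    * UBO used with ldg.k) or read directly from the const file at the
    * reserved offset.
    */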
721    if (use_load || use_ldg_k) {
722       nir_def *base_addr;
723       if (use_ldg_k) {
724          base_addr = ir3_load_driver_ubo(b, 2,
725                                          &params->shader->const_state.inline_uniforms_ubo,
726                                          base);
727       } else {
728          base_addr = nir_load_uniform(b, 2, 32, nir_imm_int(b, 0), .base = base);
729       }
730       val = nir_load_global_ir3(b, intrin->num_components,
731                                 intrin->def.bit_size,
732                                 base_addr, nir_ishr_imm(b, offset, 2),
733                                 .access =
734                                  (enum gl_access_qualifier)(
735                                     (enum gl_access_qualifier)(ACCESS_NON_WRITEABLE | ACCESS_CAN_REORDER) |
736                                     ACCESS_CAN_SPECULATE),
737                                 .align_mul = 16,
738                                 .align_offset = 0,
739                                 .range_base = 0,
740                                 .range = range);
741    } else {
742       val = nir_load_uniform(b, intrin->num_components,
743                              intrin->def.bit_size,
744                              nir_ishr_imm(b, offset, 2), .base = base);
745    }
746 
747    nir_def_rewrite_uses(&intrin->def, val);
748    nir_instr_remove(&intrin->instr);
749    return true;
750 }
751 
752 /* Figure out the range of push constants that we're actually going to push to
753  * the shader, and tell the backend to reserve this range when pushing UBO
754  * constants.
755  */
756 
757 static void
758 gather_push_constants(nir_shader *shader, struct tu_shader *tu_shader)
759 {
760    uint32_t min = UINT32_MAX, max = 0;
761    nir_foreach_function_impl(impl, shader) {
762       nir_foreach_block(block, impl) {
763          nir_foreach_instr_safe(instr, block) {
764             if (instr->type != nir_instr_type_intrinsic)
765                continue;
766 
767             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
768             if (intrin->intrinsic != nir_intrinsic_load_push_constant)
769                continue;
770 
771             uint32_t base = nir_intrinsic_base(intrin);
772             uint32_t range = nir_intrinsic_range(intrin);
773             min = MIN2(min, base);
774             max = MAX2(max, base + range);
775             break;
776          }
777       }
778    }
779 
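   /* No push-constant loads were found, so don't reserve a range at all. */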
780    if (min >= max) {
781       tu_shader->const_state.push_consts = (struct tu_push_constant_range) {};
782       return;
783    }
784 
785    /* CP_LOAD_STATE OFFSET and NUM_UNIT for SHARED_CONSTS are in units of
786     * dwords while loading regular consts is in units of vec4's.
787     * So we unify the unit as dwords for tu_push_constant_range here, and
788     * then use the correct unit when emitting.
789     *
790     * Note there's an alignment requirement of 16 dwords on OFFSET. Expand
791     * the range and change units accordingly.
792     */
793    tu_shader->const_state.push_consts.lo = (min / 4) / 4 * 4;
794    tu_shader->const_state.push_consts.dwords =
795       align(max, 16) / 4 - tu_shader->const_state.push_consts.lo;
796 }
797 
798 static bool
799 shader_uses_push_consts(nir_shader *shader)
800 {
801    nir_foreach_function_impl (impl, shader) {
802       nir_foreach_block (block, impl) {
803          nir_foreach_instr_safe (instr, block) {
804             if (instr->type != nir_instr_type_intrinsic)
805                continue;
806 
807             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
808             if (intrin->intrinsic == nir_intrinsic_load_push_constant)
809                return true;
810          }
811       }
812    }
813    return false;
814 }
815 
816 static bool
817 tu_lower_io(nir_shader *shader, struct tu_device *dev,
818             struct tu_shader *tu_shader,
819             const struct tu_pipeline_layout *layout,
820             unsigned *reserved_consts_vec4_out)
821 {
822    tu_shader->const_state.push_consts = (struct tu_push_constant_range) {
823       .lo = 0,
824       .dwords = layout->push_constant_size / 4,
825       .type = tu_push_consts_type(layout, dev->compiler),
826    };
827 
828    if (tu_shader->const_state.push_consts.type == IR3_PUSH_CONSTS_PER_STAGE) {
829       gather_push_constants(shader, tu_shader);
830    } else if (tu_shader->const_state.push_consts.type ==
831             IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
832       /* Disable pushing constants for this stage if none were loaded in the
833        * shader.  If no stage loads its declared push constants, as is often
834        * the case under zink, then we could additionally skip
835        * emitting REG_A7XX_HLSQ_SHARED_CONSTS_IMM entirely.
836        */
837       if (!shader_uses_push_consts(shader))
838          tu_shader->const_state.push_consts = (struct tu_push_constant_range) {};
839    }
840 
841    struct tu_const_state *const_state = &tu_shader->const_state;
842    unsigned reserved_consts_vec4 =
843       align(DIV_ROUND_UP(const_state->push_consts.dwords, 4),
844             dev->compiler->const_upload_unit);
845 
846    bool unknown_dynamic_size = false;
847    bool unknown_dynamic_offset = false;
848    for (unsigned i = 0; i < layout->num_sets; i++) {
849       if (tu_shader->dynamic_descriptor_sizes[i] == -1) {
850          unknown_dynamic_size = true;
851       } else if (unknown_dynamic_size &&
852                  tu_shader->dynamic_descriptor_sizes[i] > 0) {
853          /* If there is an unknown size followed by a known size, then we may
854           * need to dynamically determine the offset when linking.
855           */
856          unknown_dynamic_offset = true;
857       }
858    }
859 
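   /* Reserve const space to hold the per-set dynamic offset starts, which are
    * read back from the const file in lower_vulkan_resource_index() when they
    * can't be known at compile time.
    */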
860    if (unknown_dynamic_offset) {
861       const_state->dynamic_offset_loc = reserved_consts_vec4 * 4;
862       assert(dev->physical_device->reserved_set_idx >= 0);
863       reserved_consts_vec4 += DIV_ROUND_UP(dev->physical_device->reserved_set_idx, 4);
864    } else {
865       const_state->dynamic_offset_loc = UINT32_MAX;
866    }
867 
868    /* Reserve space for inline uniforms, so we can always load them from
869     * constants and not set up a UBO descriptor for them.
870     */
871    bool use_ldg_k =
872       dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
873    for (unsigned set = 0; set < layout->num_sets; set++) {
874       const struct tu_descriptor_set_layout *desc_layout =
875          layout->set[set].layout;
876 
877       if (!desc_layout || !desc_layout->has_inline_uniforms)
878          continue;
879 
880       for (unsigned b = 0; b < desc_layout->binding_count; b++) {
881          const struct tu_descriptor_set_binding_layout *binding =
882             &desc_layout->binding[b];
883 
884          if (binding->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
885             continue;
886          if (!(binding->shader_stages &
887                mesa_to_vk_shader_stage(shader->info.stage)))
888             continue;
889 
890          /* Workaround a CTS bug by ignoring zero-sized inline uniform
891           * blocks that aren't being properly filtered out when creating the
892           * descriptor set layout, see
893           * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/4115
894           */
895          if (binding->size == 0)
896             continue;
897 
898          /* If we don't know the size at compile time due to a variable
899           * descriptor count, then with descriptor buffers we cannot know
900           * how much space the real inline uniform has. In this case we fall
901           * back to pushing the address and using ldg, which is slower than
902           * setting up a descriptor. Setting up our own descriptor with
903           * descriptor_buffer would also be painful: it has to be done on the
904           * GPU, and it doesn't avoid the UBO getting pushed anyway and
905           * faulting if an out-of-bounds access is hidden behind an if and
906           * not dynamically executed. Given the small max size, there
907           * shouldn't be much reason to use variable size anyway.
908           */
909          bool push_address = !use_ldg_k && desc_layout->has_variable_descriptors &&
910             b == desc_layout->binding_count - 1;
911 
912          if (push_address) {
913             perf_debug(dev,
914                        "falling back to ldg for variable-sized inline "
915                        "uniform block");
916          }
917 
918          assert(const_state->num_inline_ubos < ARRAY_SIZE(const_state->ubos));
919          unsigned size_vec4 = push_address ? 1 : DIV_ROUND_UP(binding->size, 16);
920          const_state->ubos[const_state->num_inline_ubos++] = (struct tu_inline_ubo) {
921             .base = set,
922             .offset = binding->offset,
923             .push_address = push_address,
924             .const_offset_vec4 = reserved_consts_vec4,
925             .size_vec4 = size_vec4,
926          };
927 
928          if (!use_ldg_k)
929             reserved_consts_vec4 += align(size_vec4, dev->compiler->const_upload_unit);
930       }
931    }
932 
933    *reserved_consts_vec4_out = reserved_consts_vec4;
934 
935    struct lower_instr_params params = {
936       .dev = dev,
937       .shader = tu_shader,
938       .layout = layout,
939    };
940 
941    bool progress = false;
942    if (const_state->num_inline_ubos) {
943       progress |= nir_shader_intrinsics_pass(shader, lower_inline_ubo,
944                                                nir_metadata_none,
945                                                &params);
946    }
947 
948    progress |= nir_shader_instructions_pass(shader,
949                                             lower_instr,
950                                             nir_metadata_none,
951                                             &params);
952 
953    /* Remove now-unused variables so that when we gather the shader info later
954     * they won't be counted.
955     */
956 
957    if (progress)
958       nir_opt_dce(shader);
959 
960    progress |=
961       nir_remove_dead_variables(shader,
962                                 nir_var_uniform | nir_var_mem_ubo | nir_var_mem_ssbo,
963                                 NULL);
964 
965    return progress;
966 }
967 
968 struct lower_fdm_options {
969    unsigned num_views;
970    bool adjust_fragcoord;
971    bool multiview;
972 };
973 
974 static bool
975 lower_fdm_filter(const nir_instr *instr, const void *data)
976 {
977    const struct lower_fdm_options *options =
978       (const struct lower_fdm_options *)data;
979 
980    if (instr->type != nir_instr_type_intrinsic)
981       return false;
982 
983    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
984    return intrin->intrinsic == nir_intrinsic_load_frag_size ||
985       (intrin->intrinsic == nir_intrinsic_load_frag_coord &&
986        options->adjust_fragcoord);
987 }
988 
989 static nir_def *
990 lower_fdm_instr(struct nir_builder *b, nir_instr *instr, void *data)
991 {
992    const struct lower_fdm_options *options =
993       (const struct lower_fdm_options *)data;
994 
995    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
996 
997    nir_def *view;
998    if (options->multiview) {
999       nir_variable *view_var =
1000          nir_find_variable_with_location(b->shader, nir_var_shader_in,
1001                                          VARYING_SLOT_VIEW_INDEX);
1002 
1003       if (view_var == NULL) {
1004          view_var = nir_variable_create(b->shader, nir_var_shader_in,
1005                                         glsl_int_type(), NULL);
1006          view_var->data.location = VARYING_SLOT_VIEW_INDEX;
1007          view_var->data.interpolation = INTERP_MODE_FLAT;
1008          view_var->data.driver_location = b->shader->num_inputs++;
1009       }
1010 
1011       view = nir_load_var(b, view_var);
1012    } else {
1013       view = nir_imm_int(b, 0);
1014    }
1015 
1016    nir_def *frag_size =
1017       nir_load_frag_size_ir3(b, view, .range = options->num_views);
1018 
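   /* For gl_FragCoord, rescale xy: subtract the per-view fragment offset and
    * multiply by the per-view fragment size; z/w are passed through unscaled.
    */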
1019    if (intrin->intrinsic == nir_intrinsic_load_frag_coord) {
1020       nir_def *frag_offset =
1021          nir_load_frag_offset_ir3(b, view, .range = options->num_views);
1022       nir_def *unscaled_coord = nir_load_frag_coord_unscaled_ir3(b);
1023       nir_def *xy = nir_trim_vector(b, unscaled_coord, 2);
1024       xy = nir_fmul(b, nir_fsub(b, xy, frag_offset), nir_i2f32(b, frag_size));
1025       return nir_vec4(b,
1026                       nir_channel(b, xy, 0),
1027                       nir_channel(b, xy, 1),
1028                       nir_channel(b, unscaled_coord, 2),
1029                       nir_channel(b, unscaled_coord, 3));
1030    }
1031 
1032    assert(intrin->intrinsic == nir_intrinsic_load_frag_size);
1033    return frag_size;
1034 }
1035 
1036 static bool
1037 tu_nir_lower_fdm(nir_shader *shader, const struct lower_fdm_options *options)
1038 {
1039    return nir_shader_lower_instructions(shader, lower_fdm_filter,
1040                                         lower_fdm_instr, (void *)options);
1041 }
1042 
1043 static void
1044 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
1045 {
1046    assert(glsl_type_is_vector_or_scalar(type));
1047 
1048    unsigned comp_size =
1049       glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
1050    unsigned length = glsl_get_vector_elements(type);
1051    *size = comp_size * length;
1052    *align = comp_size;
1053 }
1054 
1055 static void
1056 tu_gather_xfb_info(nir_shader *nir, struct ir3_stream_output_info *info)
1057 {
1058    nir_shader_gather_xfb_info(nir);
1059 
1060    if (!nir->xfb_info)
1061       return;
1062 
1063    nir_xfb_info *xfb = nir->xfb_info;
1064 
1065    uint8_t output_map[VARYING_SLOT_TESS_MAX];
1066    memset(output_map, 0, sizeof(output_map));
1067 
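   /* Build a varying-slot -> driver-location map so xfb outputs can be
    * translated into register indices below.
    */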
1068    nir_foreach_shader_out_variable(var, nir) {
1069       unsigned slots = nir_variable_count_slots(var, var->type);
1070       for (unsigned i = 0; i < slots; i++)
1071          output_map[var->data.location + i] = var->data.driver_location + i;
1072    }
1073 
1074    assert(xfb->output_count <= IR3_MAX_SO_OUTPUTS);
1075    info->num_outputs = xfb->output_count;
1076 
1077    for (int i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
1078       info->stride[i] = xfb->buffers[i].stride / 4;
1079       info->buffer_to_stream[i] = xfb->buffer_to_stream[i];
1080    }
1081 
1082    info->streams_written = xfb->streams_written;
1083 
1084    for (int i = 0; i < xfb->output_count; i++) {
1085       info->output[i].register_index = output_map[xfb->outputs[i].location];
1086       info->output[i].start_component = xfb->outputs[i].component_offset;
1087       info->output[i].num_components =
1088                            util_bitcount(xfb->outputs[i].component_mask);
1089       info->output[i].output_buffer  = xfb->outputs[i].buffer;
1090       info->output[i].dst_offset = xfb->outputs[i].offset / 4;
1091       info->output[i].stream = xfb->buffer_to_stream[xfb->outputs[i].buffer];
1092    }
1093 }
1094 
1095 static uint32_t
1096 tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs)
1097 {
1098    const struct ir3_const_state *const_state = ir3_const_state(xs);
1099    uint32_t base = const_state->offsets.immediate;
1100    int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4);
1101 
1102    /* truncate size to avoid writing constants that the shader
1103     * does not use:
1104     */
1105    size = MIN2(size + base, xs->constlen) - base;
1106 
1107    return MAX2(size, 0) * 4;
1108 }
1109 
1110 /* We allocate fixed-length substreams for shader state; however, some
1111  * parts of the state may have unbounded length. Their additional space
1112  * requirements should be calculated here.
1113  */
1114 static uint32_t
1115 tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs)
1116 {
1117    const struct ir3_const_state *const_state = ir3_const_state(xs);
1118 
1119    uint32_t size = tu_xs_get_immediates_packet_size_dwords(xs);
1120 
1121    /* Variable number of UBO upload ranges. */
1122    size += 4 * const_state->ubo_state.num_enabled;
1123 
1124    /* Variable number of dwords for the primitive map */
1125    size += xs->input_size;
1126 
1127    size += xs->constant_data_size / 4;
1128 
1129    return size;
1130 }
1131 
1132 static const struct xs_config {
1133    uint16_t reg_sp_xs_config;
1134    uint16_t reg_sp_xs_instrlen;
1135    uint16_t reg_sp_xs_first_exec_offset;
1136    uint16_t reg_sp_xs_pvt_mem_hw_stack_offset;
1137    uint16_t reg_sp_xs_vgpr_config;
1138 } xs_config[] = {
1139    [MESA_SHADER_VERTEX] = {
1140       REG_A6XX_SP_VS_CONFIG,
1141       REG_A6XX_SP_VS_INSTRLEN,
1142       REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET,
1143       REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET,
1144       REG_A7XX_SP_VS_VGPR_CONFIG,
1145    },
1146    [MESA_SHADER_TESS_CTRL] = {
1147       REG_A6XX_SP_HS_CONFIG,
1148       REG_A6XX_SP_HS_INSTRLEN,
1149       REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET,
1150       REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET,
1151       REG_A7XX_SP_HS_VGPR_CONFIG,
1152    },
1153    [MESA_SHADER_TESS_EVAL] = {
1154       REG_A6XX_SP_DS_CONFIG,
1155       REG_A6XX_SP_DS_INSTRLEN,
1156       REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET,
1157       REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET,
1158       REG_A7XX_SP_DS_VGPR_CONFIG,
1159    },
1160    [MESA_SHADER_GEOMETRY] = {
1161       REG_A6XX_SP_GS_CONFIG,
1162       REG_A6XX_SP_GS_INSTRLEN,
1163       REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET,
1164       REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET,
1165       REG_A7XX_SP_GS_VGPR_CONFIG,
1166    },
1167    [MESA_SHADER_FRAGMENT] = {
1168       REG_A6XX_SP_FS_CONFIG,
1169       REG_A6XX_SP_FS_INSTRLEN,
1170       REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET,
1171       REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET,
1172       REG_A7XX_SP_FS_VGPR_CONFIG,
1173    },
1174    [MESA_SHADER_COMPUTE] = {
1175       REG_A6XX_SP_CS_CONFIG,
1176       REG_A6XX_SP_CS_INSTRLEN,
1177       REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET,
1178       REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET,
1179       REG_A7XX_SP_CS_VGPR_CONFIG,
1180    },
1181 };
1182 
1183 void
1184 tu6_emit_xs(struct tu_cs *cs,
1185             gl_shader_stage stage, /* xs->type, but xs may be NULL */
1186             const struct ir3_shader_variant *xs,
1187             const struct tu_pvtmem_config *pvtmem,
1188             uint64_t binary_iova)
1189 {
1190    const struct xs_config *cfg = &xs_config[stage];
1191 
1192    if (!xs) {
1193       /* shader stage disabled */
1194       return;
1195    }
1196 
1197    enum a6xx_threadsize thrsz =
1198       xs->info.double_threadsize ? THREAD128 : THREAD64;
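   /* SP_xS_CTRL_REG0: register footprints and branch stack depth come from
    * the compiled variant; FS and CS additionally program the wave size.
    */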
1199    switch (stage) {
1200    case MESA_SHADER_VERTEX:
1201       tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0(
1202                .halfregfootprint = xs->info.max_half_reg + 1,
1203                .fullregfootprint = xs->info.max_reg + 1,
1204                .branchstack = ir3_shader_branchstack_hw(xs),
1205                .mergedregs = xs->mergedregs,
1206       ));
1207       break;
1208    case MESA_SHADER_TESS_CTRL:
1209       tu_cs_emit_regs(cs, A6XX_SP_HS_CTRL_REG0(
1210                .halfregfootprint = xs->info.max_half_reg + 1,
1211                .fullregfootprint = xs->info.max_reg + 1,
1212                .branchstack = ir3_shader_branchstack_hw(xs),
1213       ));
1214       break;
1215    case MESA_SHADER_TESS_EVAL:
1216       tu_cs_emit_regs(cs, A6XX_SP_DS_CTRL_REG0(
1217                .halfregfootprint = xs->info.max_half_reg + 1,
1218                .fullregfootprint = xs->info.max_reg + 1,
1219                .branchstack = ir3_shader_branchstack_hw(xs),
1220       ));
1221       break;
1222    case MESA_SHADER_GEOMETRY:
1223       tu_cs_emit_regs(cs, A6XX_SP_GS_CTRL_REG0(
1224                .halfregfootprint = xs->info.max_half_reg + 1,
1225                .fullregfootprint = xs->info.max_reg + 1,
1226                .branchstack = ir3_shader_branchstack_hw(xs),
1227       ));
1228       break;
1229    case MESA_SHADER_FRAGMENT:
1230       tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0(
1231                .halfregfootprint = xs->info.max_half_reg + 1,
1232                .fullregfootprint = xs->info.max_reg + 1,
1233                .branchstack = ir3_shader_branchstack_hw(xs),
1234                .threadsize = thrsz,
1235                .varying = xs->total_in != 0,
1236                .lodpixmask = xs->need_full_quad,
1237                /* unknown bit, seems unnecessary */
1238                .unk24 = true,
1239                .pixlodenable = xs->need_pixlod,
1240                .mergedregs = xs->mergedregs,
1241       ));
1242       break;
1243    case MESA_SHADER_COMPUTE:
1244       thrsz = cs->device->physical_device->info->a6xx
1245             .supports_double_threadsize ? thrsz : THREAD128;
1246       tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0(
1247                .halfregfootprint = xs->info.max_half_reg + 1,
1248                .fullregfootprint = xs->info.max_reg + 1,
1249                .branchstack = ir3_shader_branchstack_hw(xs),
1250                .threadsize = thrsz,
1251                .mergedregs = xs->mergedregs,
1252       ));
1253       break;
1254    default:
1255       unreachable("bad shader stage");
1256    }
1257 
1258    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_instrlen, 1);
1259    tu_cs_emit(cs, xs->instrlen);
1260 
1261    /* emit program binary & private memory layout
1262     * binary_iova should be aligned to 1 instrlen unit (128 bytes)
1263     */
1264 
1265    assert((binary_iova & 0x7f) == 0);
1266    assert((pvtmem->iova & 0x1f) == 0);
1267 
1268    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_first_exec_offset, 7);
1269    tu_cs_emit(cs, 0);
1270    tu_cs_emit_qw(cs, binary_iova);
1271    tu_cs_emit(cs,
1272               A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(pvtmem->per_fiber_size));
1273    tu_cs_emit_qw(cs, pvtmem->iova);
1274    tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(pvtmem->per_sp_size) |
1275                   COND(pvtmem->per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT));
1276 
1277    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1);
1278    tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size));
1279 
1280    if (cs->device->physical_device->info->chip >= A7XX) {
1281       tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vgpr_config, 1);
1282       tu_cs_emit(cs, 0);
1283    }
1284 
1285    if (cs->device->physical_device->info->chip == A6XX) {
1286       uint32_t shader_preload_size =
1287          MIN2(xs->instrlen, cs->device->physical_device->info->a6xx.instr_cache_size);
1288 
1289       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
1290       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1291                      CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
1292                      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1293                      CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1294                      CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
1295       tu_cs_emit_qw(cs, binary_iova);
1296    }
1297 
1298    /* emit immediates */
1299 
1300    const struct ir3_const_state *const_state = ir3_const_state(xs);
1301    uint32_t base = const_state->offsets.immediate;
1302    unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs);
1303 
1304    if (immediate_size > 0) {
1305       assert(!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble);
1306       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size);
1307       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
1308                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1309                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1310                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1311                  CP_LOAD_STATE6_0_NUM_UNIT(immediate_size / 4));
1312       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1313       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1314 
1315       tu_cs_emit_array(cs, const_state->immediates, immediate_size);
1316    }
1317 
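   /* The shader's constant data is stored with its binary (at
    * constant_data_offset); expose it through a UBO and, when consts aren't
    * loaded via the preamble, also upload the ranges ir3 promoted into the
    * const file.
    */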
1318    if (const_state->consts_ubo.idx != -1) {
1319       uint64_t iova = binary_iova + xs->info.constant_data_offset;
1320       uint32_t offset = const_state->consts_ubo.idx;
1321 
1322       /* Upload UBO state for the constant data. */
1323       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
1324       tu_cs_emit(cs,
1325                  CP_LOAD_STATE6_0_DST_OFF(offset) |
1326                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
1327                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1328                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1329                  CP_LOAD_STATE6_0_NUM_UNIT(1));
1330       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1331       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1332       int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16);
1333       tu_cs_emit_qw(cs,
1334                     iova |
1335                     (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32);
1336 
1337       /* Upload the constant data to the const file if needed. */
1338       const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;
1339 
1340       if (!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
1341          for (int i = 0; i < ubo_state->num_enabled; i++) {
1342             if (ubo_state->range[i].ubo.block != offset ||
1343                 ubo_state->range[i].ubo.bindless) {
1344                continue;
1345             }
1346 
1347             uint32_t start = ubo_state->range[i].start;
1348             uint32_t end = ubo_state->range[i].end;
1349             uint32_t size = MIN2(end - start,
1350                                  (16 * xs->constlen) - ubo_state->range[i].offset);
1351 
1352             tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
1353             tu_cs_emit(cs,
1354                      CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
1355                      CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1356                      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1357                      CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1358                      CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
1359             tu_cs_emit_qw(cs, iova + start);
1360          }
1361       }
1362    }
1363 
1364    /* emit statically-known FS driver param */
1365    if (stage == MESA_SHADER_FRAGMENT && const_state->driver_params_ubo.size > 0) {
1366       uint32_t data[4] = {xs->info.double_threadsize ? 128 : 64, 0, 0, 0};
1367       uint32_t size = ARRAY_SIZE(data);
1368 
1369       /* A7XX TODO: Emit data via sub_cs instead of NOP */
1370       uint64_t iova = tu_cs_emit_data_nop(cs, data, size, 4);
1371       uint32_t base = const_state->driver_params_ubo.idx;
1372 
1373       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
1374       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
1375                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
1376                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1377                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1378                  CP_LOAD_STATE6_0_NUM_UNIT(1));
1379       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1380       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1381       int size_vec4s = DIV_ROUND_UP(size, 4);
1382       tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
1383    } else if (stage == MESA_SHADER_FRAGMENT && const_state->num_driver_params > 0) {
1384       uint32_t base = const_state->offsets.driver_param;
1385       int32_t size = DIV_ROUND_UP(MAX2(const_state->num_driver_params, 4), 4);
1386       size = MAX2(MIN2(size + base, xs->constlen) - base, 0);
1387 
1388       if (size > 0) {
1389          tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + 4);
1390          tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
1391                     CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1392                     CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1393                     CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1394                     CP_LOAD_STATE6_0_NUM_UNIT(size));
1395          tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1396          tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1397 
1398          tu_cs_emit(cs, xs->info.double_threadsize ? 128 : 64);
1399          tu_cs_emit(cs, 0);
1400          tu_cs_emit(cs, 0);
1401          tu_cs_emit(cs, 0);
1402       }
1403    }
1404 }
1405 
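/* Emit the compute shader config: shared-consts enable, the common xs
 * config/state, shared memory size, workgroup/local-invocation-id sysval
 * register ids, and the threadsize setup (which differs between A6XX and
 * A7XX).
 */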
1406 template <chip CHIP>
1407 static void
1408 tu6_emit_cs_config(struct tu_cs *cs,
1409                    const struct ir3_shader_variant *v,
1410                    const struct tu_pvtmem_config *pvtmem,
1411                    uint64_t binary_iova)
1412 {
1413    bool shared_consts_enable =
1414       ir3_const_state(v)->push_consts_type == IR3_PUSH_CONSTS_SHARED;
1415    tu6_emit_shared_consts_enable<CHIP>(cs, shared_consts_enable);
1416 
1417    tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
1418          .cs_state = true,
1419          .cs_ibo = true,
1420          .cs_shared_const = shared_consts_enable));
1421 
1422    tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_COMPUTE, v);
1423    tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
1424 
1425    uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
1426    tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
1427    tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
1428                   A6XX_SP_CS_UNKNOWN_A9B1_UNK6);
1429 
1430    if (CHIP == A6XX && cs->device->physical_device->info->a6xx.has_lpac) {
1431       tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1);
1432       tu_cs_emit(cs, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) |
1433                      A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6);
1434    }
1435 
1436    uint32_t local_invocation_id =
1437       ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
1438    uint32_t work_group_id =
1439       ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);
1440 
1441    /*
1442     * Devices that do not support double threadsize take the threadsize from
1443     * A6XX_HLSQ_FS_CNTL_0_THREADSIZE instead of A6XX_HLSQ_CS_CNTL_1_THREADSIZE
1444     * which is always set to THREAD128.
1445     */
1446    enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
1447    enum a6xx_threadsize thrsz_cs = cs->device->physical_device->info->a6xx
1448       .supports_double_threadsize ? thrsz : THREAD128;
1449    if (CHIP == A6XX) {
1450       tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
1451       tu_cs_emit(cs,
1452                  A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
1453                  A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
1454                  A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
1455                  A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
1456       tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
1457                      A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz_cs));
1458       if (!cs->device->physical_device->info->a6xx.supports_double_threadsize) {
1459          tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1);
1460          tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz));
1461       }
1462 
1463       if (cs->device->physical_device->info->a6xx.has_lpac) {
1464          tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2);
1465          tu_cs_emit(cs,
1466                     A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
1467                     A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
1468                     A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
1469                     A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
1470          tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
1471                   A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
1472       }
1473    } else {
1474       enum a7xx_cs_yalign yalign = (v->local_size[1] % 8 == 0)   ? CS_YALIGN_8
1475                                    : (v->local_size[1] % 4 == 0) ? CS_YALIGN_4
1476                                    : (v->local_size[1] % 2 == 0) ? CS_YALIGN_2
1477                                                                  : CS_YALIGN_1;
1478       tu_cs_emit_regs(
1479          cs, A7XX_HLSQ_CS_CNTL_1(
1480                    .linearlocalidregid = regid(63, 0), .threadsize = thrsz_cs,
1481                    /* A7XX TODO: blob either sets all of these unknowns
1482                     * together or doesn't set them at all.
1483                     */
1484                    .unk11 = true, .unk22 = true, .yalign = yalign, ));
1485 
1486       tu_cs_emit_regs(cs, HLSQ_FS_CNTL_0(CHIP, .threadsize = THREAD64));
1487 
1488       tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 1);
1489       tu_cs_emit(cs, A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
1490                         A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
1491                         A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
1492                         A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
1493 
1494       tu_cs_emit_regs(cs,
1495                       A7XX_SP_CS_CNTL_1(
1496                         .linearlocalidregid = regid(63, 0),
1497                         .threadsize = thrsz_cs,
1498                         /* A7XX TODO: enable UNK15 when we don't use subgroup ops. */
1499                         .unk15 = false, ));
1500 
1501       tu_cs_emit_regs(
1502          cs, A7XX_HLSQ_CS_LOCAL_SIZE(.localsizex = v->local_size[0] - 1,
1503                                      .localsizey = v->local_size[1] - 1,
1504                                      .localsizez = v->local_size[2] - 1, ));
1505 
1506       tu_cs_emit_regs(cs, A7XX_SP_CS_UNKNOWN_A9BE(0)); // Sometimes is 0x08000000
1507    }
1508 }
1509 
1510 #define TU6_EMIT_VFD_DEST_MAX_DWORDS (MAX_VERTEX_ATTRIBS + 2)
1511 
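/* Program a VFD_DEST_CNTL entry per vertex attribute, mapping each
 * attribute location to its VS input register and writemask; locations
 * with no matching VS input get a disabled (regid 63.0) entry.
 */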
1512 static void
1513 tu6_emit_vfd_dest(struct tu_cs *cs,
1514                   const struct ir3_shader_variant *vs)
1515 {
1516    int32_t input_for_attr[MAX_VERTEX_ATTRIBS];
1517    uint32_t attr_count = 0;
1518 
1519    for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; i++)
1520       input_for_attr[i] = -1;
1521 
1522    for (unsigned i = 0; i < vs->inputs_count; i++) {
1523       if (vs->inputs[i].sysval || vs->inputs[i].regid == regid(63, 0))
1524          continue;
1525 
1526       assert(vs->inputs[i].slot >= VERT_ATTRIB_GENERIC0);
1527       unsigned loc = vs->inputs[i].slot - VERT_ATTRIB_GENERIC0;
1528       input_for_attr[loc] = i;
1529       attr_count = MAX2(attr_count, loc + 1);
1530    }
1531 
1532    tu_cs_emit_regs(cs,
1533                    A6XX_VFD_CONTROL_0(
1534                      .fetch_cnt = attr_count, /* decode_cnt for binning pass ? */
1535                      .decode_cnt = attr_count));
1536 
1537    if (attr_count)
1538       tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DEST_CNTL_INSTR(0), attr_count);
1539 
1540    for (unsigned i = 0; i < attr_count; i++) {
1541       if (input_for_attr[i] >= 0) {
1542             unsigned input_idx = input_for_attr[i];
1543             tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0,
1544                              .writemask = vs->inputs[input_idx].compmask,
1545                              .regid = vs->inputs[input_idx].regid).value);
1546       } else {
1547             tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0,
1548                              .writemask = 0,
1549                              .regid = regid(63, 0)).value);
1550       }
1551    }
1552 }
1553 
1554 static enum a6xx_tex_prefetch_cmd
1555 tu6_tex_opc_to_prefetch_cmd(opc_t tex_opc)
1556 {
1557    switch (tex_opc) {
1558    case OPC_SAM:
1559       return TEX_PREFETCH_SAM;
1560    default:
1561       unreachable("Unknown tex opc for prefetch cmd");
1562    }
1563 }
1564 
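/* Emit FS input state: sampler prefetch commands, the sysval register ids
 * (frag coord, face, sample id/mask, barycentrics) in HLSQ_CONTROL_*, the
 * GRAS/RB interpolation and per-sample enables, and the VPC varying
 * disable masks derived from the FS inputs.
 */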
1565 template <chip CHIP>
1566 static void
1567 tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
1568 {
1569    uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
1570    uint32_t ij_regid[IJ_COUNT];
1571    uint32_t smask_in_regid;
1572 
1573    bool sample_shading = fs->per_samp | fs->key.sample_shading;
1574    bool enable_varyings = fs->total_in > 0;
1575 
1576    samp_id_regid   = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID);
1577    smask_in_regid  = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN);
1578    face_regid      = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE);
1579    coord_regid     = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD);
1580    zwcoord_regid   = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0);
1581    for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
1582       ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);
1583 
1584    if (fs->num_sampler_prefetch > 0) {
1585       /* It seems like ij_pix is *required* to be r0.x */
1586       assert(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]) ||
1587              ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
1588    }
1589 
1590    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);
1591    tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) |
1592                      COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID(0x1ff)) |
1593                      COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID4COORD(0x1ff)) |
1594                      COND(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]),
1595                           A6XX_SP_FS_PREFETCH_CNTL_IJ_WRITE_DISABLE) |
1596                      COND(fs->prefetch_end_of_quad,
1597                           A6XX_SP_FS_PREFETCH_CNTL_ENDOFQUAD));
1598    for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1599       const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1600       tu_cs_emit(
1601          cs, SP_FS_PREFETCH_CMD(
1602                 CHIP, i, .src = prefetch->src, .samp_id = prefetch->samp_id,
1603                 .tex_id = prefetch->tex_id, .dst = prefetch->dst,
1604                 .wrmask = prefetch->wrmask, .half = prefetch->half_precision,
1605                 .bindless = prefetch->bindless,
1606                 .cmd = tu6_tex_opc_to_prefetch_cmd(prefetch->tex_opc), ).value);
1607    }
1608 
1609    if (fs->num_sampler_prefetch > 0) {
1610       tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch);
1611       for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1612          const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1613          tu_cs_emit(cs,
1614                     A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) |
1615                     A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id));
1616       }
1617    }
1618 
1619    tu_cs_emit_regs(cs,
1620       HLSQ_CONTROL_1_REG(CHIP,
1621          .primallocthreshold =
1622             cs->device->physical_device->info->a6xx.prim_alloc_threshold),
1623       HLSQ_CONTROL_2_REG(CHIP, .faceregid = face_regid,
1624                          .sampleid = samp_id_regid,
1625                          .samplemask = smask_in_regid,
1626                          .centerrhw = ij_regid[IJ_PERSP_CENTER_RHW]),
1627       HLSQ_CONTROL_3_REG(CHIP, .ij_persp_pixel = ij_regid[IJ_PERSP_PIXEL],
1628                          .ij_linear_pixel = ij_regid[IJ_LINEAR_PIXEL],
1629                          .ij_persp_centroid = ij_regid[IJ_PERSP_CENTROID],
1630                          .ij_linear_centroid = ij_regid[IJ_LINEAR_CENTROID]),
1631       HLSQ_CONTROL_4_REG(CHIP, .ij_persp_sample = ij_regid[IJ_PERSP_SAMPLE],
1632                          .ij_linear_sample = ij_regid[IJ_LINEAR_SAMPLE],
1633                          .xycoordregid = coord_regid,
1634                          .zwcoordregid = zwcoord_regid),
1635       HLSQ_CONTROL_5_REG(CHIP, .dword = 0xfcfc), );
1636 
1637    if (CHIP >= A7XX) {
1638       uint32_t sysval_regs = 0;
1639       for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) {
1640          if (VALIDREG(ij_regid[i])) {
1641             if (i == IJ_PERSP_CENTER_RHW)
1642                sysval_regs += 1;
1643             else
1644                sysval_regs += 2;
1645          }
1646       }
1647 
1648       for (uint32_t sysval : { face_regid, samp_id_regid, smask_in_regid }) {
1649          if (VALIDREG(sysval))
1650             sysval_regs += 1;
1651       }
1652 
1653       for (uint32_t sysval : { coord_regid, zwcoord_regid }) {
1654          if (VALIDREG(sysval))
1655             sysval_regs += 2;
1656       }
1657 
1658       tu_cs_emit_regs(cs, A7XX_HLSQ_UNKNOWN_A9AE(.sysval_regs_count = sysval_regs,
1659                                                  .unk8 = 1,
1660                                                  .unk9 = 1));
1661    }
1662 
1663    enum a6xx_threadsize thrsz = fs->info.double_threadsize ? THREAD128 : THREAD64;
1664    tu_cs_emit_regs(cs, HLSQ_FS_CNTL_0(CHIP, .threadsize = thrsz, .varyings = enable_varyings));
1665 
1666    bool need_size = fs->frag_face || fs->fragcoord_compmask != 0;
1667    bool need_size_persamp = false;
1668    if (VALIDREG(ij_regid[IJ_PERSP_CENTER_RHW])) {
1669       if (sample_shading)
1670          need_size_persamp = true;
1671       else
1672          need_size = true;
1673    }
1674 
1675    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1);
1676    tu_cs_emit(cs,
1677          CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) |
1678          CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) |
1679          CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) |
1680          CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1681          CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) |
1682          CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1683          COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1684          COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1685          COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask)));
1686 
1687    tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2);
1688    tu_cs_emit(cs,
1689          CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) |
1690          CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) |
1691          CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) |
1692          CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1693          CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) |
1694          CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1695          COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1696          COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) |
1697          COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1698          COND(fs->fragcoord_compmask != 0,
1699                            A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask)));
1700    tu_cs_emit(cs,
1701          A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE(
1702             sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) |
1703          CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
1704          CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
1705          CONDREG(ij_regid[IJ_PERSP_CENTER_RHW], A6XX_RB_RENDER_CONTROL1_CENTERRHW) |
1706          COND(fs->post_depth_coverage, A6XX_RB_RENDER_CONTROL1_POSTDEPTHCOVERAGE)  |
1707          COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS));
1708 
1709    tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CNTL, 1);
1710    tu_cs_emit(cs, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE));
1711 
1712    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1);
1713    tu_cs_emit(cs, CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) |
1714               A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE(
1715                  sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER));
1716 
1717    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 1);
1718    tu_cs_emit(cs, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE));
1719 
1720    uint32_t varmask[4] = { 0 };
1721 
1722    for (int i = ir3_next_varying(fs, -1); i < fs->inputs_count;
1723         i = ir3_next_varying(fs, i)) {
1724       if (fs->inputs[i].inloc >= fs->total_in)
1725          continue;
1726 
1727       unsigned loc = fs->inputs[i].inloc;
1728       for (int j = 0; j < util_last_bit(fs->inputs[i].compmask); j++) {
1729          uint8_t comploc = loc + j;
1730          varmask[comploc / 32] |= 1 << (comploc % 32);
1731       }
1732    }
1733 
1734    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
1735    tu_cs_emit(cs, ~varmask[0]);
1736    tu_cs_emit(cs, ~varmask[1]);
1737    tu_cs_emit(cs, ~varmask[2]);
1738    tu_cs_emit(cs, ~varmask[3]);
1739 
1740    unsigned primid_loc = ir3_find_input_loc(fs, VARYING_SLOT_PRIMITIVE_ID);
1741    unsigned viewid_loc = ir3_find_input_loc(fs, VARYING_SLOT_VIEW_INDEX);
1742 
1743    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
1744    tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs->total_in) |
1745                   COND(fs && fs->total_in, A6XX_VPC_CNTL_0_VARYING) |
1746                   A6XX_VPC_CNTL_0_PRIMIDLOC(primid_loc) |
1747                   A6XX_VPC_CNTL_0_VIEWIDLOC(viewid_loc));
1748 }
1749 
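/* Emit FS output state: the depth/sample-mask/stencil-ref output register
 * ids, the per-MRT color output registers, and the matching SP/RB
 * render-components masks.
 */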
1750 static void
1751 tu6_emit_fs_outputs(struct tu_cs *cs,
1752                     const struct ir3_shader_variant *fs)
1753 {
1754    uint32_t smask_regid, posz_regid, stencilref_regid;
1755 
1756    posz_regid      = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH);
1757    smask_regid     = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK);
1758    stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL);
1759 
1760    int output_reg_count = 0;
1761    uint32_t fragdata_regid[8];
1762 
1763    assert(!fs->color0_mrt);
1764    for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) {
1765       fragdata_regid[i] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + i);
1766       if (VALIDREG(fragdata_regid[i]))
1767          output_reg_count = i + 1;
1768    }
1769 
1770    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1);
1771    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) |
1772                   A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) |
1773                   A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) |
1774                   COND(fs->dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1775 
1776    /* There is no point in enabling a component that is not written by the
1777     * shader. Per the VK spec this is UB, but a few apps depend on the
1778     * attachment not being changed if the FS has no corresponding output.
1779     */
1780    uint32_t fs_render_components = 0;
1781 
1782    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), output_reg_count);
1783    for (uint32_t i = 0; i < output_reg_count; i++) {
1784       tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) |
1785                      (COND(fragdata_regid[i] & HALF_REG_ID,
1786                            A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)));
1787 
1788       if (VALIDREG(fragdata_regid[i])) {
1789          fs_render_components |= 0xf << (i * 4);
1790       }
1791    }
1792 
1793    tu_cs_emit_regs(cs,
1794                    A6XX_SP_FS_RENDER_COMPONENTS(.dword = fs_render_components));
1795 
1796    tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 1);
1797    tu_cs_emit(cs, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
1798                   COND(fs->writes_smask, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
1799                   COND(fs->writes_stencilref, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
1800                   COND(fs->dual_src_blend, A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1801 
1802    tu_cs_emit_regs(cs,
1803                    A6XX_RB_RENDER_COMPONENTS(.dword = fs_render_components));
1804 }
1805 
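/* Emit VS-specific state: the multiview controls (PC/VFD, plus VPC on
 * A7XX), the VFD attribute destinations, and the vertex/instance/view/
 * primitive id sysval register ids in VFD_CONTROL_1.
 */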
1806 template <chip CHIP>
1807 void
1808 tu6_emit_vs(struct tu_cs *cs,
1809             const struct ir3_shader_variant *vs,
1810             uint32_t view_mask)
1811 {
1812    bool multi_pos_output = vs->multi_pos_output;
1813 
1814    uint32_t multiview_views = util_logbase2(view_mask) + 1;
1815    uint32_t multiview_cntl = view_mask ?
1816       A6XX_PC_MULTIVIEW_CNTL_ENABLE |
1817       A6XX_PC_MULTIVIEW_CNTL_VIEWS(multiview_views) |
1818       COND(!multi_pos_output, A6XX_PC_MULTIVIEW_CNTL_DISABLEMULTIPOS)
1819       : 0;
1820 
1821    /* Copy what the blob does here. This will emit an extra 0x3f
1822     * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
1823     * this is working around yet.
1824     */
1825    if (cs->device->physical_device->info->a6xx.has_cp_reg_write) {
1826       tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
1827       tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
1828       tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
1829    } else {
1830       tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_CNTL, 1);
1831    }
1832    tu_cs_emit(cs, multiview_cntl);
1833 
1834    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_MULTIVIEW_CNTL, 1);
1835    tu_cs_emit(cs, multiview_cntl);
1836 
1837    if (multiview_cntl &&
1838        cs->device->physical_device->info->a6xx.supports_multiview_mask) {
1839       tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_MASK, 1);
1840       tu_cs_emit(cs, view_mask);
1841    }
1842 
1843    if (CHIP >= A7XX) {
1844       tu_cs_emit_pkt4(cs, REG_A7XX_VPC_MULTIVIEW_CNTL, 1);
1845       tu_cs_emit(cs, multiview_cntl);
1846 
1847       tu_cs_emit_pkt4(cs, REG_A7XX_VPC_MULTIVIEW_MASK, 1);
1848       tu_cs_emit(cs, view_mask);
1849    }
1850 
1851    tu6_emit_vfd_dest(cs, vs);
1852 
1853    const uint32_t vertexid_regid =
1854          ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
1855    const uint32_t instanceid_regid =
1856          ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
1857 
1858    /* Note: we currently don't support multiview with tess or GS. If we did,
1859     * and the HW actually works, then we'd have to somehow share this across
1860     * stages. Note that the blob doesn't support this either.
1861     */
1862    const uint32_t viewid_regid =
1863       ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX);
1864 
1865    const uint32_t vs_primitiveid_regid =
1866       ir3_find_sysval_regid(vs, SYSTEM_VALUE_PRIMITIVE_ID);
1867 
1868    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_1, 1);
1869    tu_cs_emit(cs, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) |
1870                   A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) |
1871                   A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) |
1872                   A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid));
1873 }
1874 TU_GENX(tu6_emit_vs);
1875 
1876 template <chip CHIP>
1877 void
1878 tu6_emit_hs(struct tu_cs *cs,
1879             const struct ir3_shader_variant *hs)
1880 {
1881    const uint32_t hs_rel_patch_regid =
1882          ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3);
1883    const uint32_t hs_invocation_regid =
1884          ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3);
1885 
1886    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_2, 1);
1887    tu_cs_emit(cs, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) |
1888                   A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid));
1889 
1890    if (hs) {
1891       tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
1892       tu_cs_emit(cs, hs->tess.tcs_vertices_out);
1893    }
1894 }
1895 TU_GENX(tu6_emit_hs);
1896 
1897 template <chip CHIP>
1898 void
1899 tu6_emit_ds(struct tu_cs *cs,
1900             const struct ir3_shader_variant *ds)
1901 {
1902    const uint32_t ds_rel_patch_regid =
1903          ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3);
1904    const uint32_t tess_coord_x_regid =
1905          ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD);
1906    const uint32_t tess_coord_y_regid = VALIDREG(tess_coord_x_regid) ?
1907          tess_coord_x_regid + 1 :
1908          regid(63, 0);
1909    const uint32_t ds_primitiveid_regid =
1910          ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID);
1911 
1912    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_3, 2);
1913    tu_cs_emit(cs, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) |
1914                   A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) |
1915                   A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) |
1916                   A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid));
1917    tu_cs_emit(cs, 0x000000fc); /* VFD_CONTROL_4 */
1918 }
1919 TU_GENX(tu6_emit_ds);
1920 
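/* Map a GS output primitive type to the HW tess output enum used in
 * PC_PRIMITIVE_CNTL_5.
 */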
1921 static enum a6xx_tess_output
1922 primitive_to_tess(enum mesa_prim primitive) {
1923    switch (primitive) {
1924    case MESA_PRIM_POINTS:
1925       return TESS_POINTS;
1926    case MESA_PRIM_LINE_STRIP:
1927       return TESS_LINES;
1928    case MESA_PRIM_TRIANGLE_STRIP:
1929       return TESS_CW_TRIS;
1930    default:
1931       unreachable("");
1932    }
1933 }
1934 
1935 template <chip CHIP>
1936 void
1937 tu6_emit_gs(struct tu_cs *cs,
1938             const struct ir3_shader_variant *gs)
1939 {
1940    const uint32_t gsheader_regid =
1941          ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3);
1942 
1943    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_5, 1);
1944    tu_cs_emit(cs, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) |
1945                   0xfc00);
1946 
1947    if (gs) {
1948       uint32_t vertices_out, invocations;
1949 
1950       vertices_out = gs->gs.vertices_out - 1;
1951       enum a6xx_tess_output output = primitive_to_tess((enum mesa_prim) gs->gs.output_primitive);
1952       invocations = gs->gs.invocations - 1;
1953 
1954       uint32_t primitive_cntl =
1955          A6XX_PC_PRIMITIVE_CNTL_5(.gs_vertices_out = vertices_out,
1956                                   .gs_invocations = invocations,
1957                                   .gs_output = output,).value;
1958 
1959       tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
1960       tu_cs_emit(cs, primitive_cntl);
1961 
1962       if (CHIP >= A7XX) {
1963          tu_cs_emit_pkt4(cs, REG_A7XX_VPC_PRIMITIVE_CNTL_5, 1);
1964          tu_cs_emit(cs, primitive_cntl);
1965       } else {
1966          tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_PARAM, 1);
1967          tu_cs_emit(cs, 0xff);
1968       }
1969    }
1970 }
1971 TU_GENX(tu6_emit_gs);
1972 
1973 template <chip CHIP>
1974 void
1975 tu6_emit_fs(struct tu_cs *cs,
1976             const struct ir3_shader_variant *fs)
1977 {
1978    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_6, 1);
1979    tu_cs_emit(cs, COND(fs && fs->reads_primid, A6XX_VFD_CONTROL_6_PRIMID4PSEN));
1980 
1981    tu_cs_emit_regs(cs, A6XX_PC_PS_CNTL(.primitiveiden = fs && fs->reads_primid));
1982 
1983    if (CHIP >= A7XX) {
1984       tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2));
1985       tu_cs_emit_regs(cs, A7XX_HLSQ_FS_UNKNOWN_A9AA(.consts_load_disable = false));
1986    }
1987 
1988    if (fs) {
1989       tu6_emit_fs_inputs<CHIP>(cs, fs);
1990       tu6_emit_fs_outputs(cs, fs);
1991    } else {
1992       /* TODO: check if these can be skipped if fs is disabled */
1993       struct ir3_shader_variant dummy_variant = {};
1994       tu6_emit_fs_inputs<CHIP>(cs, &dummy_variant);
1995       tu6_emit_fs_outputs(cs, &dummy_variant);
1996    }
1997 }
1998 TU_GENX(tu6_emit_fs);
1999 
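/* Emit the draw state for a single shader variant: the common xs state
 * plus the stage-specific registers, or the compute config for compute
 * shaders.
 */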
2000 template <chip CHIP>
2001 static void
2002 tu6_emit_variant(struct tu_cs *cs,
2003                  gl_shader_stage stage,
2004                  const struct ir3_shader_variant *xs,
2005                  struct tu_pvtmem_config *pvtmem_config,
2006                  uint32_t view_mask,
2007                  uint64_t binary_iova)
2008 {
2009    if (stage == MESA_SHADER_COMPUTE) {
2010       tu6_emit_cs_config<CHIP>(cs, xs, pvtmem_config, binary_iova);
2011       return;
2012    }
2013 
2014    tu6_emit_xs(cs, stage, xs, pvtmem_config, binary_iova);
2015 
2016    switch (stage) {
2017    case MESA_SHADER_VERTEX:
2018       tu6_emit_vs<CHIP>(cs, xs, view_mask);
2019       break;
2020    case MESA_SHADER_TESS_CTRL:
2021       tu6_emit_hs<CHIP>(cs, xs);
2022       break;
2023    case MESA_SHADER_TESS_EVAL:
2024       tu6_emit_ds<CHIP>(cs, xs);
2025       break;
2026    case MESA_SHADER_GEOMETRY:
2027       tu6_emit_gs<CHIP>(cs, xs);
2028       break;
2029    case MESA_SHADER_FRAGMENT:
2030       tu6_emit_fs<CHIP>(cs, xs);
2031       break;
2032    default:
2033       unreachable("unknown shader stage");
2034    }
2035 }
2036 
2037 static VkResult
2038 tu_setup_pvtmem(struct tu_device *dev,
2039                 struct tu_shader *shader,
2040                 struct tu_pvtmem_config *config,
2041                 uint32_t pvtmem_bytes,
2042                 bool per_wave)
2043 {
2044    if (!pvtmem_bytes) {
2045       memset(config, 0, sizeof(*config));
2046       return VK_SUCCESS;
2047    }
2048 
2049    /* Allocating private memory BOs on a per-pipeline basis has a substantial
2050     * memory footprint, and it isn't required: the same BO can be shared by
2051     * multiple pipelines as long as they use the same private memory layout
2052     * (sizes and per-wave/per-fiber). Otherwise, another active pipeline
2053     * using the same BO with a differing private memory layout could
2054     * overwrite it, resulting in memory corruption.
2055     *
2056     * To avoid this, we create private memory BOs on a per-device level with
2057     * an associated private memory layout then dynamically grow them when
2058     * needed and reuse them across pipelines. Growth is done in terms of
2059     * powers of two so that we can avoid frequent reallocation of the
2060     * private memory BOs.
2061     */
2062 
2063    struct tu_pvtmem_bo *pvtmem_bo =
2064       per_wave ? &dev->wave_pvtmem_bo : &dev->fiber_pvtmem_bo;
2065    mtx_lock(&pvtmem_bo->mtx);
2066 
2067    if (pvtmem_bo->per_fiber_size < pvtmem_bytes) {
2068       if (pvtmem_bo->bo)
2069          tu_bo_finish(dev, pvtmem_bo->bo);
2070 
2071       pvtmem_bo->per_fiber_size =
2072          util_next_power_of_two(ALIGN(pvtmem_bytes, 512));
2073       pvtmem_bo->per_sp_size =
2074          ALIGN(pvtmem_bo->per_fiber_size *
2075                   dev->physical_device->info->fibers_per_sp,
2076                1 << 12);
2077       uint32_t total_size =
2078          dev->physical_device->info->num_sp_cores * pvtmem_bo->per_sp_size;
2079 
2080       VkResult result = tu_bo_init_new(dev, &pvtmem_bo->bo, total_size,
2081                                        TU_BO_ALLOC_NO_FLAGS, "pvtmem");
2082       if (result != VK_SUCCESS) {
2083          mtx_unlock(&pvtmem_bo->mtx);
2084          return result;
2085       }
2086    }
2087 
2088    config->per_wave = per_wave;
2089    config->per_fiber_size = pvtmem_bo->per_fiber_size;
2090    config->per_sp_size = pvtmem_bo->per_sp_size;
2091 
2092    shader->pvtmem_bo = tu_bo_get_ref(pvtmem_bo->bo);
2093    config->iova = shader->pvtmem_bo->iova;
2094 
2095    mtx_unlock(&pvtmem_bo->mtx);
2096 
2097    return VK_SUCCESS;
2098 }
2099 
2100 static uint64_t
2101 tu_upload_variant(struct tu_cs *cs,
2102                   const struct ir3_shader_variant *variant)
2103 {
2104    struct tu_cs_memory memory;
2105 
2106    if (!variant)
2107       return 0;
2108 
2109    /* This expects to get enough alignment because shaders are allocated
2110     * first and the total size is always aligned correctly.
2111     * Note: an assert in tu6_emit_xs_config validates the alignment.
2112     */
2113    tu_cs_alloc(cs, variant->info.size / 4, 1, &memory);
2114 
2115    memcpy(memory.map, variant->bin, variant->info.size);
2116    return memory.iova;
2117 }
2118 
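/* Upload the shader's variants (main, binning, safe-const) into a
 * suballocated BO, set up private memory, and record the draw states that
 * bind each variant (including the empty binning VPC state).
 */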
2119 static VkResult
2120 tu_upload_shader(struct tu_device *dev,
2121                  struct tu_shader *shader)
2122 {
2123    const struct ir3_shader_variant *v = shader->variant;
2124    const struct ir3_shader_variant *binning = v ? v->binning : NULL;
2125    const struct ir3_shader_variant *safe_const = shader->safe_const_variant;
2126 
2127    if (v->type == MESA_SHADER_VERTEX && v->stream_output.num_outputs != 0)
2128       binning = v;
2129 
2130    uint32_t size = 0;
2131    if (v->type == MESA_SHADER_VERTEX)
2132       size += TU6_EMIT_VFD_DEST_MAX_DWORDS;
2133 
2134    const unsigned xs_size = 128;
2135    const unsigned vpc_size = 32 + (v->stream_output.num_outputs != 0 ? 256 : 0);
2136 
2137    size += xs_size + tu_xs_get_additional_cs_size_dwords(v);
2138    size += v->info.size / 4;
2139    if (binning) {
2140       size += xs_size + tu_xs_get_additional_cs_size_dwords(binning);
2141       size += binning->info.size / 4;
2142    }
2143 
2144    if (safe_const) {
2145       size += xs_size + tu_xs_get_additional_cs_size_dwords(safe_const);
2146       size += safe_const->info.size / 4;
2147    }
2148 
2149    /* We emit an empty VPC including streamout state in the binning draw state */
2150    if (binning || v->type == MESA_SHADER_GEOMETRY) {
2151       size += vpc_size;
2152    }
2153 
2154    pthread_mutex_lock(&dev->pipeline_mutex);
2155    VkResult result = tu_suballoc_bo_alloc(&shader->bo, &dev->pipeline_suballoc,
2156                                           size * 4, 128);
2157    pthread_mutex_unlock(&dev->pipeline_mutex);
2158 
2159    if (result != VK_SUCCESS)
2160       return result;
2161 
2162    uint32_t pvtmem_size = v->pvtmem_size;
2163    bool per_wave = v->pvtmem_per_wave;
2164 
2165    if (v->binning) {
2166       pvtmem_size = MAX2(pvtmem_size, shader->variant->binning->pvtmem_size);
2167       if (!shader->variant->binning->pvtmem_per_wave)
2168          per_wave = false;
2169    }
2170 
2171    if (shader->safe_const_variant) {
2172       pvtmem_size = MAX2(pvtmem_size, shader->safe_const_variant->pvtmem_size);
2173       if (!shader->safe_const_variant->pvtmem_per_wave)
2174          per_wave = false;
2175 
2176       if (shader->safe_const_variant->binning) {
2177          pvtmem_size = MAX2(pvtmem_size, shader->safe_const_variant->binning->pvtmem_size);
2178          if (!shader->safe_const_variant->binning->pvtmem_per_wave)
2179             per_wave = false;
2180       }
2181    }
2182 
2183    struct tu_pvtmem_config pvtmem_config;
2184 
2185    result = tu_setup_pvtmem(dev, shader, &pvtmem_config, pvtmem_size, per_wave);
2186    if (result != VK_SUCCESS) {
2187       pthread_mutex_lock(&dev->pipeline_mutex);
2188       tu_suballoc_bo_free(&dev->pipeline_suballoc, &shader->bo);
2189       pthread_mutex_unlock(&dev->pipeline_mutex);
2190       return result;
2191    }
2192 
2193    tu_cs_init_suballoc(&shader->cs, dev, &shader->bo);
2194 
2195    uint64_t iova = tu_upload_variant(&shader->cs, v);
2196    uint64_t binning_iova = tu_upload_variant(&shader->cs, binning);
2197    uint64_t safe_const_iova = tu_upload_variant(&shader->cs, safe_const);
2198 
2199    struct tu_cs sub_cs;
2200    tu_cs_begin_sub_stream(&shader->cs, xs_size +
2201                           tu_xs_get_additional_cs_size_dwords(v), &sub_cs);
2202    TU_CALLX(dev, tu6_emit_variant)(
2203       &sub_cs, shader->variant->type, shader->variant, &pvtmem_config,
2204       shader->view_mask, iova);
2205    shader->state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2206 
2207    if (safe_const) {
2208       tu_cs_begin_sub_stream(&shader->cs, xs_size +
2209                              tu_xs_get_additional_cs_size_dwords(safe_const), &sub_cs);
2210       TU_CALLX(dev, tu6_emit_variant)(
2211          &sub_cs, v->type, safe_const, &pvtmem_config, shader->view_mask,
2212          safe_const_iova);
2213       shader->safe_const_state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2214    }
2215 
2216    if (binning) {
2217       tu_cs_begin_sub_stream(&shader->cs, xs_size + vpc_size +
2218                              tu_xs_get_additional_cs_size_dwords(binning), &sub_cs);
2219       TU_CALLX(dev, tu6_emit_variant)(
2220          &sub_cs, v->type, binning, &pvtmem_config, shader->view_mask,
2221          binning_iova);
2222       /* emit an empty VPC */
2223       TU_CALLX(dev, tu6_emit_vpc)(&sub_cs, binning, NULL, NULL, NULL, NULL);
2224       shader->binning_state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2225    }
2226 
2227    /* We don't support binning variants for GS, so the same draw state is used
2228     * when binning and when drawing, but the VPC draw state is not executed
2229     * when binning so we still need to generate an appropriate VPC config for
2230     * binning.
2231     */
2232    if (v->type == MESA_SHADER_GEOMETRY) {
2233       tu_cs_begin_sub_stream(&shader->cs, vpc_size, &sub_cs);
2234       TU_CALLX(dev, tu6_emit_vpc)(&sub_cs, NULL, NULL, NULL, v, NULL);
2235       shader->binning_state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2236    }
2237 
2238    return VK_SUCCESS;
2239 }
2240 
2241 static bool
2242 tu_shader_serialize(struct vk_pipeline_cache_object *object,
2243                     struct blob *blob);
2244 
2245 static struct vk_pipeline_cache_object *
2246 tu_shader_deserialize(struct vk_pipeline_cache *cache,
2247                       const void *key_data,
2248                       size_t key_size,
2249                       struct blob_reader *blob);
2250 
2251 static void
2252 tu_shader_destroy(struct vk_device *device,
2253                   struct vk_pipeline_cache_object *object)
2254 {
2255    struct tu_shader *shader =
2256       container_of(object, struct tu_shader, base);
2257 
2258    vk_pipeline_cache_object_finish(&shader->base);
2259    vk_free(&device->alloc, shader);
2260 }
2261 
2262 const struct vk_pipeline_cache_object_ops tu_shader_ops = {
2263    .serialize = tu_shader_serialize,
2264    .deserialize = tu_shader_deserialize,
2265    .destroy = tu_shader_destroy,
2266 };
2267 
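/* Allocate a tu_shader together with a copy of its cache key, initialize
 * the pipeline cache object, and mark the driver-internal UBOs as unused.
 */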
2268 static struct tu_shader *
2269 tu_shader_init(struct tu_device *dev, const void *key_data, size_t key_size)
2270 {
2271    VK_MULTIALLOC(ma);
2272    VK_MULTIALLOC_DECL(&ma, struct tu_shader, shader, 1);
2273    VK_MULTIALLOC_DECL_SIZE(&ma, char, obj_key_data, key_size);
2274 
2275    if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc,
2276                              VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
2277       return NULL;
2278 
2279    memcpy(obj_key_data, key_data, key_size);
2280 
2281    vk_pipeline_cache_object_init(&dev->vk, &shader->base,
2282                                  &tu_shader_ops, obj_key_data, key_size);
2283 
2284    shader->const_state.fdm_ubo.idx = -1;
2285    shader->const_state.dynamic_offsets_ubo.idx = -1;
2286    shader->const_state.inline_uniforms_ubo.idx = -1;
2287 
2288    return shader;
2289 }
2290 
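/* Pipeline cache serialization: write the const state, dynamic descriptor
 * sizes, view mask, active descriptor sets, the ir3 variants, and any
 * stage-specific (tes/fs) state.
 */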
2291 static bool
2292 tu_shader_serialize(struct vk_pipeline_cache_object *object,
2293                     struct blob *blob)
2294 {
2295    struct tu_shader *shader =
2296       container_of(object, struct tu_shader, base);
2297 
2298    blob_write_bytes(blob, &shader->const_state, sizeof(shader->const_state));
2299    blob_write_bytes(blob, &shader->dynamic_descriptor_sizes,
2300                     sizeof(shader->dynamic_descriptor_sizes));
2301    blob_write_uint32(blob, shader->view_mask);
2302    blob_write_uint8(blob, shader->active_desc_sets);
2303 
2304    ir3_store_variant(blob, shader->variant);
2305 
2306    if (shader->safe_const_variant) {
2307       blob_write_uint8(blob, 1);
2308       ir3_store_variant(blob, shader->safe_const_variant);
2309    } else {
2310       blob_write_uint8(blob, 0);
2311    }
2312 
2315    switch (shader->variant->type) {
2316    case MESA_SHADER_TESS_EVAL:
2317       blob_write_bytes(blob, &shader->tes, sizeof(shader->tes));
2318       break;
2319    case MESA_SHADER_FRAGMENT:
2320       blob_write_bytes(blob, &shader->fs, sizeof(shader->fs));
2321       break;
2322    default:
2323       break;
2324    }
2325 
2326    return true;
2327 }
2328 
2329 static struct vk_pipeline_cache_object *
2330 tu_shader_deserialize(struct vk_pipeline_cache *cache,
2331                       const void *key_data,
2332                       size_t key_size,
2333                       struct blob_reader *blob)
2334 {
2335    struct tu_device *dev =
2336       container_of(cache->base.device, struct tu_device, vk);
2337    struct tu_shader *shader =
2338       tu_shader_init(dev, key_data, key_size);
2339 
2340    if (!shader)
2341       return NULL;
2342 
2343    blob_copy_bytes(blob, &shader->const_state, sizeof(shader->const_state));
2344    blob_copy_bytes(blob, &shader->dynamic_descriptor_sizes,
2345                    sizeof(shader->dynamic_descriptor_sizes));
2346    shader->view_mask = blob_read_uint32(blob);
2347    shader->active_desc_sets = blob_read_uint8(blob);
2348 
2349    shader->variant = ir3_retrieve_variant(blob, dev->compiler, NULL);
2350 
2351    bool has_safe_const = blob_read_uint8(blob);
2352    if (has_safe_const)
2353       shader->safe_const_variant = ir3_retrieve_variant(blob, dev->compiler, NULL);
2354 
2355    switch (shader->variant->type) {
2356    case MESA_SHADER_TESS_EVAL:
2357       blob_copy_bytes(blob, &shader->tes, sizeof(shader->tes));
2358       break;
2359    case MESA_SHADER_FRAGMENT:
2360       blob_copy_bytes(blob, &shader->fs, sizeof(shader->fs));
2361       break;
2362    default:
2363       break;
2364    }
2365 
2366    VkResult result = tu_upload_shader(dev, shader);
2367    if (result != VK_SUCCESS) {
2368       vk_free(&dev->vk.alloc, shader);
2369       return NULL;
2370    }
2371 
2372    return &shader->base;
2373 }
2374 
2375 VkResult
2376 tu_shader_create(struct tu_device *dev,
2377                  struct tu_shader **shader_out,
2378                  nir_shader *nir,
2379                  const struct tu_shader_key *key,
2380                  const struct ir3_shader_key *ir3_key,
2381                  const void *key_data,
2382                  size_t key_size,
2383                  struct tu_pipeline_layout *layout,
2384                  bool executable_info)
2385 {
2386    struct tu_shader *shader = tu_shader_init(dev, key_data, key_size);
2387 
2388    if (!shader)
2389       return VK_ERROR_OUT_OF_HOST_MEMORY;
2390 
2391    const nir_opt_access_options access_options = {
2392       .is_vulkan = true,
2393    };
2394    NIR_PASS_V(nir, nir_opt_access, &access_options);
2395 
2396    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
2397       const nir_input_attachment_options att_options = {
2398          .use_fragcoord_sysval = true,
2399          .use_layer_id_sysval = false,
2400          /* When using multiview rendering, we must use
2401           * gl_ViewIndex as the layer id to pass to the texture
2402           * sampling function. gl_Layer doesn't work when
2403           * multiview is enabled.
2404           */
2405          .use_view_id_for_layer = key->multiview_mask != 0,
2406          .unscaled_input_attachment_ir3 = key->unscaled_input_fragcoord,
2407       };
2408       NIR_PASS_V(nir, nir_lower_input_attachments, &att_options);
2409    }
2410 
2411    /* This has to happen before lower_input_attachments, because we have to
2412     * lower input attachment coordinates except if unscaled.
2413     */
2414    const struct lower_fdm_options fdm_options = {
2415       .num_views = MAX2(util_last_bit(key->multiview_mask), 1),
2416       .adjust_fragcoord = key->fragment_density_map,
2417    };
2418    NIR_PASS_V(nir, tu_nir_lower_fdm, &fdm_options);
2419 
2421    /* This needs to happen before multiview lowering which rewrites store
2422     * instructions of the position variable, so that we can just rewrite one
2423     * store at the end instead of having to rewrite every store specified by
2424     * the user.
2425     */
2426    ir3_nir_lower_io_to_temporaries(nir);
2427 
2428    if (nir->info.stage == MESA_SHADER_VERTEX && key->multiview_mask) {
2429       tu_nir_lower_multiview(nir, key->multiview_mask, dev);
2430    }
2431 
2432    if (nir->info.stage == MESA_SHADER_FRAGMENT && key->force_sample_interp) {
2433       nir_foreach_shader_in_variable(var, nir) {
2434          if (!var->data.centroid)
2435             var->data.sample = true;
2436       }
2437    }
2438 
2439    NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_push_const,
2440               nir_address_format_32bit_offset);
2441 
2442    NIR_PASS_V(nir, nir_lower_explicit_io,
2443               nir_var_mem_ubo | nir_var_mem_ssbo,
2444               nir_address_format_vec2_index_32bit_offset);
2445 
2446    NIR_PASS_V(nir, nir_lower_explicit_io,
2447               nir_var_mem_global,
2448               nir_address_format_64bit_global);
2449 
2450    if (nir->info.stage == MESA_SHADER_COMPUTE) {
2451       NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
2452                  nir_var_mem_shared, shared_type_info);
2453       NIR_PASS_V(nir, nir_lower_explicit_io,
2454                  nir_var_mem_shared,
2455                  nir_address_format_32bit_offset);
2456 
2457       if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
2458          const unsigned chunk_size = 16; /* max single store size */
2459          /* Shared memory is allocated in 1024b chunks in HW, but the zero-init
2460           * extension only requires us to initialize the memory that the shader
2461           * is allocated at the API level, and it's up to the user to ensure
2462           * that accesses are limited to those bounds.
2463           */
2464          const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
2465          NIR_PASS_V(nir, nir_zero_initialize_shared_memory, shared_size, chunk_size);
2466       }
2467 
2468       const struct nir_lower_compute_system_values_options compute_sysval_options = {
2469          .has_base_workgroup_id = true,
2470       };
2471       NIR_PASS_V(nir, nir_lower_compute_system_values, &compute_sysval_options);
2472    }
2473 
2474    nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
2475    nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);
2476 
2477    /* Gather information for transform feedback. This should be called after:
2478     * - nir_split_per_member_structs.
2479     * - nir_remove_dead_variables with varyings, so that we can align
2480     *   stream outputs correctly.
2481     * - nir_assign_io_var_locations - to have a valid driver_location.
2482     */
2483    struct ir3_stream_output_info so_info = {};
2484    if (nir->info.stage == MESA_SHADER_VERTEX ||
2485          nir->info.stage == MESA_SHADER_TESS_EVAL ||
2486          nir->info.stage == MESA_SHADER_GEOMETRY)
2487       tu_gather_xfb_info(nir, &so_info);
2488 
2489    for (unsigned i = 0; i < layout->num_sets; i++) {
2490       if (layout->set[i].layout) {
2491          shader->dynamic_descriptor_sizes[i] =
2492             layout->set[i].layout->dynamic_offset_size;
2493       } else {
2494          shader->dynamic_descriptor_sizes[i] = -1;
2495       }
2496    }
2497 
2498    unsigned reserved_consts_vec4 = 0;
2499    NIR_PASS_V(nir, tu_lower_io, dev, shader, layout, &reserved_consts_vec4);
2500 
2501    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
2502 
2503    ir3_finalize_nir(dev->compiler, nir);
2504 
2505    const struct ir3_shader_options options = {
2506       .num_reserved_user_consts = reserved_consts_vec4,
2507       .api_wavesize = key->api_wavesize,
2508       .real_wavesize = key->real_wavesize,
2509       .push_consts_type = shader->const_state.push_consts.type,
2510       .push_consts_base = shader->const_state.push_consts.lo,
2511       .push_consts_dwords = shader->const_state.push_consts.dwords,
2512    };
2513 
2514    struct ir3_shader *ir3_shader =
2515       ir3_shader_from_nir(dev->compiler, nir, &options, &so_info);
2516 
2517    shader->variant =
2518       ir3_shader_create_variant(ir3_shader, ir3_key, executable_info);
2519 
2520    if (ir3_exceeds_safe_constlen(shader->variant)) {
2521       struct ir3_shader_key safe_constlen_key = *ir3_key;
2522       safe_constlen_key.safe_constlen = true;
2523       shader->safe_const_variant =
2524          ir3_shader_create_variant(ir3_shader, &safe_constlen_key,
2525                                    executable_info);
2526    }
2527 
2528    shader->view_mask = key->multiview_mask;
2529 
2530    switch (shader->variant->type) {
2531    case MESA_SHADER_TESS_EVAL: {
2532       const struct ir3_shader_variant *tes = shader->variant;
2533       if (tes->tess.point_mode) {
2534          shader->tes.tess_output_lower_left =
2535             shader->tes.tess_output_upper_left = TESS_POINTS;
2536       } else if (tes->tess.primitive_mode == TESS_PRIMITIVE_ISOLINES) {
2537          shader->tes.tess_output_lower_left =
2538             shader->tes.tess_output_upper_left = TESS_LINES;
2539       } else if (tes->tess.ccw) {
2540          /* Tessellation orientation in HW is specified with a lower-left
2541           * origin; we need to swap the windings if the origin is upper-left.
2542           */
2543          shader->tes.tess_output_lower_left = TESS_CCW_TRIS;
2544          shader->tes.tess_output_upper_left = TESS_CW_TRIS;
2545       } else {
2546          shader->tes.tess_output_lower_left = TESS_CW_TRIS;
2547          shader->tes.tess_output_upper_left = TESS_CCW_TRIS;
2548       }
2549 
2550       switch (tes->tess.spacing) {
2551       case TESS_SPACING_EQUAL:
2552          shader->tes.tess_spacing = TESS_EQUAL;
2553          break;
2554       case TESS_SPACING_FRACTIONAL_ODD:
2555          shader->tes.tess_spacing = TESS_FRACTIONAL_ODD;
2556          break;
2557       case TESS_SPACING_FRACTIONAL_EVEN:
2558          shader->tes.tess_spacing = TESS_FRACTIONAL_EVEN;
2559          break;
2560       case TESS_SPACING_UNSPECIFIED:
2561       default:
2562          unreachable("invalid tess spacing");
2563       }
2564 
2565       break;
2566    }
2567    case MESA_SHADER_FRAGMENT: {
2568       const struct ir3_shader_variant *fs = shader->variant;
2569       shader->fs.per_samp = fs->per_samp || ir3_key->sample_shading;
2570       shader->fs.has_fdm = key->fragment_density_map;
2571       if (fs->has_kill)
2572          shader->fs.lrz.status |= TU_LRZ_FORCE_DISABLE_WRITE;
2573       if (fs->no_earlyz || fs->writes_pos)
2574          shader->fs.lrz.status = TU_LRZ_FORCE_DISABLE_LRZ;
2575       /* FDM isn't compatible with LRZ, because the LRZ image uses the original
2576        * resolution and we would need to use the low resolution.
2577        *
2578        * TODO: Use a patchpoint to only disable LRZ for scaled bins.
2579        */
2580       if (key->fragment_density_map)
2581          shader->fs.lrz.status = TU_LRZ_FORCE_DISABLE_LRZ;
2582       if (!fs->fs.early_fragment_tests &&
2583           (fs->no_earlyz || fs->writes_pos || fs->writes_stencilref || fs->writes_smask)) {
2584          shader->fs.lrz.force_late_z = true;
2585       }
2586       break;
2587    }
2588    default:
2589       break;
2590    }
2591 
2592    VkResult result = tu_upload_shader(dev, shader);
2593    if (result != VK_SUCCESS) {
2594       vk_free(&dev->vk.alloc, shader);
2595       return result;
2596    }
2597 
2598    *shader_out = shader;
2599    return VK_SUCCESS;
2600 }
2601 
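/* Link adjacent stages from last to first: optimize varyings across each
 * producer/consumer pair, remove dead and unused varyings, compact the
 * remaining ones, and re-gather shader info afterwards.
 */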
2602 static void
2603 tu_link_shaders(nir_shader **shaders, unsigned shaders_count)
2604 {
2605    nir_shader *consumer = NULL;
2606    for (gl_shader_stage stage = (gl_shader_stage) (shaders_count - 1);
2607         stage >= MESA_SHADER_VERTEX; stage = (gl_shader_stage) (stage - 1)) {
2608       if (!shaders[stage])
2609          continue;
2610 
2611       nir_shader *producer = shaders[stage];
2612       if (!consumer) {
2613          consumer = producer;
2614          continue;
2615       }
2616 
      if (nir_link_opt_varyings(producer, consumer)) {
         NIR_PASS_V(consumer, nir_opt_constant_folding);
         NIR_PASS_V(consumer, nir_opt_algebraic);
         NIR_PASS_V(consumer, nir_opt_dce);
      }

      const nir_remove_dead_variables_options out_var_opts = {
         .can_remove_var = nir_vk_is_not_xfb_output,
      };
      NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, &out_var_opts);

      NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);

      bool progress = nir_remove_unused_varyings(producer, consumer);

      nir_compact_varyings(producer, consumer, true);
      if (progress) {
         if (nir_lower_global_vars_to_local(producer)) {
            /* Remove dead writes, which can remove input loads */
            NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_temp, NULL);
            NIR_PASS_V(producer, nir_opt_dce);
         }
         nir_lower_global_vars_to_local(consumer);
      }

      consumer = producer;
   }

   /* Gather info after linking so that we can fill out the ir3 shader key.
    */
   for (gl_shader_stage stage = MESA_SHADER_VERTEX;
        stage <= MESA_SHADER_FRAGMENT; stage = (gl_shader_stage) (stage + 1)) {
      if (shaders[stage])
         nir_shader_gather_info(shaders[stage],
                                nir_shader_get_entrypoint(shaders[stage]));
   }
}

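/* Translate the NIR tessellation primitive mode into the ir3 tessellation
 * enum used in the shader key.
 */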
static uint32_t
tu6_get_tessmode(const struct nir_shader *shader)
{
   enum tess_primitive_mode primitive_mode = shader->info.tess._primitive_mode;
   switch (primitive_mode) {
   case TESS_PRIMITIVE_ISOLINES:
      return IR3_TESS_ISOLINES;
   case TESS_PRIMITIVE_TRIANGLES:
      return IR3_TESS_TRIANGLES;
   case TESS_PRIMITIVE_QUADS:
      return IR3_TESS_QUADS;
   case TESS_PRIMITIVE_UNSPECIFIED:
      return IR3_TESS_NONE;
   default:
      unreachable("bad tessmode");
   }
}

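/* Compile all stages of a pipeline: translate each provided stage to NIR,
 * link the stages together, derive the shared ir3_shader_key, and finally
 * create a tu_shader per stage, skipping stages the caller already supplied.
 */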
VkResult
tu_compile_shaders(struct tu_device *device,
                   const VkPipelineShaderStageCreateInfo **stage_infos,
                   nir_shader **nir,
                   const struct tu_shader_key *keys,
                   struct tu_pipeline_layout *layout,
                   const unsigned char *pipeline_sha1,
                   struct tu_shader **shaders,
                   char **nir_initial_disasm,
                   void *nir_initial_disasm_mem_ctx,
                   nir_shader **nir_out,
                   VkPipelineCreationFeedback *stage_feedbacks)
{
   struct ir3_shader_key ir3_key = {};
   VkResult result = VK_SUCCESS;
   void *mem_ctx = ralloc_context(NULL);

   for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
        stage = (gl_shader_stage) (stage + 1)) {
      const VkPipelineShaderStageCreateInfo *stage_info = stage_infos[stage];
      if (!stage_info)
         continue;

      int64_t stage_start = os_time_get_nano();

      nir[stage] = tu_spirv_to_nir(device, mem_ctx, stage_info, stage);
      if (!nir[stage]) {
         result = VK_ERROR_OUT_OF_HOST_MEMORY;
         goto fail;
      }

      stage_feedbacks[stage].flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
      stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
   }

   if (nir[MESA_SHADER_GEOMETRY])
      ir3_key.has_gs = true;

   ir3_key.sample_shading = keys[MESA_SHADER_FRAGMENT].force_sample_interp;

   if (nir_initial_disasm) {
      for (gl_shader_stage stage = MESA_SHADER_VERTEX;
           stage < MESA_SHADER_STAGES;
           stage = (gl_shader_stage) (stage + 1)) {
         if (!nir[stage])
            continue;

         nir_initial_disasm[stage] =
            nir_shader_as_str(nir[stage], nir_initial_disasm_mem_ctx);
      }
   }

   tu_link_shaders(nir, MESA_SHADER_STAGES);

   if (nir_out) {
      for (gl_shader_stage stage = MESA_SHADER_VERTEX;
           stage < MESA_SHADER_STAGES; stage = (gl_shader_stage) (stage + 1)) {
         if (!nir[stage])
            continue;

         nir_out[stage] = nir_shader_clone(NULL, nir[stage]);
      }
   }

   /* With pipelines, tessellation modes can be set on either shader, for
    * compatibility with HLSL and GLSL, and the driver is supposed to merge
    * them. Shader objects require modes to be set on at least the TES, except
    * for OutputVertices, which has to be set at least on the TCS. Make sure
    * all modes are set on the TES when compiling multiple shaders together,
    * and from this point on use the modes from the TES (and the output
    * vertex count from the TCS).
    */
   if (nir[MESA_SHADER_TESS_EVAL]) {
      nir_shader *tcs = nir[MESA_SHADER_TESS_CTRL];
      nir_shader *tes = nir[MESA_SHADER_TESS_EVAL];

      if (tes->info.tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED)
         tes->info.tess._primitive_mode = tcs->info.tess._primitive_mode;

      tes->info.tess.point_mode |= tcs->info.tess.point_mode;
      tes->info.tess.ccw |= tcs->info.tess.ccw;

      if (tes->info.tess.spacing == TESS_SPACING_UNSPECIFIED) {
         tes->info.tess.spacing = tcs->info.tess.spacing;
      }

      if (tcs->info.tess.tcs_vertices_out == 0)
         tcs->info.tess.tcs_vertices_out = tes->info.tess.tcs_vertices_out;

      ir3_key.tessellation = tu6_get_tessmode(tes);
   }

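   /* The TCS only needs to store PrimitiveID if a downstream stage reads it:
    * the FS reads it as a varying, the TES/GS as a system value.
    */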
   for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
        stage = (gl_shader_stage) (stage + 1)) {
      if (!nir[stage])
         continue;

      if (stage > MESA_SHADER_TESS_CTRL) {
         if (stage == MESA_SHADER_FRAGMENT) {
            ir3_key.tcs_store_primid = ir3_key.tcs_store_primid ||
               (nir[stage]->info.inputs_read & (1ull << VARYING_SLOT_PRIMITIVE_ID));
         } else {
            ir3_key.tcs_store_primid = ir3_key.tcs_store_primid ||
               BITSET_TEST(nir[stage]->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
         }
      }
   }

   /* In the tess-but-not-FS case we don't know whether the FS will read
    * PrimID, so we need to unconditionally store it.
    */
   if (nir[MESA_SHADER_TESS_CTRL] && !nir[MESA_SHADER_FRAGMENT])
      ir3_key.tcs_store_primid = true;

   for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
        stage = (gl_shader_stage) (stage + 1)) {
      if (!nir[stage] || shaders[stage])
         continue;

      int64_t stage_start = os_time_get_nano();

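      /* Each stage's cache key is the pipeline hash with the stage index
       * appended as the final byte.
       */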
      unsigned char shader_sha1[21];
      memcpy(shader_sha1, pipeline_sha1, 20);
      shader_sha1[20] = (unsigned char) stage;

      result = tu_shader_create(device,
                                &shaders[stage], nir[stage], &keys[stage],
                                &ir3_key, shader_sha1, sizeof(shader_sha1),
                                layout, !!nir_initial_disasm);
      if (result != VK_SUCCESS) {
         goto fail;
      }

      stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
   }

   ralloc_free(mem_ctx);

   return VK_SUCCESS;

fail:
   ralloc_free(mem_ctx);

   for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
        stage = (gl_shader_stage) (stage + 1)) {
      if (shaders[stage]) {
         tu_shader_destroy(device, shaders[stage]);
      }
      if (nir_out && nir_out[stage]) {
         ralloc_free(nir_out[stage]);
      }
   }

   return result;
}

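/* Pick the subgroup size exposed to the app (api_wavesize) and the wave size
 * actually used by the hardware (real_wavesize). On hardware without double
 * threadsize both are forced to the single wave size; otherwise the API size
 * follows VK_EXT_subgroup_size_control, and the real size is only pinned to
 * the API size when full subgroups are required.
 */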
void
tu_shader_key_subgroup_size(struct tu_shader_key *key,
                            bool allow_varying_subgroup_size,
                            bool require_full_subgroups,
                            const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info,
                            struct tu_device *dev)
{
   enum ir3_wavesize_option api_wavesize, real_wavesize;
   if (!dev->physical_device->info->a6xx.supports_double_threadsize) {
      api_wavesize = IR3_SINGLE_ONLY;
      real_wavesize = IR3_SINGLE_ONLY;
   } else {
      if (allow_varying_subgroup_size) {
         api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE;
      } else {
         if (subgroup_info) {
            if (subgroup_info->requiredSubgroupSize == dev->compiler->threadsize_base) {
               api_wavesize = IR3_SINGLE_ONLY;
            } else {
               assert(subgroup_info->requiredSubgroupSize == dev->compiler->threadsize_base * 2);
               api_wavesize = IR3_DOUBLE_ONLY;
            }
         } else {
            /* Match the exposed subgroupSize. */
            api_wavesize = IR3_DOUBLE_ONLY;
         }

         if (require_full_subgroups)
            real_wavesize = api_wavesize;
         else if (api_wavesize == IR3_SINGLE_ONLY)
            real_wavesize = IR3_SINGLE_ONLY;
         else
            real_wavesize = IR3_SINGLE_OR_DOUBLE;
      }
   }

   key->api_wavesize = api_wavesize;
   key->real_wavesize = real_wavesize;
}

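/* Create a placeholder shader for a stage that a pipeline doesn't use: it
 * only emits the draw state that programs a null program for that stage.
 */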
static VkResult
tu_empty_shader_create(struct tu_device *dev,
                       struct tu_shader **shader_out,
                       gl_shader_stage stage)
{
   struct tu_shader *shader = tu_shader_init(dev, NULL, 0);

   if (!shader)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   pthread_mutex_lock(&dev->pipeline_mutex);
   VkResult result = tu_suballoc_bo_alloc(&shader->bo, &dev->pipeline_suballoc,
                                          32 * 4, 128);
   pthread_mutex_unlock(&dev->pipeline_mutex);

   if (result != VK_SUCCESS) {
      vk_free(&dev->vk.alloc, shader);
      return result;
   }

   tu_cs_init_suballoc(&shader->cs, dev, &shader->bo);

   struct tu_pvtmem_config pvtmem_config = { };

   struct tu_cs sub_cs;
   tu_cs_begin_sub_stream(&shader->cs, 32, &sub_cs);
   TU_CALLX(dev, tu6_emit_variant)(&sub_cs, stage, NULL, &pvtmem_config, 0, 0);
   shader->state = tu_cs_end_draw_state(&shader->cs, &sub_cs);

   *shader_out = shader;
   return VK_SUCCESS;
}

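/* Unlike the other empty stages, the no-op fragment shader is a real ir3
 * variant: build an empty NIR fragment shader and compile it, optionally
 * tagged for fragment density map use (which forces LRZ off).
 */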
static VkResult
tu_empty_fs_create(struct tu_device *dev, struct tu_shader **shader,
                   bool fragment_density_map)
{
   struct ir3_shader_key key = {};
   const struct ir3_shader_options options = {};
   struct ir3_stream_output_info so_info = {};
   const nir_shader_compiler_options *nir_options =
      ir3_get_compiler_options(dev->compiler);
   nir_builder fs_b;

   fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, nir_options,
                                         "noop_fs");

   *shader = tu_shader_init(dev, NULL, 0);
   if (!*shader)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   (*shader)->fs.has_fdm = fragment_density_map;
   if (fragment_density_map)
      (*shader)->fs.lrz.status = TU_LRZ_FORCE_DISABLE_LRZ;

   for (unsigned i = 0; i < MAX_SETS; i++)
      (*shader)->dynamic_descriptor_sizes[i] = -1;

   struct ir3_shader *ir3_shader =
      ir3_shader_from_nir(dev->compiler, fs_b.shader, &options, &so_info);
   (*shader)->variant = ir3_shader_create_variant(ir3_shader, &key, false);

   return tu_upload_shader(dev, *shader);
}

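/* Create the device-wide empty TCS/TES/GS and the two no-op fragment shaders
 * (with and without FDM) up front; on failure, drop whatever was created.
 */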
VkResult
tu_init_empty_shaders(struct tu_device *dev)
{
   VkResult result;

   result = tu_empty_shader_create(dev, &dev->empty_tcs, MESA_SHADER_TESS_CTRL);
   if (result != VK_SUCCESS)
      goto out;

   result = tu_empty_shader_create(dev, &dev->empty_tes, MESA_SHADER_TESS_EVAL);
   if (result != VK_SUCCESS)
      goto out;

   result = tu_empty_shader_create(dev, &dev->empty_gs, MESA_SHADER_GEOMETRY);
   if (result != VK_SUCCESS)
      goto out;

   result = tu_empty_fs_create(dev, &dev->empty_fs, false);
   if (result != VK_SUCCESS)
      goto out;

   result = tu_empty_fs_create(dev, &dev->empty_fs_fdm, true);
   if (result != VK_SUCCESS)
      goto out;

   return VK_SUCCESS;

out:
   if (dev->empty_tcs)
      vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tcs->base);
   if (dev->empty_tes)
      vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tes->base);
   if (dev->empty_gs)
      vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_gs->base);
   if (dev->empty_fs)
      vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs->base);
   if (dev->empty_fs_fdm)
      vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs_fdm->base);
   return result;
}

void
tu_destroy_empty_shaders(struct tu_device *dev)
{
   vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tcs->base);
   vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tes->base);
   vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_gs->base);
   vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs->base);
   vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs_fdm->base);
}

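/* Tear down a shader: finish its command stream, return its suballocated BO,
 * and release any private memory BO before freeing the object itself.
 */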
void
tu_shader_destroy(struct tu_device *dev,
                  struct tu_shader *shader)
{
   tu_cs_finish(&shader->cs);

   pthread_mutex_lock(&dev->pipeline_mutex);
   tu_suballoc_bo_free(&dev->pipeline_suballoc, &shader->bo);
   pthread_mutex_unlock(&dev->pipeline_mutex);

   if (shader->pvtmem_bo)
      tu_bo_finish(dev, shader->pvtmem_bo);

   vk_free(&dev->vk.alloc, shader);
}