1 /*
2  * Copyright © 2019 Google LLC
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "tu_shader.h"
7 
8 #include "spirv/nir_spirv.h"
9 #include "util/mesa-sha1.h"
10 #include "nir/nir_xfb_info.h"
11 #include "vk_nir.h"
12 #include "vk_nir_convert_ycbcr.h"
13 #include "vk_pipeline.h"
14 #include "vk_util.h"
15 
16 #include "ir3/ir3_compiler.h"
17 #include "ir3/ir3_nir.h"
18 
19 #include "tu_device.h"
20 #include "tu_descriptor_set.h"
21 #include "tu_lrz.h"
22 #include "tu_pipeline.h"
23 #include "tu_rmv.h"
24 
25 #include <initializer_list>
26 
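/* Map the robustness flags from the shader key onto the NIR variable modes
 * that the ir3 backend should treat as robust.
 */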
27 static void
28 init_ir3_nir_options(struct ir3_shader_nir_options *options,
29                      const struct tu_shader_key *key)
30 {
31    *options = {
32       .robust_modes = (nir_variable_mode)
33          ((key->robust_storage_access2 ? nir_var_mem_ssbo : 0) |
34           (key->robust_uniform_access2 ? nir_var_mem_ubo : 0)),
35    };
36 }
37 
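/* Translate the SPIR-V for one pipeline stage into NIR and run the early,
 * layout-independent lowering and optimization passes on it.
 */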
38 nir_shader *
39 tu_spirv_to_nir(struct tu_device *dev,
40                 void *mem_ctx,
41                 VkPipelineCreateFlags2KHR pipeline_flags,
42                 const VkPipelineShaderStageCreateInfo *stage_info,
43                 const struct tu_shader_key *key,
44                 gl_shader_stage stage)
45 {
46    /* TODO these are made-up */
47    const struct spirv_to_nir_options spirv_options = {
48       /* ViewID is a sysval in geometry stages and an input in the FS */
49       .view_index_is_input =
50          stage == MESA_SHADER_FRAGMENT &&
51          !key->lower_view_index_to_device_index,
52 
53       /* Use 16-bit math for RelaxedPrecision ALU ops */
54       .mediump_16bit_alu = true,
55 
56       .ubo_addr_format = nir_address_format_vec2_index_32bit_offset,
57       .ssbo_addr_format = nir_address_format_vec2_index_32bit_offset,
58 
59       /* Accessed via stg/ldg */
60       .phys_ssbo_addr_format = nir_address_format_64bit_global,
61 
62       /* Accessed via the const register file */
63       .push_const_addr_format = nir_address_format_logical,
64 
65       /* Accessed via ldl/stl */
66       .shared_addr_format = nir_address_format_32bit_offset,
67 
68       /* Accessed via stg/ldg (not used with Vulkan?) */
69       .global_addr_format = nir_address_format_64bit_global,
70    };
71 
72    const nir_shader_compiler_options *nir_options =
73       ir3_get_compiler_options(dev->compiler);
74 
75    nir_shader *nir;
76    VkResult result =
77       vk_pipeline_shader_stage_to_nir(&dev->vk, pipeline_flags, stage_info,
78                                       &spirv_options, nir_options,
79                                       mem_ctx, &nir);
80    if (result != VK_SUCCESS)
81       return NULL;
82 
83    /* ir3 uses num_ubos and num_ssbos to track the number of *bindful*
84     * UBOs/SSBOs, but spirv_to_nir sets them to the total number of objects
85     * which is useless for us, so reset them here.
86     */
87    nir->info.num_ubos = 0;
88    nir->info.num_ssbos = 0;
89 
90    if (TU_DEBUG(NIR)) {
91       fprintf(stderr, "translated nir:\n");
92       nir_print_shader(nir, stderr);
93    }
94 
95    const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = {
96       .point_coord = true,
97    };
98    NIR_PASS_V(nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings);
99 
100    NIR_PASS_V(nir, nir_lower_global_vars_to_local);
101 
102    /* Older glslang missing bf6efd0316d8 ("SPV: Fix #2293: keep relaxed
103     * precision on arg passed to relaxed param") will pass function args through
104     * a highp temporary, so we need the nir_opt_find_array_copies() and a copy
105     * prop before we lower mediump vars, or you'll be unable to optimize out
106     * array copies after lowering.  We do this before splitting copies, since
107     * that works against nir_opt_find_array_copies().
108     */
109    NIR_PASS_V(nir, nir_opt_find_array_copies);
110    NIR_PASS_V(nir, nir_opt_copy_prop_vars);
111    NIR_PASS_V(nir, nir_opt_dce);
112 
113    NIR_PASS_V(nir, nir_split_var_copies);
114    NIR_PASS_V(nir, nir_lower_var_copies);
115 
116    NIR_PASS_V(nir, nir_lower_mediump_vars, nir_var_function_temp | nir_var_shader_temp | nir_var_mem_shared);
117    NIR_PASS_V(nir, nir_opt_copy_prop_vars);
118    NIR_PASS_V(nir, nir_opt_combine_stores, nir_var_all);
119 
120    NIR_PASS_V(nir, nir_lower_system_values);
121    NIR_PASS_V(nir, nir_lower_is_helper_invocation);
122 
123    if (key->lower_view_index_to_device_index)
124       NIR_PASS_V(nir, nir_lower_view_index_to_device_index);
125 
126    struct ir3_shader_nir_options options;
127    init_ir3_nir_options(&options, key);
128    ir3_optimize_loop(dev->compiler, &options, nir);
129 
130    NIR_PASS_V(nir, nir_opt_conditional_discard);
131 
132    return nir;
133 }
134 
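/* Rewrite a load_push_constant intrinsic into a load from the const register
 * file (load_const_ir3), offset into either the shared-consts range or this
 * stage's push-constant allocation.
 */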
135 static void
136 lower_load_push_constant(struct tu_device *dev,
137                          nir_builder *b,
138                          nir_intrinsic_instr *instr,
139                          struct tu_shader *shader,
140                          const struct tu_pipeline_layout *layout,
141                          uint32_t push_consts_offset_vec4)
142 {
143    uint32_t base = nir_intrinsic_base(instr);
144    assert(base % 4 == 0);
145 
146    if (tu6_shared_constants_enable(layout, dev->compiler)) {
147       /* All stages share the same range.  We could potentially add
148        * push_constant_offset to layout and apply it, but this is good for
149        * now.
150        */
151       base += dev->compiler->shared_consts_base_offset * 4;
152    } else {
153       assert(base >= shader->const_state.push_consts.lo_dwords);
154       base -= shader->const_state.push_consts.lo_dwords;
155       base += push_consts_offset_vec4 * 4;
156    }
157 
158    nir_def *load =
159       nir_load_const_ir3(b, instr->num_components, instr->def.bit_size,
160                          nir_ushr_imm(b, instr->src[0].ssa, 2), .base = base);
161 
162    nir_def_replace(&instr->def, load);
163 }
164 
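/* Rewrite vulkan_resource_index into a (set, descriptor offset, shift) vec3
 * in units of descriptors. Dynamic UBO/SSBO descriptors are redirected to the
 * reserved descriptor set, with their base looked up at runtime when
 * independent sets make the offset unknown at compile time.
 */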
165 static void
166 lower_vulkan_resource_index(struct tu_device *dev, nir_builder *b,
167                             nir_intrinsic_instr *instr,
168                             struct tu_shader *shader,
169                             const struct tu_pipeline_layout *layout)
170 {
171    struct ir3_compiler *compiler = dev->compiler;
172    nir_def *vulkan_idx = instr->src[0].ssa;
173 
174    unsigned set = nir_intrinsic_desc_set(instr);
175    unsigned binding = nir_intrinsic_binding(instr);
176    struct tu_descriptor_set_layout *set_layout = layout->set[set].layout;
177    struct tu_descriptor_set_binding_layout *binding_layout =
178       &set_layout->binding[binding];
179    nir_def *base;
180 
181    if (binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
182       return;
183 
184    shader->active_desc_sets |= 1u << set;
185 
186    if (vk_descriptor_type_is_dynamic(binding_layout->type)) {
187       int offset = 0;
188       for (unsigned i = 0; i < set; i++) {
189          if (shader->dynamic_descriptor_sizes[i] >= 0) {
190             offset += shader->dynamic_descriptor_sizes[i];
191          } else {
192             offset = -1;
193             break;
194          }
195       }
196 
197       if (offset < 0) {
198          /* With independent sets, we don't know
199           * layout->set[set].dynamic_offset_start until after link time, which
200           * with fast linking means after the shader is compiled. We have to
201           * get it from the const file instead.
202           */
203          base = nir_imm_int(b, binding_layout->dynamic_offset_offset / (4 * A6XX_TEX_CONST_DWORDS));
204          nir_def *dynamic_offset_start;
205          if (compiler->load_shader_consts_via_preamble) {
206             dynamic_offset_start =
207                ir3_load_driver_ubo(b, 1, &shader->const_state.dynamic_offsets_ubo, set);
208          } else {
209             dynamic_offset_start = nir_load_const_ir3(
210                b, 1, 32, nir_imm_int(b, 0),
211                .base = shader->const_state.dynamic_offset_loc + set);
212          }
213          base = nir_iadd(b, base, dynamic_offset_start);
214       } else {
215          base = nir_imm_int(b, (offset +
216             binding_layout->dynamic_offset_offset) / (4 * A6XX_TEX_CONST_DWORDS));
217       }
218       assert(dev->physical_device->reserved_set_idx >= 0);
219       set = dev->physical_device->reserved_set_idx;
220    } else
221       base = nir_imm_int(b, binding_layout->offset / (4 * A6XX_TEX_CONST_DWORDS));
222 
223    unsigned stride = binding_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
224    assert(util_is_power_of_two_nonzero(stride));
225    nir_def *shift = nir_imm_int(b, util_logbase2(stride));
226 
227    nir_def *def = nir_vec3(b, nir_imm_int(b, set),
228                                nir_iadd(b, base,
229                                         nir_ishl(b, vulkan_idx, shift)),
230                                shift);
231 
232    nir_def_replace(&instr->def, def);
233 }
234 
235 static void
236 lower_vulkan_resource_reindex(nir_builder *b, nir_intrinsic_instr *instr)
237 {
238    nir_def *old_index = instr->src[0].ssa;
239    nir_def *delta = instr->src[1].ssa;
240    nir_def *shift = nir_channel(b, old_index, 2);
241 
242    nir_def *new_index =
243       nir_vec3(b, nir_channel(b, old_index, 0),
244                nir_iadd(b, nir_channel(b, old_index, 1),
245                         nir_ishl(b, delta, shift)),
246                shift);
247 
248    nir_def_replace(&instr->def, new_index);
249 }
250 
251 static void
252 lower_load_vulkan_descriptor(nir_builder *b, nir_intrinsic_instr *intrin)
253 {
254    nir_def *old_index = intrin->src[0].ssa;
255    /* Loading the descriptor happens as part of the load/store instruction so
256     * this is a no-op. We just need to turn the shift into an offset of 0.
257     */
258    nir_def *new_index =
259       nir_vec3(b, nir_channel(b, old_index, 0),
260                nir_channel(b, old_index, 1),
261                nir_imm_int(b, 0));
262    nir_def_replace(&intrin->def, new_index);
263 }
264 
265 static bool
266 lower_ssbo_ubo_intrinsic(struct tu_device *dev,
267                          nir_builder *b, nir_intrinsic_instr *intrin)
268 {
269    const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic];
270 
271    /* The bindless base is part of the instruction, which means that part of
272     * the "pointer" has to be constant. We solve this in the same way the blob
273     * does, by generating a bunch of if-statements. In the usual case where
274     * the descriptor set is constant, we can skip that, though.
275     */
276 
277    unsigned buffer_src;
278    if (intrin->intrinsic == nir_intrinsic_store_ssbo) {
279       /* This has the value first */
280       buffer_src = 1;
281    } else {
282       buffer_src = 0;
283    }
284 
285    /* Don't lower non-bindless UBO loads of driver params */
286    if (intrin->src[buffer_src].ssa->num_components == 1)
287       return false;
288 
289    nir_scalar scalar_idx = nir_scalar_resolved(intrin->src[buffer_src].ssa, 0);
290    nir_def *descriptor_idx = nir_channel(b, intrin->src[buffer_src].ssa, 1);
291 
292    if (intrin->intrinsic == nir_intrinsic_load_ubo &&
293        dev->instance->allow_oob_indirect_ubo_loads) {
294       nir_scalar offset = nir_scalar_resolved(intrin->src[1].ssa, 0);
295       if (!nir_scalar_is_const(offset)) {
296          nir_intrinsic_set_range(intrin, ~0);
297       }
298    }
299 
300    /* Descriptor index has to be adjusted in the following cases:
301     *  - isam loads, when the 16-bit descriptor cannot also be used for 32-bit
302     *    loads -- next-index descriptor will be able to do that;
303     *  - 8-bit SSBO loads and stores -- next-index descriptor is dedicated to
304     *    storage accesses of that size.
305     */
306    if ((dev->physical_device->info->a6xx.storage_16bit &&
307         !dev->physical_device->info->a6xx.has_isam_v &&
308         intrin->intrinsic == nir_intrinsic_load_ssbo &&
309         (nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
310         intrin->def.bit_size > 16) ||
311        (dev->physical_device->info->a7xx.storage_8bit &&
312         ((intrin->intrinsic == nir_intrinsic_load_ssbo && intrin->def.bit_size == 8) ||
313          (intrin->intrinsic == nir_intrinsic_store_ssbo && intrin->src[0].ssa->bit_size == 8)))) {
314       descriptor_idx = nir_iadd_imm(b, descriptor_idx, 1);
315    }
316 
317    nir_def *results[MAX_SETS] = { NULL };
318 
319    if (nir_scalar_is_const(scalar_idx)) {
320       nir_def *bindless =
321          nir_bindless_resource_ir3(b, 32, descriptor_idx, .desc_set = nir_scalar_as_uint(scalar_idx));
322       nir_src_rewrite(&intrin->src[buffer_src], bindless);
323       return true;
324    }
325 
326    nir_def *base_idx = nir_channel(b, scalar_idx.def, scalar_idx.comp);
327    for (unsigned i = 0; i < dev->physical_device->info->a6xx.max_sets; i++) {
328       /* if (base_idx == i) { ... */
329       nir_if *nif = nir_push_if(b, nir_ieq_imm(b, base_idx, i));
330 
331       nir_def *bindless =
332          nir_bindless_resource_ir3(b, 32, descriptor_idx, .desc_set = i);
333 
334       nir_intrinsic_instr *copy =
335          nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
336 
337       copy->num_components = intrin->num_components;
338 
339       for (unsigned src = 0; src < info->num_srcs; src++) {
340          if (src == buffer_src)
341             copy->src[src] = nir_src_for_ssa(bindless);
342          else
343             copy->src[src] = nir_src_for_ssa(intrin->src[src].ssa);
344       }
345 
346       for (unsigned idx = 0; idx < info->num_indices; idx++) {
347          copy->const_index[idx] = intrin->const_index[idx];
348       }
349 
350       if (info->has_dest) {
351          nir_def_init(&copy->instr, &copy->def,
352                       intrin->def.num_components,
353                       intrin->def.bit_size);
354          results[i] = &copy->def;
355       }
356 
357       nir_builder_instr_insert(b, &copy->instr);
358 
359       /* } else { ... */
360       nir_push_else(b, nif);
361    }
362 
363    nir_def *result =
364       nir_undef(b, intrin->def.num_components, intrin->def.bit_size);
365    for (int i = dev->physical_device->info->a6xx.max_sets - 1; i >= 0; i--) {
366       nir_pop_if(b, NULL);
367       if (info->has_dest)
368          result = nir_if_phi(b, results[i], result);
369    }
370 
371    if (info->has_dest)
372       nir_def_rewrite_uses(&intrin->def, result);
373    nir_instr_remove(&intrin->instr);
374    return true;
375 }
376 
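/* Build the bindless handle for an image or sampler deref. Input attachments
 * normally take the non-bindless path and return a plain texture index
 * instead; read-only input attachments in dynamic renderpasses go through the
 * bindless path like other descriptors.
 */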
377 static nir_def *
378 build_bindless(struct tu_device *dev, nir_builder *b,
379                nir_deref_instr *deref, bool is_sampler,
380                struct tu_shader *shader,
381                const struct tu_pipeline_layout *layout,
382                uint32_t read_only_input_attachments,
383                bool dynamic_renderpass)
384 {
385    nir_variable *var = nir_deref_instr_get_variable(deref);
386 
387    unsigned set = var->data.descriptor_set;
388    unsigned binding = var->data.binding;
389    const struct tu_descriptor_set_binding_layout *bind_layout =
390       &layout->set[set].layout->binding[binding];
391 
392    /* Input attachments use the non-bindless workaround */
393    if (bind_layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT &&
394        (!dynamic_renderpass ||
395         (var->data.index == NIR_VARIABLE_NO_INDEX ?
396         !(read_only_input_attachments & 0x1) :
397         !(read_only_input_attachments & (1u << (var->data.index + 1))))) &&
398        !TU_DEBUG(DYNAMIC)) {
399       const struct glsl_type *glsl_type = glsl_without_array(var->type);
400       uint32_t idx;
401 
402       /* With dynamic renderpasses, we reserve the first two attachments for
403        * input attachments without an InputAttachmentIndex, which must be for
404        * depth/stencil if they are not read-only, and shift over the rest of
405        * the indices.
406        */
407       if (var->data.index == ~0u) {
408          assert(dynamic_renderpass);
409          idx = 0;
410       } else if (dynamic_renderpass) {
411          idx = (var->data.index + 1) * 2;
412       } else {
413          idx = var->data.index * 2;
414       }
415 
416       /* Record which input attachments are used for tracking feedback loops */
417       if (dynamic_renderpass)
418          shader->fs.dynamic_input_attachments_used |= (1u << (idx / 2));
419 
420       BITSET_SET_RANGE_INSIDE_WORD(b->shader->info.textures_used, idx, (idx + bind_layout->array_size * 2) - 1);
421 
422       /* D24S8 workaround: stencil of D24S8 will be sampled as uint */
423       if (glsl_get_sampler_result_type(glsl_type) == GLSL_TYPE_UINT)
424          idx += 1;
425 
426       if (deref->deref_type == nir_deref_type_var)
427          return nir_imm_int(b, idx);
428 
429       nir_def *arr_index = deref->arr.index.ssa;
430       return nir_iadd_imm(b, nir_imul_imm(b, arr_index, 2), idx);
431    }
432 
433    shader->active_desc_sets |= 1u << set;
434 
435    nir_def *desc_offset;
436    unsigned descriptor_stride;
437    unsigned offset = 0;
438    /* Samplers come second in combined image/sampler descriptors, see
439     * write_combined_image_sampler_descriptor().
440     */
441    if (is_sampler && bind_layout->type ==
442          VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
443       offset = 1;
444    }
445    desc_offset =
446       nir_imm_int(b, (bind_layout->offset / (4 * A6XX_TEX_CONST_DWORDS)) +
447                   offset);
448    descriptor_stride = bind_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
449 
450    if (deref->deref_type != nir_deref_type_var) {
451       assert(deref->deref_type == nir_deref_type_array);
452 
453       nir_def *arr_index = deref->arr.index.ssa;
454       desc_offset = nir_iadd(b, desc_offset,
455                              nir_imul_imm(b, arr_index, descriptor_stride));
456    }
457 
458    return nir_bindless_resource_ir3(b, 32, desc_offset, .desc_set = set);
459 }
460 
461 static void
462 lower_image_deref(struct tu_device *dev, nir_builder *b,
463                   nir_intrinsic_instr *instr, struct tu_shader *shader,
464                   const struct tu_pipeline_layout *layout)
465 {
466    nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
467    nir_def *bindless = build_bindless(dev, b, deref, false, shader, layout, 0, false);
468    nir_rewrite_image_intrinsic(instr, bindless, true);
469 }
470 
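/* Dispatch per-intrinsic lowering: push constants, descriptor indexing,
 * UBO/SSBO access, image derefs, and FDM-related frag size/offset loads.
 */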
471 static bool
472 lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
473                 struct tu_device *dev,
474                 struct tu_shader *shader,
475                 const struct tu_pipeline_layout *layout,
476                 struct ir3_const_allocations *const_allocs)
477 {
478    switch (instr->intrinsic) {
479    case nir_intrinsic_load_push_constant:
480       lower_load_push_constant(
481          dev, b, instr, shader, layout,
482          const_allocs->consts[IR3_CONST_ALLOC_PUSH_CONSTS].offset_vec4);
483       return true;
484 
485    case nir_intrinsic_load_vulkan_descriptor:
486       lower_load_vulkan_descriptor(b, instr);
487       return true;
488 
489    case nir_intrinsic_vulkan_resource_index:
490       lower_vulkan_resource_index(dev, b, instr, shader, layout);
491       return true;
492    case nir_intrinsic_vulkan_resource_reindex:
493       lower_vulkan_resource_reindex(b, instr);
494       return true;
495 
496    case nir_intrinsic_load_ubo:
497    case nir_intrinsic_load_ssbo:
498    case nir_intrinsic_store_ssbo:
499    case nir_intrinsic_ssbo_atomic:
500    case nir_intrinsic_ssbo_atomic_swap:
501    case nir_intrinsic_get_ssbo_size:
502       return lower_ssbo_ubo_intrinsic(dev, b, instr);
503 
504    case nir_intrinsic_image_deref_load:
505    case nir_intrinsic_image_deref_store:
506    case nir_intrinsic_image_deref_atomic:
507    case nir_intrinsic_image_deref_atomic_swap:
508    case nir_intrinsic_image_deref_size:
509    case nir_intrinsic_image_deref_samples:
510       lower_image_deref(dev, b, instr, shader, layout);
511       return true;
512 
513    case nir_intrinsic_load_frag_size_ir3:
514    case nir_intrinsic_load_frag_offset_ir3: {
515       if (!dev->compiler->load_shader_consts_via_preamble)
516          return false;
517 
518       unsigned param =
519          instr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
520          IR3_DP_FS(frag_size) : IR3_DP_FS(frag_offset);
521 
522       unsigned offset = param - IR3_DP_FS_DYNAMIC;
523 
524       nir_def *view = instr->src[0].ssa;
525       nir_def *result =
526          ir3_load_driver_ubo_indirect(b, 2, &shader->const_state.fdm_ubo,
527                                       offset, view, nir_intrinsic_range(instr));
528 
529       nir_def_replace(&instr->def, result);
530       return true;
531    }
532    case nir_intrinsic_load_frag_invocation_count: {
533       if (!dev->compiler->load_shader_consts_via_preamble)
534          return false;
535 
536       nir_def *result =
537          ir3_load_driver_ubo(b, 1, &shader->const_state.fdm_ubo,
538                              IR3_DP_FS(frag_invocation_count) -
539                              IR3_DP_FS_DYNAMIC);
540 
541       nir_def_replace(&instr->def, result);
542       return true;
543    }
544 
545    default:
546       return false;
547    }
548 }
549 
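/* If the sampled binding has an immutable YCbCr conversion, append a
 * YCbCr-to-RGB conversion of the texture result after the tex instruction.
 */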
550 static void
551 lower_tex_ycbcr(const struct tu_pipeline_layout *layout,
552                 nir_builder *builder,
553                 nir_tex_instr *tex)
554 {
555    int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
556    assert(deref_src_idx >= 0);
557    nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src);
558 
559    nir_variable *var = nir_deref_instr_get_variable(deref);
560    const struct tu_descriptor_set_layout *set_layout =
561       layout->set[var->data.descriptor_set].layout;
562    const struct tu_descriptor_set_binding_layout *binding =
563       &set_layout->binding[var->data.binding];
564    const struct vk_ycbcr_conversion_state *ycbcr_samplers =
565       tu_immutable_ycbcr_samplers(set_layout, binding);
566 
567    if (!ycbcr_samplers)
568       return;
569 
570    /* For the following instructions, we don't apply any change */
571    if (tex->op == nir_texop_txs ||
572        tex->op == nir_texop_query_levels ||
573        tex->op == nir_texop_lod)
574       return;
575 
576    assert(tex->texture_index == 0);
577    unsigned array_index = 0;
578    if (deref->deref_type != nir_deref_type_var) {
579       assert(deref->deref_type == nir_deref_type_array);
580       if (!nir_src_is_const(deref->arr.index))
581          return;
582       array_index = nir_src_as_uint(deref->arr.index);
583       array_index = MIN2(array_index, binding->array_size - 1);
584    }
585    const struct vk_ycbcr_conversion_state *ycbcr_sampler = ycbcr_samplers + array_index;
586 
587    if (ycbcr_sampler->ycbcr_model == VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY)
588       return;
589 
590    /* Skip if not actually a YCbCr format.  CtsGraphics, for example, tries to create
591     * YcbcrConversions for RGB formats.
592     */
593    if (!vk_format_get_ycbcr_info(ycbcr_sampler->format))
594       return;
595 
596    builder->cursor = nir_after_instr(&tex->instr);
597 
598    uint8_t bits = vk_format_get_bpc(ycbcr_sampler->format);
599    uint32_t bpcs[3] = {bits, bits, bits}; /* We only support uniform formats */
600    nir_def *result = nir_convert_ycbcr_to_rgb(builder,
601                                               ycbcr_sampler->ycbcr_model,
602                                               ycbcr_sampler->ycbcr_range,
603                                               &tex->def,
604                                               bpcs);
605    nir_def_rewrite_uses_after(&tex->def, result,
606                               result->parent_instr);
607 
608    builder->cursor = nir_before_instr(&tex->instr);
609 }
610 
611 static bool
612 lower_tex(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
613           struct tu_shader *shader, const struct tu_pipeline_layout *layout,
614           uint32_t read_only_input_attachments, bool dynamic_renderpass)
615 {
616    lower_tex_ycbcr(layout, b, tex);
617 
618    int sampler_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
619    if (sampler_src_idx >= 0) {
620       nir_deref_instr *deref = nir_src_as_deref(tex->src[sampler_src_idx].src);
621       nir_def *bindless = build_bindless(dev, b, deref, true, shader, layout,
622                                          read_only_input_attachments,
623                                          dynamic_renderpass);
624       nir_src_rewrite(&tex->src[sampler_src_idx].src, bindless);
625       tex->src[sampler_src_idx].src_type = nir_tex_src_sampler_handle;
626    }
627 
628    int tex_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
629    if (tex_src_idx >= 0) {
630       nir_deref_instr *deref = nir_src_as_deref(tex->src[tex_src_idx].src);
631       nir_def *bindless = build_bindless(dev, b, deref, false, shader, layout,
632                                          read_only_input_attachments,
633                                          dynamic_renderpass);
634       nir_src_rewrite(&tex->src[tex_src_idx].src, bindless);
635       tex->src[tex_src_idx].src_type = nir_tex_src_texture_handle;
636 
637       /* for the input attachment case: */
638       if (bindless->parent_instr->type != nir_instr_type_intrinsic)
639          tex->src[tex_src_idx].src_type = nir_tex_src_texture_offset;
640    }
641 
642    return true;
643 }
644 
645 struct lower_instr_params {
646    struct tu_device *dev;
647    struct tu_shader *shader;
648    const struct tu_pipeline_layout *layout;
649    uint32_t read_only_input_attachments;
650    bool dynamic_renderpass;
651    struct ir3_const_allocations *const_allocs;
652 };
653 
654 static bool
655 lower_instr(nir_builder *b, nir_instr *instr, void *cb_data)
656 {
657    struct lower_instr_params *params = (struct lower_instr_params *) cb_data;
658    b->cursor = nir_before_instr(instr);
659    switch (instr->type) {
660    case nir_instr_type_tex:
661       return lower_tex(b, nir_instr_as_tex(instr), params->dev, params->shader, params->layout,
662                        params->read_only_input_attachments,
663                        params->dynamic_renderpass);
664    case nir_instr_type_intrinsic:
665       return lower_intrinsic(b, nir_instr_as_intrinsic(instr), params->dev,
666                              params->shader, params->layout,
667                              params->const_allocs);
668    default:
669       return false;
670    }
671 }
672 
673 /* Since we always push inline uniforms into constant memory, lower loads of
674  * them to load_uniform which turns into constant memory loads.
675  */
676 static bool
677 lower_inline_ubo(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data)
678 {
679    if (intrin->intrinsic != nir_intrinsic_load_ubo)
680       return false;
681 
682    struct lower_instr_params *params = (struct lower_instr_params *) cb_data;
683    struct tu_shader *shader = params->shader;
684    const struct tu_pipeline_layout *layout = params->layout;
685 
686    nir_binding binding = nir_chase_binding(intrin->src[0]);
687 
688    if (!binding.success)
689       return false;
690 
691    struct tu_descriptor_set_layout *set_layout = layout->set[binding.desc_set].layout;
692    struct tu_descriptor_set_binding_layout *binding_layout =
693       &set_layout->binding[binding.binding];
694 
695    if (binding_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
696       return false;
697 
698    /* lookup the const offset of the inline UBO */
699    struct tu_const_state *const_state = &shader->const_state;
700 
701    unsigned base = UINT_MAX;
702    unsigned range;
703    bool use_load = false;
704    bool use_ldg_k =
705       params->dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
706 
707    for (unsigned i = 0; i < const_state->num_inline_ubos; i++) {
708       if (const_state->ubos[i].base == binding.desc_set &&
709           const_state->ubos[i].offset == binding_layout->offset) {
710          range = const_state->ubos[i].size_vec4 * 4;
711          if (use_ldg_k) {
712             base = i * 2;
713          } else {
714             use_load = const_state->ubos[i].push_address;
715             base = const_state->ubos[i].const_offset_vec4 * 4;
716          }
717          break;
718       }
719    }
720 
721    if (base == UINT_MAX) {
722       /* Assume we're loading out-of-bounds from a 0-sized inline uniform
723        * filtered out below.
724        */
725       nir_def_rewrite_uses(&intrin->def,
726                                nir_undef(b, intrin->num_components,
727                                              intrin->def.bit_size));
728       return true;
729    }
730 
731    nir_def *offset = intrin->src[1].ssa;
732 
733    b->cursor = nir_before_instr(&intrin->instr);
734    nir_def *val;
735 
736    if (use_load || use_ldg_k) {
737       nir_def *base_addr;
738       if (use_ldg_k) {
739          base_addr = ir3_load_driver_ubo(b, 2,
740                                          &params->shader->const_state.inline_uniforms_ubo,
741                                          base);
742       } else {
743          base_addr =
744             nir_load_const_ir3(b, 2, 32, nir_imm_int(b, 0), .base = base);
745       }
746       val = nir_load_global_ir3(b, intrin->num_components,
747                                 intrin->def.bit_size,
748                                 base_addr, nir_ishr_imm(b, offset, 2),
749                                 .access =
750                                  (enum gl_access_qualifier)(
751                                     (enum gl_access_qualifier)(ACCESS_NON_WRITEABLE | ACCESS_CAN_REORDER) |
752                                     ACCESS_CAN_SPECULATE),
753                                 .align_mul = 16,
754                                 .align_offset = 0,
755                                 .range_base = 0,
756                                 .range = range);
757    } else {
758       val =
759          nir_load_const_ir3(b, intrin->num_components, intrin->def.bit_size,
760                             nir_ishr_imm(b, offset, 2), .base = base);
761    }
762 
763    nir_def_replace(&intrin->def, val);
764    return true;
765 }
766 
767 /* Figure out the range of push constants that we're actually going to push to
768  * the shader, and tell the backend to reserve this range when pushing UBO
769  * constants.
770  */
771 
772 static void
773 gather_push_constants(nir_shader *shader, struct tu_shader *tu_shader)
774 {
775    uint32_t min = UINT32_MAX, max = 0;
776    nir_foreach_function_impl(impl, shader) {
777       nir_foreach_block(block, impl) {
778          nir_foreach_instr_safe(instr, block) {
779             if (instr->type != nir_instr_type_intrinsic)
780                continue;
781 
782             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
783             if (intrin->intrinsic != nir_intrinsic_load_push_constant)
784                continue;
785 
786             uint32_t base = nir_intrinsic_base(intrin);
787             uint32_t range = nir_intrinsic_range(intrin);
788             min = MIN2(min, base);
789             max = MAX2(max, base + range);
790             break;
791          }
792       }
793    }
794 
795    if (min >= max) {
796       tu_shader->const_state.push_consts = (struct tu_push_constant_range) {};
797       return;
798    }
799 
800    /* CP_LOAD_STATE OFFSET and NUM_UNIT for SHARED_CONSTS are in units of
801     * dwords while loading regular consts is in units of vec4's.
802     * So we unify on dwords for tu_push_constant_range here, and then the
803     * correct unit must be used when emitting.
804     *
805     * Note there's an alignment requirement of 16 dwords on OFFSET. Expand
806     * the range and change units accordingly.
807     */
808    tu_shader->const_state.push_consts.lo_dwords += (min / 4) / 4 * 4;
809    tu_shader->const_state.push_consts.dwords =
810       align(max, 16) / 4 - tu_shader->const_state.push_consts.lo_dwords;
811 }
812 
813 static bool
814 shader_uses_push_consts(nir_shader *shader)
815 {
816    nir_foreach_function_impl (impl, shader) {
817       nir_foreach_block (block, impl) {
818          nir_foreach_instr_safe (instr, block) {
819             if (instr->type != nir_instr_type_intrinsic)
820                continue;
821 
822             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
823             if (intrin->intrinsic == nir_intrinsic_load_push_constant)
824                return true;
825          }
826       }
827    }
828    return false;
829 }
830 
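/* Lower all Vulkan resource access in the shader (descriptors, push
 * constants, inline uniform blocks) to ir3 bindless/const-file accesses, and
 * reserve the const allocations they need.
 */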
831 static bool
832 tu_lower_io(nir_shader *shader, struct tu_device *dev,
833             struct tu_shader *tu_shader,
834             const struct tu_pipeline_layout *layout,
835             uint32_t read_only_input_attachments,
836             bool dynamic_renderpass,
837             struct ir3_const_allocations *const_allocs)
838 {
839    /* Allocate driver params as early as possible as a workaround for the
840     * following case:
841     * - CP_DRAW_INDIRECT_MULTI_1_DST_OFF apparently tries to upload consts
842     *   even when there are 0 instances.
843     * - With zero instances, the draw state for VS constlen is not applied.
844     * - constlen therefore uses a stale value, and if
845     *   CP_DRAW_INDIRECT_MULTI_1_DST_OFF is higher than 0x3f, the GPU hangs.
846     *
847     * To not rely on undefined behaviour, we will always allocate enough space
848     * to upload driver params.
849     */
850    if (shader->info.stage == MESA_SHADER_VERTEX) {
851       uint32_t num_driver_params =
852          ir3_nir_scan_driver_consts(dev->compiler, shader, nullptr);
853       ir3_alloc_driver_params(const_allocs, &num_driver_params, dev->compiler,
854                               shader->info.stage);
855    }
856 
857    struct tu_const_state *const_state = &tu_shader->const_state;
858    const_state->push_consts = (struct tu_push_constant_range) {
859       .lo_dwords = 0,
860       .dwords = layout->push_constant_size / 4,
861       .type = tu_push_consts_type(layout, dev->compiler),
862    };
863 
864    if (const_state->push_consts.type == IR3_PUSH_CONSTS_PER_STAGE) {
865       gather_push_constants(shader, tu_shader);
866    } else if (const_state->push_consts.type ==
867             IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
868       /* Disable pushing constants for this stage if none were loaded in the
869        * shader.  If all stages don't load their declared push constants, as
870        * is often the case under zink, then we could additionally skip
871        * emitting REG_A7XX_HLSQ_SHARED_CONSTS_IMM entirely.
872        */
873       if (!shader_uses_push_consts(shader))
874          const_state->push_consts = (struct tu_push_constant_range) {};
875    }
876 
877    if (const_state->push_consts.type != IR3_PUSH_CONSTS_SHARED) {
878       uint32_t offset_align_vec4 = 1;
879       if (const_state->push_consts.type == IR3_PUSH_CONSTS_PER_STAGE)
880          offset_align_vec4 = dev->compiler->const_upload_unit;
881 
882       unsigned push_consts_vec4 =
883          align(DIV_ROUND_UP(const_state->push_consts.dwords, 4),
884                dev->compiler->const_upload_unit);
885 
886       ir3_const_alloc(const_allocs, IR3_CONST_ALLOC_PUSH_CONSTS,
887                       push_consts_vec4, offset_align_vec4);
888    }
889 
890    bool unknown_dynamic_size = false;
891    bool unknown_dynamic_offset = false;
892    for (unsigned i = 0; i < layout->num_sets; i++) {
893       if (tu_shader->dynamic_descriptor_sizes[i] == -1) {
894          unknown_dynamic_size = true;
895       } else if (unknown_dynamic_size &&
896                  tu_shader->dynamic_descriptor_sizes[i] > 0) {
897          /* If there is an unknown size followed by a known size, then we may
898           * need to dynamically determine the offset when linking.
899           */
900          unknown_dynamic_offset = true;
901       }
902    }
903 
904    if (unknown_dynamic_offset) {
905       const_state->dynamic_offset_loc =
906          const_allocs->max_const_offset_vec4 * 4;
907       assert(dev->physical_device->reserved_set_idx >= 0);
908       ir3_const_alloc(
909          const_allocs, IR3_CONST_ALLOC_DYN_DESCRIPTOR_OFFSET,
910          DIV_ROUND_UP(dev->physical_device->reserved_set_idx, 4), 1);
911    } else {
912       const_state->dynamic_offset_loc = UINT32_MAX;
913    }
914 
915    /* Reserve space for inline uniforms, so we can always load them from
916     * constants and not setup a UBO descriptor for them.
917     */
918    size_t ldgk_consts = 0;
919    bool use_ldg_k =
920       dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
921    for (unsigned set = 0; set < layout->num_sets; set++) {
922       const struct tu_descriptor_set_layout *desc_layout =
923          layout->set[set].layout;
924 
925       if (!desc_layout || !desc_layout->has_inline_uniforms)
926          continue;
927 
928       for (unsigned b = 0; b < desc_layout->binding_count; b++) {
929          const struct tu_descriptor_set_binding_layout *binding =
930             &desc_layout->binding[b];
931 
932          if (binding->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
933             continue;
934          if (!(binding->shader_stages &
935                mesa_to_vk_shader_stage(shader->info.stage)))
936             continue;
937 
938          /* If we don't know the size at compile time due to a variable
939           * descriptor count, then with descriptor buffers we cannot know
940           * how much space the real inline uniform has. In this case we fall
941           * back to pushing the address and using ldg, which is slower than
942           * setting up a descriptor. Setting up our own descriptor with
943           * descriptor_buffer is also painful: it has to be done on the GPU,
944           * and it doesn't avoid the UBO getting pushed anyway, faulting if an
945           * out-of-bounds access is hidden behind an if and not dynamically
946           * executed. Given the small max size, there shouldn't be much reason
947           * to use variable size anyway.
948           */
949          bool push_address = !use_ldg_k && desc_layout->has_variable_descriptors &&
950             b == desc_layout->binding_count - 1;
951 
952          if (push_address) {
953             perf_debug(dev,
954                        "falling back to ldg for variable-sized inline "
955                        "uniform block");
956          }
957 
958          assert(const_state->num_inline_ubos < ARRAY_SIZE(const_state->ubos));
959          unsigned size_vec4 = push_address ? 1 : DIV_ROUND_UP(binding->size, 16);
960          const_state->ubos[const_state->num_inline_ubos++] =
961             (struct tu_inline_ubo) {
962                .base = set,
963                .offset = binding->offset,
964                .push_address = push_address,
965                .const_offset_vec4 =
966                   const_allocs->max_const_offset_vec4 + ldgk_consts,
967                .size_vec4 = size_vec4,
968             };
969 
970          if (!use_ldg_k) {
971             ldgk_consts += align(size_vec4, dev->compiler->const_upload_unit);
972          }
973       }
974    }
975 
976    ir3_const_alloc(const_allocs, IR3_CONST_ALLOC_INLINE_UNIFORM_ADDRS, ldgk_consts, 1);
977 
978    struct lower_instr_params params = {
979       .dev = dev,
980       .shader = tu_shader,
981       .layout = layout,
982       .read_only_input_attachments = read_only_input_attachments,
983       .dynamic_renderpass = dynamic_renderpass,
984       .const_allocs = const_allocs,
985    };
986 
987    bool progress = false;
988    if (const_state->num_inline_ubos) {
989       progress |= nir_shader_intrinsics_pass(shader, lower_inline_ubo,
990                                                nir_metadata_none,
991                                                &params);
992    }
993 
994    progress |= nir_shader_instructions_pass(shader,
995                                             lower_instr,
996                                             nir_metadata_none,
997                                             &params);
998 
999    /* Remove now-unused variables so that when we gather the shader info later
1000     * they won't be counted.
1001     */
1002 
1003    if (progress)
1004       nir_opt_dce(shader);
1005 
1006    progress |=
1007       nir_remove_dead_variables(shader,
1008                                 nir_var_uniform | nir_var_mem_ubo | nir_var_mem_ssbo,
1009                                 NULL);
1010 
1011    return progress;
1012 }
1013 
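/* Lowering for fragment density map (FDM) scaling: load_frag_size (and
 * optionally load_frag_coord) are rewritten to per-view values supplied via
 * driver params, so the shader sees the scaled fragment area.
 */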
1014 struct lower_fdm_options {
1015    unsigned num_views;
1016    bool adjust_fragcoord;
1017    bool multiview;
1018 };
1019 
1020 static bool
1021 lower_fdm_filter(const nir_instr *instr, const void *data)
1022 {
1023    const struct lower_fdm_options *options =
1024       (const struct lower_fdm_options *)data;
1025 
1026    if (instr->type != nir_instr_type_intrinsic)
1027       return false;
1028 
1029    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1030    return intrin->intrinsic == nir_intrinsic_load_frag_size ||
1031       (intrin->intrinsic == nir_intrinsic_load_frag_coord &&
1032        options->adjust_fragcoord);
1033 }
1034 
1035 static nir_def *
1036 lower_fdm_instr(struct nir_builder *b, nir_instr *instr, void *data)
1037 {
1038    const struct lower_fdm_options *options =
1039       (const struct lower_fdm_options *)data;
1040 
1041    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1042 
1043    nir_def *view;
1044    if (options->multiview) {
1045       nir_variable *view_var =
1046          nir_find_variable_with_location(b->shader, nir_var_shader_in,
1047                                          VARYING_SLOT_VIEW_INDEX);
1048 
1049       if (view_var == NULL) {
1050          view_var = nir_variable_create(b->shader, nir_var_shader_in,
1051                                         glsl_int_type(), NULL);
1052          view_var->data.location = VARYING_SLOT_VIEW_INDEX;
1053          view_var->data.interpolation = INTERP_MODE_FLAT;
1054          view_var->data.driver_location = b->shader->num_inputs++;
1055       }
1056 
1057       view = nir_load_var(b, view_var);
1058    } else {
1059       view = nir_imm_int(b, 0);
1060    }
1061 
1062    nir_def *frag_size =
1063       nir_load_frag_size_ir3(b, view, .range = options->num_views);
1064 
1065    if (intrin->intrinsic == nir_intrinsic_load_frag_coord) {
1066       nir_def *frag_offset =
1067          nir_load_frag_offset_ir3(b, view, .range = options->num_views);
1068       nir_def *unscaled_coord = nir_load_frag_coord_unscaled_ir3(b);
1069       nir_def *xy = nir_trim_vector(b, unscaled_coord, 2);
1070       xy = nir_fmul(b, nir_fsub(b, xy, frag_offset), nir_i2f32(b, frag_size));
1071       return nir_vec4(b,
1072                       nir_channel(b, xy, 0),
1073                       nir_channel(b, xy, 1),
1074                       nir_channel(b, unscaled_coord, 2),
1075                       nir_channel(b, unscaled_coord, 3));
1076    }
1077 
1078    assert(intrin->intrinsic == nir_intrinsic_load_frag_size);
1079    return frag_size;
1080 }
1081 
1082 static bool
1083 tu_nir_lower_fdm(nir_shader *shader, const struct lower_fdm_options *options)
1084 {
1085    return nir_shader_lower_instructions(shader, lower_fdm_filter,
1086                                         lower_fdm_instr, (void *)options);
1087 }
1088 
1089 static void
1090 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
1091 {
1092    assert(glsl_type_is_vector_or_scalar(type));
1093 
1094    unsigned comp_size =
1095       glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
1096    unsigned length = glsl_get_vector_elements(type);
1097    *size = comp_size * length;
1098    *align = comp_size;
1099 }
1100 
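/* Convert NIR transform feedback info into ir3_stream_output_info, remapping
 * varying slots to driver locations.
 */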
1101 static void
1102 tu_gather_xfb_info(nir_shader *nir, struct ir3_stream_output_info *info)
1103 {
1104    nir_shader_gather_xfb_info(nir);
1105 
1106    if (!nir->xfb_info)
1107       return;
1108 
1109    nir_xfb_info *xfb = nir->xfb_info;
1110 
1111    uint8_t output_map[VARYING_SLOT_TESS_MAX];
1112    memset(output_map, 0, sizeof(output_map));
1113 
1114    nir_foreach_shader_out_variable(var, nir) {
1115       unsigned slots = nir_variable_count_slots(var, var->type);
1116       for (unsigned i = 0; i < slots; i++)
1117          output_map[var->data.location + i] = var->data.driver_location + i;
1118    }
1119 
1120    assert(xfb->output_count <= IR3_MAX_SO_OUTPUTS);
1121    info->num_outputs = xfb->output_count;
1122 
1123    for (int i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
1124       info->stride[i] = xfb->buffers[i].stride / 4;
1125       info->buffer_to_stream[i] = xfb->buffer_to_stream[i];
1126    }
1127 
1128    info->streams_written = xfb->streams_written;
1129 
1130    for (int i = 0; i < xfb->output_count; i++) {
1131       info->output[i].register_index = output_map[xfb->outputs[i].location];
1132       info->output[i].start_component = xfb->outputs[i].component_offset;
1133       info->output[i].num_components =
1134                            util_bitcount(xfb->outputs[i].component_mask);
1135       info->output[i].output_buffer  = xfb->outputs[i].buffer;
1136       info->output[i].dst_offset = xfb->outputs[i].offset / 4;
1137       info->output[i].stream = xfb->buffer_to_stream[xfb->outputs[i].buffer];
1138    }
1139 }
1140 
1141 static uint32_t
1142 tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs)
1143 {
1144    const struct ir3_const_state *const_state = ir3_const_state(xs);
1145    uint32_t base = const_state->allocs.max_const_offset_vec4;
1146    int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4);
1147 
1148    /* truncate size to avoid writing constants that the shader
1149     * does not use:
1150     */
1151    size = MIN2(size + base, xs->constlen) - base;
1152 
1153    return MAX2(size, 0) * 4;
1154 }
1155 
1156 /* We allocate fixed-length substreams for shader state, however some
1157  * parts of the state may have unbounded length. Their additional space
1158  * requirements should be calculated here.
1159  */
1160 static uint32_t
1161 tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs)
1162 {
1163    const struct ir3_const_state *const_state = ir3_const_state(xs);
1164 
1165    uint32_t size = tu_xs_get_immediates_packet_size_dwords(xs);
1166 
1167    /* Variable number of UBO upload ranges. */
1168    size += 4 * const_state->ubo_state.num_enabled;
1169 
1170    /* Variable number of dwords for the primitive map */
1171    size += xs->input_size;
1172 
1173    size += xs->constant_data_size / 4;
1174 
1175    return size;
1176 }
1177 
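/* Per-stage register offsets for the shader state emitted below; the SP_xS_*
 * registers share a common layout across stages.
 */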
1178 static const struct xs_config {
1179    uint16_t reg_sp_xs_config;
1180    uint16_t reg_sp_xs_instrlen;
1181    uint16_t reg_sp_xs_first_exec_offset;
1182    uint16_t reg_sp_xs_pvt_mem_hw_stack_offset;
1183    uint16_t reg_sp_xs_vgpr_config;
1184 } xs_config[] = {
1185    [MESA_SHADER_VERTEX] = {
1186       REG_A6XX_SP_VS_CONFIG,
1187       REG_A6XX_SP_VS_INSTRLEN,
1188       REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET,
1189       REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET,
1190       REG_A7XX_SP_VS_VGPR_CONFIG,
1191    },
1192    [MESA_SHADER_TESS_CTRL] = {
1193       REG_A6XX_SP_HS_CONFIG,
1194       REG_A6XX_SP_HS_INSTRLEN,
1195       REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET,
1196       REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET,
1197       REG_A7XX_SP_HS_VGPR_CONFIG,
1198    },
1199    [MESA_SHADER_TESS_EVAL] = {
1200       REG_A6XX_SP_DS_CONFIG,
1201       REG_A6XX_SP_DS_INSTRLEN,
1202       REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET,
1203       REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET,
1204       REG_A7XX_SP_DS_VGPR_CONFIG,
1205    },
1206    [MESA_SHADER_GEOMETRY] = {
1207       REG_A6XX_SP_GS_CONFIG,
1208       REG_A6XX_SP_GS_INSTRLEN,
1209       REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET,
1210       REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET,
1211       REG_A7XX_SP_GS_VGPR_CONFIG,
1212    },
1213    [MESA_SHADER_FRAGMENT] = {
1214       REG_A6XX_SP_FS_CONFIG,
1215       REG_A6XX_SP_FS_INSTRLEN,
1216       REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET,
1217       REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET,
1218       REG_A7XX_SP_FS_VGPR_CONFIG,
1219    },
1220    [MESA_SHADER_COMPUTE] = {
1221       REG_A6XX_SP_CS_CONFIG,
1222       REG_A6XX_SP_CS_INSTRLEN,
1223       REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET,
1224       REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET,
1225       REG_A7XX_SP_CS_VGPR_CONFIG,
1226    },
1227 };
1228 
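/* Emit the per-stage shader state: CTRL_REG0, instruction length, program
 * binary and private memory layout, the instruction preload on a6xx, plus
 * immediates and the inline constant-data UBO.
 */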
1229 void
1230 tu6_emit_xs(struct tu_cs *cs,
1231             gl_shader_stage stage, /* xs->type, but xs may be NULL */
1232             const struct ir3_shader_variant *xs,
1233             const struct tu_pvtmem_config *pvtmem,
1234             uint64_t binary_iova)
1235 {
1236    const struct xs_config *cfg = &xs_config[stage];
1237 
1238    if (!xs) {
1239       /* shader stage disabled */
1240       return;
1241    }
1242 
1243    enum a6xx_threadsize thrsz =
1244       xs->info.double_threadsize ? THREAD128 : THREAD64;
1245    switch (stage) {
1246    case MESA_SHADER_VERTEX:
1247       tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0(
1248                .halfregfootprint = xs->info.max_half_reg + 1,
1249                .fullregfootprint = xs->info.max_reg + 1,
1250                .branchstack = ir3_shader_branchstack_hw(xs),
1251                .mergedregs = xs->mergedregs,
1252                .earlypreamble = xs->early_preamble,
1253       ));
1254       break;
1255    case MESA_SHADER_TESS_CTRL:
1256       tu_cs_emit_regs(cs, A6XX_SP_HS_CTRL_REG0(
1257                .halfregfootprint = xs->info.max_half_reg + 1,
1258                .fullregfootprint = xs->info.max_reg + 1,
1259                .branchstack = ir3_shader_branchstack_hw(xs),
1260                .earlypreamble = xs->early_preamble,
1261       ));
1262       break;
1263    case MESA_SHADER_TESS_EVAL:
1264       tu_cs_emit_regs(cs, A6XX_SP_DS_CTRL_REG0(
1265                .halfregfootprint = xs->info.max_half_reg + 1,
1266                .fullregfootprint = xs->info.max_reg + 1,
1267                .branchstack = ir3_shader_branchstack_hw(xs),
1268                .earlypreamble = xs->early_preamble,
1269       ));
1270       break;
1271    case MESA_SHADER_GEOMETRY:
1272       tu_cs_emit_regs(cs, A6XX_SP_GS_CTRL_REG0(
1273                .halfregfootprint = xs->info.max_half_reg + 1,
1274                .fullregfootprint = xs->info.max_reg + 1,
1275                .branchstack = ir3_shader_branchstack_hw(xs),
1276                .earlypreamble = xs->early_preamble,
1277       ));
1278       break;
1279    case MESA_SHADER_FRAGMENT:
1280       tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0(
1281                .halfregfootprint = xs->info.max_half_reg + 1,
1282                .fullregfootprint = xs->info.max_reg + 1,
1283                .branchstack = ir3_shader_branchstack_hw(xs),
1284                .threadsize = thrsz,
1285                .varying = xs->total_in != 0,
1286                .lodpixmask = xs->need_full_quad,
1287                /* unknown bit, seems unnecessary */
1288                .unk24 = true,
1289                .pixlodenable = xs->need_pixlod,
1290                .earlypreamble = xs->early_preamble,
1291                .mergedregs = xs->mergedregs,
1292       ));
1293       break;
1294    case MESA_SHADER_COMPUTE:
1295       thrsz = cs->device->physical_device->info->a6xx
1296             .supports_double_threadsize ? thrsz : THREAD128;
1297       tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0(
1298                .halfregfootprint = xs->info.max_half_reg + 1,
1299                .fullregfootprint = xs->info.max_reg + 1,
1300                .branchstack = ir3_shader_branchstack_hw(xs),
1301                .threadsize = thrsz,
1302                .earlypreamble = xs->early_preamble,
1303                .mergedregs = xs->mergedregs,
1304       ));
1305       break;
1306    default:
1307       unreachable("bad shader stage");
1308    }
1309 
1310    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_instrlen, 1);
1311    tu_cs_emit(cs, xs->instrlen);
1312 
1313    /* Emit program binary & private memory layout.
1314     * binary_iova should be aligned to 1 instrlen unit (128 bytes).
1315     */
1316 
1317    assert((binary_iova & 0x7f) == 0);
1318    assert((pvtmem->iova & 0x1f) == 0);
1319 
1320    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_first_exec_offset, 7);
1321    tu_cs_emit(cs, 0);
1322    tu_cs_emit_qw(cs, binary_iova);
1323    tu_cs_emit(cs,
1324               A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(pvtmem->per_fiber_size));
1325    tu_cs_emit_qw(cs, pvtmem->iova);
1326    tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(pvtmem->per_sp_size) |
1327                   COND(pvtmem->per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT));
1328 
1329    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1);
1330    tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size));
1331 
1332    if (cs->device->physical_device->info->chip >= A7XX) {
1333       tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vgpr_config, 1);
1334       tu_cs_emit(cs, 0);
1335    }
1336 
1337    if (cs->device->physical_device->info->chip == A6XX) {
1338       uint32_t shader_preload_size =
1339          MIN2(xs->instrlen, cs->device->physical_device->info->a6xx.instr_cache_size);
1340 
1341       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
1342       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1343                      CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
1344                      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1345                      CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1346                      CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
1347       tu_cs_emit_qw(cs, binary_iova);
1348    }
1349 
1350    /* emit immediates */
1351 
1352    const struct ir3_const_state *const_state = ir3_const_state(xs);
1353    uint32_t base = const_state->allocs.max_const_offset_vec4;
1354    unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs);
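   /* immediate_size is in dwords; CP_LOAD_STATE6 counts NUM_UNIT in vec4s,
    * hence the division by 4 below.
    */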
1355 
1356    if (immediate_size > 0) {
1357       assert(!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble);
1358       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size);
1359       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
1360                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1361                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1362                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1363                  CP_LOAD_STATE6_0_NUM_UNIT(immediate_size / 4));
1364       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1365       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1366 
1367       tu_cs_emit_array(cs, const_state->immediates, immediate_size);
1368    }
1369 
1370    if (const_state->consts_ubo.idx != -1) {
1371       uint64_t iova = binary_iova + xs->info.constant_data_offset;
1372       uint32_t offset = const_state->consts_ubo.idx;
1373 
1374       /* Upload UBO state for the constant data. */
1375       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
1376       tu_cs_emit(cs,
1377                  CP_LOAD_STATE6_0_DST_OFF(offset) |
1378                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
1379                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1380                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1381                  CP_LOAD_STATE6_0_NUM_UNIT(1));
1382       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1383       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1384       int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16);
1385       tu_cs_emit_qw(cs,
1386                     iova |
1387                     (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32);
1388 
1389       /* Upload the constant data to the const file if needed. */
1390       const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;
1391 
1392       if (!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
1393          for (int i = 0; i < ubo_state->num_enabled; i++) {
1394             if (ubo_state->range[i].ubo.block != offset ||
1395                 ubo_state->range[i].ubo.bindless) {
1396                continue;
1397             }
1398 
1399             uint32_t start = ubo_state->range[i].start;
1400             uint32_t end = ubo_state->range[i].end;
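            /* Clamp the upload so it doesn't run past the variant's constlen
             * (constlen is in vec4 units, i.e. 16 bytes per unit).
             */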
1401             uint32_t size = MIN2(end - start,
1402                                  (16 * xs->constlen) - ubo_state->range[i].offset);
1403 
1404             tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
1405             tu_cs_emit(cs,
1406                      CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
1407                      CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1408                      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1409                      CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1410                      CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
1411             tu_cs_emit_qw(cs, iova + start);
1412          }
1413       }
1414    }
1415 
1416    /* emit statically-known FS driver param */
1417    if (stage == MESA_SHADER_FRAGMENT && const_state->driver_params_ubo.size > 0) {
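      /* The only value known statically at this point is the thread/subgroup
       * size: 128 with double threadsize, otherwise 64.
       */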
1418       uint32_t data[4] = {xs->info.double_threadsize ? 128 : 64, 0, 0, 0};
1419       uint32_t size = ARRAY_SIZE(data);
1420 
1421       /* A7XX TODO: Emit data via sub_cs instead of NOP */
1422       uint64_t iova = tu_cs_emit_data_nop(cs, data, size, 4);
1423       uint32_t base = const_state->driver_params_ubo.idx;
1424 
1425       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
1426       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
1427                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
1428                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1429                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1430                  CP_LOAD_STATE6_0_NUM_UNIT(1));
1431       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1432       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1433       int size_vec4s = DIV_ROUND_UP(size, 4);
1434       tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
1435    } else if (stage == MESA_SHADER_FRAGMENT && const_state->num_driver_params > 0) {
1436       uint32_t base =
1437          const_state->allocs.consts[IR3_CONST_ALLOC_DRIVER_PARAMS].offset_vec4;
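      /* size is in vec4 units; clamp it so base + size stays within the
       * variant's constlen, and skip the upload entirely if nothing fits.
       */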
1438       int32_t size = DIV_ROUND_UP(MAX2(const_state->num_driver_params, 4), 4);
1439       size = MAX2(MIN2(size + base, xs->constlen) - base, 0);
1440 
1441       if (size > 0) {
1442          tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + 4);
1443          tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
1444                     CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1445                     CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1446                     CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1447                     CP_LOAD_STATE6_0_NUM_UNIT(size));
1448          tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1449          tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1450 
1451          tu_cs_emit(cs, xs->info.double_threadsize ? 128 : 64);
1452          tu_cs_emit(cs, 0);
1453          tu_cs_emit(cs, 0);
1454          tu_cs_emit(cs, 0);
1455       }
1456    }
1457 }
1458 
1459 template <chip CHIP>
1460 static void
1461 tu6_emit_cs_config(struct tu_cs *cs,
1462                    const struct ir3_shader_variant *v,
1463                    const struct tu_pvtmem_config *pvtmem,
1464                    uint64_t binary_iova)
1465 {
1466    bool shared_consts_enable =
1467       ir3_const_state(v)->push_consts_type == IR3_PUSH_CONSTS_SHARED;
1468    tu6_emit_shared_consts_enable<CHIP>(cs, shared_consts_enable);
1469 
1470    tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
1471          .cs_state = true,
1472          .cs_ibo = true,
1473          .cs_shared_const = shared_consts_enable));
1474 
1475    tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_COMPUTE, v);
1476    tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
1477 
1478    uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
1479    tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
1480    tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
1481                   A6XX_SP_CS_UNKNOWN_A9B1_UNK6);
1482 
1483    if (CHIP == A6XX && cs->device->physical_device->info->a6xx.has_lpac) {
1484       tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1);
1485       tu_cs_emit(cs, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) |
1486                      A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6);
1487    }
1488 
1489    uint32_t local_invocation_id =
1490       ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
1491    uint32_t work_group_id =
1492       ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);
1493 
1494    /*
1495     * Devices that do not support double threadsize take the threadsize from
1496     * A6XX_HLSQ_FS_CNTL_0_THREADSIZE instead of A6XX_HLSQ_CS_CNTL_1_THREADSIZE,
1497     * which is always set to THREAD128.
1498     */
1499    enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
1500    enum a6xx_threadsize thrsz_cs = cs->device->physical_device->info->a6xx
1501       .supports_double_threadsize ? thrsz : THREAD128;
1502    if (CHIP == A6XX) {
1503       tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
1504       tu_cs_emit(cs,
1505                  A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
1506                  A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
1507                  A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
1508                  A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
1509       tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
1510                      A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz_cs));
1511       if (!cs->device->physical_device->info->a6xx.supports_double_threadsize) {
1512          tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1);
1513          tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz));
1514       }
1515 
1516       if (cs->device->physical_device->info->a6xx.has_lpac) {
1517          tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2);
1518          tu_cs_emit(cs,
1519                     A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
1520                     A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
1521                     A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
1522                     A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
1523          tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
1524                   A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
1525       }
1526    } else {
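      /* Pick the workgroup tile height from how the local Y size factors;
       * the exact HW meaning of these magic values is not fully understood.
       */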
1527       unsigned tile_height = (v->local_size[1] % 8 == 0)   ? 3
1528                              : (v->local_size[1] % 4 == 0) ? 5
1529                              : (v->local_size[1] % 2 == 0) ? 9
1530                                                            : 17;
1531       tu_cs_emit_regs(
1532          cs, HLSQ_CS_CNTL_1(CHIP,
1533                    .linearlocalidregid = regid(63, 0), .threadsize = thrsz_cs,
1534                    .workgrouprastorderzfirsten = true,
1535                    .wgtilewidth = 4, .wgtileheight = tile_height));
1536 
1537       tu_cs_emit_regs(cs, HLSQ_FS_CNTL_0(CHIP, .threadsize = THREAD64));
1538 
1539       tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 1);
1540       tu_cs_emit(cs, A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
1541                         A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
1542                         A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
1543                         A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
1544 
1545       tu_cs_emit_regs(cs,
1546                       SP_CS_CNTL_1(CHIP,
1547                         .linearlocalidregid = regid(63, 0),
1548                         .threadsize = thrsz_cs,
1549                         .workitemrastorder =
1550                            v->cs.force_linear_dispatch ?
1551                            WORKITEMRASTORDER_LINEAR :
1552                            WORKITEMRASTORDER_TILED, ));
1553 
1554       tu_cs_emit_regs(
1555          cs, A7XX_HLSQ_CS_LOCAL_SIZE(.localsizex = v->local_size[0] - 1,
1556                                      .localsizey = v->local_size[1] - 1,
1557                                      .localsizez = v->local_size[2] - 1, ));
1558 
1559       tu_cs_emit_regs(cs, A7XX_SP_CS_UNKNOWN_A9BE(0)); // Sometimes this is 0x08000000
1560    }
1561 }
1562 
1563 #define TU6_EMIT_VFD_DEST_MAX_DWORDS (MAX_VERTEX_ATTRIBS + 2)
1564 
1565 static void
1566 tu6_emit_vfd_dest(struct tu_cs *cs,
1567                   const struct ir3_shader_variant *vs)
1568 {
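   /* Map each vertex attribute location to the VS input that consumes it
    * (-1 if unused) so VFD_DEST_CNTL can route fetched attributes to the
    * right registers.
    */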
1569    int32_t input_for_attr[MAX_VERTEX_ATTRIBS];
1570    uint32_t attr_count = 0;
1571 
1572    for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; i++)
1573       input_for_attr[i] = -1;
1574 
1575    for (unsigned i = 0; i < vs->inputs_count; i++) {
1576       if (vs->inputs[i].sysval || vs->inputs[i].regid == regid(63, 0))
1577          continue;
1578 
1579       assert(vs->inputs[i].slot >= VERT_ATTRIB_GENERIC0);
1580       unsigned loc = vs->inputs[i].slot - VERT_ATTRIB_GENERIC0;
1581       input_for_attr[loc] = i;
1582       attr_count = MAX2(attr_count, loc + 1);
1583    }
1584 
1585    tu_cs_emit_regs(cs,
1586                    A6XX_VFD_CONTROL_0(
1587                      .fetch_cnt = attr_count, /* decode_cnt for binning pass ? */
1588                      .decode_cnt = attr_count));
1589 
1590    if (attr_count)
1591       tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DEST_CNTL_INSTR(0), attr_count);
1592 
1593    for (unsigned i = 0; i < attr_count; i++) {
1594       if (input_for_attr[i] >= 0) {
1595             unsigned input_idx = input_for_attr[i];
1596             tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0,
1597                              .writemask = vs->inputs[input_idx].compmask,
1598                              .regid = vs->inputs[input_idx].regid).value);
1599       } else {
1600             tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0,
1601                              .writemask = 0,
1602                              .regid = regid(63, 0)).value);
1603       }
1604    }
1605 }
1606 
1607 static enum a6xx_tex_prefetch_cmd
1608 tu6_tex_opc_to_prefetch_cmd(opc_t tex_opc)
1609 {
1610    switch (tex_opc) {
1611    case OPC_SAM:
1612       return TEX_PREFETCH_SAM;
1613    default:
1614       unreachable("Unknown tex opc for prefetch cmd");
1615    }
1616 }
1617 
1618 template <chip CHIP>
1619 static void
1620 tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
1621 {
1622    uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
1623    uint32_t ij_regid[IJ_COUNT];
1624    uint32_t smask_in_regid, shading_rate_regid;
1625 
1626    bool sample_shading = fs->per_samp | fs->key.sample_shading;
1627    bool enable_varyings = fs->total_in > 0;
1628 
1629    samp_id_regid   = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID);
1630    smask_in_regid  = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN);
1631    face_regid      = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE);
1632    coord_regid     = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD);
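   /* gl_FragCoord is allocated as a contiguous vec4, so zw sits two
    * registers after xy.
    */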
1633    zwcoord_regid   = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0);
1634    shading_rate_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_SHADING_RATE);
1635    for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
1636       ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);
1637 
1638    if (fs->num_sampler_prefetch > 0) {
1639       /* It seems like ij_pix is *required* to be r0.x */
1640       assert(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]) ||
1641              ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
1642    }
1643 
1644    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);
1645    tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) |
1646                      COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID(0x1ff)) |
1647                      COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID4COORD(0x1ff)) |
1648                      COND(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]),
1649                           A6XX_SP_FS_PREFETCH_CNTL_IJ_WRITE_DISABLE) |
1650                      COND(fs->prefetch_end_of_quad,
1651                           A6XX_SP_FS_PREFETCH_CNTL_ENDOFQUAD));
1652    for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1653       const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1654       tu_cs_emit(
1655          cs, SP_FS_PREFETCH_CMD(
1656                 CHIP, i, .src = prefetch->src, .samp_id = prefetch->samp_id,
1657                 .tex_id = prefetch->tex_id, .dst = prefetch->dst,
1658                 .wrmask = prefetch->wrmask, .half = prefetch->half_precision,
1659                 .bindless = prefetch->bindless,
1660                 .cmd = tu6_tex_opc_to_prefetch_cmd(prefetch->tex_opc), ).value);
1661    }
1662 
1663    if (fs->num_sampler_prefetch > 0) {
1664       tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch);
1665       for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1666          const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1667          tu_cs_emit(cs,
1668                     A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) |
1669                     A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id));
1670       }
1671    }
1672 
1673    tu_cs_emit_regs(cs,
1674       HLSQ_CONTROL_1_REG(CHIP,
1675          .primallocthreshold =
1676             cs->device->physical_device->info->a6xx.prim_alloc_threshold),
1677       HLSQ_CONTROL_2_REG(CHIP, .faceregid = face_regid,
1678                          .sampleid = samp_id_regid,
1679                          .samplemask = smask_in_regid,
1680                          .centerrhw = ij_regid[IJ_PERSP_CENTER_RHW]),
1681       HLSQ_CONTROL_3_REG(CHIP, .ij_persp_pixel = ij_regid[IJ_PERSP_PIXEL],
1682                          .ij_linear_pixel = ij_regid[IJ_LINEAR_PIXEL],
1683                          .ij_persp_centroid = ij_regid[IJ_PERSP_CENTROID],
1684                          .ij_linear_centroid = ij_regid[IJ_LINEAR_CENTROID]),
1685       HLSQ_CONTROL_4_REG(CHIP, .ij_persp_sample = ij_regid[IJ_PERSP_SAMPLE],
1686                          .ij_linear_sample = ij_regid[IJ_LINEAR_SAMPLE],
1687                          .xycoordregid = coord_regid,
1688                          .zwcoordregid = zwcoord_regid),
1689       HLSQ_CONTROL_5_REG(CHIP, .linelengthregid = 0xfc,
1690                          .foveationqualityregid = shading_rate_regid), );
1691 
1692    if (CHIP >= A7XX) {
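      /* Count the scalar registers consumed by FS sysvals: barycentrics take
       * two each (center_rhw only one), frag coord xy and zw take two each,
       * and the remaining sysvals take one each.
       */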
1693       uint32_t sysval_regs = 0;
1694       for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) {
1695          if (VALIDREG(ij_regid[i])) {
1696             if (i == IJ_PERSP_CENTER_RHW)
1697                sysval_regs += 1;
1698             else
1699                sysval_regs += 2;
1700          }
1701       }
1702 
1703       for (uint32_t sysval : { face_regid, samp_id_regid, smask_in_regid,
1704                                shading_rate_regid }) {
1705          if (VALIDREG(sysval))
1706             sysval_regs += 1;
1707       }
1708 
1709       for (uint32_t sysval : { coord_regid, zwcoord_regid }) {
1710          if (VALIDREG(sysval))
1711             sysval_regs += 2;
1712       }
1713 
1714       tu_cs_emit_regs(cs, A7XX_HLSQ_UNKNOWN_A9AE(.sysval_regs_count = sysval_regs,
1715                                                  .unk8 = 1,
1716                                                  .unk9 = 1));
1717    }
1718 
1719    enum a6xx_threadsize thrsz = fs->info.double_threadsize ? THREAD128 : THREAD64;
1720    tu_cs_emit_regs(cs, HLSQ_FS_CNTL_0(CHIP, .threadsize = thrsz, .varyings = enable_varyings));
1721 
1722    bool need_size = fs->frag_face || fs->fragcoord_compmask != 0;
1723    bool need_size_persamp = false;
1724    if (VALIDREG(ij_regid[IJ_PERSP_CENTER_RHW])) {
1725       if (sample_shading)
1726          need_size_persamp = true;
1727       else
1728          need_size = true;
1729    }
1730 
1731    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1);
1732    tu_cs_emit(cs,
1733          CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) |
1734          CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) |
1735          CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) |
1736          CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1737          CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) |
1738          CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1739          COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1740          COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1741          COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask)));
1742 
1743    tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2);
1744    tu_cs_emit(cs,
1745          CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) |
1746          CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) |
1747          CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) |
1748          CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1749          CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) |
1750          CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1751          COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1752          COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) |
1753          COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1754          COND(fs->fragcoord_compmask != 0,
1755                            A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask)));
1756    tu_cs_emit(cs,
1757          A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE(
1758             sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) |
1759          CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
1760          CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
1761          CONDREG(ij_regid[IJ_PERSP_CENTER_RHW], A6XX_RB_RENDER_CONTROL1_CENTERRHW) |
1762          COND(fs->post_depth_coverage, A6XX_RB_RENDER_CONTROL1_POSTDEPTHCOVERAGE)  |
1763          COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS) |
1764          CONDREG(shading_rate_regid, A6XX_RB_RENDER_CONTROL1_FOVEATION));
1765 
1766    tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CNTL, 1);
1767    tu_cs_emit(cs, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE));
1768 
1769    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1);
1770    tu_cs_emit(cs, CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) |
1771               A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE(
1772                  sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER));
1773 
1774    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 1);
1775    tu_cs_emit(cs, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE));
1776 
1777    uint32_t varmask[4] = { 0 };
1778 
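   /* Collect a per-component mask of varyings the FS actually reads; the
    * complement is written to VPC_VAR_DISABLE below.
    */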
1779    for (int i = ir3_next_varying(fs, -1); i < fs->inputs_count;
1780         i = ir3_next_varying(fs, i)) {
1781       if (fs->inputs[i].inloc >= fs->total_in)
1782          continue;
1783 
1784       unsigned loc = fs->inputs[i].inloc;
1785       for (int j = 0; j < util_last_bit(fs->inputs[i].compmask); j++) {
1786          uint8_t comploc = loc + j;
1787          varmask[comploc / 32] |= 1 << (comploc % 32);
1788       }
1789    }
1790 
1791    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
1792    tu_cs_emit(cs, ~varmask[0]);
1793    tu_cs_emit(cs, ~varmask[1]);
1794    tu_cs_emit(cs, ~varmask[2]);
1795    tu_cs_emit(cs, ~varmask[3]);
1796 
1797    unsigned primid_loc = ir3_find_input_loc(fs, VARYING_SLOT_PRIMITIVE_ID);
1798    unsigned viewid_loc = ir3_find_input_loc(fs, VARYING_SLOT_VIEW_INDEX);
1799 
1800    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
1801    tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs->total_in) |
1802                   COND(fs && fs->total_in, A6XX_VPC_CNTL_0_VARYING) |
1803                   A6XX_VPC_CNTL_0_PRIMIDLOC(primid_loc) |
1804                   A6XX_VPC_CNTL_0_VIEWIDLOC(viewid_loc));
1805 }
1806 
1807 static void
1808 tu6_emit_fs_outputs(struct tu_cs *cs,
1809                     const struct ir3_shader_variant *fs)
1810 {
1811    uint32_t smask_regid, posz_regid, stencilref_regid;
1812 
1813    posz_regid      = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH);
1814    smask_regid     = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK);
1815    stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL);
1816 
1817    int output_reg_count = 0;
1818    uint32_t fragdata_regid[8];
1819 
1820    assert(!fs->color0_mrt);
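   /* Find the highest MRT slot actually written; each written output enables
    * a full 0xf component mask in the RENDER_COMPONENTS registers below.
    */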
1821    for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) {
1822       fragdata_regid[i] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + i);
1823       if (VALIDREG(fragdata_regid[i]))
1824          output_reg_count = i + 1;
1825    }
1826 
1827    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1);
1828    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) |
1829                   A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) |
1830                   A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) |
1831                   COND(fs->dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1832 
1833    /* There is no point in having a component enabled that is not written
1834     * by the shader. Per the VK spec this is UB, however a few apps depend on
1835     * the attachment not being changed if the FS doesn't have a corresponding output.
1836     */
1837    uint32_t fs_render_components = 0;
1838 
1839    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), output_reg_count);
1840    for (uint32_t i = 0; i < output_reg_count; i++) {
1841       tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) |
1842                      (COND(fragdata_regid[i] & HALF_REG_ID,
1843                            A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)));
1844 
1845       if (VALIDREG(fragdata_regid[i])) {
1846          fs_render_components |= 0xf << (i * 4);
1847       }
1848    }
1849 
1850    tu_cs_emit_regs(cs,
1851                    A6XX_SP_FS_RENDER_COMPONENTS(.dword = fs_render_components));
1852 
1853    tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 1);
1854    tu_cs_emit(cs, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
1855                   COND(fs->writes_smask, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
1856                   COND(fs->writes_stencilref, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
1857                   COND(fs->dual_src_blend, A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1858 
1859    tu_cs_emit_regs(cs,
1860                    A6XX_RB_RENDER_COMPONENTS(.dword = fs_render_components));
1861 }
1862 
1863 template <chip CHIP>
1864 void
1865 tu6_emit_vs(struct tu_cs *cs,
1866             const struct ir3_shader_variant *vs,
1867             uint32_t view_mask)
1868 {
1869    bool multi_pos_output = vs->multi_pos_output;
1870 
1871    uint32_t multiview_views = util_logbase2(view_mask) + 1;
1872    uint32_t multiview_cntl = view_mask ?
1873       A6XX_PC_MULTIVIEW_CNTL_ENABLE |
1874       A6XX_PC_MULTIVIEW_CNTL_VIEWS(multiview_views) |
1875       COND(!multi_pos_output, A6XX_PC_MULTIVIEW_CNTL_DISABLEMULTIPOS)
1876       : 0;
1877 
1878    /* Copy what the blob does here. This will emit an extra 0x3f
1879     * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
1880     * this is working around yet.
1881     */
1882    if (cs->device->physical_device->info->a6xx.has_cp_reg_write) {
1883       tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
1884       tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
1885       tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
1886    } else {
1887       tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_CNTL, 1);
1888    }
1889    tu_cs_emit(cs, multiview_cntl);
1890 
1891    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_MULTIVIEW_CNTL, 1);
1892    tu_cs_emit(cs, multiview_cntl);
1893 
1894    if (multiview_cntl &&
1895        cs->device->physical_device->info->a6xx.supports_multiview_mask) {
1896       tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_MASK, 1);
1897       tu_cs_emit(cs, view_mask);
1898    }
1899 
1900    if (CHIP >= A7XX) {
1901       tu_cs_emit_pkt4(cs, REG_A7XX_VPC_MULTIVIEW_CNTL, 1);
1902       tu_cs_emit(cs, multiview_cntl);
1903 
1904       tu_cs_emit_pkt4(cs, REG_A7XX_VPC_MULTIVIEW_MASK, 1);
1905       tu_cs_emit(cs, view_mask);
1906    }
1907 
1908    tu6_emit_vfd_dest(cs, vs);
1909 
1910    const uint32_t vertexid_regid =
1911          ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
1912    const uint32_t instanceid_regid =
1913          ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
1914 
1915    /* Note: we currently don't support multiview with tess or GS. If we did,
1916     * and the HW actually works, then we'd have to somehow share this across
1917     * stages. Note that the blob doesn't support this either.
1918     */
1919    const uint32_t viewid_regid =
1920       ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX);
1921 
1922    const uint32_t vs_primitiveid_regid =
1923       ir3_find_sysval_regid(vs, SYSTEM_VALUE_PRIMITIVE_ID);
1924 
1925    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_1, 1);
1926    tu_cs_emit(cs, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) |
1927                   A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) |
1928                   A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) |
1929                   A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid));
1930 }
1931 TU_GENX(tu6_emit_vs);
1932 
1933 template <chip CHIP>
1934 void
1935 tu6_emit_hs(struct tu_cs *cs,
1936             const struct ir3_shader_variant *hs)
1937 {
1938    const uint32_t hs_rel_patch_regid =
1939          ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3);
1940    const uint32_t hs_invocation_regid =
1941          ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3);
1942 
1943    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_2, 1);
1944    tu_cs_emit(cs, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) |
1945                   A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid));
1946 
1947    if (hs) {
1948       tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
1949       tu_cs_emit(cs, hs->tess.tcs_vertices_out);
1950    }
1951 }
1952 TU_GENX(tu6_emit_hs);
1953 
1954 template <chip CHIP>
1955 void
1956 tu6_emit_ds(struct tu_cs *cs,
1957             const struct ir3_shader_variant *ds)
1958 {
1959    const uint32_t ds_rel_patch_regid =
1960          ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3);
1961    const uint32_t tess_coord_x_regid =
1962          ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD);
1963    const uint32_t tess_coord_y_regid = VALIDREG(tess_coord_x_regid) ?
1964          tess_coord_x_regid + 1 :
1965          regid(63, 0);
1966    const uint32_t ds_primitiveid_regid =
1967          ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID);
1968 
1969    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_3, 2);
1970    tu_cs_emit(cs, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) |
1971                   A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) |
1972                   A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) |
1973                   A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid));
1974    tu_cs_emit(cs, 0x000000fc); /* VFD_CONTROL_4 */
1975 }
1976 TU_GENX(tu6_emit_ds);
1977 
1978 static enum a6xx_tess_output
1979 primitive_to_tess(enum mesa_prim primitive) {
1980    switch (primitive) {
1981    case MESA_PRIM_POINTS:
1982       return TESS_POINTS;
1983    case MESA_PRIM_LINE_STRIP:
1984       return TESS_LINES;
1985    case MESA_PRIM_TRIANGLE_STRIP:
1986       return TESS_CW_TRIS;
1987    default:
1988       unreachable("");
1989    }
1990 }
1991 
1992 template <chip CHIP>
1993 void
1994 tu6_emit_gs(struct tu_cs *cs,
1995             const struct ir3_shader_variant *gs)
1996 {
1997    const uint32_t gsheader_regid =
1998          ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3);
1999 
2000    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_5, 1);
2001    tu_cs_emit(cs, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) |
2002                   0xfc00);
2003 
2004    if (gs) {
2005       uint32_t vertices_out, invocations;
2006 
2007       vertices_out = gs->gs.vertices_out - 1;
2008       enum a6xx_tess_output output = primitive_to_tess((enum mesa_prim) gs->gs.output_primitive);
2009       invocations = gs->gs.invocations - 1;
2010 
2011       uint32_t primitive_cntl =
2012          A6XX_PC_PRIMITIVE_CNTL_5(.gs_vertices_out = vertices_out,
2013                                   .gs_invocations = invocations,
2014                                   .gs_output = output,).value;
2015 
2016       tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
2017       tu_cs_emit(cs, primitive_cntl);
2018 
2019       if (CHIP >= A7XX) {
2020          tu_cs_emit_pkt4(cs, REG_A7XX_VPC_PRIMITIVE_CNTL_5, 1);
2021          tu_cs_emit(cs, primitive_cntl);
2022       } else {
2023          tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_PARAM, 1);
2024          tu_cs_emit(cs, 0xff);
2025       }
2026    }
2027 }
2028 TU_GENX(tu6_emit_gs);
2029 
2030 template <chip CHIP>
2031 void
2032 tu6_emit_fs(struct tu_cs *cs,
2033             const struct ir3_shader_variant *fs)
2034 {
2035    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_6, 1);
2036    tu_cs_emit(cs, COND(fs && fs->reads_primid, A6XX_VFD_CONTROL_6_PRIMID4PSEN));
2037 
2038    tu_cs_emit_regs(cs, A6XX_PC_PS_CNTL(.primitiveiden = fs && fs->reads_primid));
2039 
2040    if (CHIP >= A7XX) {
2041       tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2));
2042       tu_cs_emit_regs(cs, A7XX_HLSQ_FS_UNKNOWN_A9AA(.consts_load_disable = false));
2043    }
2044 
2045    if (fs) {
2046       tu6_emit_fs_inputs<CHIP>(cs, fs);
2047       tu6_emit_fs_outputs(cs, fs);
2048    } else {
2049       /* TODO: check if these can be skipped if fs is disabled */
2050       struct ir3_shader_variant dummy_variant = {};
2051       tu6_emit_fs_inputs<CHIP>(cs, &dummy_variant);
2052       tu6_emit_fs_outputs(cs, &dummy_variant);
2053    }
2054 }
2055 TU_GENX(tu6_emit_fs);
2056 
2057 template <chip CHIP>
2058 static void
2059 tu6_emit_variant(struct tu_cs *cs,
2060                  gl_shader_stage stage,
2061                  const struct ir3_shader_variant *xs,
2062                  struct tu_pvtmem_config *pvtmem_config,
2063                  uint32_t view_mask,
2064                  uint64_t binary_iova)
2065 {
2066    if (stage == MESA_SHADER_COMPUTE) {
2067       tu6_emit_cs_config<CHIP>(cs, xs, pvtmem_config, binary_iova);
2068       return;
2069    }
2070 
2071    tu6_emit_xs(cs, stage, xs, pvtmem_config, binary_iova);
2072 
2073    switch (stage) {
2074    case MESA_SHADER_VERTEX:
2075       tu6_emit_vs<CHIP>(cs, xs, view_mask);
2076       break;
2077    case MESA_SHADER_TESS_CTRL:
2078       tu6_emit_hs<CHIP>(cs, xs);
2079       break;
2080    case MESA_SHADER_TESS_EVAL:
2081       tu6_emit_ds<CHIP>(cs, xs);
2082       break;
2083    case MESA_SHADER_GEOMETRY:
2084       tu6_emit_gs<CHIP>(cs, xs);
2085       break;
2086    case MESA_SHADER_FRAGMENT:
2087       tu6_emit_fs<CHIP>(cs, xs);
2088       break;
2089    default:
2090       unreachable("unknown shader stage");
2091    }
2092 }
2093 
2094 static VkResult
2095 tu_setup_pvtmem(struct tu_device *dev,
2096                 struct tu_shader *shader,
2097                 struct tu_pvtmem_config *config,
2098                 uint32_t pvtmem_bytes,
2099                 bool per_wave)
2100 {
2101    if (!pvtmem_bytes) {
2102       memset(config, 0, sizeof(*config));
2103       return VK_SUCCESS;
2104    }
2105 
2106    /* Allocating private memory BOs on a per-pipeline basis has a substantial
2107     * memory footprint, and it isn't required: the same BO can be used by
2108     * multiple pipelines as long as they share the same private memory layout
2109     * (sizes and per-wave/per-fiber). Otherwise, another active pipeline using
2110     * the same BO with a differing private memory layout could overwrite it,
2111     * resulting in memory corruption.
2112     *
2113     * To avoid this, we create private memory BOs on a per-device level with
2114     * an associated private memory layout then dynamically grow them when
2115     * needed and reuse them across pipelines. Growth is done in terms of
2116     * powers of two so that we can avoid frequent reallocation of the
2117     * private memory BOs.
2118     */
2119 
2120    struct tu_pvtmem_bo *pvtmem_bo =
2121       per_wave ? &dev->wave_pvtmem_bo : &dev->fiber_pvtmem_bo;
2122    mtx_lock(&pvtmem_bo->mtx);
2123 
2124    if (pvtmem_bo->per_fiber_size < pvtmem_bytes) {
2125       if (pvtmem_bo->bo)
2126          tu_bo_finish(dev, pvtmem_bo->bo);
2127 
2128       pvtmem_bo->per_fiber_size =
2129          util_next_power_of_two(ALIGN(pvtmem_bytes, 512));
2130       pvtmem_bo->per_sp_size =
2131          ALIGN(pvtmem_bo->per_fiber_size *
2132                   dev->physical_device->info->fibers_per_sp,
2133                1 << 12);
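      /* Worked example with hypothetical numbers: a 5000-byte request becomes
       * ALIGN(5000, 512) = 5120, rounded up to the next power of two = 8192
       * bytes per fiber; with 128 fibers per SP that is
       * ALIGN(8192 * 128, 4096) = 1 MiB per SP.
       */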
2134       uint32_t total_size =
2135          dev->physical_device->info->num_sp_cores * pvtmem_bo->per_sp_size;
2136 
2137       VkResult result = tu_bo_init_new(dev, NULL, &pvtmem_bo->bo, total_size,
2138                                        TU_BO_ALLOC_INTERNAL_RESOURCE, "pvtmem");
2139       if (result != VK_SUCCESS) {
2140          mtx_unlock(&pvtmem_bo->mtx);
2141          return result;
2142       }
2143    }
2144 
2145    config->per_wave = per_wave;
2146    config->per_fiber_size = pvtmem_bo->per_fiber_size;
2147    config->per_sp_size = pvtmem_bo->per_sp_size;
2148 
2149    shader->pvtmem_bo = tu_bo_get_ref(pvtmem_bo->bo);
2150    config->iova = shader->pvtmem_bo->iova;
2151 
2152    mtx_unlock(&pvtmem_bo->mtx);
2153 
2154    return VK_SUCCESS;
2155 }
2156 
2157 static uint64_t
2158 tu_upload_variant(struct tu_cs *cs,
2159                   const struct ir3_shader_variant *variant)
2160 {
2161    struct tu_cs_memory memory;
2162 
2163    if (!variant)
2164       return 0;
2165 
2166    /* This relies on getting enough alignment because shaders are allocated
2167     * first and the total size is always aligned correctly.
2168     * Note: an assert in tu6_emit_xs_config validates the alignment.
2169     */
2170    tu_cs_alloc(cs, variant->info.size / 4, 1, &memory);
2171 
2172    memcpy(memory.map, variant->bin, variant->info.size);
2173    return memory.iova;
2174 }
2175 
2176 static VkResult
2177 tu_upload_shader(struct tu_device *dev,
2178                  struct tu_shader *shader)
2179 {
2180    const struct ir3_shader_variant *v = shader->variant;
2181    const struct ir3_shader_variant *binning = v ? v->binning : NULL;
2182    const struct ir3_shader_variant *safe_const = shader->safe_const_variant;
2183 
2184    if (v->type == MESA_SHADER_VERTEX && v->stream_output.num_outputs != 0)
2185       binning = v;
2186 
2187    uint32_t size = 0;
2188    if (v->type == MESA_SHADER_VERTEX)
2189       size += TU6_EMIT_VFD_DEST_MAX_DWORDS;
2190 
2191    const unsigned xs_size = 128;
2192    const unsigned vpc_size = 32 + (v->stream_output.num_outputs != 0 ? 256 : 0);
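   /* Fixed dword budgets for the per-stage config state and the optional
    * VPC/streamout state emitted into the sub-streams below.
    */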
2193 
2194    size += xs_size + tu_xs_get_additional_cs_size_dwords(v);
2195    size += v->info.size / 4;
2196    if (binning) {
2197       size += xs_size + tu_xs_get_additional_cs_size_dwords(binning);
2198       size += binning->info.size / 4;
2199    }
2200 
2201    if (safe_const) {
2202       size += xs_size + tu_xs_get_additional_cs_size_dwords(safe_const);
2203       size += safe_const->info.size / 4;
2204    }
2205 
2206    /* We emit an empty VPC including streamout state in the binning draw state */
2207    if (binning || v->type == MESA_SHADER_GEOMETRY) {
2208       size += vpc_size;
2209    }
2210 
2211    pthread_mutex_lock(&dev->pipeline_mutex);
2212    VkResult result = tu_suballoc_bo_alloc(&shader->bo, &dev->pipeline_suballoc,
2213                                           size * 4, 128);
2214    pthread_mutex_unlock(&dev->pipeline_mutex);
2215 
2216    if (result != VK_SUCCESS)
2217       return result;
2218 
2219    uint32_t pvtmem_size = v->pvtmem_size;
2220    bool per_wave = v->pvtmem_per_wave;
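   /* Private memory must be sized for the largest requirement across the
    * main, binning and safe-const variants, and the per-wave layout can only
    * be used if every variant allows it.
    */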
2221 
2222    if (v->binning) {
2223       pvtmem_size = MAX2(pvtmem_size, shader->variant->binning->pvtmem_size);
2224       if (!shader->variant->binning->pvtmem_per_wave)
2225          per_wave = false;
2226    }
2227 
2228    if (shader->safe_const_variant) {
2229       pvtmem_size = MAX2(pvtmem_size, shader->safe_const_variant->pvtmem_size);
2230       if (!shader->safe_const_variant->pvtmem_per_wave)
2231          per_wave = false;
2232 
2233       if (shader->safe_const_variant->binning) {
2234          pvtmem_size = MAX2(pvtmem_size, shader->safe_const_variant->binning->pvtmem_size);
2235          if (!shader->safe_const_variant->binning->pvtmem_per_wave)
2236             per_wave = false;
2237       }
2238    }
2239 
2240    struct tu_pvtmem_config pvtmem_config;
2241 
2242    result = tu_setup_pvtmem(dev, shader, &pvtmem_config, pvtmem_size, per_wave);
2243    if (result != VK_SUCCESS) {
2244       pthread_mutex_lock(&dev->pipeline_mutex);
2245       tu_suballoc_bo_free(&dev->pipeline_suballoc, &shader->bo);
2246       pthread_mutex_unlock(&dev->pipeline_mutex);
2247       return result;
2248    }
2249 
2250    TU_RMV(cmd_buffer_suballoc_bo_create, dev, &shader->bo);
2251    tu_cs_init_suballoc(&shader->cs, dev, &shader->bo);
2252 
2253    uint64_t iova = tu_upload_variant(&shader->cs, v);
2254    uint64_t binning_iova = tu_upload_variant(&shader->cs, binning);
2255    uint64_t safe_const_iova = tu_upload_variant(&shader->cs, safe_const);
2256 
2257    struct tu_cs sub_cs;
2258    tu_cs_begin_sub_stream(&shader->cs, xs_size +
2259                           tu_xs_get_additional_cs_size_dwords(v), &sub_cs);
2260    TU_CALLX(dev, tu6_emit_variant)(
2261       &sub_cs, shader->variant->type, shader->variant, &pvtmem_config,
2262       shader->view_mask, iova);
2263    shader->state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2264 
2265    if (safe_const) {
2266       tu_cs_begin_sub_stream(&shader->cs, xs_size +
2267                              tu_xs_get_additional_cs_size_dwords(safe_const), &sub_cs);
2268       TU_CALLX(dev, tu6_emit_variant)(
2269          &sub_cs, v->type, safe_const, &pvtmem_config, shader->view_mask,
2270          safe_const_iova);
2271       shader->safe_const_state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2272    }
2273 
2274    if (binning) {
2275       tu_cs_begin_sub_stream(&shader->cs, xs_size + vpc_size +
2276                              tu_xs_get_additional_cs_size_dwords(binning), &sub_cs);
2277       TU_CALLX(dev, tu6_emit_variant)(
2278          &sub_cs, v->type, binning, &pvtmem_config, shader->view_mask,
2279          binning_iova);
2280       /* emit an empty VPC */
2281       TU_CALLX(dev, tu6_emit_vpc)(&sub_cs, binning, NULL, NULL, NULL, NULL);
2282       shader->binning_state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2283    }
2284 
2285    /* We don't support binning variants for GS, so the same draw state is used
2286     * when binning and when drawing, but the VPC draw state is not executed
2287     * when binning, so we still need to generate an appropriate VPC config for
2288     * binning.
2289     */
2290    if (v->type == MESA_SHADER_GEOMETRY) {
2291       tu_cs_begin_sub_stream(&shader->cs, vpc_size, &sub_cs);
2292       TU_CALLX(dev, tu6_emit_vpc)(&sub_cs, NULL, NULL, NULL, v, NULL);
2293       shader->binning_state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2294    }
2295 
2296    return VK_SUCCESS;
2297 }
2298 
2299 static bool
2300 tu_shader_serialize(struct vk_pipeline_cache_object *object,
2301                     struct blob *blob);
2302 
2303 static struct vk_pipeline_cache_object *
2304 tu_shader_deserialize(struct vk_pipeline_cache *cache,
2305                       const void *key_data,
2306                       size_t key_size,
2307                       struct blob_reader *blob);
2308 
2309 static void
2310 tu_shader_pipeline_cache_object_destroy(struct vk_device *vk_device,
2311                                         struct vk_pipeline_cache_object *object)
2312 {
2313    struct tu_device *device = container_of(vk_device, struct tu_device, vk);
2314    struct tu_shader *shader =
2315       container_of(object, struct tu_shader, base);
2316 
2317    vk_pipeline_cache_object_finish(&shader->base);
2318    tu_shader_destroy(device, shader);
2319 }
2320 
2321 const struct vk_pipeline_cache_object_ops tu_shader_ops = {
2322    .serialize = tu_shader_serialize,
2323    .deserialize = tu_shader_deserialize,
2324    .destroy = tu_shader_pipeline_cache_object_destroy,
2325 };
2326 
2327 static struct tu_shader *
2328 tu_shader_init(struct tu_device *dev, const void *key_data, size_t key_size)
2329 {
2330    VK_MULTIALLOC(ma);
2331    VK_MULTIALLOC_DECL(&ma, struct tu_shader, shader, 1);
2332    VK_MULTIALLOC_DECL_SIZE(&ma, char, obj_key_data, key_size);
2333 
2334    if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc,
2335                              VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
2336       return NULL;
2337 
2338    memcpy(obj_key_data, key_data, key_size);
2339 
2340    vk_pipeline_cache_object_init(&dev->vk, &shader->base,
2341                                  &tu_shader_ops, obj_key_data, key_size);
2342 
2343    shader->const_state.fdm_ubo.idx = -1;
2344    shader->const_state.dynamic_offsets_ubo.idx = -1;
2345    shader->const_state.inline_uniforms_ubo.idx = -1;
2346 
2347    return shader;
2348 }
2349 
2350 static bool
2351 tu_shader_serialize(struct vk_pipeline_cache_object *object,
2352                     struct blob *blob)
2353 {
2354    struct tu_shader *shader =
2355       container_of(object, struct tu_shader, base);
2356 
2357    blob_write_bytes(blob, &shader->const_state, sizeof(shader->const_state));
2358    blob_write_bytes(blob, &shader->dynamic_descriptor_sizes,
2359                     sizeof(shader->dynamic_descriptor_sizes));
2360    blob_write_uint32(blob, shader->view_mask);
2361    blob_write_uint8(blob, shader->active_desc_sets);
2362 
2363    ir3_store_variant(blob, shader->variant);
2364 
2365    if (shader->safe_const_variant) {
2366       blob_write_uint8(blob, 1);
2367       ir3_store_variant(blob, shader->safe_const_variant);
2368    } else {
2369       blob_write_uint8(blob, 0);
2370    }
2371 
2372 
2373 
2374    switch (shader->variant->type) {
2375    case MESA_SHADER_TESS_EVAL:
2376       blob_write_bytes(blob, &shader->tes, sizeof(shader->tes));
2377       break;
2378    case MESA_SHADER_FRAGMENT:
2379       blob_write_bytes(blob, &shader->fs, sizeof(shader->fs));
2380       break;
2381    default:
2382       break;
2383    }
2384 
2385    return true;
2386 }
2387 
2388 static struct vk_pipeline_cache_object *
2389 tu_shader_deserialize(struct vk_pipeline_cache *cache,
2390                       const void *key_data,
2391                       size_t key_size,
2392                       struct blob_reader *blob)
2393 {
2394    struct tu_device *dev =
2395       container_of(cache->base.device, struct tu_device, vk);
2396    struct tu_shader *shader =
2397       tu_shader_init(dev, key_data, key_size);
2398 
2399    if (!shader)
2400       return NULL;
2401 
2402    blob_copy_bytes(blob, &shader->const_state, sizeof(shader->const_state));
2403    blob_copy_bytes(blob, &shader->dynamic_descriptor_sizes,
2404                    sizeof(shader->dynamic_descriptor_sizes));
2405    shader->view_mask = blob_read_uint32(blob);
2406    shader->active_desc_sets = blob_read_uint8(blob);
2407 
2408    shader->variant = ir3_retrieve_variant(blob, dev->compiler, NULL);
2409 
2410    bool has_safe_const = blob_read_uint8(blob);
2411    if (has_safe_const)
2412       shader->safe_const_variant = ir3_retrieve_variant(blob, dev->compiler, NULL);
2413 
2414    switch (shader->variant->type) {
2415    case MESA_SHADER_TESS_EVAL:
2416       blob_copy_bytes(blob, &shader->tes, sizeof(shader->tes));
2417       break;
2418    case MESA_SHADER_FRAGMENT:
2419       blob_copy_bytes(blob, &shader->fs, sizeof(shader->fs));
2420       break;
2421    default:
2422       break;
2423    }
2424 
2425    VkResult result = tu_upload_shader(dev, shader);
2426    if (result != VK_SUCCESS) {
2427       vk_free(&dev->vk.alloc, shader);
2428       return NULL;
2429    }
2430 
2431    return &shader->base;
2432 }
2433 
2434 VkResult
2435 tu_shader_create(struct tu_device *dev,
2436                  struct tu_shader **shader_out,
2437                  nir_shader *nir,
2438                  const struct tu_shader_key *key,
2439                  const struct ir3_shader_key *ir3_key,
2440                  const void *key_data,
2441                  size_t key_size,
2442                  struct tu_pipeline_layout *layout,
2443                  bool executable_info)
2444 {
2445    struct tu_shader *shader = tu_shader_init(dev, key_data, key_size);
2446 
2447    if (!shader)
2448       return VK_ERROR_OUT_OF_HOST_MEMORY;
2449 
2450    const nir_opt_access_options access_options = {
2451       .is_vulkan = true,
2452    };
2453    NIR_PASS_V(nir, nir_opt_access, &access_options);
2454 
2455    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
2456       const nir_input_attachment_options att_options = {
2457          .use_fragcoord_sysval = true,
2458          .use_layer_id_sysval = false,
2459          /* When using multiview rendering, we must use
2460           * gl_ViewIndex as the layer id to pass to the texture
2461           * sampling function. gl_Layer doesn't work when
2462           * multiview is enabled.
2463           */
2464          .use_view_id_for_layer = key->multiview_mask != 0,
2465          .unscaled_depth_stencil_ir3 =
2466             key->dynamic_renderpass && !(key->read_only_input_attachments & 1),
2467          .unscaled_input_attachment_ir3 =
2468             key->dynamic_renderpass ?
2469             ~(key->read_only_input_attachments >> 1) :
2470             key->unscaled_input_fragcoord,
2471       };
2472       NIR_PASS_V(nir, nir_lower_input_attachments, &att_options);
2473    }
2474 
2475    /* This has to happen before lower_input_attachments, because we have to
2476     * lower input attachment coordinates except if unscaled.
2477     */
2478    const struct lower_fdm_options fdm_options = {
2479       .num_views = MAX2(util_last_bit(key->multiview_mask), 1),
2480       .adjust_fragcoord = key->fragment_density_map,
2481    };
2482    NIR_PASS_V(nir, tu_nir_lower_fdm, &fdm_options);
2483 
2484 
2485    /* This needs to happen before multiview lowering which rewrites store
2486     * instructions of the position variable, so that we can just rewrite one
2487     * store at the end instead of having to rewrite every store specified by
2488     * the user.
2489     */
2490    ir3_nir_lower_io_to_temporaries(nir);
2491 
2492    if (nir->info.stage == MESA_SHADER_VERTEX && key->multiview_mask) {
2493       tu_nir_lower_multiview(nir, key->multiview_mask, dev);
2494    }
2495 
2496    if (nir->info.stage == MESA_SHADER_FRAGMENT && key->force_sample_interp) {
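      /* Force per-sample interpolation for every FS input that isn't
       * explicitly centroid-qualified.
       */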
2497       nir_foreach_shader_in_variable(var, nir) {
2498          if (!var->data.centroid)
2499             var->data.sample = true;
2500       }
2501    }
2502 
2503    NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_push_const,
2504               nir_address_format_32bit_offset);
2505 
2506    NIR_PASS_V(nir, nir_lower_explicit_io,
2507               nir_var_mem_ubo | nir_var_mem_ssbo,
2508               nir_address_format_vec2_index_32bit_offset);
2509 
2510    NIR_PASS_V(nir, nir_lower_explicit_io,
2511               nir_var_mem_global,
2512               nir_address_format_64bit_global);
2513 
2514    if (nir->info.stage == MESA_SHADER_COMPUTE) {
2515       if (!nir->info.shared_memory_explicit_layout) {
2516          NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
2517                     nir_var_mem_shared, shared_type_info);
2518       }
2519       NIR_PASS_V(nir, nir_lower_explicit_io,
2520                  nir_var_mem_shared,
2521                  nir_address_format_32bit_offset);
2522 
2523       if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
2524          const unsigned chunk_size = 16; /* max single store size */
2525          /* Shared memory is allocated in 1024b chunks in HW, but the zero-init
2526           * extension only requires us to initialize the memory that the shader
2527           * is allocated at the API level, and it's up to the user to ensure
2528           * that accesses are limited to those bounds.
2529           */
2530          const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
2531          NIR_PASS_V(nir, nir_zero_initialize_shared_memory, shared_size, chunk_size);
2532       }
2533 
2534       const struct nir_lower_compute_system_values_options compute_sysval_options = {
2535          .has_base_workgroup_id = true,
2536       };
2537       NIR_PASS_V(nir, nir_lower_compute_system_values, &compute_sysval_options);
2538    }
2539 
2540    nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
2541    nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);
2542 
2543    /* Gather information for transform feedback. This should be called after:
2544     * - nir_split_per_member_structs.
2545     * - nir_remove_dead_variables with varyings, so that we can align
2546     *   stream outputs correctly.
2547     * - nir_assign_io_var_locations - to have valid driver_location
2548     */
2549    struct ir3_stream_output_info so_info = {};
2550    if (nir->info.stage == MESA_SHADER_VERTEX ||
2551          nir->info.stage == MESA_SHADER_TESS_EVAL ||
2552          nir->info.stage == MESA_SHADER_GEOMETRY)
2553       tu_gather_xfb_info(nir, &so_info);
2554 
2555    for (unsigned i = 0; i < layout->num_sets; i++) {
2556       if (layout->set[i].layout) {
2557          shader->dynamic_descriptor_sizes[i] =
2558             layout->set[i].layout->dynamic_offset_size;
2559       } else {
2560          shader->dynamic_descriptor_sizes[i] = -1;
2561       }
2562    }
2563 
2564    {
2565       /* Lower 64b push constants before lowering IO. */
2566       nir_lower_mem_access_bit_sizes_options options = {
2567          .callback = ir3_mem_access_size_align,
2568          .modes = nir_var_mem_push_const,
2569       };
2570 
2571       NIR_PASS_V(nir, nir_lower_mem_access_bit_sizes, &options);
2572    }
2573 
2574    struct ir3_const_allocations const_allocs = {};
2575    NIR_PASS_V(nir, tu_lower_io, dev, shader, layout,
2576               key->read_only_input_attachments, key->dynamic_renderpass,
2577               &const_allocs);
2578 
2579    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
2580 
2581    struct ir3_shader_nir_options nir_options;
2582    init_ir3_nir_options(&nir_options, key);
2583 
2584    ir3_finalize_nir(dev->compiler, &nir_options, nir);
2585 
2586    const struct ir3_shader_options options = {
2587       .api_wavesize = key->api_wavesize,
2588       .real_wavesize = key->real_wavesize,
2589       .push_consts_type = shader->const_state.push_consts.type,
2590       .push_consts_base = shader->const_state.push_consts.lo_dwords,
2591       .push_consts_dwords = shader->const_state.push_consts.dwords,
2592       .const_allocs = const_allocs,
2593       .nir_options = nir_options,
2594    };
2595 
2596    struct ir3_shader *ir3_shader =
2597       ir3_shader_from_nir(dev->compiler, nir, &options, &so_info);
2598 
2599    shader->variant =
2600       ir3_shader_create_variant(ir3_shader, ir3_key, executable_info);
2601 
2602    if (ir3_exceeds_safe_constlen(shader->variant)) {
2603       struct ir3_shader_key safe_constlen_key = *ir3_key;
2604       safe_constlen_key.safe_constlen = true;
2605       shader->safe_const_variant =
2606          ir3_shader_create_variant(ir3_shader, &safe_constlen_key,
2607                                    executable_info);
2608    }
2609 
2610    ir3_shader_destroy(ir3_shader);
2611 
2612    shader->view_mask = key->multiview_mask;
2613 
2614    switch (shader->variant->type) {
2615    case MESA_SHADER_TESS_EVAL: {
2616       const struct ir3_shader_variant *tes = shader->variant;
2617       if (tes->tess.point_mode) {
2618          shader->tes.tess_output_lower_left =
2619             shader->tes.tess_output_upper_left = TESS_POINTS;
2620       } else if (tes->tess.primitive_mode == TESS_PRIMITIVE_ISOLINES) {
2621          shader->tes.tess_output_lower_left =
2622             shader->tes.tess_output_upper_left = TESS_LINES;
2623       } else if (tes->tess.ccw) {
2624          /* Tessellation orientation in HW is specified with a lower-left
2625           * origin, so we need to swap the windings when the origin is upper-left.
2626           */
2627          shader->tes.tess_output_lower_left = TESS_CCW_TRIS;
2628          shader->tes.tess_output_upper_left = TESS_CW_TRIS;
2629       } else {
2630          shader->tes.tess_output_lower_left = TESS_CW_TRIS;
2631          shader->tes.tess_output_upper_left = TESS_CCW_TRIS;
2632       }
2633 
2634       switch (tes->tess.spacing) {
2635       case TESS_SPACING_EQUAL:
2636          shader->tes.tess_spacing = TESS_EQUAL;
2637          break;
2638       case TESS_SPACING_FRACTIONAL_ODD:
2639          shader->tes.tess_spacing = TESS_FRACTIONAL_ODD;
2640          break;
2641       case TESS_SPACING_FRACTIONAL_EVEN:
2642          shader->tes.tess_spacing = TESS_FRACTIONAL_EVEN;
2643          break;
2644       case TESS_SPACING_UNSPECIFIED:
2645       default:
2646          unreachable("invalid tess spacing");
2647       }
2648 
2649       break;
2650    }
2651    case MESA_SHADER_FRAGMENT: {
2652       const struct ir3_shader_variant *fs = shader->variant;
2653       shader->fs.per_samp = fs->per_samp || ir3_key->sample_shading;
2654       shader->fs.has_fdm = key->fragment_density_map;
2655       if (fs->has_kill)
2656          shader->fs.lrz.status |= TU_LRZ_FORCE_DISABLE_WRITE;
2657       if (fs->no_earlyz || (fs->writes_pos && !fs->fs.early_fragment_tests))
2658          shader->fs.lrz.status = TU_LRZ_FORCE_DISABLE_LRZ;
2659       /* FDM isn't compatible with LRZ, because the LRZ image uses the original
2660        * resolution and we would need to use the low resolution.
2661        *
2662        * TODO: Use a patchpoint to only disable LRZ for scaled bins.
2663        */
2664       if (key->fragment_density_map)
2665          shader->fs.lrz.status = TU_LRZ_FORCE_DISABLE_LRZ;
2666       if (!fs->fs.early_fragment_tests &&
2667           (fs->no_earlyz || fs->writes_pos || fs->writes_stencilref || fs->writes_smask)) {
2668          shader->fs.lrz.force_late_z = true;
2669       }
2670       break;
2671    }
2672    default:
2673       break;
2674    }
2675 
2676    VkResult result = tu_upload_shader(dev, shader);
2677    if (result != VK_SUCCESS) {
2678       vk_free(&dev->vk.alloc, shader);
2679       return result;
2680    }
2681 
2682    *shader_out = shader;
2683    return VK_SUCCESS;
2684 }
2685 
2686 static void
2687 tu_link_shaders(nir_shader **shaders, unsigned shaders_count)
2688 {
2689    nir_shader *consumer = NULL;
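   /* Walk the stages from last to first so each producer is linked against a
    * consumer that has already been processed.
    */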
2690    for (gl_shader_stage stage = (gl_shader_stage) (shaders_count - 1);
2691         stage >= MESA_SHADER_VERTEX; stage = (gl_shader_stage) (stage - 1)) {
2692       if (!shaders[stage])
2693          continue;
2694 
2695       nir_shader *producer = shaders[stage];
2696       if (!consumer) {
2697          consumer = producer;
2698          continue;
2699       }
2700 
2701       if (nir_link_opt_varyings(producer, consumer)) {
2702          NIR_PASS_V(consumer, nir_opt_constant_folding);
2703          NIR_PASS_V(consumer, nir_opt_algebraic);
2704          NIR_PASS_V(consumer, nir_opt_dce);
2705       }
2706 
2707       const nir_remove_dead_variables_options out_var_opts = {
2708          .can_remove_var = nir_vk_is_not_xfb_output,
2709       };
2710       NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, &out_var_opts);
2711 
2712       NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
2713 
2714       bool progress = nir_remove_unused_varyings(producer, consumer);
2715 
2716       nir_compact_varyings(producer, consumer, true);
2717       if (progress) {
2718          if (nir_lower_global_vars_to_local(producer)) {
2719             /* Remove dead writes, which can remove input loads */
2720             NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_temp, NULL);
2721             NIR_PASS_V(producer, nir_opt_dce);
2722          }
2723          nir_lower_global_vars_to_local(consumer);
2724       }
2725 
2726       consumer = producer;
2727    }
2728 
2729    /* Gather info after linking so that we can fill out the ir3 shader key.
2730     */
2731    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2732         stage <= MESA_SHADER_FRAGMENT; stage = (gl_shader_stage) (stage + 1)) {
2733       if (shaders[stage])
2734          nir_shader_gather_info(shaders[stage],
2735                                 nir_shader_get_entrypoint(shaders[stage]));
2736    }
2737 }
2738 
2739 static uint32_t
2740 tu6_get_tessmode(const struct nir_shader *shader)
2741 {
2742    enum tess_primitive_mode primitive_mode = shader->info.tess._primitive_mode;
2743    switch (primitive_mode) {
2744    case TESS_PRIMITIVE_ISOLINES:
2745       return IR3_TESS_ISOLINES;
2746    case TESS_PRIMITIVE_TRIANGLES:
2747       return IR3_TESS_TRIANGLES;
2748    case TESS_PRIMITIVE_QUADS:
2749       return IR3_TESS_QUADS;
2750    case TESS_PRIMITIVE_UNSPECIFIED:
2751       return IR3_TESS_NONE;
2752    default:
2753       unreachable("bad tessmode");
2754    }
2755 }
2756 
2757 VkResult
2758 tu_compile_shaders(struct tu_device *device,
2759                    VkPipelineCreateFlags2KHR pipeline_flags,
2760                    const VkPipelineShaderStageCreateInfo **stage_infos,
2761                    nir_shader **nir,
2762                    const struct tu_shader_key *keys,
2763                    struct tu_pipeline_layout *layout,
2764                    const unsigned char *pipeline_sha1,
2765                    struct tu_shader **shaders,
2766                    char **nir_initial_disasm,
2767                    void *nir_initial_disasm_mem_ctx,
2768                    nir_shader **nir_out,
2769                    VkPipelineCreationFeedback *stage_feedbacks)
2770 {
2771    struct ir3_shader_key ir3_key = {};
2772    VkResult result = VK_SUCCESS;
2773    void *mem_ctx = ralloc_context(NULL);
2774 
2775    for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2776         stage = (gl_shader_stage) (stage + 1)) {
2777       const VkPipelineShaderStageCreateInfo *stage_info = stage_infos[stage];
2778       if (!stage_info)
2779          continue;
2780 
2781       int64_t stage_start = os_time_get_nano();
2782 
2783       nir[stage] = tu_spirv_to_nir(device, mem_ctx, pipeline_flags,
2784                                    stage_info, &keys[stage], stage);
2785       if (!nir[stage]) {
2786          result = VK_ERROR_OUT_OF_HOST_MEMORY;
2787          goto fail;
2788       }
2789 
2790       stage_feedbacks[stage].flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
2791       stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
2792    }
2793 
2794    if (nir[MESA_SHADER_GEOMETRY])
2795       ir3_key.has_gs = true;
2796 
2797    ir3_key.sample_shading = keys[MESA_SHADER_FRAGMENT].force_sample_interp;
2798 
2799    if (nir_initial_disasm) {
2800       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2801            stage < MESA_SHADER_STAGES;
2802            stage = (gl_shader_stage) (stage + 1)) {
2803          if (!nir[stage])
2804             continue;
2805 
2806          nir_initial_disasm[stage] =
2807             nir_shader_as_str(nir[stage], nir_initial_disasm_mem_ctx);
2808       }
2809    }
2810 
2811    tu_link_shaders(nir, MESA_SHADER_STAGES);
2812 
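   /* Hand the caller a clone of the post-link NIR before the per-stage
    * lowering done in tu_shader_create modifies it further.
    */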
2813    if (nir_out) {
2814       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2815            stage < MESA_SHADER_STAGES; stage = (gl_shader_stage) (stage + 1)) {
2816          if (!nir[stage])
2817             continue;
2818 
2819          nir_out[stage] = nir_shader_clone(NULL, nir[stage]);
2820       }
2821    }
2822 
2823    /* With pipelines, tessellation modes can be set on either shader, for
2824     * compatibility with HLSL and GLSL, and the driver is supposed to merge
2825     * them. Shader objects require modes to be set on at least the TES, except
2826     * for OutputVertices which has to be set at least on the TCS. Make sure
2827     * all modes are set on the TES when compiling multiple shaders together;
2828     * from this point on we use the modes on the TES (and output vertices on
2829     * the TCS).
2830     */
2831    if (nir[MESA_SHADER_TESS_EVAL]) {
2832       nir_shader *tcs = nir[MESA_SHADER_TESS_CTRL];
2833       nir_shader *tes = nir[MESA_SHADER_TESS_EVAL];
2834 
2835       if (tes->info.tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED)
2836          tes->info.tess._primitive_mode = tcs->info.tess._primitive_mode;
2837 
2838       tes->info.tess.point_mode |= tcs->info.tess.point_mode;
2839       tes->info.tess.ccw |= tcs->info.tess.ccw;
2840 
2841       if (tes->info.tess.spacing == TESS_SPACING_UNSPECIFIED) {
2842          tes->info.tess.spacing = tcs->info.tess.spacing;
2843       }
2844 
2845       if (tcs->info.tess.tcs_vertices_out == 0)
2846          tcs->info.tess.tcs_vertices_out = tes->info.tess.tcs_vertices_out;
2847 
2848       ir3_key.tessellation = tu6_get_tessmode(tes);
2849    }
2850 
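   /* If any stage after the TCS reads gl_PrimitiveID, the TCS has to store it. */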
2851    for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2852         stage = (gl_shader_stage) (stage + 1)) {
2853       if (!nir[stage])
2854          continue;
2855 
2856       if (stage > MESA_SHADER_TESS_CTRL) {
2857          if (stage == MESA_SHADER_FRAGMENT) {
2858             ir3_key.tcs_store_primid = ir3_key.tcs_store_primid ||
2859                (nir[stage]->info.inputs_read & (1ull << VARYING_SLOT_PRIMITIVE_ID));
2860          } else {
2861             ir3_key.tcs_store_primid = ir3_key.tcs_store_primid ||
2862                BITSET_TEST(nir[stage]->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
2863          }
2864       }
2865    }
2866 
2867    /* In the tess-but-not-FS case we don't know whether the FS will read
2868     * PrimID, so we need to store it unconditionally.
2869     */
2870    if (nir[MESA_SHADER_TESS_CTRL] && !nir[MESA_SHADER_FRAGMENT])
2871       ir3_key.tcs_store_primid = true;
2872 
2873    for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2874         stage = (gl_shader_stage) (stage + 1)) {
2875       if (!nir[stage] || shaders[stage])
2876          continue;
2877 
2878       int64_t stage_start = os_time_get_nano();
2879 
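      /* Per-stage cache key: the pipeline hash with a trailing byte that
       * identifies the stage.
       */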
2880       unsigned char shader_sha1[21];
2881       memcpy(shader_sha1, pipeline_sha1, 20);
2882       shader_sha1[20] = (unsigned char) stage;
2883 
2884       result = tu_shader_create(device,
2885                                 &shaders[stage], nir[stage], &keys[stage],
2886                                 &ir3_key, shader_sha1, sizeof(shader_sha1),
2887                                 layout, !!nir_initial_disasm);
2888       if (result != VK_SUCCESS) {
2889          goto fail;
2890       }
2891 
2892       stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
2893    }
2894 
2895    ralloc_free(mem_ctx);
2896 
2897    return VK_SUCCESS;
2898 
2899 fail:
2900    ralloc_free(mem_ctx);
2901 
2902    for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2903         stage = (gl_shader_stage) (stage + 1)) {
2904       if (shaders[stage]) {
2905          tu_shader_destroy(device, shaders[stage]);
2906       }
2907       if (nir_out && nir_out[stage]) {
2908          ralloc_free(nir_out[stage]);
2909       }
2910    }
2911 
2912    return result;
2913 }
2914 
2915 void
2916 tu_shader_key_subgroup_size(struct tu_shader_key *key,
2917                             bool allow_varying_subgroup_size,
2918                             bool require_full_subgroups,
2919                             const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info,
2920                             struct tu_device *dev)
2921 {
2922    enum ir3_wavesize_option api_wavesize, real_wavesize;
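   /* Without support for double threadsize, waves can only run at the base
    * size.
    */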
2923    if (!dev->physical_device->info->a6xx.supports_double_threadsize) {
2924       api_wavesize = IR3_SINGLE_ONLY;
2925       real_wavesize = IR3_SINGLE_ONLY;
2926    } else {
2927       if (allow_varying_subgroup_size) {
2928          api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE;
2929       } else {
2930          if (subgroup_info) {
2931             if (subgroup_info->requiredSubgroupSize == dev->compiler->threadsize_base) {
2932                api_wavesize = IR3_SINGLE_ONLY;
2933             } else {
2934                assert(subgroup_info->requiredSubgroupSize == dev->compiler->threadsize_base * 2);
2935                api_wavesize = IR3_DOUBLE_ONLY;
2936             }
2937          } else {
2938             /* Match the exposed subgroupSize. */
2939             api_wavesize = IR3_DOUBLE_ONLY;
2940          }
2941 
2942          if (require_full_subgroups)
2943             real_wavesize = api_wavesize;
2944          else if (api_wavesize == IR3_SINGLE_ONLY)
2945             real_wavesize = IR3_SINGLE_ONLY;
2946          else
2947             real_wavesize = IR3_SINGLE_OR_DOUBLE;
2948       }
2949    }
2950 
2951    key->api_wavesize = api_wavesize;
2952    key->real_wavesize = real_wavesize;
2953 }
2954 
2955 void
2956 tu_shader_key_robustness(struct tu_shader_key *key,
2957                          const struct vk_pipeline_robustness_state *rs)
2958 {
2959    key->robust_storage_access2 =
2960       (rs->storage_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT);
2961    key->robust_uniform_access2 =
2962       (rs->uniform_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT);
2963 }
2964 
2965 static VkResult
2966 tu_empty_shader_create(struct tu_device *dev,
2967                        struct tu_shader **shader_out,
2968                        gl_shader_stage stage)
2969 {
2970    struct tu_shader *shader = tu_shader_init(dev, NULL, 0);
2971 
2972    if (!shader)
2973       return VK_ERROR_OUT_OF_HOST_MEMORY;
2974 
2975    pthread_mutex_lock(&dev->pipeline_mutex);
2976    VkResult result = tu_suballoc_bo_alloc(&shader->bo, &dev->pipeline_suballoc,
2977                                           32 * 4, 128);
2978    pthread_mutex_unlock(&dev->pipeline_mutex);
2979 
2980    if (result != VK_SUCCESS) {
2981       vk_free(&dev->vk.alloc, shader);
2982       return result;
2983    }
2984 
2985    TU_RMV(cmd_buffer_suballoc_bo_create, dev, &shader->bo);
2986    tu_cs_init_suballoc(&shader->cs, dev, &shader->bo);
2987 
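   /* Emit the stage's draw state with a NULL variant, programming the stage
    * as empty/disabled.
    */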
2988    struct tu_pvtmem_config pvtmem_config = { };
2989 
2990    struct tu_cs sub_cs;
2991    tu_cs_begin_sub_stream(&shader->cs, 32, &sub_cs);
2992    TU_CALLX(dev, tu6_emit_variant)(&sub_cs, stage, NULL, &pvtmem_config, 0, 0);
2993    shader->state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2994 
2995    *shader_out = shader;
2996    return VK_SUCCESS;
2997 }
2998 
2999 static VkResult
3000 tu_empty_fs_create(struct tu_device *dev, struct tu_shader **shader,
3001                    bool fragment_density_map)
3002 {
3003    struct ir3_shader_key key = {};
3004    const struct ir3_shader_options options = {};
3005    struct ir3_stream_output_info so_info = {};
3006    const nir_shader_compiler_options *nir_options =
3007       ir3_get_compiler_options(dev->compiler);
3008    nir_builder fs_b;
3009 
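   /* Build a no-op fragment shader with nir_builder and compile it through
    * the regular ir3 path below.
    */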
3010    fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, nir_options,
3011                                          "noop_fs");
3012 
3013    *shader = tu_shader_init(dev, NULL, 0);
3014    if (!*shader)
3015       return VK_ERROR_OUT_OF_HOST_MEMORY;
3016 
3017    (*shader)->fs.has_fdm = fragment_density_map;
3018    if (fragment_density_map)
3019       (*shader)->fs.lrz.status = TU_LRZ_FORCE_DISABLE_LRZ;
3020 
3021    for (unsigned i = 0; i < MAX_SETS; i++)
3022       (*shader)->dynamic_descriptor_sizes[i] = -1;
3023 
3024    struct ir3_shader *ir3_shader =
3025       ir3_shader_from_nir(dev->compiler, fs_b.shader, &options, &so_info);
3026    (*shader)->variant = ir3_shader_create_variant(ir3_shader, &key, false);
3027    ir3_shader_destroy(ir3_shader);
3028 
3029    return tu_upload_shader(dev, *shader);
3030 }
3031 
3032 VkResult
3033 tu_init_empty_shaders(struct tu_device *dev)
3034 {
3035    VkResult result;
3036 
3037    result = tu_empty_shader_create(dev, &dev->empty_tcs, MESA_SHADER_TESS_CTRL);
3038    if (result != VK_SUCCESS)
3039       goto out;
3040 
3041    result = tu_empty_shader_create(dev, &dev->empty_tes, MESA_SHADER_TESS_EVAL);
3042    if (result != VK_SUCCESS)
3043       goto out;
3044 
3045    result = tu_empty_shader_create(dev, &dev->empty_gs, MESA_SHADER_GEOMETRY);
3046    if (result != VK_SUCCESS)
3047       goto out;
3048 
3049    result = tu_empty_fs_create(dev, &dev->empty_fs, false);
3050    if (result != VK_SUCCESS)
3051       goto out;
3052 
3053    result = tu_empty_fs_create(dev, &dev->empty_fs_fdm, true);
3054    if (result != VK_SUCCESS)
3055       goto out;
3056 
3057    return VK_SUCCESS;
3058 
3059 out:
3060    if (dev->empty_tcs)
3061       vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tcs->base);
3062    if (dev->empty_tes)
3063       vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tes->base);
3064    if (dev->empty_gs)
3065       vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_gs->base);
3066    if (dev->empty_fs)
3067       vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs->base);
3068    if (dev->empty_fs_fdm)
3069       vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs_fdm->base);
3070    return result;
3071 }
3072 
3073 void
3074 tu_destroy_empty_shaders(struct tu_device *dev)
3075 {
3076    vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tcs->base);
3077    vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tes->base);
3078    vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_gs->base);
3079    vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs->base);
3080    vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs_fdm->base);
3081 }
3082 
3083 void
3084 tu_shader_destroy(struct tu_device *dev,
3085                   struct tu_shader *shader)
3086 {
3087    tu_cs_finish(&shader->cs);
3088    TU_RMV(resource_destroy, dev, &shader->bo);
3089 
3090    pthread_mutex_lock(&dev->pipeline_mutex);
3091    tu_suballoc_bo_free(&dev->pipeline_suballoc, &shader->bo);
3092    pthread_mutex_unlock(&dev->pipeline_mutex);
3093 
3094    if (shader->pvtmem_bo)
3095       tu_bo_finish(dev, shader->pvtmem_bo);
3096 
3097    if (shader->variant)
3098       ralloc_free((void *)shader->variant);
3099    if (shader->safe_const_variant)
3100       ralloc_free((void *)shader->safe_const_variant);
3101 
3102    vk_free(&dev->vk.alloc, shader);
3103 }
3104