1 /*
2  * Copyright © 2019 Google LLC
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "tu_shader.h"
7 
8 #include "spirv/nir_spirv.h"
9 #include "util/mesa-sha1.h"
10 #include "nir/nir_xfb_info.h"
11 #include "vk_nir.h"
12 #include "vk_nir_convert_ycbcr.h"
13 #include "vk_pipeline.h"
14 #include "vk_util.h"
15 
16 #include "ir3/ir3_compiler.h"
17 #include "ir3/ir3_nir.h"
18 
19 #include "tu_device.h"
20 #include "tu_descriptor_set.h"
21 #include "tu_lrz.h"
22 #include "tu_pipeline.h"
23 #include "tu_rmv.h"
24 
25 #include <initializer_list>
26 
27 static void
28 init_ir3_nir_options(struct ir3_shader_nir_options *options,
29                      const struct tu_shader_key *key)
30 {
31    *options = {
32       .robust_modes = (nir_variable_mode)
33          ((key->robust_storage_access2 ? nir_var_mem_ssbo : 0) |
34           (key->robust_uniform_access2 ? nir_var_mem_ubo : 0)),
35    };
36 }
37 
38 nir_shader *
39 tu_spirv_to_nir(struct tu_device *dev,
40                 void *mem_ctx,
41                 VkPipelineCreateFlags2KHR pipeline_flags,
42                 const VkPipelineShaderStageCreateInfo *stage_info,
43                 const struct tu_shader_key *key,
44                 gl_shader_stage stage)
45 {
46    /* TODO these are made-up */
47    const struct spirv_to_nir_options spirv_options = {
48       /* ViewID is a sysval in geometry stages and an input in the FS */
49       .view_index_is_input =
50          stage == MESA_SHADER_FRAGMENT &&
51          !key->lower_view_index_to_device_index,
52 
53       /* Use 16-bit math for RelaxedPrecision ALU ops */
54       .mediump_16bit_alu = true,
55 
56       .ubo_addr_format = nir_address_format_vec2_index_32bit_offset,
57       .ssbo_addr_format = nir_address_format_vec2_index_32bit_offset,
58 
59       /* Accessed via stg/ldg */
60       .phys_ssbo_addr_format = nir_address_format_64bit_global,
61 
62       /* Accessed via the const register file */
63       .push_const_addr_format = nir_address_format_logical,
64 
65       /* Accessed via ldl/stl */
66       .shared_addr_format = nir_address_format_32bit_offset,
67 
68       /* Accessed via stg/ldg (not used with Vulkan?) */
69       .global_addr_format = nir_address_format_64bit_global,
70    };
71 
72    const nir_shader_compiler_options *nir_options =
73       ir3_get_compiler_options(dev->compiler);
74 
75    nir_shader *nir;
76    VkResult result =
77       vk_pipeline_shader_stage_to_nir(&dev->vk, pipeline_flags, stage_info,
78                                       &spirv_options, nir_options,
79                                       mem_ctx, &nir);
80    if (result != VK_SUCCESS)
81       return NULL;
82 
83    /* ir3 uses num_ubos and num_ssbos to track the number of *bindful*
84     * UBOs/SSBOs, but spirv_to_nir sets them to the total number of objects
85     * which is useless for us, so reset them here.
86     */
87    nir->info.num_ubos = 0;
88    nir->info.num_ssbos = 0;
89 
90    if (TU_DEBUG(NIR)) {
91       fprintf(stderr, "translated nir:\n");
92       nir_print_shader(nir, stderr);
93    }
94 
95    const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = {
96       .point_coord = true,
97    };
98    NIR_PASS_V(nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings);
99 
100    NIR_PASS_V(nir, nir_lower_global_vars_to_local);
101 
102    /* Older glslang missing bf6efd0316d8 ("SPV: Fix #2293: keep relaxed
103     * precision on arg passed to relaxed param") will pass function args through
104     * a highp temporary, so we need the nir_opt_find_array_copies() and a copy
105     * prop before we lower mediump vars, or you'll be unable to optimize out
106     * array copies after lowering.  We do this before splitting copies, since
107     * that works against nir_opt_find_array_copies().
108     */
109    NIR_PASS_V(nir, nir_opt_find_array_copies);
110    NIR_PASS_V(nir, nir_opt_copy_prop_vars);
111    NIR_PASS_V(nir, nir_opt_dce);
112 
113    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
114 
115    if (nir->info.ray_queries > 0) {
116       NIR_PASS(_, nir, nir_opt_ray_queries);
117       NIR_PASS(_, nir, nir_opt_ray_query_ranges);
118       NIR_PASS(_, nir, tu_nir_lower_ray_queries);
119    }
120 
121    NIR_PASS_V(nir, nir_split_var_copies);
122    NIR_PASS_V(nir, nir_lower_var_copies);
123 
124    NIR_PASS_V(nir, nir_lower_mediump_vars, nir_var_function_temp | nir_var_shader_temp | nir_var_mem_shared);
125    NIR_PASS_V(nir, nir_opt_copy_prop_vars);
126    NIR_PASS_V(nir, nir_opt_combine_stores, nir_var_all);
127 
128    NIR_PASS_V(nir, nir_lower_system_values);
129    NIR_PASS_V(nir, nir_lower_is_helper_invocation);
130 
131    if (key->lower_view_index_to_device_index)
132       NIR_PASS_V(nir, nir_lower_view_index_to_device_index);
133 
134    struct ir3_shader_nir_options options;
135    init_ir3_nir_options(&options, key);
136    ir3_optimize_loop(dev->compiler, &options, nir);
137 
138    NIR_PASS_V(nir, nir_opt_conditional_discard);
139 
140    return nir;
141 }
142 
143 static void
144 lower_load_push_constant(struct tu_device *dev,
145                          nir_builder *b,
146                          nir_intrinsic_instr *instr,
147                          struct tu_shader *shader,
148                          const struct tu_pipeline_layout *layout,
149                          uint32_t push_consts_offset_vec4)
150 {
151    uint32_t base = nir_intrinsic_base(instr);
152    assert(base % 4 == 0);
153 
154    if (tu6_shared_constants_enable(layout, dev->compiler)) {
155       /* All stages share the same range.  We could potentially add
156        * push_constant_offset to layout and apply it, but this is good for
157        * now.
158        */
159       base += dev->compiler->shared_consts_base_offset * 4;
160    } else {
161       assert(base >= shader->const_state.push_consts.lo_dwords);
162       base -= shader->const_state.push_consts.lo_dwords;
163       base += push_consts_offset_vec4 * 4;
164    }
165 
166    nir_def *load =
167       nir_load_const_ir3(b, instr->num_components, instr->def.bit_size,
168                          nir_ushr_imm(b, instr->src[0].ssa, 2), .base = base);
169 
170    nir_def_replace(&instr->def, load);
171 }
172 
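/* A lowered Vulkan resource index is a vec3 of
 * (descriptor set, offset of the descriptor within the set, log2(stride)),
 * where offsets and strides are measured in descriptor-sized
 * ((4 * A6XX_TEX_CONST_DWORDS)-byte) units.  lower_vulkan_resource_reindex(),
 * lower_load_vulkan_descriptor() and lower_ssbo_ubo_intrinsic() below all
 * consume this encoding.
 */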
173 static void
174 lower_vulkan_resource_index(struct tu_device *dev, nir_builder *b,
175                             nir_intrinsic_instr *instr,
176                             struct tu_shader *shader,
177                             const struct tu_pipeline_layout *layout)
178 {
179    struct ir3_compiler *compiler = dev->compiler;
180    nir_def *vulkan_idx = instr->src[0].ssa;
181 
182    unsigned set = nir_intrinsic_desc_set(instr);
183    unsigned binding = nir_intrinsic_binding(instr);
184    struct tu_descriptor_set_layout *set_layout = layout->set[set].layout;
185    struct tu_descriptor_set_binding_layout *binding_layout =
186       &set_layout->binding[binding];
187    nir_def *base;
188 
189    if (binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
190       return;
191 
192    shader->active_desc_sets |= 1u << set;
193 
194    if (vk_descriptor_type_is_dynamic(binding_layout->type)) {
195       int offset = 0;
196       for (unsigned i = 0; i < set; i++) {
197          if (shader->dynamic_descriptor_sizes[i] >= 0) {
198             offset += shader->dynamic_descriptor_sizes[i];
199          } else {
200             offset = -1;
201             break;
202          }
203       }
204 
205       if (offset < 0) {
206          /* With independent sets, we don't know
207           * layout->set[set].dynamic_offset_start until after link time, which
208           * with fast linking means after the shader is compiled. We have to
209           * get it from the const file instead.
210           */
211          base = nir_imm_int(b, binding_layout->dynamic_offset_offset / (4 * A6XX_TEX_CONST_DWORDS));
212          nir_def *dynamic_offset_start;
213          if (compiler->load_shader_consts_via_preamble) {
214             dynamic_offset_start =
215                ir3_load_driver_ubo(b, 1, &shader->const_state.dynamic_offsets_ubo, set);
216          } else {
217             dynamic_offset_start = nir_load_const_ir3(
218                b, 1, 32, nir_imm_int(b, 0),
219                .base = shader->const_state.dynamic_offset_loc + set);
220          }
221          base = nir_iadd(b, base, dynamic_offset_start);
222       } else {
223          base = nir_imm_int(b, (offset +
224             binding_layout->dynamic_offset_offset) / (4 * A6XX_TEX_CONST_DWORDS));
225       }
226       assert(dev->physical_device->reserved_set_idx >= 0);
227       set = dev->physical_device->reserved_set_idx;
228    } else
229       base = nir_imm_int(b, binding_layout->offset / (4 * A6XX_TEX_CONST_DWORDS));
230 
231    unsigned stride = binding_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
232    assert(util_is_power_of_two_nonzero(stride));
233    nir_def *shift = nir_imm_int(b, util_logbase2(stride));
234 
235    nir_def *def = nir_vec3(b, nir_imm_int(b, set),
236                                nir_iadd(b, base,
237                                         nir_ishl(b, vulkan_idx, shift)),
238                                shift);
239 
240    nir_def_replace(&instr->def, def);
241 }
242 
243 static void
244 lower_vulkan_resource_reindex(nir_builder *b, nir_intrinsic_instr *instr)
245 {
246    nir_def *old_index = instr->src[0].ssa;
247    nir_def *delta = instr->src[1].ssa;
248    nir_def *shift = nir_channel(b, old_index, 2);
249 
250    nir_def *new_index =
251       nir_vec3(b, nir_channel(b, old_index, 0),
252                nir_iadd(b, nir_channel(b, old_index, 1),
253                         nir_ishl(b, delta, shift)),
254                shift);
255 
256    nir_def_replace(&instr->def, new_index);
257 }
258 
259 static void
260 lower_load_vulkan_descriptor(nir_builder *b, nir_intrinsic_instr *intrin)
261 {
262    nir_def *old_index = intrin->src[0].ssa;
263    /* Loading the descriptor happens as part of the load/store instruction so
264     * this is a no-op. We just need to turn the shift into an offset of 0.
265     */
266    nir_def *new_index =
267       nir_vec3(b, nir_channel(b, old_index, 0),
268                nir_channel(b, old_index, 1),
269                nir_imm_int(b, 0));
270    nir_def_replace(&intrin->def, new_index);
271 }
272 
273 static bool
274 lower_ssbo_ubo_intrinsic(struct tu_device *dev,
275                          nir_builder *b, nir_intrinsic_instr *intrin)
276 {
277    const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic];
278 
279    /* The bindless base is part of the instruction, which means that part of
280     * the "pointer" has to be constant. We solve this in the same way the blob
281     * does, by generating a bunch of if-statements. In the usual case where
282     * the descriptor set is constant, we can skip that, though.
283     */
284 
285    unsigned buffer_src;
286    if (intrin->intrinsic == nir_intrinsic_store_ssbo) {
287       /* This has the value first */
288       buffer_src = 1;
289    } else {
290       buffer_src = 0;
291    }
292 
293    /* Don't lower non-bindless UBO loads of driver params */
294    if (intrin->src[buffer_src].ssa->num_components == 1)
295       return false;
296 
297    nir_scalar scalar_idx = nir_scalar_resolved(intrin->src[buffer_src].ssa, 0);
298    nir_def *descriptor_idx = nir_channel(b, intrin->src[buffer_src].ssa, 1);
299 
300    if (intrin->intrinsic == nir_intrinsic_load_ubo &&
301        dev->instance->allow_oob_indirect_ubo_loads) {
302       nir_scalar offset = nir_scalar_resolved(intrin->src[1].ssa, 0);
303       if (!nir_scalar_is_const(offset)) {
304          nir_intrinsic_set_range(intrin, ~0);
305       }
306    }
307 
308    /* Descriptor index has to be adjusted in the following cases:
309     *  - isam loads, when the 16-bit descriptor cannot also be used for 32-bit
310     *    loads -- next-index descriptor will be able to do that;
311     *  - 8-bit SSBO loads and stores -- next-index descriptor is dedicated to
312     *    storage accesses of that size.
313     */
314    if ((dev->physical_device->info->a6xx.storage_16bit &&
315         !dev->physical_device->info->a6xx.has_isam_v &&
316         intrin->intrinsic == nir_intrinsic_load_ssbo &&
317         (nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
318         intrin->def.bit_size > 16) ||
319        (dev->physical_device->info->a7xx.storage_8bit &&
320         ((intrin->intrinsic == nir_intrinsic_load_ssbo && intrin->def.bit_size == 8) ||
321          (intrin->intrinsic == nir_intrinsic_store_ssbo && intrin->src[0].ssa->bit_size == 8)))) {
322       descriptor_idx = nir_iadd_imm(b, descriptor_idx, 1);
323    }
324 
325    nir_def *results[MAX_SETS] = { NULL };
326 
327    if (nir_scalar_is_const(scalar_idx)) {
328       nir_def *bindless =
329          nir_bindless_resource_ir3(b, 32, descriptor_idx, .desc_set = nir_scalar_as_uint(scalar_idx));
330       nir_src_rewrite(&intrin->src[buffer_src], bindless);
331       return true;
332    }
333 
334    nir_def *base_idx = nir_channel(b, scalar_idx.def, scalar_idx.comp);
335    for (unsigned i = 0; i < dev->physical_device->info->a6xx.max_sets; i++) {
336       /* if (base_idx == i) { ... */
337       nir_if *nif = nir_push_if(b, nir_ieq_imm(b, base_idx, i));
338 
339       nir_def *bindless =
340          nir_bindless_resource_ir3(b, 32, descriptor_idx, .desc_set = i);
341 
342       nir_intrinsic_instr *copy =
343          nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
344 
345       copy->num_components = intrin->num_components;
346 
347       for (unsigned src = 0; src < info->num_srcs; src++) {
348          if (src == buffer_src)
349             copy->src[src] = nir_src_for_ssa(bindless);
350          else
351             copy->src[src] = nir_src_for_ssa(intrin->src[src].ssa);
352       }
353 
354       for (unsigned idx = 0; idx < info->num_indices; idx++) {
355          copy->const_index[idx] = intrin->const_index[idx];
356       }
357 
358       if (info->has_dest) {
359          nir_def_init(&copy->instr, &copy->def,
360                       intrin->def.num_components,
361                       intrin->def.bit_size);
362          results[i] = &copy->def;
363       }
364 
365       nir_builder_instr_insert(b, &copy->instr);
366 
367       /* } else { ... */
368       nir_push_else(b, nif);
369    }
370 
371    nir_def *result =
372       nir_undef(b, intrin->def.num_components, intrin->def.bit_size);
373    for (int i = dev->physical_device->info->a6xx.max_sets - 1; i >= 0; i--) {
374       nir_pop_if(b, NULL);
375       if (info->has_dest)
376          result = nir_if_phi(b, results[i], result);
377    }
378 
379    if (info->has_dest)
380       nir_def_rewrite_uses(&intrin->def, result);
381    nir_instr_remove(&intrin->instr);
382    return true;
383 }
384 
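/* Returns the value used as the texture/sampler/image handle: either a plain
 * texture index (for the non-bindless input attachment path below) or a
 * bindless_resource_ir3 handle addressing the descriptor within its set.
 */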
385 static nir_def *
386 build_bindless(struct tu_device *dev, nir_builder *b,
387                nir_deref_instr *deref, bool is_sampler,
388                struct tu_shader *shader,
389                const struct tu_pipeline_layout *layout,
390                uint32_t read_only_input_attachments,
391                bool dynamic_renderpass)
392 {
393    nir_variable *var = nir_deref_instr_get_variable(deref);
394 
395    unsigned set = var->data.descriptor_set;
396    unsigned binding = var->data.binding;
397    const struct tu_descriptor_set_binding_layout *bind_layout =
398       &layout->set[set].layout->binding[binding];
399 
400    /* Input attachments use the non-bindless workaround. */
401    if (bind_layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT &&
402        (!dynamic_renderpass ||
403         (var->data.index == NIR_VARIABLE_NO_INDEX ?
404         !(read_only_input_attachments & 0x1) :
405         !(read_only_input_attachments & (1u << (var->data.index + 1))))) &&
406        !TU_DEBUG(DYNAMIC)) {
407       const struct glsl_type *glsl_type = glsl_without_array(var->type);
408       uint32_t idx;
409 
410       /* With dynamic renderpasses, we reserve the first two attachments for
411        * input attachments without an InputAttachmentIndex, which must be for
412        * depth/stencil if they are not read-only, and shift over the rest of
413        * the indices.
414        */
415       if (var->data.index == ~0u) {
416          assert(dynamic_renderpass);
417          idx = 0;
418       } else if (dynamic_renderpass) {
419          idx = (var->data.index + 1) * 2;
420       } else {
421          idx = var->data.index * 2;
422       }
423 
424       /* Record which input attachments are used for tracking feedback loops */
425       if (dynamic_renderpass)
426          shader->fs.dynamic_input_attachments_used |= (1u << (idx / 2));
427 
428       BITSET_SET_RANGE_INSIDE_WORD(b->shader->info.textures_used, idx, (idx + bind_layout->array_size * 2) - 1);
429 
430       /* D24S8 workaround: stencil of D24S8 will be sampled as uint */
431       if (glsl_get_sampler_result_type(glsl_type) == GLSL_TYPE_UINT)
432          idx += 1;
433 
434       if (deref->deref_type == nir_deref_type_var)
435          return nir_imm_int(b, idx);
436 
437       nir_def *arr_index = deref->arr.index.ssa;
438       return nir_iadd_imm(b, nir_imul_imm(b, arr_index, 2), idx);
439    }
440 
441    shader->active_desc_sets |= 1u << set;
442 
443    nir_def *desc_offset;
444    unsigned descriptor_stride;
445    unsigned offset = 0;
446    /* Samplers come second in combined image/sampler descriptors, see
447     * write_combined_image_sampler_descriptor().
448     */
449    if (is_sampler && bind_layout->type ==
450          VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
451       offset = 1;
452    }
453    desc_offset =
454       nir_imm_int(b, (bind_layout->offset / (4 * A6XX_TEX_CONST_DWORDS)) +
455                   offset);
456    descriptor_stride = bind_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
457 
458    if (deref->deref_type != nir_deref_type_var) {
459       assert(deref->deref_type == nir_deref_type_array);
460 
461       nir_def *arr_index = deref->arr.index.ssa;
462       desc_offset = nir_iadd(b, desc_offset,
463                              nir_imul_imm(b, arr_index, descriptor_stride));
464    }
465 
466    return nir_bindless_resource_ir3(b, 32, desc_offset, .desc_set = set);
467 }
468 
469 static void
470 lower_image_deref(struct tu_device *dev, nir_builder *b,
471                   nir_intrinsic_instr *instr, struct tu_shader *shader,
472                   const struct tu_pipeline_layout *layout)
473 {
474    nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
475    nir_def *bindless = build_bindless(dev, b, deref, false, shader, layout, 0, false);
476    nir_rewrite_image_intrinsic(instr, bindless, true);
477 }
478 
479 static bool
480 lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
481                 struct tu_device *dev,
482                 struct tu_shader *shader,
483                 const struct tu_pipeline_layout *layout,
484                 struct ir3_const_allocations *const_allocs)
485 {
486    switch (instr->intrinsic) {
487    case nir_intrinsic_load_push_constant:
488       lower_load_push_constant(
489          dev, b, instr, shader, layout,
490          const_allocs->consts[IR3_CONST_ALLOC_PUSH_CONSTS].offset_vec4);
491       return true;
492 
493    case nir_intrinsic_load_vulkan_descriptor:
494       lower_load_vulkan_descriptor(b, instr);
495       return true;
496 
497    case nir_intrinsic_vulkan_resource_index:
498       lower_vulkan_resource_index(dev, b, instr, shader, layout);
499       return true;
500    case nir_intrinsic_vulkan_resource_reindex:
501       lower_vulkan_resource_reindex(b, instr);
502       return true;
503 
504    case nir_intrinsic_load_ubo:
505    case nir_intrinsic_load_ssbo:
506    case nir_intrinsic_load_uav_ir3:
507    case nir_intrinsic_store_ssbo:
508    case nir_intrinsic_ssbo_atomic:
509    case nir_intrinsic_ssbo_atomic_swap:
510    case nir_intrinsic_get_ssbo_size:
511       return lower_ssbo_ubo_intrinsic(dev, b, instr);
512 
513    case nir_intrinsic_image_deref_load:
514    case nir_intrinsic_image_deref_store:
515    case nir_intrinsic_image_deref_atomic:
516    case nir_intrinsic_image_deref_atomic_swap:
517    case nir_intrinsic_image_deref_size:
518    case nir_intrinsic_image_deref_samples:
519       lower_image_deref(dev, b, instr, shader, layout);
520       return true;
521 
522    case nir_intrinsic_load_frag_size_ir3:
523    case nir_intrinsic_load_frag_offset_ir3: {
524       if (!dev->compiler->load_shader_consts_via_preamble)
525          return false;
526 
527       unsigned param =
528          instr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
529          IR3_DP_FS(frag_size) : IR3_DP_FS(frag_offset);
530 
531       unsigned offset = param - IR3_DP_FS_DYNAMIC;
532 
533       nir_def *view = instr->src[0].ssa;
534       nir_def *result =
535          ir3_load_driver_ubo_indirect(b, 2, &shader->const_state.fdm_ubo,
536                                       offset, view, nir_intrinsic_range(instr));
537 
538       nir_def_replace(&instr->def, result);
539       return true;
540    }
541    case nir_intrinsic_load_frag_invocation_count: {
542       if (!dev->compiler->load_shader_consts_via_preamble)
543          return false;
544 
545       nir_def *result =
546          ir3_load_driver_ubo(b, 1, &shader->const_state.fdm_ubo,
547                              IR3_DP_FS(frag_invocation_count) -
548                              IR3_DP_FS_DYNAMIC);
549 
550       nir_def_replace(&instr->def, result);
551       return true;
552    }
553 
554    default:
555       return false;
556    }
557 }
558 
559 static void
560 lower_tex_ycbcr(const struct tu_pipeline_layout *layout,
561                 nir_builder *builder,
562                 nir_tex_instr *tex)
563 {
564    int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
565    assert(deref_src_idx >= 0);
566    nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src);
567 
568    nir_variable *var = nir_deref_instr_get_variable(deref);
569    const struct tu_descriptor_set_layout *set_layout =
570       layout->set[var->data.descriptor_set].layout;
571    const struct tu_descriptor_set_binding_layout *binding =
572       &set_layout->binding[var->data.binding];
573    const struct vk_ycbcr_conversion_state *ycbcr_samplers =
574       tu_immutable_ycbcr_samplers(set_layout, binding);
575 
576    if (!ycbcr_samplers)
577       return;
578 
579    /* For the following instructions, we don't apply any change */
580    if (tex->op == nir_texop_txs ||
581        tex->op == nir_texop_query_levels ||
582        tex->op == nir_texop_lod)
583       return;
584 
585    assert(tex->texture_index == 0);
586    unsigned array_index = 0;
587    if (deref->deref_type != nir_deref_type_var) {
588       assert(deref->deref_type == nir_deref_type_array);
589       if (!nir_src_is_const(deref->arr.index))
590          return;
591       array_index = nir_src_as_uint(deref->arr.index);
592       array_index = MIN2(array_index, binding->array_size - 1);
593    }
594    const struct vk_ycbcr_conversion_state *ycbcr_sampler = ycbcr_samplers + array_index;
595 
596    if (ycbcr_sampler->ycbcr_model == VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY)
597       return;
598 
599    /* Skip if not actually a YCbCr format.  CtsGraphics, for example, tries to create
600     * YcbcrConversions for RGB formats.
601     */
602    if (!vk_format_get_ycbcr_info(ycbcr_sampler->format))
603       return;
604 
605    builder->cursor = nir_after_instr(&tex->instr);
606 
607    uint8_t bits = vk_format_get_bpc(ycbcr_sampler->format);
608    uint32_t bpcs[3] = {bits, bits, bits}; /* We only support uniform formats */
609    nir_def *result = nir_convert_ycbcr_to_rgb(builder,
610                                               ycbcr_sampler->ycbcr_model,
611                                               ycbcr_sampler->ycbcr_range,
612                                               &tex->def,
613                                               bpcs);
614    nir_def_rewrite_uses_after(&tex->def, result,
615                               result->parent_instr);
616 
617    builder->cursor = nir_before_instr(&tex->instr);
618 }
619 
620 static bool
621 lower_tex(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
622           struct tu_shader *shader, const struct tu_pipeline_layout *layout,
623           uint32_t read_only_input_attachments, bool dynamic_renderpass)
624 {
625    lower_tex_ycbcr(layout, b, tex);
626 
627    int sampler_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
628    if (sampler_src_idx >= 0) {
629       nir_deref_instr *deref = nir_src_as_deref(tex->src[sampler_src_idx].src);
630       nir_def *bindless = build_bindless(dev, b, deref, true, shader, layout,
631                                          read_only_input_attachments,
632                                          dynamic_renderpass);
633       nir_src_rewrite(&tex->src[sampler_src_idx].src, bindless);
634       tex->src[sampler_src_idx].src_type = nir_tex_src_sampler_handle;
635    }
636 
637    int tex_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
638    if (tex_src_idx >= 0) {
639       nir_deref_instr *deref = nir_src_as_deref(tex->src[tex_src_idx].src);
640       nir_def *bindless = build_bindless(dev, b, deref, false, shader, layout,
641                                          read_only_input_attachments,
642                                          dynamic_renderpass);
643       nir_src_rewrite(&tex->src[tex_src_idx].src, bindless);
644       tex->src[tex_src_idx].src_type = nir_tex_src_texture_handle;
645 
646       /* for the input attachment case: */
647       if (bindless->parent_instr->type != nir_instr_type_intrinsic)
648          tex->src[tex_src_idx].src_type = nir_tex_src_texture_offset;
649    }
650 
651    return true;
652 }
653 
654 struct lower_instr_params {
655    struct tu_device *dev;
656    struct tu_shader *shader;
657    const struct tu_pipeline_layout *layout;
658    uint32_t read_only_input_attachments;
659    bool dynamic_renderpass;
660    struct ir3_const_allocations *const_allocs;
661 };
662 
663 static bool
664 lower_instr(nir_builder *b, nir_instr *instr, void *cb_data)
665 {
666    struct lower_instr_params *params = (struct lower_instr_params *) cb_data;
667    b->cursor = nir_before_instr(instr);
668    switch (instr->type) {
669    case nir_instr_type_tex:
670       return lower_tex(b, nir_instr_as_tex(instr), params->dev, params->shader, params->layout,
671                        params->read_only_input_attachments,
672                        params->dynamic_renderpass);
673    case nir_instr_type_intrinsic:
674       return lower_intrinsic(b, nir_instr_as_intrinsic(instr), params->dev,
675                              params->shader, params->layout,
676                              params->const_allocs);
677    default:
678       return false;
679    }
680 }
681 
682 /* Since we always push inline uniforms into constant memory, lower loads of
683  * them to load_uniform which turns into constant memory loads.
684  */
685 static bool
686 lower_inline_ubo(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data)
687 {
688    if (intrin->intrinsic != nir_intrinsic_load_ubo)
689       return false;
690 
691    struct lower_instr_params *params = (struct lower_instr_params *) cb_data;
692    struct tu_shader *shader = params->shader;
693    const struct tu_pipeline_layout *layout = params->layout;
694 
695    nir_binding binding = nir_chase_binding(intrin->src[0]);
696 
697    if (!binding.success)
698       return false;
699 
700    struct tu_descriptor_set_layout *set_layout = layout->set[binding.desc_set].layout;
701    struct tu_descriptor_set_binding_layout *binding_layout =
702       &set_layout->binding[binding.binding];
703 
704    if (binding_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
705       return false;
706 
707    /* lookup the const offset of the inline UBO */
708    struct tu_const_state *const_state = &shader->const_state;
709 
710    unsigned base = UINT_MAX;
711    unsigned range;
712    bool use_load = false;
713    bool use_ldg_k =
714       params->dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
715 
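   /* Find the inline UBO reserved by tu_lower_io().  When use_ldg_k is set,
    * its address is loaded from the inline_uniforms_ubo driver UBO (two dwords
    * per inline UBO, hence base = i * 2); otherwise base is a dword offset
    * into the const file, holding either the pushed address (push_address) or
    * the uniform data itself.
    */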
716    for (unsigned i = 0; i < const_state->num_inline_ubos; i++) {
717       if (const_state->ubos[i].base == binding.desc_set &&
718           const_state->ubos[i].offset == binding_layout->offset) {
719          range = const_state->ubos[i].size_vec4 * 4;
720          if (use_ldg_k) {
721             base = i * 2;
722          } else {
723             use_load = const_state->ubos[i].push_address;
724             base = const_state->ubos[i].const_offset_vec4 * 4;
725          }
726          break;
727       }
728    }
729 
730    if (base == UINT_MAX) {
731       /* Assume we're loading out-of-bounds from a 0-sized inline uniform
732        * filtered out below.
733        */
734       nir_def_rewrite_uses(&intrin->def,
735                                nir_undef(b, intrin->num_components,
736                                              intrin->def.bit_size));
737       return true;
738    }
739 
740    nir_def *offset = intrin->src[1].ssa;
741 
742    b->cursor = nir_before_instr(&intrin->instr);
743    nir_def *val;
744 
745    if (use_load || use_ldg_k) {
746       nir_def *base_addr;
747       if (use_ldg_k) {
748          base_addr = ir3_load_driver_ubo(b, 2,
749                                          &params->shader->const_state.inline_uniforms_ubo,
750                                          base);
751       } else {
752          base_addr =
753             nir_load_const_ir3(b, 2, 32, nir_imm_int(b, 0), .base = base);
754       }
755       val = nir_load_global_ir3(b, intrin->num_components,
756                                 intrin->def.bit_size,
757                                 base_addr, nir_ishr_imm(b, offset, 2),
758                                 .access =
759                                  (enum gl_access_qualifier)(
760                                     (enum gl_access_qualifier)(ACCESS_NON_WRITEABLE | ACCESS_CAN_REORDER) |
761                                     ACCESS_CAN_SPECULATE),
762                                 .align_mul = 16,
763                                 .align_offset = 0,
764                                 .range_base = 0,
765                                 .range = range);
766    } else {
767       val =
768          nir_load_const_ir3(b, intrin->num_components, intrin->def.bit_size,
769                             nir_ishr_imm(b, offset, 2), .base = base);
770    }
771 
772    nir_def_replace(&intrin->def, val);
773    return true;
774 }
775 
776 /* Figure out the range of push constants that we're actually going to push to
777  * the shader, and tell the backend to reserve this range when pushing UBO
778  * constants.
779  */
780 
781 static void
782 gather_push_constants(nir_shader *shader, struct tu_shader *tu_shader)
783 {
784    uint32_t min = UINT32_MAX, max = 0;
785    nir_foreach_function_impl(impl, shader) {
786       nir_foreach_block(block, impl) {
787          nir_foreach_instr_safe(instr, block) {
788             if (instr->type != nir_instr_type_intrinsic)
789                continue;
790 
791             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
792             if (intrin->intrinsic != nir_intrinsic_load_push_constant)
793                continue;
794 
795             uint32_t base = nir_intrinsic_base(intrin);
796             uint32_t range = nir_intrinsic_range(intrin);
797             min = MIN2(min, base);
798             max = MAX2(max, base + range);
800          }
801       }
802    }
803 
804    if (min >= max) {
805       tu_shader->const_state.push_consts = (struct tu_push_constant_range) {};
806       return;
807    }
808 
809    /* CP_LOAD_STATE OFFSET and NUM_UNIT for SHARED_CONSTS are in units of
810     * dwords, while loading regular consts is in units of vec4's.
811     * So we unify on dwords for tu_push_constant_range here, and the correct
812     * unit has to be used again when emitting.
813     *
814     * Note there's an alignment requirement of 16 dwords on OFFSET. Expand
815     * the range and change units accordingly.
816     */
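   /* For example (hypothetical values): min = 20 and max = 100 bytes give
    * lo_dwords = (20 / 4) / 4 * 4 = 4 and
    * dwords = align(100, 16) / 4 - 4 = 24.
    */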
817    tu_shader->const_state.push_consts.lo_dwords += (min / 4) / 4 * 4;
818    tu_shader->const_state.push_consts.dwords =
819       align(max, 16) / 4 - tu_shader->const_state.push_consts.lo_dwords;
820 }
821 
822 static bool
823 shader_uses_push_consts(nir_shader *shader)
824 {
825    nir_foreach_function_impl (impl, shader) {
826       nir_foreach_block (block, impl) {
827          nir_foreach_instr_safe (instr, block) {
828             if (instr->type != nir_instr_type_intrinsic)
829                continue;
830 
831             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
832             if (intrin->intrinsic == nir_intrinsic_load_push_constant)
833                return true;
834          }
835       }
836    }
837    return false;
838 }
839 
840 static bool
841 tu_lower_io(nir_shader *shader, struct tu_device *dev,
842             struct tu_shader *tu_shader,
843             const struct tu_pipeline_layout *layout,
844             uint32_t read_only_input_attachments,
845             bool dynamic_renderpass,
846             struct ir3_const_allocations *const_allocs)
847 {
848    /* Allocate driver params as early as possible as a workaround for the
849     * following case:
850     * - CP_DRAW_INDIRECT_MULTI_1_DST_OFF apparently tries to upload consts
851     *   even when there are 0 instances.
852     * - With zero instances, the draw state for VS constlen is not applied.
853     * - constlen therefore uses a stale value, and if
854     *   CP_DRAW_INDIRECT_MULTI_1_DST_OFF is higher than 0x3f the GPU hangs.
855     *
856     * To avoid relying on undefined behaviour, we always allocate enough space
857     * to upload driver params.
858     */
859    if (shader->info.stage == MESA_SHADER_VERTEX) {
860       uint32_t num_driver_params =
861          ir3_nir_scan_driver_consts(dev->compiler, shader, nullptr);
862       ir3_alloc_driver_params(const_allocs, &num_driver_params, dev->compiler,
863                               shader->info.stage);
864    }
865 
866    struct tu_const_state *const_state = &tu_shader->const_state;
867    const_state->push_consts = (struct tu_push_constant_range) {
868       .lo_dwords = 0,
869       .dwords = layout->push_constant_size / 4,
870       .type = tu_push_consts_type(layout, dev->compiler),
871    };
872 
873    if (const_state->push_consts.type == IR3_PUSH_CONSTS_PER_STAGE) {
874       gather_push_constants(shader, tu_shader);
875    } else if (const_state->push_consts.type ==
876             IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
877       /* Disable pushing constants for this stage if none were loaded in the
878        * shader.  If all stages don't load their declared push constants, as
879        * is often the case under zink, then we could additionally skip
880        * emitting REG_A7XX_HLSQ_SHARED_CONSTS_IMM entirely.
881        */
882       if (!shader_uses_push_consts(shader))
883          const_state->push_consts = (struct tu_push_constant_range) {};
884    }
885 
886    if (const_state->push_consts.type != IR3_PUSH_CONSTS_SHARED) {
887       uint32_t offset_align_vec4 = 1;
888       if (const_state->push_consts.type == IR3_PUSH_CONSTS_PER_STAGE)
889          offset_align_vec4 = dev->compiler->const_upload_unit;
890 
891       unsigned push_consts_vec4 =
892          align(DIV_ROUND_UP(const_state->push_consts.dwords, 4),
893                dev->compiler->const_upload_unit);
894 
895       ir3_const_alloc(const_allocs, IR3_CONST_ALLOC_PUSH_CONSTS,
896                       push_consts_vec4, offset_align_vec4);
897    }
898 
899    bool unknown_dynamic_size = false;
900    bool unknown_dynamic_offset = false;
901    for (unsigned i = 0; i < layout->num_sets; i++) {
902       if (tu_shader->dynamic_descriptor_sizes[i] == -1) {
903          unknown_dynamic_size = true;
904       } else if (unknown_dynamic_size &&
905                  tu_shader->dynamic_descriptor_sizes[i] > 0) {
906          /* If there is an unknown size followed by a known size, then we may
907           * need to dynamically determine the offset when linking.
908           */
909          unknown_dynamic_offset = true;
910       }
911    }
912 
913    if (unknown_dynamic_offset) {
914       const_state->dynamic_offset_loc =
915          const_allocs->max_const_offset_vec4 * 4;
916       assert(dev->physical_device->reserved_set_idx >= 0);
917       ir3_const_alloc(
918          const_allocs, IR3_CONST_ALLOC_DYN_DESCRIPTOR_OFFSET,
919          DIV_ROUND_UP(dev->physical_device->reserved_set_idx, 4), 1);
920    } else {
921       const_state->dynamic_offset_loc = UINT32_MAX;
922    }
923 
924    /* Reserve space for inline uniforms, so we can always load them from
925     * constants and not set up a UBO descriptor for them.
926     */
927    size_t ldgk_consts = 0;
928    bool use_ldg_k =
929       dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
930    for (unsigned set = 0; set < layout->num_sets; set++) {
931       const struct tu_descriptor_set_layout *desc_layout =
932          layout->set[set].layout;
933 
934       if (!desc_layout || !desc_layout->has_inline_uniforms)
935          continue;
936 
937       for (unsigned b = 0; b < desc_layout->binding_count; b++) {
938          const struct tu_descriptor_set_binding_layout *binding =
939             &desc_layout->binding[b];
940 
941          if (binding->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
942             continue;
943          if (!(binding->shader_stages &
944                mesa_to_vk_shader_stage(shader->info.stage)))
945             continue;
946 
947          /* If we don't know the size at compile time due to a variable
948           * descriptor count, then with descriptor buffers we cannot know
949           * how much space the real inline uniform has. In this case we fall
950           * back to pushing the address and using ldg, which is slower than
951           * setting up a descriptor.  However, setting up our own descriptor
952           * with descriptor_buffer is also painful: it has to be done on the
953           * GPU, and it doesn't avoid the UBO getting pushed anyway and
954           * faulting if an out-of-bounds access is hidden behind an if and not
955           * dynamically executed.  Given the small max size, there shouldn't be
956           * much reason to use variable size anyway.
957           */
958          bool push_address = !use_ldg_k && desc_layout->has_variable_descriptors &&
959             b == desc_layout->binding_count - 1;
960 
961          if (push_address) {
962             perf_debug(dev,
963                        "falling back to ldg for variable-sized inline "
964                        "uniform block");
965          }
966 
967          assert(const_state->num_inline_ubos < ARRAY_SIZE(const_state->ubos));
968          unsigned size_vec4 = push_address ? 1 : DIV_ROUND_UP(binding->size, 16);
969          const_state->ubos[const_state->num_inline_ubos++] =
970             (struct tu_inline_ubo) {
971                .base = set,
972                .offset = binding->offset,
973                .push_address = push_address,
974                .const_offset_vec4 =
975                   const_allocs->max_const_offset_vec4 + ldgk_consts,
976                .size_vec4 = size_vec4,
977             };
978 
979          if (!use_ldg_k) {
980             ldgk_consts += align(size_vec4, dev->compiler->const_upload_unit);
981          }
982       }
983    }
984 
985    ir3_const_alloc(const_allocs, IR3_CONST_ALLOC_INLINE_UNIFORM_ADDRS, ldgk_consts, 1);
986 
987    struct lower_instr_params params = {
988       .dev = dev,
989       .shader = tu_shader,
990       .layout = layout,
991       .read_only_input_attachments = read_only_input_attachments,
992       .dynamic_renderpass = dynamic_renderpass,
993       .const_allocs = const_allocs,
994    };
995 
996    bool progress = false;
997    if (const_state->num_inline_ubos) {
998       progress |= nir_shader_intrinsics_pass(shader, lower_inline_ubo,
999                                                nir_metadata_none,
1000                                                &params);
1001    }
1002 
1003    progress |= nir_shader_instructions_pass(shader,
1004                                             lower_instr,
1005                                             nir_metadata_none,
1006                                             &params);
1007 
1008    /* Remove now-unused variables so that when we gather the shader info later
1009     * they won't be counted.
1010     */
1011 
1012    if (progress)
1013       nir_opt_dce(shader);
1014 
1015    progress |=
1016       nir_remove_dead_variables(shader,
1017                                 nir_var_uniform | nir_var_mem_ubo | nir_var_mem_ssbo,
1018                                 NULL);
1019 
1020    return progress;
1021 }
1022 
1023 struct lower_fdm_options {
1024    unsigned num_views;
1025    bool adjust_fragcoord;
1026    bool multiview;
1027 };
1028 
1029 static bool
1030 lower_fdm_filter(const nir_instr *instr, const void *data)
1031 {
1032    const struct lower_fdm_options *options =
1033       (const struct lower_fdm_options *)data;
1034 
1035    if (instr->type != nir_instr_type_intrinsic)
1036       return false;
1037 
1038    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1039    return intrin->intrinsic == nir_intrinsic_load_frag_size ||
1040       (intrin->intrinsic == nir_intrinsic_load_frag_coord &&
1041        options->adjust_fragcoord);
1042 }
1043 
1044 static nir_def *
1045 lower_fdm_instr(struct nir_builder *b, nir_instr *instr, void *data)
1046 {
1047    const struct lower_fdm_options *options =
1048       (const struct lower_fdm_options *)data;
1049 
1050    nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1051 
1052    nir_def *view;
1053    if (options->multiview) {
1054       nir_variable *view_var =
1055          nir_find_variable_with_location(b->shader, nir_var_shader_in,
1056                                          VARYING_SLOT_VIEW_INDEX);
1057 
1058       if (view_var == NULL) {
1059          view_var = nir_variable_create(b->shader, nir_var_shader_in,
1060                                         glsl_int_type(), NULL);
1061          view_var->data.location = VARYING_SLOT_VIEW_INDEX;
1062          view_var->data.interpolation = INTERP_MODE_FLAT;
1063          view_var->data.driver_location = b->shader->num_inputs++;
1064       }
1065 
1066       view = nir_load_var(b, view_var);
1067    } else {
1068       view = nir_imm_int(b, 0);
1069    }
1070 
1071    nir_def *frag_size =
1072       nir_load_frag_size_ir3(b, view, .range = options->num_views);
1073 
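   /* For load_frag_coord, rescale the unscaled xy coordinate by the per-view
    * fragment size, relative to the per-view offset; z and w are passed
    * through unchanged.
    */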
1074    if (intrin->intrinsic == nir_intrinsic_load_frag_coord) {
1075       nir_def *frag_offset =
1076          nir_load_frag_offset_ir3(b, view, .range = options->num_views);
1077       nir_def *unscaled_coord = nir_load_frag_coord_unscaled_ir3(b);
1078       nir_def *xy = nir_trim_vector(b, unscaled_coord, 2);
1079       xy = nir_fmul(b, nir_fsub(b, xy, frag_offset), nir_i2f32(b, frag_size));
1080       return nir_vec4(b,
1081                       nir_channel(b, xy, 0),
1082                       nir_channel(b, xy, 1),
1083                       nir_channel(b, unscaled_coord, 2),
1084                       nir_channel(b, unscaled_coord, 3));
1085    }
1086 
1087    assert(intrin->intrinsic == nir_intrinsic_load_frag_size);
1088    return frag_size;
1089 }
1090 
1091 static bool
1092 tu_nir_lower_fdm(nir_shader *shader, const struct lower_fdm_options *options)
1093 {
1094    return nir_shader_lower_instructions(shader, lower_fdm_filter,
1095                                         lower_fdm_instr, (void *)options);
1096 }
1097 
1098 static void
1099 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
1100 {
1101    assert(glsl_type_is_vector_or_scalar(type));
1102 
1103    unsigned comp_size =
1104       glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
1105    unsigned length = glsl_get_vector_elements(type);
1106    *size = comp_size * length;
1107    *align = comp_size;
1108 }
1109 
1110 static void
1111 tu_gather_xfb_info(nir_shader *nir, struct ir3_stream_output_info *info)
1112 {
1113    nir_shader_gather_xfb_info(nir);
1114 
1115    if (!nir->xfb_info)
1116       return;
1117 
1118    nir_xfb_info *xfb = nir->xfb_info;
1119 
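   /* Build a map from varying slot locations to driver_location indices so
    * the xfb outputs below can be expressed as ir3 output register indices.
    */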
1120    uint8_t output_map[VARYING_SLOT_TESS_MAX];
1121    memset(output_map, 0, sizeof(output_map));
1122 
1123    nir_foreach_shader_out_variable(var, nir) {
1124       unsigned slots = nir_variable_count_slots(var, var->type);
1125       for (unsigned i = 0; i < slots; i++)
1126          output_map[var->data.location + i] = var->data.driver_location + i;
1127    }
1128 
1129    assert(xfb->output_count <= IR3_MAX_SO_OUTPUTS);
1130    info->num_outputs = xfb->output_count;
1131 
1132    for (int i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
1133       info->stride[i] = xfb->buffers[i].stride / 4;
1134       info->buffer_to_stream[i] = xfb->buffer_to_stream[i];
1135    }
1136 
1137    info->streams_written = xfb->streams_written;
1138 
1139    for (int i = 0; i < xfb->output_count; i++) {
1140       info->output[i].register_index = output_map[xfb->outputs[i].location];
1141       info->output[i].start_component = xfb->outputs[i].component_offset;
1142       info->output[i].num_components =
1143                            util_bitcount(xfb->outputs[i].component_mask);
1144       info->output[i].output_buffer  = xfb->outputs[i].buffer;
1145       info->output[i].dst_offset = xfb->outputs[i].offset / 4;
1146       info->output[i].stream = xfb->buffer_to_stream[xfb->outputs[i].buffer];
1147    }
1148 }
1149 
1150 static uint32_t
1151 tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs)
1152 {
1153    const struct ir3_const_state *const_state = ir3_const_state(xs);
1154    uint32_t base = const_state->allocs.max_const_offset_vec4;
1155    int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4);
1156 
1157    /* truncate size to avoid writing constants that the shader
1158     * does not use:
1159     */
1160    size = MIN2(size + base, xs->constlen) - base;
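   /* e.g. (hypothetical values) base = 10 vec4, immediates_count = 16 dwords
    * and constlen = 12 vec4 give size = MIN2(4 + 10, 12) - 10 = 2 vec4,
    * i.e. 8 dwords returned below.
    */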
1161 
1162    return MAX2(size, 0) * 4;
1163 }
1164 
1165 /* We allocate fixed-length substreams for shader state; however, some
1166  * parts of the state may have unbounded length. Their additional space
1167  * requirements should be calculated here.
1168  */
1169 static uint32_t
1170 tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs)
1171 {
1172    const struct ir3_const_state *const_state = ir3_const_state(xs);
1173 
1174    uint32_t size = tu_xs_get_immediates_packet_size_dwords(xs);
1175 
1176    /* Variable number of UBO upload ranges. */
1177    size += 4 * const_state->ubo_state.num_enabled;
1178 
1179    /* Variable number of dwords for the primitive map */
1180    size += xs->input_size;
1181 
1182    size += xs->constant_data_size / 4;
1183 
1184    return size;
1185 }
1186 
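/* Per-stage SP register offsets consumed by tu6_emit_xs() below. */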
1187 static const struct xs_config {
1188    uint16_t reg_sp_xs_config;
1189    uint16_t reg_sp_xs_instrlen;
1190    uint16_t reg_sp_xs_first_exec_offset;
1191    uint16_t reg_sp_xs_pvt_mem_hw_stack_offset;
1192    uint16_t reg_sp_xs_vgpr_config;
1193 } xs_config[] = {
1194    [MESA_SHADER_VERTEX] = {
1195       REG_A6XX_SP_VS_CONFIG,
1196       REG_A6XX_SP_VS_INSTRLEN,
1197       REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET,
1198       REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET,
1199       REG_A7XX_SP_VS_VGPR_CONFIG,
1200    },
1201    [MESA_SHADER_TESS_CTRL] = {
1202       REG_A6XX_SP_HS_CONFIG,
1203       REG_A6XX_SP_HS_INSTRLEN,
1204       REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET,
1205       REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET,
1206       REG_A7XX_SP_HS_VGPR_CONFIG,
1207    },
1208    [MESA_SHADER_TESS_EVAL] = {
1209       REG_A6XX_SP_DS_CONFIG,
1210       REG_A6XX_SP_DS_INSTRLEN,
1211       REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET,
1212       REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET,
1213       REG_A7XX_SP_DS_VGPR_CONFIG,
1214    },
1215    [MESA_SHADER_GEOMETRY] = {
1216       REG_A6XX_SP_GS_CONFIG,
1217       REG_A6XX_SP_GS_INSTRLEN,
1218       REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET,
1219       REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET,
1220       REG_A7XX_SP_GS_VGPR_CONFIG,
1221    },
1222    [MESA_SHADER_FRAGMENT] = {
1223       REG_A6XX_SP_FS_CONFIG,
1224       REG_A6XX_SP_FS_INSTRLEN,
1225       REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET,
1226       REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET,
1227       REG_A7XX_SP_FS_VGPR_CONFIG,
1228    },
1229    [MESA_SHADER_COMPUTE] = {
1230       REG_A6XX_SP_CS_CONFIG,
1231       REG_A6XX_SP_CS_INSTRLEN,
1232       REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET,
1233       REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET,
1234       REG_A7XX_SP_CS_VGPR_CONFIG,
1235    },
1236 };
1237 
1238 void
1239 tu6_emit_xs(struct tu_cs *cs,
1240             gl_shader_stage stage, /* xs->type, but xs may be NULL */
1241             const struct ir3_shader_variant *xs,
1242             const struct tu_pvtmem_config *pvtmem,
1243             uint64_t binary_iova)
1244 {
1245    const struct xs_config *cfg = &xs_config[stage];
1246 
1247    if (!xs) {
1248       /* shader stage disabled */
1249       return;
1250    }
1251 
1252    enum a6xx_threadsize thrsz =
1253       xs->info.double_threadsize ? THREAD128 : THREAD64;
1254    switch (stage) {
1255    case MESA_SHADER_VERTEX:
1256       tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0(
1257                .halfregfootprint = xs->info.max_half_reg + 1,
1258                .fullregfootprint = xs->info.max_reg + 1,
1259                .branchstack = ir3_shader_branchstack_hw(xs),
1260                .mergedregs = xs->mergedregs,
1261                .earlypreamble = xs->early_preamble,
1262       ));
1263       break;
1264    case MESA_SHADER_TESS_CTRL:
1265       tu_cs_emit_regs(cs, A6XX_SP_HS_CTRL_REG0(
1266                .halfregfootprint = xs->info.max_half_reg + 1,
1267                .fullregfootprint = xs->info.max_reg + 1,
1268                .branchstack = ir3_shader_branchstack_hw(xs),
1269                .earlypreamble = xs->early_preamble,
1270       ));
1271       break;
1272    case MESA_SHADER_TESS_EVAL:
1273       tu_cs_emit_regs(cs, A6XX_SP_DS_CTRL_REG0(
1274                .halfregfootprint = xs->info.max_half_reg + 1,
1275                .fullregfootprint = xs->info.max_reg + 1,
1276                .branchstack = ir3_shader_branchstack_hw(xs),
1277                .earlypreamble = xs->early_preamble,
1278       ));
1279       break;
1280    case MESA_SHADER_GEOMETRY:
1281       tu_cs_emit_regs(cs, A6XX_SP_GS_CTRL_REG0(
1282                .halfregfootprint = xs->info.max_half_reg + 1,
1283                .fullregfootprint = xs->info.max_reg + 1,
1284                .branchstack = ir3_shader_branchstack_hw(xs),
1285                .earlypreamble = xs->early_preamble,
1286       ));
1287       break;
1288    case MESA_SHADER_FRAGMENT:
1289       tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0(
1290                .halfregfootprint = xs->info.max_half_reg + 1,
1291                .fullregfootprint = xs->info.max_reg + 1,
1292                .branchstack = ir3_shader_branchstack_hw(xs),
1293                .threadsize = thrsz,
1294                .varying = xs->total_in != 0,
1295                .lodpixmask = xs->need_full_quad,
1296                /* unknown bit, seems unnecessary */
1297                .unk24 = true,
1298                .pixlodenable = xs->need_pixlod,
1299                .earlypreamble = xs->early_preamble,
1300                .mergedregs = xs->mergedregs,
1301       ));
1302       break;
1303    case MESA_SHADER_COMPUTE:
1304       thrsz = cs->device->physical_device->info->a6xx
1305             .supports_double_threadsize ? thrsz : THREAD128;
1306       tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0(
1307                .halfregfootprint = xs->info.max_half_reg + 1,
1308                .fullregfootprint = xs->info.max_reg + 1,
1309                .branchstack = ir3_shader_branchstack_hw(xs),
1310                .threadsize = thrsz,
1311                .earlypreamble = xs->early_preamble,
1312                .mergedregs = xs->mergedregs,
1313       ));
1314       break;
1315    default:
1316       unreachable("bad shader stage");
1317    }
1318 
1319    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_instrlen, 1);
1320    tu_cs_emit(cs, xs->instrlen);
1321 
1322    /* emit program binary & private memory layout
1323     * binary_iova should be aligned to 1 instrlen unit (128 bytes)
1324     */
1325 
1326    assert((binary_iova & 0x7f) == 0);
1327    assert((pvtmem->iova & 0x1f) == 0);
1328 
1329    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_first_exec_offset, 7);
1330    tu_cs_emit(cs, 0);
1331    tu_cs_emit_qw(cs, binary_iova);
1332    tu_cs_emit(cs,
1333               A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(pvtmem->per_fiber_size));
1334    tu_cs_emit_qw(cs, pvtmem->iova);
1335    tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(pvtmem->per_sp_size) |
1336                   COND(pvtmem->per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT));
1337 
1338    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1);
1339    tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size));
1340 
1341    if (cs->device->physical_device->info->chip >= A7XX) {
1342       tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vgpr_config, 1);
1343       tu_cs_emit(cs, 0);
1344    }
1345 
1346    if (cs->device->physical_device->info->chip == A6XX) {
1347       uint32_t shader_preload_size =
1348          MIN2(xs->instrlen, cs->device->physical_device->info->a6xx.instr_cache_size);
1349 
1350       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
1351       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1352                      CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
1353                      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1354                      CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1355                      CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
1356       tu_cs_emit_qw(cs, binary_iova);
1357    }
1358 
1359    /* emit immediates */
1360 
1361    const struct ir3_const_state *const_state = ir3_const_state(xs);
1362    uint32_t base = const_state->allocs.max_const_offset_vec4;
1363    unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs);
1364 
1365    if (immediate_size > 0) {
1366       assert(!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble);
1367       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size);
1368       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
1369                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1370                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1371                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1372                  CP_LOAD_STATE6_0_NUM_UNIT(immediate_size / 4));
1373       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1374       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1375 
1376       tu_cs_emit_array(cs, const_state->immediates, immediate_size);
1377    }
1378 
1379    if (const_state->consts_ubo.idx != -1) {
1380       uint64_t iova = binary_iova + xs->info.constant_data_offset;
1381       uint32_t offset = const_state->consts_ubo.idx;
1382 
1383       /* Upload UBO state for the constant data. */
1384       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
1385       tu_cs_emit(cs,
1386                  CP_LOAD_STATE6_0_DST_OFF(offset) |
1387                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
1388                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1389                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1390                  CP_LOAD_STATE6_0_NUM_UNIT(1));
1391       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1392       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1393       int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16);
1394       tu_cs_emit_qw(cs,
1395                     iova |
1396                     (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32);
1397 
1398       /* Upload the constant data to the const file if needed. */
1399       const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;
1400 
1401       if (!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
1402          for (int i = 0; i < ubo_state->num_enabled; i++) {
1403             if (ubo_state->range[i].ubo.block != offset ||
1404                 ubo_state->range[i].ubo.bindless) {
1405                continue;
1406             }
1407 
1408             uint32_t start = ubo_state->range[i].start;
1409             uint32_t end = ubo_state->range[i].end;
1410             uint32_t size = MIN2(end - start,
1411                                  (16 * xs->constlen) - ubo_state->range[i].offset);
1412 
1413             tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
1414             tu_cs_emit(cs,
1415                      CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
1416                      CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1417                      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1418                      CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1419                      CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
1420             tu_cs_emit_qw(cs, iova + start);
1421          }
1422       }
1423    }
1424 
1425    /* emit statically-known FS driver param */
1426    if (stage == MESA_SHADER_FRAGMENT && const_state->driver_params_ubo.size > 0) {
1427       uint32_t data[4] = {xs->info.double_threadsize ? 128 : 64, 0, 0, 0};
1428       uint32_t size = ARRAY_SIZE(data);
1429 
1430       /* A7XX TODO: Emit data via sub_cs instead of NOP */
1431       uint64_t iova = tu_cs_emit_data_nop(cs, data, size, 4);
1432       uint32_t base = const_state->driver_params_ubo.idx;
1433 
1434       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
1435       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
1436                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
1437                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1438                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1439                  CP_LOAD_STATE6_0_NUM_UNIT(1));
1440       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1441       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1442       int size_vec4s = DIV_ROUND_UP(size, 4);
1443       tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
1444    } else if (stage == MESA_SHADER_FRAGMENT && const_state->num_driver_params > 0) {
1445       uint32_t base =
1446          const_state->allocs.consts[IR3_CONST_ALLOC_DRIVER_PARAMS].offset_vec4;
1447       int32_t size = DIV_ROUND_UP(MAX2(const_state->num_driver_params, 4), 4);
1448       size = MAX2(MIN2(size + base, xs->constlen) - base, 0);
1449 
1450       if (size > 0) {
1451          tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + 4);
1452          tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
1453                     CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1454                     CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1455                     CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1456                     CP_LOAD_STATE6_0_NUM_UNIT(size));
1457          tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1458          tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1459 
1460          tu_cs_emit(cs, xs->info.double_threadsize ? 128 : 64);
1461          tu_cs_emit(cs, 0);
1462          tu_cs_emit(cs, 0);
1463          tu_cs_emit(cs, 0);
1464       }
1465    }
1466 }
1467 
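/* Emit the compute-stage state: shared push-const enable, the CS HLSQ state
 * invalidate, the common per-stage registers and binary (tu6_emit_xs_config /
 * tu6_emit_xs), the shared memory size, and the workgroup / local-invocation
 * ID register assignments, with per-generation handling of the compute
 * threadsize and (on A7XX) the workgroup raster-order/tile registers.
 */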
1468 template <chip CHIP>
1469 static void
1470 tu6_emit_cs_config(struct tu_cs *cs,
1471                    const struct ir3_shader_variant *v,
1472                    const struct tu_pvtmem_config *pvtmem,
1473                    uint64_t binary_iova)
1474 {
1475    bool shared_consts_enable =
1476       ir3_const_state(v)->push_consts_type == IR3_PUSH_CONSTS_SHARED;
1477    tu6_emit_shared_consts_enable<CHIP>(cs, shared_consts_enable);
1478 
1479    tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
1480          .cs_state = true,
1481          .cs_ibo = true,
1482          .cs_shared_const = shared_consts_enable));
1483 
1484    tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_COMPUTE, v);
1485    tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
1486 
1487    uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
1488    tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
1489    tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
1490                   A6XX_SP_CS_UNKNOWN_A9B1_UNK6);
1491 
1492    if (CHIP == A6XX && cs->device->physical_device->info->a6xx.has_lpac) {
1493       tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1);
1494       tu_cs_emit(cs, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) |
1495                      A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6);
1496    }
1497 
1498    uint32_t local_invocation_id =
1499       ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
1500    uint32_t work_group_id =
1501       ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);
1502 
1503    /*
1504     * Devices that do not support double threadsize take the threadsize from
1505     * A6XX_HLSQ_FS_CNTL_0_THREADSIZE instead of A6XX_HLSQ_CS_CNTL_1_THREADSIZE
1506     * which is always set to THREAD128.
1507     */
1508    enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
1509    enum a6xx_threadsize thrsz_cs = cs->device->physical_device->info->a6xx
1510       .supports_double_threadsize ? thrsz : THREAD128;
1511    if (CHIP == A6XX) {
1512       tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
1513       tu_cs_emit(cs,
1514                  A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
1515                  A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
1516                  A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
1517                  A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
1518       tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
1519                      A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz_cs));
1520       if (!cs->device->physical_device->info->a6xx.supports_double_threadsize) {
1521          tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1);
1522          tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz));
1523       }
1524 
1525       if (cs->device->physical_device->info->a6xx.has_lpac) {
1526          tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2);
1527          tu_cs_emit(cs,
1528                     A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
1529                     A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
1530                     A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
1531                     A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
1532          tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
1533                   A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
1534       }
1535    } else {
1536       unsigned tile_height = (v->local_size[1] % 8 == 0)   ? 3
1537                              : (v->local_size[1] % 4 == 0) ? 5
1538                              : (v->local_size[1] % 2 == 0) ? 9
1539                                                            : 17;
1540       tu_cs_emit_regs(
1541          cs, HLSQ_CS_CNTL_1(CHIP,
1542                    .linearlocalidregid = regid(63, 0), .threadsize = thrsz_cs,
1543                    .workgrouprastorderzfirsten = true,
1544                    .wgtilewidth = 4, .wgtileheight = tile_height));
1545 
1546       tu_cs_emit_regs(cs, HLSQ_FS_CNTL_0(CHIP, .threadsize = THREAD64));
1547 
1548       tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 1);
1549       tu_cs_emit(cs, A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
1550                         A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
1551                         A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
1552                         A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
1553 
1554       tu_cs_emit_regs(cs,
1555                       SP_CS_CNTL_1(CHIP,
1556                         .linearlocalidregid = regid(63, 0),
1557                         .threadsize = thrsz_cs,
1558                         .workitemrastorder =
1559                            v->cs.force_linear_dispatch ?
1560                            WORKITEMRASTORDER_LINEAR :
1561                            WORKITEMRASTORDER_TILED, ));
1562 
1563       tu_cs_emit_regs(cs, A7XX_SP_CS_UNKNOWN_A9BE(0)); // Sometimes is 0x08000000
1564    }
1565 }
1566 
1567 #define TU6_EMIT_VFD_DEST_MAX_DWORDS (MAX_VERTEX_ATTRIBS + 2)
1568 
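/* Program VFD_CONTROL_0 and the per-attribute VFD_DEST_CNTL_INSTR entries,
 * mapping each vertex attribute location to the VS input register and
 * writemask it was compiled to (or to an invalid regid if unused).
 */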
1569 static void
1570 tu6_emit_vfd_dest(struct tu_cs *cs,
1571                   const struct ir3_shader_variant *vs)
1572 {
1573    int32_t input_for_attr[MAX_VERTEX_ATTRIBS];
1574    uint32_t attr_count = 0;
1575 
1576    for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; i++)
1577       input_for_attr[i] = -1;
1578 
1579    for (unsigned i = 0; i < vs->inputs_count; i++) {
1580       if (vs->inputs[i].sysval || vs->inputs[i].regid == regid(63, 0))
1581          continue;
1582 
1583       assert(vs->inputs[i].slot >= VERT_ATTRIB_GENERIC0);
1584       unsigned loc = vs->inputs[i].slot - VERT_ATTRIB_GENERIC0;
1585       input_for_attr[loc] = i;
1586       attr_count = MAX2(attr_count, loc + 1);
1587    }
1588 
1589    tu_cs_emit_regs(cs,
1590                    A6XX_VFD_CONTROL_0(
1591                      .fetch_cnt = attr_count, /* decode_cnt for binning pass ? */
1592                      .decode_cnt = attr_count));
1593 
1594    if (attr_count)
1595       tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DEST_CNTL_INSTR(0), attr_count);
1596 
1597    for (unsigned i = 0; i < attr_count; i++) {
1598       if (input_for_attr[i] >= 0) {
1599             unsigned input_idx = input_for_attr[i];
1600             tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0,
1601                              .writemask = vs->inputs[input_idx].compmask,
1602                              .regid = vs->inputs[input_idx].regid).value);
1603       } else {
1604             tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0,
1605                              .writemask = 0,
1606                              .regid = regid(63, 0)).value);
1607       }
1608    }
1609 }
1610 
1611 static enum a6xx_tex_prefetch_cmd
1612 tu6_tex_opc_to_prefetch_cmd(opc_t tex_opc)
1613 {
1614    switch (tex_opc) {
1615    case OPC_SAM:
1616       return TEX_PREFETCH_SAM;
1617    default:
1618       unreachable("Unknown tex opc for prefetch cmd");
1619    }
1620 }
1621 
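/* Emit the FS input/sysval configuration: the sampler prefetch commands, the
 * HLSQ_CONTROL registers holding the register IDs for frag coord, sample
 * ID/mask, face, shading rate and the barycentric (IJ) values, the matching
 * GRAS/RB interpolation and per-sample enables, the VPC_VAR_DISABLE masks for
 * unused varying components, and VPC_CNTL_0 (varying count, primitive-ID and
 * view-ID input locations).
 */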
1622 template <chip CHIP>
1623 static void
1624 tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
1625 {
1626    uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
1627    uint32_t ij_regid[IJ_COUNT];
1628    uint32_t smask_in_regid, shading_rate_regid;
1629 
1630    bool sample_shading = fs->per_samp | fs->key.sample_shading;
1631    bool enable_varyings = fs->total_in > 0;
1632 
1633    samp_id_regid   = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID);
1634    smask_in_regid  = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN);
1635    face_regid      = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE);
1636    coord_regid     = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD);
1637    zwcoord_regid   = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0);
1638    shading_rate_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_SHADING_RATE);
1639    for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
1640       ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);
1641 
1642    if (fs->num_sampler_prefetch > 0) {
1643       /* It seems like ij_pix is *required* to be r0.x */
1644       assert(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]) ||
1645              ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
1646    }
1647 
1648    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);
1649    tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) |
1650                      COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID(0x1ff)) |
1651                      COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID4COORD(0x1ff)) |
1652                      COND(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]),
1653                           A6XX_SP_FS_PREFETCH_CNTL_IJ_WRITE_DISABLE) |
1654                      COND(fs->prefetch_end_of_quad,
1655                           A6XX_SP_FS_PREFETCH_CNTL_ENDOFQUAD));
1656    for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1657       const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1658       tu_cs_emit(
1659          cs, SP_FS_PREFETCH_CMD(
1660                 CHIP, i, .src = prefetch->src, .samp_id = prefetch->samp_id,
1661                 .tex_id = prefetch->tex_id, .dst = prefetch->dst,
1662                 .wrmask = prefetch->wrmask, .half = prefetch->half_precision,
1663                 .bindless = prefetch->bindless,
1664                 .cmd = tu6_tex_opc_to_prefetch_cmd(prefetch->tex_opc), ).value);
1665    }
1666 
1667    if (fs->num_sampler_prefetch > 0) {
1668       tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch);
1669       for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1670          const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1671          tu_cs_emit(cs,
1672                     A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) |
1673                     A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id));
1674       }
1675    }
1676 
1677    tu_cs_emit_regs(cs,
1678       HLSQ_CONTROL_1_REG(CHIP,
1679          .primallocthreshold =
1680             cs->device->physical_device->info->a6xx.prim_alloc_threshold),
1681       HLSQ_CONTROL_2_REG(CHIP, .faceregid = face_regid,
1682                          .sampleid = samp_id_regid,
1683                          .samplemask = smask_in_regid,
1684                          .centerrhw = ij_regid[IJ_PERSP_CENTER_RHW]),
1685       HLSQ_CONTROL_3_REG(CHIP, .ij_persp_pixel = ij_regid[IJ_PERSP_PIXEL],
1686                          .ij_linear_pixel = ij_regid[IJ_LINEAR_PIXEL],
1687                          .ij_persp_centroid = ij_regid[IJ_PERSP_CENTROID],
1688                          .ij_linear_centroid = ij_regid[IJ_LINEAR_CENTROID]),
1689       HLSQ_CONTROL_4_REG(CHIP, .ij_persp_sample = ij_regid[IJ_PERSP_SAMPLE],
1690                          .ij_linear_sample = ij_regid[IJ_LINEAR_SAMPLE],
1691                          .xycoordregid = coord_regid,
1692                          .zwcoordregid = zwcoord_regid),
1693       HLSQ_CONTROL_5_REG(CHIP, .linelengthregid = 0xfc,
1694                          .foveationqualityregid = shading_rate_regid), );
1695 
1696    if (CHIP >= A7XX) {
1697       uint32_t sysval_regs = 0;
1698       for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) {
1699          if (VALIDREG(ij_regid[i])) {
1700             if (i == IJ_PERSP_CENTER_RHW)
1701                sysval_regs += 1;
1702             else
1703                sysval_regs += 2;
1704          }
1705       }
1706 
1707       for (uint32_t sysval : { face_regid, samp_id_regid, smask_in_regid,
1708                                shading_rate_regid }) {
1709          if (VALIDREG(sysval))
1710             sysval_regs += 1;
1711       }
1712 
1713       for (uint32_t sysval : { coord_regid, zwcoord_regid }) {
1714          if (VALIDREG(sysval))
1715             sysval_regs += 2;
1716       }
1717 
1718       tu_cs_emit_regs(cs, A7XX_HLSQ_UNKNOWN_A9AE(.sysval_regs_count = sysval_regs,
1719                                                  .unk8 = 1,
1720                                                  .unk9 = 1));
1721    }
1722 
1723    enum a6xx_threadsize thrsz = fs->info.double_threadsize ? THREAD128 : THREAD64;
1724    tu_cs_emit_regs(cs, HLSQ_FS_CNTL_0(CHIP, .threadsize = thrsz, .varyings = enable_varyings));
1725 
1726    bool need_size = fs->frag_face || fs->fragcoord_compmask != 0;
1727    bool need_size_persamp = false;
1728    if (VALIDREG(ij_regid[IJ_PERSP_CENTER_RHW])) {
1729       if (sample_shading)
1730          need_size_persamp = true;
1731       else
1732          need_size = true;
1733    }
1734 
1735    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1);
1736    tu_cs_emit(cs,
1737          CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) |
1738          CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) |
1739          CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) |
1740          CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1741          CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) |
1742          CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1743          COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1744          COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1745          COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask)));
1746 
1747    tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2);
1748    tu_cs_emit(cs,
1749          CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) |
1750          CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) |
1751          CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) |
1752          CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1753          CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) |
1754          CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1755          COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1756          COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) |
1757          COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1758          COND(fs->fragcoord_compmask != 0,
1759                            A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask)));
1760    tu_cs_emit(cs,
1761          A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE(
1762             sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) |
1763          CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
1764          CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
1765          CONDREG(ij_regid[IJ_PERSP_CENTER_RHW], A6XX_RB_RENDER_CONTROL1_CENTERRHW) |
1766          COND(fs->post_depth_coverage, A6XX_RB_RENDER_CONTROL1_POSTDEPTHCOVERAGE)  |
1767          COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS) |
1768          CONDREG(shading_rate_regid, A6XX_RB_RENDER_CONTROL1_FOVEATION));
1769 
1770    tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CNTL, 1);
1771    tu_cs_emit(cs, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE));
1772 
1773    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1);
1774    tu_cs_emit(cs, CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) |
1775               A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE(
1776                  sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER));
1777 
1778    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 1);
1779    tu_cs_emit(cs, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE));
1780 
1781    uint32_t varmask[4] = { 0 };
1782 
1783    for (int i = ir3_next_varying(fs, -1); i < fs->inputs_count;
1784         i = ir3_next_varying(fs, i)) {
1785       if (fs->inputs[i].inloc >= fs->total_in)
1786          continue;
1787 
1788       unsigned loc = fs->inputs[i].inloc;
1789       for (int j = 0; j < util_last_bit(fs->inputs[i].compmask); j++) {
1790          uint8_t comploc = loc + j;
1791          varmask[comploc / 32] |= 1 << (comploc % 32);
1792       }
1793    }
1794 
1795    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
1796    tu_cs_emit(cs, ~varmask[0]);
1797    tu_cs_emit(cs, ~varmask[1]);
1798    tu_cs_emit(cs, ~varmask[2]);
1799    tu_cs_emit(cs, ~varmask[3]);
1800 
1801    unsigned primid_loc = ir3_find_input_loc(fs, VARYING_SLOT_PRIMITIVE_ID);
1802    unsigned viewid_loc = ir3_find_input_loc(fs, VARYING_SLOT_VIEW_INDEX);
1803 
1804    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
1805    tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs->total_in) |
1806                   COND(fs && fs->total_in, A6XX_VPC_CNTL_0_VARYING) |
1807                   A6XX_VPC_CNTL_0_PRIMIDLOC(primid_loc) |
1808                   A6XX_VPC_CNTL_0_VIEWIDLOC(viewid_loc));
1809 }
1810 
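/* Emit the FS output configuration: the depth / sample-mask / stencil-ref
 * output register IDs, the per-MRT output registers and render component
 * masks, and (on A7XX) the aliased-components state. Only components the
 * shader actually writes are enabled; see the comment below about apps
 * relying on unwritten attachments being preserved.
 */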
1811 template <chip CHIP>
1812 static void
1813 tu6_emit_fs_outputs(struct tu_cs *cs,
1814                     const struct ir3_shader_variant *fs)
1815 {
1816    uint32_t smask_regid, posz_regid, stencilref_regid;
1817 
1818    posz_regid      = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH);
1819    smask_regid     = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK);
1820    stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL);
1821 
1822    int output_reg_count = 0;
1823    uint32_t fragdata_regid[8];
1824    uint32_t fragdata_aliased_components = 0;
1825 
1826    assert(!fs->color0_mrt);
1827    for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) {
1828       int output_idx =
1829          ir3_find_output(fs, (gl_varying_slot) (FRAG_RESULT_DATA0 + i));
1830 
1831       if (output_idx < 0) {
1832          fragdata_regid[i] = INVALID_REG;
1833          continue;
1834       }
1835 
1836       const struct ir3_shader_output *fragdata = &fs->outputs[output_idx];
1837       fragdata_regid[i] = ir3_get_output_regid(fragdata);
1838 
1839       if (VALIDREG(fragdata_regid[i]) || fragdata->aliased_components) {
1840          /* An invalid reg is only allowed if all components are aliased. */
1841          assert(VALIDREG(fragdata_regid[i]) ||
1842                 fragdata->aliased_components == 0xf);
1843 
1844          output_reg_count = i + 1;
1845          fragdata_aliased_components |= fragdata->aliased_components
1846                                         << (i * 4);
1847       }
1848    }
1849 
1850    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1);
1851    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) |
1852                   A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) |
1853                   A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) |
1854                   COND(fs->dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1855 
1856    /* There is no point in enabling a component that is not written by the
1857     * shader. Per the VK spec this is UB, but a few apps depend on the
1858     * attachment not being changed if the FS has no corresponding output.
1859     */
1860    uint32_t fs_render_components = 0;
1861 
1862    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), output_reg_count);
1863    for (uint32_t i = 0; i < output_reg_count; i++) {
1864       tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) |
1865                      (COND(fragdata_regid[i] & HALF_REG_ID,
1866                            A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)));
1867 
1868       if (VALIDREG(fragdata_regid[i]) ||
1869                    (fragdata_aliased_components & (0xf << (i * 4)))) {
1870          fs_render_components |= 0xf << (i * 4);
1871       }
1872    }
1873 
1874    tu_cs_emit_regs(cs,
1875                    A6XX_SP_FS_RENDER_COMPONENTS(.dword = fs_render_components));
1876 
1877    if (CHIP >= A7XX) {
1878       tu_cs_emit_regs(
1879          cs,
1880          A7XX_SP_PS_ALIASED_COMPONENTS_CONTROL(
1881                .enabled = fragdata_aliased_components != 0),
1882          A7XX_SP_PS_ALIASED_COMPONENTS(.dword = fragdata_aliased_components));
1883    } else {
1884       assert(fragdata_aliased_components == 0);
1885    }
1886 
1887    tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 1);
1888    tu_cs_emit(cs, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
1889                   COND(fs->writes_smask, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
1890                   COND(fs->writes_stencilref, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
1891                   COND(fs->dual_src_blend, A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1892 
1893    tu_cs_emit_regs(cs,
1894                    A6XX_RB_RENDER_COMPONENTS(.dword = fs_render_components));
1895 }
1896 
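/* Emit the VS-related state: the PC/VFD (and, on A7XX, VPC) multiview
 * controls and view mask, the VFD destination mapping for vertex attributes,
 * and VFD_CONTROL_1 with the vertex/instance/primitive/view ID sysval
 * register assignments.
 */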
1897 template <chip CHIP>
1898 void
1899 tu6_emit_vs(struct tu_cs *cs,
1900             const struct ir3_shader_variant *vs,
1901             uint32_t view_mask)
1902 {
1903    bool multi_pos_output = vs->multi_pos_output;
1904 
1905    uint32_t multiview_views = util_logbase2(view_mask) + 1;
1906    uint32_t multiview_cntl = view_mask ?
1907       A6XX_PC_MULTIVIEW_CNTL_ENABLE |
1908       A6XX_PC_MULTIVIEW_CNTL_VIEWS(multiview_views) |
1909       COND(!multi_pos_output, A6XX_PC_MULTIVIEW_CNTL_DISABLEMULTIPOS)
1910       : 0;
1911 
1912    /* Copy what the blob does here. This will emit an extra 0x3f
1913     * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
1914     * this is working around yet.
1915     */
1916    if (cs->device->physical_device->info->a6xx.has_cp_reg_write) {
1917       tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
1918       tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
1919       tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
1920    } else {
1921       tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_CNTL, 1);
1922    }
1923    tu_cs_emit(cs, multiview_cntl);
1924 
1925    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_MULTIVIEW_CNTL, 1);
1926    tu_cs_emit(cs, multiview_cntl);
1927 
1928    if (multiview_cntl &&
1929        cs->device->physical_device->info->a6xx.supports_multiview_mask) {
1930       tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_MASK, 1);
1931       tu_cs_emit(cs, view_mask);
1932    }
1933 
1934    if (CHIP >= A7XX) {
1935       tu_cs_emit_pkt4(cs, REG_A7XX_VPC_MULTIVIEW_CNTL, 1);
1936       tu_cs_emit(cs, multiview_cntl);
1937 
1938       tu_cs_emit_pkt4(cs, REG_A7XX_VPC_MULTIVIEW_MASK, 1);
1939       tu_cs_emit(cs, view_mask);
1940    }
1941 
1942    tu6_emit_vfd_dest(cs, vs);
1943 
1944    const uint32_t vertexid_regid =
1945          ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
1946    const uint32_t instanceid_regid =
1947          ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
1948 
1949    /* Note: we currently don't support multiview with tess or GS. If we did,
1950     * and the HW actually works, then we'd have to somehow share this across
1951     * stages. Note that the blob doesn't support this either.
1952     */
1953    const uint32_t viewid_regid =
1954       ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX);
1955 
1956    const uint32_t vs_primitiveid_regid =
1957       ir3_find_sysval_regid(vs, SYSTEM_VALUE_PRIMITIVE_ID);
1958 
1959    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_1, 1);
1960    tu_cs_emit(cs, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) |
1961                   A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) |
1962                   A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) |
1963                   A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid));
1964 }
1965 TU_GENX(tu6_emit_vs);
1966 
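/* Emit the TCS-related state: VFD_CONTROL_2 with the rel-patch-ID and
 * TCS-header sysval registers, plus the number of output vertices per patch.
 */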
1967 template <chip CHIP>
1968 void
1969 tu6_emit_hs(struct tu_cs *cs,
1970             const struct ir3_shader_variant *hs)
1971 {
1972    const uint32_t hs_rel_patch_regid =
1973          ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3);
1974    const uint32_t hs_invocation_regid =
1975          ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3);
1976 
1977    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_2, 1);
1978    tu_cs_emit(cs, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) |
1979                   A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid));
1980 
1981    if (hs) {
1982       tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
1983       tu_cs_emit(cs, hs->tess.tcs_vertices_out);
1984    }
1985 }
1986 TU_GENX(tu6_emit_hs);
1987 
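/* Emit the TES-related state: VFD_CONTROL_3/4 with the rel-patch-ID, tess
 * coord and primitive-ID sysval register assignments.
 */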
1988 template <chip CHIP>
1989 void
1990 tu6_emit_ds(struct tu_cs *cs,
1991             const struct ir3_shader_variant *ds)
1992 {
1993    const uint32_t ds_rel_patch_regid =
1994          ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3);
1995    const uint32_t tess_coord_x_regid =
1996          ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD);
1997    const uint32_t tess_coord_y_regid = VALIDREG(tess_coord_x_regid) ?
1998          tess_coord_x_regid + 1 :
1999          regid(63, 0);
2000    const uint32_t ds_primitiveid_regid =
2001          ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID);
2002 
2003    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_3, 2);
2004    tu_cs_emit(cs, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) |
2005                   A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) |
2006                   A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) |
2007                   A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid));
2008    tu_cs_emit(cs, 0x000000fc); /* VFD_CONTROL_4 */
2009 }
2010 TU_GENX(tu6_emit_ds);
2011 
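/* Map a GS output primitive type to the a6xx tess output enum used in
 * PC_PRIMITIVE_CNTL_5 below.
 */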
2012 static enum a6xx_tess_output
2013 primitive_to_tess(enum mesa_prim primitive) {
2014    switch (primitive) {
2015    case MESA_PRIM_POINTS:
2016       return TESS_POINTS;
2017    case MESA_PRIM_LINE_STRIP:
2018       return TESS_LINES;
2019    case MESA_PRIM_TRIANGLE_STRIP:
2020       return TESS_CW_TRIS;
2021    default:
2022       unreachable("");
2023    }
2024 }
2025 
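/* Emit the GS-related state: VFD_CONTROL_5 with the GS-header sysval
 * register, and PC_PRIMITIVE_CNTL_5 (mirrored to VPC on A7XX) with the
 * output vertex count, invocation count and output primitive type.
 */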
2026 template <chip CHIP>
2027 void
2028 tu6_emit_gs(struct tu_cs *cs,
2029             const struct ir3_shader_variant *gs)
2030 {
2031    const uint32_t gsheader_regid =
2032          ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3);
2033 
2034    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_5, 1);
2035    tu_cs_emit(cs, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) |
2036                   0xfc00);
2037 
2038    if (gs) {
2039       uint32_t vertices_out, invocations;
2040 
2041       vertices_out = gs->gs.vertices_out - 1;
2042       enum a6xx_tess_output output = primitive_to_tess((enum mesa_prim) gs->gs.output_primitive);
2043       invocations = gs->gs.invocations - 1;
2044 
2045       uint32_t primitive_cntl =
2046          A6XX_PC_PRIMITIVE_CNTL_5(.gs_vertices_out = vertices_out,
2047                                   .gs_invocations = invocations,
2048                                   .gs_output = output,).value;
2049 
2050       tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
2051       tu_cs_emit(cs, primitive_cntl);
2052 
2053       if (CHIP >= A7XX) {
2054          tu_cs_emit_pkt4(cs, REG_A7XX_VPC_PRIMITIVE_CNTL_5, 1);
2055          tu_cs_emit(cs, primitive_cntl);
2056       } else {
2057          tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_PARAM, 1);
2058          tu_cs_emit(cs, 0xff);
2059       }
2060    }
2061 }
2062 TU_GENX(tu6_emit_gs);
2063 
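/* Emit FS-related state that has to be programmed even when there is no
 * fragment shader: the primitive-ID enables, a couple of A7XX-specific
 * registers, and the full FS input/output configuration, using an empty
 * dummy variant when the FS is disabled.
 */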
2064 template <chip CHIP>
2065 void
2066 tu6_emit_fs(struct tu_cs *cs,
2067             const struct ir3_shader_variant *fs)
2068 {
2069    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_6, 1);
2070    tu_cs_emit(cs, COND(fs && fs->reads_primid, A6XX_VFD_CONTROL_6_PRIMID4PSEN));
2071 
2072    tu_cs_emit_regs(cs, A6XX_PC_PS_CNTL(.primitiveiden = fs && fs->reads_primid));
2073 
2074    if (CHIP >= A7XX) {
2075       tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2));
2076       tu_cs_emit_regs(cs, A7XX_HLSQ_FS_UNKNOWN_A9AA(.consts_load_disable = false));
2077    }
2078 
2079    if (fs) {
2080       tu6_emit_fs_inputs<CHIP>(cs, fs);
2081       tu6_emit_fs_outputs<CHIP>(cs, fs);
2082    } else {
2083       /* TODO: check if these can be skipped if fs is disabled */
2084       struct ir3_shader_variant dummy_variant = {};
2085       tu6_emit_fs_inputs<CHIP>(cs, &dummy_variant);
2086       tu6_emit_fs_outputs<CHIP>(cs, &dummy_variant);
2087    }
2088 }
2089 TU_GENX(tu6_emit_fs);
2090 
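/* Emit all state for a single shader variant: compute variants go through
 * tu6_emit_cs_config(), graphics variants get the common per-stage registers
 * and program binary (tu6_emit_xs) followed by the stage-specific
 * configuration above.
 */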
2091 template <chip CHIP>
2092 static void
2093 tu6_emit_variant(struct tu_cs *cs,
2094                  gl_shader_stage stage,
2095                  const struct ir3_shader_variant *xs,
2096                  struct tu_pvtmem_config *pvtmem_config,
2097                  uint32_t view_mask,
2098                  uint64_t binary_iova)
2099 {
2100    if (stage == MESA_SHADER_COMPUTE) {
2101       tu6_emit_cs_config<CHIP>(cs, xs, pvtmem_config, binary_iova);
2102       return;
2103    }
2104 
2105    tu6_emit_xs(cs, stage, xs, pvtmem_config, binary_iova);
2106 
2107    switch (stage) {
2108    case MESA_SHADER_VERTEX:
2109       tu6_emit_vs<CHIP>(cs, xs, view_mask);
2110       break;
2111    case MESA_SHADER_TESS_CTRL:
2112       tu6_emit_hs<CHIP>(cs, xs);
2113       break;
2114    case MESA_SHADER_TESS_EVAL:
2115       tu6_emit_ds<CHIP>(cs, xs);
2116       break;
2117    case MESA_SHADER_GEOMETRY:
2118       tu6_emit_gs<CHIP>(cs, xs);
2119       break;
2120    case MESA_SHADER_FRAGMENT:
2121       tu6_emit_fs<CHIP>(cs, xs);
2122       break;
2123    default:
2124       unreachable("unknown shader stage");
2125    }
2126 }
2127 
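/* Set up the private (scratch) memory configuration for a shader, reusing
 * and growing the device-level per-wave or per-fiber private memory BO as
 * described in the comment below.
 */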
2128 static VkResult
2129 tu_setup_pvtmem(struct tu_device *dev,
2130                 struct tu_shader *shader,
2131                 struct tu_pvtmem_config *config,
2132                 uint32_t pvtmem_bytes,
2133                 bool per_wave)
2134 {
2135    if (!pvtmem_bytes) {
2136       memset(config, 0, sizeof(*config));
2137       return VK_SUCCESS;
2138    }
2139 
2140    /* There is a substantial memory footprint from private memory BOs being
2141     * allocated on a per-pipeline basis, and it isn't required: the same BO
2142     * can be used by multiple pipelines as long as they share the same
2143     * private memory layout (sizes and per-wave/per-fiber). Otherwise an
2144     * active pipeline using the same BO with a differing private memory
2145     * layout could overwrite it, resulting in memory corruption.
2146     *
2147     * To avoid this, we create private memory BOs on a per-device level with
2148     * an associated private memory layout then dynamically grow them when
2149     * needed and reuse them across pipelines. Growth is done in terms of
2150     * powers of two so that we can avoid frequent reallocation of the
2151     * private memory BOs.
2152     */
2153 
2154    struct tu_pvtmem_bo *pvtmem_bo =
2155       per_wave ? &dev->wave_pvtmem_bo : &dev->fiber_pvtmem_bo;
2156    mtx_lock(&pvtmem_bo->mtx);
2157 
2158    if (pvtmem_bo->per_fiber_size < pvtmem_bytes) {
2159       if (pvtmem_bo->bo)
2160          tu_bo_finish(dev, pvtmem_bo->bo);
2161 
2162       pvtmem_bo->per_fiber_size =
2163          util_next_power_of_two(ALIGN(pvtmem_bytes, 512));
2164       pvtmem_bo->per_sp_size =
2165          ALIGN(pvtmem_bo->per_fiber_size *
2166                   dev->physical_device->info->fibers_per_sp,
2167                1 << 12);
2168       uint32_t total_size =
2169          dev->physical_device->info->num_sp_cores * pvtmem_bo->per_sp_size;
2170 
2171       VkResult result = tu_bo_init_new(dev, NULL, &pvtmem_bo->bo, total_size,
2172                                        TU_BO_ALLOC_INTERNAL_RESOURCE, "pvtmem");
2173       if (result != VK_SUCCESS) {
2174          mtx_unlock(&pvtmem_bo->mtx);
2175          return result;
2176       }
2177    }
2178 
2179    config->per_wave = per_wave;
2180    config->per_fiber_size = pvtmem_bo->per_fiber_size;
2181    config->per_sp_size = pvtmem_bo->per_sp_size;
2182 
2183    shader->pvtmem_bo = tu_bo_get_ref(pvtmem_bo->bo);
2184    config->iova = shader->pvtmem_bo->iova;
2185 
2186    mtx_unlock(&pvtmem_bo->mtx);
2187 
2188    return VK_SUCCESS;
2189 }
2190 
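/* Copy a variant's instructions into the shader's suballocated CS memory and
 * return the GPU address of the binary (0 if there is no variant).
 */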
2191 static uint64_t
2192 tu_upload_variant(struct tu_cs *cs,
2193                   const struct ir3_shader_variant *variant)
2194 {
2195    struct tu_cs_memory memory;
2196 
2197    if (!variant)
2198       return 0;
2199 
2200    /* this expects to get enough alignment because shaders are allocated first
2201     * and total size is always aligned correctly
2202     * note: an assert in tu6_emit_xs_config validates the alignment
2203     */
2204    tu_cs_alloc(cs, variant->info.size / 4, 1, &memory);
2205 
2206    memcpy(memory.map, variant->bin, variant->info.size);
2207    return memory.iova;
2208 }
2209 
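/* Allocate GPU memory for a shader and its binning/safe-const variants, set
 * up private memory, upload the binaries, and record the draw states that
 * program each variant (plus an empty VPC state used while binning).
 */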
2210 static VkResult
2211 tu_upload_shader(struct tu_device *dev,
2212                  struct tu_shader *shader)
2213 {
2214    const struct ir3_shader_variant *v = shader->variant;
2215    const struct ir3_shader_variant *binning = v ? v->binning : NULL;
2216    const struct ir3_shader_variant *safe_const = shader->safe_const_variant;
2217 
2218    if (v->type == MESA_SHADER_VERTEX && v->stream_output.num_outputs != 0)
2219       binning = v;
2220 
2221    uint32_t size = 0;
2222    if (v->type == MESA_SHADER_VERTEX)
2223       size += TU6_EMIT_VFD_DEST_MAX_DWORDS;
2224 
2225    const unsigned xs_size = 128;
2226    const unsigned vpc_size = 32 + (v->stream_output.num_outputs != 0 ? 256 : 0);
2227 
2228    size += xs_size + tu_xs_get_additional_cs_size_dwords(v);
2229    size += v->info.size / 4;
2230    if (binning) {
2231       size += xs_size + tu_xs_get_additional_cs_size_dwords(binning);
2232       size += binning->info.size / 4;
2233    }
2234 
2235    if (safe_const) {
2236       size += xs_size + tu_xs_get_additional_cs_size_dwords(safe_const);
2237       size += safe_const->info.size / 4;
2238    }
2239 
2240    /* We emit an empty VPC including streamout state in the binning draw state */
2241    if (binning || v->type == MESA_SHADER_GEOMETRY) {
2242       size += vpc_size;
2243    }
2244 
2245    pthread_mutex_lock(&dev->pipeline_mutex);
2246    VkResult result = tu_suballoc_bo_alloc(&shader->bo, &dev->pipeline_suballoc,
2247                                           size * 4, 128);
2248    pthread_mutex_unlock(&dev->pipeline_mutex);
2249 
2250    if (result != VK_SUCCESS)
2251       return result;
2252 
2253    uint32_t pvtmem_size = v->pvtmem_size;
2254    bool per_wave = v->pvtmem_per_wave;
2255 
2256    if (v->binning) {
2257       pvtmem_size = MAX2(pvtmem_size, shader->variant->binning->pvtmem_size);
2258       if (!shader->variant->binning->pvtmem_per_wave)
2259          per_wave = false;
2260    }
2261 
2262    if (shader->safe_const_variant) {
2263       pvtmem_size = MAX2(pvtmem_size, shader->safe_const_variant->pvtmem_size);
2264       if (!shader->safe_const_variant->pvtmem_per_wave)
2265          per_wave = false;
2266 
2267       if (shader->safe_const_variant->binning) {
2268          pvtmem_size = MAX2(pvtmem_size, shader->safe_const_variant->binning->pvtmem_size);
2269          if (!shader->safe_const_variant->binning->pvtmem_per_wave)
2270             per_wave = false;
2271       }
2272    }
2273 
2274    struct tu_pvtmem_config pvtmem_config;
2275 
2276    result = tu_setup_pvtmem(dev, shader, &pvtmem_config, pvtmem_size, per_wave);
2277    if (result != VK_SUCCESS) {
2278       pthread_mutex_lock(&dev->pipeline_mutex);
2279       tu_suballoc_bo_free(&dev->pipeline_suballoc, &shader->bo);
2280       pthread_mutex_unlock(&dev->pipeline_mutex);
2281       return result;
2282    }
2283 
2284    TU_RMV(cmd_buffer_suballoc_bo_create, dev, &shader->bo);
2285    tu_cs_init_suballoc(&shader->cs, dev, &shader->bo);
2286 
2287    uint64_t iova = tu_upload_variant(&shader->cs, v);
2288    uint64_t binning_iova = tu_upload_variant(&shader->cs, binning);
2289    uint64_t safe_const_iova = tu_upload_variant(&shader->cs, safe_const);
2290 
2291    struct tu_cs sub_cs;
2292    tu_cs_begin_sub_stream(&shader->cs, xs_size +
2293                           tu_xs_get_additional_cs_size_dwords(v), &sub_cs);
2294    TU_CALLX(dev, tu6_emit_variant)(
2295       &sub_cs, shader->variant->type, shader->variant, &pvtmem_config,
2296       shader->view_mask, iova);
2297    shader->state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2298 
2299    if (safe_const) {
2300       tu_cs_begin_sub_stream(&shader->cs, xs_size +
2301                              tu_xs_get_additional_cs_size_dwords(safe_const), &sub_cs);
2302       TU_CALLX(dev, tu6_emit_variant)(
2303          &sub_cs, v->type, safe_const, &pvtmem_config, shader->view_mask,
2304          safe_const_iova);
2305       shader->safe_const_state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2306    }
2307 
2308    if (binning) {
2309       tu_cs_begin_sub_stream(&shader->cs, xs_size + vpc_size +
2310                              tu_xs_get_additional_cs_size_dwords(binning), &sub_cs);
2311       TU_CALLX(dev, tu6_emit_variant)(
2312          &sub_cs, v->type, binning, &pvtmem_config, shader->view_mask,
2313          binning_iova);
2314       /* emit an empty VPC */
2315       TU_CALLX(dev, tu6_emit_vpc)(&sub_cs, binning, NULL, NULL, NULL, NULL);
2316       shader->binning_state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2317    }
2318 
2319    /* We don't support binning variants for GS, so the same draw state is used
2320     * when binning and when drawing, but the VPC draw state is not executed
2321     * when binning so we still need to generate an appropriate VPC config for
2322     * binning.
2323     */
2324    if (v->type == MESA_SHADER_GEOMETRY) {
2325       tu_cs_begin_sub_stream(&shader->cs, vpc_size, &sub_cs);
2326       TU_CALLX(dev, tu6_emit_vpc)(&sub_cs, NULL, NULL, NULL, v, NULL);
2327       shader->binning_state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2328    }
2329 
2330    return VK_SUCCESS;
2331 }
2332 
2333 static bool
2334 tu_shader_serialize(struct vk_pipeline_cache_object *object,
2335                     struct blob *blob);
2336 
2337 static struct vk_pipeline_cache_object *
2338 tu_shader_deserialize(struct vk_pipeline_cache *cache,
2339                       const void *key_data,
2340                       size_t key_size,
2341                       struct blob_reader *blob);
2342 
2343 static void
2344 tu_shader_pipeline_cache_object_destroy(struct vk_device *vk_device,
2345                                         struct vk_pipeline_cache_object *object)
2346 {
2347    struct tu_device *device = container_of(vk_device, struct tu_device, vk);
2348    struct tu_shader *shader =
2349       container_of(object, struct tu_shader, base);
2350 
2351    vk_pipeline_cache_object_finish(&shader->base);
2352    tu_shader_destroy(device, shader);
2353 }
2354 
2355 const struct vk_pipeline_cache_object_ops tu_shader_ops = {
2356    .serialize = tu_shader_serialize,
2357    .deserialize = tu_shader_deserialize,
2358    .destroy = tu_shader_pipeline_cache_object_destroy,
2359 };
2360 
2361 static struct tu_shader *
2362 tu_shader_init(struct tu_device *dev, const void *key_data, size_t key_size)
2363 {
2364    VK_MULTIALLOC(ma);
2365    VK_MULTIALLOC_DECL(&ma, struct tu_shader, shader, 1);
2366    VK_MULTIALLOC_DECL_SIZE(&ma, char, obj_key_data, key_size);
2367 
2368    if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc,
2369                              VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
2370       return NULL;
2371 
2372    memcpy(obj_key_data, key_data, key_size);
2373 
2374    vk_pipeline_cache_object_init(&dev->vk, &shader->base,
2375                                  &tu_shader_ops, obj_key_data, key_size);
2376 
2377    shader->const_state.fdm_ubo.idx = -1;
2378    shader->const_state.dynamic_offsets_ubo.idx = -1;
2379    shader->const_state.inline_uniforms_ubo.idx = -1;
2380 
2381    return shader;
2382 }
2383 
2384 static bool
2385 tu_shader_serialize(struct vk_pipeline_cache_object *object,
2386                     struct blob *blob)
2387 {
2388    struct tu_shader *shader =
2389       container_of(object, struct tu_shader, base);
2390 
2391    blob_write_bytes(blob, &shader->const_state, sizeof(shader->const_state));
2392    blob_write_bytes(blob, &shader->dynamic_descriptor_sizes,
2393                     sizeof(shader->dynamic_descriptor_sizes));
2394    blob_write_uint32(blob, shader->view_mask);
2395    blob_write_uint8(blob, shader->active_desc_sets);
2396 
2397    ir3_store_variant(blob, shader->variant);
2398 
2399    if (shader->safe_const_variant) {
2400       blob_write_uint8(blob, 1);
2401       ir3_store_variant(blob, shader->safe_const_variant);
2402    } else {
2403       blob_write_uint8(blob, 0);
2404    }
2405 
2406 
2407 
2408    switch (shader->variant->type) {
2409    case MESA_SHADER_TESS_EVAL:
2410       blob_write_bytes(blob, &shader->tes, sizeof(shader->tes));
2411       break;
2412    case MESA_SHADER_FRAGMENT:
2413       blob_write_bytes(blob, &shader->fs, sizeof(shader->fs));
2414       break;
2415    default:
2416       break;
2417    }
2418 
2419    return true;
2420 }
2421 
2422 static struct vk_pipeline_cache_object *
2423 tu_shader_deserialize(struct vk_pipeline_cache *cache,
2424                       const void *key_data,
2425                       size_t key_size,
2426                       struct blob_reader *blob)
2427 {
2428    struct tu_device *dev =
2429       container_of(cache->base.device, struct tu_device, vk);
2430    struct tu_shader *shader =
2431       tu_shader_init(dev, key_data, key_size);
2432 
2433    if (!shader)
2434       return NULL;
2435 
2436    blob_copy_bytes(blob, &shader->const_state, sizeof(shader->const_state));
2437    blob_copy_bytes(blob, &shader->dynamic_descriptor_sizes,
2438                    sizeof(shader->dynamic_descriptor_sizes));
2439    shader->view_mask = blob_read_uint32(blob);
2440    shader->active_desc_sets = blob_read_uint8(blob);
2441 
2442    shader->variant = ir3_retrieve_variant(blob, dev->compiler, NULL);
2443 
2444    bool has_safe_const = blob_read_uint8(blob);
2445    if (has_safe_const)
2446       shader->safe_const_variant = ir3_retrieve_variant(blob, dev->compiler, NULL);
2447 
2448    switch (shader->variant->type) {
2449    case MESA_SHADER_TESS_EVAL:
2450       blob_copy_bytes(blob, &shader->tes, sizeof(shader->tes));
2451       break;
2452    case MESA_SHADER_FRAGMENT:
2453       blob_copy_bytes(blob, &shader->fs, sizeof(shader->fs));
2454       break;
2455    default:
2456       break;
2457    }
2458 
2459    VkResult result = tu_upload_shader(dev, shader);
2460    if (result != VK_SUCCESS) {
2461       vk_free(&dev->vk.alloc, shader);
2462       return NULL;
2463    }
2464 
2465    return &shader->base;
2466 }
2467 
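/* Compile a tu_shader from NIR: run the Turnip/ir3 lowering passes (input
 * attachments, FDM, multiview, explicit IO, shared memory, descriptor and
 * push-constant layout), hand the result to ir3 to create the shader
 * variant(s), including a safe-constlen variant when needed, and fill in the
 * stage-specific tess/FS state.
 */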
2468 VkResult
2469 tu_shader_create(struct tu_device *dev,
2470                  struct tu_shader **shader_out,
2471                  nir_shader *nir,
2472                  const struct tu_shader_key *key,
2473                  const struct ir3_shader_key *ir3_key,
2474                  const void *key_data,
2475                  size_t key_size,
2476                  struct tu_pipeline_layout *layout,
2477                  bool executable_info)
2478 {
2479    struct tu_shader *shader = tu_shader_init(dev, key_data, key_size);
2480 
2481    if (!shader)
2482       return VK_ERROR_OUT_OF_HOST_MEMORY;
2483 
2484    const nir_opt_access_options access_options = {
2485       .is_vulkan = true,
2486    };
2487    NIR_PASS_V(nir, nir_opt_access, &access_options);
2488 
2489    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
2490       const nir_input_attachment_options att_options = {
2491          .use_fragcoord_sysval = true,
2492          .use_layer_id_sysval = false,
2493          /* When using multiview rendering, we must use
2494           * gl_ViewIndex as the layer id to pass to the texture
2495           * sampling function. gl_Layer doesn't work when
2496           * multiview is enabled.
2497           */
2498          .use_view_id_for_layer = key->multiview_mask != 0,
2499          .unscaled_depth_stencil_ir3 =
2500             key->dynamic_renderpass && !(key->read_only_input_attachments & 1),
2501          .unscaled_input_attachment_ir3 =
2502             key->dynamic_renderpass ?
2503             ~(key->read_only_input_attachments >> 1) :
2504             key->unscaled_input_fragcoord,
2505       };
2506       NIR_PASS_V(nir, nir_lower_input_attachments, &att_options);
2507    }
2508 
2509    /* This has to happen before lower_input_attachments, because we have to
2510     * lower input attachment coordinates except if unscaled.
2511     */
2512    const struct lower_fdm_options fdm_options = {
2513       .num_views = MAX2(util_last_bit(key->multiview_mask), 1),
2514       .adjust_fragcoord = key->fragment_density_map,
2515    };
2516    NIR_PASS_V(nir, tu_nir_lower_fdm, &fdm_options);
2517 
2518 
2519    /* This needs to happen before multiview lowering which rewrites store
2520     * instructions of the position variable, so that we can just rewrite one
2521     * store at the end instead of having to rewrite every store specified by
2522     * the user.
2523     */
2524    ir3_nir_lower_io_to_temporaries(nir);
2525 
2526    if (nir->info.stage == MESA_SHADER_VERTEX && key->multiview_mask) {
2527       tu_nir_lower_multiview(nir, key->multiview_mask, dev);
2528    }
2529 
2530    if (nir->info.stage == MESA_SHADER_FRAGMENT && key->force_sample_interp) {
2531       nir_foreach_shader_in_variable(var, nir) {
2532          if (!var->data.centroid)
2533             var->data.sample = true;
2534       }
2535    }
2536 
2537    NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_push_const,
2538               nir_address_format_32bit_offset);
2539 
2540    NIR_PASS_V(nir, nir_lower_explicit_io,
2541               nir_var_mem_ubo | nir_var_mem_ssbo,
2542               nir_address_format_vec2_index_32bit_offset);
2543 
2544    NIR_PASS_V(nir, nir_lower_explicit_io,
2545               nir_var_mem_global,
2546               nir_address_format_64bit_global);
2547 
2548    if (nir->info.stage == MESA_SHADER_COMPUTE) {
2549       if (!nir->info.shared_memory_explicit_layout) {
2550          NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
2551                     nir_var_mem_shared, shared_type_info);
2552       }
2553       NIR_PASS_V(nir, nir_lower_explicit_io,
2554                  nir_var_mem_shared,
2555                  nir_address_format_32bit_offset);
2556 
2557       if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
2558          const unsigned chunk_size = 16; /* max single store size */
2559          /* Shared memory is allocated in 1024b chunks in HW, but the zero-init
2560           * extension only requires us to initialize the memory that the shader
2561           * is allocated at the API level, and it's up to the user to ensure
2562           * that accesses are limited to those bounds.
2563           */
2564          const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
2565          NIR_PASS_V(nir, nir_zero_initialize_shared_memory, shared_size, chunk_size);
2566       }
2567 
2568       const struct nir_lower_compute_system_values_options compute_sysval_options = {
2569          .has_base_workgroup_id = true,
2570       };
2571       NIR_PASS_V(nir, nir_lower_compute_system_values, &compute_sysval_options);
2572    }
2573 
2574    nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
2575    nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);
2576 
2577    /* Gather information for transform feedback. This should be called after:
2578     * - nir_split_per_member_structs.
2579     * - nir_remove_dead_variables with varyings, so that we could align
2580     *   stream outputs correctly.
2581     * - nir_assign_io_var_locations - to have valid driver_location
2582     */
2583    struct ir3_stream_output_info so_info = {};
2584    if (nir->info.stage == MESA_SHADER_VERTEX ||
2585          nir->info.stage == MESA_SHADER_TESS_EVAL ||
2586          nir->info.stage == MESA_SHADER_GEOMETRY)
2587       tu_gather_xfb_info(nir, &so_info);
2588 
2589    for (unsigned i = 0; i < layout->num_sets; i++) {
2590       if (layout->set[i].layout) {
2591          shader->dynamic_descriptor_sizes[i] =
2592             layout->set[i].layout->dynamic_offset_size;
2593       } else {
2594          shader->dynamic_descriptor_sizes[i] = -1;
2595       }
2596    }
2597 
2598    {
2599       /* Lower 64b push constants before lowering IO. */
2600       nir_lower_mem_access_bit_sizes_options options = {
2601          .callback = ir3_mem_access_size_align,
2602          .modes = nir_var_mem_push_const,
2603       };
2604 
2605       NIR_PASS_V(nir, nir_lower_mem_access_bit_sizes, &options);
2606    }
2607 
2608    struct ir3_const_allocations const_allocs = {};
2609    NIR_PASS_V(nir, tu_lower_io, dev, shader, layout,
2610               key->read_only_input_attachments, key->dynamic_renderpass,
2611               &const_allocs);
2612 
2613    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
2614 
2615    struct ir3_shader_nir_options nir_options;
2616    init_ir3_nir_options(&nir_options, key);
2617 
2618    ir3_finalize_nir(dev->compiler, &nir_options, nir);
2619 
2620    const struct ir3_shader_options options = {
2621       .api_wavesize = key->api_wavesize,
2622       .real_wavesize = key->real_wavesize,
2623       .push_consts_type = shader->const_state.push_consts.type,
2624       .push_consts_base = shader->const_state.push_consts.lo_dwords,
2625       .push_consts_dwords = shader->const_state.push_consts.dwords,
2626       .const_allocs = const_allocs,
2627       .nir_options = nir_options,
2628    };
2629 
2630    struct ir3_shader *ir3_shader =
2631       ir3_shader_from_nir(dev->compiler, nir, &options, &so_info);
2632 
2633    shader->variant =
2634       ir3_shader_create_variant(ir3_shader, ir3_key, executable_info);
2635 
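   /* If the variant exceeds the safe constlen, also build a fallback variant
    * compiled with safe_constlen set.
    */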
2636    if (ir3_exceeds_safe_constlen(shader->variant)) {
2637       struct ir3_shader_key safe_constlen_key = *ir3_key;
2638       safe_constlen_key.safe_constlen = true;
2639       shader->safe_const_variant =
2640          ir3_shader_create_variant(ir3_shader, &safe_constlen_key,
2641                                    executable_info);
2642    }
2643 
2644    ir3_shader_destroy(ir3_shader);
2645 
2646    shader->view_mask = key->multiview_mask;
2647 
2648    switch (shader->variant->type) {
2649    case MESA_SHADER_TESS_EVAL: {
2650       const struct ir3_shader_variant *tes = shader->variant;
2651       if (tes->tess.point_mode) {
2652          shader->tes.tess_output_lower_left =
2653             shader->tes.tess_output_upper_left = TESS_POINTS;
2654       } else if (tes->tess.primitive_mode == TESS_PRIMITIVE_ISOLINES) {
2655          shader->tes.tess_output_lower_left =
2656             shader->tes.tess_output_upper_left = TESS_LINES;
2657       } else if (tes->tess.ccw) {
2658          /* Tessellation orientation in HW is specified with a lower-left
2659           * origin, so we need to swap the winding if the origin is upper-left.
2660           */
2661          shader->tes.tess_output_lower_left = TESS_CCW_TRIS;
2662          shader->tes.tess_output_upper_left = TESS_CW_TRIS;
2663       } else {
2664          shader->tes.tess_output_lower_left = TESS_CW_TRIS;
2665          shader->tes.tess_output_upper_left = TESS_CCW_TRIS;
2666       }
2667 
2668       switch (tes->tess.spacing) {
2669       case TESS_SPACING_EQUAL:
2670          shader->tes.tess_spacing = TESS_EQUAL;
2671          break;
2672       case TESS_SPACING_FRACTIONAL_ODD:
2673          shader->tes.tess_spacing = TESS_FRACTIONAL_ODD;
2674          break;
2675       case TESS_SPACING_FRACTIONAL_EVEN:
2676          shader->tes.tess_spacing = TESS_FRACTIONAL_EVEN;
2677          break;
2678       case TESS_SPACING_UNSPECIFIED:
2679       default:
2680          unreachable("invalid tess spacing");
2681       }
2682 
2683       break;
2684    }
2685    case MESA_SHADER_FRAGMENT: {
2686       const struct ir3_shader_variant *fs = shader->variant;
2687       shader->fs.per_samp = fs->per_samp || ir3_key->sample_shading;
2688       shader->fs.has_fdm = key->fragment_density_map;
2689       if (fs->has_kill)
2690          shader->fs.lrz.status |= TU_LRZ_FORCE_DISABLE_WRITE;
2691       if (fs->no_earlyz || (fs->writes_pos && !fs->fs.early_fragment_tests))
2692          shader->fs.lrz.status = TU_LRZ_FORCE_DISABLE_LRZ;
2693       /* FDM isn't compatible with LRZ, because the LRZ image uses the original
2694        * resolution and we would need to use the low resolution.
2695        *
2696        * TODO: Use a patchpoint to only disable LRZ for scaled bins.
2697        */
2698       if (key->fragment_density_map)
2699          shader->fs.lrz.status = TU_LRZ_FORCE_DISABLE_LRZ;
2700       if (!fs->fs.early_fragment_tests &&
2701           (fs->no_earlyz || fs->writes_pos || fs->writes_stencilref || fs->writes_smask)) {
2702          shader->fs.lrz.force_late_z = true;
2703       }
2704       break;
2705    }
2706    default:
2707       break;
2708    }
2709 
2710    VkResult result = tu_upload_shader(dev, shader);
2711    if (result != VK_SUCCESS) {
2712       vk_free(&dev->vk.alloc, shader);
2713       return result;
2714    }
2715 
2716    *shader_out = shader;
2717    return VK_SUCCESS;
2718 }
2719 
2720 static void
2721 tu_link_shaders(nir_shader **shaders, unsigned shaders_count)
2722 {
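   /* Walk the stages from last to first so that each producer is linked and
    * optimized against its already-processed consumer.
    */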
2723    nir_shader *consumer = NULL;
2724    for (gl_shader_stage stage = (gl_shader_stage) (shaders_count - 1);
2725         stage >= MESA_SHADER_VERTEX; stage = (gl_shader_stage) (stage - 1)) {
2726       if (!shaders[stage])
2727          continue;
2728 
2729       nir_shader *producer = shaders[stage];
2730       if (!consumer) {
2731          consumer = producer;
2732          continue;
2733       }
2734 
2735       if (nir_link_opt_varyings(producer, consumer)) {
2736          NIR_PASS_V(consumer, nir_opt_constant_folding);
2737          NIR_PASS_V(consumer, nir_opt_algebraic);
2738          NIR_PASS_V(consumer, nir_opt_dce);
2739       }
2740 
2741       const nir_remove_dead_variables_options out_var_opts = {
2742          .can_remove_var = nir_vk_is_not_xfb_output,
2743       };
2744       NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, &out_var_opts);
2745 
2746       NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
2747 
2748       bool progress = nir_remove_unused_varyings(producer, consumer);
2749 
2750       nir_compact_varyings(producer, consumer, true);
2751       if (progress) {
2752          if (nir_lower_global_vars_to_local(producer)) {
2753             /* Remove dead writes, which can remove input loads */
2754             NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_temp, NULL);
2755             NIR_PASS_V(producer, nir_opt_dce);
2756          }
2757          nir_lower_global_vars_to_local(consumer);
2758       }
2759 
2760       consumer = producer;
2761    }
2762 
2763    /* Gather info after linking so that we can fill out the ir3 shader key.
2764     */
2765    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2766         stage <= MESA_SHADER_FRAGMENT; stage = (gl_shader_stage) (stage + 1)) {
2767       if (shaders[stage])
2768          nir_shader_gather_info(shaders[stage],
2769                                 nir_shader_get_entrypoint(shaders[stage]));
2770    }
2771 }
2772 
2773 static uint32_t
2774 tu6_get_tessmode(const struct nir_shader *shader)
2775 {
2776    enum tess_primitive_mode primitive_mode = shader->info.tess._primitive_mode;
2777    switch (primitive_mode) {
2778    case TESS_PRIMITIVE_ISOLINES:
2779       return IR3_TESS_ISOLINES;
2780    case TESS_PRIMITIVE_TRIANGLES:
2781       return IR3_TESS_TRIANGLES;
2782    case TESS_PRIMITIVE_QUADS:
2783       return IR3_TESS_QUADS;
2784    case TESS_PRIMITIVE_UNSPECIFIED:
2785       return IR3_TESS_NONE;
2786    default:
2787       unreachable("bad tessmode");
2788    }
2789 }
2790 
2791 VkResult
2792 tu_compile_shaders(struct tu_device *device,
2793                    VkPipelineCreateFlags2KHR pipeline_flags,
2794                    const VkPipelineShaderStageCreateInfo **stage_infos,
2795                    nir_shader **nir,
2796                    const struct tu_shader_key *keys,
2797                    struct tu_pipeline_layout *layout,
2798                    const unsigned char *pipeline_sha1,
2799                    struct tu_shader **shaders,
2800                    char **nir_initial_disasm,
2801                    void *nir_initial_disasm_mem_ctx,
2802                    nir_shader **nir_out,
2803                    VkPipelineCreationFeedback *stage_feedbacks)
2804 {
2805    struct ir3_shader_key ir3_key = {};
2806    VkResult result = VK_SUCCESS;
2807    void *mem_ctx = ralloc_context(NULL);
2808 
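   /* Translate each present stage from SPIR-V to NIR, recording creation
    * feedback for the time spent.
    */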
2809    for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2810         stage = (gl_shader_stage) (stage + 1)) {
2811       const VkPipelineShaderStageCreateInfo *stage_info = stage_infos[stage];
2812       if (!stage_info)
2813          continue;
2814 
2815       int64_t stage_start = os_time_get_nano();
2816 
2817       nir[stage] = tu_spirv_to_nir(device, mem_ctx, pipeline_flags,
2818                                    stage_info, &keys[stage], stage);
2819       if (!nir[stage]) {
2820          result = VK_ERROR_OUT_OF_HOST_MEMORY;
2821          goto fail;
2822       }
2823 
2824       stage_feedbacks[stage].flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
2825       stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
2826    }
2827 
2828    if (nir[MESA_SHADER_GEOMETRY])
2829       ir3_key.has_gs = true;
2830 
2831    ir3_key.sample_shading = keys[MESA_SHADER_FRAGMENT].force_sample_interp;
2832 
2833    if (nir_initial_disasm) {
2834       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2835            stage < MESA_SHADER_STAGES;
2836            stage = (gl_shader_stage) (stage + 1)) {
2837          if (!nir[stage])
2838             continue;
2839 
2840          nir_initial_disasm[stage] =
2841             nir_shader_as_str(nir[stage], nir_initial_disasm_mem_ctx);
2842       }
2843    }
2844 
2845    tu_link_shaders(nir, MESA_SHADER_STAGES);
2846 
2847    if (nir_out) {
2848       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2849            stage < MESA_SHADER_STAGES; stage = (gl_shader_stage) (stage + 1)) {
2850          if (!nir[stage])
2851             continue;
2852 
2853          nir_out[stage] = nir_shader_clone(NULL, nir[stage]);
2854       }
2855    }
2856 
2857    /* With pipelines, tessellation modes can be set on either shader, for
2858     * compatibility with HLSL and GLSL, and the driver is supposed to merge
2859     * them. Shader objects require modes to be set on at least the TES, except
2860     * for OutputVertices, which has to be set on at least the TCS. Make sure
2861     * all modes are set on the TES when compiling together multiple shaders,
2862     * and then from this point on we will use the modes in the TES (and output
2863     * vertices on the TCS).
2864     */
2865    if (nir[MESA_SHADER_TESS_EVAL]) {
2866       nir_shader *tcs = nir[MESA_SHADER_TESS_CTRL];
2867       nir_shader *tes = nir[MESA_SHADER_TESS_EVAL];
2868 
2869       if (tes->info.tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED)
2870          tes->info.tess._primitive_mode = tcs->info.tess._primitive_mode;
2871 
2872       tes->info.tess.point_mode |= tcs->info.tess.point_mode;
2873       tes->info.tess.ccw |= tcs->info.tess.ccw;
2874 
2875       if (tes->info.tess.spacing == TESS_SPACING_UNSPECIFIED) {
2876          tes->info.tess.spacing = tcs->info.tess.spacing;
2877       }
2878 
2879       if (tcs->info.tess.tcs_vertices_out == 0)
2880          tcs->info.tess.tcs_vertices_out = tes->info.tess.tcs_vertices_out;
2881 
2882       ir3_key.tessellation = tu6_get_tessmode(tes);
2883    }
2884 
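   /* The TCS must store PrimID if any later stage reads it: the FS reads it as
    * a varying, the other stages as a system value.
    */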
2885    for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2886         stage = (gl_shader_stage) (stage + 1)) {
2887       if (!nir[stage])
2888          continue;
2889 
2890       if (stage > MESA_SHADER_TESS_CTRL) {
2891          if (stage == MESA_SHADER_FRAGMENT) {
2892             ir3_key.tcs_store_primid = ir3_key.tcs_store_primid ||
2893                (nir[stage]->info.inputs_read & (1ull << VARYING_SLOT_PRIMITIVE_ID));
2894          } else {
2895             ir3_key.tcs_store_primid = ir3_key.tcs_store_primid ||
2896                BITSET_TEST(nir[stage]->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
2897          }
2898       }
2899    }
2900 
2901    /* In the tess-but-not-FS case we don't know whether the FS will read
2902     * PrimID, so we need to store it unconditionally.
2903     */
2904    if (nir[MESA_SHADER_TESS_CTRL] && !nir[MESA_SHADER_FRAGMENT])
2905       ir3_key.tcs_store_primid = true;
2906 
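   /* Compile the stages that don't already have a shader, keying each one by
    * the pipeline hash with the stage index appended.
    */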
2907    for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2908         stage = (gl_shader_stage) (stage + 1)) {
2909       if (!nir[stage] || shaders[stage])
2910          continue;
2911 
2912       int64_t stage_start = os_time_get_nano();
2913 
2914       unsigned char shader_sha1[21];
2915       memcpy(shader_sha1, pipeline_sha1, 20);
2916       shader_sha1[20] = (unsigned char) stage;
2917 
2918       result = tu_shader_create(device,
2919                                 &shaders[stage], nir[stage], &keys[stage],
2920                                 &ir3_key, shader_sha1, sizeof(shader_sha1),
2921                                 layout, !!nir_initial_disasm);
2922       if (result != VK_SUCCESS) {
2923          goto fail;
2924       }
2925 
2926       stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
2927    }
2928 
2929    ralloc_free(mem_ctx);
2930 
2931    return VK_SUCCESS;
2932 
2933 fail:
2934    ralloc_free(mem_ctx);
2935 
2936    for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2937         stage = (gl_shader_stage) (stage + 1)) {
2938       if (shaders[stage]) {
2939          tu_shader_destroy(device, shaders[stage]);
2940       }
2941       if (nir_out && nir_out[stage]) {
2942          ralloc_free(nir_out[stage]);
2943       }
2944    }
2945 
2946    return result;
2947 }
2948 
2949 void
2950 tu_shader_key_subgroup_size(struct tu_shader_key *key,
2951                             bool allow_varying_subgroup_size,
2952                             bool require_full_subgroups,
2953                             const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info,
2954                             struct tu_device *dev)
2955 {
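   /* Pick the API-visible and real wavesizes: without double threadsize
    * support both are single; otherwise honor an explicitly required subgroup
    * size, and only pin the real wavesize when full subgroups are required.
    */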
2956    enum ir3_wavesize_option api_wavesize, real_wavesize;
2957    if (!dev->physical_device->info->a6xx.supports_double_threadsize) {
2958       api_wavesize = IR3_SINGLE_ONLY;
2959       real_wavesize = IR3_SINGLE_ONLY;
2960    } else {
2961       if (allow_varying_subgroup_size) {
2962          api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE;
2963       } else {
2964          if (subgroup_info) {
2965             if (subgroup_info->requiredSubgroupSize == dev->compiler->threadsize_base) {
2966                api_wavesize = IR3_SINGLE_ONLY;
2967             } else {
2968                assert(subgroup_info->requiredSubgroupSize == dev->compiler->threadsize_base * 2);
2969                api_wavesize = IR3_DOUBLE_ONLY;
2970             }
2971          } else {
2972             /* Match the exposed subgroupSize. */
2973             api_wavesize = IR3_DOUBLE_ONLY;
2974          }
2975 
2976          if (require_full_subgroups)
2977             real_wavesize = api_wavesize;
2978          else if (api_wavesize == IR3_SINGLE_ONLY)
2979             real_wavesize = IR3_SINGLE_ONLY;
2980          else
2981             real_wavesize = IR3_SINGLE_OR_DOUBLE;
2982       }
2983    }
2984 
2985    key->api_wavesize = api_wavesize;
2986    key->real_wavesize = real_wavesize;
2987 }
2988 
2989 void
2990 tu_shader_key_robustness(struct tu_shader_key *key,
2991                          const struct vk_pipeline_robustness_state *rs)
2992 {
2993    key->robust_storage_access2 =
2994       (rs->storage_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT);
2995    key->robust_uniform_access2 =
2996       (rs->uniform_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT);
2997 }
2998 
2999 static VkResult
3000 tu_empty_shader_create(struct tu_device *dev,
3001                        struct tu_shader **shader_out,
3002                        gl_shader_stage stage)
3003 {
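   /* Build a stub shader whose draw state comes from tu6_emit_variant() with a
    * NULL variant, for use when the stage has no real shader.
    */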
3004    struct tu_shader *shader = tu_shader_init(dev, NULL, 0);
3005 
3006    if (!shader)
3007       return VK_ERROR_OUT_OF_HOST_MEMORY;
3008 
3009    pthread_mutex_lock(&dev->pipeline_mutex);
3010    VkResult result = tu_suballoc_bo_alloc(&shader->bo, &dev->pipeline_suballoc,
3011                                           32 * 4, 128);
3012    pthread_mutex_unlock(&dev->pipeline_mutex);
3013 
3014    if (result != VK_SUCCESS) {
3015       vk_free(&dev->vk.alloc, shader);
3016       return result;
3017    }
3018 
3019    TU_RMV(cmd_buffer_suballoc_bo_create, dev, &shader->bo);
3020    tu_cs_init_suballoc(&shader->cs, dev, &shader->bo);
3021 
3022    struct tu_pvtmem_config pvtmem_config = { };
3023 
3024    struct tu_cs sub_cs;
3025    tu_cs_begin_sub_stream(&shader->cs, 32, &sub_cs);
3026    TU_CALLX(dev, tu6_emit_variant)(&sub_cs, stage, NULL, &pvtmem_config, 0, 0);
3027    shader->state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
3028 
3029    *shader_out = shader;
3030    return VK_SUCCESS;
3031 }
3032 
3033 static VkResult
3034 tu_empty_fs_create(struct tu_device *dev, struct tu_shader **shader,
3035                    bool fragment_density_map)
3036 {
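   /* Compile a no-op fragment shader ("noop_fs") through ir3; LRZ is
    * force-disabled when a fragment density map may be in use.
    */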
3037    struct ir3_shader_key key = {};
3038    const struct ir3_shader_options options = {};
3039    struct ir3_stream_output_info so_info = {};
3040    const nir_shader_compiler_options *nir_options =
3041       ir3_get_compiler_options(dev->compiler);
3042    nir_builder fs_b;
3043 
3044    fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, nir_options,
3045                                          "noop_fs");
3046 
3047    *shader = tu_shader_init(dev, NULL, 0);
3048    if (!*shader)
3049       return VK_ERROR_OUT_OF_HOST_MEMORY;
3050 
3051    (*shader)->fs.has_fdm = fragment_density_map;
3052    if (fragment_density_map)
3053       (*shader)->fs.lrz.status = TU_LRZ_FORCE_DISABLE_LRZ;
3054 
3055    for (unsigned i = 0; i < MAX_SETS; i++)
3056       (*shader)->dynamic_descriptor_sizes[i] = -1;
3057 
3058    struct ir3_shader *ir3_shader =
3059       ir3_shader_from_nir(dev->compiler, fs_b.shader, &options, &so_info);
3060    (*shader)->variant = ir3_shader_create_variant(ir3_shader, &key, false);
3061    ir3_shader_destroy(ir3_shader);
3062 
3063    return tu_upload_shader(dev, *shader);
3064 }
3065 
3066 VkResult
3067 tu_init_empty_shaders(struct tu_device *dev)
3068 {
3069    VkResult result;
3070 
3071    result = tu_empty_shader_create(dev, &dev->empty_tcs, MESA_SHADER_TESS_CTRL);
3072    if (result != VK_SUCCESS)
3073       goto out;
3074 
3075    result = tu_empty_shader_create(dev, &dev->empty_tes, MESA_SHADER_TESS_EVAL);
3076    if (result != VK_SUCCESS)
3077       goto out;
3078 
3079    result = tu_empty_shader_create(dev, &dev->empty_gs, MESA_SHADER_GEOMETRY);
3080    if (result != VK_SUCCESS)
3081       goto out;
3082 
3083    result = tu_empty_fs_create(dev, &dev->empty_fs, false);
3084    if (result != VK_SUCCESS)
3085       goto out;
3086 
3087    result = tu_empty_fs_create(dev, &dev->empty_fs_fdm, true);
3088    if (result != VK_SUCCESS)
3089       goto out;
3090 
3091    return VK_SUCCESS;
3092 
3093 out:
3094    if (dev->empty_tcs)
3095       vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tcs->base);
3096    if (dev->empty_tes)
3097       vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tes->base);
3098    if (dev->empty_gs)
3099       vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_gs->base);
3100    if (dev->empty_fs)
3101       vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs->base);
3102    if (dev->empty_fs_fdm)
3103       vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs_fdm->base);
3104    return result;
3105 }
3106 
3107 void
3108 tu_destroy_empty_shaders(struct tu_device *dev)
3109 {
3110    vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tcs->base);
3111    vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tes->base);
3112    vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_gs->base);
3113    vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs->base);
3114    vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs_fdm->base);
3115 }
3116 
3117 void
3118 tu_shader_destroy(struct tu_device *dev,
3119                   struct tu_shader *shader)
3120 {
3121    tu_cs_finish(&shader->cs);
3122    TU_RMV(resource_destroy, dev, &shader->bo);
3123 
3124    pthread_mutex_lock(&dev->pipeline_mutex);
3125    tu_suballoc_bo_free(&dev->pipeline_suballoc, &shader->bo);
3126    pthread_mutex_unlock(&dev->pipeline_mutex);
3127 
3128    if (shader->pvtmem_bo)
3129       tu_bo_finish(dev, shader->pvtmem_bo);
3130 
3131    if (shader->variant)
3132       ralloc_free((void *)shader->variant);
3133    if (shader->safe_const_variant)
3134       ralloc_free((void *)shader->safe_const_variant);
3135 
3136    vk_free(&dev->vk.alloc, shader);
3137 }
3138