1 /*
2 * Copyright © 2019 Google LLC
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "tu_shader.h"
7
8 #include "spirv/nir_spirv.h"
9 #include "util/mesa-sha1.h"
10 #include "nir/nir_xfb_info.h"
11 #include "vk_nir.h"
12 #include "vk_nir_convert_ycbcr.h"
13 #include "vk_pipeline.h"
14 #include "vk_util.h"
15
16 #include "ir3/ir3_compiler.h"
17 #include "ir3/ir3_nir.h"
18
19 #include "tu_device.h"
20 #include "tu_descriptor_set.h"
21 #include "tu_lrz.h"
22 #include "tu_pipeline.h"
23 #include "tu_rmv.h"
24
25 #include <initializer_list>
26
27 static void
28 init_ir3_nir_options(struct ir3_shader_nir_options *options,
29 const struct tu_shader_key *key)
30 {
31 *options = {
32 .robust_modes = (nir_variable_mode)
33 ((key->robust_storage_access2 ? nir_var_mem_ssbo : 0) |
34 (key->robust_uniform_access2 ? nir_var_mem_ubo : 0)),
35 };
36 }
37
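/* Translate one pipeline shader stage from SPIR-V to NIR and run the early,
 * layout-independent lowering and optimization passes that are shared by all
 * stages; descriptor-layout-dependent lowering happens later in tu_lower_io().
 */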
38 nir_shader *
39 tu_spirv_to_nir(struct tu_device *dev,
40 void *mem_ctx,
41 VkPipelineCreateFlags2KHR pipeline_flags,
42 const VkPipelineShaderStageCreateInfo *stage_info,
43 const struct tu_shader_key *key,
44 gl_shader_stage stage)
45 {
46 /* TODO these are made-up */
47 const struct spirv_to_nir_options spirv_options = {
48 /* ViewID is a sysval in geometry stages and an input in the FS */
49 .view_index_is_input =
50 stage == MESA_SHADER_FRAGMENT &&
51 !key->lower_view_index_to_device_index,
52
53 /* Use 16-bit math for RelaxedPrecision ALU ops */
54 .mediump_16bit_alu = true,
55
56 .ubo_addr_format = nir_address_format_vec2_index_32bit_offset,
57 .ssbo_addr_format = nir_address_format_vec2_index_32bit_offset,
58
59 /* Accessed via stg/ldg */
60 .phys_ssbo_addr_format = nir_address_format_64bit_global,
61
62 /* Accessed via the const register file */
63 .push_const_addr_format = nir_address_format_logical,
64
65 /* Accessed via ldl/stl */
66 .shared_addr_format = nir_address_format_32bit_offset,
67
68 /* Accessed via stg/ldg (not used with Vulkan?) */
69 .global_addr_format = nir_address_format_64bit_global,
70 };
71
72 const nir_shader_compiler_options *nir_options =
73 ir3_get_compiler_options(dev->compiler);
74
75 nir_shader *nir;
76 VkResult result =
77 vk_pipeline_shader_stage_to_nir(&dev->vk, pipeline_flags, stage_info,
78 &spirv_options, nir_options,
79 mem_ctx, &nir);
80 if (result != VK_SUCCESS)
81 return NULL;
82
83 /* ir3 uses num_ubos and num_ssbos to track the number of *bindful*
84 * UBOs/SSBOs, but spirv_to_nir sets them to the total number of objects
85 * which is useless for us, so reset them here.
86 */
87 nir->info.num_ubos = 0;
88 nir->info.num_ssbos = 0;
89
90 if (TU_DEBUG(NIR)) {
91 fprintf(stderr, "translated nir:\n");
92 nir_print_shader(nir, stderr);
93 }
94
95 const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = {
96 .point_coord = true,
97 };
98 NIR_PASS_V(nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings);
99
100 NIR_PASS_V(nir, nir_lower_global_vars_to_local);
101
102 /* Older glslang missing bf6efd0316d8 ("SPV: Fix #2293: keep relaxed
103 * precision on arg passed to relaxed param") will pass function args through
104 * a highp temporary, so we need the nir_opt_find_array_copies() and a copy
105 * prop before we lower mediump vars, or you'll be unable to optimize out
106 * array copies after lowering. We do this before splitting copies, since
107 * that works against nir_opt_find_array_copies().
108 */
109 NIR_PASS_V(nir, nir_opt_find_array_copies);
110 NIR_PASS_V(nir, nir_opt_copy_prop_vars);
111 NIR_PASS_V(nir, nir_opt_dce);
112
113 NIR_PASS_V(nir, nir_split_var_copies);
114 NIR_PASS_V(nir, nir_lower_var_copies);
115
116 NIR_PASS_V(nir, nir_lower_mediump_vars, nir_var_function_temp | nir_var_shader_temp | nir_var_mem_shared);
117 NIR_PASS_V(nir, nir_opt_copy_prop_vars);
118 NIR_PASS_V(nir, nir_opt_combine_stores, nir_var_all);
119
120 NIR_PASS_V(nir, nir_lower_system_values);
121 NIR_PASS_V(nir, nir_lower_is_helper_invocation);
122
123 if (key->lower_view_index_to_device_index)
124 NIR_PASS_V(nir, nir_lower_view_index_to_device_index);
125
126 struct ir3_shader_nir_options options;
127 init_ir3_nir_options(&options, key);
128 ir3_optimize_loop(dev->compiler, &options, nir);
129
130 NIR_PASS_V(nir, nir_opt_conditional_discard);
131
132 return nir;
133 }
134
135 static void
136 lower_load_push_constant(struct tu_device *dev,
137 nir_builder *b,
138 nir_intrinsic_instr *instr,
139 struct tu_shader *shader,
140 const struct tu_pipeline_layout *layout,
141 uint32_t push_consts_offset_vec4)
142 {
143 uint32_t base = nir_intrinsic_base(instr);
144 assert(base % 4 == 0);
145
146 if (tu6_shared_constants_enable(layout, dev->compiler)) {
147 /* All stages share the same range. We could potentially add
148 * push_constant_offset to layout and apply it, but this is good for
149 * now.
150 */
151 base += dev->compiler->shared_consts_base_offset * 4;
152 } else {
153 assert(base >= shader->const_state.push_consts.lo_dwords);
154 base -= shader->const_state.push_consts.lo_dwords;
155 base += push_consts_offset_vec4 * 4;
156 }
157
158 nir_def *load =
159 nir_load_const_ir3(b, instr->num_components, instr->def.bit_size,
160 nir_ushr_imm(b, instr->src[0].ssa, 2), .base = base);
161
162 nir_def_replace(&instr->def, load);
163 }
164
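/* Descriptor "pointers" are lowered to a vec3 of
 * (descriptor set index, descriptor index within the set, log2(array stride)),
 * where indices and strides are counted in descriptors of
 * A6XX_TEX_CONST_DWORDS dwords each. Keeping the shift as the third component
 * lets vulkan_resource_reindex scale an array index without having to look at
 * the binding layout again.
 */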
165 static void
166 lower_vulkan_resource_index(struct tu_device *dev, nir_builder *b,
167 nir_intrinsic_instr *instr,
168 struct tu_shader *shader,
169 const struct tu_pipeline_layout *layout)
170 {
171 struct ir3_compiler *compiler = dev->compiler;
172 nir_def *vulkan_idx = instr->src[0].ssa;
173
174 unsigned set = nir_intrinsic_desc_set(instr);
175 unsigned binding = nir_intrinsic_binding(instr);
176 struct tu_descriptor_set_layout *set_layout = layout->set[set].layout;
177 struct tu_descriptor_set_binding_layout *binding_layout =
178 &set_layout->binding[binding];
179 nir_def *base;
180
181 if (binding_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
182 return;
183
184 shader->active_desc_sets |= 1u << set;
185
186 if (vk_descriptor_type_is_dynamic(binding_layout->type)) {
187 int offset = 0;
188 for (unsigned i = 0; i < set; i++) {
189 if (shader->dynamic_descriptor_sizes[i] >= 0) {
190 offset += shader->dynamic_descriptor_sizes[i];
191 } else {
192 offset = -1;
193 break;
194 }
195 }
196
197 if (offset < 0) {
198 /* With independent sets, we don't know
199 * layout->set[set].dynamic_offset_start until after link time which
200 * with fast linking means after the shader is compiled. We have to
201 * get it from the const file instead.
202 */
203 base = nir_imm_int(b, binding_layout->dynamic_offset_offset / (4 * A6XX_TEX_CONST_DWORDS));
204 nir_def *dynamic_offset_start;
205 if (compiler->load_shader_consts_via_preamble) {
206 dynamic_offset_start =
207 ir3_load_driver_ubo(b, 1, &shader->const_state.dynamic_offsets_ubo, set);
208 } else {
209 dynamic_offset_start = nir_load_const_ir3(
210 b, 1, 32, nir_imm_int(b, 0),
211 .base = shader->const_state.dynamic_offset_loc + set);
212 }
213 base = nir_iadd(b, base, dynamic_offset_start);
214 } else {
215 base = nir_imm_int(b, (offset +
216 binding_layout->dynamic_offset_offset) / (4 * A6XX_TEX_CONST_DWORDS));
217 }
218 assert(dev->physical_device->reserved_set_idx >= 0);
219 set = dev->physical_device->reserved_set_idx;
220 } else
221 base = nir_imm_int(b, binding_layout->offset / (4 * A6XX_TEX_CONST_DWORDS));
222
223 unsigned stride = binding_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
224 assert(util_is_power_of_two_nonzero(stride));
225 nir_def *shift = nir_imm_int(b, util_logbase2(stride));
226
227 nir_def *def = nir_vec3(b, nir_imm_int(b, set),
228 nir_iadd(b, base,
229 nir_ishl(b, vulkan_idx, shift)),
230 shift);
231
232 nir_def_replace(&instr->def, def);
233 }
234
235 static void
236 lower_vulkan_resource_reindex(nir_builder *b, nir_intrinsic_instr *instr)
237 {
238 nir_def *old_index = instr->src[0].ssa;
239 nir_def *delta = instr->src[1].ssa;
240 nir_def *shift = nir_channel(b, old_index, 2);
241
242 nir_def *new_index =
243 nir_vec3(b, nir_channel(b, old_index, 0),
244 nir_iadd(b, nir_channel(b, old_index, 1),
245 nir_ishl(b, delta, shift)),
246 shift);
247
248 nir_def_replace(&instr->def, new_index);
249 }
250
251 static void
252 lower_load_vulkan_descriptor(nir_builder *b, nir_intrinsic_instr *intrin)
253 {
254 nir_def *old_index = intrin->src[0].ssa;
255 /* Loading the descriptor happens as part of the load/store instruction so
256 * this is a no-op. We just need to turn the shift into an offset of 0.
257 */
258 nir_def *new_index =
259 nir_vec3(b, nir_channel(b, old_index, 0),
260 nir_channel(b, old_index, 1),
261 nir_imm_int(b, 0));
262 nir_def_replace(&intrin->def, new_index);
263 }
264
265 static bool
266 lower_ssbo_ubo_intrinsic(struct tu_device *dev,
267 nir_builder *b, nir_intrinsic_instr *intrin)
268 {
269 const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic];
270
271 /* The bindless base is part of the instruction, which means that part of
272 * the "pointer" has to be constant. We solve this in the same way the blob
273 * does, by generating a bunch of if-statements. In the usual case where
274 * the descriptor set is constant we can skip that, though.
275 */
276
277 unsigned buffer_src;
278 if (intrin->intrinsic == nir_intrinsic_store_ssbo) {
279 /* This has the value first */
280 buffer_src = 1;
281 } else {
282 buffer_src = 0;
283 }
284
285 /* Don't lower non-bindless UBO loads of driver params */
286 if (intrin->src[buffer_src].ssa->num_components == 1)
287 return false;
288
289 nir_scalar scalar_idx = nir_scalar_resolved(intrin->src[buffer_src].ssa, 0);
290 nir_def *descriptor_idx = nir_channel(b, intrin->src[buffer_src].ssa, 1);
291
292 if (intrin->intrinsic == nir_intrinsic_load_ubo &&
293 dev->instance->allow_oob_indirect_ubo_loads) {
294 nir_scalar offset = nir_scalar_resolved(intrin->src[1].ssa, 0);
295 if (!nir_scalar_is_const(offset)) {
296 nir_intrinsic_set_range(intrin, ~0);
297 }
298 }
299
300 /* Descriptor index has to be adjusted in the following cases:
301 * - isam loads, when the 16-bit descriptor cannot also be used for 32-bit
302 * loads -- next-index descriptor will be able to do that;
303 * - 8-bit SSBO loads and stores -- next-index descriptor is dedicated to
304 * storage accesses of that size.
305 */
306 if ((dev->physical_device->info->a6xx.storage_16bit &&
307 !dev->physical_device->info->a6xx.has_isam_v &&
308 intrin->intrinsic == nir_intrinsic_load_ssbo &&
309 (nir_intrinsic_access(intrin) & ACCESS_CAN_REORDER) &&
310 intrin->def.bit_size > 16) ||
311 (dev->physical_device->info->a7xx.storage_8bit &&
312 ((intrin->intrinsic == nir_intrinsic_load_ssbo && intrin->def.bit_size == 8) ||
313 (intrin->intrinsic == nir_intrinsic_store_ssbo && intrin->src[0].ssa->bit_size == 8)))) {
314 descriptor_idx = nir_iadd_imm(b, descriptor_idx, 1);
315 }
316
317 nir_def *results[MAX_SETS] = { NULL };
318
319 if (nir_scalar_is_const(scalar_idx)) {
320 nir_def *bindless =
321 nir_bindless_resource_ir3(b, 32, descriptor_idx, .desc_set = nir_scalar_as_uint(scalar_idx));
322 nir_src_rewrite(&intrin->src[buffer_src], bindless);
323 return true;
324 }
325
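   /* Non-constant descriptor set index: emit an if-ladder over every possible
    * set, clone the intrinsic with the matching bindless handle in each arm,
    * and phi the per-arm results back together below.
    */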
326 nir_def *base_idx = nir_channel(b, scalar_idx.def, scalar_idx.comp);
327 for (unsigned i = 0; i < dev->physical_device->info->a6xx.max_sets; i++) {
328 /* if (base_idx == i) { ... */
329 nir_if *nif = nir_push_if(b, nir_ieq_imm(b, base_idx, i));
330
331 nir_def *bindless =
332 nir_bindless_resource_ir3(b, 32, descriptor_idx, .desc_set = i);
333
334 nir_intrinsic_instr *copy =
335 nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
336
337 copy->num_components = intrin->num_components;
338
339 for (unsigned src = 0; src < info->num_srcs; src++) {
340 if (src == buffer_src)
341 copy->src[src] = nir_src_for_ssa(bindless);
342 else
343 copy->src[src] = nir_src_for_ssa(intrin->src[src].ssa);
344 }
345
346 for (unsigned idx = 0; idx < info->num_indices; idx++) {
347 copy->const_index[idx] = intrin->const_index[idx];
348 }
349
350 if (info->has_dest) {
351 nir_def_init(&copy->instr, &copy->def,
352 intrin->def.num_components,
353 intrin->def.bit_size);
354 results[i] = &copy->def;
355 }
356
357 nir_builder_instr_insert(b, &copy->instr);
358
359 /* } else { ... */
360 nir_push_else(b, nif);
361 }
362
363 nir_def *result =
364 nir_undef(b, intrin->def.num_components, intrin->def.bit_size);
365 for (int i = dev->physical_device->info->a6xx.max_sets - 1; i >= 0; i--) {
366 nir_pop_if(b, NULL);
367 if (info->has_dest)
368 result = nir_if_phi(b, results[i], result);
369 }
370
371 if (info->has_dest)
372 nir_def_rewrite_uses(&intrin->def, result);
373 nir_instr_remove(&intrin->instr);
374 return true;
375 }
376
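/* Build the descriptor handle for a texture/image/sampler deref. Normal
 * descriptors get a nir_bindless_resource_ir3 handle; input attachments that
 * take the non-bindless path return a plain texture index instead, which
 * lower_tex() detects by checking whether the value was produced by an
 * intrinsic.
 */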
377 static nir_def *
378 build_bindless(struct tu_device *dev, nir_builder *b,
379 nir_deref_instr *deref, bool is_sampler,
380 struct tu_shader *shader,
381 const struct tu_pipeline_layout *layout,
382 uint32_t read_only_input_attachments,
383 bool dynamic_renderpass)
384 {
385 nir_variable *var = nir_deref_instr_get_variable(deref);
386
387 unsigned set = var->data.descriptor_set;
388 unsigned binding = var->data.binding;
389 const struct tu_descriptor_set_binding_layout *bind_layout =
390 &layout->set[set].layout->binding[binding];
391
392 /* Input attachments use the non-bindless workaround. */
393 if (bind_layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT &&
394 (!dynamic_renderpass ||
395 (var->data.index == NIR_VARIABLE_NO_INDEX ?
396 !(read_only_input_attachments & 0x1) :
397 !(read_only_input_attachments & (1u << (var->data.index + 1))))) &&
398 !TU_DEBUG(DYNAMIC)) {
399 const struct glsl_type *glsl_type = glsl_without_array(var->type);
400 uint32_t idx;
401
402 /* With dynamic renderpasses, we reserve the first two attachments for
403 * input attachments without an InputAttachmentIndex, which must be for
404 * depth/stencil if they are not read-only, and shift over the rest of
405 * the indices.
406 */
407 if (var->data.index == ~0u) {
408 assert(dynamic_renderpass);
409 idx = 0;
410 } else if (dynamic_renderpass) {
411 idx = (var->data.index + 1) * 2;
412 } else {
413 idx = var->data.index * 2;
414 }
415
416 /* Record which input attachments are used for tracking feedback loops */
417 if (dynamic_renderpass)
418 shader->fs.dynamic_input_attachments_used |= (1u << (idx / 2));
419
420 BITSET_SET_RANGE_INSIDE_WORD(b->shader->info.textures_used, idx, (idx + bind_layout->array_size * 2) - 1);
421
422 /* D24S8 workaround: stencil of D24S8 will be sampled as uint */
423 if (glsl_get_sampler_result_type(glsl_type) == GLSL_TYPE_UINT)
424 idx += 1;
425
426 if (deref->deref_type == nir_deref_type_var)
427 return nir_imm_int(b, idx);
428
429 nir_def *arr_index = deref->arr.index.ssa;
430 return nir_iadd_imm(b, nir_imul_imm(b, arr_index, 2), idx);
431 }
432
433 shader->active_desc_sets |= 1u << set;
434
435 nir_def *desc_offset;
436 unsigned descriptor_stride;
437 unsigned offset = 0;
438 /* Samplers come second in combined image/sampler descriptors, see
439 * write_combined_image_sampler_descriptor().
440 */
441 if (is_sampler && bind_layout->type ==
442 VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
443 offset = 1;
444 }
445 desc_offset =
446 nir_imm_int(b, (bind_layout->offset / (4 * A6XX_TEX_CONST_DWORDS)) +
447 offset);
448 descriptor_stride = bind_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
449
450 if (deref->deref_type != nir_deref_type_var) {
451 assert(deref->deref_type == nir_deref_type_array);
452
453 nir_def *arr_index = deref->arr.index.ssa;
454 desc_offset = nir_iadd(b, desc_offset,
455 nir_imul_imm(b, arr_index, descriptor_stride));
456 }
457
458 return nir_bindless_resource_ir3(b, 32, desc_offset, .desc_set = set);
459 }
460
461 static void
462 lower_image_deref(struct tu_device *dev, nir_builder *b,
463 nir_intrinsic_instr *instr, struct tu_shader *shader,
464 const struct tu_pipeline_layout *layout)
465 {
466 nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
467 nir_def *bindless = build_bindless(dev, b, deref, false, shader, layout, 0, false);
468 nir_rewrite_image_intrinsic(instr, bindless, true);
469 }
470
471 static bool
472 lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
473 struct tu_device *dev,
474 struct tu_shader *shader,
475 const struct tu_pipeline_layout *layout,
476 struct ir3_const_allocations *const_allocs)
477 {
478 switch (instr->intrinsic) {
479 case nir_intrinsic_load_push_constant:
480 lower_load_push_constant(
481 dev, b, instr, shader, layout,
482 const_allocs->consts[IR3_CONST_ALLOC_PUSH_CONSTS].offset_vec4);
483 return true;
484
485 case nir_intrinsic_load_vulkan_descriptor:
486 lower_load_vulkan_descriptor(b, instr);
487 return true;
488
489 case nir_intrinsic_vulkan_resource_index:
490 lower_vulkan_resource_index(dev, b, instr, shader, layout);
491 return true;
492 case nir_intrinsic_vulkan_resource_reindex:
493 lower_vulkan_resource_reindex(b, instr);
494 return true;
495
496 case nir_intrinsic_load_ubo:
497 case nir_intrinsic_load_ssbo:
498 case nir_intrinsic_store_ssbo:
499 case nir_intrinsic_ssbo_atomic:
500 case nir_intrinsic_ssbo_atomic_swap:
501 case nir_intrinsic_get_ssbo_size:
502 return lower_ssbo_ubo_intrinsic(dev, b, instr);
503
504 case nir_intrinsic_image_deref_load:
505 case nir_intrinsic_image_deref_store:
506 case nir_intrinsic_image_deref_atomic:
507 case nir_intrinsic_image_deref_atomic_swap:
508 case nir_intrinsic_image_deref_size:
509 case nir_intrinsic_image_deref_samples:
510 lower_image_deref(dev, b, instr, shader, layout);
511 return true;
512
513 case nir_intrinsic_load_frag_size_ir3:
514 case nir_intrinsic_load_frag_offset_ir3: {
515 if (!dev->compiler->load_shader_consts_via_preamble)
516 return false;
517
518 unsigned param =
519 instr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
520 IR3_DP_FS(frag_size) : IR3_DP_FS(frag_offset);
521
522 unsigned offset = param - IR3_DP_FS_DYNAMIC;
523
524 nir_def *view = instr->src[0].ssa;
525 nir_def *result =
526 ir3_load_driver_ubo_indirect(b, 2, &shader->const_state.fdm_ubo,
527 offset, view, nir_intrinsic_range(instr));
528
529 nir_def_replace(&instr->def, result);
530 return true;
531 }
532 case nir_intrinsic_load_frag_invocation_count: {
533 if (!dev->compiler->load_shader_consts_via_preamble)
534 return false;
535
536 nir_def *result =
537 ir3_load_driver_ubo(b, 1, &shader->const_state.fdm_ubo,
538 IR3_DP_FS(frag_invocation_count) -
539 IR3_DP_FS_DYNAMIC);
540
541 nir_def_replace(&instr->def, result);
542 return true;
543 }
544
545 default:
546 return false;
547 }
548 }
549
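/* If the sampled texture has an immutable YCbCr conversion attached, append
 * the YCbCr-to-RGB conversion math after the texture instruction; the texture
 * fetch itself still returns the raw sampled values.
 */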
550 static void
551 lower_tex_ycbcr(const struct tu_pipeline_layout *layout,
552 nir_builder *builder,
553 nir_tex_instr *tex)
554 {
555 int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
556 assert(deref_src_idx >= 0);
557 nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src);
558
559 nir_variable *var = nir_deref_instr_get_variable(deref);
560 const struct tu_descriptor_set_layout *set_layout =
561 layout->set[var->data.descriptor_set].layout;
562 const struct tu_descriptor_set_binding_layout *binding =
563 &set_layout->binding[var->data.binding];
564 const struct vk_ycbcr_conversion_state *ycbcr_samplers =
565 tu_immutable_ycbcr_samplers(set_layout, binding);
566
567 if (!ycbcr_samplers)
568 return;
569
570 /* For the following instructions, we don't apply any change */
571 if (tex->op == nir_texop_txs ||
572 tex->op == nir_texop_query_levels ||
573 tex->op == nir_texop_lod)
574 return;
575
576 assert(tex->texture_index == 0);
577 unsigned array_index = 0;
578 if (deref->deref_type != nir_deref_type_var) {
579 assert(deref->deref_type == nir_deref_type_array);
580 if (!nir_src_is_const(deref->arr.index))
581 return;
582 array_index = nir_src_as_uint(deref->arr.index);
583 array_index = MIN2(array_index, binding->array_size - 1);
584 }
585 const struct vk_ycbcr_conversion_state *ycbcr_sampler = ycbcr_samplers + array_index;
586
587 if (ycbcr_sampler->ycbcr_model == VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY)
588 return;
589
590 /* Skip if not actually a YCbCr format. CtsGraphics, for example, tries to create
591 * YcbcrConversions for RGB formats.
592 */
593 if (!vk_format_get_ycbcr_info(ycbcr_sampler->format))
594 return;
595
596 builder->cursor = nir_after_instr(&tex->instr);
597
598 uint8_t bits = vk_format_get_bpc(ycbcr_sampler->format);
599 uint32_t bpcs[3] = {bits, bits, bits}; /* We only support uniform formats */
600 nir_def *result = nir_convert_ycbcr_to_rgb(builder,
601 ycbcr_sampler->ycbcr_model,
602 ycbcr_sampler->ycbcr_range,
603 &tex->def,
604 bpcs);
605 nir_def_rewrite_uses_after(&tex->def, result,
606 result->parent_instr);
607
608 builder->cursor = nir_before_instr(&tex->instr);
609 }
610
611 static bool
612 lower_tex(nir_builder *b, nir_tex_instr *tex, struct tu_device *dev,
613 struct tu_shader *shader, const struct tu_pipeline_layout *layout,
614 uint32_t read_only_input_attachments, bool dynamic_renderpass)
615 {
616 lower_tex_ycbcr(layout, b, tex);
617
618 int sampler_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
619 if (sampler_src_idx >= 0) {
620 nir_deref_instr *deref = nir_src_as_deref(tex->src[sampler_src_idx].src);
621 nir_def *bindless = build_bindless(dev, b, deref, true, shader, layout,
622 read_only_input_attachments,
623 dynamic_renderpass);
624 nir_src_rewrite(&tex->src[sampler_src_idx].src, bindless);
625 tex->src[sampler_src_idx].src_type = nir_tex_src_sampler_handle;
626 }
627
628 int tex_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
629 if (tex_src_idx >= 0) {
630 nir_deref_instr *deref = nir_src_as_deref(tex->src[tex_src_idx].src);
631 nir_def *bindless = build_bindless(dev, b, deref, false, shader, layout,
632 read_only_input_attachments,
633 dynamic_renderpass);
634 nir_src_rewrite(&tex->src[tex_src_idx].src, bindless);
635 tex->src[tex_src_idx].src_type = nir_tex_src_texture_handle;
636
637 /* for the input attachment case: */
638 if (bindless->parent_instr->type != nir_instr_type_intrinsic)
639 tex->src[tex_src_idx].src_type = nir_tex_src_texture_offset;
640 }
641
642 return true;
643 }
644
645 struct lower_instr_params {
646 struct tu_device *dev;
647 struct tu_shader *shader;
648 const struct tu_pipeline_layout *layout;
649 uint32_t read_only_input_attachments;
650 bool dynamic_renderpass;
651 struct ir3_const_allocations *const_allocs;
652 };
653
654 static bool
655 lower_instr(nir_builder *b, nir_instr *instr, void *cb_data)
656 {
657 struct lower_instr_params *params = (struct lower_instr_params *) cb_data;
658 b->cursor = nir_before_instr(instr);
659 switch (instr->type) {
660 case nir_instr_type_tex:
661 return lower_tex(b, nir_instr_as_tex(instr), params->dev, params->shader, params->layout,
662 params->read_only_input_attachments,
663 params->dynamic_renderpass);
664 case nir_instr_type_intrinsic:
665 return lower_intrinsic(b, nir_instr_as_intrinsic(instr), params->dev,
666 params->shader, params->layout,
667 params->const_allocs);
668 default:
669 return false;
670 }
671 }
672
673 /* Since we always push inline uniforms into constant memory, lower loads of
674 * them to load_uniform which turns into constant memory loads.
675 */
676 static bool
677 lower_inline_ubo(nir_builder *b, nir_intrinsic_instr *intrin, void *cb_data)
678 {
679 if (intrin->intrinsic != nir_intrinsic_load_ubo)
680 return false;
681
682 struct lower_instr_params *params = (struct lower_instr_params *) cb_data;
683 struct tu_shader *shader = params->shader;
684 const struct tu_pipeline_layout *layout = params->layout;
685
686 nir_binding binding = nir_chase_binding(intrin->src[0]);
687
688 if (!binding.success)
689 return false;
690
691 struct tu_descriptor_set_layout *set_layout = layout->set[binding.desc_set].layout;
692 struct tu_descriptor_set_binding_layout *binding_layout =
693 &set_layout->binding[binding.binding];
694
695 if (binding_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
696 return false;
697
698 /* lookup the const offset of the inline UBO */
699 struct tu_const_state *const_state = &shader->const_state;
700
701 unsigned base = UINT_MAX;
702 unsigned range;
703 bool use_load = false;
704 bool use_ldg_k =
705 params->dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
706
707 for (unsigned i = 0; i < const_state->num_inline_ubos; i++) {
708 if (const_state->ubos[i].base == binding.desc_set &&
709 const_state->ubos[i].offset == binding_layout->offset) {
710 range = const_state->ubos[i].size_vec4 * 4;
711 if (use_ldg_k) {
712 base = i * 2;
713 } else {
714 use_load = const_state->ubos[i].push_address;
715 base = const_state->ubos[i].const_offset_vec4 * 4;
716 }
717 break;
718 }
719 }
720
721 if (base == UINT_MAX) {
722 /* Assume we're loading out-of-bounds from a 0-sized inline uniform
723 * filtered out below.
724 */
725 nir_def_rewrite_uses(&intrin->def,
726 nir_undef(b, intrin->num_components,
727 intrin->def.bit_size));
728 return true;
729 }
730
731 nir_def *offset = intrin->src[1].ssa;
732
733 b->cursor = nir_before_instr(&intrin->instr);
734 nir_def *val;
735
736 if (use_load || use_ldg_k) {
737 nir_def *base_addr;
738 if (use_ldg_k) {
739 base_addr = ir3_load_driver_ubo(b, 2,
740 &params->shader->const_state.inline_uniforms_ubo,
741 base);
742 } else {
743 base_addr =
744 nir_load_const_ir3(b, 2, 32, nir_imm_int(b, 0), .base = base);
745 }
746 val = nir_load_global_ir3(b, intrin->num_components,
747 intrin->def.bit_size,
748 base_addr, nir_ishr_imm(b, offset, 2),
749 .access =
750 (enum gl_access_qualifier)(
751 (enum gl_access_qualifier)(ACCESS_NON_WRITEABLE | ACCESS_CAN_REORDER) |
752 ACCESS_CAN_SPECULATE),
753 .align_mul = 16,
754 .align_offset = 0,
755 .range_base = 0,
756 .range = range);
757 } else {
758 val =
759 nir_load_const_ir3(b, intrin->num_components, intrin->def.bit_size,
760 nir_ishr_imm(b, offset, 2), .base = base);
761 }
762
763 nir_def_replace(&intrin->def, val);
764 return true;
765 }
766
767 /* Figure out the range of push constants that we're actually going to push to
768 * the shader, and tell the backend to reserve this range when pushing UBO
769 * constants.
770 */
771
772 static void
773 gather_push_constants(nir_shader *shader, struct tu_shader *tu_shader)
774 {
775 uint32_t min = UINT32_MAX, max = 0;
776 nir_foreach_function_impl(impl, shader) {
777 nir_foreach_block(block, impl) {
778 nir_foreach_instr_safe(instr, block) {
779 if (instr->type != nir_instr_type_intrinsic)
780 continue;
781
782 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
783 if (intrin->intrinsic != nir_intrinsic_load_push_constant)
784 continue;
785
786 uint32_t base = nir_intrinsic_base(intrin);
787 uint32_t range = nir_intrinsic_range(intrin);
788 min = MIN2(min, base);
789 max = MAX2(max, base + range);
790 break;
791 }
792 }
793 }
794
795 if (min >= max) {
796 tu_shader->const_state.push_consts = (struct tu_push_constant_range) {};
797 return;
798 }
799
800 /* CP_LOAD_STATE OFFSET and NUM_UNIT for SHARED_CONSTS are in units of
801 * dwords while loading regular consts is in units of vec4's.
802 * So we unify the unit here as dwords for tu_push_constant_range, then
803 * we should consider correct unit when emitting.
804 *
805 * Note there's an alignment requirement of 16 dwords on OFFSET. Expand
806 * the range and change units accordingly.
807 */
808 tu_shader->const_state.push_consts.lo_dwords += (min / 4) / 4 * 4;
809 tu_shader->const_state.push_consts.dwords =
810 align(max, 16) / 4 - tu_shader->const_state.push_consts.lo_dwords;
811 }
812
813 static bool
814 shader_uses_push_consts(nir_shader *shader)
815 {
816 nir_foreach_function_impl (impl, shader) {
817 nir_foreach_block (block, impl) {
818 nir_foreach_instr_safe (instr, block) {
819 if (instr->type != nir_instr_type_intrinsic)
820 continue;
821
822 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
823 if (intrin->intrinsic == nir_intrinsic_load_push_constant)
824 return true;
825 }
826 }
827 }
828 return false;
829 }
830
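/* Main descriptor and push-constant lowering pass. It first reserves const
 * file space in const_allocs (VS driver params, push constants, dynamic
 * descriptor offsets, inline uniform blocks) and then rewrites the
 * descriptor-related intrinsics, inline UBO loads, and texture derefs via
 * lower_inline_ubo() and lower_instr().
 */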
831 static bool
832 tu_lower_io(nir_shader *shader, struct tu_device *dev,
833 struct tu_shader *tu_shader,
834 const struct tu_pipeline_layout *layout,
835 uint32_t read_only_input_attachments,
836 bool dynamic_renderpass,
837 struct ir3_const_allocations *const_allocs)
838 {
839 /* Allocate driver params as early as possible as a workaround for the
840 * following case:
841 * - CP_DRAW_INDIRECT_MULTI_1_DST_OFF apparently tries to upload consts
842 * even when there are 0 instances.
843 * - With zero instances, the draw state for VS constlen is not applied.
844 * - constlen therefore uses a stale value, and if
845 * CP_DRAW_INDIRECT_MULTI_1_DST_OFF is higher than 0x3f the GPU hangs.
846 *
847 * To not rely on undefined behaviour, we will always allocate enough space
848 * to upload driver params.
849 */
850 if (shader->info.stage == MESA_SHADER_VERTEX) {
851 uint32_t num_driver_params =
852 ir3_nir_scan_driver_consts(dev->compiler, shader, nullptr);
853 ir3_alloc_driver_params(const_allocs, &num_driver_params, dev->compiler,
854 shader->info.stage);
855 }
856
857 struct tu_const_state *const_state = &tu_shader->const_state;
858 const_state->push_consts = (struct tu_push_constant_range) {
859 .lo_dwords = 0,
860 .dwords = layout->push_constant_size / 4,
861 .type = tu_push_consts_type(layout, dev->compiler),
862 };
863
864 if (const_state->push_consts.type == IR3_PUSH_CONSTS_PER_STAGE) {
865 gather_push_constants(shader, tu_shader);
866 } else if (const_state->push_consts.type ==
867 IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
868 /* Disable pushing constants for this stage if none were loaded in the
869 * shader. If all stages don't load their declared push constants, as
870 * is often the case under zink, then we could additionally skip
871 * emitting REG_A7XX_HLSQ_SHARED_CONSTS_IMM entirely.
872 */
873 if (!shader_uses_push_consts(shader))
874 const_state->push_consts = (struct tu_push_constant_range) {};
875 }
876
877 if (const_state->push_consts.type != IR3_PUSH_CONSTS_SHARED) {
878 uint32_t offset_align_vec4 = 1;
879 if (const_state->push_consts.type == IR3_PUSH_CONSTS_PER_STAGE)
880 offset_align_vec4 = dev->compiler->const_upload_unit;
881
882 unsigned push_consts_vec4 =
883 align(DIV_ROUND_UP(const_state->push_consts.dwords, 4),
884 dev->compiler->const_upload_unit);
885
886 ir3_const_alloc(const_allocs, IR3_CONST_ALLOC_PUSH_CONSTS,
887 push_consts_vec4, offset_align_vec4);
888 }
889
890 bool unknown_dynamic_size = false;
891 bool unknown_dynamic_offset = false;
892 for (unsigned i = 0; i < layout->num_sets; i++) {
893 if (tu_shader->dynamic_descriptor_sizes[i] == -1) {
894 unknown_dynamic_size = true;
895 } else if (unknown_dynamic_size &&
896 tu_shader->dynamic_descriptor_sizes[i] > 0) {
897 /* If there is an unknown size followed by a known size, then we may
898 * need to dynamically determine the offset when linking.
899 */
900 unknown_dynamic_offset = true;
901 }
902 }
903
904 if (unknown_dynamic_offset) {
905 const_state->dynamic_offset_loc =
906 const_allocs->max_const_offset_vec4 * 4;
907 assert(dev->physical_device->reserved_set_idx >= 0);
908 ir3_const_alloc(
909 const_allocs, IR3_CONST_ALLOC_DYN_DESCRIPTOR_OFFSET,
910 DIV_ROUND_UP(dev->physical_device->reserved_set_idx, 4), 1);
911 } else {
912 const_state->dynamic_offset_loc = UINT32_MAX;
913 }
914
915 /* Reserve space for inline uniforms, so we can always load them from
916 * constants and not setup a UBO descriptor for them.
917 */
918 size_t ldgk_consts = 0;
919 bool use_ldg_k =
920 dev->physical_device->info->a7xx.load_inline_uniforms_via_preamble_ldgk;
921 for (unsigned set = 0; set < layout->num_sets; set++) {
922 const struct tu_descriptor_set_layout *desc_layout =
923 layout->set[set].layout;
924
925 if (!desc_layout || !desc_layout->has_inline_uniforms)
926 continue;
927
928 for (unsigned b = 0; b < desc_layout->binding_count; b++) {
929 const struct tu_descriptor_set_binding_layout *binding =
930 &desc_layout->binding[b];
931
932 if (binding->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
933 continue;
934 if (!(binding->shader_stages &
935 mesa_to_vk_shader_stage(shader->info.stage)))
936 continue;
937
938 /* If we don't know the size at compile time due to a variable
939 * descriptor count, then with descriptor buffers we cannot know
940 * how much space the real inline uniform has. In this case we fall
941 * back to pushing the address and using ldg, which is slower than
942 * setting up a descriptor but setting up our own descriptor with
943 * descriptor_buffer is also painful and has to be done on the GPU
944 * and doesn't avoid the UBO getting pushed anyway and faulting if an
945 * out-of-bounds access is hidden behind an if and not dynamically
946 * executed. Given the small max size, there shouldn't be much reason
947 * to use variable size anyway.
948 */
949 bool push_address = !use_ldg_k && desc_layout->has_variable_descriptors &&
950 b == desc_layout->binding_count - 1;
951
952 if (push_address) {
953 perf_debug(dev,
954 "falling back to ldg for variable-sized inline "
955 "uniform block");
956 }
957
958 assert(const_state->num_inline_ubos < ARRAY_SIZE(const_state->ubos));
959 unsigned size_vec4 = push_address ? 1 : DIV_ROUND_UP(binding->size, 16);
960 const_state->ubos[const_state->num_inline_ubos++] =
961 (struct tu_inline_ubo) {
962 .base = set,
963 .offset = binding->offset,
964 .push_address = push_address,
965 .const_offset_vec4 =
966 const_allocs->max_const_offset_vec4 + ldgk_consts,
967 .size_vec4 = size_vec4,
968 };
969
970 if (!use_ldg_k) {
971 ldgk_consts += align(size_vec4, dev->compiler->const_upload_unit);
972 }
973 }
974 }
975
976 ir3_const_alloc(const_allocs, IR3_CONST_ALLOC_INLINE_UNIFORM_ADDRS, ldgk_consts, 1);
977
978 struct lower_instr_params params = {
979 .dev = dev,
980 .shader = tu_shader,
981 .layout = layout,
982 .read_only_input_attachments = read_only_input_attachments,
983 .dynamic_renderpass = dynamic_renderpass,
984 .const_allocs = const_allocs,
985 };
986
987 bool progress = false;
988 if (const_state->num_inline_ubos) {
989 progress |= nir_shader_intrinsics_pass(shader, lower_inline_ubo,
990 nir_metadata_none,
991 &params);
992 }
993
994 progress |= nir_shader_instructions_pass(shader,
995 lower_instr,
996 nir_metadata_none,
997 &params);
998
999 /* Remove now-unused variables so that when we gather the shader info later
1000 * they won't be counted.
1001 */
1002
1003 if (progress)
1004 nir_opt_dce(shader);
1005
1006 progress |=
1007 nir_remove_dead_variables(shader,
1008 nir_var_uniform | nir_var_mem_ubo | nir_var_mem_ssbo,
1009 NULL);
1010
1011 return progress;
1012 }
1013
1014 struct lower_fdm_options {
1015 unsigned num_views;
1016 bool adjust_fragcoord;
1017 bool multiview;
1018 };
1019
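/* Fragment density map (FDM) lowering: load_frag_size (and, when
 * adjust_fragcoord is set, load_frag_coord) is rewritten to the per-view
 * load_frag_size_ir3/load_frag_offset_ir3 intrinsics, and gl_FragCoord.xy is
 * recomputed from the unscaled coordinate as (unscaled - frag_offset) *
 * frag_size.
 */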
1020 static bool
1021 lower_fdm_filter(const nir_instr *instr, const void *data)
1022 {
1023 const struct lower_fdm_options *options =
1024 (const struct lower_fdm_options *)data;
1025
1026 if (instr->type != nir_instr_type_intrinsic)
1027 return false;
1028
1029 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1030 return intrin->intrinsic == nir_intrinsic_load_frag_size ||
1031 (intrin->intrinsic == nir_intrinsic_load_frag_coord &&
1032 options->adjust_fragcoord);
1033 }
1034
1035 static nir_def *
1036 lower_fdm_instr(struct nir_builder *b, nir_instr *instr, void *data)
1037 {
1038 const struct lower_fdm_options *options =
1039 (const struct lower_fdm_options *)data;
1040
1041 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1042
1043 nir_def *view;
1044 if (options->multiview) {
1045 nir_variable *view_var =
1046 nir_find_variable_with_location(b->shader, nir_var_shader_in,
1047 VARYING_SLOT_VIEW_INDEX);
1048
1049 if (view_var == NULL) {
1050 view_var = nir_variable_create(b->shader, nir_var_shader_in,
1051 glsl_int_type(), NULL);
1052 view_var->data.location = VARYING_SLOT_VIEW_INDEX;
1053 view_var->data.interpolation = INTERP_MODE_FLAT;
1054 view_var->data.driver_location = b->shader->num_inputs++;
1055 }
1056
1057 view = nir_load_var(b, view_var);
1058 } else {
1059 view = nir_imm_int(b, 0);
1060 }
1061
1062 nir_def *frag_size =
1063 nir_load_frag_size_ir3(b, view, .range = options->num_views);
1064
1065 if (intrin->intrinsic == nir_intrinsic_load_frag_coord) {
1066 nir_def *frag_offset =
1067 nir_load_frag_offset_ir3(b, view, .range = options->num_views);
1068 nir_def *unscaled_coord = nir_load_frag_coord_unscaled_ir3(b);
1069 nir_def *xy = nir_trim_vector(b, unscaled_coord, 2);
1070 xy = nir_fmul(b, nir_fsub(b, xy, frag_offset), nir_i2f32(b, frag_size));
1071 return nir_vec4(b,
1072 nir_channel(b, xy, 0),
1073 nir_channel(b, xy, 1),
1074 nir_channel(b, unscaled_coord, 2),
1075 nir_channel(b, unscaled_coord, 3));
1076 }
1077
1078 assert(intrin->intrinsic == nir_intrinsic_load_frag_size);
1079 return frag_size;
1080 }
1081
1082 static bool
1083 tu_nir_lower_fdm(nir_shader *shader, const struct lower_fdm_options *options)
1084 {
1085 return nir_shader_lower_instructions(shader, lower_fdm_filter,
1086 lower_fdm_instr, (void *)options);
1087 }
1088
1089 static void
1090 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
1091 {
1092 assert(glsl_type_is_vector_or_scalar(type));
1093
1094 unsigned comp_size =
1095 glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
1096 unsigned length = glsl_get_vector_elements(type);
1097 *size = comp_size * length;
1098 *align = comp_size;
1099 }
1100
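/* Translate NIR transform feedback info into the ir3_stream_output_info
 * consumed by the backend, remapping varying slots to driver locations and
 * converting strides/offsets from bytes to dwords.
 */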
1101 static void
1102 tu_gather_xfb_info(nir_shader *nir, struct ir3_stream_output_info *info)
1103 {
1104 nir_shader_gather_xfb_info(nir);
1105
1106 if (!nir->xfb_info)
1107 return;
1108
1109 nir_xfb_info *xfb = nir->xfb_info;
1110
1111 uint8_t output_map[VARYING_SLOT_TESS_MAX];
1112 memset(output_map, 0, sizeof(output_map));
1113
1114 nir_foreach_shader_out_variable(var, nir) {
1115 unsigned slots = nir_variable_count_slots(var, var->type);
1116 for (unsigned i = 0; i < slots; i++)
1117 output_map[var->data.location + i] = var->data.driver_location + i;
1118 }
1119
1120 assert(xfb->output_count <= IR3_MAX_SO_OUTPUTS);
1121 info->num_outputs = xfb->output_count;
1122
1123 for (int i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
1124 info->stride[i] = xfb->buffers[i].stride / 4;
1125 info->buffer_to_stream[i] = xfb->buffer_to_stream[i];
1126 }
1127
1128 info->streams_written = xfb->streams_written;
1129
1130 for (int i = 0; i < xfb->output_count; i++) {
1131 info->output[i].register_index = output_map[xfb->outputs[i].location];
1132 info->output[i].start_component = xfb->outputs[i].component_offset;
1133 info->output[i].num_components =
1134 util_bitcount(xfb->outputs[i].component_mask);
1135 info->output[i].output_buffer = xfb->outputs[i].buffer;
1136 info->output[i].dst_offset = xfb->outputs[i].offset / 4;
1137 info->output[i].stream = xfb->buffer_to_stream[xfb->outputs[i].buffer];
1138 }
1139 }
1140
1141 static uint32_t
1142 tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs)
1143 {
1144 const struct ir3_const_state *const_state = ir3_const_state(xs);
1145 uint32_t base = const_state->allocs.max_const_offset_vec4;
1146 int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4);
1147
1148 /* truncate size to avoid writing constants that shader
1149 * does not use:
1150 */
1151 size = MIN2(size + base, xs->constlen) - base;
1152
1153 return MAX2(size, 0) * 4;
1154 }
1155
1156 /* We allocate fixed-length substreams for shader state, however some
1157 * parts of the state may have unbound length. Their additional space
1158 * requirements should be calculated here.
1159 */
1160 static uint32_t
1161 tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs)
1162 {
1163 const struct ir3_const_state *const_state = ir3_const_state(xs);
1164
1165 uint32_t size = tu_xs_get_immediates_packet_size_dwords(xs);
1166
1167 /* Variable number of UBO upload ranges. */
1168 size += 4 * const_state->ubo_state.num_enabled;
1169
1170 /* Variable number of dwords for the primitive map */
1171 size += xs->input_size;
1172
1173 size += xs->constant_data_size / 4;
1174
1175 return size;
1176 }
1177
1178 static const struct xs_config {
1179 uint16_t reg_sp_xs_config;
1180 uint16_t reg_sp_xs_instrlen;
1181 uint16_t reg_sp_xs_first_exec_offset;
1182 uint16_t reg_sp_xs_pvt_mem_hw_stack_offset;
1183 uint16_t reg_sp_xs_vgpr_config;
1184 } xs_config[] = {
1185 [MESA_SHADER_VERTEX] = {
1186 REG_A6XX_SP_VS_CONFIG,
1187 REG_A6XX_SP_VS_INSTRLEN,
1188 REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET,
1189 REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET,
1190 REG_A7XX_SP_VS_VGPR_CONFIG,
1191 },
1192 [MESA_SHADER_TESS_CTRL] = {
1193 REG_A6XX_SP_HS_CONFIG,
1194 REG_A6XX_SP_HS_INSTRLEN,
1195 REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET,
1196 REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET,
1197 REG_A7XX_SP_HS_VGPR_CONFIG,
1198 },
1199 [MESA_SHADER_TESS_EVAL] = {
1200 REG_A6XX_SP_DS_CONFIG,
1201 REG_A6XX_SP_DS_INSTRLEN,
1202 REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET,
1203 REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET,
1204 REG_A7XX_SP_DS_VGPR_CONFIG,
1205 },
1206 [MESA_SHADER_GEOMETRY] = {
1207 REG_A6XX_SP_GS_CONFIG,
1208 REG_A6XX_SP_GS_INSTRLEN,
1209 REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET,
1210 REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET,
1211 REG_A7XX_SP_GS_VGPR_CONFIG,
1212 },
1213 [MESA_SHADER_FRAGMENT] = {
1214 REG_A6XX_SP_FS_CONFIG,
1215 REG_A6XX_SP_FS_INSTRLEN,
1216 REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET,
1217 REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET,
1218 REG_A7XX_SP_FS_VGPR_CONFIG,
1219 },
1220 [MESA_SHADER_COMPUTE] = {
1221 REG_A6XX_SP_CS_CONFIG,
1222 REG_A6XX_SP_CS_INSTRLEN,
1223 REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET,
1224 REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET,
1225 REG_A7XX_SP_CS_VGPR_CONFIG,
1226 },
1227 };
1228
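/* Emit the per-stage SP_xS state: CTRL_REG0, instruction length, program
 * binary and private memory layout, immediates, and the constant-data UBO.
 * On A6xx this also preloads the shader binary into the instruction cache.
 */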
1229 void
1230 tu6_emit_xs(struct tu_cs *cs,
1231 gl_shader_stage stage, /* xs->type, but xs may be NULL */
1232 const struct ir3_shader_variant *xs,
1233 const struct tu_pvtmem_config *pvtmem,
1234 uint64_t binary_iova)
1235 {
1236 const struct xs_config *cfg = &xs_config[stage];
1237
1238 if (!xs) {
1239 /* shader stage disabled */
1240 return;
1241 }
1242
1243 enum a6xx_threadsize thrsz =
1244 xs->info.double_threadsize ? THREAD128 : THREAD64;
1245 switch (stage) {
1246 case MESA_SHADER_VERTEX:
1247 tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0(
1248 .halfregfootprint = xs->info.max_half_reg + 1,
1249 .fullregfootprint = xs->info.max_reg + 1,
1250 .branchstack = ir3_shader_branchstack_hw(xs),
1251 .mergedregs = xs->mergedregs,
1252 .earlypreamble = xs->early_preamble,
1253 ));
1254 break;
1255 case MESA_SHADER_TESS_CTRL:
1256 tu_cs_emit_regs(cs, A6XX_SP_HS_CTRL_REG0(
1257 .halfregfootprint = xs->info.max_half_reg + 1,
1258 .fullregfootprint = xs->info.max_reg + 1,
1259 .branchstack = ir3_shader_branchstack_hw(xs),
1260 .earlypreamble = xs->early_preamble,
1261 ));
1262 break;
1263 case MESA_SHADER_TESS_EVAL:
1264 tu_cs_emit_regs(cs, A6XX_SP_DS_CTRL_REG0(
1265 .halfregfootprint = xs->info.max_half_reg + 1,
1266 .fullregfootprint = xs->info.max_reg + 1,
1267 .branchstack = ir3_shader_branchstack_hw(xs),
1268 .earlypreamble = xs->early_preamble,
1269 ));
1270 break;
1271 case MESA_SHADER_GEOMETRY:
1272 tu_cs_emit_regs(cs, A6XX_SP_GS_CTRL_REG0(
1273 .halfregfootprint = xs->info.max_half_reg + 1,
1274 .fullregfootprint = xs->info.max_reg + 1,
1275 .branchstack = ir3_shader_branchstack_hw(xs),
1276 .earlypreamble = xs->early_preamble,
1277 ));
1278 break;
1279 case MESA_SHADER_FRAGMENT:
1280 tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0(
1281 .halfregfootprint = xs->info.max_half_reg + 1,
1282 .fullregfootprint = xs->info.max_reg + 1,
1283 .branchstack = ir3_shader_branchstack_hw(xs),
1284 .threadsize = thrsz,
1285 .varying = xs->total_in != 0,
1286 .lodpixmask = xs->need_full_quad,
1287 /* unknown bit, seems unnecessary */
1288 .unk24 = true,
1289 .pixlodenable = xs->need_pixlod,
1290 .earlypreamble = xs->early_preamble,
1291 .mergedregs = xs->mergedregs,
1292 ));
1293 break;
1294 case MESA_SHADER_COMPUTE:
1295 thrsz = cs->device->physical_device->info->a6xx
1296 .supports_double_threadsize ? thrsz : THREAD128;
1297 tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0(
1298 .halfregfootprint = xs->info.max_half_reg + 1,
1299 .fullregfootprint = xs->info.max_reg + 1,
1300 .branchstack = ir3_shader_branchstack_hw(xs),
1301 .threadsize = thrsz,
1302 .earlypreamble = xs->early_preamble,
1303 .mergedregs = xs->mergedregs,
1304 ));
1305 break;
1306 default:
1307 unreachable("bad shader stage");
1308 }
1309
1310 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_instrlen, 1);
1311 tu_cs_emit(cs, xs->instrlen);
1312
1313 /* emit program binary & private memory layout
1314 * binary_iova should be aligned to 1 instrlen unit (128 bytes)
1315 */
1316
1317 assert((binary_iova & 0x7f) == 0);
1318 assert((pvtmem->iova & 0x1f) == 0);
1319
1320 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_first_exec_offset, 7);
1321 tu_cs_emit(cs, 0);
1322 tu_cs_emit_qw(cs, binary_iova);
1323 tu_cs_emit(cs,
1324 A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(pvtmem->per_fiber_size));
1325 tu_cs_emit_qw(cs, pvtmem->iova);
1326 tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(pvtmem->per_sp_size) |
1327 COND(pvtmem->per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT));
1328
1329 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1);
1330 tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size));
1331
1332 if (cs->device->physical_device->info->chip >= A7XX) {
1333 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vgpr_config, 1);
1334 tu_cs_emit(cs, 0);
1335 }
1336
1337 if (cs->device->physical_device->info->chip == A6XX) {
1338 uint32_t shader_preload_size =
1339 MIN2(xs->instrlen, cs->device->physical_device->info->a6xx.instr_cache_size);
1340
1341 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
1342 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
1343 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
1344 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1345 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1346 CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
1347 tu_cs_emit_qw(cs, binary_iova);
1348 }
1349
1350 /* emit immediates */
1351
1352 const struct ir3_const_state *const_state = ir3_const_state(xs);
1353 uint32_t base = const_state->allocs.max_const_offset_vec4;
1354 unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs);
1355
1356 if (immediate_size > 0) {
1357 assert(!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble);
1358 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size);
1359 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
1360 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1361 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1362 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1363 CP_LOAD_STATE6_0_NUM_UNIT(immediate_size / 4));
1364 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1365 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1366
1367 tu_cs_emit_array(cs, const_state->immediates, immediate_size);
1368 }
1369
1370 if (const_state->consts_ubo.idx != -1) {
1371 uint64_t iova = binary_iova + xs->info.constant_data_offset;
1372 uint32_t offset = const_state->consts_ubo.idx;
1373
1374 /* Upload UBO state for the constant data. */
1375 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
1376 tu_cs_emit(cs,
1377 CP_LOAD_STATE6_0_DST_OFF(offset) |
1378 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
1379 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1380 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1381 CP_LOAD_STATE6_0_NUM_UNIT(1));
1382 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1383 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1384 int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16);
1385 tu_cs_emit_qw(cs,
1386 iova |
1387 (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32);
1388
1389 /* Upload the constant data to the const file if needed. */
1390 const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;
1391
1392 if (!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
1393 for (int i = 0; i < ubo_state->num_enabled; i++) {
1394 if (ubo_state->range[i].ubo.block != offset ||
1395 ubo_state->range[i].ubo.bindless) {
1396 continue;
1397 }
1398
1399 uint32_t start = ubo_state->range[i].start;
1400 uint32_t end = ubo_state->range[i].end;
1401 uint32_t size = MIN2(end - start,
1402 (16 * xs->constlen) - ubo_state->range[i].offset);
1403
1404 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
1405 tu_cs_emit(cs,
1406 CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
1407 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1408 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
1409 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1410 CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
1411 tu_cs_emit_qw(cs, iova + start);
1412 }
1413 }
1414 }
1415
1416 /* emit statically-known FS driver param */
1417 if (stage == MESA_SHADER_FRAGMENT && const_state->driver_params_ubo.size > 0) {
1418 uint32_t data[4] = {xs->info.double_threadsize ? 128 : 64, 0, 0, 0};
1419 uint32_t size = ARRAY_SIZE(data);
1420
1421 /* A7XX TODO: Emit data via sub_cs instead of NOP */
1422 uint64_t iova = tu_cs_emit_data_nop(cs, data, size, 4);
1423 uint32_t base = const_state->driver_params_ubo.idx;
1424
1425 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
1426 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
1427 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
1428 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1429 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1430 CP_LOAD_STATE6_0_NUM_UNIT(1));
1431 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1432 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1433 int size_vec4s = DIV_ROUND_UP(size, 4);
1434 tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
1435 } else if (stage == MESA_SHADER_FRAGMENT && const_state->num_driver_params > 0) {
1436 uint32_t base =
1437 const_state->allocs.consts[IR3_CONST_ALLOC_DRIVER_PARAMS].offset_vec4;
1438 int32_t size = DIV_ROUND_UP(MAX2(const_state->num_driver_params, 4), 4);
1439 size = MAX2(MIN2(size + base, xs->constlen) - base, 0);
1440
1441 if (size > 0) {
1442 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + 4);
1443 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
1444 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
1445 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
1446 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
1447 CP_LOAD_STATE6_0_NUM_UNIT(size));
1448 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
1449 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
1450
1451 tu_cs_emit(cs, xs->info.double_threadsize ? 128 : 64);
1452 tu_cs_emit(cs, 0);
1453 tu_cs_emit(cs, 0);
1454 tu_cs_emit(cs, 0);
1455 }
1456 }
1457 }
1458
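/* Emit compute-stage configuration: shared consts enable, the common xs
 * state, the shared memory size, and the workgroup/local-invocation-id sysval
 * register assignments, accounting for the A6xx vs A7xx register layout
 * differences.
 */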
1459 template <chip CHIP>
1460 static void
1461 tu6_emit_cs_config(struct tu_cs *cs,
1462 const struct ir3_shader_variant *v,
1463 const struct tu_pvtmem_config *pvtmem,
1464 uint64_t binary_iova)
1465 {
1466 bool shared_consts_enable =
1467 ir3_const_state(v)->push_consts_type == IR3_PUSH_CONSTS_SHARED;
1468 tu6_emit_shared_consts_enable<CHIP>(cs, shared_consts_enable);
1469
1470 tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
1471 .cs_state = true,
1472 .cs_ibo = true,
1473 .cs_shared_const = shared_consts_enable));
1474
1475 tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_COMPUTE, v);
1476 tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
1477
1478 uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
1479 tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
1480 tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
1481 A6XX_SP_CS_UNKNOWN_A9B1_UNK6);
1482
1483 if (CHIP == A6XX && cs->device->physical_device->info->a6xx.has_lpac) {
1484 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1);
1485 tu_cs_emit(cs, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) |
1486 A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6);
1487 }
1488
1489 uint32_t local_invocation_id =
1490 ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
1491 uint32_t work_group_id =
1492 ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);
1493
1494 /*
1495 * Devices that do not support double threadsize take the threadsize from
1496 * A6XX_HLSQ_FS_CNTL_0_THREADSIZE instead of A6XX_HLSQ_CS_CNTL_1_THREADSIZE
1497 * which is always set to THREAD128.
1498 */
1499 enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
1500 enum a6xx_threadsize thrsz_cs = cs->device->physical_device->info->a6xx
1501 .supports_double_threadsize ? thrsz : THREAD128;
1502 if (CHIP == A6XX) {
1503 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
1504 tu_cs_emit(cs,
1505 A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
1506 A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
1507 A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
1508 A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
1509 tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
1510 A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz_cs));
1511 if (!cs->device->physical_device->info->a6xx.supports_double_threadsize) {
1512 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1);
1513 tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz));
1514 }
1515
1516 if (cs->device->physical_device->info->a6xx.has_lpac) {
1517 tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2);
1518 tu_cs_emit(cs,
1519 A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
1520 A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
1521 A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
1522 A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
1523 tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
1524 A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
1525 }
1526 } else {
1527 unsigned tile_height = (v->local_size[1] % 8 == 0) ? 3
1528 : (v->local_size[1] % 4 == 0) ? 5
1529 : (v->local_size[1] % 2 == 0) ? 9
1530 : 17;
1531 tu_cs_emit_regs(
1532 cs, HLSQ_CS_CNTL_1(CHIP,
1533 .linearlocalidregid = regid(63, 0), .threadsize = thrsz_cs,
1534 .workgrouprastorderzfirsten = true,
1535 .wgtilewidth = 4, .wgtileheight = tile_height));
1536
1537 tu_cs_emit_regs(cs, HLSQ_FS_CNTL_0(CHIP, .threadsize = THREAD64));
1538
1539 tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 1);
1540 tu_cs_emit(cs, A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
1541 A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
1542 A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
1543 A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
1544
1545 tu_cs_emit_regs(cs,
1546 SP_CS_CNTL_1(CHIP,
1547 .linearlocalidregid = regid(63, 0),
1548 .threadsize = thrsz_cs,
1549 .workitemrastorder =
1550 v->cs.force_linear_dispatch ?
1551 WORKITEMRASTORDER_LINEAR :
1552 WORKITEMRASTORDER_TILED, ));
1553
1554 tu_cs_emit_regs(
1555 cs, A7XX_HLSQ_CS_LOCAL_SIZE(.localsizex = v->local_size[0] - 1,
1556 .localsizey = v->local_size[1] - 1,
1557 .localsizez = v->local_size[2] - 1, ));
1558
1559       tu_cs_emit_regs(cs, A7XX_SP_CS_UNKNOWN_A9BE(0)); // Sometimes this is 0x08000000
1560 }
1561 }
1562
1563 #define TU6_EMIT_VFD_DEST_MAX_DWORDS (MAX_VERTEX_ATTRIBS + 2)
1564
1565 static void
1566 tu6_emit_vfd_dest(struct tu_cs *cs,
1567 const struct ir3_shader_variant *vs)
1568 {
1569 int32_t input_for_attr[MAX_VERTEX_ATTRIBS];
1570 uint32_t attr_count = 0;
1571
1572 for (unsigned i = 0; i < MAX_VERTEX_ATTRIBS; i++)
1573 input_for_attr[i] = -1;
1574
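   /* Build a map from vertex attribute location (VERT_ATTRIB_GENERICn) to the
    * VS input that consumes it; locations with no consumer keep -1 and are
    * emitted below with an invalid regid so nothing is written for them.
    */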
1575 for (unsigned i = 0; i < vs->inputs_count; i++) {
1576 if (vs->inputs[i].sysval || vs->inputs[i].regid == regid(63, 0))
1577 continue;
1578
1579 assert(vs->inputs[i].slot >= VERT_ATTRIB_GENERIC0);
1580 unsigned loc = vs->inputs[i].slot - VERT_ATTRIB_GENERIC0;
1581 input_for_attr[loc] = i;
1582 attr_count = MAX2(attr_count, loc + 1);
1583 }
1584
1585 tu_cs_emit_regs(cs,
1586 A6XX_VFD_CONTROL_0(
1587 .fetch_cnt = attr_count, /* decode_cnt for binning pass ? */
1588 .decode_cnt = attr_count));
1589
1590 if (attr_count)
1591 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DEST_CNTL_INSTR(0), attr_count);
1592
1593 for (unsigned i = 0; i < attr_count; i++) {
1594 if (input_for_attr[i] >= 0) {
1595 unsigned input_idx = input_for_attr[i];
1596 tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0,
1597 .writemask = vs->inputs[input_idx].compmask,
1598 .regid = vs->inputs[input_idx].regid).value);
1599 } else {
1600 tu_cs_emit(cs, A6XX_VFD_DEST_CNTL_INSTR(0,
1601 .writemask = 0,
1602 .regid = regid(63, 0)).value);
1603 }
1604 }
1605 }
1606
1607 static enum a6xx_tex_prefetch_cmd
1608 tu6_tex_opc_to_prefetch_cmd(opc_t tex_opc)
1609 {
1610 switch (tex_opc) {
1611 case OPC_SAM:
1612 return TEX_PREFETCH_SAM;
1613 default:
1614       unreachable("Unknown tex opc for prefetch cmd");
1615 }
1616 }
1617
1618 template <chip CHIP>
1619 static void
1620 tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
1621 {
1622 uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
1623 uint32_t ij_regid[IJ_COUNT];
1624 uint32_t smask_in_regid, shading_rate_regid;
1625
1626 bool sample_shading = fs->per_samp | fs->key.sample_shading;
1627 bool enable_varyings = fs->total_in > 0;
1628
1629 samp_id_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID);
1630 smask_in_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN);
1631 face_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE);
1632 coord_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD);
1633 zwcoord_regid = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0);
1634 shading_rate_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_SHADING_RATE);
1635 for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
1636 ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);
1637
1638 if (fs->num_sampler_prefetch > 0) {
1639 /* It seems like ij_pix is *required* to be r0.x */
1640 assert(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]) ||
1641 ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
1642 }
1643
1644 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);
1645 tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) |
1646 COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID(0x1ff)) |
1647 COND(CHIP >= A7XX, A6XX_SP_FS_PREFETCH_CNTL_CONSTSLOTID4COORD(0x1ff)) |
1648 COND(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]),
1649 A6XX_SP_FS_PREFETCH_CNTL_IJ_WRITE_DISABLE) |
1650 COND(fs->prefetch_end_of_quad,
1651 A6XX_SP_FS_PREFETCH_CNTL_ENDOFQUAD));
1652 for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1653 const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1654 tu_cs_emit(
1655 cs, SP_FS_PREFETCH_CMD(
1656 CHIP, i, .src = prefetch->src, .samp_id = prefetch->samp_id,
1657 .tex_id = prefetch->tex_id, .dst = prefetch->dst,
1658 .wrmask = prefetch->wrmask, .half = prefetch->half_precision,
1659 .bindless = prefetch->bindless,
1660 .cmd = tu6_tex_opc_to_prefetch_cmd(prefetch->tex_opc), ).value);
1661 }
1662
1663 if (fs->num_sampler_prefetch > 0) {
1664 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch);
1665 for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1666 const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1667 tu_cs_emit(cs,
1668 A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) |
1669 A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id));
1670 }
1671 }
1672
1673 tu_cs_emit_regs(cs,
1674 HLSQ_CONTROL_1_REG(CHIP,
1675 .primallocthreshold =
1676 cs->device->physical_device->info->a6xx.prim_alloc_threshold),
1677 HLSQ_CONTROL_2_REG(CHIP, .faceregid = face_regid,
1678 .sampleid = samp_id_regid,
1679 .samplemask = smask_in_regid,
1680 .centerrhw = ij_regid[IJ_PERSP_CENTER_RHW]),
1681 HLSQ_CONTROL_3_REG(CHIP, .ij_persp_pixel = ij_regid[IJ_PERSP_PIXEL],
1682 .ij_linear_pixel = ij_regid[IJ_LINEAR_PIXEL],
1683 .ij_persp_centroid = ij_regid[IJ_PERSP_CENTROID],
1684 .ij_linear_centroid = ij_regid[IJ_LINEAR_CENTROID]),
1685 HLSQ_CONTROL_4_REG(CHIP, .ij_persp_sample = ij_regid[IJ_PERSP_SAMPLE],
1686 .ij_linear_sample = ij_regid[IJ_LINEAR_SAMPLE],
1687 .xycoordregid = coord_regid,
1688 .zwcoordregid = zwcoord_regid),
1689 HLSQ_CONTROL_5_REG(CHIP, .linelengthregid = 0xfc,
1690 .foveationqualityregid = shading_rate_regid), );
1691
1692 if (CHIP >= A7XX) {
1693 uint32_t sysval_regs = 0;
1694 for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) {
1695 if (VALIDREG(ij_regid[i])) {
1696 if (i == IJ_PERSP_CENTER_RHW)
1697 sysval_regs += 1;
1698 else
1699 sysval_regs += 2;
1700 }
1701 }
1702
1703 for (uint32_t sysval : { face_regid, samp_id_regid, smask_in_regid,
1704 shading_rate_regid }) {
1705 if (VALIDREG(sysval))
1706 sysval_regs += 1;
1707 }
1708
1709 for (uint32_t sysval : { coord_regid, zwcoord_regid }) {
1710 if (VALIDREG(sysval))
1711 sysval_regs += 2;
1712 }
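      /* Worked example: an FS reading persp-pixel barycentrics (2 regs),
       * FragCoord.xyzw (2 + 2 regs) and gl_FrontFacing (1 reg) ends up with
       * sysval_regs = 7.
       */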
1713
1714 tu_cs_emit_regs(cs, A7XX_HLSQ_UNKNOWN_A9AE(.sysval_regs_count = sysval_regs,
1715 .unk8 = 1,
1716 .unk9 = 1));
1717 }
1718
1719 enum a6xx_threadsize thrsz = fs->info.double_threadsize ? THREAD128 : THREAD64;
1720 tu_cs_emit_regs(cs, HLSQ_FS_CNTL_0(CHIP, .threadsize = thrsz, .varyings = enable_varyings));
1721
1722 bool need_size = fs->frag_face || fs->fragcoord_compmask != 0;
1723 bool need_size_persamp = false;
1724 if (VALIDREG(ij_regid[IJ_PERSP_CENTER_RHW])) {
1725 if (sample_shading)
1726 need_size_persamp = true;
1727 else
1728 need_size = true;
1729 }
1730
1731 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1);
1732 tu_cs_emit(cs,
1733 CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) |
1734 CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) |
1735 CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) |
1736 CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1737 CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) |
1738 CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1739 COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1740 COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1741 COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask)));
1742
1743 tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2);
1744 tu_cs_emit(cs,
1745 CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) |
1746 CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) |
1747 CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) |
1748 CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1749 CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) |
1750 CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1751 COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1752 COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) |
1753 COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1754 COND(fs->fragcoord_compmask != 0,
1755 A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask)));
1756 tu_cs_emit(cs,
1757 A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE(
1758 sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) |
1759 CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
1760 CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
1761 CONDREG(ij_regid[IJ_PERSP_CENTER_RHW], A6XX_RB_RENDER_CONTROL1_CENTERRHW) |
1762 COND(fs->post_depth_coverage, A6XX_RB_RENDER_CONTROL1_POSTDEPTHCOVERAGE) |
1763 COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS) |
1764 CONDREG(shading_rate_regid, A6XX_RB_RENDER_CONTROL1_FOVEATION));
1765
1766 tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CNTL, 1);
1767 tu_cs_emit(cs, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE));
1768
1769 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1);
1770 tu_cs_emit(cs, CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) |
1771 A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE(
1772 sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER));
1773
1774 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 1);
1775 tu_cs_emit(cs, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE));
1776
1777 uint32_t varmask[4] = { 0 };
1778
1779 for (int i = ir3_next_varying(fs, -1); i < fs->inputs_count;
1780 i = ir3_next_varying(fs, i)) {
1781 if (fs->inputs[i].inloc >= fs->total_in)
1782 continue;
1783
1784 unsigned loc = fs->inputs[i].inloc;
1785 for (int j = 0; j < util_last_bit(fs->inputs[i].compmask); j++) {
1786 uint8_t comploc = loc + j;
1787 varmask[comploc / 32] |= 1 << (comploc % 32);
1788 }
1789 }
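   /* Example: a vec3 varying (compmask 0x7) at inloc 4 sets bits 4..6 of
    * varmask[0]. VPC_VAR_DISABLE takes the complement, so any component not
    * consumed by an FS input is disabled in the VPC.
    */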
1790
1791 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
1792 tu_cs_emit(cs, ~varmask[0]);
1793 tu_cs_emit(cs, ~varmask[1]);
1794 tu_cs_emit(cs, ~varmask[2]);
1795 tu_cs_emit(cs, ~varmask[3]);
1796
1797 unsigned primid_loc = ir3_find_input_loc(fs, VARYING_SLOT_PRIMITIVE_ID);
1798 unsigned viewid_loc = ir3_find_input_loc(fs, VARYING_SLOT_VIEW_INDEX);
1799
1800 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
1801 tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs->total_in) |
1802 COND(fs && fs->total_in, A6XX_VPC_CNTL_0_VARYING) |
1803 A6XX_VPC_CNTL_0_PRIMIDLOC(primid_loc) |
1804 A6XX_VPC_CNTL_0_VIEWIDLOC(viewid_loc));
1805 }
1806
1807 static void
1808 tu6_emit_fs_outputs(struct tu_cs *cs,
1809 const struct ir3_shader_variant *fs)
1810 {
1811 uint32_t smask_regid, posz_regid, stencilref_regid;
1812
1813 posz_regid = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH);
1814 smask_regid = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK);
1815 stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL);
1816
1817 int output_reg_count = 0;
1818 uint32_t fragdata_regid[8];
1819
1820 assert(!fs->color0_mrt);
1821 for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) {
1822 fragdata_regid[i] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + i);
1823 if (VALIDREG(fragdata_regid[i]))
1824 output_reg_count = i + 1;
1825 }
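   /* output_reg_count is one past the highest MRT written by the shader; any
    * unwritten MRTs below it still get an entry, but with an invalid regid and
    * without setting bits in the render components mask.
    */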
1826
1827 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1);
1828 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) |
1829 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) |
1830 A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) |
1831 COND(fs->dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1832
1833    /* There is no point in enabling a component that the shader does not
1834     * write. Per the VK spec this is UB, but a few apps depend on the
1835     * attachment being left unchanged when the FS has no corresponding output.
1836     */
1837 uint32_t fs_render_components = 0;
1838
1839 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), output_reg_count);
1840 for (uint32_t i = 0; i < output_reg_count; i++) {
1841 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) |
1842 (COND(fragdata_regid[i] & HALF_REG_ID,
1843 A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)));
1844
1845 if (VALIDREG(fragdata_regid[i])) {
1846 fs_render_components |= 0xf << (i * 4);
1847 }
1848 }
1849
1850 tu_cs_emit_regs(cs,
1851 A6XX_SP_FS_RENDER_COMPONENTS(.dword = fs_render_components));
1852
1853 tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 1);
1854 tu_cs_emit(cs, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
1855 COND(fs->writes_smask, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
1856 COND(fs->writes_stencilref, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
1857 COND(fs->dual_src_blend, A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1858
1859 tu_cs_emit_regs(cs,
1860 A6XX_RB_RENDER_COMPONENTS(.dword = fs_render_components));
1861 }
1862
1863 template <chip CHIP>
1864 void
1865 tu6_emit_vs(struct tu_cs *cs,
1866 const struct ir3_shader_variant *vs,
1867 uint32_t view_mask)
1868 {
1869 bool multi_pos_output = vs->multi_pos_output;
1870
1871 uint32_t multiview_views = util_logbase2(view_mask) + 1;
1872 uint32_t multiview_cntl = view_mask ?
1873 A6XX_PC_MULTIVIEW_CNTL_ENABLE |
1874 A6XX_PC_MULTIVIEW_CNTL_VIEWS(multiview_views) |
1875 COND(!multi_pos_output, A6XX_PC_MULTIVIEW_CNTL_DISABLEMULTIPOS)
1876 : 0;
1877
1878 /* Copy what the blob does here. This will emit an extra 0x3f
1879 * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
1880 * this is working around yet.
1881 */
1882 if (cs->device->physical_device->info->a6xx.has_cp_reg_write) {
1883 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
1884 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
1885 tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
1886 } else {
1887 tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_CNTL, 1);
1888 }
1889 tu_cs_emit(cs, multiview_cntl);
1890
1891 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_MULTIVIEW_CNTL, 1);
1892 tu_cs_emit(cs, multiview_cntl);
1893
1894 if (multiview_cntl &&
1895 cs->device->physical_device->info->a6xx.supports_multiview_mask) {
1896 tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_MASK, 1);
1897 tu_cs_emit(cs, view_mask);
1898 }
1899
1900 if (CHIP >= A7XX) {
1901 tu_cs_emit_pkt4(cs, REG_A7XX_VPC_MULTIVIEW_CNTL, 1);
1902 tu_cs_emit(cs, multiview_cntl);
1903
1904 tu_cs_emit_pkt4(cs, REG_A7XX_VPC_MULTIVIEW_MASK, 1);
1905 tu_cs_emit(cs, view_mask);
1906 }
1907
1908 tu6_emit_vfd_dest(cs, vs);
1909
1910 const uint32_t vertexid_regid =
1911 ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
1912 const uint32_t instanceid_regid =
1913 ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
1914
1915 /* Note: we currently don't support multiview with tess or GS. If we did,
1916 * and the HW actually works, then we'd have to somehow share this across
1917 * stages. Note that the blob doesn't support this either.
1918 */
1919 const uint32_t viewid_regid =
1920 ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX);
1921
1922 const uint32_t vs_primitiveid_regid =
1923 ir3_find_sysval_regid(vs, SYSTEM_VALUE_PRIMITIVE_ID);
1924
1925 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_1, 1);
1926 tu_cs_emit(cs, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) |
1927 A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) |
1928 A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) |
1929 A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid));
1930 }
1931 TU_GENX(tu6_emit_vs);
1932
1933 template <chip CHIP>
1934 void
1935 tu6_emit_hs(struct tu_cs *cs,
1936 const struct ir3_shader_variant *hs)
1937 {
1938 const uint32_t hs_rel_patch_regid =
1939 ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3);
1940 const uint32_t hs_invocation_regid =
1941 ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3);
1942
1943 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_2, 1);
1944 tu_cs_emit(cs, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) |
1945 A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid));
1946
1947 if (hs) {
1948 tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
1949 tu_cs_emit(cs, hs->tess.tcs_vertices_out);
1950 }
1951 }
1952 TU_GENX(tu6_emit_hs);
1953
1954 template <chip CHIP>
1955 void
1956 tu6_emit_ds(struct tu_cs *cs,
1957 const struct ir3_shader_variant *ds)
1958 {
1959 const uint32_t ds_rel_patch_regid =
1960 ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3);
1961 const uint32_t tess_coord_x_regid =
1962 ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD);
1963 const uint32_t tess_coord_y_regid = VALIDREG(tess_coord_x_regid) ?
1964 tess_coord_x_regid + 1 :
1965 regid(63, 0);
1966 const uint32_t ds_primitiveid_regid =
1967 ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID);
1968
1969 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_3, 2);
1970 tu_cs_emit(cs, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) |
1971 A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) |
1972 A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) |
1973 A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid));
1974 tu_cs_emit(cs, 0x000000fc); /* VFD_CONTROL_4 */
1975 }
1976 TU_GENX(tu6_emit_ds);
1977
1978 static enum a6xx_tess_output
1979 primitive_to_tess(enum mesa_prim primitive) {
1980 switch (primitive) {
1981 case MESA_PRIM_POINTS:
1982 return TESS_POINTS;
1983 case MESA_PRIM_LINE_STRIP:
1984 return TESS_LINES;
1985 case MESA_PRIM_TRIANGLE_STRIP:
1986 return TESS_CW_TRIS;
1987 default:
1988 unreachable("");
1989 }
1990 }
1991
1992 template <chip CHIP>
1993 void
1994 tu6_emit_gs(struct tu_cs *cs,
1995 const struct ir3_shader_variant *gs)
1996 {
1997 const uint32_t gsheader_regid =
1998 ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3);
1999
2000 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_5, 1);
2001 tu_cs_emit(cs, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) |
2002 0xfc00);
2003
2004 if (gs) {
2005 uint32_t vertices_out, invocations;
2006
2007 vertices_out = gs->gs.vertices_out - 1;
2008 enum a6xx_tess_output output = primitive_to_tess((enum mesa_prim) gs->gs.output_primitive);
2009 invocations = gs->gs.invocations - 1;
2010
2011 uint32_t primitive_cntl =
2012 A6XX_PC_PRIMITIVE_CNTL_5(.gs_vertices_out = vertices_out,
2013 .gs_invocations = invocations,
2014 .gs_output = output,).value;
2015
2016 tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
2017 tu_cs_emit(cs, primitive_cntl);
2018
2019 if (CHIP >= A7XX) {
2020 tu_cs_emit_pkt4(cs, REG_A7XX_VPC_PRIMITIVE_CNTL_5, 1);
2021 tu_cs_emit(cs, primitive_cntl);
2022 } else {
2023 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_PARAM, 1);
2024 tu_cs_emit(cs, 0xff);
2025 }
2026 }
2027 }
2028 TU_GENX(tu6_emit_gs);
2029
2030 template <chip CHIP>
2031 void
2032 tu6_emit_fs(struct tu_cs *cs,
2033 const struct ir3_shader_variant *fs)
2034 {
2035 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_6, 1);
2036 tu_cs_emit(cs, COND(fs && fs->reads_primid, A6XX_VFD_CONTROL_6_PRIMID4PSEN));
2037
2038 tu_cs_emit_regs(cs, A6XX_PC_PS_CNTL(.primitiveiden = fs && fs->reads_primid));
2039
2040 if (CHIP >= A7XX) {
2041 tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2));
2042 tu_cs_emit_regs(cs, A7XX_HLSQ_FS_UNKNOWN_A9AA(.consts_load_disable = false));
2043 }
2044
2045 if (fs) {
2046 tu6_emit_fs_inputs<CHIP>(cs, fs);
2047 tu6_emit_fs_outputs(cs, fs);
2048 } else {
2049 /* TODO: check if these can be skipped if fs is disabled */
2050 struct ir3_shader_variant dummy_variant = {};
2051 tu6_emit_fs_inputs<CHIP>(cs, &dummy_variant);
2052 tu6_emit_fs_outputs(cs, &dummy_variant);
2053 }
2054 }
2055 TU_GENX(tu6_emit_fs);
2056
2057 template <chip CHIP>
2058 static void
2059 tu6_emit_variant(struct tu_cs *cs,
2060 gl_shader_stage stage,
2061 const struct ir3_shader_variant *xs,
2062 struct tu_pvtmem_config *pvtmem_config,
2063 uint32_t view_mask,
2064 uint64_t binary_iova)
2065 {
2066 if (stage == MESA_SHADER_COMPUTE) {
2067 tu6_emit_cs_config<CHIP>(cs, xs, pvtmem_config, binary_iova);
2068 return;
2069 }
2070
2071 tu6_emit_xs(cs, stage, xs, pvtmem_config, binary_iova);
2072
2073 switch (stage) {
2074 case MESA_SHADER_VERTEX:
2075 tu6_emit_vs<CHIP>(cs, xs, view_mask);
2076 break;
2077 case MESA_SHADER_TESS_CTRL:
2078 tu6_emit_hs<CHIP>(cs, xs);
2079 break;
2080 case MESA_SHADER_TESS_EVAL:
2081 tu6_emit_ds<CHIP>(cs, xs);
2082 break;
2083 case MESA_SHADER_GEOMETRY:
2084 tu6_emit_gs<CHIP>(cs, xs);
2085 break;
2086 case MESA_SHADER_FRAGMENT:
2087 tu6_emit_fs<CHIP>(cs, xs);
2088 break;
2089 default:
2090 unreachable("unknown shader stage");
2091 }
2092 }
2093
2094 static VkResult
2095 tu_setup_pvtmem(struct tu_device *dev,
2096 struct tu_shader *shader,
2097 struct tu_pvtmem_config *config,
2098 uint32_t pvtmem_bytes,
2099 bool per_wave)
2100 {
2101 if (!pvtmem_bytes) {
2102 memset(config, 0, sizeof(*config));
2103 return VK_SUCCESS;
2104 }
2105
2106    /* Allocating private memory BOs on a per-pipeline basis has a substantial
2107     * memory footprint, and it isn't required: the same BO can be shared by
2108     * multiple pipelines as long as they have the same private memory layout
2109     * (sizes and per-wave/per-fiber). Sharing a BO between active pipelines
2110     * with differing private memory layouts would let one pipeline overwrite
2111     * another's data, resulting in memory corruption.
2112 *
2113 * To avoid this, we create private memory BOs on a per-device level with
2114 * an associated private memory layout then dynamically grow them when
2115 * needed and reuse them across pipelines. Growth is done in terms of
2116 * powers of two so that we can avoid frequent reallocation of the
2117 * private memory BOs.
2118 */
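   /* Worked example: a request for 5000 bytes of private memory per fiber is
    * aligned up to 5120 and then rounded to the next power of two, 8192; the
    * per-SP size then becomes ALIGN(8192 * fibers_per_sp, 4096).
    */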
2119
2120 struct tu_pvtmem_bo *pvtmem_bo =
2121 per_wave ? &dev->wave_pvtmem_bo : &dev->fiber_pvtmem_bo;
2122 mtx_lock(&pvtmem_bo->mtx);
2123
2124 if (pvtmem_bo->per_fiber_size < pvtmem_bytes) {
2125 if (pvtmem_bo->bo)
2126 tu_bo_finish(dev, pvtmem_bo->bo);
2127
2128 pvtmem_bo->per_fiber_size =
2129 util_next_power_of_two(ALIGN(pvtmem_bytes, 512));
2130 pvtmem_bo->per_sp_size =
2131 ALIGN(pvtmem_bo->per_fiber_size *
2132 dev->physical_device->info->fibers_per_sp,
2133 1 << 12);
2134 uint32_t total_size =
2135 dev->physical_device->info->num_sp_cores * pvtmem_bo->per_sp_size;
2136
2137 VkResult result = tu_bo_init_new(dev, NULL, &pvtmem_bo->bo, total_size,
2138 TU_BO_ALLOC_INTERNAL_RESOURCE, "pvtmem");
2139 if (result != VK_SUCCESS) {
2140 mtx_unlock(&pvtmem_bo->mtx);
2141 return result;
2142 }
2143 }
2144
2145 config->per_wave = per_wave;
2146 config->per_fiber_size = pvtmem_bo->per_fiber_size;
2147 config->per_sp_size = pvtmem_bo->per_sp_size;
2148
2149 shader->pvtmem_bo = tu_bo_get_ref(pvtmem_bo->bo);
2150 config->iova = shader->pvtmem_bo->iova;
2151
2152 mtx_unlock(&pvtmem_bo->mtx);
2153
2154 return VK_SUCCESS;
2155 }
2156
2157 static uint64_t
2158 tu_upload_variant(struct tu_cs *cs,
2159 const struct ir3_shader_variant *variant)
2160 {
2161 struct tu_cs_memory memory;
2162
2163 if (!variant)
2164 return 0;
2165
2166    /* This expects to get sufficient alignment because shaders are allocated
2167     * first and the total size is always aligned correctly.
2168     * Note: an assert in tu6_emit_xs_config validates the alignment.
2169 */
2170 tu_cs_alloc(cs, variant->info.size / 4, 1, &memory);
2171
2172 memcpy(memory.map, variant->bin, variant->info.size);
2173 return memory.iova;
2174 }
2175
2176 static VkResult
2177 tu_upload_shader(struct tu_device *dev,
2178 struct tu_shader *shader)
2179 {
2180 const struct ir3_shader_variant *v = shader->variant;
2181 const struct ir3_shader_variant *binning = v ? v->binning : NULL;
2182 const struct ir3_shader_variant *safe_const = shader->safe_const_variant;
2183
2184 if (v->type == MESA_SHADER_VERTEX && v->stream_output.num_outputs != 0)
2185 binning = v;
2186
2187 uint32_t size = 0;
2188 if (v->type == MESA_SHADER_VERTEX)
2189 size += TU6_EMIT_VFD_DEST_MAX_DWORDS;
2190
2191 const unsigned xs_size = 128;
2192 const unsigned vpc_size = 32 + (v->stream_output.num_outputs != 0 ? 256 : 0);
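   /* xs_size and vpc_size are dword budgets for the per-variant config and the
    * (optional) VPC/streamout state emitted below; the suballocation is sized
    * from them plus the shader binary sizes so the sub-streams can't overflow.
    */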
2193
2194 size += xs_size + tu_xs_get_additional_cs_size_dwords(v);
2195 size += v->info.size / 4;
2196 if (binning) {
2197 size += xs_size + tu_xs_get_additional_cs_size_dwords(binning);
2198 size += binning->info.size / 4;
2199 }
2200
2201 if (safe_const) {
2202 size += xs_size + tu_xs_get_additional_cs_size_dwords(safe_const);
2203 size += safe_const->info.size / 4;
2204 }
2205
2206 /* We emit an empty VPC including streamout state in the binning draw state */
2207 if (binning || v->type == MESA_SHADER_GEOMETRY) {
2208 size += vpc_size;
2209 }
2210
2211 pthread_mutex_lock(&dev->pipeline_mutex);
2212 VkResult result = tu_suballoc_bo_alloc(&shader->bo, &dev->pipeline_suballoc,
2213 size * 4, 128);
2214 pthread_mutex_unlock(&dev->pipeline_mutex);
2215
2216 if (result != VK_SUCCESS)
2217 return result;
2218
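   /* All variants of this shader share a single private memory config, so use
    * the largest pvtmem size across them and only keep the per-wave layout if
    * every variant allows it.
    */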
2219 uint32_t pvtmem_size = v->pvtmem_size;
2220 bool per_wave = v->pvtmem_per_wave;
2221
2222 if (v->binning) {
2223 pvtmem_size = MAX2(pvtmem_size, shader->variant->binning->pvtmem_size);
2224 if (!shader->variant->binning->pvtmem_per_wave)
2225 per_wave = false;
2226 }
2227
2228 if (shader->safe_const_variant) {
2229 pvtmem_size = MAX2(pvtmem_size, shader->safe_const_variant->pvtmem_size);
2230 if (!shader->safe_const_variant->pvtmem_per_wave)
2231 per_wave = false;
2232
2233 if (shader->safe_const_variant->binning) {
2234 pvtmem_size = MAX2(pvtmem_size, shader->safe_const_variant->binning->pvtmem_size);
2235 if (!shader->safe_const_variant->binning->pvtmem_per_wave)
2236 per_wave = false;
2237 }
2238 }
2239
2240 struct tu_pvtmem_config pvtmem_config;
2241
2242 result = tu_setup_pvtmem(dev, shader, &pvtmem_config, pvtmem_size, per_wave);
2243 if (result != VK_SUCCESS) {
2244 pthread_mutex_lock(&dev->pipeline_mutex);
2245 tu_suballoc_bo_free(&dev->pipeline_suballoc, &shader->bo);
2246 pthread_mutex_unlock(&dev->pipeline_mutex);
2247 return result;
2248 }
2249
2250 TU_RMV(cmd_buffer_suballoc_bo_create, dev, &shader->bo);
2251 tu_cs_init_suballoc(&shader->cs, dev, &shader->bo);
2252
2253 uint64_t iova = tu_upload_variant(&shader->cs, v);
2254 uint64_t binning_iova = tu_upload_variant(&shader->cs, binning);
2255 uint64_t safe_const_iova = tu_upload_variant(&shader->cs, safe_const);
2256
2257 struct tu_cs sub_cs;
2258 tu_cs_begin_sub_stream(&shader->cs, xs_size +
2259 tu_xs_get_additional_cs_size_dwords(v), &sub_cs);
2260 TU_CALLX(dev, tu6_emit_variant)(
2261 &sub_cs, shader->variant->type, shader->variant, &pvtmem_config,
2262 shader->view_mask, iova);
2263 shader->state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2264
2265 if (safe_const) {
2266 tu_cs_begin_sub_stream(&shader->cs, xs_size +
2267 tu_xs_get_additional_cs_size_dwords(safe_const), &sub_cs);
2268 TU_CALLX(dev, tu6_emit_variant)(
2269 &sub_cs, v->type, safe_const, &pvtmem_config, shader->view_mask,
2270 safe_const_iova);
2271 shader->safe_const_state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2272 }
2273
2274 if (binning) {
2275 tu_cs_begin_sub_stream(&shader->cs, xs_size + vpc_size +
2276 tu_xs_get_additional_cs_size_dwords(binning), &sub_cs);
2277 TU_CALLX(dev, tu6_emit_variant)(
2278 &sub_cs, v->type, binning, &pvtmem_config, shader->view_mask,
2279 binning_iova);
2280 /* emit an empty VPC */
2281 TU_CALLX(dev, tu6_emit_vpc)(&sub_cs, binning, NULL, NULL, NULL, NULL);
2282 shader->binning_state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2283 }
2284
2285 /* We don't support binning variants for GS, so the same draw state is used
2286 * when binning and when drawing, but the VPC draw state is not executed
2287 * when binning so we still need to generate an appropriate VPC config for
2288 * binning.
2289 */
2290 if (v->type == MESA_SHADER_GEOMETRY) {
2291 tu_cs_begin_sub_stream(&shader->cs, vpc_size, &sub_cs);
2292 TU_CALLX(dev, tu6_emit_vpc)(&sub_cs, NULL, NULL, NULL, v, NULL);
2293 shader->binning_state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2294 }
2295
2296 return VK_SUCCESS;
2297 }
2298
2299 static bool
2300 tu_shader_serialize(struct vk_pipeline_cache_object *object,
2301 struct blob *blob);
2302
2303 static struct vk_pipeline_cache_object *
2304 tu_shader_deserialize(struct vk_pipeline_cache *cache,
2305 const void *key_data,
2306 size_t key_size,
2307 struct blob_reader *blob);
2308
2309 static void
2310 tu_shader_pipeline_cache_object_destroy(struct vk_device *vk_device,
2311 struct vk_pipeline_cache_object *object)
2312 {
2313 struct tu_device *device = container_of(vk_device, struct tu_device, vk);
2314 struct tu_shader *shader =
2315 container_of(object, struct tu_shader, base);
2316
2317 vk_pipeline_cache_object_finish(&shader->base);
2318 tu_shader_destroy(device, shader);
2319 }
2320
2321 const struct vk_pipeline_cache_object_ops tu_shader_ops = {
2322 .serialize = tu_shader_serialize,
2323 .deserialize = tu_shader_deserialize,
2324 .destroy = tu_shader_pipeline_cache_object_destroy,
2325 };
2326
2327 static struct tu_shader *
2328 tu_shader_init(struct tu_device *dev, const void *key_data, size_t key_size)
2329 {
2330 VK_MULTIALLOC(ma);
2331 VK_MULTIALLOC_DECL(&ma, struct tu_shader, shader, 1);
2332 VK_MULTIALLOC_DECL_SIZE(&ma, char, obj_key_data, key_size);
2333
2334 if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc,
2335 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
2336 return NULL;
2337
2338 memcpy(obj_key_data, key_data, key_size);
2339
2340 vk_pipeline_cache_object_init(&dev->vk, &shader->base,
2341 &tu_shader_ops, obj_key_data, key_size);
2342
2343 shader->const_state.fdm_ubo.idx = -1;
2344 shader->const_state.dynamic_offsets_ubo.idx = -1;
2345 shader->const_state.inline_uniforms_ubo.idx = -1;
2346
2347 return shader;
2348 }
2349
2350 static bool
2351 tu_shader_serialize(struct vk_pipeline_cache_object *object,
2352 struct blob *blob)
2353 {
2354 struct tu_shader *shader =
2355 container_of(object, struct tu_shader, base);
2356
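   /* Keep the field order here in sync with tu_shader_deserialize() below. */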
2357 blob_write_bytes(blob, &shader->const_state, sizeof(shader->const_state));
2358 blob_write_bytes(blob, &shader->dynamic_descriptor_sizes,
2359 sizeof(shader->dynamic_descriptor_sizes));
2360 blob_write_uint32(blob, shader->view_mask);
2361 blob_write_uint8(blob, shader->active_desc_sets);
2362
2363 ir3_store_variant(blob, shader->variant);
2364
2365 if (shader->safe_const_variant) {
2366 blob_write_uint8(blob, 1);
2367 ir3_store_variant(blob, shader->safe_const_variant);
2368 } else {
2369 blob_write_uint8(blob, 0);
2370 }
2371
2374 switch (shader->variant->type) {
2375 case MESA_SHADER_TESS_EVAL:
2376 blob_write_bytes(blob, &shader->tes, sizeof(shader->tes));
2377 break;
2378 case MESA_SHADER_FRAGMENT:
2379 blob_write_bytes(blob, &shader->fs, sizeof(shader->fs));
2380 break;
2381 default:
2382 break;
2383 }
2384
2385 return true;
2386 }
2387
2388 static struct vk_pipeline_cache_object *
2389 tu_shader_deserialize(struct vk_pipeline_cache *cache,
2390 const void *key_data,
2391 size_t key_size,
2392 struct blob_reader *blob)
2393 {
2394 struct tu_device *dev =
2395 container_of(cache->base.device, struct tu_device, vk);
2396 struct tu_shader *shader =
2397 tu_shader_init(dev, key_data, key_size);
2398
2399 if (!shader)
2400 return NULL;
2401
2402 blob_copy_bytes(blob, &shader->const_state, sizeof(shader->const_state));
2403 blob_copy_bytes(blob, &shader->dynamic_descriptor_sizes,
2404 sizeof(shader->dynamic_descriptor_sizes));
2405 shader->view_mask = blob_read_uint32(blob);
2406 shader->active_desc_sets = blob_read_uint8(blob);
2407
2408 shader->variant = ir3_retrieve_variant(blob, dev->compiler, NULL);
2409
2410 bool has_safe_const = blob_read_uint8(blob);
2411 if (has_safe_const)
2412 shader->safe_const_variant = ir3_retrieve_variant(blob, dev->compiler, NULL);
2413
2414 switch (shader->variant->type) {
2415 case MESA_SHADER_TESS_EVAL:
2416 blob_copy_bytes(blob, &shader->tes, sizeof(shader->tes));
2417 break;
2418 case MESA_SHADER_FRAGMENT:
2419 blob_copy_bytes(blob, &shader->fs, sizeof(shader->fs));
2420 break;
2421 default:
2422 break;
2423 }
2424
2425 VkResult result = tu_upload_shader(dev, shader);
2426 if (result != VK_SUCCESS) {
2427 vk_free(&dev->vk.alloc, shader);
2428 return NULL;
2429 }
2430
2431 return &shader->base;
2432 }
2433
2434 VkResult
2435 tu_shader_create(struct tu_device *dev,
2436 struct tu_shader **shader_out,
2437 nir_shader *nir,
2438 const struct tu_shader_key *key,
2439 const struct ir3_shader_key *ir3_key,
2440 const void *key_data,
2441 size_t key_size,
2442 struct tu_pipeline_layout *layout,
2443 bool executable_info)
2444 {
2445 struct tu_shader *shader = tu_shader_init(dev, key_data, key_size);
2446
2447 if (!shader)
2448 return VK_ERROR_OUT_OF_HOST_MEMORY;
2449
2450 const nir_opt_access_options access_options = {
2451 .is_vulkan = true,
2452 };
2453 NIR_PASS_V(nir, nir_opt_access, &access_options);
2454
2455 if (nir->info.stage == MESA_SHADER_FRAGMENT) {
2456 const nir_input_attachment_options att_options = {
2457 .use_fragcoord_sysval = true,
2458 .use_layer_id_sysval = false,
2459 /* When using multiview rendering, we must use
2460 * gl_ViewIndex as the layer id to pass to the texture
2461 * sampling function. gl_Layer doesn't work when
2462 * multiview is enabled.
2463 */
2464 .use_view_id_for_layer = key->multiview_mask != 0,
2465 .unscaled_depth_stencil_ir3 =
2466 key->dynamic_renderpass && !(key->read_only_input_attachments & 1),
2467 .unscaled_input_attachment_ir3 =
2468 key->dynamic_renderpass ?
2469 ~(key->read_only_input_attachments >> 1) :
2470 key->unscaled_input_fragcoord,
2471 };
2472 NIR_PASS_V(nir, nir_lower_input_attachments, &att_options);
2473 }
2474
2475 /* This has to happen before lower_input_attachments, because we have to
2476 * lower input attachment coordinates except if unscaled.
2477 */
2478 const struct lower_fdm_options fdm_options = {
2479 .num_views = MAX2(util_last_bit(key->multiview_mask), 1),
2480 .adjust_fragcoord = key->fragment_density_map,
2481 };
2482 NIR_PASS_V(nir, tu_nir_lower_fdm, &fdm_options);
2483
2485 /* This needs to happen before multiview lowering which rewrites store
2486 * instructions of the position variable, so that we can just rewrite one
2487 * store at the end instead of having to rewrite every store specified by
2488 * the user.
2489 */
2490 ir3_nir_lower_io_to_temporaries(nir);
2491
2492 if (nir->info.stage == MESA_SHADER_VERTEX && key->multiview_mask) {
2493 tu_nir_lower_multiview(nir, key->multiview_mask, dev);
2494 }
2495
2496 if (nir->info.stage == MESA_SHADER_FRAGMENT && key->force_sample_interp) {
2497 nir_foreach_shader_in_variable(var, nir) {
2498 if (!var->data.centroid)
2499 var->data.sample = true;
2500 }
2501 }
2502
2503 NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_push_const,
2504 nir_address_format_32bit_offset);
2505
2506 NIR_PASS_V(nir, nir_lower_explicit_io,
2507 nir_var_mem_ubo | nir_var_mem_ssbo,
2508 nir_address_format_vec2_index_32bit_offset);
2509
2510 NIR_PASS_V(nir, nir_lower_explicit_io,
2511 nir_var_mem_global,
2512 nir_address_format_64bit_global);
2513
2514 if (nir->info.stage == MESA_SHADER_COMPUTE) {
2515 if (!nir->info.shared_memory_explicit_layout) {
2516 NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
2517 nir_var_mem_shared, shared_type_info);
2518 }
2519 NIR_PASS_V(nir, nir_lower_explicit_io,
2520 nir_var_mem_shared,
2521 nir_address_format_32bit_offset);
2522
2523 if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
2524 const unsigned chunk_size = 16; /* max single store size */
2525 /* Shared memory is allocated in 1024b chunks in HW, but the zero-init
2526        * extension only requires us to initialize the memory that is allocated
2527        * to the shader at the API level, and it's up to the user to ensure
2528 * that accesses are limited to those bounds.
2529 */
2530 const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
2531 NIR_PASS_V(nir, nir_zero_initialize_shared_memory, shared_size, chunk_size);
2532 }
2533
2534 const struct nir_lower_compute_system_values_options compute_sysval_options = {
2535 .has_base_workgroup_id = true,
2536 };
2537 NIR_PASS_V(nir, nir_lower_compute_system_values, &compute_sysval_options);
2538 }
2539
2540 nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
2541 nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);
2542
2543 /* Gather information for transform feedback. This should be called after:
2544 * - nir_split_per_member_structs.
2545 * - nir_remove_dead_variables with varyings, so that we could align
2546 * stream outputs correctly.
2547 * - nir_assign_io_var_locations - to have valid driver_location
2548 */
2549 struct ir3_stream_output_info so_info = {};
2550 if (nir->info.stage == MESA_SHADER_VERTEX ||
2551 nir->info.stage == MESA_SHADER_TESS_EVAL ||
2552 nir->info.stage == MESA_SHADER_GEOMETRY)
2553 tu_gather_xfb_info(nir, &so_info);
2554
2555 for (unsigned i = 0; i < layout->num_sets; i++) {
2556 if (layout->set[i].layout) {
2557 shader->dynamic_descriptor_sizes[i] =
2558 layout->set[i].layout->dynamic_offset_size;
2559 } else {
2560 shader->dynamic_descriptor_sizes[i] = -1;
2561 }
2562 }
2563
2564 {
2565 /* Lower 64b push constants before lowering IO. */
2566 nir_lower_mem_access_bit_sizes_options options = {
2567 .callback = ir3_mem_access_size_align,
2568 .modes = nir_var_mem_push_const,
2569 };
2570
2571 NIR_PASS_V(nir, nir_lower_mem_access_bit_sizes, &options);
2572 }
2573
2574 struct ir3_const_allocations const_allocs = {};
2575 NIR_PASS_V(nir, tu_lower_io, dev, shader, layout,
2576 key->read_only_input_attachments, key->dynamic_renderpass,
2577 &const_allocs);
2578
2579 nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
2580
2581 struct ir3_shader_nir_options nir_options;
2582 init_ir3_nir_options(&nir_options, key);
2583
2584 ir3_finalize_nir(dev->compiler, &nir_options, nir);
2585
2586 const struct ir3_shader_options options = {
2587 .api_wavesize = key->api_wavesize,
2588 .real_wavesize = key->real_wavesize,
2589 .push_consts_type = shader->const_state.push_consts.type,
2590 .push_consts_base = shader->const_state.push_consts.lo_dwords,
2591 .push_consts_dwords = shader->const_state.push_consts.dwords,
2592 .const_allocs = const_allocs,
2593 .nir_options = nir_options,
2594 };
2595
2596 struct ir3_shader *ir3_shader =
2597 ir3_shader_from_nir(dev->compiler, nir, &options, &so_info);
2598
2599 shader->variant =
2600 ir3_shader_create_variant(ir3_shader, ir3_key, executable_info);
2601
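   /* Note (assumption): when the variant's const file exceeds the "safe"
    * cross-stage limit, also build a variant compiled with safe_constlen so
    * the pipeline can fall back to it if the combined constlen of all stages
    * would otherwise exceed the HW limit.
    */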
2602 if (ir3_exceeds_safe_constlen(shader->variant)) {
2603 struct ir3_shader_key safe_constlen_key = *ir3_key;
2604 safe_constlen_key.safe_constlen = true;
2605 shader->safe_const_variant =
2606 ir3_shader_create_variant(ir3_shader, &safe_constlen_key,
2607 executable_info);
2608 }
2609
2610 ir3_shader_destroy(ir3_shader);
2611
2612 shader->view_mask = key->multiview_mask;
2613
2614 switch (shader->variant->type) {
2615 case MESA_SHADER_TESS_EVAL: {
2616 const struct ir3_shader_variant *tes = shader->variant;
2617 if (tes->tess.point_mode) {
2618 shader->tes.tess_output_lower_left =
2619 shader->tes.tess_output_upper_left = TESS_POINTS;
2620 } else if (tes->tess.primitive_mode == TESS_PRIMITIVE_ISOLINES) {
2621 shader->tes.tess_output_lower_left =
2622 shader->tes.tess_output_upper_left = TESS_LINES;
2623 } else if (tes->tess.ccw) {
2624        /* Tessellation orientation in HW is specified with a lower-left origin,
2625         * so we need to swap the orientations if the origin is upper-left.
2626         */
2627 shader->tes.tess_output_lower_left = TESS_CCW_TRIS;
2628 shader->tes.tess_output_upper_left = TESS_CW_TRIS;
2629 } else {
2630 shader->tes.tess_output_lower_left = TESS_CW_TRIS;
2631 shader->tes.tess_output_upper_left = TESS_CCW_TRIS;
2632 }
2633
2634 switch (tes->tess.spacing) {
2635 case TESS_SPACING_EQUAL:
2636 shader->tes.tess_spacing = TESS_EQUAL;
2637 break;
2638 case TESS_SPACING_FRACTIONAL_ODD:
2639 shader->tes.tess_spacing = TESS_FRACTIONAL_ODD;
2640 break;
2641 case TESS_SPACING_FRACTIONAL_EVEN:
2642 shader->tes.tess_spacing = TESS_FRACTIONAL_EVEN;
2643 break;
2644 case TESS_SPACING_UNSPECIFIED:
2645 default:
2646 unreachable("invalid tess spacing");
2647 }
2648
2649 break;
2650 }
2651 case MESA_SHADER_FRAGMENT: {
2652 const struct ir3_shader_variant *fs = shader->variant;
2653 shader->fs.per_samp = fs->per_samp || ir3_key->sample_shading;
2654 shader->fs.has_fdm = key->fragment_density_map;
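      /* Note (assumption): FORCE_DISABLE_WRITE keeps LRZ testing but stops LRZ
       * updates (e.g. for discard, where a killed fragment must not update the
       * LRZ buffer), while FORCE_DISABLE_LRZ turns LRZ off entirely.
       */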
2655 if (fs->has_kill)
2656 shader->fs.lrz.status |= TU_LRZ_FORCE_DISABLE_WRITE;
2657 if (fs->no_earlyz || (fs->writes_pos && !fs->fs.early_fragment_tests))
2658 shader->fs.lrz.status = TU_LRZ_FORCE_DISABLE_LRZ;
2659 /* FDM isn't compatible with LRZ, because the LRZ image uses the original
2660 * resolution and we would need to use the low resolution.
2661 *
2662 * TODO: Use a patchpoint to only disable LRZ for scaled bins.
2663 */
2664 if (key->fragment_density_map)
2665 shader->fs.lrz.status = TU_LRZ_FORCE_DISABLE_LRZ;
2666 if (!fs->fs.early_fragment_tests &&
2667 (fs->no_earlyz || fs->writes_pos || fs->writes_stencilref || fs->writes_smask)) {
2668 shader->fs.lrz.force_late_z = true;
2669 }
2670 break;
2671 }
2672 default:
2673 break;
2674 }
2675
2676 VkResult result = tu_upload_shader(dev, shader);
2677 if (result != VK_SUCCESS) {
2678 vk_free(&dev->vk.alloc, shader);
2679 return result;
2680 }
2681
2682 *shader_out = shader;
2683 return VK_SUCCESS;
2684 }
2685
2686 static void
2687 tu_link_shaders(nir_shader **shaders, unsigned shaders_count)
2688 {
2689 nir_shader *consumer = NULL;
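   /* Walk the stages from last to first so that each producer is linked
    * against a consumer that has already been processed.
    */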
2690 for (gl_shader_stage stage = (gl_shader_stage) (shaders_count - 1);
2691 stage >= MESA_SHADER_VERTEX; stage = (gl_shader_stage) (stage - 1)) {
2692 if (!shaders[stage])
2693 continue;
2694
2695 nir_shader *producer = shaders[stage];
2696 if (!consumer) {
2697 consumer = producer;
2698 continue;
2699 }
2700
2701 if (nir_link_opt_varyings(producer, consumer)) {
2702 NIR_PASS_V(consumer, nir_opt_constant_folding);
2703 NIR_PASS_V(consumer, nir_opt_algebraic);
2704 NIR_PASS_V(consumer, nir_opt_dce);
2705 }
2706
2707 const nir_remove_dead_variables_options out_var_opts = {
2708 .can_remove_var = nir_vk_is_not_xfb_output,
2709 };
2710 NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, &out_var_opts);
2711
2712 NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
2713
2714 bool progress = nir_remove_unused_varyings(producer, consumer);
2715
2716 nir_compact_varyings(producer, consumer, true);
2717 if (progress) {
2718 if (nir_lower_global_vars_to_local(producer)) {
2719 /* Remove dead writes, which can remove input loads */
2720 NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_temp, NULL);
2721 NIR_PASS_V(producer, nir_opt_dce);
2722 }
2723 nir_lower_global_vars_to_local(consumer);
2724 }
2725
2726 consumer = producer;
2727 }
2728
2729 /* Gather info after linking so that we can fill out the ir3 shader key.
2730 */
2731 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2732 stage <= MESA_SHADER_FRAGMENT; stage = (gl_shader_stage) (stage + 1)) {
2733 if (shaders[stage])
2734 nir_shader_gather_info(shaders[stage],
2735 nir_shader_get_entrypoint(shaders[stage]));
2736 }
2737 }
2738
2739 static uint32_t
2740 tu6_get_tessmode(const struct nir_shader *shader)
2741 {
2742 enum tess_primitive_mode primitive_mode = shader->info.tess._primitive_mode;
2743 switch (primitive_mode) {
2744 case TESS_PRIMITIVE_ISOLINES:
2745 return IR3_TESS_ISOLINES;
2746 case TESS_PRIMITIVE_TRIANGLES:
2747 return IR3_TESS_TRIANGLES;
2748 case TESS_PRIMITIVE_QUADS:
2749 return IR3_TESS_QUADS;
2750 case TESS_PRIMITIVE_UNSPECIFIED:
2751 return IR3_TESS_NONE;
2752 default:
2753 unreachable("bad tessmode");
2754 }
2755 }
2756
2757 VkResult
2758 tu_compile_shaders(struct tu_device *device,
2759 VkPipelineCreateFlags2KHR pipeline_flags,
2760 const VkPipelineShaderStageCreateInfo **stage_infos,
2761 nir_shader **nir,
2762 const struct tu_shader_key *keys,
2763 struct tu_pipeline_layout *layout,
2764 const unsigned char *pipeline_sha1,
2765 struct tu_shader **shaders,
2766 char **nir_initial_disasm,
2767 void *nir_initial_disasm_mem_ctx,
2768 nir_shader **nir_out,
2769 VkPipelineCreationFeedback *stage_feedbacks)
2770 {
2771 struct ir3_shader_key ir3_key = {};
2772 VkResult result = VK_SUCCESS;
2773 void *mem_ctx = ralloc_context(NULL);
2774
2775 for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2776 stage = (gl_shader_stage) (stage + 1)) {
2777 const VkPipelineShaderStageCreateInfo *stage_info = stage_infos[stage];
2778 if (!stage_info)
2779 continue;
2780
2781 int64_t stage_start = os_time_get_nano();
2782
2783 nir[stage] = tu_spirv_to_nir(device, mem_ctx, pipeline_flags,
2784 stage_info, &keys[stage], stage);
2785 if (!nir[stage]) {
2786 result = VK_ERROR_OUT_OF_HOST_MEMORY;
2787 goto fail;
2788 }
2789
2790 stage_feedbacks[stage].flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
2791 stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
2792 }
2793
2794 if (nir[MESA_SHADER_GEOMETRY])
2795 ir3_key.has_gs = true;
2796
2797 ir3_key.sample_shading = keys[MESA_SHADER_FRAGMENT].force_sample_interp;
2798
2799 if (nir_initial_disasm) {
2800 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2801 stage < MESA_SHADER_STAGES;
2802 stage = (gl_shader_stage) (stage + 1)) {
2803 if (!nir[stage])
2804 continue;
2805
2806 nir_initial_disasm[stage] =
2807 nir_shader_as_str(nir[stage], nir_initial_disasm_mem_ctx);
2808 }
2809 }
2810
2811 tu_link_shaders(nir, MESA_SHADER_STAGES);
2812
2813 if (nir_out) {
2814 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2815 stage < MESA_SHADER_STAGES; stage = (gl_shader_stage) (stage + 1)) {
2816 if (!nir[stage])
2817 continue;
2818
2819 nir_out[stage] = nir_shader_clone(NULL, nir[stage]);
2820 }
2821 }
2822
2823 /* With pipelines, tessellation modes can be set on either shader, for
2824 * compatibility with HLSL and GLSL, and the driver is supposed to merge
2825     * them. Shader objects require modes to be set on at least the TES, except
2826     * for OutputVertices which has to be set at least on the TCS. Make sure
2827     * all modes are set on the TES when compiling multiple shaders together,
2828 * and then from this point on we will use the modes in the TES (and output
2829 * vertices on the TCS).
2830 */
2831 if (nir[MESA_SHADER_TESS_EVAL]) {
2832 nir_shader *tcs = nir[MESA_SHADER_TESS_CTRL];
2833 nir_shader *tes = nir[MESA_SHADER_TESS_EVAL];
2834
2835 if (tes->info.tess._primitive_mode == TESS_PRIMITIVE_UNSPECIFIED)
2836 tes->info.tess._primitive_mode = tcs->info.tess._primitive_mode;
2837
2838 tes->info.tess.point_mode |= tcs->info.tess.point_mode;
2839 tes->info.tess.ccw |= tcs->info.tess.ccw;
2840
2841 if (tes->info.tess.spacing == TESS_SPACING_UNSPECIFIED) {
2842 tes->info.tess.spacing = tcs->info.tess.spacing;
2843 }
2844
2845 if (tcs->info.tess.tcs_vertices_out == 0)
2846 tcs->info.tess.tcs_vertices_out = tes->info.tess.tcs_vertices_out;
2847
2848 ir3_key.tessellation = tu6_get_tessmode(tes);
2849 }
2850
2851 for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2852 stage = (gl_shader_stage) (stage + 1)) {
2853 if (!nir[stage])
2854 continue;
2855
2856 if (stage > MESA_SHADER_TESS_CTRL) {
2857 if (stage == MESA_SHADER_FRAGMENT) {
2858 ir3_key.tcs_store_primid = ir3_key.tcs_store_primid ||
2859 (nir[stage]->info.inputs_read & (1ull << VARYING_SLOT_PRIMITIVE_ID));
2860 } else {
2861 ir3_key.tcs_store_primid = ir3_key.tcs_store_primid ||
2862 BITSET_TEST(nir[stage]->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
2863 }
2864 }
2865 }
2866
2867    /* In the tess-but-not-FS case we don't know whether the FS will read
2868 * PrimID so we need to unconditionally store it.
2869 */
2870 if (nir[MESA_SHADER_TESS_CTRL] && !nir[MESA_SHADER_FRAGMENT])
2871 ir3_key.tcs_store_primid = true;
2872
2873 for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2874 stage = (gl_shader_stage) (stage + 1)) {
2875 if (!nir[stage] || shaders[stage])
2876 continue;
2877
2878 int64_t stage_start = os_time_get_nano();
2879
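      /* The per-stage cache key is the 20-byte pipeline SHA1 with the stage
       * index appended as a final byte.
       */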
2880 unsigned char shader_sha1[21];
2881 memcpy(shader_sha1, pipeline_sha1, 20);
2882 shader_sha1[20] = (unsigned char) stage;
2883
2884 result = tu_shader_create(device,
2885 &shaders[stage], nir[stage], &keys[stage],
2886 &ir3_key, shader_sha1, sizeof(shader_sha1),
2887 layout, !!nir_initial_disasm);
2888 if (result != VK_SUCCESS) {
2889 goto fail;
2890 }
2891
2892 stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
2893 }
2894
2895 ralloc_free(mem_ctx);
2896
2897 return VK_SUCCESS;
2898
2899 fail:
2900 ralloc_free(mem_ctx);
2901
2902 for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES;
2903 stage = (gl_shader_stage) (stage + 1)) {
2904 if (shaders[stage]) {
2905 tu_shader_destroy(device, shaders[stage]);
2906 }
2907 if (nir_out && nir_out[stage]) {
2908 ralloc_free(nir_out[stage]);
2909 }
2910 }
2911
2912 return result;
2913 }
2914
2915 void
2916 tu_shader_key_subgroup_size(struct tu_shader_key *key,
2917 bool allow_varying_subgroup_size,
2918 bool require_full_subgroups,
2919 const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info,
2920 struct tu_device *dev)
2921 {
2922 enum ir3_wavesize_option api_wavesize, real_wavesize;
2923 if (!dev->physical_device->info->a6xx.supports_double_threadsize) {
2924 api_wavesize = IR3_SINGLE_ONLY;
2925 real_wavesize = IR3_SINGLE_ONLY;
2926 } else {
2927 if (allow_varying_subgroup_size) {
2928 api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE;
2929 } else {
2930 if (subgroup_info) {
2931 if (subgroup_info->requiredSubgroupSize == dev->compiler->threadsize_base) {
2932 api_wavesize = IR3_SINGLE_ONLY;
2933 } else {
2934 assert(subgroup_info->requiredSubgroupSize == dev->compiler->threadsize_base * 2);
2935 api_wavesize = IR3_DOUBLE_ONLY;
2936 }
2937 } else {
2938 /* Match the exposed subgroupSize. */
2939 api_wavesize = IR3_DOUBLE_ONLY;
2940 }
2941
2942 if (require_full_subgroups)
2943 real_wavesize = api_wavesize;
2944 else if (api_wavesize == IR3_SINGLE_ONLY)
2945 real_wavesize = IR3_SINGLE_ONLY;
2946 else
2947 real_wavesize = IR3_SINGLE_OR_DOUBLE;
2948 }
2949 }
2950
2951 key->api_wavesize = api_wavesize;
2952 key->real_wavesize = real_wavesize;
2953 }
2954
2955 void
2956 tu_shader_key_robustness(struct tu_shader_key *key,
2957 const struct vk_pipeline_robustness_state *rs)
2958 {
2959 key->robust_storage_access2 =
2960 (rs->storage_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT);
2961 key->robust_uniform_access2 =
2962 (rs->uniform_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT);
2963 }
2964
2965 static VkResult
2966 tu_empty_shader_create(struct tu_device *dev,
2967 struct tu_shader **shader_out,
2968 gl_shader_stage stage)
2969 {
2970 struct tu_shader *shader = tu_shader_init(dev, NULL, 0);
2971
2972 if (!shader)
2973 return VK_ERROR_OUT_OF_HOST_MEMORY;
2974
2975 pthread_mutex_lock(&dev->pipeline_mutex);
2976 VkResult result = tu_suballoc_bo_alloc(&shader->bo, &dev->pipeline_suballoc,
2977 32 * 4, 128);
2978 pthread_mutex_unlock(&dev->pipeline_mutex);
2979
2980 if (result != VK_SUCCESS) {
2981 vk_free(&dev->vk.alloc, shader);
2982 return result;
2983 }
2984
2985 TU_RMV(cmd_buffer_suballoc_bo_create, dev, &shader->bo);
2986 tu_cs_init_suballoc(&shader->cs, dev, &shader->bo);
2987
2988 struct tu_pvtmem_config pvtmem_config = { };
2989
2990 struct tu_cs sub_cs;
2991 tu_cs_begin_sub_stream(&shader->cs, 32, &sub_cs);
2992 TU_CALLX(dev, tu6_emit_variant)(&sub_cs, stage, NULL, &pvtmem_config, 0, 0);
2993 shader->state = tu_cs_end_draw_state(&shader->cs, &sub_cs);
2994
2995 *shader_out = shader;
2996 return VK_SUCCESS;
2997 }
2998
2999 static VkResult
3000 tu_empty_fs_create(struct tu_device *dev, struct tu_shader **shader,
3001 bool fragment_density_map)
3002 {
3003 struct ir3_shader_key key = {};
3004 const struct ir3_shader_options options = {};
3005 struct ir3_stream_output_info so_info = {};
3006 const nir_shader_compiler_options *nir_options =
3007 ir3_get_compiler_options(dev->compiler);
3008 nir_builder fs_b;
3009
3010 fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, nir_options,
3011 "noop_fs");
3012
3013 *shader = tu_shader_init(dev, NULL, 0);
3014 if (!*shader)
3015 return VK_ERROR_OUT_OF_HOST_MEMORY;
3016
3017 (*shader)->fs.has_fdm = fragment_density_map;
3018 if (fragment_density_map)
3019 (*shader)->fs.lrz.status = TU_LRZ_FORCE_DISABLE_LRZ;
3020
3021 for (unsigned i = 0; i < MAX_SETS; i++)
3022 (*shader)->dynamic_descriptor_sizes[i] = -1;
3023
3024 struct ir3_shader *ir3_shader =
3025 ir3_shader_from_nir(dev->compiler, fs_b.shader, &options, &so_info);
3026 (*shader)->variant = ir3_shader_create_variant(ir3_shader, &key, false);
3027 ir3_shader_destroy(ir3_shader);
3028
3029 return tu_upload_shader(dev, *shader);
3030 }
3031
3032 VkResult
3033 tu_init_empty_shaders(struct tu_device *dev)
3034 {
3035 VkResult result;
3036
3037 result = tu_empty_shader_create(dev, &dev->empty_tcs, MESA_SHADER_TESS_CTRL);
3038 if (result != VK_SUCCESS)
3039 goto out;
3040
3041 result = tu_empty_shader_create(dev, &dev->empty_tes, MESA_SHADER_TESS_EVAL);
3042 if (result != VK_SUCCESS)
3043 goto out;
3044
3045 result = tu_empty_shader_create(dev, &dev->empty_gs, MESA_SHADER_GEOMETRY);
3046 if (result != VK_SUCCESS)
3047 goto out;
3048
3049 result = tu_empty_fs_create(dev, &dev->empty_fs, false);
3050 if (result != VK_SUCCESS)
3051 goto out;
3052
3053 result = tu_empty_fs_create(dev, &dev->empty_fs_fdm, true);
3054 if (result != VK_SUCCESS)
3055 goto out;
3056
3057 return VK_SUCCESS;
3058
3059 out:
3060 if (dev->empty_tcs)
3061 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tcs->base);
3062 if (dev->empty_tes)
3063 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tes->base);
3064 if (dev->empty_gs)
3065 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_gs->base);
3066 if (dev->empty_fs)
3067 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs->base);
3068 if (dev->empty_fs_fdm)
3069 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs_fdm->base);
3070 return result;
3071 }
3072
3073 void
3074 tu_destroy_empty_shaders(struct tu_device *dev)
3075 {
3076 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tcs->base);
3077 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_tes->base);
3078 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_gs->base);
3079 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs->base);
3080 vk_pipeline_cache_object_unref(&dev->vk, &dev->empty_fs_fdm->base);
3081 }
3082
3083 void
3084 tu_shader_destroy(struct tu_device *dev,
3085 struct tu_shader *shader)
3086 {
3087 tu_cs_finish(&shader->cs);
3088 TU_RMV(resource_destroy, dev, &shader->bo);
3089
3090 pthread_mutex_lock(&dev->pipeline_mutex);
3091 tu_suballoc_bo_free(&dev->pipeline_suballoc, &shader->bo);
3092 pthread_mutex_unlock(&dev->pipeline_mutex);
3093
3094 if (shader->pvtmem_bo)
3095 tu_bo_finish(dev, shader->pvtmem_bo);
3096
3097 if (shader->variant)
3098 ralloc_free((void *)shader->variant);
3099 if (shader->safe_const_variant)
3100 ralloc_free((void *)shader->safe_const_variant);
3101
3102 vk_free(&dev->vk.alloc, shader);
3103 }
3104