/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "pipe/p_defines.h"
#include "vulkan/vulkan_core.h"
#include "agx_nir_texture.h"
#include "hk_cmd_buffer.h"
#include "hk_descriptor_set.h"
#include "hk_descriptor_set_layout.h"
#include "hk_shader.h"

#include "nir.h"
#include "nir_builder.h"
#include "nir_builder_opcodes.h"
#include "nir_intrinsics.h"
#include "nir_intrinsics_indices.h"
#include "shader_enums.h"
#include "vk_pipeline.h"

struct lower_descriptors_ctx {
   const struct hk_descriptor_set_layout *set_layouts[HK_MAX_SETS];

   bool clamp_desc_array_bounds;
   nir_address_format ubo_addr_format;
   nir_address_format ssbo_addr_format;
};

static const struct hk_descriptor_set_binding_layout *
get_binding_layout(uint32_t set, uint32_t binding,
                   const struct lower_descriptors_ctx *ctx)
{
   assert(set < HK_MAX_SETS);
   assert(ctx->set_layouts[set] != NULL);

   const struct hk_descriptor_set_layout *set_layout = ctx->set_layouts[set];

   assert(binding < set_layout->binding_count);
   return &set_layout->binding[binding];
}

static nir_def *
load_speculatable(nir_builder *b, unsigned num_components, unsigned bit_size,
                  nir_def *addr, unsigned align)
{
   return nir_build_load_global_constant(b, num_components, bit_size, addr,
                                         .align_mul = align,
                                         .access = ACCESS_CAN_SPECULATE);
}

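/*
 * Load from the root descriptor table. The table's base address is preloaded
 * as a 64-bit uniform (HK_ROOT_UNIFORM); members are addressed by fixed byte
 * offsets computed with hk_root_descriptor_offset().
 */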
static nir_def *
load_root(nir_builder *b, unsigned num_components, unsigned bit_size,
          nir_def *offset, unsigned align)
{
   nir_def *root = nir_load_preamble(b, 1, 64, .base = HK_ROOT_UNIFORM);

   /* We've bound the address of the root descriptor; index into it. */
   nir_def *addr = nir_iadd(b, root, nir_u2u64(b, offset));

   return load_speculatable(b, num_components, bit_size, addr, align);
}

static bool
lower_load_constant(nir_builder *b, nir_intrinsic_instr *load,
                    const struct lower_descriptors_ctx *ctx)
{
   assert(load->intrinsic == nir_intrinsic_load_constant);
   unreachable("todo: stick an address in the root descriptor or something");

   uint32_t base = nir_intrinsic_base(load);
   uint32_t range = nir_intrinsic_range(load);

   b->cursor = nir_before_instr(&load->instr);

   nir_def *offset = nir_iadd_imm(b, load->src[0].ssa, base);
   nir_def *data = nir_load_ubo(
      b, load->def.num_components, load->def.bit_size, nir_imm_int(b, 0),
      offset, .align_mul = nir_intrinsic_align_mul(load),
      .align_offset = nir_intrinsic_align_offset(load), .range_base = base,
      .range = range);

   nir_def_rewrite_uses(&load->def, data);

   return true;
}

static nir_def *
load_descriptor_set_addr(nir_builder *b, uint32_t set,
                         UNUSED const struct lower_descriptors_ctx *ctx)
{
   uint32_t set_addr_offset =
      hk_root_descriptor_offset(sets) + set * sizeof(uint64_t);

   return load_root(b, 1, 64, nir_imm_int(b, set_addr_offset), 8);
}

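/*
 * Index of the first dynamic buffer belonging to a set within the root
 * table's dynamic_buffers array. If every lower-numbered set layout is known
 * at compile time, this folds to a constant; otherwise it is read from the
 * set_dynamic_buffer_start table in the root descriptor.
 */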
static nir_def *
load_dynamic_buffer_start(nir_builder *b, uint32_t set,
                          const struct lower_descriptors_ctx *ctx)
{
   int dynamic_buffer_start_imm = 0;
   for (uint32_t s = 0; s < set; s++) {
      if (ctx->set_layouts[s] == NULL) {
         dynamic_buffer_start_imm = -1;
         break;
      }

      dynamic_buffer_start_imm += ctx->set_layouts[s]->dynamic_buffer_count;
   }

   if (dynamic_buffer_start_imm >= 0) {
      return nir_imm_int(b, dynamic_buffer_start_imm);
   } else {
      uint32_t root_offset =
         hk_root_descriptor_offset(set_dynamic_buffer_start) + set;

      return nir_u2u32(b, load_root(b, 1, 8, nir_imm_int(b, root_offset), 1));
   }
}

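/*
 * Load the descriptor for (set, binding, index). Dynamic buffers live in the
 * root table, inline uniform blocks are synthesized as a bounded global
 * address into the set itself, and everything else is loaded from the
 * descriptor set at binding offset + index * stride.
 */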
static nir_def *
load_descriptor(nir_builder *b, unsigned num_components, unsigned bit_size,
                uint32_t set, uint32_t binding, nir_def *index,
                unsigned offset_B, const struct lower_descriptors_ctx *ctx)
{
   const struct hk_descriptor_set_binding_layout *binding_layout =
      get_binding_layout(set, binding, ctx);

   if (ctx->clamp_desc_array_bounds)
      index =
         nir_umin(b, index, nir_imm_int(b, binding_layout->array_size - 1));

   switch (binding_layout->type) {
   case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
   case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
      /* Get the index in the root descriptor table dynamic_buffers array. */
      nir_def *dynamic_buffer_start = load_dynamic_buffer_start(b, set, ctx);

      index = nir_iadd(b, index,
                       nir_iadd_imm(b, dynamic_buffer_start,
                                    binding_layout->dynamic_buffer_index));

      nir_def *root_desc_offset = nir_iadd_imm(
         b, nir_imul_imm(b, index, sizeof(struct hk_buffer_address)),
         hk_root_descriptor_offset(dynamic_buffers));

      assert(num_components == 4 && bit_size == 32);
      nir_def *desc = load_root(b, 4, 32, root_desc_offset, 16);

      /* We know a priori that the .w component (offset) is zero */
      return nir_vector_insert_imm(b, desc, nir_imm_int(b, 0), 3);
   }

   case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: {
      nir_def *base_addr = nir_iadd_imm(
         b, load_descriptor_set_addr(b, set, ctx), binding_layout->offset);

      assert(binding_layout->stride == 1);
      const uint32_t binding_size = binding_layout->array_size;

      /* Convert it to nir_address_format_64bit_bounded_global */
      assert(num_components == 4 && bit_size == 32);
      return nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_addr),
                      nir_unpack_64_2x32_split_y(b, base_addr),
                      nir_imm_int(b, binding_size), nir_imm_int(b, 0));
   }

   default: {
      assert(binding_layout->stride > 0);
      nir_def *desc_ubo_offset =
         nir_iadd_imm(b, nir_imul_imm(b, index, binding_layout->stride),
                      binding_layout->offset + offset_B);

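      /* Align to the largest power of two dividing the binding stride,
       * capped at 16 B since the widest descriptor load emitted here is a
       * 4 x 32-bit vector.
       */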
      unsigned desc_align_mul = (1 << (ffs(binding_layout->stride) - 1));
      desc_align_mul = MIN2(desc_align_mul, 16);
      unsigned desc_align_offset = binding_layout->offset + offset_B;
      desc_align_offset %= desc_align_mul;

      nir_def *desc;
      nir_def *set_addr = load_descriptor_set_addr(b, set, ctx);
      desc = nir_load_global_constant_offset(
         b, num_components, bit_size, set_addr, desc_ubo_offset,
         .align_mul = desc_align_mul, .align_offset = desc_align_offset,
         .access = ACCESS_CAN_SPECULATE);

      if (binding_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
          binding_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER) {
         /* We know a priori that the .w component (offset) is zero */
         assert(num_components == 4 && bit_size == 32);
         desc = nir_vector_insert_imm(b, desc, nir_imm_int(b, 0), 3);
      }
      return desc;
   }
   }
}

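/*
 * Check whether a chain of vulkan_resource_reindex intrinsics bottoms out at
 * vulkan_resource_index, i.e. the descriptor can be resolved here instead of
 * by the later SSBO-descriptor pass.
 */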
static bool
is_idx_intrin(nir_intrinsic_instr *intrin)
{
   while (intrin->intrinsic == nir_intrinsic_vulkan_resource_reindex) {
      intrin = nir_src_as_intrinsic(intrin->src[0]);
      if (intrin == NULL)
         return false;
   }

   return intrin->intrinsic == nir_intrinsic_vulkan_resource_index;
}

static nir_def *
load_descriptor_for_idx_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
                               const struct lower_descriptors_ctx *ctx)
{
   nir_def *index = nir_imm_int(b, 0);

   while (intrin->intrinsic == nir_intrinsic_vulkan_resource_reindex) {
      index = nir_iadd(b, index, intrin->src[1].ssa);
      intrin = nir_src_as_intrinsic(intrin->src[0]);
   }

   assert(intrin->intrinsic == nir_intrinsic_vulkan_resource_index);
   uint32_t set = nir_intrinsic_desc_set(intrin);
   uint32_t binding = nir_intrinsic_binding(intrin);
   index = nir_iadd(b, index, intrin->src[0].ssa);

   return load_descriptor(b, 4, 32, set, binding, index, 0, ctx);
}

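/*
 * Fast path: if the descriptor source is a plain index/reindex chain, load
 * the vec4 descriptor immediately. Otherwise (e.g. the chain passes through
 * a phi), leave the intrinsic for the later SSBO-descriptor pass.
 */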
static bool
try_lower_load_vulkan_descriptor(nir_builder *b, nir_intrinsic_instr *intrin,
                                 const struct lower_descriptors_ctx *ctx)
{
   ASSERTED const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin);
   b->cursor = nir_before_instr(&intrin->instr);

   nir_intrinsic_instr *idx_intrin = nir_src_as_intrinsic(intrin->src[0]);
   if (idx_intrin == NULL || !is_idx_intrin(idx_intrin)) {
      assert(desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER ||
             desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC);
      return false;
   }

   nir_def *desc = load_descriptor_for_idx_intrin(b, idx_intrin, ctx);

   nir_def_rewrite_uses(&intrin->def, desc);

   return true;
}

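/*
 * Replace a system-value intrinsic with a load from a fixed member of the
 * root descriptor table. The lower_sysval_to_root_table() wrapper turns a
 * member name into its byte offset.
 */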
static bool
_lower_sysval_to_root_table(nir_builder *b, nir_intrinsic_instr *intrin,
                            uint32_t root_table_offset)
{
   b->cursor = nir_instr_remove(&intrin->instr);
   assert((root_table_offset & 3) == 0 && "aligned");

   nir_def *val = load_root(b, intrin->def.num_components, intrin->def.bit_size,
                            nir_imm_int(b, root_table_offset), 4);

   nir_def_rewrite_uses(&intrin->def, val);

   return true;
}

#define lower_sysval_to_root_table(b, intrin, member)                          \
   _lower_sysval_to_root_table(b, intrin, hk_root_descriptor_offset(member))

static bool
lower_load_push_constant(nir_builder *b, nir_intrinsic_instr *load,
                         const struct lower_descriptors_ctx *ctx)
{
   const uint32_t push_region_offset = hk_root_descriptor_offset(push);
   const uint32_t base = nir_intrinsic_base(load);

   b->cursor = nir_before_instr(&load->instr);

   nir_def *offset =
      nir_iadd_imm(b, load->src[0].ssa, push_region_offset + base);

   nir_def *val = load_root(b, load->def.num_components, load->def.bit_size,
                            offset, load->def.bit_size / 8);

   nir_def_rewrite_uses(&load->def, val);

   return true;
}

static void
get_resource_deref_binding(nir_builder *b, nir_deref_instr *deref,
                           uint32_t *set, uint32_t *binding, nir_def **index)
{
   if (deref->deref_type == nir_deref_type_array) {
      *index = deref->arr.index.ssa;
      deref = nir_deref_instr_parent(deref);
   } else {
      *index = nir_imm_int(b, 0);
   }

   assert(deref->deref_type == nir_deref_type_var);
   nir_variable *var = deref->var;

   *set = var->data.descriptor_set;
   *binding = var->data.binding;
}

static nir_def *
load_resource_deref_desc(nir_builder *b, unsigned num_components,
                         unsigned bit_size, nir_deref_instr *deref,
                         unsigned offset_B,
                         const struct lower_descriptors_ctx *ctx)
{
   uint32_t set, binding;
   nir_def *index;
   get_resource_deref_binding(b, deref, &set, &binding, &index);
   return load_descriptor(b, num_components, bit_size, set, binding, index,
                          offset_B, ctx);
}

/*
 * Returns an AGX bindless handle to access an indexed image within the global
 * image heap.
 */
static nir_def *
image_heap_handle(nir_builder *b, nir_def *offset)
{
   return nir_vec2(b, nir_imm_int(b, HK_IMAGE_HEAP_UNIFORM), offset);
}

static bool
lower_image_intrin(nir_builder *b, nir_intrinsic_instr *intr,
                   const struct lower_descriptors_ctx *ctx)
{
   b->cursor = nir_before_instr(&intr->instr);
   nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);

   /* Reads and queries use the texture descriptor; writes and atomics use
    * the PBE descriptor.
    */
   unsigned offs;
   if (intr->intrinsic != nir_intrinsic_image_deref_load &&
       intr->intrinsic != nir_intrinsic_image_deref_size &&
       intr->intrinsic != nir_intrinsic_image_deref_samples) {

      offs = offsetof(struct hk_storage_image_descriptor, pbe_offset);
   } else {
      offs = offsetof(struct hk_storage_image_descriptor, tex_offset);
   }

   nir_def *offset = load_resource_deref_desc(b, 1, 32, deref, offs, ctx);
   nir_rewrite_image_intrinsic(intr, image_heap_handle(b, offset), true);

   return true;
}

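/*
 * Map a Gallium pipeline-statistics query index to the equivalent Vulkan
 * flag bit. The bit is used below to locate the query's slot in the
 * compacted statistics buffer.
 */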
static VkQueryPipelineStatisticFlagBits
translate_pipeline_stat_bit(enum pipe_statistics_query_index pipe)
{
   switch (pipe) {
   case PIPE_STAT_QUERY_IA_VERTICES:
      return VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT;
   case PIPE_STAT_QUERY_IA_PRIMITIVES:
      return VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT;
   case PIPE_STAT_QUERY_VS_INVOCATIONS:
      return VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT;
   case PIPE_STAT_QUERY_GS_INVOCATIONS:
      return VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT;
   case PIPE_STAT_QUERY_GS_PRIMITIVES:
      return VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT;
   case PIPE_STAT_QUERY_C_INVOCATIONS:
      return VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT;
   case PIPE_STAT_QUERY_C_PRIMITIVES:
      return VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT;
   case PIPE_STAT_QUERY_PS_INVOCATIONS:
      return VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT;
   case PIPE_STAT_QUERY_HS_INVOCATIONS:
      return VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT;
   case PIPE_STAT_QUERY_DS_INVOCATIONS:
      return VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT;
   case PIPE_STAT_QUERY_CS_INVOCATIONS:
      return VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
   case PIPE_STAT_QUERY_TS_INVOCATIONS:
      return VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT;
   case PIPE_STAT_QUERY_MS_INVOCATIONS:
      return VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT;
   }

   unreachable("invalid statistic");
}

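/*
 * Lower draw-time system values: UVS output indices, draw parameters
 * preloaded at *vs_uniform_base, and pipeline-statistics query addresses.
 * Everything comes from either the root descriptor table or preamble
 * uniforms.
 */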
static bool
lower_uvs_index(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
{
   unsigned *vs_uniform_base = data;

   switch (intrin->intrinsic) {
   case nir_intrinsic_load_uvs_index_agx: {
      gl_varying_slot slot = nir_intrinsic_io_semantics(intrin).location;
      unsigned offset = hk_root_descriptor_offset(draw.uvs_index[slot]);
      b->cursor = nir_instr_remove(&intrin->instr);

      nir_def *val = load_root(b, 1, 8, nir_imm_int(b, offset), 1);
      nir_def_rewrite_uses(&intrin->def, nir_u2u16(b, val));
      return true;
   }

   case nir_intrinsic_load_shader_part_tests_zs_agx:
      return lower_sysval_to_root_table(b, intrin, draw.no_epilog_discard);

   case nir_intrinsic_load_api_sample_mask_agx:
      return lower_sysval_to_root_table(b, intrin, draw.api_sample_mask);

   case nir_intrinsic_load_sample_positions_agx:
      return lower_sysval_to_root_table(b, intrin, draw.ppp_multisamplectl);

   case nir_intrinsic_load_depth_never_agx:
      return lower_sysval_to_root_table(b, intrin, draw.force_never_in_shader);

   case nir_intrinsic_load_geometry_param_buffer_agx:
      return lower_sysval_to_root_table(b, intrin, draw.geometry_params);

   case nir_intrinsic_load_vs_output_buffer_agx:
      return lower_sysval_to_root_table(b, intrin, draw.vertex_output_buffer);

   case nir_intrinsic_load_vs_outputs_agx:
      return lower_sysval_to_root_table(b, intrin, draw.vertex_outputs);

   case nir_intrinsic_load_tess_param_buffer_agx:
      return lower_sysval_to_root_table(b, intrin, draw.tess_params);

   case nir_intrinsic_load_is_first_fan_agx: {
      unsigned offset = hk_root_descriptor_offset(draw.provoking);
      b->cursor = nir_instr_remove(&intrin->instr);
      nir_def *val = load_root(b, 1, 16, nir_imm_int(b, offset), 2);
      nir_def_rewrite_uses(&intrin->def, nir_ieq_imm(b, val, 1));
      return true;
   }

   case nir_intrinsic_load_provoking_last: {
      unsigned offset = hk_root_descriptor_offset(draw.provoking);
      b->cursor = nir_instr_remove(&intrin->instr);
      nir_def *val = load_root(b, 1, 16, nir_imm_int(b, offset), 2);
      nir_def_rewrite_uses(&intrin->def, nir_b2b32(b, nir_ieq_imm(b, val, 2)));
      return true;
   }

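   /* Draw parameters live in preamble uniforms starting at *vs_uniform_base:
    * base_vertex/first_vertex at the base itself, base_instance at base + 2,
    * the 16-bit draw ID at base + 4, and the 64-bit input assembly buffer
    * address at base + 8.
    */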
   case nir_intrinsic_load_base_vertex:
   case nir_intrinsic_load_first_vertex:
   case nir_intrinsic_load_base_instance:
   case nir_intrinsic_load_draw_id:
   case nir_intrinsic_load_input_assembly_buffer_agx: {
      b->cursor = nir_instr_remove(&intrin->instr);

      unsigned base = *vs_uniform_base;
      unsigned size = 32;

      if (intrin->intrinsic == nir_intrinsic_load_base_instance) {
         base += 2;
      } else if (intrin->intrinsic == nir_intrinsic_load_draw_id) {
         base += 4;
         size = 16;
      } else if (intrin->intrinsic ==
                 nir_intrinsic_load_input_assembly_buffer_agx) {
         base += 8;
         size = 64;
      }

      nir_def *val = nir_load_preamble(b, 1, size, .base = base);
      nir_def_rewrite_uses(&intrin->def,
                           nir_u2uN(b, val, intrin->def.bit_size));
      return true;
   }

   case nir_intrinsic_load_stat_query_address_agx: {
      b->cursor = nir_instr_remove(&intrin->instr);

      unsigned off1 = hk_root_descriptor_offset(draw.pipeline_stats);
      unsigned off2 = hk_root_descriptor_offset(draw.pipeline_stats_flags);

      nir_def *base = load_root(b, 1, 64, nir_imm_int(b, off1), 8);
      nir_def *flags = load_root(b, 1, 16, nir_imm_int(b, off2), 2);

      unsigned query = nir_intrinsic_base(intrin);
      VkQueryPipelineStatisticFlagBits bit = translate_pipeline_stat_bit(query);

      /* Prefix sum to find the compacted offset */
      nir_def *idx = nir_bit_count(b, nir_iand_imm(b, flags, bit - 1));
      nir_def *addr = nir_iadd(
         b, base, nir_imul_imm(b, nir_u2u64(b, idx), sizeof(uint64_t)));

      /* The above returns garbage if the query isn't actually enabled, so
       * handle that case.
       *
       * TODO: Optimize the case where we *know* the query is present?
       */
      nir_def *present = nir_ine_imm(b, nir_iand_imm(b, flags, bit), 0);

      /* Sometimes we insert a GS internally; it should not contribute to GS
       * statistics. This is not strictly required by Vulkan, but vkd3d-proton
       * tests it and we should avoid the surprising behaviour.
       */
      if (query == PIPE_STAT_QUERY_GS_INVOCATIONS ||
          query == PIPE_STAT_QUERY_GS_PRIMITIVES) {

         unsigned api_gs_offset = hk_root_descriptor_offset(draw.api_gs);
         nir_def *api_gs =
            load_root(b, 1, 16, nir_imm_int(b, api_gs_offset), 4);

         present = nir_iand(b, present, nir_ine_imm(b, api_gs, 0));
      }

      addr = nir_bcsel(b, present, addr, nir_imm_int64(b, 0));

      nir_def_rewrite_uses(&intrin->def, addr);
      return true;
   }

   default:
      return false;
   }
}

bool
hk_lower_uvs_index(nir_shader *s, unsigned vs_uniform_base)
{
   return nir_shader_intrinsics_pass(
      s, lower_uvs_index, nir_metadata_control_flow, &vs_uniform_base);
}

static bool
try_lower_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
                 const struct lower_descriptors_ctx *ctx)
{
   switch (intrin->intrinsic) {
   case nir_intrinsic_load_constant:
      return lower_load_constant(b, intrin, ctx);

   case nir_intrinsic_load_vulkan_descriptor:
      return try_lower_load_vulkan_descriptor(b, intrin, ctx);

   case nir_intrinsic_load_workgroup_size:
      unreachable("Should have been lowered by nir_lower_cs_intrinsics()");

   case nir_intrinsic_load_base_workgroup_id:
      return lower_sysval_to_root_table(b, intrin, cs.base_group);

   case nir_intrinsic_load_push_constant:
      return lower_load_push_constant(b, intrin, ctx);

   case nir_intrinsic_load_view_index:
      return lower_sysval_to_root_table(b, intrin, draw.view_index);

   case nir_intrinsic_image_deref_load:
   case nir_intrinsic_image_deref_sparse_load:
   case nir_intrinsic_image_deref_store:
   case nir_intrinsic_image_deref_atomic:
   case nir_intrinsic_image_deref_atomic_swap:
   case nir_intrinsic_image_deref_size:
   case nir_intrinsic_image_deref_samples:
   case nir_intrinsic_image_deref_store_block_agx:
      return lower_image_intrin(b, intrin, ctx);

   case nir_intrinsic_load_num_workgroups: {
      b->cursor = nir_instr_remove(&intrin->instr);

      unsigned offset = hk_root_descriptor_offset(cs.group_count_addr);
      nir_def *ptr = load_root(b, 1, 64, nir_imm_int(b, offset), 4);
      nir_def *val = load_speculatable(b, 3, 32, ptr, 4);

      nir_def_rewrite_uses(&intrin->def, val);
      return true;
   }

   default:
      return false;
   }
}

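/*
 * Lower texture/sampler derefs to AGX bindless handles. The texture side
 * becomes an offset into the global image heap; the sampler side becomes a
 * 16-bit index read from the hk_sampled_image_descriptor. A few AGX-specific
 * texture ops (LOD bias, custom border color) read extra fields straight out
 * of the descriptor instead.
 */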
static bool
lower_tex(nir_builder *b, nir_tex_instr *tex,
          const struct lower_descriptors_ctx *ctx)
{
   b->cursor = nir_before_instr(&tex->instr);

   nir_def *texture = nir_steal_tex_src(tex, nir_tex_src_texture_deref);
   nir_def *sampler = nir_steal_tex_src(tex, nir_tex_src_sampler_deref);
   if (!texture) {
      assert(!sampler);
      return false;
   }

   nir_def *plane_ssa = nir_steal_tex_src(tex, nir_tex_src_plane);
   const uint32_t plane =
      plane_ssa ? nir_src_as_uint(nir_src_for_ssa(plane_ssa)) : 0;
   const uint64_t plane_offset_B =
      plane * sizeof(struct hk_sampled_image_descriptor);

   /* LOD bias is passed in the descriptor set, rather than embedded into
    * the sampler descriptor. There's no spot in the hardware descriptor,
    * plus this saves on precious sampler heap spots.
    */
   if (tex->op == nir_texop_lod_bias_agx) {
      unsigned offs =
         offsetof(struct hk_sampled_image_descriptor, lod_bias_fp16);

      nir_def *bias = load_resource_deref_desc(
         b, 1, 16, nir_src_as_deref(nir_src_for_ssa(sampler)),
         plane_offset_B + offs, ctx);

      nir_def_replace(&tex->def, bias);
      return true;
   }

   if (tex->op == nir_texop_has_custom_border_color_agx) {
      unsigned offs = offsetof(struct hk_sampled_image_descriptor, has_border);

      nir_def *res = load_resource_deref_desc(
         b, 1, 16, nir_src_as_deref(nir_src_for_ssa(sampler)),
         plane_offset_B + offs, ctx);

      nir_def_replace(&tex->def, nir_ine_imm(b, res, 0));
      return true;
   }

   if (tex->op == nir_texop_custom_border_color_agx) {
      unsigned offs = offsetof(struct hk_sampled_image_descriptor, border);

      nir_def *border = load_resource_deref_desc(
         b, 4, 32, nir_src_as_deref(nir_src_for_ssa(sampler)),
         plane_offset_B + offs, ctx);

      nir_alu_type T = nir_alu_type_get_base_type(tex->dest_type);
      border = nir_convert_to_bit_size(b, border, T, tex->def.bit_size);

      nir_def_replace(&tex->def, border);
      return true;
   }

   {
      unsigned offs =
         offsetof(struct hk_sampled_image_descriptor, image_offset);

      nir_def *offset = load_resource_deref_desc(
         b, 1, 32, nir_src_as_deref(nir_src_for_ssa(texture)),
         plane_offset_B + offs, ctx);

      nir_def *handle = image_heap_handle(b, offset);
      nir_tex_instr_add_src(tex, nir_tex_src_texture_handle, handle);
   }

   if (sampler != NULL) {
      unsigned offs =
         offsetof(struct hk_sampled_image_descriptor, sampler_index);

      if (tex->backend_flags & AGX_TEXTURE_FLAG_CLAMP_TO_0) {
         offs =
            offsetof(struct hk_sampled_image_descriptor, clamp_0_sampler_index);
      }

      nir_def *index = load_resource_deref_desc(
         b, 1, 16, nir_src_as_deref(nir_src_for_ssa(sampler)),
         plane_offset_B + offs, ctx);

      nir_tex_instr_add_src(tex, nir_tex_src_sampler_handle, index);
   }

   return true;
}

static bool
try_lower_descriptors_instr(nir_builder *b, nir_instr *instr, void *_data)
{
   const struct lower_descriptors_ctx *ctx = _data;

   switch (instr->type) {
   case nir_instr_type_tex:
      return lower_tex(b, nir_instr_as_tex(instr), ctx);
   case nir_instr_type_intrinsic:
      return try_lower_intrin(b, nir_instr_as_intrinsic(instr), ctx);
   default:
      return false;
   }
}

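/*
 * SSBO descriptors that could not be resolved above are lowered to a vec4
 * "address": the binding's 64-bit base address split into two 32-bit words
 * (with the binding stride tucked into the top 8 bits), the binding size in
 * bytes, and the byte offset within the binding.
 */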
static bool
lower_ssbo_resource_index(nir_builder *b, nir_intrinsic_instr *intrin,
                          const struct lower_descriptors_ctx *ctx)
{
   const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin);
   if (desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER &&
       desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)
      return false;

   b->cursor = nir_instr_remove(&intrin->instr);

   uint32_t set = nir_intrinsic_desc_set(intrin);
   uint32_t binding = nir_intrinsic_binding(intrin);
   nir_def *index = intrin->src[0].ssa;

   const struct hk_descriptor_set_binding_layout *binding_layout =
      get_binding_layout(set, binding, ctx);

   nir_def *binding_addr;
   uint8_t binding_stride;
   switch (binding_layout->type) {
   case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
   case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: {
      nir_def *set_addr = load_descriptor_set_addr(b, set, ctx);
      binding_addr = nir_iadd_imm(b, set_addr, binding_layout->offset);
      binding_stride = binding_layout->stride;
      break;
   }

   case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
      const uint32_t root_desc_addr_offset =
         hk_root_descriptor_offset(root_desc_addr);

      nir_def *root_desc_addr =
         load_root(b, 1, 64, nir_imm_int(b, root_desc_addr_offset), 8);

      nir_def *dynamic_buffer_start =
         nir_iadd_imm(b, load_dynamic_buffer_start(b, set, ctx),
                      binding_layout->dynamic_buffer_index);

      nir_def *dynamic_binding_offset =
         nir_iadd_imm(b,
                      nir_imul_imm(b, dynamic_buffer_start,
                                   sizeof(struct hk_buffer_address)),
                      hk_root_descriptor_offset(dynamic_buffers));

      binding_addr =
         nir_iadd(b, root_desc_addr, nir_u2u64(b, dynamic_binding_offset));
      binding_stride = sizeof(struct hk_buffer_address);
      break;
   }

   default:
      unreachable("Not an SSBO descriptor");
   }

   /* Tuck the stride in the top 8 bits of the binding address */
   binding_addr = nir_ior_imm(b, binding_addr, (uint64_t)binding_stride << 56);

   const uint32_t binding_size = binding_layout->array_size * binding_stride;
   nir_def *offset_in_binding = nir_imul_imm(b, index, binding_stride);

   nir_def *addr = nir_vec4(b, nir_unpack_64_2x32_split_x(b, binding_addr),
                            nir_unpack_64_2x32_split_y(b, binding_addr),
                            nir_imm_int(b, binding_size), offset_in_binding);

   nir_def_rewrite_uses(&intrin->def, addr);

   return true;
}

static bool
lower_ssbo_resource_reindex(nir_builder *b, nir_intrinsic_instr *intrin,
                            const struct lower_descriptors_ctx *ctx)
{
   const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin);
   if (desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER &&
       desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)
      return false;

   b->cursor = nir_instr_remove(&intrin->instr);

   nir_def *addr = intrin->src[0].ssa;
   nir_def *index = intrin->src[1].ssa;

   nir_def *addr_high32 = nir_channel(b, addr, 1);
   nir_def *stride = nir_ushr_imm(b, addr_high32, 24);
   nir_def *offset = nir_imul(b, index, stride);

   addr = nir_build_addr_iadd(b, addr, ctx->ssbo_addr_format, nir_var_mem_ssbo,
                              offset);
   nir_def_rewrite_uses(&intrin->def, addr);

   return true;
}

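/*
 * Turn the vec4 SSBO "address" built above into the actual buffer descriptor
 * by loading it from the descriptor set, first masking off the stride stored
 * in the top 8 bits of the base address.
 */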
static bool
lower_load_ssbo_descriptor(nir_builder *b, nir_intrinsic_instr *intrin,
                           const struct lower_descriptors_ctx *ctx)
{
   const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin);
   if (desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER &&
       desc_type != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)
      return false;

   b->cursor = nir_instr_remove(&intrin->instr);

   nir_def *addr = intrin->src[0].ssa;

   nir_def *desc;
   switch (ctx->ssbo_addr_format) {
   case nir_address_format_64bit_global_32bit_offset: {
      nir_def *base = nir_pack_64_2x32(b, nir_trim_vector(b, addr, 2));
      nir_def *offset = nir_channel(b, addr, 3);
      /* Mask off the binding stride */
      base = nir_iand_imm(b, base, BITFIELD64_MASK(56));
      desc = nir_load_global_constant_offset(b, 4, 32, base, offset,
                                             .align_mul = 16, .align_offset = 0,
                                             .access = ACCESS_CAN_SPECULATE);
      break;
   }

   case nir_address_format_64bit_bounded_global: {
      nir_def *base = nir_pack_64_2x32(b, nir_trim_vector(b, addr, 2));
      nir_def *size = nir_channel(b, addr, 2);
      nir_def *offset = nir_channel(b, addr, 3);
      /* Mask off the binding stride */
      base = nir_iand_imm(b, base, BITFIELD64_MASK(56));
      desc = nir_load_global_constant_bounded(
         b, 4, 32, base, offset, size, .align_mul = 16, .align_offset = 0,
         .access = ACCESS_CAN_SPECULATE);
      break;
   }

   default:
      unreachable("Unknown address mode");
   }

   nir_def_rewrite_uses(&intrin->def, desc);

   return true;
}

static bool
lower_ssbo_descriptor(nir_builder *b, nir_intrinsic_instr *intr, void *_data)
{
   const struct lower_descriptors_ctx *ctx = _data;

   switch (intr->intrinsic) {
   case nir_intrinsic_vulkan_resource_index:
      return lower_ssbo_resource_index(b, intr, ctx);
   case nir_intrinsic_vulkan_resource_reindex:
      return lower_ssbo_resource_reindex(b, intr, ctx);
   case nir_intrinsic_load_vulkan_descriptor:
      return lower_load_ssbo_descriptor(b, intr, ctx);
   default:
      return false;
   }
}

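/*
 * Entry point: lower all descriptor access in the shader. Descriptor array
 * indices are clamped whenever any robustness behaviour is enabled for the
 * pipeline.
 */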
bool
hk_nir_lower_descriptors(nir_shader *nir,
                         const struct vk_pipeline_robustness_state *rs,
                         uint32_t set_layout_count,
                         struct vk_descriptor_set_layout *const *set_layouts)
{
   struct lower_descriptors_ctx ctx = {
      .clamp_desc_array_bounds =
         rs->storage_buffers !=
            VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT ||

         rs->uniform_buffers !=
            VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT ||

         rs->images != VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DISABLED_EXT,

      .ssbo_addr_format = hk_buffer_addr_format(rs->storage_buffers),
      .ubo_addr_format = hk_buffer_addr_format(rs->uniform_buffers),
   };

   assert(set_layout_count <= HK_MAX_SETS);
   for (uint32_t s = 0; s < set_layout_count; s++) {
      if (set_layouts[s] != NULL)
         ctx.set_layouts[s] = vk_to_hk_descriptor_set_layout(set_layouts[s]);
   }

   /* First lower everything but complex SSBOs, then lower complex SSBOs.
    *
    * TODO: See if we can unify this, not sure if the fast path matters on
    * Apple. This is inherited from NVK.
    */
   bool pass_lower_descriptors = nir_shader_instructions_pass(
      nir, try_lower_descriptors_instr, nir_metadata_control_flow, &ctx);

   bool pass_lower_ssbo = nir_shader_intrinsics_pass(
      nir, lower_ssbo_descriptor, nir_metadata_control_flow, &ctx);

   return pass_lower_descriptors || pass_lower_ssbo;
}