1 /*
2  * Copyright 2024 Valve Corporation
3  * Copyright 2024 Alyssa Rosenzweig
4  * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
5  * SPDX-License-Identifier: MIT
6  */
7 #include "hk_shader.h"
8 
9 #include "agx_debug.h"
10 #include "agx_device.h"
11 #include "agx_helpers.h"
12 #include "agx_nir_lower_gs.h"
13 #include "glsl_types.h"
14 #include "nir.h"
15 #include "nir_builder.h"
16 
17 #include "agx_bo.h"
18 #include "hk_cmd_buffer.h"
19 #include "hk_descriptor_set_layout.h"
20 #include "hk_device.h"
21 #include "hk_physical_device.h"
22 #include "hk_sampler.h"
23 #include "hk_shader.h"
24 
25 #include "nir_builder_opcodes.h"
26 #include "nir_builtin_builder.h"
27 #include "nir_intrinsics.h"
28 #include "nir_intrinsics_indices.h"
29 #include "nir_xfb_info.h"
30 #include "shader_enums.h"
31 #include "vk_nir_convert_ycbcr.h"
32 #include "vk_pipeline.h"
33 #include "vk_pipeline_layout.h"
34 #include "vk_shader.h"
35 #include "vk_shader_module.h"
36 #include "vk_ycbcr_conversion.h"
37 
38 #include "asahi/compiler/agx_compile.h"
39 #include "asahi/compiler/agx_nir.h"
40 #include "asahi/compiler/agx_nir_texture.h"
41 #include "asahi/lib/agx_abi.h"
42 #include "asahi/lib/agx_linker.h"
43 #include "asahi/lib/agx_tilebuffer.h"
44 #include "asahi/lib/agx_uvs.h"
45 #include "compiler/spirv/nir_spirv.h"
46 
47 #include "util/blob.h"
48 #include "util/hash_table.h"
49 #include "util/macros.h"
50 #include "util/mesa-sha1.h"
51 #include "util/simple_mtx.h"
52 #include "util/u_debug.h"
53 #include "vulkan/vulkan_core.h"
54 
55 struct hk_fs_key {
56    bool zs_self_dep;
57 
58    /** True if sample shading is forced on via an API knob such as
59     * VkPipelineMultisampleStateCreateInfo::minSampleShading
60     */
61    bool force_sample_shading;
62 
63    uint8_t pad[2];
64 };
65 static_assert(sizeof(struct hk_fs_key) == 4, "packed");
66 
67 static void
68 shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align)
69 {
70    assert(glsl_type_is_vector_or_scalar(type));
71 
72    uint32_t comp_size =
73       glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
74    unsigned length = glsl_get_vector_elements(type);
75    *size = comp_size * length, *align = comp_size;
76 }
77 
78 uint64_t
79 hk_physical_device_compiler_flags(const struct hk_physical_device *pdev)
80 {
81    /* This could be optimized but it doesn't matter */
82    return pdev->dev.debug;
83 }
84 
85 const nir_shader_compiler_options *
86 hk_get_nir_options(struct vk_physical_device *vk_pdev, gl_shader_stage stage,
87                    UNUSED const struct vk_pipeline_robustness_state *rs)
88 {
89    return &agx_nir_options;
90 }
91 
92 static struct spirv_to_nir_options
93 hk_get_spirv_options(struct vk_physical_device *vk_pdev,
94                      UNUSED gl_shader_stage stage,
95                      const struct vk_pipeline_robustness_state *rs)
96 {
97    return (struct spirv_to_nir_options){
98       .ssbo_addr_format = hk_buffer_addr_format(rs->storage_buffers),
99       .phys_ssbo_addr_format = nir_address_format_64bit_global,
100       .ubo_addr_format = hk_buffer_addr_format(rs->uniform_buffers),
101       .shared_addr_format = nir_address_format_32bit_offset,
102       .min_ssbo_alignment = HK_MIN_SSBO_ALIGNMENT,
103       .min_ubo_alignment = HK_MIN_UBO_ALIGNMENT,
104    };
105 }
106 
107 static bool
108 lower_halt_to_return(nir_builder *b, nir_instr *instr, UNUSED void *_data)
109 {
110    if (instr->type != nir_instr_type_jump)
111       return false;
112 
113    nir_jump_instr *jump = nir_instr_as_jump(instr);
114    if (jump->type != nir_jump_halt)
115       return false;
116 
117    assert(b->impl == nir_shader_get_entrypoint(b->shader));
118    jump->type = nir_jump_return;
119    return true;
120 }
121 
122 void
123 hk_preprocess_nir_internal(struct vk_physical_device *vk_pdev, nir_shader *nir)
124 {
125    /* Must be lowered before nir_lower_io_to_temporaries */
126    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
127       NIR_PASS(_, nir, nir_lower_terminate_to_demote);
128       NIR_PASS(_, nir, nir_shader_instructions_pass, lower_halt_to_return,
129                nir_metadata_all, NULL);
130       NIR_PASS(_, nir, nir_lower_returns);
131    }
132 
133    /* Unroll loops before lowering indirects via nir_lower_io_to_temporaries */
134    UNUSED bool progress = false;
135    NIR_PASS(_, nir, nir_lower_global_vars_to_local);
136 
137    do {
138       progress = false;
139       NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
140       NIR_PASS(progress, nir, nir_copy_prop);
141       NIR_PASS(progress, nir, nir_opt_dce);
142       NIR_PASS(progress, nir, nir_opt_constant_folding);
143       NIR_PASS(progress, nir, nir_opt_loop);
144       NIR_PASS(progress, nir, nir_opt_loop_unroll);
145    } while (progress);
146 
147    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
148       struct nir_lower_sysvals_to_varyings_options sysvals_opts = {
149          .point_coord = true,
150       };
151 
152       nir_lower_sysvals_to_varyings(nir, &sysvals_opts);
153    }
154 
155    NIR_PASS(_, nir, nir_lower_system_values);
156 
157    /* Gather info before preprocess_nir but after some general lowering, so
158     * inputs_read and system_values_read are accurately set.
159     */
160    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
161 
162    NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir),
163               true, false);
164 
165    NIR_PASS(_, nir, nir_lower_global_vars_to_local);
166 
167    NIR_PASS(_, nir, nir_split_var_copies);
168    NIR_PASS(_, nir, nir_split_struct_vars, nir_var_function_temp);
169 
170    /* Optimize but allow copies because we haven't lowered them yet */
171    agx_preprocess_nir(nir, NULL);
172 
173    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
174    NIR_PASS(_, nir, nir_lower_var_copies);
175 }
176 
177 static void
178 hk_preprocess_nir(struct vk_physical_device *vk_pdev, nir_shader *nir)
179 {
180    hk_preprocess_nir_internal(vk_pdev, nir);
181    nir_lower_compute_system_values_options csv_options = {
182       .has_base_workgroup_id = true,
183    };
184    NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options);
185 }
186 
187 static void
188 hk_populate_fs_key(struct hk_fs_key *key,
189                    const struct vk_graphics_pipeline_state *state)
190 {
191    memset(key, 0, sizeof(*key));
192 
193    if (state == NULL)
194       return;
195 
196    if (state->pipeline_flags &
197        VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT)
198       key->zs_self_dep = true;
199 
200    /* We force per-sample interpolation whenever sampleShadingEnable is set
201     * regardless of minSampleShading or rasterizationSamples.
202     *
203     * When sampleShadingEnable is set, few guarantees are made about the
204     * location of interpolation of the inputs.  The only real guarantees are
205     * that the inputs are interpolated within the pixel and that you get at
206     * least `rasterizationSamples * minSampleShading` unique positions.
207     * Importantly, it does not require that when `rasterizationSamples *
208     * minSampleShading <= 1.0` that those positions are at the fragment
209     * center.  Therefore, it's valid to just always do per-sample all the time.
210     *
211     * The one caveat here is that we have to be careful about gl_SampleMaskIn.
212     * When `hk_fs_key::force_sample_shading = true` we also turn any reads of
213     * gl_SampleMaskIn into `1 << gl_SampleID` because the hardware sample mask
214     * is actually per-fragment, not per-pass.  We handle this by smashing
215     * minSampleShading to 1.0 whenever gl_SampleMaskIn is read.
216     */
217    const struct vk_multisample_state *ms = state->ms;
218    if (ms != NULL && ms->sample_shading_enable)
219       key->force_sample_shading = true;
220 }
221 
222 static void
223 hk_hash_graphics_state(struct vk_physical_device *device,
224                        const struct vk_graphics_pipeline_state *state,
225                        VkShaderStageFlags stages, blake3_hash blake3_out)
226 {
227    struct mesa_blake3 blake3_ctx;
228    _mesa_blake3_init(&blake3_ctx);
229    if (stages & VK_SHADER_STAGE_FRAGMENT_BIT) {
230       struct hk_fs_key key;
231       hk_populate_fs_key(&key, state);
232       _mesa_blake3_update(&blake3_ctx, &key, sizeof(key));
233 
234       const bool is_multiview = state->rp->view_mask != 0;
235       _mesa_blake3_update(&blake3_ctx, &is_multiview, sizeof(is_multiview));
236    }
237    _mesa_blake3_final(&blake3_ctx, blake3_out);
238 }
239 
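/* Select `data` when offs <= bound and zero otherwise. The scalar 32-bit case
 * can use the dedicated nir_bounds_agx intrinsic; other sizes and vector
 * shapes fall back to a bcsel against an all-zero value.
 */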
240 static nir_def *
241 bounds_check(nir_builder *b, nir_def *data, nir_def *offs, nir_def *bound)
242 {
243    if (data->bit_size == 32 && data->num_components == 1) {
244       return nir_bounds_agx(b, data, offs, bound);
245    } else {
246       /* TODO: Optimize */
247       return nir_bcsel(b, nir_uge(b, bound, offs), data,
248                        nir_imm_zero(b, data->num_components, data->bit_size));
249    }
250 }
251 
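/* Lower load_global_constant_offset/_bounded to a plain global constant load.
 * For the bounded form, out-of-bounds results are forced to zero: without soft
 * fault we branch around the load entirely, with soft fault we load
 * speculatively and zero the result afterwards.
 */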
252 static bool
253 lower_load_global_constant_offset_instr(nir_builder *b,
254                                         nir_intrinsic_instr *intrin, void *data)
255 {
256    if (intrin->intrinsic != nir_intrinsic_load_global_constant_offset &&
257        intrin->intrinsic != nir_intrinsic_load_global_constant_bounded)
258       return false;
259 
260    b->cursor = nir_before_instr(&intrin->instr);
261    bool *has_soft_fault = data;
262 
263    nir_def *base_addr = intrin->src[0].ssa;
264    nir_def *offset = intrin->src[1].ssa;
265    nir_def *bound = NULL;
266    nir_def *zero = NULL;
267 
268    unsigned bit_size = intrin->def.bit_size;
269    assert(bit_size >= 8 && bit_size % 8 == 0);
270    unsigned byte_size = bit_size / 8;
271    unsigned load_size = byte_size * intrin->num_components;
272 
273    if (intrin->intrinsic == nir_intrinsic_load_global_constant_bounded) {
274       bound = intrin->src[2].ssa;
275       zero = nir_imm_zero(b, intrin->num_components, bit_size);
276 
277       nir_def *sat_offset =
278          nir_umin(b, offset, nir_imm_int(b, UINT32_MAX - (load_size - 1)));
279       nir_def *in_bounds =
280          nir_ilt(b, nir_iadd_imm(b, sat_offset, load_size - 1), bound);
281 
282       /* If we do not have soft fault, we branch on the bounds check. This is
283        * slow; fortunately, we always have soft fault for release drivers.
284        *
285        * With soft fault, we speculatively load and smash to zero at the end.
286        */
287       if (!(*has_soft_fault))
288          nir_push_if(b, in_bounds);
289    }
290 
291    unsigned align_mul = nir_intrinsic_align_mul(intrin);
292    unsigned align_offset = nir_intrinsic_align_offset(intrin);
293 
294    nir_def *val = nir_build_load_global_constant(
295       b, intrin->def.num_components, intrin->def.bit_size,
296       nir_iadd(b, base_addr, nir_u2u64(b, offset)), .align_mul = align_mul,
297       .align_offset = align_offset, .access = nir_intrinsic_access(intrin));
298 
299    if (intrin->intrinsic == nir_intrinsic_load_global_constant_bounded) {
300       if (*has_soft_fault) {
301          nir_scalar offs = nir_scalar_resolved(offset, 0);
302          if (nir_scalar_is_const(offs)) {
303             /* Calculate last byte loaded */
304             unsigned offs_imm = nir_scalar_as_uint(offs) + load_size;
305 
306             /* Simplify the bounds check. Uniform buffers are bounds checked at
307              * 64B granularity, so `bound` is a multiple of K = 64. Then
308              *
309              * offs_imm < bound <==> round_down(offs_imm, K) < bound. Proof:
310              *
311              * "=>" round_down(offs_imm, K) <= offs_imm < bound.
312              *
313              * "<=" Let a, b be integer s.t. offs_imm = K a + b with b < K.
314              *      Note round_down(offs_imm, K) = Ka.
315              *
316              *      Let c be integer s.t. bound = Kc.
317              *      We have Ka < Kc => a < c.
318              *      b < K => Ka + b < K(a + 1).
319              *
320              *      a < c with integers => a + 1 <= c.
321              *      offs_imm < K(a + 1) <= Kc = bound.
322              *      Hence offs_imm < bound.
323              */
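            /* Worked example with K = 64: offs_imm = 72 rounds down to 64.
             * Against bound = 64 both comparisons fail (out of bounds);
             * against bound = 128 both succeed (in bounds), so checking the
             * rounded value is equivalent.
             */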
324             assert(align_mul == 64);
325             offs_imm &= ~(align_mul - 1);
326 
327             /* Bounds checks are `offset > bound ? 0 : val` so if offset = 0,
328              * the bounds check is useless.
329              */
330             if (offs_imm) {
331                val = bounds_check(b, val, nir_imm_int(b, offs_imm), bound);
332             }
333          } else {
334             offset = nir_iadd_imm(b, offset, load_size);
335             val = bounds_check(b, val, offset, bound);
336          }
337 
338       } else {
339          nir_pop_if(b, NULL);
340          val = nir_if_phi(b, val, zero);
341       }
342    }
343 
344    nir_def_replace(&intrin->def, val);
345    return true;
346 }
347 
348 struct lower_ycbcr_state {
349    uint32_t set_layout_count;
350    struct vk_descriptor_set_layout *const *set_layouts;
351 };
352 
353 static const struct vk_ycbcr_conversion_state *
354 lookup_ycbcr_conversion(const void *_state, uint32_t set, uint32_t binding,
355                         uint32_t array_index)
356 {
357    const struct lower_ycbcr_state *state = _state;
358    assert(set < state->set_layout_count);
359    assert(state->set_layouts[set] != NULL);
360    const struct hk_descriptor_set_layout *set_layout =
361       vk_to_hk_descriptor_set_layout(state->set_layouts[set]);
362    assert(binding < set_layout->binding_count);
363 
364    const struct hk_descriptor_set_binding_layout *bind_layout =
365       &set_layout->binding[binding];
366 
367    if (bind_layout->immutable_samplers == NULL)
368       return NULL;
369 
370    array_index = MIN2(array_index, bind_layout->array_size - 1);
371 
372    const struct hk_sampler *sampler =
373       bind_layout->immutable_samplers[array_index];
374 
375    return sampler && sampler->vk.ycbcr_conversion
376              ? &sampler->vk.ycbcr_conversion->state
377              : NULL;
378 }
379 
380 static int
381 glsl_type_size(const struct glsl_type *type, bool bindless)
382 {
383    return glsl_count_attribute_slots(type, false);
384 }
385 
386 /*
387  * This is the world's worst multiview implementation. We simply duplicate each
388  * draw on the CPU side, changing a uniform in between, and then plumb the view
389  * index into the layer ID here. Whatever, it works.
390  *
391  * The "proper" implementation on AGX would use vertex amplification, but a
392  * MacBook is not a VR headset.
393  */
394 static void
395 hk_lower_multiview(nir_shader *nir)
396 {
397    /* If there's an existing layer ID write, ignore it. This avoids validation
398     * splat with vk_meta.
399     */
400    nir_variable *existing = nir_find_variable_with_location(
401       nir, nir_var_shader_out, VARYING_SLOT_LAYER);
402 
403    if (existing) {
404       existing->data.mode = nir_var_shader_temp;
405       existing->data.location = 0;
406       nir_fixup_deref_modes(nir);
407    }
408 
409    /* Now write the view index as the layer */
410    nir_builder b =
411       nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(nir)));
412 
413    nir_variable *layer =
414       nir_variable_create(nir, nir_var_shader_out, glsl_uint_type(), NULL);
415 
416    layer->data.location = VARYING_SLOT_LAYER;
417 
418    nir_store_var(&b, layer, nir_load_view_index(&b), nir_component_mask(1));
419    b.shader->info.outputs_written |= VARYING_BIT_LAYER;
420 }
421 
422 /*
423  * KHR_maintenance5 requires that points rasterize with a default point size of
424  * 1.0, while our hardware requires an explicit point size write for this.
425  * Since topology may be dynamic, we insert an unconditional write if necessary.
426  */
427 static bool
428 hk_nir_insert_psiz_write(nir_shader *nir)
429 {
430    nir_function_impl *impl = nir_shader_get_entrypoint(nir);
431 
432    if (nir->info.outputs_written & VARYING_BIT_PSIZ) {
433       nir_metadata_preserve(impl, nir_metadata_all);
434       return false;
435    }
436 
437    nir_builder b = nir_builder_at(nir_after_impl(impl));
438 
439    nir_store_output(&b, nir_imm_float(&b, 1.0), nir_imm_int(&b, 0),
440                     .write_mask = nir_component_mask(1),
441                     .io_semantics.location = VARYING_SLOT_PSIZ,
442                     .io_semantics.num_slots = 1, .src_type = nir_type_float32);
443 
444    nir->info.outputs_written |= VARYING_BIT_PSIZ;
445    nir_metadata_preserve(b.impl, nir_metadata_control_flow);
446    return true;
447 }
448 
449 static nir_def *
450 query_custom_border(nir_builder *b, nir_tex_instr *tex)
451 {
452    return nir_build_texture_query(b, tex, nir_texop_custom_border_color_agx, 4,
453                                   tex->dest_type, false, false);
454 }
455 
456 static nir_def *
457 has_custom_border(nir_builder *b, nir_tex_instr *tex)
458 {
459    return nir_build_texture_query(b, tex, nir_texop_has_custom_border_color_agx,
460                                   1, nir_type_bool1, false, false);
461 }
462 
463 static bool
lower(nir_builder * b,nir_instr * instr,UNUSED void * _data)464 lower(nir_builder *b, nir_instr *instr, UNUSED void *_data)
465 {
466    if (instr->type != nir_instr_type_tex)
467       return false;
468 
469    nir_tex_instr *tex = nir_instr_as_tex(instr);
470    if (!nir_tex_instr_need_sampler(tex) || nir_tex_instr_is_query(tex))
471       return false;
472 
473    /* XXX: this is a really weird edge case, is this even well-defined? */
474    if (tex->is_shadow)
475       return false;
476 
477    b->cursor = nir_after_instr(&tex->instr);
478    nir_def *has_custom = has_custom_border(b, tex);
479 
480    nir_instr *orig = nir_instr_clone(b->shader, &tex->instr);
481    nir_builder_instr_insert(b, orig);
482    nir_def *clamp_to_1 = &nir_instr_as_tex(orig)->def;
483 
484    nir_push_if(b, has_custom);
485    nir_def *replaced = NULL;
486    {
487       /* Sample again, this time with clamp-to-0 instead of clamp-to-1 */
488       nir_instr *clone_instr = nir_instr_clone(b->shader, &tex->instr);
489       nir_builder_instr_insert(b, clone_instr);
490 
491       nir_tex_instr *tex_0 = nir_instr_as_tex(clone_instr);
492       nir_def *clamp_to_0 = &tex_0->def;
493 
494       tex_0->backend_flags |= AGX_TEXTURE_FLAG_CLAMP_TO_0;
495 
496       /* Grab the border colour */
497       nir_def *border = query_custom_border(b, tex_0);
498 
499       if (tex->op == nir_texop_tg4) {
500          border = nir_replicate(b, nir_channel(b, border, tex->component), 4);
501       }
502 
503       /* Combine together with the border */
504       if (nir_alu_type_get_base_type(tex->dest_type) == nir_type_float &&
505           tex->op != nir_texop_tg4) {
506 
507          /* For floats, lerp together:
508           *
509           * For border texels:  (1 * border) + (0 * border      ) = border
510           * For regular texels: (x * border) + (x * (1 - border)) = x.
511           *
512           * Linear filtering is linear (duh), so lerping is compatible.
513           */
514          replaced = nir_flrp(b, clamp_to_0, clamp_to_1, border);
515       } else {
516          /* For integers, just select componentwise since there is no linear
517           * filtering. Gathers also use this path since they are unfiltered in
518           * each component.
519           */
520          replaced = nir_bcsel(b, nir_ieq(b, clamp_to_0, clamp_to_1), clamp_to_0,
521                               border);
522       }
523    }
524    nir_pop_if(b, NULL);
525 
526    /* Put it together with a phi */
527    nir_def *phi = nir_if_phi(b, replaced, clamp_to_1);
528    nir_def_replace(&tex->def, phi);
529    return true;
530 }
531 
532 static bool
533 agx_nir_lower_custom_border(nir_shader *nir)
534 {
535    return nir_shader_instructions_pass(nir, lower, nir_metadata_none, NULL);
536 }
537 
538 /*
539  * In Vulkan, the VIEWPORT should read 0 in the fragment shader if it is not
540  * written by the vertex shader, but in our implementation, the varying would
541  * otherwise be undefined. This small pass predicates VIEWPORT reads based on
542  * whether the hardware vertex shader writes the VIEWPORT (nonzero UVS index).
543  */
544 static bool
545 lower_viewport_fs(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data)
546 {
547    if (intr->intrinsic != nir_intrinsic_load_input)
548       return false;
549 
550    nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
551    if (sem.location != VARYING_SLOT_VIEWPORT)
552       return false;
553 
554    b->cursor = nir_after_instr(&intr->instr);
555    nir_def *orig = &intr->def;
556 
557    nir_def *uvs = nir_load_uvs_index_agx(b, .io_semantics = sem);
558    nir_def *def = nir_bcsel(b, nir_ine_imm(b, uvs, 0), orig, nir_imm_int(b, 0));
559 
560    nir_def_rewrite_uses_after(orig, def, def->parent_instr);
561    return true;
562 }
563 
564 static bool
565 lower_subpass_dim(nir_builder *b, nir_instr *instr, UNUSED void *_data)
566 {
567    if (instr->type != nir_instr_type_tex)
568       return false;
569 
570    nir_tex_instr *tex = nir_instr_as_tex(instr);
571    if (tex->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS)
572       tex->sampler_dim = GLSL_SAMPLER_DIM_2D;
573    else if (tex->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS)
574       tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
575    else
576       return false;
577 
578    return true;
579 }
580 
581 static bool
582 should_lower_robust(const nir_intrinsic_instr *intr, const void *_)
583 {
584    /* The hardware is robust, but our software image atomics are not. Unlike the
585     * GL driver, we don't use the common buffer image lowering; instead we
586     * rely on the agx_nir_lower_texture lowering for robustImageAccess2 semantics.
587     */
588    return intr->intrinsic == nir_intrinsic_image_deref_atomic ||
589           intr->intrinsic == nir_intrinsic_image_deref_atomic_swap;
590 }
591 
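/* Vulkan-specific lowering shared by all stages before backend compilation:
 * input attachments and subpass texturing for fragment shaders, multiview for
 * vertex shaders, Y'CbCr conversion, descriptor sets, explicit I/O address
 * formats, robustness/bounds checking, shared memory layout and I/O lowering.
 */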
592 void
593 hk_lower_nir(struct hk_device *dev, nir_shader *nir,
594              const struct vk_pipeline_robustness_state *rs, bool is_multiview,
595              uint32_t set_layout_count,
596              struct vk_descriptor_set_layout *const *set_layouts)
597 {
598    if (HK_PERF(dev, NOROBUST)) {
599       rs = &vk_robustness_disabled;
600    }
601 
602    const nir_opt_access_options access_options = {
603       .is_vulkan = true,
604    };
605    NIR_PASS_V(nir, nir_opt_access, &access_options);
606 
607    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
608       NIR_PASS(_, nir, nir_lower_input_attachments,
609                &(nir_input_attachment_options){
610                   .use_fragcoord_sysval = true,
611                   .use_layer_id_sysval = true,
612                   .use_view_id_for_layer = is_multiview,
613                });
614 
615       NIR_PASS(_, nir, nir_shader_instructions_pass, lower_subpass_dim,
616                nir_metadata_all, NULL);
617       NIR_PASS(_, nir, nir_lower_wpos_center);
618    }
619 
620    /* XXX: should be last geometry stage, how do I get to that? */
621    if (nir->info.stage == MESA_SHADER_VERTEX) {
622       if (is_multiview)
623          hk_lower_multiview(nir);
624    }
625 
626    if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
627       NIR_PASS(_, nir, nir_lower_patch_vertices,
628                nir->info.tess.tcs_vertices_out, NULL);
629    }
630 
631    const struct lower_ycbcr_state ycbcr_state = {
632       .set_layout_count = set_layout_count,
633       .set_layouts = set_layouts,
634    };
635    NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex, lookup_ycbcr_conversion,
636             &ycbcr_state);
637 
638    /* Lower push constants before lower_descriptors */
639    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const,
640             nir_address_format_32bit_offset);
641 
642    // NIR_PASS(_, nir, nir_opt_large_constants, NULL, 32);
643 
644    /* Turn cache flushes into image coherency bits while we still have derefs */
645    NIR_PASS(_, nir, nir_lower_memory_model);
646 
647    NIR_PASS(_, nir, nir_lower_robust_access, should_lower_robust, NULL);
648 
649    /* We must do early lowering before hk_nir_lower_descriptors, since this will
650     * create lod_bias_agx instructions.
651     */
652    NIR_PASS(_, nir, agx_nir_lower_texture_early, true /* support_lod_bias */);
653 
654    if (!HK_PERF(dev, NOBORDER)) {
655       NIR_PASS(_, nir, agx_nir_lower_custom_border);
656    }
657 
658    NIR_PASS(_, nir, hk_nir_lower_descriptors, rs, set_layout_count,
659             set_layouts);
660    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global,
661             nir_address_format_64bit_global);
662    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo,
663             hk_buffer_addr_format(rs->storage_buffers));
664    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
665             hk_buffer_addr_format(rs->uniform_buffers));
666 
667    /* Before inserting bounds checks, we want to do a fair bit of optimization.
668     * lower_load_global_constant_offset_instr has special optimizations for
669     * constant offsets, so we want as many offsets to be constant as possible.
670     */
671    bool progress;
672    do {
673       progress = false;
674       NIR_PASS(progress, nir, nir_opt_constant_folding);
675       NIR_PASS(progress, nir, nir_opt_algebraic);
676       NIR_PASS(progress, nir, nir_copy_prop);
677       NIR_PASS(progress, nir, nir_opt_dce);
678    } while (progress);
679 
680    bool soft_fault = agx_has_soft_fault(&dev->dev);
681    NIR_PASS(_, nir, nir_shader_intrinsics_pass,
682             lower_load_global_constant_offset_instr, nir_metadata_none,
683             &soft_fault);
684 
685    if (!nir->info.shared_memory_explicit_layout) {
686       /* There may be garbage in shared_size, but it's the job of
687        * nir_lower_vars_to_explicit_types to allocate it. We have to reset it
688        * to avoid overallocation.
689        */
690       nir->info.shared_size = 0;
691 
692       NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared,
693                shared_var_info);
694    }
695    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared,
696             nir_address_format_32bit_offset);
697 
698    if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
699       /* Align everything up to 16B so we can write whole vec4s. */
700       nir->info.shared_size = align(nir->info.shared_size, 16);
701       NIR_PASS(_, nir, nir_zero_initialize_shared_memory, nir->info.shared_size,
702                16);
703 
704       /* We need to call lower_compute_system_values again because
705        * nir_zero_initialize_shared_memory generates load_invocation_id which
706        * has to be lowered to load_invocation_index.
707        */
708       NIR_PASS(_, nir, nir_lower_compute_system_values, NULL);
709    }
710 
711    /* TODO: we can do indirect VS output */
712    nir_variable_mode lower_indirect_modes = 0;
713    if (nir->info.stage == MESA_SHADER_FRAGMENT)
714       lower_indirect_modes |= nir_var_shader_out;
715    else if (nir->info.stage == MESA_SHADER_VERTEX)
716       lower_indirect_modes |= nir_var_shader_in | nir_var_shader_out;
717 
718    NIR_PASS(_, nir, nir_lower_indirect_derefs, lower_indirect_modes,
719             UINT32_MAX);
720 
721    NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
722             glsl_type_size,
723             nir_lower_io_lower_64bit_to_32 |
724                nir_lower_io_use_interpolated_input_intrinsics);
725 
726    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
727       NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_viewport_fs,
728                nir_metadata_control_flow, NULL);
729    }
730 
731    NIR_PASS(_, nir, agx_nir_lower_texture);
732    NIR_PASS(_, nir, agx_nir_lower_multisampled_image_store);
733 
734    agx_preprocess_nir(nir, dev->dev.libagx);
735    NIR_PASS(_, nir, nir_opt_conditional_discard);
736    NIR_PASS(_, nir, nir_opt_if,
737             nir_opt_if_optimize_phi_true_false | nir_opt_if_avoid_64bit_phis);
738 }
739 
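/* Upload the compiled binary to its own BO when a preamble or rodata section
 * is present, fast-link immediately for stages that only ever have a single
 * linked variant, and pack the COUNTS word (uniform, preshader and
 * sampler-state register counts).
 */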
740 static void
741 hk_upload_shader(struct hk_device *dev, struct hk_shader *shader)
742 {
743    if (shader->b.info.has_preamble || shader->b.info.rodata.size_16) {
744       /* TODO: Do we want to compact? Revisit when we rework prologs/epilogs. */
745       size_t size = shader->b.info.binary_size;
746       assert(size > 0);
747 
748       shader->bo = agx_bo_create(&dev->dev, size, 0,
749                                  AGX_BO_EXEC | AGX_BO_LOW_VA, "Preamble");
750       memcpy(agx_bo_map(shader->bo), shader->b.binary, size);
751       shader->preamble_addr =
752          shader->bo->va->addr + shader->b.info.preamble_offset;
753    }
754 
755    if (!shader->linked.ht) {
756       /* If we only have a single variant, link now. */
757       shader->only_linked = hk_fast_link(dev, false, shader, NULL, NULL, 0);
758    }
759 
760    if (shader->info.stage == MESA_SHADER_FRAGMENT) {
761       agx_pack_fragment_face_2(&shader->frag_face, 0, &shader->b.info);
762    }
763 
764    agx_pack(&shader->counts, COUNTS, cfg) {
765       cfg.uniform_register_count = shader->b.info.push_count;
766       cfg.preshader_register_count = shader->b.info.nr_preamble_gprs;
767       cfg.sampler_state_register_count = agx_translate_sampler_state_count(
768          shader->b.info.uses_txf ? 1 : 0, false);
769    }
770 }
771 
772 DERIVE_HASH_TABLE(hk_fast_link_key_vs);
773 DERIVE_HASH_TABLE(hk_fast_link_key_fs);
774 
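/* Only vertex and fragment shaders are fast-linked against multiple
 * prolog/epilog combinations, so only they get a hash table of linked
 * variants; every other stage is linked exactly once in hk_upload_shader.
 */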
775 static VkResult
776 hk_init_link_ht(struct hk_shader *shader, gl_shader_stage sw_stage)
777 {
778    simple_mtx_init(&shader->linked.lock, mtx_plain);
779 
780    bool multiple_variants =
781       sw_stage == MESA_SHADER_VERTEX || sw_stage == MESA_SHADER_FRAGMENT;
782 
783    if (!multiple_variants)
784       return VK_SUCCESS;
785 
786    if (sw_stage == MESA_SHADER_VERTEX)
787       shader->linked.ht = hk_fast_link_key_vs_table_create(NULL);
788    else
789       shader->linked.ht = hk_fast_link_key_fs_table_create(NULL);
790 
791    return (shader->linked.ht == NULL) ? VK_ERROR_OUT_OF_HOST_MEMORY
792                                       : VK_SUCCESS;
793 }
794 
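/* Compile a single hk_shader variant from NIR: gather stage-specific info,
 * apply the remaining stage-specific lowering (fragment epilogs, sample
 * shading, tessellation), run the AGX backend compiler, then record transform
 * feedback and constant data and upload the result.
 */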
795 static VkResult
796 hk_compile_nir(struct hk_device *dev, const VkAllocationCallbacks *pAllocator,
797                nir_shader *nir, VkShaderCreateFlagsEXT shader_flags,
798                const struct vk_pipeline_robustness_state *rs,
799                const struct hk_fs_key *fs_key, struct hk_shader *shader,
800                gl_shader_stage sw_stage, bool hw, nir_xfb_info *xfb_info)
801 {
802    unsigned vs_uniform_base = 0;
803 
804    /* For now, only shader objects are supported */
805    if (sw_stage == MESA_SHADER_VERTEX) {
806       vs_uniform_base =
807          6 * DIV_ROUND_UP(
808                 BITSET_LAST_BIT(shader->info.vs.attrib_components_read), 4);
809    } else if (sw_stage == MESA_SHADER_FRAGMENT) {
810       shader->info.fs.interp = agx_gather_interp_info(nir);
811       shader->info.fs.writes_memory = nir->info.writes_memory;
812 
813       /* Discards must be lowered before lowering MSAA in order to handle discards */
814       NIR_PASS(_, nir, agx_nir_lower_discard_zs_emit);
815       NIR_PASS(_, nir, agx_nir_lower_fs_output_to_epilog,
816                &shader->info.fs.epilog_key);
817       NIR_PASS(_, nir, agx_nir_lower_sample_mask);
818 
819       if (nir->info.fs.uses_sample_shading) {
820          /* Ensure the sample mask is preserved in register */
821          nir_builder b =
822             nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(nir)));
823 
824          nir_def *mask =
825             nir_load_exported_agx(&b, 1, 16, .base = AGX_ABI_FIN_SAMPLE_MASK);
826 
827          nir_export_agx(&b, mask, .base = AGX_ABI_FOUT_SAMPLE_MASK);
828 
829          NIR_PASS(_, nir, agx_nir_lower_to_per_sample);
830       }
831 
832       NIR_PASS(_, nir, agx_nir_lower_fs_active_samples_to_register);
833       NIR_PASS(_, nir, agx_nir_lower_interpolation);
834    } else if (sw_stage == MESA_SHADER_TESS_EVAL ||
835               sw_stage == MESA_SHADER_TESS_CTRL) {
836 
837       shader->info.tess.info.ccw = nir->info.tess.ccw;
838       shader->info.tess.info.points = nir->info.tess.point_mode;
839       shader->info.tess.info.spacing = nir->info.tess.spacing;
840       shader->info.tess.info.mode = nir->info.tess._primitive_mode;
841 
842       if (sw_stage == MESA_SHADER_TESS_CTRL) {
843          shader->info.tess.tcs_output_patch_size =
844             nir->info.tess.tcs_vertices_out;
845          shader->info.tess.tcs_per_vertex_outputs =
846             agx_tcs_per_vertex_outputs(nir);
847          shader->info.tess.tcs_nr_patch_outputs =
848             util_last_bit(nir->info.patch_outputs_written);
849          shader->info.tess.tcs_output_stride = agx_tcs_output_stride(nir);
850       } else {
851          /* This destroys info so it needs to happen after the gather */
852          NIR_PASS(_, nir, agx_nir_lower_tes, dev->dev.libagx, hw);
853       }
854    }
855 
856    uint64_t outputs = nir->info.outputs_written;
857    if (!hw &&
858        (sw_stage == MESA_SHADER_VERTEX || sw_stage == MESA_SHADER_TESS_EVAL)) {
859       nir->info.stage = MESA_SHADER_COMPUTE;
860       memset(&nir->info.cs, 0, sizeof(nir->info.cs));
861       nir->xfb_info = NULL;
862    }
863 
864    /* XXX: rename */
865    NIR_PASS(_, nir, hk_lower_uvs_index, vs_uniform_base);
866 
867 #if 0
868    /* TODO */
869    nir_variable_mode robust2_modes = 0;
870    if (rs->uniform_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT)
871       robust2_modes |= nir_var_mem_ubo;
872    if (rs->storage_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT)
873       robust2_modes |= nir_var_mem_ssbo;
874 #endif
875 
876    struct agx_shader_key backend_key = {
877       .dev = agx_gather_device_key(&dev->dev),
878       .reserved_preamble = 128 /* TODO */,
879       .libagx = dev->dev.libagx,
880       .no_stop = nir->info.stage == MESA_SHADER_FRAGMENT,
881       .has_scratch = !nir->info.internal,
882       .promote_constants = true,
883    };
884 
885    /* For now, sample shading is always dynamic. Indicate that. */
886    if (nir->info.stage == MESA_SHADER_FRAGMENT &&
887        nir->info.fs.uses_sample_shading)
888       backend_key.fs.inside_sample_loop = true;
889 
890    simple_mtx_t *lock = NULL;
891    if (agx_get_compiler_debug())
892       lock = &hk_device_physical(dev)->debug_compile_lock;
893 
894    if (lock)
895       simple_mtx_lock(lock);
896 
897    agx_compile_shader_nir(nir, &backend_key, NULL, &shader->b);
898 
899    if (lock)
900       simple_mtx_unlock(lock);
901 
902    shader->code_ptr = shader->b.binary;
903    shader->code_size = shader->b.info.binary_size;
904 
905    shader->info.stage = sw_stage;
906    shader->info.clip_distance_array_size = nir->info.clip_distance_array_size;
907    shader->info.cull_distance_array_size = nir->info.cull_distance_array_size;
908    shader->b.info.outputs = outputs;
909 
910    if (xfb_info) {
911       assert(xfb_info->output_count < ARRAY_SIZE(shader->info.xfb_outputs));
912 
913       memcpy(&shader->info.xfb_info, xfb_info,
914              nir_xfb_info_size(xfb_info->output_count));
915 
916       typed_memcpy(shader->info.xfb_stride, nir->info.xfb_stride, 4);
917    }
918 
919    if (nir->constant_data_size > 0) {
920       uint32_t data_size = align(nir->constant_data_size, HK_MIN_UBO_ALIGNMENT);
921 
922       void *data = malloc(data_size);
923       if (data == NULL) {
924          ralloc_free(nir);
925          return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
926       }
927 
928       memcpy(data, nir->constant_data, nir->constant_data_size);
929 
930       assert(nir->constant_data_size <= data_size);
931       memset(data + nir->constant_data_size, 0,
932              data_size - nir->constant_data_size);
933 
934       shader->data_ptr = data;
935       shader->data_size = data_size;
936    }
937 
938    ralloc_free(nir);
939 
940    VkResult result = hk_init_link_ht(shader, sw_stage);
941    if (result != VK_SUCCESS)
942       return vk_error(dev, result);
943 
944    hk_upload_shader(dev, shader);
945    return VK_SUCCESS;
946 }
947 
948 static const struct vk_shader_ops hk_shader_ops;
949 
950 static void
951 hk_destroy_linked_shader(struct hk_device *dev, struct hk_linked_shader *linked)
952 {
953    agx_bo_unreference(&dev->dev, linked->b.bo);
954    ralloc_free(linked);
955 }
956 
957 static void
958 hk_shader_destroy(struct hk_device *dev, struct hk_shader *s)
959 {
960    free((void *)s->code_ptr);
961    free((void *)s->data_ptr);
962    agx_bo_unreference(&dev->dev, s->bo);
963 
964    simple_mtx_destroy(&s->linked.lock);
965 
966    if (s->only_linked)
967       hk_destroy_linked_shader(dev, s->only_linked);
968 
969    if (s->linked.ht) {
970       hash_table_foreach(s->linked.ht, entry) {
971          hk_destroy_linked_shader(dev, entry->data);
972       }
973       _mesa_hash_table_destroy(s->linked.ht, NULL);
974    }
975 }
976 
977 void
978 hk_api_shader_destroy(struct vk_device *vk_dev, struct vk_shader *vk_shader,
979                       const VkAllocationCallbacks *pAllocator)
980 {
981    struct hk_device *dev = container_of(vk_dev, struct hk_device, vk);
982    struct hk_api_shader *obj =
983       container_of(vk_shader, struct hk_api_shader, vk);
984 
985    hk_foreach_variant(obj, shader) {
986       hk_shader_destroy(dev, shader);
987    }
988 
989    vk_shader_free(&dev->vk, pAllocator, &obj->vk);
990 }
991 
992 static void
993 hk_lower_hw_vs(nir_shader *nir, struct hk_shader *shader)
994 {
995    /* Point size must be clamped; excessively large points don't render
996     * properly on G13.
997     *
998     * Must be synced with pointSizeRange.
999     */
1000    NIR_PASS(_, nir, nir_lower_point_size, 1.0f, 511.95f);
1001 
1002    /* TODO: Optimize out for monolithic? */
1003    NIR_PASS(_, nir, hk_nir_insert_psiz_write);
1004 
1005    NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
1006    NIR_PASS(_, nir, agx_nir_lower_cull_distance_vs);
1007 
1008    NIR_PASS(_, nir, agx_nir_lower_uvs, &shader->info.uvs);
1009 
1010    shader->info.vs.cull_distance_array_size =
1011       nir->info.cull_distance_array_size;
1012 }
1013 
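/* Compile every variant of one API shader object up front: geometry shaders
 * get main, count and pre-GS variants for both rast_disc modes plus a
 * rasterization variant, vertex/tess-eval shaders get hardware and software
 * (compute) variants for the geometry/tessellation path, and all other stages
 * get a single variant.
 */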
1014 VkResult
1015 hk_compile_shader(struct hk_device *dev, struct vk_shader_compile_info *info,
1016                   const struct vk_graphics_pipeline_state *state,
1017                   const VkAllocationCallbacks *pAllocator,
1018                   struct hk_api_shader **shader_out)
1019 {
1020    VkResult result;
1021 
1022    /* We consume the NIR, regardless of success or failure */
1023    nir_shader *nir = info->nir;
1024 
1025    size_t size = sizeof(struct hk_api_shader) +
1026                  sizeof(struct hk_shader) * hk_num_variants(info->stage);
1027    struct hk_api_shader *obj =
1028       vk_shader_zalloc(&dev->vk, &hk_shader_ops, info->stage, pAllocator, size);
1029 
1030    if (obj == NULL) {
1031       ralloc_free(nir);
1032       return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
1033    }
1034 
1035    /* TODO: Multiview with ESO */
1036    const bool is_multiview = state && state->rp->view_mask != 0;
1037 
1038    hk_lower_nir(dev, nir, info->robustness, is_multiview,
1039                 info->set_layout_count, info->set_layouts);
1040 
1041    gl_shader_stage sw_stage = nir->info.stage;
1042 
1043    struct hk_fs_key fs_key_tmp, *fs_key = NULL;
1044    if (sw_stage == MESA_SHADER_FRAGMENT) {
1045       hk_populate_fs_key(&fs_key_tmp, state);
1046       fs_key = &fs_key_tmp;
1047 
1048       nir->info.fs.uses_sample_shading |= fs_key->force_sample_shading;
1049 
1050       /* Force late-Z for Z/S self-deps. TODO: There's probably a less silly way
1051        * to do this.
1052        */
1053       if (fs_key->zs_self_dep) {
1054          nir_builder b =
1055             nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(nir)));
1056          nir_discard_if(&b, nir_imm_false(&b));
1057          nir->info.fs.uses_discard = true;
1058       }
1059 
1060       NIR_PASS(_, nir, agx_nir_lower_sample_intrinsics, false);
1061    } else if (sw_stage == MESA_SHADER_TESS_CTRL) {
1062       NIR_PASS_V(nir, agx_nir_lower_tcs, dev->dev.libagx);
1063    }
1064 
1065    /* Compile all variants up front */
1066    if (sw_stage == MESA_SHADER_GEOMETRY) {
1067       for (unsigned rast_disc = 0; rast_disc < 2; ++rast_disc) {
1068          struct hk_shader *count_variant = hk_count_gs_variant(obj, rast_disc);
1069          bool last = (rast_disc + 1) == 2;
1070 
1071          /* Each variant gets its own NIR. To save an extra clone, we use the
1072           * original NIR for the last stage.
1073           */
1074          nir_shader *clone = last ? nir : nir_shader_clone(NULL, nir);
1075 
1076          enum mesa_prim out_prim = MESA_PRIM_MAX;
1077          nir_shader *count = NULL, *rast = NULL, *pre_gs = NULL;
1078 
1079          NIR_PASS(_, clone, agx_nir_lower_gs, dev->dev.libagx, rast_disc,
1080                   &count, &rast, &pre_gs, &out_prim,
1081                   &count_variant->info.gs.count_words);
1082 
1083          if (!rast_disc) {
1084             struct hk_shader *shader = &obj->variants[HK_GS_VARIANT_RAST];
1085 
1086             hk_lower_hw_vs(rast, shader);
1087             shader->info.gs.out_prim = out_prim;
1088          }
1089 
1090          struct {
1091             nir_shader *in;
1092             struct hk_shader *out;
1093          } variants[] = {
1094             {clone, hk_main_gs_variant(obj, rast_disc)},
1095             {pre_gs, hk_pre_gs_variant(obj, rast_disc)},
1096             {count, count_variant},
1097             {rast_disc ? NULL : rast, &obj->variants[HK_GS_VARIANT_RAST]},
1098          };
1099 
1100          for (unsigned v = 0; v < ARRAY_SIZE(variants); ++v) {
1101             if (variants[v].in) {
1102                result = hk_compile_nir(dev, pAllocator, variants[v].in,
1103                                        info->flags, info->robustness, NULL,
1104                                        variants[v].out, sw_stage, true, NULL);
1105                if (result != VK_SUCCESS) {
1106                   hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator);
1107                   if (clone != nir) {
1108                      ralloc_free(nir);
1109                   }
1110 
1111                   ralloc_free(clone);
1112                   ralloc_free(pre_gs);
1113                   ralloc_free(count);
1114                   ralloc_free(rast);
1115                   return result;
1116                }
1117             }
1118          }
1119 
1120          /* Nothing consumes this, so throw it away.
1121           *
1122           * TODO: We should just not generate it.
1123           */
1124          if (rast_disc) {
1125             ralloc_free(rast);
1126          }
1127       }
1128    } else if (sw_stage == MESA_SHADER_VERTEX ||
1129               sw_stage == MESA_SHADER_TESS_EVAL) {
1130 
1131       VkShaderStageFlags next_stage = info->next_stage_mask;
1132 
1133       /* Transform feedback is layered on top of geometry shaders. If there is
1134        * not a geometry shader in the pipeline, we will compile a geometry
1135        * shader for the purpose. Update the next_stage mask accordingly.
1136        */
1137       if (nir->xfb_info != NULL) {
1138          next_stage |= VK_SHADER_STAGE_GEOMETRY_BIT;
1139       }
1140 
1141       if (sw_stage == MESA_SHADER_VERTEX) {
1142          assert(
1143             !(nir->info.inputs_read & BITFIELD64_MASK(VERT_ATTRIB_GENERIC0)) &&
1144             "Fixed-function attributes not used in Vulkan");
1145 
1146          NIR_PASS(_, nir, nir_recompute_io_bases, nir_var_shader_in);
1147       }
1148 
1149       /* the shader_out portion of this is load-bearing even for tess eval */
1150       NIR_PASS(_, nir, nir_io_add_const_offset_to_base,
1151                nir_var_shader_in | nir_var_shader_out);
1152 
1153       for (enum hk_vs_variant v = 0; v < HK_VS_VARIANTS; ++v) {
1154          /* Only compile the software variant if we might use this shader with
1155           * geometry/tessellation. We need to compile the hardware variant
1156           * unconditionally to handle the VS -> null FS case, which does not
1157           * require setting the FRAGMENT bit.
1158           */
1159          if (v == HK_VS_VARIANT_SW &&
1160              !(next_stage & (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT |
1161                              VK_SHADER_STAGE_GEOMETRY_BIT)))
1162             continue;
1163 
1164          struct hk_shader *shader = &obj->variants[v];
1165          bool hw = v == HK_VS_VARIANT_HW;
1166          bool last = (v + 1) == HK_VS_VARIANTS;
1167 
1168          /* Each variant gets its own NIR. To save an extra clone, we use the
1169           * original NIR for the last stage.
1170           */
1171          nir_shader *clone = last ? nir : nir_shader_clone(NULL, nir);
1172 
1173          if (sw_stage == MESA_SHADER_VERTEX) {
1174             NIR_PASS(_, clone, agx_nir_lower_vs_input_to_prolog,
1175                      shader->info.vs.attrib_components_read);
1176 
1177             shader->info.vs.attribs_read =
1178                nir->info.inputs_read >> VERT_ATTRIB_GENERIC0;
1179          }
1180 
1181          if (hw) {
1182             hk_lower_hw_vs(clone, shader);
1183          } else {
1184             NIR_PASS(_, clone, agx_nir_lower_vs_before_gs, dev->dev.libagx);
1185          }
1186 
1187          /* hk_compile_nir takes ownership of the clone */
1188          result = hk_compile_nir(dev, pAllocator, clone, info->flags,
1189                                  info->robustness, fs_key, shader, sw_stage, hw,
1190                                  nir->xfb_info);
1191          if (result != VK_SUCCESS) {
1192             hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator);
1193             ralloc_free(nir);
1194             return result;
1195          }
1196       }
1197    } else {
1198       struct hk_shader *shader = hk_only_variant(obj);
1199 
1200       /* hk_compile_nir takes ownership of nir */
1201       result =
1202          hk_compile_nir(dev, pAllocator, nir, info->flags, info->robustness,
1203                         fs_key, shader, sw_stage, true, NULL);
1204       if (result != VK_SUCCESS) {
1205          hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator);
1206          return result;
1207       }
1208    }
1209 
1210    *shader_out = obj;
1211    return VK_SUCCESS;
1212 }
1213 
1214 static VkResult
1215 hk_compile_shaders(struct vk_device *vk_dev, uint32_t shader_count,
1216                    struct vk_shader_compile_info *infos,
1217                    const struct vk_graphics_pipeline_state *state,
1218                    const VkAllocationCallbacks *pAllocator,
1219                    struct vk_shader **shaders_out)
1220 {
1221    struct hk_device *dev = container_of(vk_dev, struct hk_device, vk);
1222 
1223    for (uint32_t i = 0; i < shader_count; i++) {
1224       VkResult result =
1225          hk_compile_shader(dev, &infos[i], state, pAllocator,
1226                            (struct hk_api_shader **)&shaders_out[i]);
1227       if (result != VK_SUCCESS) {
1228          /* Clean up all the shaders before this point */
1229          for (uint32_t j = 0; j < i; j++)
1230             hk_api_shader_destroy(&dev->vk, shaders_out[j], pAllocator);
1231 
1232          /* Clean up all the NIR after this point */
1233          for (uint32_t j = i + 1; j < shader_count; j++)
1234             ralloc_free(infos[j].nir);
1235 
1236          /* Memset the output array */
1237          memset(shaders_out, 0, shader_count * sizeof(*shaders_out));
1238 
1239          return result;
1240       }
1241    }
1242 
1243    return VK_SUCCESS;
1244 }
1245 
1246 static VkResult
1247 hk_deserialize_shader(struct hk_device *dev, struct blob_reader *blob,
1248                       struct hk_shader *shader)
1249 {
1250    struct hk_shader_info info;
1251    blob_copy_bytes(blob, &info, sizeof(info));
1252 
1253    struct agx_shader_info b_info;
1254    blob_copy_bytes(blob, &b_info, sizeof(b_info));
1255 
1256    const uint32_t code_size = blob_read_uint32(blob);
1257    const uint32_t data_size = blob_read_uint32(blob);
1258    if (blob->overrun)
1259       return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
1260 
1261    VkResult result = hk_init_link_ht(shader, info.stage);
1262    if (result != VK_SUCCESS)
1263       return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
1264 
1265    simple_mtx_init(&shader->linked.lock, mtx_plain);
1266 
1267    shader->b.info = b_info;
1268    shader->info = info;
1269    shader->code_size = code_size;
1270    shader->data_size = data_size;
1271    shader->b.info.binary_size = code_size;
1272 
1273    shader->code_ptr = malloc(code_size);
1274    if (shader->code_ptr == NULL)
1275       return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
1276 
1277    shader->data_ptr = malloc(data_size);
1278    if (shader->data_ptr == NULL)
1279       return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
1280 
1281    blob_copy_bytes(blob, (void *)shader->code_ptr, shader->code_size);
1282    blob_copy_bytes(blob, (void *)shader->data_ptr, shader->data_size);
1283    if (blob->overrun)
1284       return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
1285 
1286    shader->b.binary = (void *)shader->code_ptr;
1287    hk_upload_shader(dev, shader);
1288    return VK_SUCCESS;
1289 }
1290 
1291 static VkResult
1292 hk_deserialize_api_shader(struct vk_device *vk_dev, struct blob_reader *blob,
1293                           uint32_t binary_version,
1294                           const VkAllocationCallbacks *pAllocator,
1295                           struct vk_shader **shader_out)
1296 {
1297    struct hk_device *dev = container_of(vk_dev, struct hk_device, vk);
1298 
1299    gl_shader_stage stage = blob_read_uint8(blob);
1300    if (blob->overrun)
1301       return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
1302 
1303    size_t size = sizeof(struct hk_api_shader) +
1304                  sizeof(struct hk_shader) * hk_num_variants(stage);
1305 
1306    struct hk_api_shader *obj =
1307       vk_shader_zalloc(&dev->vk, &hk_shader_ops, stage, pAllocator, size);
1308 
1309    if (obj == NULL)
1310       return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
1311 
1312    hk_foreach_variant(obj, shader) {
1313       VkResult result = hk_deserialize_shader(dev, blob, shader);
1314 
1315       if (result != VK_SUCCESS) {
1316          hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator);
1317          return result;
1318       }
1319    }
1320 
1321    *shader_out = &obj->vk;
1322    return VK_SUCCESS;
1323 }
1324 
1325 static void
1326 hk_shader_serialize(struct vk_device *vk_dev, const struct hk_shader *shader,
1327                     struct blob *blob)
1328 {
1329    blob_write_bytes(blob, &shader->info, sizeof(shader->info));
1330    blob_write_bytes(blob, &shader->b.info, sizeof(shader->b.info));
1331 
1332    blob_write_uint32(blob, shader->code_size);
1333    blob_write_uint32(blob, shader->data_size);
1334    blob_write_bytes(blob, shader->code_ptr, shader->code_size);
1335    blob_write_bytes(blob, shader->data_ptr, shader->data_size);
1336 }
1337 
1338 static bool
1339 hk_api_shader_serialize(struct vk_device *vk_dev,
1340                         const struct vk_shader *vk_shader, struct blob *blob)
1341 {
1342    struct hk_api_shader *obj =
1343       container_of(vk_shader, struct hk_api_shader, vk);
1344 
1345    blob_write_uint8(blob, vk_shader->stage);
1346 
1347    hk_foreach_variant(obj, shader) {
1348       hk_shader_serialize(vk_dev, shader, blob);
1349    }
1350 
1351    return !blob->out_of_memory;
1352 }
1353 
1354 #define WRITE_STR(field, ...)                                                  \
1355    ({                                                                          \
1356       memset(field, 0, sizeof(field));                                         \
1357       UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__);              \
1358       assert(i > 0 && i < sizeof(field));                                      \
1359    })
1360 
1361 static VkResult
1362 hk_shader_get_executable_properties(
1363    UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
1364    uint32_t *executable_count, VkPipelineExecutablePropertiesKHR *properties)
1365 {
1366    struct hk_api_shader *obj =
1367       container_of(vk_shader, struct hk_api_shader, vk);
1368 
1369    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, properties,
1370                           executable_count);
1371 
1372    vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props)
1373    {
1374       props->stages = mesa_to_vk_shader_stage(obj->vk.stage);
1375       props->subgroupSize = 32;
1376       WRITE_STR(props->name, "%s", _mesa_shader_stage_to_string(obj->vk.stage));
1377       WRITE_STR(props->description, "%s shader",
1378                 _mesa_shader_stage_to_string(obj->vk.stage));
1379    }
1380 
1381    return vk_outarray_status(&out);
1382 }
1383 
1384 static VkResult
1385 hk_shader_get_executable_statistics(
1386    UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
1387    uint32_t executable_index, uint32_t *statistic_count,
1388    VkPipelineExecutableStatisticKHR *statistics)
1389 {
1390    struct hk_api_shader *obj =
1391       container_of(vk_shader, struct hk_api_shader, vk);
1392 
1393    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, statistics,
1394                           statistic_count);
1395 
1396    assert(executable_index == 0);
1397 
1398    /* TODO: find a sane way to report multiple variants and have that play nice
1399     * with zink.
1400     */
1401    struct hk_shader *shader = hk_any_variant(obj);
1402 
1403    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat)
1404    {
1405       WRITE_STR(stat->name, "Code Size");
1406       WRITE_STR(stat->description,
1407                 "Size of the compiled shader binary, in bytes");
1408       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
1409       stat->value.u64 = shader->code_size;
1410    }
1411 
1412    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat)
1413    {
1414       WRITE_STR(stat->name, "Number of GPRs");
1415       WRITE_STR(stat->description, "Number of GPRs used by this pipeline");
1416       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
1417       stat->value.u64 = shader->b.info.nr_gprs;
1418    }
1419 
1420    return vk_outarray_status(&out);
1421 }
1422 
1423 static bool
1424 write_ir_text(VkPipelineExecutableInternalRepresentationKHR *ir,
1425               const char *data)
1426 {
1427    ir->isText = VK_TRUE;
1428 
1429    size_t data_len = strlen(data) + 1;
1430 
1431    if (ir->pData == NULL) {
1432       ir->dataSize = data_len;
1433       return true;
1434    }
1435 
1436    strncpy(ir->pData, data, ir->dataSize);
1437    if (ir->dataSize < data_len)
1438       return false;
1439 
1440    ir->dataSize = data_len;
1441    return true;
1442 }
1443 
1444 static VkResult
1445 hk_shader_get_executable_internal_representations(
1446    UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
1447    uint32_t executable_index, uint32_t *internal_representation_count,
1448    VkPipelineExecutableInternalRepresentationKHR *internal_representations)
1449 {
1450    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
1451                           internal_representations,
1452                           internal_representation_count);
1453    bool incomplete_text = false;
1454 
1455    assert(executable_index == 0);
1456 
1457    /* TODO */
1458 #if 0
1459    vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
1460       WRITE_STR(ir->name, "AGX assembly");
1461       WRITE_STR(ir->description, "AGX assembly");
1462       if (!write_ir_text(ir, TODO))
1463          incomplete_text = true;
1464    }
1465 #endif
1466 
1467    return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
1468 }
1469 
1470 static const struct vk_shader_ops hk_shader_ops = {
1471    .destroy = hk_api_shader_destroy,
1472    .serialize = hk_api_shader_serialize,
1473    .get_executable_properties = hk_shader_get_executable_properties,
1474    .get_executable_statistics = hk_shader_get_executable_statistics,
1475    .get_executable_internal_representations =
1476       hk_shader_get_executable_internal_representations,
1477 };
1478 
1479 const struct vk_device_shader_ops hk_device_shader_ops = {
1480    .get_nir_options = hk_get_nir_options,
1481    .get_spirv_options = hk_get_spirv_options,
1482    .preprocess_nir = hk_preprocess_nir,
1483    .hash_graphics_state = hk_hash_graphics_state,
1484    .compile = hk_compile_shaders,
1485    .deserialize = hk_deserialize_api_shader,
1486    .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
1487    .cmd_bind_shaders = hk_cmd_bind_shaders,
1488 };
1489 
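/* Fast-link a main shader with optional prolog/epilog parts, then bake the
 * USC words (rodata immediates, image heap uniform, txf sampler, shader,
 * register and preshader state) needed to bind the linked program.
 */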
1490 struct hk_linked_shader *
1491 hk_fast_link(struct hk_device *dev, bool fragment, struct hk_shader *main,
1492              struct agx_shader_part *prolog, struct agx_shader_part *epilog,
1493              unsigned nr_samples_shaded)
1494 {
1495    struct hk_linked_shader *s = rzalloc(NULL, struct hk_linked_shader);
1496    agx_fast_link(&s->b, &dev->dev, fragment, &main->b, prolog, epilog,
1497                  nr_samples_shaded);
1498 
1499    if (fragment) {
1500       agx_pack(&s->fs_counts, FRAGMENT_SHADER_WORD_0, cfg) {
1501          cfg.cf_binding_count = s->b.cf.nr_bindings;
1502          cfg.uniform_register_count = main->b.info.push_count;
1503          cfg.preshader_register_count = main->b.info.nr_preamble_gprs;
1504          cfg.sampler_state_register_count =
1505             agx_translate_sampler_state_count(s->b.uses_txf ? 1 : 0, false);
1506       }
1507    }
1508 
1509    /* Now that we've linked, bake the USC words to bind this program */
1510    struct agx_usc_builder b = agx_usc_builder(s->usc.data, sizeof(s->usc.data));
1511 
1512    if (main && main->b.info.rodata.size_16) {
1513       agx_usc_immediates(&b, &main->b.info.rodata, main->bo->va->addr);
1514    }
1515 
1516    agx_usc_push_packed(&b, UNIFORM, dev->rodata.image_heap);
1517 
1518    if (s->b.uses_txf)
1519       agx_usc_push_packed(&b, SAMPLER, dev->dev.txf_sampler);
1520 
1521    agx_usc_shared_non_fragment(&b, &main->b.info, 0);
1522    agx_usc_push_packed(&b, SHADER, s->b.shader);
1523    agx_usc_push_packed(&b, REGISTERS, s->b.regs);
1524 
1525    if (fragment)
1526       agx_usc_push_packed(&b, FRAGMENT_PROPERTIES, s->b.fragment_props);
1527 
1528    if (main && main->b.info.has_preamble) {
1529       agx_usc_pack(&b, PRESHADER, cfg) {
1530          cfg.code = agx_usc_addr(&dev->dev, main->preamble_addr);
1531       }
1532    } else {
1533       agx_usc_pack(&b, NO_PRESHADER, cfg)
1534          ;
1535    }
1536 
1537    s->usc.size = b.head - s->usc.data;
1538    return s;
1539 }
1540