/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "hk_shader.h"

#include "agx_debug.h"
#include "agx_device.h"
#include "agx_helpers.h"
#include "agx_nir_lower_gs.h"
#include "glsl_types.h"
#include "nir.h"
#include "nir_builder.h"

#include "agx_bo.h"
#include "hk_cmd_buffer.h"
#include "hk_descriptor_set_layout.h"
#include "hk_device.h"
#include "hk_physical_device.h"
#include "hk_sampler.h"
#include "hk_shader.h"

#include "nir_builder_opcodes.h"
#include "nir_builtin_builder.h"
#include "nir_intrinsics.h"
#include "nir_intrinsics_indices.h"
#include "nir_xfb_info.h"
#include "shader_enums.h"
#include "vk_nir_convert_ycbcr.h"
#include "vk_pipeline.h"
#include "vk_pipeline_layout.h"
#include "vk_shader.h"
#include "vk_shader_module.h"
#include "vk_ycbcr_conversion.h"

#include "asahi/compiler/agx_compile.h"
#include "asahi/compiler/agx_nir.h"
#include "asahi/compiler/agx_nir_texture.h"
#include "asahi/lib/agx_abi.h"
#include "asahi/lib/agx_linker.h"
#include "asahi/lib/agx_tilebuffer.h"
#include "asahi/lib/agx_uvs.h"
#include "compiler/spirv/nir_spirv.h"

#include "util/blob.h"
#include "util/hash_table.h"
#include "util/macros.h"
#include "util/mesa-sha1.h"
#include "util/simple_mtx.h"
#include "util/u_debug.h"
#include "vulkan/vulkan_core.h"

struct hk_fs_key {
   bool zs_self_dep;

   /** True if sample shading is forced on via an API knob such as
    * VkPipelineMultisampleStateCreateInfo::minSampleShading
    */
   bool force_sample_shading;

   uint8_t pad[2];
};
static_assert(sizeof(struct hk_fs_key) == 4, "packed");

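/* Size/alignment callback for nir_lower_vars_to_explicit_types: shared
 * variables get natural component alignment, with booleans stored as 32-bit.
 */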
static void
shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
   assert(glsl_type_is_vector_or_scalar(type));

   uint32_t comp_size =
      glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
   unsigned length = glsl_get_vector_elements(type);
   *size = comp_size * length, *align = comp_size;
}

uint64_t
hk_physical_device_compiler_flags(const struct hk_physical_device *pdev)
{
   /* This could be optimized but it doesn't matter */
   return pdev->dev.debug;
}

const nir_shader_compiler_options *
hk_get_nir_options(struct vk_physical_device *vk_pdev, gl_shader_stage stage,
                   UNUSED const struct vk_pipeline_robustness_state *rs)
{
   return &agx_nir_options;
}

static struct spirv_to_nir_options
hk_get_spirv_options(struct vk_physical_device *vk_pdev,
                     UNUSED gl_shader_stage stage,
                     const struct vk_pipeline_robustness_state *rs)
{
   return (struct spirv_to_nir_options){
      .ssbo_addr_format = hk_buffer_addr_format(rs->storage_buffers),
      .phys_ssbo_addr_format = nir_address_format_64bit_global,
      .ubo_addr_format = hk_buffer_addr_format(rs->uniform_buffers),
      .shared_addr_format = nir_address_format_32bit_offset,
      .min_ssbo_alignment = HK_MIN_SSBO_ALIGNMENT,
      .min_ubo_alignment = HK_MIN_UBO_ALIGNMENT,
   };
}

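/* Rewrite nir_jump_halt into nir_jump_return in the entrypoint so that
 * nir_lower_returns can lower it to structured control flow.
 */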
static bool
lower_halt_to_return(nir_builder *b, nir_instr *instr, UNUSED void *_data)
{
   if (instr->type != nir_instr_type_jump)
      return false;

   nir_jump_instr *jump = nir_instr_as_jump(instr);
   if (jump->type != nir_jump_halt)
      return false;

   assert(b->impl == nir_shader_get_entrypoint(b->shader));
   jump->type = nir_jump_return;
   return true;
}

void
hk_preprocess_nir_internal(struct vk_physical_device *vk_pdev, nir_shader *nir)
{
   /* Must lower before io to temps */
   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS(_, nir, nir_lower_terminate_to_demote);
      NIR_PASS(_, nir, nir_shader_instructions_pass, lower_halt_to_return,
               nir_metadata_all, NULL);
      NIR_PASS(_, nir, nir_lower_returns);
   }

   /* Unroll loops before lowering indirects via nir_lower_io_to_temporaries */
   UNUSED bool progress = false;
   NIR_PASS(_, nir, nir_lower_global_vars_to_local);

   do {
      progress = false;
      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_dce);
      NIR_PASS(progress, nir, nir_opt_constant_folding);
      NIR_PASS(progress, nir, nir_opt_loop);
      NIR_PASS(progress, nir, nir_opt_loop_unroll);
   } while (progress);

   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      struct nir_lower_sysvals_to_varyings_options sysvals_opts = {
         .point_coord = true,
      };

      nir_lower_sysvals_to_varyings(nir, &sysvals_opts);
   }

   NIR_PASS(_, nir, nir_lower_system_values);

   /* Gather info before preprocess_nir but after some general lowering, so
    * inputs_read and system_values_read are accurately set.
    */
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir),
              true, false);

   NIR_PASS(_, nir, nir_lower_global_vars_to_local);

   NIR_PASS(_, nir, nir_split_var_copies);
   NIR_PASS(_, nir, nir_split_struct_vars, nir_var_function_temp);

   /* Optimize but allow copies because we haven't lowered them yet */
   agx_preprocess_nir(nir, NULL);

   NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
   NIR_PASS(_, nir, nir_lower_var_copies);
}

static void
hk_preprocess_nir(struct vk_physical_device *vk_pdev, nir_shader *nir)
{
   hk_preprocess_nir_internal(vk_pdev, nir);
   nir_lower_compute_system_values_options csv_options = {
      .has_base_workgroup_id = true,
   };
   NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options);
}

static void
hk_populate_fs_key(struct hk_fs_key *key,
                   const struct vk_graphics_pipeline_state *state)
{
   memset(key, 0, sizeof(*key));

   if (state == NULL)
      return;

   if (state->pipeline_flags &
       VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT)
      key->zs_self_dep = true;

   /* We force per-sample interpolation whenever sampleShadingEnable is set
    * regardless of minSampleShading or rasterizationSamples.
    *
    * When sampleShadingEnable is set, few guarantees are made about the
    * location of interpolation of the inputs. The only real guarantees are
    * that the inputs are interpolated within the pixel and that you get at
    * least `rasterizationSamples * minSampleShading` unique positions.
    * Importantly, it does not require that when `rasterizationSamples *
    * minSampleShading <= 1.0` that those positions are at the fragment
    * center. Therefore, it's valid to just always do per-sample all the time.
    *
    * The one caveat here is that we have to be careful about gl_SampleMaskIn.
    * When `hk_fs_key::force_sample_shading = true` we also turn any reads of
    * gl_SampleMaskIn into `1 << gl_SampleID` because the hardware sample mask
    * is actually per-fragment, not per-pass. We handle this by smashing
    * minSampleShading to 1.0 whenever gl_SampleMaskIn is read.
    */
   const struct vk_multisample_state *ms = state->ms;
   if (ms != NULL && ms->sample_shading_enable)
      key->force_sample_shading = true;
}

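/* Hash the graphics state that affects shader compilation (currently just the
 * fragment key and whether multiview is enabled) for the shader cache.
 */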
static void
hk_hash_graphics_state(struct vk_physical_device *device,
                       const struct vk_graphics_pipeline_state *state,
                       VkShaderStageFlags stages, blake3_hash blake3_out)
{
   struct mesa_blake3 blake3_ctx;
   _mesa_blake3_init(&blake3_ctx);
   if (stages & VK_SHADER_STAGE_FRAGMENT_BIT) {
      struct hk_fs_key key;
      hk_populate_fs_key(&key, state);
      _mesa_blake3_update(&blake3_ctx, &key, sizeof(key));

      const bool is_multiview = state->rp->view_mask != 0;
      _mesa_blake3_update(&blake3_ctx, &is_multiview, sizeof(is_multiview));
   }
   _mesa_blake3_final(&blake3_ctx, blake3_out);
}

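/* Select between loaded data and zero based on a bounds check. Scalar 32-bit
 * loads can use nir_bounds_agx; other shapes fall back to a bcsel against
 * zero.
 */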
static nir_def *
bounds_check(nir_builder *b, nir_def *data, nir_def *offs, nir_def *bound)
{
   if (data->bit_size == 32 && data->num_components == 1) {
      return nir_bounds_agx(b, data, offs, bound);
   } else {
      /* TODO: Optimize */
      return nir_bcsel(b, nir_uge(b, bound, offs), data,
                       nir_imm_zero(b, data->num_components, data->bit_size));
   }
}

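/* Lower load_global_constant_offset/bounded to a plain global load plus a
 * bounds check. With soft fault we can load speculatively and zero the result
 * afterwards; without it we must branch around the load.
 */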
static bool
lower_load_global_constant_offset_instr(nir_builder *b,
                                        nir_intrinsic_instr *intrin, void *data)
{
   if (intrin->intrinsic != nir_intrinsic_load_global_constant_offset &&
       intrin->intrinsic != nir_intrinsic_load_global_constant_bounded)
      return false;

   b->cursor = nir_before_instr(&intrin->instr);
   bool *has_soft_fault = data;

   nir_def *base_addr = intrin->src[0].ssa;
   nir_def *offset = intrin->src[1].ssa;
   nir_def *bound = NULL;
   nir_def *zero = NULL;

   unsigned bit_size = intrin->def.bit_size;
   assert(bit_size >= 8 && bit_size % 8 == 0);
   unsigned byte_size = bit_size / 8;
   unsigned load_size = byte_size * intrin->num_components;

   if (intrin->intrinsic == nir_intrinsic_load_global_constant_bounded) {
      bound = intrin->src[2].ssa;
      zero = nir_imm_zero(b, intrin->num_components, bit_size);

      nir_def *sat_offset =
         nir_umin(b, offset, nir_imm_int(b, UINT32_MAX - (load_size - 1)));
      nir_def *in_bounds =
         nir_ilt(b, nir_iadd_imm(b, sat_offset, load_size - 1), bound);

      /* If we do not have soft fault, we branch on the bounds check. This is
       * slow, but fortunately we always have soft fault on release drivers.
       *
       * With soft fault, we speculatively load and smash to zero at the end.
       */
      if (!(*has_soft_fault))
         nir_push_if(b, in_bounds);
   }

   unsigned align_mul = nir_intrinsic_align_mul(intrin);
   unsigned align_offset = nir_intrinsic_align_offset(intrin);

   nir_def *val = nir_build_load_global_constant(
      b, intrin->def.num_components, intrin->def.bit_size,
      nir_iadd(b, base_addr, nir_u2u64(b, offset)), .align_mul = align_mul,
      .align_offset = align_offset, .access = nir_intrinsic_access(intrin));

   if (intrin->intrinsic == nir_intrinsic_load_global_constant_bounded) {
      if (*has_soft_fault) {
         nir_scalar offs = nir_scalar_resolved(offset, 0);
         if (nir_scalar_is_const(offs)) {
            /* Calculate last byte loaded */
            unsigned offs_imm = nir_scalar_as_uint(offs) + load_size;

            /* Simplify the bounds check. Uniform buffers are bounds checked at
             * 64B granularity, so `bound` is a multiple of K = 64. Then
             *
             *    offs_imm < bound <==> round_down(offs_imm, K) < bound. Proof:
             *
             *    "=>" round_down(offs_imm, K) <= offs_imm < bound.
             *
             *    "<=" Let a, b be integers s.t. offs_imm = K a + b with b < K.
             *         Note round_down(offs_imm, K) = Ka.
             *
             *         Let c be an integer s.t. bound = Kc.
             *         We have Ka < Kc => a < c.
             *         b < K => Ka + b < K(a + 1).
             *
             *         a < c with integers => a + 1 <= c.
             *         offs_imm < K(a + 1) <= Kc = bound.
             *         Hence offs_imm < bound.
             */
            assert(align_mul == 64);
            offs_imm &= ~(align_mul - 1);

            /* Bounds checks are `offset > bound ? 0 : val` so if offset = 0,
             * the bounds check is useless.
             */
            if (offs_imm) {
               val = bounds_check(b, val, nir_imm_int(b, offs_imm), bound);
            }
         } else {
            offset = nir_iadd_imm(b, offset, load_size);
            val = bounds_check(b, val, offset, bound);
         }

      } else {
         nir_pop_if(b, NULL);
         val = nir_if_phi(b, val, zero);
      }
   }

   nir_def_replace(&intrin->def, val);
   return true;
}

struct lower_ycbcr_state {
   uint32_t set_layout_count;
   struct vk_descriptor_set_layout *const *set_layouts;
};

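/* Resolve a (set, binding, array index) triple to the YCbCr conversion state
 * baked into the corresponding immutable sampler, if any.
 */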
static const struct vk_ycbcr_conversion_state *
lookup_ycbcr_conversion(const void *_state, uint32_t set, uint32_t binding,
                        uint32_t array_index)
{
   const struct lower_ycbcr_state *state = _state;
   assert(set < state->set_layout_count);
   assert(state->set_layouts[set] != NULL);
   const struct hk_descriptor_set_layout *set_layout =
      vk_to_hk_descriptor_set_layout(state->set_layouts[set]);
   assert(binding < set_layout->binding_count);

   const struct hk_descriptor_set_binding_layout *bind_layout =
      &set_layout->binding[binding];

   if (bind_layout->immutable_samplers == NULL)
      return NULL;

   array_index = MIN2(array_index, bind_layout->array_size - 1);

   const struct hk_sampler *sampler =
      bind_layout->immutable_samplers[array_index];

   return sampler && sampler->vk.ycbcr_conversion
             ? &sampler->vk.ycbcr_conversion->state
             : NULL;
}

static int
glsl_type_size(const struct glsl_type *type, bool bindless)
{
   return glsl_count_attribute_slots(type, false);
}

/*
 * This is the world's worst multiview implementation. We simply duplicate each
 * draw on the CPU side, changing a uniform in between, and then plumb the view
 * index into the layer ID here. Whatever, it works.
 *
 * The "proper" implementation on AGX would use vertex amplification, but a
 * MacBook is not a VR headset.
 */
static void
hk_lower_multiview(nir_shader *nir)
{
   /* If there's an existing layer ID write, ignore it. This avoids validation
    * splat with vk_meta.
    */
   nir_variable *existing = nir_find_variable_with_location(
      nir, nir_var_shader_out, VARYING_SLOT_LAYER);

   if (existing) {
      existing->data.mode = nir_var_shader_temp;
      existing->data.location = 0;
      nir_fixup_deref_modes(nir);
   }

   /* Now write the view index as the layer */
   nir_builder b =
      nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(nir)));

   nir_variable *layer =
      nir_variable_create(nir, nir_var_shader_out, glsl_uint_type(), NULL);

   layer->data.location = VARYING_SLOT_LAYER;

   nir_store_var(&b, layer, nir_load_view_index(&b), nir_component_mask(1));
   b.shader->info.outputs_written |= VARYING_BIT_LAYER;
}

/*
 * KHR_maintenance5 requires that points rasterize with a default point size of
 * 1.0, while our hardware requires an explicit point size write for this.
 * Since topology may be dynamic, we insert an unconditional write if necessary.
 */
static bool
hk_nir_insert_psiz_write(nir_shader *nir)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   if (nir->info.outputs_written & VARYING_BIT_PSIZ) {
      nir_metadata_preserve(impl, nir_metadata_all);
      return false;
   }

   nir_builder b = nir_builder_at(nir_after_impl(impl));

   nir_store_output(&b, nir_imm_float(&b, 1.0), nir_imm_int(&b, 0),
                    .write_mask = nir_component_mask(1),
                    .io_semantics.location = VARYING_SLOT_PSIZ,
                    .io_semantics.num_slots = 1, .src_type = nir_type_float32);

   nir->info.outputs_written |= VARYING_BIT_PSIZ;
   nir_metadata_preserve(b.impl, nir_metadata_control_flow);
   return true;
}

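/* Custom border colours are emulated: sample once with the border clamped to
 * 1 and, when the sampler carries a custom border, again with the border
 * clamped to 0, then reconstruct the result from the two samples and the
 * queried border colour.
 */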
static nir_def *
query_custom_border(nir_builder *b, nir_tex_instr *tex)
{
   return nir_build_texture_query(b, tex, nir_texop_custom_border_color_agx, 4,
                                  tex->dest_type, false, false);
}

static nir_def *
has_custom_border(nir_builder *b, nir_tex_instr *tex)
{
   return nir_build_texture_query(b, tex, nir_texop_has_custom_border_color_agx,
                                  1, nir_type_bool1, false, false);
}

static bool
lower(nir_builder *b, nir_instr *instr, UNUSED void *_data)
{
   if (instr->type != nir_instr_type_tex)
      return false;

   nir_tex_instr *tex = nir_instr_as_tex(instr);
   if (!nir_tex_instr_need_sampler(tex) || nir_tex_instr_is_query(tex))
      return false;

   /* XXX: this is a really weird edge case, is this even well-defined? */
   if (tex->is_shadow)
      return false;

   b->cursor = nir_after_instr(&tex->instr);
   nir_def *has_custom = has_custom_border(b, tex);

   nir_instr *orig = nir_instr_clone(b->shader, &tex->instr);
   nir_builder_instr_insert(b, orig);
   nir_def *clamp_to_1 = &nir_instr_as_tex(orig)->def;

   nir_push_if(b, has_custom);
   nir_def *replaced = NULL;
   {
      /* Sample again, this time with clamp-to-0 instead of clamp-to-1 */
      nir_instr *clone_instr = nir_instr_clone(b->shader, &tex->instr);
      nir_builder_instr_insert(b, clone_instr);

      nir_tex_instr *tex_0 = nir_instr_as_tex(clone_instr);
      nir_def *clamp_to_0 = &tex_0->def;

      tex_0->backend_flags |= AGX_TEXTURE_FLAG_CLAMP_TO_0;

      /* Grab the border colour */
      nir_def *border = query_custom_border(b, tex_0);

      if (tex->op == nir_texop_tg4) {
         border = nir_replicate(b, nir_channel(b, border, tex->component), 4);
      }

      /* Combine together with the border */
      if (nir_alu_type_get_base_type(tex->dest_type) == nir_type_float &&
          tex->op != nir_texop_tg4) {

         /* For floats, lerp together:
          *
          * For border texels:  (1 * border) + (0 * (1 - border)) = border
          * For regular texels: (x * border) + (x * (1 - border)) = x.
          *
          * Linear filtering is linear (duh), so lerping is compatible.
          */
         replaced = nir_flrp(b, clamp_to_0, clamp_to_1, border);
      } else {
         /* For integers, just select componentwise since there is no linear
          * filtering. Gathers also use this path since they are unfiltered in
          * each component.
          */
         replaced = nir_bcsel(b, nir_ieq(b, clamp_to_0, clamp_to_1), clamp_to_0,
                              border);
      }
   }
   nir_pop_if(b, NULL);

   /* Put it together with a phi */
   nir_def *phi = nir_if_phi(b, replaced, clamp_to_1);
   nir_def_replace(&tex->def, phi);
   return true;
}

static bool
agx_nir_lower_custom_border(nir_shader *nir)
{
   return nir_shader_instructions_pass(nir, lower, nir_metadata_none, NULL);
}

/*
 * In Vulkan, the VIEWPORT should read 0 in the fragment shader if it is not
 * written by the vertex shader, but in our implementation, the varying would
 * otherwise be undefined. This small pass predicates VIEWPORT reads based on
 * whether the hardware vertex shader writes the VIEWPORT (nonzero UVS index).
 */
static bool
lower_viewport_fs(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data)
{
   if (intr->intrinsic != nir_intrinsic_load_input)
      return false;

   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
   if (sem.location != VARYING_SLOT_VIEWPORT)
      return false;

   b->cursor = nir_after_instr(&intr->instr);
   nir_def *orig = &intr->def;

   nir_def *uvs = nir_load_uvs_index_agx(b, .io_semantics = sem);
   nir_def *def = nir_bcsel(b, nir_ine_imm(b, uvs, 0), orig, nir_imm_int(b, 0));

   nir_def_rewrite_uses_after(orig, def, def->parent_instr);
   return true;
}

static bool
lower_subpass_dim(nir_builder *b, nir_instr *instr, UNUSED void *_data)
{
   if (instr->type != nir_instr_type_tex)
      return false;

   nir_tex_instr *tex = nir_instr_as_tex(instr);
   if (tex->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS)
      tex->sampler_dim = GLSL_SAMPLER_DIM_2D;
   else if (tex->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS)
      tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
   else
      return false;

   return true;
}

static bool
should_lower_robust(const nir_intrinsic_instr *intr, const void *_)
{
   /* The hardware is robust, but our software image atomics are not. Unlike the
    * GL driver, we don't use the common buffer image lowering; we rely on the
    * agx_nir_lower_texture lowering for robustImageAccess2 semantics.
    */
   return intr->intrinsic == nir_intrinsic_image_deref_atomic ||
          intr->intrinsic == nir_intrinsic_image_deref_atomic_swap;
}

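/* Vulkan-specific NIR lowering shared by all stages: descriptors, buffer
 * address formats, YCbCr, robustness, multiview, and shared memory layout,
 * followed by a round of optimization before bounds checks are inserted.
 */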
void
hk_lower_nir(struct hk_device *dev, nir_shader *nir,
             const struct vk_pipeline_robustness_state *rs, bool is_multiview,
             uint32_t set_layout_count,
             struct vk_descriptor_set_layout *const *set_layouts)
{
   if (HK_PERF(dev, NOROBUST)) {
      rs = &vk_robustness_disabled;
   }

   const nir_opt_access_options access_options = {
      .is_vulkan = true,
   };
   NIR_PASS_V(nir, nir_opt_access, &access_options);

   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS(_, nir, nir_lower_input_attachments,
               &(nir_input_attachment_options){
                  .use_fragcoord_sysval = true,
                  .use_layer_id_sysval = true,
                  .use_view_id_for_layer = is_multiview,
               });

      NIR_PASS(_, nir, nir_shader_instructions_pass, lower_subpass_dim,
               nir_metadata_all, NULL);
      NIR_PASS(_, nir, nir_lower_wpos_center);
   }

   /* XXX: should be last geometry stage, how do I get to that? */
   if (nir->info.stage == MESA_SHADER_VERTEX) {
      if (is_multiview)
         hk_lower_multiview(nir);
   }

   if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
      NIR_PASS(_, nir, nir_lower_patch_vertices,
               nir->info.tess.tcs_vertices_out, NULL);
   }

   const struct lower_ycbcr_state ycbcr_state = {
      .set_layout_count = set_layout_count,
      .set_layouts = set_layouts,
   };
   NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex, lookup_ycbcr_conversion,
            &ycbcr_state);

   /* Lower push constants before lower_descriptors */
   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const,
            nir_address_format_32bit_offset);

   // NIR_PASS(_, nir, nir_opt_large_constants, NULL, 32);

   /* Turn cache flushes into image coherency bits while we still have derefs */
   NIR_PASS(_, nir, nir_lower_memory_model);

   NIR_PASS(_, nir, nir_lower_robust_access, should_lower_robust, NULL);

   /* We must do early lowering before hk_nir_lower_descriptors, since this will
    * create lod_bias_agx instructions.
    */
   NIR_PASS(_, nir, agx_nir_lower_texture_early, true /* support_lod_bias */);

   if (!HK_PERF(dev, NOBORDER)) {
      NIR_PASS(_, nir, agx_nir_lower_custom_border);
   }

   NIR_PASS(_, nir, hk_nir_lower_descriptors, rs, set_layout_count,
            set_layouts);
   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global,
            nir_address_format_64bit_global);
   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo,
            hk_buffer_addr_format(rs->storage_buffers));
   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
            hk_buffer_addr_format(rs->uniform_buffers));

   /* Before inserting bounds checks, we want to do a fair bit of optimization.
    * lower_load_global_constant_offset_instr has special optimizations for
    * constant offsets, so we want as many offsets to be constant as possible.
    */
   bool progress;
   do {
      progress = false;
      NIR_PASS(progress, nir, nir_opt_constant_folding);
      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_dce);
   } while (progress);

   bool soft_fault = agx_has_soft_fault(&dev->dev);
   NIR_PASS(_, nir, nir_shader_intrinsics_pass,
            lower_load_global_constant_offset_instr, nir_metadata_none,
            &soft_fault);

   if (!nir->info.shared_memory_explicit_layout) {
      /* There may be garbage in shared_size, but it's the job of
       * nir_lower_vars_to_explicit_types to allocate it. We have to reset to
       * avoid overallocation.
       */
      nir->info.shared_size = 0;

      NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared,
               shared_var_info);
   }
   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared,
            nir_address_format_32bit_offset);

   if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
      /* Align everything up to 16B so we can write whole vec4s. */
      nir->info.shared_size = align(nir->info.shared_size, 16);
      NIR_PASS(_, nir, nir_zero_initialize_shared_memory, nir->info.shared_size,
               16);

      /* We need to call lower_compute_system_values again because
       * nir_zero_initialize_shared_memory generates load_invocation_id which
       * has to be lowered to load_invocation_index.
       */
      NIR_PASS(_, nir, nir_lower_compute_system_values, NULL);
   }

   /* TODO: we can do indirect VS output */
   nir_variable_mode lower_indirect_modes = 0;
   if (nir->info.stage == MESA_SHADER_FRAGMENT)
      lower_indirect_modes |= nir_var_shader_out;
   else if (nir->info.stage == MESA_SHADER_VERTEX)
      lower_indirect_modes |= nir_var_shader_in | nir_var_shader_out;

   NIR_PASS(_, nir, nir_lower_indirect_derefs, lower_indirect_modes,
            UINT32_MAX);

   NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
            glsl_type_size,
            nir_lower_io_lower_64bit_to_32 |
               nir_lower_io_use_interpolated_input_intrinsics);

   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_viewport_fs,
               nir_metadata_control_flow, NULL);
   }

   NIR_PASS(_, nir, agx_nir_lower_texture);
   NIR_PASS(_, nir, agx_nir_lower_multisampled_image_store);

   agx_preprocess_nir(nir, dev->dev.libagx);
   NIR_PASS(_, nir, nir_opt_conditional_discard);
   NIR_PASS(_, nir, nir_opt_if,
            nir_opt_if_optimize_phi_true_false | nir_opt_if_avoid_64bit_phis);
}

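/* Finish a compiled shader: upload the binary if it has a preamble or rodata,
 * eagerly link the sole variant when no variant hash table exists, and pack
 * the hardware COUNTS (and fragment face) words.
 */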
static void
hk_upload_shader(struct hk_device *dev, struct hk_shader *shader)
{
   if (shader->b.info.has_preamble || shader->b.info.rodata.size_16) {
      /* TODO: Do we want to compact? Revisit when we rework prolog/epilogs. */
      size_t size = shader->b.info.binary_size;
      assert(size > 0);

      shader->bo = agx_bo_create(&dev->dev, size, 0,
                                 AGX_BO_EXEC | AGX_BO_LOW_VA, "Preamble");
      memcpy(agx_bo_map(shader->bo), shader->b.binary, size);
      shader->preamble_addr =
         shader->bo->va->addr + shader->b.info.preamble_offset;
   }

   if (!shader->linked.ht) {
      /* If we only have a single variant, link now. */
      shader->only_linked = hk_fast_link(dev, false, shader, NULL, NULL, 0);
   }

   if (shader->info.stage == MESA_SHADER_FRAGMENT) {
      agx_pack_fragment_face_2(&shader->frag_face, 0, &shader->b.info);
   }

   agx_pack(&shader->counts, COUNTS, cfg) {
      cfg.uniform_register_count = shader->b.info.push_count;
      cfg.preshader_register_count = shader->b.info.nr_preamble_gprs;
      cfg.sampler_state_register_count = agx_translate_sampler_state_count(
         shader->b.info.uses_txf ? 1 : 0, false);
   }
}

DERIVE_HASH_TABLE(hk_fast_link_key_vs);
DERIVE_HASH_TABLE(hk_fast_link_key_fs);

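/* Vertex and fragment shaders are linked lazily against per-state prologs and
 * epilogs, so they need a hash table of linked variants; other stages link a
 * single variant up front.
 */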
static VkResult
hk_init_link_ht(struct hk_shader *shader, gl_shader_stage sw_stage)
{
   simple_mtx_init(&shader->linked.lock, mtx_plain);

   bool multiple_variants =
      sw_stage == MESA_SHADER_VERTEX || sw_stage == MESA_SHADER_FRAGMENT;

   if (!multiple_variants)
      return VK_SUCCESS;

   if (sw_stage == MESA_SHADER_VERTEX)
      shader->linked.ht = hk_fast_link_key_vs_table_create(NULL);
   else
      shader->linked.ht = hk_fast_link_key_fs_table_create(NULL);

   return (shader->linked.ht == NULL) ? VK_ERROR_OUT_OF_HOST_MEMORY
                                      : VK_SUCCESS;
}

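/* Compile lowered NIR for a single hk_shader variant: apply the remaining
 * stage-specific lowering, run the AGX backend compiler, record the shader
 * info needed at draw time, and upload the result. Takes ownership of nir.
 */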
static VkResult
hk_compile_nir(struct hk_device *dev, const VkAllocationCallbacks *pAllocator,
               nir_shader *nir, VkShaderCreateFlagsEXT shader_flags,
               const struct vk_pipeline_robustness_state *rs,
               const struct hk_fs_key *fs_key, struct hk_shader *shader,
               gl_shader_stage sw_stage, bool hw, nir_xfb_info *xfb_info)
{
   unsigned vs_uniform_base = 0;

   /* For now, only shader objects are supported */
   if (sw_stage == MESA_SHADER_VERTEX) {
      vs_uniform_base =
         6 * DIV_ROUND_UP(
                BITSET_LAST_BIT(shader->info.vs.attrib_components_read), 4);
   } else if (sw_stage == MESA_SHADER_FRAGMENT) {
      shader->info.fs.interp = agx_gather_interp_info(nir);
      shader->info.fs.writes_memory = nir->info.writes_memory;

      /* Discards must be lowered before lowering MSAA so discards are handled
       * correctly.
       */
      NIR_PASS(_, nir, agx_nir_lower_discard_zs_emit);
      NIR_PASS(_, nir, agx_nir_lower_fs_output_to_epilog,
               &shader->info.fs.epilog_key);
      NIR_PASS(_, nir, agx_nir_lower_sample_mask);

      if (nir->info.fs.uses_sample_shading) {
         /* Ensure the sample mask is preserved in register */
         nir_builder b =
            nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(nir)));

         nir_def *mask =
            nir_load_exported_agx(&b, 1, 16, .base = AGX_ABI_FIN_SAMPLE_MASK);

         nir_export_agx(&b, mask, .base = AGX_ABI_FOUT_SAMPLE_MASK);

         NIR_PASS(_, nir, agx_nir_lower_to_per_sample);
      }

      NIR_PASS(_, nir, agx_nir_lower_fs_active_samples_to_register);
      NIR_PASS(_, nir, agx_nir_lower_interpolation);
   } else if (sw_stage == MESA_SHADER_TESS_EVAL ||
              sw_stage == MESA_SHADER_TESS_CTRL) {

      shader->info.tess.info.ccw = nir->info.tess.ccw;
      shader->info.tess.info.points = nir->info.tess.point_mode;
      shader->info.tess.info.spacing = nir->info.tess.spacing;
      shader->info.tess.info.mode = nir->info.tess._primitive_mode;

      if (sw_stage == MESA_SHADER_TESS_CTRL) {
         shader->info.tess.tcs_output_patch_size =
            nir->info.tess.tcs_vertices_out;
         shader->info.tess.tcs_per_vertex_outputs =
            agx_tcs_per_vertex_outputs(nir);
         shader->info.tess.tcs_nr_patch_outputs =
            util_last_bit(nir->info.patch_outputs_written);
         shader->info.tess.tcs_output_stride = agx_tcs_output_stride(nir);
      } else {
         /* This destroys info so it needs to happen after the gather */
         NIR_PASS(_, nir, agx_nir_lower_tes, dev->dev.libagx, hw);
      }
   }

   uint64_t outputs = nir->info.outputs_written;
   if (!hw &&
       (sw_stage == MESA_SHADER_VERTEX || sw_stage == MESA_SHADER_TESS_EVAL)) {
      nir->info.stage = MESA_SHADER_COMPUTE;
      memset(&nir->info.cs, 0, sizeof(nir->info.cs));
      nir->xfb_info = NULL;
   }

   /* XXX: rename */
   NIR_PASS(_, nir, hk_lower_uvs_index, vs_uniform_base);

#if 0
   /* TODO */
   nir_variable_mode robust2_modes = 0;
   if (rs->uniform_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT)
      robust2_modes |= nir_var_mem_ubo;
   if (rs->storage_buffers == VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT)
      robust2_modes |= nir_var_mem_ssbo;
#endif

   struct agx_shader_key backend_key = {
      .dev = agx_gather_device_key(&dev->dev),
      .reserved_preamble = 128 /* TODO */,
      .libagx = dev->dev.libagx,
      .no_stop = nir->info.stage == MESA_SHADER_FRAGMENT,
      .has_scratch = !nir->info.internal,
      .promote_constants = true,
   };

   /* For now, sample shading is always dynamic. Indicate that. */
   if (nir->info.stage == MESA_SHADER_FRAGMENT &&
       nir->info.fs.uses_sample_shading)
      backend_key.fs.inside_sample_loop = true;

   simple_mtx_t *lock = NULL;
   if (agx_get_compiler_debug())
      lock = &hk_device_physical(dev)->debug_compile_lock;

   if (lock)
      simple_mtx_lock(lock);

   agx_compile_shader_nir(nir, &backend_key, NULL, &shader->b);

   if (lock)
      simple_mtx_unlock(lock);

   shader->code_ptr = shader->b.binary;
   shader->code_size = shader->b.info.binary_size;

   shader->info.stage = sw_stage;
   shader->info.clip_distance_array_size = nir->info.clip_distance_array_size;
   shader->info.cull_distance_array_size = nir->info.cull_distance_array_size;
   shader->b.info.outputs = outputs;

   if (xfb_info) {
      assert(xfb_info->output_count < ARRAY_SIZE(shader->info.xfb_outputs));

      memcpy(&shader->info.xfb_info, xfb_info,
             nir_xfb_info_size(xfb_info->output_count));

      typed_memcpy(shader->info.xfb_stride, nir->info.xfb_stride, 4);
   }

   if (nir->constant_data_size > 0) {
      uint32_t data_size = align(nir->constant_data_size, HK_MIN_UBO_ALIGNMENT);

      void *data = malloc(data_size);
      if (data == NULL) {
         ralloc_free(nir);
         return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
      }

      memcpy(data, nir->constant_data, nir->constant_data_size);

      assert(nir->constant_data_size <= data_size);
      memset(data + nir->constant_data_size, 0,
             data_size - nir->constant_data_size);

      shader->data_ptr = data;
      shader->data_size = data_size;
   }

   ralloc_free(nir);

   VkResult result = hk_init_link_ht(shader, sw_stage);
   if (result != VK_SUCCESS)
      return vk_error(dev, result);

   hk_upload_shader(dev, shader);
   return VK_SUCCESS;
}

static const struct vk_shader_ops hk_shader_ops;

static void
hk_destroy_linked_shader(struct hk_device *dev, struct hk_linked_shader *linked)
{
   agx_bo_unreference(&dev->dev, linked->b.bo);
   ralloc_free(linked);
}

static void
hk_shader_destroy(struct hk_device *dev, struct hk_shader *s)
{
   free((void *)s->code_ptr);
   free((void *)s->data_ptr);
   agx_bo_unreference(&dev->dev, s->bo);

   simple_mtx_destroy(&s->linked.lock);

   if (s->only_linked)
      hk_destroy_linked_shader(dev, s->only_linked);

   if (s->linked.ht) {
      hash_table_foreach(s->linked.ht, entry) {
         hk_destroy_linked_shader(dev, entry->data);
      }
      _mesa_hash_table_destroy(s->linked.ht, NULL);
   }
}

void
hk_api_shader_destroy(struct vk_device *vk_dev, struct vk_shader *vk_shader,
                      const VkAllocationCallbacks *pAllocator)
{
   struct hk_device *dev = container_of(vk_dev, struct hk_device, vk);
   struct hk_api_shader *obj =
      container_of(vk_shader, struct hk_api_shader, vk);

   hk_foreach_variant(obj, shader) {
      hk_shader_destroy(dev, shader);
   }

   vk_shader_free(&dev->vk, pAllocator, &obj->vk);
}

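/* Extra lowering for shaders that run as hardware vertex shaders: clamp and
 * (if missing) insert the point size write, lower cull distances, and lower
 * outputs to the hardware UVS layout.
 */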
static void
hk_lower_hw_vs(nir_shader *nir, struct hk_shader *shader)
{
   /* Point size must be clamped; excessively large points don't render
    * properly on G13.
    *
    * Must be synced with pointSizeRange.
    */
   NIR_PASS(_, nir, nir_lower_point_size, 1.0f, 511.95f);

   /* TODO: Optimize out for monolithic? */
   NIR_PASS(_, nir, hk_nir_insert_psiz_write);

   NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
   NIR_PASS(_, nir, agx_nir_lower_cull_distance_vs);

   NIR_PASS(_, nir, agx_nir_lower_uvs, &shader->info.uvs);

   shader->info.vs.cull_distance_array_size =
      nir->info.cull_distance_array_size;
}

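/* Compile one API shader object. Geometry and vertex/tess-eval stages expand
 * into multiple hardware variants (e.g. rasterizing vs. rasterization-discard
 * GS, hardware vs. software VS), each compiled from its own clone of the NIR.
 */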
VkResult
hk_compile_shader(struct hk_device *dev, struct vk_shader_compile_info *info,
                  const struct vk_graphics_pipeline_state *state,
                  const VkAllocationCallbacks *pAllocator,
                  struct hk_api_shader **shader_out)
{
   VkResult result;

   /* We consume the NIR, regardless of success or failure */
   nir_shader *nir = info->nir;

   size_t size = sizeof(struct hk_api_shader) +
                 sizeof(struct hk_shader) * hk_num_variants(info->stage);
   struct hk_api_shader *obj =
      vk_shader_zalloc(&dev->vk, &hk_shader_ops, info->stage, pAllocator, size);

   if (obj == NULL) {
      ralloc_free(nir);
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   /* TODO: Multiview with ESO */
   const bool is_multiview = state && state->rp->view_mask != 0;

   hk_lower_nir(dev, nir, info->robustness, is_multiview,
                info->set_layout_count, info->set_layouts);

   gl_shader_stage sw_stage = nir->info.stage;

   struct hk_fs_key fs_key_tmp, *fs_key = NULL;
   if (sw_stage == MESA_SHADER_FRAGMENT) {
      hk_populate_fs_key(&fs_key_tmp, state);
      fs_key = &fs_key_tmp;

      nir->info.fs.uses_sample_shading |= fs_key->force_sample_shading;

      /* Force late-Z for Z/S self-deps. TODO: There's probably a less silly way
       * to do this.
       */
      if (fs_key->zs_self_dep) {
         nir_builder b =
            nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(nir)));
         nir_discard_if(&b, nir_imm_false(&b));
         nir->info.fs.uses_discard = true;
      }

      NIR_PASS(_, nir, agx_nir_lower_sample_intrinsics, false);
   } else if (sw_stage == MESA_SHADER_TESS_CTRL) {
      NIR_PASS_V(nir, agx_nir_lower_tcs, dev->dev.libagx);
   }

   /* Compile all variants up front */
   if (sw_stage == MESA_SHADER_GEOMETRY) {
      for (unsigned rast_disc = 0; rast_disc < 2; ++rast_disc) {
         struct hk_shader *count_variant = hk_count_gs_variant(obj, rast_disc);
         bool last = (rast_disc + 1) == 2;

         /* Each variant gets its own NIR. To save an extra clone, we use the
          * original NIR for the last stage.
          */
         nir_shader *clone = last ? nir : nir_shader_clone(NULL, nir);

         enum mesa_prim out_prim = MESA_PRIM_MAX;
         nir_shader *count = NULL, *rast = NULL, *pre_gs = NULL;

         NIR_PASS(_, clone, agx_nir_lower_gs, dev->dev.libagx, rast_disc,
                  &count, &rast, &pre_gs, &out_prim,
                  &count_variant->info.gs.count_words);

         if (!rast_disc) {
            struct hk_shader *shader = &obj->variants[HK_GS_VARIANT_RAST];

            hk_lower_hw_vs(rast, shader);
            shader->info.gs.out_prim = out_prim;
         }

         struct {
            nir_shader *in;
            struct hk_shader *out;
         } variants[] = {
            {clone, hk_main_gs_variant(obj, rast_disc)},
            {pre_gs, hk_pre_gs_variant(obj, rast_disc)},
            {count, count_variant},
            {rast_disc ? NULL : rast, &obj->variants[HK_GS_VARIANT_RAST]},
         };

         for (unsigned v = 0; v < ARRAY_SIZE(variants); ++v) {
            if (variants[v].in) {
               result = hk_compile_nir(dev, pAllocator, variants[v].in,
                                       info->flags, info->robustness, NULL,
                                       variants[v].out, sw_stage, true, NULL);
               if (result != VK_SUCCESS) {
                  hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator);
                  if (clone != nir) {
                     ralloc_free(nir);
                  }

                  ralloc_free(clone);
                  ralloc_free(pre_gs);
                  ralloc_free(count);
                  ralloc_free(rast);
                  return result;
               }
            }
         }

         /* Nothing consumes this otherwise, so throw it away.
          *
          * TODO: We should just not generate it.
          */
         if (rast_disc) {
            ralloc_free(rast);
         }
      }
   } else if (sw_stage == MESA_SHADER_VERTEX ||
              sw_stage == MESA_SHADER_TESS_EVAL) {

      VkShaderStageFlags next_stage = info->next_stage_mask;

      /* Transform feedback is layered on top of geometry shaders. If there is
       * not a geometry shader in the pipeline, we will compile a geometry
       * shader for the purpose. Update the next_stage mask accordingly.
       */
      if (nir->xfb_info != NULL) {
         next_stage |= VK_SHADER_STAGE_GEOMETRY_BIT;
      }

      if (sw_stage == MESA_SHADER_VERTEX) {
         assert(
            !(nir->info.inputs_read & BITFIELD64_MASK(VERT_ATTRIB_GENERIC0)) &&
            "Fixed-function attributes not used in Vulkan");

         NIR_PASS(_, nir, nir_recompute_io_bases, nir_var_shader_in);
      }

      /* the shader_out portion of this is load-bearing even for tess eval */
      NIR_PASS(_, nir, nir_io_add_const_offset_to_base,
               nir_var_shader_in | nir_var_shader_out);

      for (enum hk_vs_variant v = 0; v < HK_VS_VARIANTS; ++v) {
         /* Only compile the software variant if we might use this shader with
          * geometry/tessellation. We need to compile the hardware variant
          * unconditionally to handle the VS -> null FS case, which does not
          * require setting the FRAGMENT bit.
          */
         if (v == HK_VS_VARIANT_SW &&
             !(next_stage & (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT |
                             VK_SHADER_STAGE_GEOMETRY_BIT)))
            continue;

         struct hk_shader *shader = &obj->variants[v];
         bool hw = v == HK_VS_VARIANT_HW;
         bool last = (v + 1) == HK_VS_VARIANTS;

         /* Each variant gets its own NIR. To save an extra clone, we use the
          * original NIR for the last stage.
          */
         nir_shader *clone = last ? nir : nir_shader_clone(NULL, nir);

         if (sw_stage == MESA_SHADER_VERTEX) {
            NIR_PASS(_, clone, agx_nir_lower_vs_input_to_prolog,
                     shader->info.vs.attrib_components_read);

            shader->info.vs.attribs_read =
               nir->info.inputs_read >> VERT_ATTRIB_GENERIC0;
         }

         if (hw) {
            hk_lower_hw_vs(clone, shader);
         } else {
            NIR_PASS(_, clone, agx_nir_lower_vs_before_gs, dev->dev.libagx);
         }

         /* hk_compile_nir takes ownership of the clone */
         result = hk_compile_nir(dev, pAllocator, clone, info->flags,
                                 info->robustness, fs_key, shader, sw_stage, hw,
                                 nir->xfb_info);
         if (result != VK_SUCCESS) {
            hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator);
            ralloc_free(nir);
            return result;
         }
      }
   } else {
      struct hk_shader *shader = hk_only_variant(obj);

      /* hk_compile_nir takes ownership of nir */
      result =
         hk_compile_nir(dev, pAllocator, nir, info->flags, info->robustness,
                        fs_key, shader, sw_stage, true, NULL);
      if (result != VK_SUCCESS) {
         hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator);
         return result;
      }
   }

   *shader_out = obj;
   return VK_SUCCESS;
}

static VkResult
hk_compile_shaders(struct vk_device *vk_dev, uint32_t shader_count,
                   struct vk_shader_compile_info *infos,
                   const struct vk_graphics_pipeline_state *state,
                   const VkAllocationCallbacks *pAllocator,
                   struct vk_shader **shaders_out)
{
   struct hk_device *dev = container_of(vk_dev, struct hk_device, vk);

   for (uint32_t i = 0; i < shader_count; i++) {
      VkResult result =
         hk_compile_shader(dev, &infos[i], state, pAllocator,
                           (struct hk_api_shader **)&shaders_out[i]);
      if (result != VK_SUCCESS) {
         /* Clean up all the shaders before this point */
         for (uint32_t j = 0; j < i; j++)
            hk_api_shader_destroy(&dev->vk, shaders_out[j], pAllocator);

         /* Clean up all the NIR after this point */
         for (uint32_t j = i + 1; j < shader_count; j++)
            ralloc_free(infos[j].nir);

         /* Memset the output array */
         memset(shaders_out, 0, shader_count * sizeof(*shaders_out));

         return result;
      }
   }

   return VK_SUCCESS;
}

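/* Deserialize a single hk_shader variant from a pipeline-cache blob; the
 * layout must match hk_shader_serialize below.
 */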
static VkResult
hk_deserialize_shader(struct hk_device *dev, struct blob_reader *blob,
                      struct hk_shader *shader)
{
   struct hk_shader_info info;
   blob_copy_bytes(blob, &info, sizeof(info));

   struct agx_shader_info b_info;
   blob_copy_bytes(blob, &b_info, sizeof(b_info));

   const uint32_t code_size = blob_read_uint32(blob);
   const uint32_t data_size = blob_read_uint32(blob);
   if (blob->overrun)
      return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);

   VkResult result = hk_init_link_ht(shader, info.stage);
   if (result != VK_SUCCESS)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   simple_mtx_init(&shader->linked.lock, mtx_plain);

   shader->b.info = b_info;
   shader->info = info;
   shader->code_size = code_size;
   shader->data_size = data_size;
   shader->b.info.binary_size = code_size;

   shader->code_ptr = malloc(code_size);
   if (shader->code_ptr == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   shader->data_ptr = malloc(data_size);
   if (shader->data_ptr == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   blob_copy_bytes(blob, (void *)shader->code_ptr, shader->code_size);
   blob_copy_bytes(blob, (void *)shader->data_ptr, shader->data_size);
   if (blob->overrun)
      return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);

   shader->b.binary = (void *)shader->code_ptr;
   hk_upload_shader(dev, shader);
   return VK_SUCCESS;
}

static VkResult
hk_deserialize_api_shader(struct vk_device *vk_dev, struct blob_reader *blob,
                          uint32_t binary_version,
                          const VkAllocationCallbacks *pAllocator,
                          struct vk_shader **shader_out)
{
   struct hk_device *dev = container_of(vk_dev, struct hk_device, vk);

   gl_shader_stage stage = blob_read_uint8(blob);
   if (blob->overrun)
      return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);

   size_t size = sizeof(struct hk_api_shader) +
                 sizeof(struct hk_shader) * hk_num_variants(stage);

   struct hk_api_shader *obj =
      vk_shader_zalloc(&dev->vk, &hk_shader_ops, stage, pAllocator, size);

   if (obj == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   hk_foreach_variant(obj, shader) {
      VkResult result = hk_deserialize_shader(dev, blob, shader);

      if (result != VK_SUCCESS) {
         hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator);
         return result;
      }
   }

   *shader_out = &obj->vk;
   return VK_SUCCESS;
}

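/* Serialize one variant: shader info, backend info, then the code and data
 * blobs. hk_api_shader_serialize prefixes this with the stage byte and emits
 * every variant in order.
 */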
static void
hk_shader_serialize(struct vk_device *vk_dev, const struct hk_shader *shader,
                    struct blob *blob)
{
   blob_write_bytes(blob, &shader->info, sizeof(shader->info));
   blob_write_bytes(blob, &shader->b.info, sizeof(shader->b.info));

   blob_write_uint32(blob, shader->code_size);
   blob_write_uint32(blob, shader->data_size);
   blob_write_bytes(blob, shader->code_ptr, shader->code_size);
   blob_write_bytes(blob, shader->data_ptr, shader->data_size);
}

static bool
hk_api_shader_serialize(struct vk_device *vk_dev,
                        const struct vk_shader *vk_shader, struct blob *blob)
{
   struct hk_api_shader *obj =
      container_of(vk_shader, struct hk_api_shader, vk);

   blob_write_uint8(blob, vk_shader->stage);

   hk_foreach_variant(obj, shader) {
      hk_shader_serialize(vk_dev, shader, blob);
   }

   return !blob->out_of_memory;
}

#define WRITE_STR(field, ...)                                                  \
   ({                                                                          \
      memset(field, 0, sizeof(field));                                         \
      UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__);              \
      assert(i > 0 && i < sizeof(field));                                      \
   })

static VkResult
hk_shader_get_executable_properties(
   UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
   uint32_t *executable_count, VkPipelineExecutablePropertiesKHR *properties)
{
   struct hk_api_shader *obj =
      container_of(vk_shader, struct hk_api_shader, vk);

   VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, properties,
                          executable_count);

   vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props)
   {
      props->stages = mesa_to_vk_shader_stage(obj->vk.stage);
      props->subgroupSize = 32;
      WRITE_STR(props->name, "%s", _mesa_shader_stage_to_string(obj->vk.stage));
      WRITE_STR(props->description, "%s shader",
                _mesa_shader_stage_to_string(obj->vk.stage));
   }

   return vk_outarray_status(&out);
}

static VkResult
hk_shader_get_executable_statistics(
   UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
   uint32_t executable_index, uint32_t *statistic_count,
   VkPipelineExecutableStatisticKHR *statistics)
{
   struct hk_api_shader *obj =
      container_of(vk_shader, struct hk_api_shader, vk);

   VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, statistics,
                          statistic_count);

   assert(executable_index == 0);

   /* TODO: find a sane way to report multiple variants and have that play nice
    * with zink.
    */
   struct hk_shader *shader = hk_any_variant(obj);

   vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat)
   {
      WRITE_STR(stat->name, "Code Size");
      WRITE_STR(stat->description,
                "Size of the compiled shader binary, in bytes");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = shader->code_size;
   }

   vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat)
   {
      WRITE_STR(stat->name, "Number of GPRs");
      WRITE_STR(stat->description, "Number of GPRs used by this pipeline");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = shader->b.info.nr_gprs;
   }

   return vk_outarray_status(&out);
}

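/* Copy NUL-terminated IR text into the internal-representation struct,
 * following the Vulkan two-call size-query convention; returns false when the
 * caller's buffer is too small.
 */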
static bool
write_ir_text(VkPipelineExecutableInternalRepresentationKHR *ir,
              const char *data)
{
   ir->isText = VK_TRUE;

   size_t data_len = strlen(data) + 1;

   if (ir->pData == NULL) {
      ir->dataSize = data_len;
      return true;
   }

   strncpy(ir->pData, data, ir->dataSize);
   if (ir->dataSize < data_len)
      return false;

   ir->dataSize = data_len;
   return true;
}

static VkResult
hk_shader_get_executable_internal_representations(
   UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
   uint32_t executable_index, uint32_t *internal_representation_count,
   VkPipelineExecutableInternalRepresentationKHR *internal_representations)
{
   VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
                          internal_representations,
                          internal_representation_count);
   bool incomplete_text = false;

   assert(executable_index == 0);

   /* TODO */
#if 0
   vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
      WRITE_STR(ir->name, "AGX assembly");
      WRITE_STR(ir->description, "AGX assembly");
      if (!write_ir_text(ir, TODO))
         incomplete_text = true;
   }
#endif

   return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
}

static const struct vk_shader_ops hk_shader_ops = {
   .destroy = hk_api_shader_destroy,
   .serialize = hk_api_shader_serialize,
   .get_executable_properties = hk_shader_get_executable_properties,
   .get_executable_statistics = hk_shader_get_executable_statistics,
   .get_executable_internal_representations =
      hk_shader_get_executable_internal_representations,
};

const struct vk_device_shader_ops hk_device_shader_ops = {
   .get_nir_options = hk_get_nir_options,
   .get_spirv_options = hk_get_spirv_options,
   .preprocess_nir = hk_preprocess_nir,
   .hash_graphics_state = hk_hash_graphics_state,
   .compile = hk_compile_shaders,
   .deserialize = hk_deserialize_api_shader,
   .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
   .cmd_bind_shaders = hk_cmd_bind_shaders,
};

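/* Fast-link a main shader with optional prolog/epilog parts and bake the USC
 * words used to bind the resulting program.
 */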
struct hk_linked_shader *
hk_fast_link(struct hk_device *dev, bool fragment, struct hk_shader *main,
             struct agx_shader_part *prolog, struct agx_shader_part *epilog,
             unsigned nr_samples_shaded)
{
   struct hk_linked_shader *s = rzalloc(NULL, struct hk_linked_shader);
   agx_fast_link(&s->b, &dev->dev, fragment, &main->b, prolog, epilog,
                 nr_samples_shaded);

   if (fragment) {
      agx_pack(&s->fs_counts, FRAGMENT_SHADER_WORD_0, cfg) {
         cfg.cf_binding_count = s->b.cf.nr_bindings;
         cfg.uniform_register_count = main->b.info.push_count;
         cfg.preshader_register_count = main->b.info.nr_preamble_gprs;
         cfg.sampler_state_register_count =
            agx_translate_sampler_state_count(s->b.uses_txf ? 1 : 0, false);
      }
   }

   /* Now that we've linked, bake the USC words to bind this program */
   struct agx_usc_builder b = agx_usc_builder(s->usc.data, sizeof(s->usc.data));

   if (main && main->b.info.rodata.size_16) {
      agx_usc_immediates(&b, &main->b.info.rodata, main->bo->va->addr);
   }

   agx_usc_push_packed(&b, UNIFORM, dev->rodata.image_heap);

   if (s->b.uses_txf)
      agx_usc_push_packed(&b, SAMPLER, dev->dev.txf_sampler);

   agx_usc_shared_non_fragment(&b, &main->b.info, 0);
   agx_usc_push_packed(&b, SHADER, s->b.shader);
   agx_usc_push_packed(&b, REGISTERS, s->b.regs);

   if (fragment)
      agx_usc_push_packed(&b, FRAGMENT_PROPERTIES, s->b.fragment_props);

   if (main && main->b.info.has_preamble) {
      agx_usc_pack(&b, PRESHADER, cfg) {
         cfg.code = agx_usc_addr(&dev->dev, main->preamble_addr);
      }
   } else {
      agx_usc_pack(&b, NO_PRESHADER, cfg)
         ;
   }

   s->usc.size = b.head - s->usc.data;
   return s;
}