1 /*
2 * Copyright 2024 Valve Corporation
3 * Copyright 2024 Alyssa Rosenzweig
4 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
5 * SPDX-License-Identifier: MIT
6 */
7 #include <assert.h>
8 #include "agx_bg_eot.h"
9 #include "agx_bo.h"
10 #include "agx_compile.h"
11 #include "agx_compiler.h"
12 #include "agx_device.h"
13 #include "agx_helpers.h"
14 #include "agx_linker.h"
15 #include "agx_nir_lower_gs.h"
16 #include "agx_nir_lower_vbo.h"
17 #include "agx_ppp.h"
18 #include "agx_tilebuffer.h"
19 #include "agx_usc.h"
20 #include "agx_uvs.h"
21 #include "hk_buffer.h"
22 #include "hk_cmd_buffer.h"
23 #include "hk_device.h"
24 #include "hk_entrypoints.h"
25 #include "hk_image.h"
26 #include "hk_image_view.h"
27 #include "hk_physical_device.h"
28 #include "hk_private.h"
29 #include "hk_shader.h"
30
31 #include "asahi/genxml/agx_pack.h"
32 #include "asahi/libagx/compression.h"
33 #include "asahi/libagx/geometry.h"
34 #include "asahi/libagx/libagx.h"
35 #include "asahi/libagx/query.h"
36 #include "asahi/libagx/tessellator.h"
37 #include "util/blend.h"
38 #include "util/format/format_utils.h"
39 #include "util/format/u_formats.h"
40 #include "util/macros.h"
41 #include "util/ralloc.h"
42 #include "util/u_prim.h"
43 #include "vulkan/vulkan_core.h"
44 #include "layout.h"
45 #include "libagx_dgc.h"
46 #include "libagx_shaders.h"
47 #include "nir.h"
48 #include "nir_builder.h"
49 #include "nir_lower_blend.h"
50 #include "nir_xfb_info.h"
51 #include "pool.h"
52 #include "shader_enums.h"
53 #include "vk_blend.h"
54 #include "vk_enum_to_str.h"
55 #include "vk_format.h"
56 #include "vk_graphics_state.h"
57 #include "vk_pipeline.h"
58 #include "vk_render_pass.h"
59 #include "vk_standard_sample_locations.h"
60 #include "vk_util.h"
61
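/* Convenience macros for checking whether a piece of dynamic state, a bound
 * shader stage, or a fast-linked shader has been dirtied on this command
 * buffer.
 */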
62 #define IS_DIRTY(bit) BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_##bit)
63
64 #define IS_SHADER_DIRTY(bit) \
65 (cmd->state.gfx.shaders_dirty & BITFIELD_BIT(MESA_SHADER_##bit))
66
67 #define IS_LINKED_DIRTY(bit) \
68 (cmd->state.gfx.linked_dirty & BITFIELD_BIT(MESA_SHADER_##bit))
69
70 /* CTS coverage of indirect draws is pretty bad, so it's helpful to be able to
71 * get some extra smoke testing.
72 */
73 #define HK_TEST_INDIRECTS (0)
74
75 UNUSED static inline void
76 print_draw(struct agx_draw d, FILE *fp)
77 {
78 if (agx_is_indirect(d.b))
79 fprintf(fp, "indirect (buffer %" PRIx64 "):", d.b.ptr);
80 else
81 fprintf(fp, "direct (%ux%u):", d.b.count[0], d.b.count[1]);
82
83 if (d.index_size)
84 fprintf(fp, " index_size=%u", agx_index_size_to_B(d.index_size));
85 else
86 fprintf(fp, " non-indexed");
87
88 if (d.restart)
89 fprintf(fp, " restart");
90
91 if (d.index_bias)
92 fprintf(fp, " index_bias=%u", d.index_bias);
93
94 if (d.start)
95 fprintf(fp, " start=%u", d.start);
96
97 if (d.start_instance)
98 fprintf(fp, " start_instance=%u", d.start_instance);
99
100 fprintf(fp, "\n");
101 }
102
103 /* XXX: deduplicate */
104 static inline enum mesa_prim
105 vk_conv_topology(VkPrimitiveTopology topology)
106 {
107 switch (topology) {
108 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
109 return MESA_PRIM_POINTS;
110 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
111 return MESA_PRIM_LINES;
112 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
113 return MESA_PRIM_LINE_STRIP;
114 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
115 #pragma GCC diagnostic push
116 #pragma GCC diagnostic ignored "-Wswitch"
117 case VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA:
118 #pragma GCC diagnostic pop
119 return MESA_PRIM_TRIANGLES;
120 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
121 return MESA_PRIM_TRIANGLE_STRIP;
122 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
123 return MESA_PRIM_TRIANGLE_FAN;
124 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
125 return MESA_PRIM_LINES_ADJACENCY;
126 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
127 return MESA_PRIM_LINE_STRIP_ADJACENCY;
128 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
129 return MESA_PRIM_TRIANGLES_ADJACENCY;
130 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
131 return MESA_PRIM_TRIANGLE_STRIP_ADJACENCY;
132 case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
133 return MESA_PRIM_PATCHES;
134 default:
135 unreachable("invalid");
136 }
137 }
138
139 static void
140 hk_cmd_buffer_dirty_render_pass(struct hk_cmd_buffer *cmd)
141 {
142 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
143
144 /* These depend on color attachment count */
145 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
146 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
147 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
148 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
149
150 /* These depend on the depth/stencil format */
151 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
152 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
153 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
154 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
155 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS);
156
157 /* This may depend on render targets for ESO */
158 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
159
160 /* This may depend on render targets */
161 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP);
162 }
163
164 void
165 hk_cmd_buffer_begin_graphics(struct hk_cmd_buffer *cmd,
166 const VkCommandBufferBeginInfo *pBeginInfo)
167 {
168 if (cmd->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
169 (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
170 char gcbiar_data[VK_GCBIARR_DATA_SIZE(HK_MAX_RTS)];
171 const VkRenderingInfo *resume_info =
172 vk_get_command_buffer_inheritance_as_rendering_resume(
173 cmd->vk.level, pBeginInfo, gcbiar_data);
174 if (resume_info) {
175 hk_CmdBeginRendering(hk_cmd_buffer_to_handle(cmd), resume_info);
176 } else {
177 const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
178 vk_get_command_buffer_inheritance_rendering_info(cmd->vk.level,
179 pBeginInfo);
180 assert(inheritance_info);
181
182 struct hk_rendering_state *render = &cmd->state.gfx.render;
183 render->flags = inheritance_info->flags;
184 render->area = (VkRect2D){};
185 render->layer_count = 0;
186 render->view_mask = inheritance_info->viewMask;
187 render->tilebuffer.nr_samples = inheritance_info->rasterizationSamples;
188
189 render->color_att_count = inheritance_info->colorAttachmentCount;
190 for (uint32_t i = 0; i < render->color_att_count; i++) {
191 render->color_att[i].vk_format =
192 inheritance_info->pColorAttachmentFormats[i];
193 }
194 render->depth_att.vk_format = inheritance_info->depthAttachmentFormat;
195 render->stencil_att.vk_format =
196 inheritance_info->stencilAttachmentFormat;
197
198 const VkRenderingAttachmentLocationInfoKHR att_loc_info_default = {
199 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
200 .colorAttachmentCount = inheritance_info->colorAttachmentCount,
201 };
202 const VkRenderingAttachmentLocationInfoKHR *att_loc_info =
203 vk_get_command_buffer_rendering_attachment_location_info(
204 cmd->vk.level, pBeginInfo);
205 if (att_loc_info == NULL)
206 att_loc_info = &att_loc_info_default;
207
208 vk_cmd_set_rendering_attachment_locations(&cmd->vk, att_loc_info);
209
210 hk_cmd_buffer_dirty_render_pass(cmd);
211 }
212 }
213
214 hk_cmd_buffer_dirty_all(cmd);
215
216 /* If multiview is disabled, always read 0. If multiview is enabled,
217 * hk_set_view_index will dirty the root each draw.
218 */
219 cmd->state.gfx.descriptors.root.draw.view_index = 0;
220 cmd->state.gfx.descriptors.root_dirty = true;
221 }
222
223 void
224 hk_cmd_invalidate_graphics_state(struct hk_cmd_buffer *cmd)
225 {
226 hk_cmd_buffer_dirty_all(cmd);
227
228 /* From the Vulkan 1.3.275 spec:
229 *
230 * "...There is one exception to this rule - if the primary command
231 * buffer is inside a render pass instance, then the render pass and
232 * subpass state is not disturbed by executing secondary command
233 * buffers."
234 *
235 * We need to reset everything EXCEPT the render pass state.
236 */
237 struct hk_rendering_state render_save = cmd->state.gfx.render;
238 memset(&cmd->state.gfx, 0, sizeof(cmd->state.gfx));
239 cmd->state.gfx.render = render_save;
240 }
241
242 static void
243 hk_attachment_init(struct hk_attachment *att,
244 const VkRenderingAttachmentInfo *info)
245 {
246 if (info == NULL || info->imageView == VK_NULL_HANDLE) {
247 *att = (struct hk_attachment){
248 .iview = NULL,
249 };
250 return;
251 }
252
253 VK_FROM_HANDLE(hk_image_view, iview, info->imageView);
254 *att = (struct hk_attachment){
255 .vk_format = iview->vk.format,
256 .iview = iview,
257 };
258
259 if (info->resolveMode != VK_RESOLVE_MODE_NONE) {
260 VK_FROM_HANDLE(hk_image_view, res_iview, info->resolveImageView);
261 att->resolve_mode = info->resolveMode;
262 att->resolve_iview = res_iview;
263 }
264 }
265
266 VKAPI_ATTR void VKAPI_CALL
267 hk_GetRenderingAreaGranularityKHR(
268 VkDevice device, const VkRenderingAreaInfoKHR *pRenderingAreaInfo,
269 VkExtent2D *pGranularity)
270 {
271 *pGranularity = (VkExtent2D){.width = 1, .height = 1};
272 }
273
274 static bool
275 is_attachment_stored(const VkRenderingAttachmentInfo *att)
276 {
277 /* When resolving, we store the intermediate multisampled image as the
278 * resolve is a separate control stream. This could be optimized.
279 */
280 return att->storeOp == VK_ATTACHMENT_STORE_OP_STORE ||
281 att->resolveMode != VK_RESOLVE_MODE_NONE;
282 }
283
284 static struct hk_bg_eot
285 hk_build_bg_eot(struct hk_cmd_buffer *cmd, const VkRenderingInfo *info,
286 bool store, bool partial_render, bool incomplete_render_area)
287 {
288 struct hk_device *dev = hk_cmd_buffer_device(cmd);
289 struct hk_rendering_state *render = &cmd->state.gfx.render;
290
291 /* Construct the key */
292 struct agx_bg_eot_key key = {.tib = render->tilebuffer};
293 static_assert(AGX_BG_EOT_NONE == 0, "default initializer");
294
295 key.tib.layered = (render->cr.layers > 1);
296
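/* With eMRT (a spilled tilebuffer), the main background program reads the
 * spilled render targets back through texture descriptors, uploaded below via
 * hk_usc_upload_spilled_rt_descs().
 */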
297 bool needs_textures_for_spilled_rts =
298 agx_tilebuffer_spills(&render->tilebuffer) && !partial_render && !store;
299
300 for (unsigned i = 0; i < info->colorAttachmentCount; ++i) {
301 const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[i];
302 if (att_info->imageView == VK_NULL_HANDLE)
303 continue;
304
305 /* Partial render programs exist only to store/load the tilebuffer to
306 * main memory. When render targets are already spilled to main memory,
307 * there's nothing to do.
308 */
309 if (key.tib.spilled[i] && (partial_render || store))
310 continue;
311
312 if (store) {
313 bool should_store = is_attachment_stored(att_info);
314
315 /* Partial renders always need to flush to memory. */
316 should_store |= partial_render;
317
318 if (should_store)
319 key.op[i] = AGX_EOT_STORE;
320 } else {
321 bool load = att_info->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD;
322 bool clear = att_info->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR;
323
324 /* The background program used for partial renders must always load
325 * whatever was stored in the mid-frame end-of-tile program.
326 */
327 load |= partial_render;
328
329 /* With an incomplete render area, we're forced to load back tiles and
330 * then use the 3D pipe for the clear.
331 */
332 load |= incomplete_render_area;
333
334 /* Don't read back spilled render targets, they're already in memory */
335 load &= !key.tib.spilled[i];
336
337 /* This is a very frustrating corner case. From the spec:
338 *
339 * VK_ATTACHMENT_STORE_OP_NONE specifies the contents within the
340 * render area are not accessed by the store operation as long as
341 * no values are written to the attachment during the render pass.
342 *
343 * With VK_ATTACHMENT_STORE_OP_NONE, we suppress stores on the main
344 * end-of-tile program. Unfortunately, that's not enough: we also need
345 * to preserve the contents throughout partial renders. The easiest way
346 * to do that is forcing a load in the background program, so that
347 * partial stores for unused attachments will be no-op'd by writing
348 * existing contents.
349 *
350 * Optimizing this would require nontrivial tracking. Fortunately,
351 * this is all Android gunk and we don't have to care too much for
352 * desktop games. So do the simple thing.
353 */
354 bool no_store = (att_info->storeOp == VK_ATTACHMENT_STORE_OP_NONE);
355 bool no_store_wa = no_store && !load && !clear;
356 if (no_store_wa) {
357 perf_debug(dev, "STORE_OP_NONE workaround");
358 }
359
360 load |= no_store_wa;
361
362 /* Don't apply clears for spilled render targets when we clear the
363 * render area explicitly after.
364 */
365 if (key.tib.spilled[i] && incomplete_render_area)
366 continue;
367
368 if (load)
369 key.op[i] = AGX_BG_LOAD;
370 else if (clear)
371 key.op[i] = AGX_BG_CLEAR;
372 }
373 }
374
375 /* Begin building the pipeline */
376 size_t usc_size = agx_usc_size(3 + HK_MAX_RTS);
377 struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64);
378 if (!t.cpu)
379 return (struct hk_bg_eot){.usc = t.gpu};
380
381 struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);
382
383 bool uses_txf = false;
384 unsigned uniforms = 0;
385 unsigned nr_tex = 0;
386
387 for (unsigned rt = 0; rt < HK_MAX_RTS; ++rt) {
388 const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[rt];
389 struct hk_image_view *iview = render->color_att[rt].iview;
390
391 if (key.op[rt] == AGX_BG_LOAD) {
392 uses_txf = true;
393
394 uint32_t index = key.tib.layered
395 ? iview->planes[0].layered_background_desc_index
396 : iview->planes[0].background_desc_index;
397
398 agx_usc_pack(&b, TEXTURE, cfg) {
399 /* Shifted to match eMRT indexing, could be optimized */
400 cfg.start = rt * 2;
401 cfg.count = 1;
402 cfg.buffer = dev->images.bo->va->addr + index * AGX_TEXTURE_LENGTH;
403 }
404
405 nr_tex = (rt * 2) + 1;
406 } else if (key.op[rt] == AGX_BG_CLEAR) {
407 static_assert(sizeof(att_info->clearValue.color) == 16, "fixed ABI");
408 uint64_t colour =
409 hk_pool_upload(cmd, &att_info->clearValue.color, 16, 16);
410
411 agx_usc_uniform(&b, 4 + (8 * rt), 8, colour);
412 uniforms = MAX2(uniforms, 4 + (8 * rt) + 8);
413 } else if (key.op[rt] == AGX_EOT_STORE) {
414 uint32_t index = key.tib.layered
415 ? iview->planes[0].layered_eot_pbe_desc_index
416 : iview->planes[0].eot_pbe_desc_index;
417
418 agx_usc_pack(&b, TEXTURE, cfg) {
419 cfg.start = rt;
420 cfg.count = 1;
421 cfg.buffer = dev->images.bo->va->addr + index * AGX_TEXTURE_LENGTH;
422 }
423
424 nr_tex = rt + 1;
425 }
426 }
427
428 if (needs_textures_for_spilled_rts) {
429 hk_usc_upload_spilled_rt_descs(&b, cmd);
430 uniforms = MAX2(uniforms, 4);
431 }
432
433 if (uses_txf) {
434 agx_usc_push_packed(&b, SAMPLER, dev->dev.txf_sampler);
435 }
436
437 /* For attachmentless rendering, we don't know the sample count until
438 * draw-time. But we have trivial bg/eot programs in that case too.
439 */
440 if (key.tib.nr_samples >= 1) {
441 agx_usc_push_packed(&b, SHARED, &key.tib.usc);
442 } else {
443 assert(key.tib.sample_size_B == 0);
444 agx_usc_shared_none(&b);
445
446 key.tib.nr_samples = 1;
447 }
448
449 /* Get the shader */
450 key.reserved_preamble = uniforms;
451 /* XXX: locking? */
452 struct agx_bg_eot_shader *shader = agx_get_bg_eot_shader(&dev->bg_eot, &key);
453
454 agx_usc_pack(&b, SHADER, cfg) {
455 cfg.code = agx_usc_addr(&dev->dev, shader->ptr);
456 cfg.unk_2 = 0;
457 }
458
459 agx_usc_pack(&b, REGISTERS, cfg)
460 cfg.register_count = shader->info.nr_gprs;
461
462 if (shader->info.has_preamble) {
463 agx_usc_pack(&b, PRESHADER, cfg) {
464 cfg.code =
465 agx_usc_addr(&dev->dev, shader->ptr + shader->info.preamble_offset);
466 }
467 } else {
468 agx_usc_pack(&b, NO_PRESHADER, cfg)
469 ;
470 }
471
472 struct hk_bg_eot ret = {.usc = t.gpu};
473
474 agx_pack(&ret.counts, COUNTS, cfg) {
475 cfg.uniform_register_count = shader->info.push_count;
476 cfg.preshader_register_count = shader->info.nr_preamble_gprs;
477 cfg.texture_state_register_count = nr_tex;
478 cfg.sampler_state_register_count =
479 agx_translate_sampler_state_count(uses_txf ? 1 : 0, false);
480 }
481
482 return ret;
483 }
484
485 static bool
486 is_aligned(unsigned x, unsigned pot_alignment)
487 {
488 assert(util_is_power_of_two_nonzero(pot_alignment));
489 return (x & (pot_alignment - 1)) == 0;
490 }
491
492 static void
493 hk_merge_render_iview(struct hk_rendering_state *render,
494 struct hk_image_view *iview, bool zls)
495 {
496 if (iview) {
497 unsigned samples = iview->vk.image->samples;
498 /* TODO: is this right for ycbcr? */
499 unsigned level = iview->vk.base_mip_level;
500 unsigned width = u_minify(iview->vk.image->extent.width, level);
501 unsigned height = u_minify(iview->vk.image->extent.height, level);
502
503 assert(render->tilebuffer.nr_samples == 0 ||
504 render->tilebuffer.nr_samples == samples);
505 render->tilebuffer.nr_samples = samples;
506
507 /* TODO: Is this merging logic sound? Not sure how this is supposed to
508 * work conceptually.
509 */
510 render->cr.width = MAX2(render->cr.width, width);
511 render->cr.height = MAX2(render->cr.height, height);
512
513 if (zls) {
514 render->cr.zls_width = width;
515 render->cr.zls_height = height;
516 }
517 }
518 }
519
520 static void
521 hk_pack_zls_control(struct agx_zls_control_packed *packed,
522 struct ail_layout *z_layout, struct ail_layout *s_layout,
523 const VkRenderingAttachmentInfo *attach_z,
524 const VkRenderingAttachmentInfo *attach_s,
525 bool incomplete_render_area, bool partial_render)
526 {
527 agx_pack(packed, ZLS_CONTROL, zls_control) {
528 if (z_layout) {
529 /* XXX: Dropping Z stores is wrong if the render pass gets split into
530 * multiple control streams (can that ever happen?) We need more ZLS
531 * variants. Force || true for now.
532 */
533 zls_control.z_store_enable =
534 attach_z->storeOp == VK_ATTACHMENT_STORE_OP_STORE ||
535 attach_z->resolveMode != VK_RESOLVE_MODE_NONE || partial_render ||
536 true;
537
538 zls_control.z_load_enable =
539 attach_z->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || partial_render ||
540 incomplete_render_area;
541
542 if (ail_is_compressed(z_layout)) {
543 zls_control.z_compress_1 = true;
544 zls_control.z_compress_2 = true;
545 }
546
547 if (z_layout->format == PIPE_FORMAT_Z16_UNORM) {
548 zls_control.z_format = AGX_ZLS_FORMAT_16;
549 } else {
550 zls_control.z_format = AGX_ZLS_FORMAT_32F;
551 }
552 }
553
554 if (s_layout) {
555 /* TODO:
556 * dEQP-VK.renderpass.dedicated_allocation.formats.d32_sfloat_s8_uint.input.dont_care.store.self_dep_clear_draw_use_input_aspect
557 * fails without the force.
558 *
559 * Maybe a VkRenderPass emulation bug.
560 */
561 zls_control.s_store_enable =
562 attach_s->storeOp == VK_ATTACHMENT_STORE_OP_STORE ||
563 attach_s->resolveMode != VK_RESOLVE_MODE_NONE || partial_render ||
564 true;
565
566 zls_control.s_load_enable =
567 attach_s->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || partial_render ||
568 incomplete_render_area;
569
570 if (ail_is_compressed(s_layout)) {
571 zls_control.s_compress_1 = true;
572 zls_control.s_compress_2 = true;
573 }
574 }
575 }
576 }
577
578 VKAPI_ATTR void VKAPI_CALL
579 hk_CmdBeginRendering(VkCommandBuffer commandBuffer,
580 const VkRenderingInfo *pRenderingInfo)
581 {
582 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
583 struct hk_rendering_state *render = &cmd->state.gfx.render;
584 struct hk_device *dev = hk_cmd_buffer_device(cmd);
585
586 memset(render, 0, sizeof(*render));
587
588 render->flags = pRenderingInfo->flags;
589 render->area = pRenderingInfo->renderArea;
590 render->view_mask = pRenderingInfo->viewMask;
591 render->layer_count = pRenderingInfo->layerCount;
592 render->tilebuffer.nr_samples = 0;
593
594 const uint32_t layer_count = render->view_mask
595 ? util_last_bit(render->view_mask)
596 : render->layer_count;
597
598 render->color_att_count = pRenderingInfo->colorAttachmentCount;
599 for (uint32_t i = 0; i < render->color_att_count; i++) {
600 hk_attachment_init(&render->color_att[i],
601 &pRenderingInfo->pColorAttachments[i]);
602 }
603
604 hk_attachment_init(&render->depth_att, pRenderingInfo->pDepthAttachment);
605 hk_attachment_init(&render->stencil_att, pRenderingInfo->pStencilAttachment);
606
607 for (uint32_t i = 0; i < render->color_att_count; i++) {
608 hk_merge_render_iview(render, render->color_att[i].iview, false);
609 }
610
611 hk_merge_render_iview(
612 render, render->depth_att.iview ?: render->stencil_att.iview, true);
613
614 /* Infer dimensions for attachmentless rendering; the sample count is inferred at draw-time. */
615 render->cr.width =
616 MAX2(render->cr.width, render->area.offset.x + render->area.extent.width);
617
618 render->cr.height = MAX2(render->cr.height,
619 render->area.offset.y + render->area.extent.height);
620
621 if (!render->cr.zls_width) {
622 render->cr.zls_width = render->cr.width;
623 render->cr.zls_height = render->cr.height;
624 }
625
626 render->cr.layers = layer_count;
627
628 /* Choose a tilebuffer layout given the framebuffer key */
629 enum pipe_format formats[HK_MAX_RTS] = {0};
630 for (unsigned i = 0; i < render->color_att_count; ++i) {
631 formats[i] = hk_format_to_pipe_format(render->color_att[i].vk_format);
632 }
633
634 /* For now, we force layered=true since it makes compatibility problems way
635 * easier.
636 */
637 render->tilebuffer = agx_build_tilebuffer_layout(
638 formats, render->color_att_count, render->tilebuffer.nr_samples, true);
639
640 const VkRenderingAttachmentLocationInfoKHR ral_info = {
641 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
642 .colorAttachmentCount = pRenderingInfo->colorAttachmentCount,
643 };
644 vk_cmd_set_rendering_attachment_locations(&cmd->vk, &ral_info);
645
646 hk_cmd_buffer_dirty_render_pass(cmd);
647
648 /* Determine whether the render area is complete, enabling us to use a
649 * fast-clear.
650 *
651 * TODO: If it is incomplete but tile aligned, it should be possible to fast
652 * clear with the appropriate settings. This is critical for performance.
653 */
654 bool incomplete_render_area =
655 render->area.offset.x > 0 || render->area.offset.y > 0 ||
656 render->area.extent.width < render->cr.width ||
657 render->area.extent.height < render->cr.height ||
658 (render->view_mask &&
659 render->view_mask != BITFIELD64_MASK(render->cr.layers));
660
661 perf_debug(dev, "Rendering %ux%ux%u@%u %s%s", render->cr.width,
662 render->cr.height, render->cr.layers,
663 render->tilebuffer.nr_samples,
664 render->view_mask ? " multiview" : "",
665 incomplete_render_area ? " incomplete" : "");
666
667 render->cr.bg.main = hk_build_bg_eot(cmd, pRenderingInfo, false, false,
668 incomplete_render_area);
669 render->cr.bg.partial =
670 hk_build_bg_eot(cmd, pRenderingInfo, false, true, incomplete_render_area);
671
672 render->cr.eot.main =
673 hk_build_bg_eot(cmd, pRenderingInfo, true, false, incomplete_render_area);
674 render->cr.eot.partial =
675 hk_build_bg_eot(cmd, pRenderingInfo, true, true, incomplete_render_area);
676
677 render->cr.isp_bgobjvals = 0x300;
678
679 const VkRenderingAttachmentInfo *attach_z = pRenderingInfo->pDepthAttachment;
680 const VkRenderingAttachmentInfo *attach_s =
681 pRenderingInfo->pStencilAttachment;
682
683 render->cr.iogpu_unk_214 = 0xc000;
684
685 struct ail_layout *z_layout = NULL, *s_layout = NULL;
686
687 if (attach_z != NULL && attach_z != VK_NULL_HANDLE && attach_z->imageView) {
688 struct hk_image_view *view = render->depth_att.iview;
689 struct hk_image *image =
690 container_of(view->vk.image, struct hk_image, vk);
691
692 z_layout = &image->planes[0].layout;
693
694 unsigned level = view->vk.base_mip_level;
695 unsigned first_layer = view->vk.base_array_layer;
696
697 const struct util_format_description *desc =
698 util_format_description(hk_format_to_pipe_format(view->vk.format));
699
700 assert(desc->format == PIPE_FORMAT_Z32_FLOAT ||
701 desc->format == PIPE_FORMAT_Z16_UNORM ||
702 desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
703
704 render->cr.depth.buffer =
705 hk_image_base_address(image, 0) +
706 ail_get_layer_level_B(z_layout, first_layer, level);
707
708 /* Main stride in pages */
709 assert((z_layout->depth_px == 1 ||
710 is_aligned(z_layout->layer_stride_B, AIL_PAGESIZE)) &&
711 "Page aligned Z layers");
712
713 unsigned stride_pages = z_layout->layer_stride_B / AIL_PAGESIZE;
714 render->cr.depth.stride = ((stride_pages - 1) << 14) | 1;
715
716 assert(z_layout->tiling != AIL_TILING_LINEAR && "must tile");
717
718 if (ail_is_compressed(z_layout)) {
719 render->cr.depth.meta =
720 hk_image_base_address(image, 0) + z_layout->metadata_offset_B +
721 (first_layer * z_layout->compression_layer_stride_B) +
722 z_layout->level_offsets_compressed_B[level];
723
724 /* Meta stride in cache lines */
725 assert(
726 is_aligned(z_layout->compression_layer_stride_B, AIL_CACHELINE) &&
727 "Cacheline aligned Z meta layers");
728
729 unsigned stride_lines =
730 z_layout->compression_layer_stride_B / AIL_CACHELINE;
731 render->cr.depth.meta_stride = (stride_lines - 1) << 14;
732 }
733
734 float clear_depth = attach_z->clearValue.depthStencil.depth;
735
736 if (z_layout->format == PIPE_FORMAT_Z16_UNORM) {
737 render->cr.isp_bgobjdepth = _mesa_float_to_unorm(clear_depth, 16);
738 } else {
739 render->cr.isp_bgobjdepth = fui(clear_depth);
740 }
741 }
742
743 if (attach_s != NULL && attach_s != VK_NULL_HANDLE && attach_s->imageView) {
744 struct hk_image_view *view = render->stencil_att.iview;
745 struct hk_image *image =
746 container_of(view->vk.image, struct hk_image, vk);
747
748 /* Stencil is always the last plane (possibly the only plane) */
749 unsigned plane = image->plane_count - 1;
750 s_layout = &image->planes[plane].layout;
751 assert(s_layout->format == PIPE_FORMAT_S8_UINT);
752
753 unsigned level = view->vk.base_mip_level;
754 unsigned first_layer = view->vk.base_array_layer;
755
756 render->cr.stencil.buffer =
757 hk_image_base_address(image, plane) +
758 ail_get_layer_level_B(s_layout, first_layer, level);
759
760 /* Main stride in pages */
761 assert((s_layout->depth_px == 1 ||
762 is_aligned(s_layout->layer_stride_B, AIL_PAGESIZE)) &&
763 "Page aligned S layers");
764 unsigned stride_pages = s_layout->layer_stride_B / AIL_PAGESIZE;
765 render->cr.stencil.stride = ((stride_pages - 1) << 14) | 1;
766
767 if (ail_is_compressed(s_layout)) {
768 render->cr.stencil.meta =
769 hk_image_base_address(image, plane) + s_layout->metadata_offset_B +
770 (first_layer * s_layout->compression_layer_stride_B) +
771 s_layout->level_offsets_compressed_B[level];
772
773 /* Meta stride in cache lines */
774 assert(
775 is_aligned(s_layout->compression_layer_stride_B, AIL_CACHELINE) &&
776 "Cacheline aligned S meta layers");
777
778 unsigned stride_lines =
779 s_layout->compression_layer_stride_B / AIL_CACHELINE;
780
781 render->cr.stencil.meta_stride = (stride_lines - 1) << 14;
782 }
783
784 render->cr.isp_bgobjvals |= attach_s->clearValue.depthStencil.stencil;
785 }
786
787 hk_pack_zls_control(&render->cr.zls_control, z_layout, s_layout, attach_z,
788 attach_s, incomplete_render_area, false);
789
790 hk_pack_zls_control(&render->cr.zls_control_partial, z_layout, s_layout,
791 attach_z, attach_s, incomplete_render_area, true);
792
793 /* If multiview is disabled, always read 0. If multiview is enabled,
794 * hk_set_view_index will dirty the root each draw.
795 */
796 cmd->state.gfx.descriptors.root.draw.view_index = 0;
797 cmd->state.gfx.descriptors.root_dirty = true;
798
799 if (render->flags & VK_RENDERING_RESUMING_BIT)
800 return;
801
802 /* The first control stream of the render pass is special since it gets
803 * the clears. Create it and swap in the clear.
804 */
805 assert(!cmd->current_cs.gfx && "not already in a render pass");
806 struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, false /* compute */);
807 if (!cs)
808 return;
809
810 cs->cr.bg.main = render->cr.bg.main;
811 cs->cr.zls_control = render->cr.zls_control;
812
813 /* Reordering barrier for post-gfx, in case we had any. */
814 hk_cmd_buffer_end_compute_internal(cmd, &cmd->current_cs.post_gfx);
815
816 /* Don't reorder compute across render passes.
817 *
818 * TODO: Check if this is necessary if the proper PipelineBarriers are
819 * handled... there may be CTS bugs...
820 */
821 hk_cmd_buffer_end_compute(cmd);
822
823 /* If we spill colour attachments, we need to decompress them. This happens
824 * at the start of the render; it is not re-emitted when resuming
825 * secondaries. It could be hoisted to the start of the command buffer but
826 * we're not that clever yet.
827 */
828 if (agx_tilebuffer_spills(&render->tilebuffer)) {
829 perf_debug(dev, "eMRT render pass");
830
831 for (unsigned i = 0; i < render->color_att_count; ++i) {
832 struct hk_image_view *view = render->color_att[i].iview;
833 if (view) {
834 struct hk_image *image =
835 container_of(view->vk.image, struct hk_image, vk);
836
837 /* TODO: YCbCr interaction? */
838 uint8_t plane = 0;
839 uint8_t image_plane = view->planes[plane].image_plane;
840 struct ail_layout *layout = &image->planes[image_plane].layout;
841
842 if (ail_is_level_compressed(layout, view->vk.base_mip_level)) {
843 struct hk_device *dev = hk_cmd_buffer_device(cmd);
844 perf_debug(dev, "Decompressing in-place");
845
846 struct hk_cs *cs = hk_cmd_buffer_get_cs_general(
847 cmd, &cmd->current_cs.pre_gfx, true);
848 if (!cs)
849 return;
850
851 unsigned level = view->vk.base_mip_level;
852 unsigned layer = view->vk.base_array_layer;
853 uint64_t base = hk_image_base_address(image, image_plane);
854
855 struct libagx_decompress_images imgs = {
856 .compressed = view->planes[plane].emrt_texture,
857 .uncompressed = view->planes[plane].emrt_pbe,
858 };
859
860 struct agx_grid grid =
861 agx_3d(ail_metadata_width_tl(layout, level) * 32,
862 ail_metadata_height_tl(layout, level), layer_count);
863
864 libagx_decompress(cs, grid, AGX_BARRIER_ALL, layout, layer,
865 level, base,
866 hk_pool_upload(cmd, &imgs, sizeof(imgs), 64));
867 }
868 }
869 }
870 }
871
872 uint32_t clear_count = 0;
873 VkClearAttachment clear_att[HK_MAX_RTS + 1];
874 bool resolved_clear = false;
875
876 for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
877 const VkRenderingAttachmentInfo *att_info =
878 &pRenderingInfo->pColorAttachments[i];
879 if (att_info->imageView == VK_NULL_HANDLE ||
880 att_info->loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
881 continue;
882
883 clear_att[clear_count++] = (VkClearAttachment){
884 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
885 .colorAttachment = i,
886 .clearValue = att_info->clearValue,
887 };
888
889 resolved_clear |= is_attachment_stored(att_info);
890 }
891
892 clear_att[clear_count] = (VkClearAttachment){
893 .aspectMask = 0,
894 };
895
896 if (attach_z && attach_z->imageView != VK_NULL_HANDLE &&
897 attach_z->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
898 clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
899 clear_att[clear_count].clearValue.depthStencil.depth =
900 attach_z->clearValue.depthStencil.depth;
901
902 resolved_clear |= is_attachment_stored(attach_z);
903 }
904
905 if (attach_s != NULL && attach_s->imageView != VK_NULL_HANDLE &&
906 attach_s->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
907 clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
908 clear_att[clear_count].clearValue.depthStencil.stencil =
909 attach_s->clearValue.depthStencil.stencil;
910
911 resolved_clear |= is_attachment_stored(attach_s);
912 }
913
914 if (clear_att[clear_count].aspectMask != 0)
915 clear_count++;
916
917 if (clear_count > 0 && incomplete_render_area) {
918 const VkClearRect clear_rect = {
919 .rect = render->area,
920 .baseArrayLayer = 0,
921 .layerCount = render->view_mask ? 1 : render->layer_count,
922 };
923
924 hk_CmdClearAttachments(hk_cmd_buffer_to_handle(cmd), clear_count,
925 clear_att, 1, &clear_rect);
926 } else {
927 /* If a tile is empty, we do not want to process it, as the redundant
928 * roundtrip of memory-->tilebuffer-->memory wastes a tremendous amount of
929 * memory bandwidth. Any draw marks a tile as non-empty, so we only need
930 * to process empty tiles if the background+EOT programs have a side
931 * effect. This is the case exactly when there is an attachment we are
932 * fast clearing and then storing.
933 */
934 cs->cr.process_empty_tiles = resolved_clear;
935 }
936 }
937
938 VKAPI_ATTR void VKAPI_CALL
939 hk_CmdEndRendering(VkCommandBuffer commandBuffer)
940 {
941 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
942 struct hk_rendering_state *render = &cmd->state.gfx.render;
943 struct hk_device *dev = hk_cmd_buffer_device(cmd);
944
945 /* The last control stream of the render pass is special since it gets its
946 * stores dropped. Swap it in.
947 */
948 struct hk_cs *cs = cmd->current_cs.gfx;
949 if (cs) {
950 cs->cr.eot.main = render->cr.eot.main;
951 }
952
953 perf_debug(dev, "End rendering");
954 hk_cmd_buffer_end_graphics(cmd);
955
956 bool need_resolve = false;
957
958 /* Translate render state back to VK for meta */
959 VkRenderingAttachmentInfo vk_color_att[HK_MAX_RTS];
960 for (uint32_t i = 0; i < render->color_att_count; i++) {
961 if (render->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE)
962 need_resolve = true;
963
964 vk_color_att[i] = (VkRenderingAttachmentInfo){
965 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
966 .imageView = hk_image_view_to_handle(render->color_att[i].iview),
967 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
968 .resolveMode = render->color_att[i].resolve_mode,
969 .resolveImageView =
970 hk_image_view_to_handle(render->color_att[i].resolve_iview),
971 .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
972 };
973 }
974
975 const VkRenderingAttachmentInfo vk_depth_att = {
976 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
977 .imageView = hk_image_view_to_handle(render->depth_att.iview),
978 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
979 .resolveMode = render->depth_att.resolve_mode,
980 .resolveImageView =
981 hk_image_view_to_handle(render->depth_att.resolve_iview),
982 .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
983 };
984 if (render->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE)
985 need_resolve = true;
986
987 const VkRenderingAttachmentInfo vk_stencil_att = {
988 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
989 .imageView = hk_image_view_to_handle(render->stencil_att.iview),
990 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
991 .resolveMode = render->stencil_att.resolve_mode,
992 .resolveImageView =
993 hk_image_view_to_handle(render->stencil_att.resolve_iview),
994 .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
995 };
996 if (render->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE)
997 need_resolve = true;
998
999 const VkRenderingInfo vk_render = {
1000 .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
1001 .renderArea = render->area,
1002 .layerCount = render->layer_count,
1003 .viewMask = render->view_mask,
1004 .colorAttachmentCount = render->color_att_count,
1005 .pColorAttachments = vk_color_att,
1006 .pDepthAttachment = &vk_depth_att,
1007 .pStencilAttachment = &vk_stencil_att,
1008 };
1009
1010 if (render->flags & VK_RENDERING_SUSPENDING_BIT)
1011 need_resolve = false;
1012
1013 memset(render, 0, sizeof(*render));
1014
1015 if (need_resolve) {
1016 perf_debug(dev, "Resolving render pass, colour store op %u",
1017 vk_color_att[0].storeOp);
1018
1019 hk_meta_resolve_rendering(cmd, &vk_render);
1020 }
1021 }
1022
1023 static uint64_t
1024 hk_geometry_state(struct hk_cmd_buffer *cmd)
1025 {
1026 struct hk_device *dev = hk_cmd_buffer_device(cmd);
1027
1028 /* We tie heap allocation to geometry state allocation, so allocate now. */
1029 if (unlikely(!dev->heap)) {
1030 perf_debug(dev, "Allocating heap");
1031
1032 size_t size = 128 * 1024 * 1024;
1033 dev->heap = agx_bo_create(&dev->dev, size, 0, 0, "Geometry heap");
1034
1035 /* The geometry state buffer is initialized here and then is treated by
1036 * the CPU as rodata, even though the GPU uses it for scratch internally.
1037 */
1038 off_t off = dev->rodata.geometry_state - dev->rodata.bo->va->addr;
1039 struct agx_geometry_state *map = agx_bo_map(dev->rodata.bo) + off;
1040
1041 *map = (struct agx_geometry_state){
1042 .heap = dev->heap->va->addr,
1043 .heap_size = size,
1044 };
1045 }
1046
1047 /* We need to free all allocations after each command buffer execution */
1048 if (!cmd->uses_heap) {
1049 perf_debug(dev, "Freeing heap");
1050 uint64_t addr = dev->rodata.geometry_state;
1051
1052 /* Zeroing the allocated index frees everything */
1053 hk_queue_write(cmd,
1054 addr + offsetof(struct agx_geometry_state, heap_bottom), 0,
1055 true /* after gfx */);
1056
1057 cmd->uses_heap = true;
1058 }
1059
1060 return dev->rodata.geometry_state;
1061 }
1062
1063 static uint64_t
1064 hk_upload_ia_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
1065 {
1066 struct hk_device *dev = hk_cmd_buffer_device(cmd);
1067 assert(!agx_is_indirect(draw.b) && "indirect params written by GPU");
1068
1069 struct agx_ia_state ia = {.verts_per_instance = draw.b.count[0]};
1070
1071 if (draw.indexed) {
1072 unsigned index_size_B = agx_index_size_to_B(draw.index_size);
1073 unsigned range_el = agx_draw_index_range_el(draw);
1074
1075 ia.index_buffer =
1076 libagx_index_buffer(agx_draw_index_buffer(draw), range_el, 0,
1077 index_size_B, dev->rodata.zero_sink);
1078
1079 ia.index_buffer_range_el = range_el;
1080 }
1081
1082 return hk_pool_upload(cmd, &ia, sizeof(ia), 8);
1083 }
1084
1085 static enum mesa_prim
1086 hk_gs_in_prim(struct hk_cmd_buffer *cmd)
1087 {
1088 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1089 struct hk_graphics_state *gfx = &cmd->state.gfx;
1090 struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL];
1091
1092 if (tes != NULL)
1093 return gfx->tess.prim;
1094 else
1095 return vk_conv_topology(dyn->ia.primitive_topology);
1096 }
1097
1098 static enum mesa_prim
1099 hk_rast_prim(struct hk_cmd_buffer *cmd)
1100 {
1101 struct hk_graphics_state *gfx = &cmd->state.gfx;
1102 struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
1103 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1104
1105 if (gs != NULL) {
1106 return gs->variants[HK_GS_VARIANT_RAST].info.gs.out_prim;
1107 } else {
1108 switch (dyn->ia.primitive_topology) {
1109 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
1110 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
1111 return MESA_PRIM_LINES;
1112 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
1113 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
1114 return MESA_PRIM_TRIANGLES;
1115 default:
1116 return hk_gs_in_prim(cmd);
1117 }
1118 }
1119 }
1120
1121 static uint64_t
1122 hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
1123 {
1124 struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
1125 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1126 struct hk_graphics_state *gfx = &cmd->state.gfx;
1127 struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
1128 struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]);
1129
1130 bool rast_disc = dyn->rs.rasterizer_discard_enable;
1131 struct hk_shader *count = hk_count_gs_variant(gs, rast_disc);
1132
1133 /* XXX: We should deduplicate this logic */
1134 bool indirect = agx_is_indirect(draw.b) ||
1135 gfx->shaders[MESA_SHADER_TESS_EVAL] || draw.restart;
1136 enum mesa_prim mode = hk_gs_in_prim(cmd);
1137
1138 if (draw.restart) {
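/* Primitive restart is unrolled into list topologies (see
 * hk_draw_without_restart), so the GS input topology is the decomposed
 * primitive type.
 */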
1139 mode = u_decomposed_prim(mode);
1140 }
1141
1142 struct agx_geometry_params params = {
1143 .state = hk_geometry_state(cmd),
1144 .indirect_desc = cmd->geom_indirect,
1145 .flat_outputs = fs ? fs->info.fs.interp.flat : 0,
1146 .input_topology = mode,
1147
1148 /* Overridden by the indirect setup kernel. As tess->GS is always indirect,
1149 * we can assume here that we're VS->GS.
1150 */
1151 .input_buffer = desc->root.draw.vertex_output_buffer,
1152 .input_mask = desc->root.draw.vertex_outputs,
1153 };
1154
1155 if (gfx->xfb_enabled) {
1156 for (unsigned i = 0; i < ARRAY_SIZE(gfx->xfb); ++i) {
1157 params.xfb_base_original[i] = gfx->xfb[i].addr;
1158 params.xfb_size[i] = gfx->xfb[i].range;
1159 params.xfb_offs_ptrs[i] = gfx->xfb_offsets + i * sizeof(uint32_t);
1160 }
1161 }
1162
1163 for (unsigned i = 0; i < ARRAY_SIZE(gfx->xfb_query); ++i) {
1164 uint64_t q = gfx->xfb_query[i];
1165
1166 if (q) {
1167 params.xfb_prims_generated_counter[i] = q;
1168 params.prims_generated_counter[i] = q + sizeof(uint64_t);
1169 }
1170 }
1171
1172 /* Calculate input primitive count for direct draws, and allocate the vertex
1173 * & count buffers. GPU calculates and allocates for indirect draws.
1174 */
1175 params.count_buffer_stride = count->info.gs.count_words * 4;
1176
1177 if (indirect) {
1178 params.vs_grid[2] = params.gs_grid[2] = 1;
1179 } else {
1180 uint32_t verts = draw.b.count[0], instances = draw.b.count[1];
1181
1182 params.vs_grid[0] = verts;
1183 params.gs_grid[0] = u_decomposed_prims_for_vertices(mode, verts);
1184
1185 params.primitives_log2 = util_logbase2_ceil(params.gs_grid[0]);
1186 params.input_primitives = params.gs_grid[0] * instances;
1187
1188 unsigned size = params.input_primitives * params.count_buffer_stride;
1189 if (size) {
1190 params.count_buffer = hk_pool_alloc(cmd, size, 4).gpu;
1191 }
1192 }
1193
1194 desc->root_dirty = true;
1195 return hk_pool_upload(cmd, &params, sizeof(params), 8);
1196 }
1197
1198 static void
1199 hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct libagx_tess_args *out,
1200 struct agx_draw draw)
1201 {
1202 struct hk_device *dev = hk_cmd_buffer_device(cmd);
1203 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1204 struct hk_graphics_state *gfx = &cmd->state.gfx;
1205 struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]);
1206
1207 enum libagx_tess_partitioning partitioning =
1208 gfx->tess.info.spacing == TESS_SPACING_EQUAL
1209 ? LIBAGX_TESS_PARTITIONING_INTEGER
1210 : gfx->tess.info.spacing == TESS_SPACING_FRACTIONAL_ODD
1211 ? LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD
1212 : LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN;
1213
1214 struct libagx_tess_args args = {
1215 .heap = hk_geometry_state(cmd),
1216 .tcs_stride_el = tcs->info.tess.tcs_output_stride / 4,
1217 .statistic = hk_pipeline_stat_addr(
1218 cmd,
1219 VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT),
1220
1221 .input_patch_size = dyn->ts.patch_control_points,
1222 .output_patch_size = tcs->info.tess.tcs_output_patch_size,
1223 .tcs_patch_constants = tcs->info.tess.tcs_nr_patch_outputs,
1224 .tcs_per_vertex_outputs = tcs->info.tess.tcs_per_vertex_outputs,
1225 .partitioning = partitioning,
1226 .points_mode = gfx->tess.info.points,
1227 };
1228
1229 if (!args.points_mode && gfx->tess.info.mode != TESS_PRIMITIVE_ISOLINES) {
1230 args.ccw = gfx->tess.info.ccw;
1231 args.ccw ^=
1232 dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT;
1233 }
1234
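/* The tessellator emits one indexed indirect draw descriptor per API draw:
 * 5 consecutive 32-bit words, laid out like VkDrawIndexedIndirectCommand.
 */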
1235 uint32_t draw_stride_el = 5;
1236 size_t draw_stride_B = draw_stride_el * sizeof(uint32_t);
1237
1238 /* heap is allocated by hk_geometry_state */
1239 args.patch_coord_buffer = dev->heap->va->addr;
1240
1241 if (!agx_is_indirect(draw.b)) {
1242 unsigned in_patches = draw.b.count[0] / args.input_patch_size;
1243 unsigned unrolled_patches = in_patches * draw.b.count[1];
1244
1245 uint32_t alloc = 0;
1246 uint32_t tcs_out_offs = alloc;
1247 alloc += unrolled_patches * args.tcs_stride_el * 4 * 32;
1248
1249 uint32_t patch_coord_offs = alloc;
1250 alloc += unrolled_patches * 4 * 32;
1251
1252 uint32_t count_offs = alloc;
1253 alloc += unrolled_patches * sizeof(uint32_t) * 32;
1254
1255 /* Single API draw */
1256 uint32_t draw_offs = alloc;
1257 alloc += draw_stride_B;
1258
1259 struct agx_ptr blob = hk_pool_alloc(cmd, alloc, 4);
1260 args.tcs_buffer = blob.gpu + tcs_out_offs;
1261 args.patches_per_instance = in_patches;
1262 args.coord_allocs = blob.gpu + patch_coord_offs;
1263 args.nr_patches = unrolled_patches;
1264 args.out_draws = blob.gpu + draw_offs;
1265 args.counts = blob.gpu + count_offs;
1266 } else {
1267 /* Allocate 3x indirect global+local grids for VS/TCS/tess */
1268 uint32_t grid_stride = sizeof(uint32_t) * 6;
1269 gfx->tess.grids = hk_pool_alloc(cmd, grid_stride * 3, 4).gpu;
1270
1271 args.out_draws = hk_pool_alloc(cmd, draw_stride_B, 4).gpu;
1272 }
1273
1274 gfx->tess.out_draws = args.out_draws;
1275 memcpy(out, &args, sizeof(args));
1276 }
1277
1278 static struct hk_api_shader *
1279 hk_build_meta_shader_locked(struct hk_device *dev, struct hk_internal_key *key,
1280 hk_internal_builder_t builder)
1281 {
1282 /* Try to get the cached shader */
1283 struct hash_entry *ent = _mesa_hash_table_search(dev->kernels.ht, key);
1284 if (ent)
1285 return ent->data;
1286
1287 nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
1288 &agx_nir_options, NULL);
1289 builder(&b, key->key);
1290
1291 const struct vk_pipeline_robustness_state rs = {
1292 .images = VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DISABLED_EXT,
1293 .storage_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
1294 .uniform_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
1295 .vertex_inputs = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
1296 };
1297
1298 struct vk_shader_compile_info info = {
1299 .stage = b.shader->info.stage,
1300 .nir = b.shader,
1301 .robustness = &rs,
1302 };
1303
1304 hk_preprocess_nir_internal(dev->vk.physical, b.shader);
1305
1306 struct hk_api_shader *s;
1307 if (hk_compile_shader(dev, &info, NULL, NULL, &s) != VK_SUCCESS)
1308 return NULL;
1309
1310 /* ..and cache it before we return. The key is on the stack right now, so
1311 * clone it before using it as a hash table key. The clone is logically owned
1312 * by the hash table.
1313 */
1314 size_t total_key_size = sizeof(*key) + key->key_size;
1315 void *cloned_key = ralloc_memdup(dev->kernels.ht, key, total_key_size);
1316
1317 _mesa_hash_table_insert(dev->kernels.ht, cloned_key, s);
1318 return s;
1319 }
1320
1321 struct hk_api_shader *
1322 hk_meta_shader(struct hk_device *dev, hk_internal_builder_t builder, void *data,
1323 size_t data_size)
1324 {
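/* Meta shader keys are variable length: a fixed header followed by
 * builder-specific data, hashed and compared as a single blob.
 */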
1325 size_t total_key_size = sizeof(struct hk_internal_key) + data_size;
1326
1327 struct hk_internal_key *key = alloca(total_key_size);
1328 key->builder = builder;
1329 key->key_size = data_size;
1330
1331 if (data_size)
1332 memcpy(key->key, data, data_size);
1333
1334 simple_mtx_lock(&dev->kernels.lock);
1335 struct hk_api_shader *s = hk_build_meta_shader_locked(dev, key, builder);
1336 simple_mtx_unlock(&dev->kernels.lock);
1337
1338 return s;
1339 }
1340
1341 static struct agx_draw
1342 hk_draw_as_indexed_indirect(struct hk_cmd_buffer *cmd, struct agx_draw draw)
1343 {
1344 assert(draw.indexed);
1345
1346 if (agx_is_indirect(draw.b))
1347 return draw;
1348
1349 VkDrawIndexedIndirectCommand desc = {
1350 .indexCount = draw.b.count[0],
1351 .instanceCount = draw.b.count[1],
1352 .firstIndex = draw.start,
1353 .vertexOffset = draw.index_bias,
1354 .firstInstance = draw.start_instance,
1355 };
1356
1357 return agx_draw_indexed_indirect(
1358 hk_pool_upload(cmd, &desc, sizeof(desc), 4), draw.index_buffer,
1359 draw.index_buffer_range_B, draw.index_size, draw.restart);
1360 }
1361
1362 static struct agx_draw
1363 hk_draw_without_restart(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
1364 struct agx_draw draw, uint32_t draw_count)
1365 {
1366 struct hk_device *dev = hk_cmd_buffer_device(cmd);
1367 struct hk_graphics_state *gfx = &cmd->state.gfx;
1368 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1369
1370 perf_debug(dev, "Unrolling primitive restart due to GS/XFB");
1371
1372 /* The unroll kernel assumes an indirect draw. Synthesize one if needed */
1373 draw = hk_draw_as_indexed_indirect(cmd, draw);
1374
1375 /* Next, we unroll the index buffer used by the indirect draw */
1376 enum mesa_prim prim = vk_conv_topology(dyn->ia.primitive_topology);
1377
1378 assert(draw_count == 1 && "TODO: multidraw");
1379
1380 struct libagx_unroll_restart_args ia = {
1381 .heap = hk_geometry_state(cmd),
1382 .index_buffer = draw.index_buffer,
1383 .in_draw = draw.b.ptr,
1384 .out_draw = hk_pool_alloc(cmd, 5 * sizeof(uint32_t) * draw_count, 4).gpu,
1385 .max_draws = 1 /* TODO: MDI */,
1386 .restart_index = gfx->index.restart,
1387 .index_buffer_size_el = agx_draw_index_range_el(draw),
1388 .flatshade_first =
1389 dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT,
1390 .zero_sink = dev->rodata.zero_sink,
1391 };
1392
1393 libagx_unroll_restart_struct(cs, agx_1d(1024 * draw_count), AGX_BARRIER_ALL,
1394 ia, draw.index_size, libagx_compact_prim(prim));
1395
1396 return agx_draw_indexed_indirect(ia.out_draw, dev->heap->va->addr,
1397 dev->heap->size, draw.index_size,
1398 false /* restart */);
1399 }
1400
1401 static struct agx_draw
1402 hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
1403 struct agx_draw draw)
1404 {
1405 struct hk_device *dev = hk_cmd_buffer_device(cmd);
1406 struct hk_graphics_state *gfx = &cmd->state.gfx;
1407 struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
1408 struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
1409 struct agx_grid grid_vs, grid_gs;
1410
1411 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1412 bool rast_disc = dyn->rs.rasterizer_discard_enable;
1413
1414 hk_ensure_cs_has_space(cmd, cs, 0x2000 /*XXX*/);
1415
1416 struct hk_shader *vs = hk_bound_sw_vs_before_gs(gfx);
1417 struct hk_shader *main = hk_main_gs_variant(gs, rast_disc);
1418 struct hk_shader *count = hk_count_gs_variant(gs, rast_disc);
1419 struct hk_shader *pre_gs = hk_pre_gs_variant(gs, rast_disc);
1420
1421 uint64_t geometry_params = desc->root.draw.geometry_params;
1422 unsigned count_words = count->info.gs.count_words;
1423
1424 if (false /* TODO */)
1425 perf_debug(dev, "Transform feedbck");
1426 else if (count_words)
1427 perf_debug(dev, "Geometry shader with counts");
1428 else
1429 perf_debug(dev, "Geometry shader without counts");
1430
1431 enum mesa_prim mode = hk_gs_in_prim(cmd);
1432
1433 if (draw.restart) {
1434 draw = hk_draw_without_restart(cmd, cs, draw, 1);
1435 mode = u_decomposed_prim(mode);
1436 }
1437
1438 /* Setup grids */
1439 if (agx_is_indirect(draw.b)) {
1440 struct libagx_gs_setup_indirect_args gsi = {
1441 .index_buffer = draw.index_buffer,
1442 .zero_sink = dev->rodata.zero_sink,
1443 .draw = draw.b.ptr,
1444 .ia = desc->root.draw.input_assembly,
1445 .p = desc->root.draw.geometry_params,
1446 .vs_outputs = vs->b.info.outputs,
1447 .prim = mode,
1448 };
1449
1450 if (cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]) {
1451 gsi.vertex_buffer = desc->root.draw.tess_params +
1452 offsetof(struct libagx_tess_args, tes_buffer);
1453 } else {
1454 gsi.vertex_buffer = desc->root.root_desc_addr +
1455 offsetof(struct hk_root_descriptor_table,
1456 draw.vertex_output_buffer);
1457 }
1458
1459 if (draw.indexed) {
1460 gsi.index_size_B = agx_index_size_to_B(draw.index_size);
1461 gsi.index_buffer_range_el = agx_draw_index_range_el(draw);
1462 }
1463
1464 libagx_gs_setup_indirect_struct(cs, agx_1d(1), AGX_BARRIER_ALL, gsi);
1465
1466 grid_vs = agx_grid_indirect(
1467 geometry_params + offsetof(struct agx_geometry_params, vs_grid));
1468
1469 grid_gs = agx_grid_indirect(
1470 geometry_params + offsetof(struct agx_geometry_params, gs_grid));
1471 } else {
1472 grid_vs = grid_gs = draw.b;
1473 grid_gs.count[0] = u_decomposed_prims_for_vertices(mode, draw.b.count[0]);
1474 }
1475
1476 /* Launch the vertex shader first */
1477 hk_reserve_scratch(cmd, cs, vs);
1478 hk_dispatch_with_usc(dev, cs, &vs->b.info,
1479 hk_upload_usc_words(cmd, vs,
1480 vs->info.stage == MESA_SHADER_VERTEX
1481 ? gfx->linked[MESA_SHADER_VERTEX]
1482 : vs->only_linked),
1483 grid_vs, agx_workgroup(1, 1, 1));
1484
1485 /* If we need counts, launch the count shader and prefix sum the results. */
1486 if (count_words) {
1487 hk_dispatch_with_local_size(cmd, cs, count, grid_gs,
1488 agx_workgroup(1, 1, 1));
1489
1490 libagx_prefix_sum_geom(cs, agx_1d(1024 * count_words), AGX_BARRIER_ALL,
1491 geometry_params);
1492 }
1493
1494 /* Pre-GS shader */
1495 hk_dispatch_with_local_size(cmd, cs, pre_gs, agx_1d(1),
1496 agx_workgroup(1, 1, 1));
1497
1498 /* Pre-rast geometry shader */
1499 hk_dispatch_with_local_size(cmd, cs, main, grid_gs, agx_workgroup(1, 1, 1));
1500
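/* The pre-rast GS wrote an indexed indirect draw sourced from the heap;
 * rasterize it with primitive restart for every topology except points.
 */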
1501 bool restart = cmd->state.gfx.topology != AGX_PRIMITIVE_POINTS;
1502 return agx_draw_indexed_indirect(cmd->geom_indirect, dev->heap->va->addr,
1503 dev->heap->size, AGX_INDEX_SIZE_U32,
1504 restart);
1505 }
1506
1507 static struct agx_draw
1508 hk_launch_tess(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
1509 struct agx_draw draw)
1510 {
1511 struct hk_device *dev = hk_cmd_buffer_device(cmd);
1512 struct hk_graphics_state *gfx = &cmd->state.gfx;
1513 struct agx_grid grid_vs, grid_tcs, grid_tess;
1514
1515 struct hk_shader *vs = hk_bound_sw_vs(gfx);
1516 struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]);
1517
1518 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1519 uint32_t input_patch_size = dyn->ts.patch_control_points;
1520 uint64_t state = gfx->descriptors.root.draw.tess_params;
1521 struct hk_tess_info info = gfx->tess.info;
1522
1523 hk_ensure_cs_has_space(cmd, cs, 0x2000 /*XXX*/);
1524
1525 perf_debug(dev, "Tessellation");
1526
1527 uint64_t tcs_stat = hk_pipeline_stat_addr(
1528 cmd, VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT);
1529
1530 /* Setup grids */
1531 if (agx_is_indirect(draw.b)) {
1532 perf_debug(dev, "Indirect tessellation");
1533
1534 struct libagx_tess_setup_indirect_args args = {
1535 .p = state,
1536 .grids = gfx->tess.grids,
1537 .indirect = draw.b.ptr,
1538 .ia = gfx->descriptors.root.draw.input_assembly,
1539 .vertex_outputs = vs->b.info.outputs,
1540 .vertex_output_buffer_ptr =
1541 gfx->root + offsetof(struct hk_root_descriptor_table,
1542 draw.vertex_output_buffer),
1543 .tcs_statistic = hk_pipeline_stat_addr(
1544 cmd,
1545 VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT),
1546 };
1547
1548 if (draw.indexed) {
1549 args.in_index_buffer = draw.index_buffer;
1550 args.in_index_size_B = agx_index_size_to_B(draw.index_size);
1551 args.in_index_buffer_range_el = agx_draw_index_range_el(draw);
1552 }
1553
1554 libagx_tess_setup_indirect_struct(cs, agx_1d(1), AGX_BARRIER_ALL, args);
1555
1556 uint32_t grid_stride = sizeof(uint32_t) * 6;
1557 grid_vs = agx_grid_indirect_local(gfx->tess.grids + 0 * grid_stride);
1558 grid_tcs = agx_grid_indirect_local(gfx->tess.grids + 1 * grid_stride);
1559 grid_tess = agx_grid_indirect_local(gfx->tess.grids + 2 * grid_stride);
1560 } else {
1561 uint32_t patches = draw.b.count[0] / input_patch_size;
1562 grid_vs = grid_tcs = draw.b;
1563
1564 grid_tcs.count[0] = patches * tcs->info.tess.tcs_output_patch_size;
1565 grid_tess = agx_1d(patches * draw.b.count[1]);
1566
1567 /* TCS invocation counter increments once per-patch */
1568 if (tcs_stat) {
1569 perf_debug(dev, "Direct TCS statistic");
1570 libagx_increment_statistic(cs, agx_1d(1), AGX_BARRIER_ALL, tcs_stat,
1571 patches);
1572 }
1573 }
1574
1575 /* First launch the VS and TCS */
1576 hk_reserve_scratch(cmd, cs, vs);
1577 hk_reserve_scratch(cmd, cs, tcs);
1578
1579 hk_dispatch_with_usc(
1580 dev, cs, &vs->b.info,
1581 hk_upload_usc_words(cmd, vs, gfx->linked[MESA_SHADER_VERTEX]), grid_vs,
1582 agx_workgroup(64, 1, 1));
1583
1584 hk_dispatch_with_usc(
1585 dev, cs, &tcs->b.info, hk_upload_usc_words(cmd, tcs, tcs->only_linked),
1586 grid_tcs, agx_workgroup(tcs->info.tess.tcs_output_patch_size, 1, 1));
1587
1588 /* First generate counts, then prefix sum them, and then tessellate. */
1589 libagx_tessellate(cs, grid_tess, AGX_BARRIER_ALL, info.mode,
1590 LIBAGX_TESS_MODE_COUNT, state);
1591
1592 libagx_prefix_sum_tess(cs, agx_1d(1024), AGX_BARRIER_ALL, state);
1593
1594 libagx_tessellate(cs, grid_tess, AGX_BARRIER_ALL, info.mode,
1595 LIBAGX_TESS_MODE_WITH_COUNTS, state);
1596
1597 return agx_draw_indexed_indirect(gfx->tess.out_draws, dev->heap->va->addr,
1598 dev->heap->size, AGX_INDEX_SIZE_U32, false);
1599 }
1600
1601 void
1602 hk_cmd_bind_graphics_shader(struct hk_cmd_buffer *cmd,
1603 const gl_shader_stage stage,
1604 struct hk_api_shader *shader)
1605 {
1606 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1607
1608 assert(stage < ARRAY_SIZE(cmd->state.gfx.shaders));
1609 if (cmd->state.gfx.shaders[stage] == shader)
1610 return;
1611
1612 cmd->state.gfx.shaders[stage] = shader;
1613 cmd->state.gfx.shaders_dirty |= BITFIELD_BIT(stage);
1614
1615 if (stage == MESA_SHADER_FRAGMENT) {
1616 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
1617 }
1618 }
1619
1620 static void
1621 hk_flush_shaders(struct hk_cmd_buffer *cmd)
1622 {
1623 if (cmd->state.gfx.shaders_dirty == 0)
1624 return;
1625
1626 struct hk_graphics_state *gfx = &cmd->state.gfx;
1627 struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
1628 desc->root_dirty = true;
1629
1630 /* Geometry shading overrides the restart index, reemit on rebind */
1631 if (IS_SHADER_DIRTY(GEOMETRY)) {
1632 struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
1633
1634 desc->root.draw.api_gs = gs && !gs->is_passthrough;
1635 }
1636
1637 struct hk_shader *hw_vs = hk_bound_hw_vs(gfx);
1638 struct hk_api_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];
1639
1640 /* If we have a new VS/FS pair, UVS locations may have changed, so we need
1641 * to relink. We do this here because there's no dependence on the fast linked
1642 * shaders.
1643 */
1644 agx_assign_uvs(&gfx->linked_varyings, &hw_vs->info.uvs,
1645 fs ? hk_only_variant(fs)->info.fs.interp.flat : 0,
1646 fs ? hk_only_variant(fs)->info.fs.interp.linear : 0);
1647
1648 for (unsigned i = 0; i < VARYING_SLOT_MAX; ++i) {
1649 desc->root.draw.uvs_index[i] = gfx->linked_varyings.slots[i];
1650 }
1651 }
1652
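/*
 * Build or look up a cached prolog/epilog shader part. The key bundles the
 * NIR builder callback with its raw key bytes; on a cache miss we build the
 * NIR, compile it as a secondary (part-only) shader with the AGX backend, and
 * stash the result in the device-level hash table, cloning the key since the
 * caller's copy lives on the stack.
 */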
1653 static struct agx_shader_part *
1654 hk_get_prolog_epilog_locked(struct hk_device *dev, struct hk_internal_key *key,
1655 hk_internal_builder_t builder, bool preprocess_nir,
1656 bool stop, unsigned cf_base)
1657 {
1658 /* Try to get the cached shader */
1659 struct hash_entry *ent = _mesa_hash_table_search(dev->prolog_epilog.ht, key);
1660 if (ent)
1661 return ent->data;
1662
1663 nir_builder b = nir_builder_init_simple_shader(0, &agx_nir_options, NULL);
1664 builder(&b, key->key);
1665
1666 if (preprocess_nir)
1667 agx_preprocess_nir(b.shader, dev->dev.libagx);
1668
1669 struct agx_shader_key backend_key = {
1670 .dev = agx_gather_device_key(&dev->dev),
1671 .libagx = dev->dev.libagx,
1672 .secondary = true,
1673 .no_stop = !stop,
1674 };
1675
1676 /* We always use dynamic sample shading in the GL driver. Indicate that. */
1677 if (b.shader->info.stage == MESA_SHADER_FRAGMENT) {
1678 backend_key.fs.cf_base = cf_base;
1679
1680 if (b.shader->info.fs.uses_sample_shading)
1681 backend_key.fs.inside_sample_loop = true;
1682 }
1683
1684 struct agx_shader_part *part =
1685 rzalloc(dev->prolog_epilog.ht, struct agx_shader_part);
1686
1687 agx_compile_shader_nir(b.shader, &backend_key, NULL, part);
1688
1689 ralloc_free(b.shader);
1690
1691 /* ..and cache it before we return. The key is on the stack right now, so
1692 * clone it before using it as a hash table key. The clone is logically owned
1693 * by the hash table.
1694 */
1695 size_t total_key_size = sizeof(*key) + key->key_size;
1696 void *cloned_key = ralloc_memdup(dev->prolog_epilog.ht, key, total_key_size);
1697
1698 _mesa_hash_table_insert(dev->prolog_epilog.ht, cloned_key, part);
1699 return part;
1700 }
1701
1702 static struct agx_shader_part *
1703 hk_get_prolog_epilog(struct hk_device *dev, void *data, size_t data_size,
1704 hk_internal_builder_t builder, bool preprocess_nir,
1705 bool stop, unsigned cf_base)
1706 {
1707 /* Build the meta shader key */
1708 size_t total_key_size = sizeof(struct hk_internal_key) + data_size;
1709
1710 struct hk_internal_key *key = alloca(total_key_size);
1711 key->builder = builder;
1712 key->key_size = data_size;
1713
1714 if (data_size)
1715 memcpy(key->key, data, data_size);
1716
1717 simple_mtx_lock(&dev->prolog_epilog.lock);
1718
1719 struct agx_shader_part *part = hk_get_prolog_epilog_locked(
1720 dev, key, builder, preprocess_nir, stop, cf_base);
1721
1722 simple_mtx_unlock(&dev->prolog_epilog.lock);
1723 return part;
1724 }
1725
1726 static struct hk_linked_shader *
1727 hk_get_fast_linked_locked_vs(struct hk_device *dev, struct hk_shader *shader,
1728 struct hk_fast_link_key_vs *key)
1729 {
1730 struct agx_shader_part *prolog =
1731 hk_get_prolog_epilog(dev, &key->prolog, sizeof(key->prolog),
1732 agx_nir_vs_prolog, false, false, 0);
1733
1734 struct hk_linked_shader *linked =
1735 hk_fast_link(dev, false, shader, prolog, NULL, 0);
1736
1737 struct hk_fast_link_key *key_clone =
1738 ralloc_memdup(shader->linked.ht, key, sizeof(*key));
1739
1740 /* XXX: Fix this higher up the stack */
1741 linked->sw_indexing = !key->prolog.hw || key->prolog.adjacency;
1742 linked->b.uses_base_param |= linked->sw_indexing;
1743
1744 _mesa_hash_table_insert(shader->linked.ht, key_clone, linked);
1745 return linked;
1746 }
1747
1748 static void
1749 build_fs_prolog(nir_builder *b, const void *key)
1750 {
1751 agx_nir_fs_prolog(b, key);
1752
1753 /* Lower load_stat_query_address_agx, needed for FS statistics */
1754 NIR_PASS(_, b->shader, hk_lower_uvs_index, 0);
1755 }
1756
1757 static struct hk_linked_shader *
1758 hk_get_fast_linked_locked_fs(struct hk_device *dev, struct hk_shader *shader,
1759 struct hk_fast_link_key_fs *key)
1760 {
1761 /* TODO: prolog without fs needs to work too... */
1762 bool needs_prolog = key->prolog.statistics ||
1763 key->prolog.cull_distance_size ||
1764 key->prolog.api_sample_mask != 0xff;
1765
1766 struct agx_shader_part *prolog = NULL;
1767 if (needs_prolog) {
1768 prolog = hk_get_prolog_epilog(dev, &key->prolog, sizeof(key->prolog),
1769 build_fs_prolog, false, false,
1770 key->prolog.cf_base);
1771 }
1772
1773 /* If sample shading is used, don't stop at the epilog; the fast linker
1774 * will insert a footer that stops instead.
1775 */
1776 bool epilog_stop = (key->nr_samples_shaded == 0);
1777
1778 struct agx_shader_part *epilog =
1779 hk_get_prolog_epilog(dev, &key->epilog, sizeof(key->epilog),
1780 agx_nir_fs_epilog, true, epilog_stop, 0);
1781
1782 struct hk_linked_shader *linked =
1783 hk_fast_link(dev, true, shader, prolog, epilog, key->nr_samples_shaded);
1784
1785 struct hk_fast_link_key *key_clone =
1786 ralloc_memdup(shader->linked.ht, key, sizeof(*key));
1787
1788 _mesa_hash_table_insert(shader->linked.ht, key_clone, linked);
1789 return linked;
1790 }
1791
1792 /*
1793 * First, look for a fully linked variant. Else, build the required shader
1794 * parts and link.
1795 */
1796 static struct hk_linked_shader *
1797 hk_get_fast_linked(struct hk_device *dev, struct hk_shader *shader, void *key)
1798 {
1799 struct hk_linked_shader *linked;
1800 simple_mtx_lock(&shader->linked.lock);
1801
1802 struct hash_entry *ent = _mesa_hash_table_search(shader->linked.ht, key);
1803
1804 if (ent)
1805 linked = ent->data;
1806 else if (shader->info.stage == MESA_SHADER_VERTEX)
1807 linked = hk_get_fast_linked_locked_vs(dev, shader, key);
1808 else if (shader->info.stage == MESA_SHADER_FRAGMENT)
1809 linked = hk_get_fast_linked_locked_fs(dev, shader, key);
1810 else
1811 unreachable("invalid stage");
1812
1813 simple_mtx_unlock(&shader->linked.lock);
1814 return linked;
1815 }
1816
1817 static void
1818 hk_update_fast_linked(struct hk_cmd_buffer *cmd, struct hk_shader *shader,
1819 void *key)
1820 {
1821 struct hk_device *dev = hk_cmd_buffer_device(cmd);
1822 struct hk_linked_shader *new = hk_get_fast_linked(dev, shader, key);
1823 gl_shader_stage stage = shader->info.stage;
1824
1825 if (cmd->state.gfx.linked[stage] != new) {
1826 cmd->state.gfx.linked[stage] = new;
1827 cmd->state.gfx.linked_dirty |= BITFIELD_BIT(stage);
1828 }
1829 }
1830
1831 static enum agx_polygon_mode
1832 translate_polygon_mode(VkPolygonMode vk_mode)
1833 {
1834 static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_FILL ==
1835 AGX_POLYGON_MODE_FILL);
1836 static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_LINE ==
1837 AGX_POLYGON_MODE_LINE);
1838 static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_POINT ==
1839 AGX_POLYGON_MODE_POINT);
1840
1841 assert(vk_mode <= VK_POLYGON_MODE_POINT);
1842 return (enum agx_polygon_mode)vk_mode;
1843 }
1844
1845 static enum agx_zs_func
1846 translate_compare_op(VkCompareOp vk_mode)
1847 {
1848 static_assert((enum agx_zs_func)VK_COMPARE_OP_NEVER == AGX_ZS_FUNC_NEVER);
1849 static_assert((enum agx_zs_func)VK_COMPARE_OP_LESS == AGX_ZS_FUNC_LESS);
1850 static_assert((enum agx_zs_func)VK_COMPARE_OP_EQUAL == AGX_ZS_FUNC_EQUAL);
1851 static_assert((enum agx_zs_func)VK_COMPARE_OP_LESS_OR_EQUAL ==
1852 AGX_ZS_FUNC_LEQUAL);
1853 static_assert((enum agx_zs_func)VK_COMPARE_OP_GREATER ==
1854 AGX_ZS_FUNC_GREATER);
1855 static_assert((enum agx_zs_func)VK_COMPARE_OP_NOT_EQUAL ==
1856 AGX_ZS_FUNC_NOT_EQUAL);
1857 static_assert((enum agx_zs_func)VK_COMPARE_OP_GREATER_OR_EQUAL ==
1858 AGX_ZS_FUNC_GEQUAL);
1859 static_assert((enum agx_zs_func)VK_COMPARE_OP_ALWAYS == AGX_ZS_FUNC_ALWAYS);
1860
1861 assert(vk_mode <= VK_COMPARE_OP_ALWAYS);
1862 return (enum agx_zs_func)vk_mode;
1863 }
1864
1865 static enum agx_stencil_op
1866 translate_stencil_op(VkStencilOp vk_op)
1867 {
1868 static_assert((enum agx_stencil_op)VK_STENCIL_OP_KEEP ==
1869 AGX_STENCIL_OP_KEEP);
1870 static_assert((enum agx_stencil_op)VK_STENCIL_OP_ZERO ==
1871 AGX_STENCIL_OP_ZERO);
1872 static_assert((enum agx_stencil_op)VK_STENCIL_OP_REPLACE ==
1873 AGX_STENCIL_OP_REPLACE);
1874 static_assert((enum agx_stencil_op)VK_STENCIL_OP_INCREMENT_AND_CLAMP ==
1875 AGX_STENCIL_OP_INCR_SAT);
1876 static_assert((enum agx_stencil_op)VK_STENCIL_OP_DECREMENT_AND_CLAMP ==
1877 AGX_STENCIL_OP_DECR_SAT);
1878 static_assert((enum agx_stencil_op)VK_STENCIL_OP_INVERT ==
1879 AGX_STENCIL_OP_INVERT);
1880 static_assert((enum agx_stencil_op)VK_STENCIL_OP_INCREMENT_AND_WRAP ==
1881 AGX_STENCIL_OP_INCR_WRAP);
1882 static_assert((enum agx_stencil_op)VK_STENCIL_OP_DECREMENT_AND_WRAP ==
1883 AGX_STENCIL_OP_DECR_WRAP);
1884
1885 return (enum agx_stencil_op)vk_op;
1886 }
1887
1888 static void
1889 hk_ppp_push_stencil_face(struct agx_ppp_update *ppp,
1890 struct vk_stencil_test_face_state s, bool enabled)
1891 {
1892 if (enabled) {
1893 agx_ppp_push(ppp, FRAGMENT_STENCIL, cfg) {
1894 cfg.compare = translate_compare_op(s.op.compare);
1895 cfg.write_mask = s.write_mask;
1896 cfg.read_mask = s.compare_mask;
1897
1898 cfg.depth_pass = translate_stencil_op(s.op.pass);
1899 cfg.depth_fail = translate_stencil_op(s.op.depth_fail);
1900 cfg.stencil_fail = translate_stencil_op(s.op.fail);
1901 }
1902 } else {
1903 agx_ppp_push(ppp, FRAGMENT_STENCIL, cfg) {
1904 cfg.compare = AGX_ZS_FUNC_ALWAYS;
1905 cfg.write_mask = 0xFF;
1906 cfg.read_mask = 0xFF;
1907
1908 cfg.depth_pass = AGX_STENCIL_OP_KEEP;
1909 cfg.depth_fail = AGX_STENCIL_OP_KEEP;
1910 cfg.stencil_fail = AGX_STENCIL_OP_KEEP;
1911 }
1912 }
1913 }
1914
1915 static bool
1916 hk_stencil_test_enabled(struct hk_cmd_buffer *cmd)
1917 {
1918 const struct hk_rendering_state *render = &cmd->state.gfx.render;
1919 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1920
1921 return dyn->ds.stencil.test_enable &&
1922 render->stencil_att.vk_format != VK_FORMAT_UNDEFINED;
1923 }
1924
1925 static void
1926 hk_flush_vp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out)
1927 {
1928 const struct vk_dynamic_graphics_state *dyn =
1929 &cmd->vk.dynamic_graphics_state;
1930
1931 /* We always need at least 1 viewport for the hardware. With rasterizer
1932 * discard the app may not supply any, but we can just program garbage.
1933 */
1934 unsigned count = MAX2(dyn->vp.viewport_count, 1);
1935
1936 unsigned minx[HK_MAX_VIEWPORTS] = {0}, miny[HK_MAX_VIEWPORTS] = {0};
1937 unsigned maxx[HK_MAX_VIEWPORTS] = {0}, maxy[HK_MAX_VIEWPORTS] = {0};
1938
1939 /* We implicitly scissor to the viewport. We need to do a min/max dance to
1940 * handle inverted viewports.
1941 */
1942 for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
1943 const VkViewport *vp = &dyn->vp.viewports[i];
1944
1945 minx[i] = MIN2(vp->x, vp->x + vp->width);
1946 miny[i] = MIN2(vp->y, vp->y + vp->height);
1947 maxx[i] = MAX2(vp->x, vp->x + vp->width);
1948 maxy[i] = MAX2(vp->y, vp->y + vp->height);
1949 }
1950
1951 /* Additionally clamp to the framebuffer so we don't rasterize
1952 * off-screen pixels. TODO: Is this necessary? The GL driver does this, but
1953 * it might be cargo-culted at this point.
1954 */
1955 for (unsigned i = 0; i < count; ++i) {
1956 minx[i] = MIN2(minx[i], cmd->state.gfx.render.cr.width);
1957 maxx[i] = MIN2(maxx[i], cmd->state.gfx.render.cr.width);
1958 miny[i] = MIN2(miny[i], cmd->state.gfx.render.cr.height);
1959 maxy[i] = MIN2(maxy[i], cmd->state.gfx.render.cr.height);
1960 }
1961
1962 /* We additionally apply any API scissors */
1963 for (unsigned i = 0; i < dyn->vp.scissor_count; ++i) {
1964 const VkRect2D *s = &dyn->vp.scissors[i];
1965
1966 minx[i] = MAX2(minx[i], s->offset.x);
1967 miny[i] = MAX2(miny[i], s->offset.y);
1968 maxx[i] = MIN2(maxx[i], s->offset.x + s->extent.width);
1969 maxy[i] = MIN2(maxy[i], s->offset.y + s->extent.height);
1970 }
1971
1972 /* Upload a hardware scissor for each viewport, whether there's a
1973 * corresponding API scissor or not.
1974 */
1975 unsigned index = cs->scissor.size / AGX_SCISSOR_LENGTH;
1976 struct agx_scissor_packed *scissors =
1977 util_dynarray_grow_bytes(&cs->scissor, count, AGX_SCISSOR_LENGTH);
1978
1979 for (unsigned i = 0; i < count; ++i) {
1980 const VkViewport *vp = &dyn->vp.viewports[i];
1981
1982 agx_pack(scissors + i, SCISSOR, cfg) {
1983 cfg.min_x = minx[i];
1984 cfg.min_y = miny[i];
1985 cfg.max_x = maxx[i];
1986 cfg.max_y = maxy[i];
1987
1988 /* These settings in conjunction with the PPP control depth clip/clamp
1989 * settings implement depth clip/clamping. Properly setting them
1990 * together is required for conformant depth clip enable.
1991 *
1992 * TODO: Reverse-engineer the finer interactions here.
1993 */
1994 if (dyn->rs.depth_clamp_enable) {
1995 cfg.min_z = MIN2(vp->minDepth, vp->maxDepth);
1996 cfg.max_z = MAX2(vp->minDepth, vp->maxDepth);
1997 } else {
1998 cfg.min_z = 0.0;
1999 cfg.max_z = 1.0;
2000 }
2001 }
2002 }
2003
2004 /* Upload state */
2005 struct AGX_PPP_HEADER present = {
2006 .depth_bias_scissor = true,
2007 .region_clip = true,
2008 .viewport = true,
2009 .viewport_count = count,
2010 };
2011
2012 size_t size = agx_ppp_update_size(&present);
2013 struct agx_ptr T = hk_pool_alloc(cmd, size, 64);
2014 if (!T.cpu)
2015 return;
2016
2017 struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present);
2018
2019 agx_ppp_push(&ppp, DEPTH_BIAS_SCISSOR, cfg) {
2020 cfg.scissor = index;
2021
2022 /* Use the current depth bias, we allocate linearly */
2023 unsigned count = cs->depth_bias.size / AGX_DEPTH_BIAS_LENGTH;
2024 cfg.depth_bias = count ? count - 1 : 0;
2025 };
2026
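   /* Region clip appears to operate at 32x32-pixel granularity, hence the
    * divide / round-up by 32 below.
    */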
2027 for (unsigned i = 0; i < count; ++i) {
2028 agx_ppp_push(&ppp, REGION_CLIP, cfg) {
2029 cfg.enable = true;
2030 cfg.min_x = minx[i] / 32;
2031 cfg.min_y = miny[i] / 32;
2032 cfg.max_x = DIV_ROUND_UP(MAX2(maxx[i], 1), 32);
2033 cfg.max_y = DIV_ROUND_UP(MAX2(maxy[i], 1), 32);
2034 }
2035 }
2036
2037 agx_ppp_push(&ppp, VIEWPORT_CONTROL, cfg)
2038 ;
2039
2040 /* Upload viewports */
2041 for (unsigned i = 0; i < count; ++i) {
2042 const VkViewport *vp = &dyn->vp.viewports[i];
2043
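      /* The hardware viewport is an affine transform from NDC:
       * window = translate + scale * ndc. With translate_x = x + 0.5*w and
       * scale_x = 0.5*w, NDC x in [-1, 1] maps to [x, x + w]. Depth uses the
       * [0, 1] clip range, so translate_z = minDepth and
       * scale_z = maxDepth - minDepth.
       */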
2044 agx_ppp_push(&ppp, VIEWPORT, cfg) {
2045 cfg.translate_x = vp->x + 0.5f * vp->width;
2046 cfg.translate_y = vp->y + 0.5f * vp->height;
2047 cfg.translate_z = vp->minDepth;
2048
2049 cfg.scale_x = vp->width * 0.5f;
2050 cfg.scale_y = vp->height * 0.5f;
2051 cfg.scale_z = vp->maxDepth - vp->minDepth;
2052 }
2053 }
2054
2055 agx_ppp_fini(out, &ppp);
2056 }
2057
2058 static enum agx_object_type
2059 translate_object_type(enum mesa_prim topology)
2060 {
2061 static_assert(MESA_PRIM_LINES < MESA_PRIM_LINE_STRIP);
2062 static_assert(MESA_PRIM_TRIANGLES >= MESA_PRIM_LINE_STRIP);
2063
2064 if (topology == MESA_PRIM_POINTS)
2065 return AGX_OBJECT_TYPE_POINT_SPRITE_UV01;
2066 else if (topology <= MESA_PRIM_LINE_STRIP)
2067 return AGX_OBJECT_TYPE_LINE;
2068 else
2069 return AGX_OBJECT_TYPE_TRIANGLE;
2070 }
2071
2072 static enum agx_primitive
2073 translate_hw_primitive_topology(enum mesa_prim prim)
2074 {
2075 switch (prim) {
2076 case MESA_PRIM_POINTS:
2077 return AGX_PRIMITIVE_POINTS;
2078 case MESA_PRIM_LINES:
2079 return AGX_PRIMITIVE_LINES;
2080 case MESA_PRIM_LINE_STRIP:
2081 return AGX_PRIMITIVE_LINE_STRIP;
2082 case MESA_PRIM_TRIANGLES:
2083 return AGX_PRIMITIVE_TRIANGLES;
2084 case MESA_PRIM_TRIANGLE_STRIP:
2085 return AGX_PRIMITIVE_TRIANGLE_STRIP;
2086 case MESA_PRIM_TRIANGLE_FAN:
2087 return AGX_PRIMITIVE_TRIANGLE_FAN;
2088 default:
2089 unreachable("Invalid hardware primitive topology");
2090 }
2091 }
2092
2093 static inline enum agx_vdm_vertex
2094 translate_vdm_vertex(unsigned vtx)
2095 {
2096 static_assert(AGX_VDM_VERTEX_0 == 0);
2097 static_assert(AGX_VDM_VERTEX_1 == 1);
2098 static_assert(AGX_VDM_VERTEX_2 == 2);
2099
2100 assert(vtx <= 2);
2101 return vtx;
2102 }
2103
2104 static inline enum agx_ppp_vertex
2105 translate_ppp_vertex(unsigned vtx)
2106 {
2107 static_assert(AGX_PPP_VERTEX_0 == 0 + 1);
2108 static_assert(AGX_PPP_VERTEX_1 == 1 + 1);
2109 static_assert(AGX_PPP_VERTEX_2 == 2 + 1);
2110
2111 assert(vtx <= 2);
2112 return vtx + 1;
2113 }
2114
2115 static void
2116 hk_flush_index(struct hk_cmd_buffer *cmd, struct hk_cs *cs)
2117 {
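   /* When a geometry shader (including the passthrough GS) is bound, the
    * rasterized draw is the GS-produced 32-bit indexed indirect draw, which
    * uses ~0 as its restart index regardless of the API index type.
    */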
2118 uint32_t index = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY]
2119 ? BITFIELD_MASK(32)
2120 : cmd->state.gfx.index.restart;
2121
2122 /* VDM State updates are relatively expensive, so only emit them when the
2123 * restart index changes. This is simpler than accurate dirty tracking.
2124 */
2125 if (cs->restart_index != index) {
2126 uint8_t *out = cs->current;
2127 agx_push(out, VDM_STATE, cfg) {
2128 cfg.restart_index_present = true;
2129 }
2130
2131 agx_push(out, VDM_STATE_RESTART_INDEX, cfg) {
2132 cfg.value = index;
2133 }
2134
2135 cs->current = out;
2136 cs->restart_index = index;
2137 }
2138 }
2139
2140 /*
2141 * Return the given sample positions, packed into a 32-bit word with fixed
2142 * point nibbles for each x/y component of the (at most 4) samples. This is
2143 * suitable for programming the PPP_MULTISAMPLECTL control register.
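 *
 * For example, with the standard positions: the single-sample location
 * (0.5, 0.5) packs to 0x88, and the 2-sample locations (0.75, 0.75) and
 * (0.25, 0.25) pack to 0x44cc, matching hk_default_sample_positions().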
2144 */
2145 static uint32_t
2146 hk_pack_ppp_multisamplectrl(const struct vk_sample_locations_state *sl)
2147 {
2148 uint32_t ctrl = 0;
2149
2150 for (int32_t i = sl->per_pixel - 1; i >= 0; i--) {
2151 VkSampleLocationEXT loc = sl->locations[i];
2152
2153 uint32_t x = CLAMP(loc.x, 0.0f, 0.9375f) * 16.0;
2154 uint32_t y = CLAMP(loc.y, 0.0f, 0.9375f) * 16.0;
2155
2156 assert(x <= 15);
2157 assert(y <= 15);
2158
2159 /* Push bytes in reverse order so we can use constant shifts. */
2160 ctrl = (ctrl << 8) | (y << 4) | x;
2161 }
2162
2163 return ctrl;
2164 }
2165
2166 /*
2167 * Return the standard sample positions, prepacked as above for efficiency.
2168 */
2169 uint32_t
2170 hk_default_sample_positions(unsigned nr_samples)
2171 {
2172 switch (nr_samples) {
2173 case 0:
2174 case 1:
2175 return 0x88;
2176 case 2:
2177 return 0x44cc;
2178 case 4:
2179 return 0xeaa26e26;
2180 default:
2181 unreachable("Invalid sample count");
2182 }
2183 }
2184
2185 static void
2186 hk_flush_ppp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out)
2187 {
2188 const struct hk_rendering_state *render = &cmd->state.gfx.render;
2189 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
2190
2191 struct hk_graphics_state *gfx = &cmd->state.gfx;
2192 struct hk_shader *hw_vs = hk_bound_hw_vs(gfx);
2193 struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]);
2194
2195 bool hw_vs_dirty = IS_SHADER_DIRTY(VERTEX) || IS_SHADER_DIRTY(TESS_EVAL) ||
2196 IS_SHADER_DIRTY(GEOMETRY);
2197 bool fs_dirty = IS_SHADER_DIRTY(FRAGMENT);
2198
2199 struct hk_linked_shader *linked_fs = gfx->linked[MESA_SHADER_FRAGMENT];
2200 bool linked_fs_dirty = IS_LINKED_DIRTY(FRAGMENT);
2201
2202 bool varyings_dirty = gfx->dirty & HK_DIRTY_VARYINGS;
2203
2204 bool face_dirty =
2205 IS_DIRTY(DS_DEPTH_TEST_ENABLE) || IS_DIRTY(DS_DEPTH_WRITE_ENABLE) ||
2206 IS_DIRTY(DS_DEPTH_COMPARE_OP) || IS_DIRTY(DS_STENCIL_REFERENCE) ||
2207 IS_DIRTY(RS_LINE_WIDTH) || IS_DIRTY(RS_POLYGON_MODE) || fs_dirty;
2208
2209 bool stencil_face_dirty =
2210 IS_DIRTY(DS_STENCIL_OP) || IS_DIRTY(DS_STENCIL_COMPARE_MASK) ||
2211 IS_DIRTY(DS_STENCIL_WRITE_MASK) || IS_DIRTY(DS_STENCIL_TEST_ENABLE);
2212
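   /* The PPP header doubles as a presence mask: only the words flagged here
    * are appended to the update, so the packet size (and the early-out below)
    * depends on exactly which state is dirty.
    */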
2213 struct AGX_PPP_HEADER dirty = {
2214 .fragment_control =
2215 IS_DIRTY(DS_STENCIL_TEST_ENABLE) || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) ||
2216 IS_DIRTY(RS_DEPTH_BIAS_ENABLE) || gfx->dirty & HK_DIRTY_OCCLUSION,
2217
2218 .fragment_control_2 =
2219 IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || linked_fs_dirty,
2220
2221 .fragment_front_face = face_dirty,
2222 .fragment_front_face_2 = fs_dirty || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY),
2223 .fragment_front_stencil = stencil_face_dirty,
2224 .fragment_back_face = face_dirty,
2225 .fragment_back_face_2 = fs_dirty || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY),
2226 .fragment_back_stencil = stencil_face_dirty,
2227 .output_select = hw_vs_dirty || linked_fs_dirty || varyings_dirty,
2228 .varying_counts_32 = varyings_dirty,
2229 .varying_counts_16 = varyings_dirty,
2230 .cull = IS_DIRTY(RS_CULL_MODE) ||
2231 IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) ||
2232 IS_DIRTY(RS_FRONT_FACE) || IS_DIRTY(RS_DEPTH_CLIP_ENABLE) ||
2233 IS_DIRTY(RS_DEPTH_CLAMP_ENABLE) || IS_DIRTY(RS_LINE_MODE) ||
2234 IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) ||
2235 (gfx->dirty & HK_DIRTY_PROVOKING) || IS_SHADER_DIRTY(TESS_CTRL) ||
2236 IS_SHADER_DIRTY(TESS_EVAL) || IS_DIRTY(TS_DOMAIN_ORIGIN),
2237 .cull_2 = varyings_dirty,
2238
2239 /* With a null FS, the fragment shader PPP word is ignored and doesn't
2240 * need to be present.
2241 */
2242 .fragment_shader = fs && (fs_dirty || linked_fs_dirty || varyings_dirty ||
2243 gfx->descriptors.root_dirty),
2244
2245 .occlusion_query = gfx->dirty & HK_DIRTY_OCCLUSION,
2246 .output_size = hw_vs_dirty,
2247 .viewport_count = 1, /* irrelevant */
2248 };
2249
2250 /* Calculate the update size. If it equals the header, there is nothing to
2251 * update so early-exit.
2252 */
2253 size_t size = agx_ppp_update_size(&dirty);
2254 if (size == AGX_PPP_HEADER_LENGTH)
2255 return;
2256
2257 /* Otherwise, allocate enough space for the update and push it. */
2258 assert(size > AGX_PPP_HEADER_LENGTH);
2259
2260 struct agx_ptr T = hk_pool_alloc(cmd, size, 64);
2261 if (!T.cpu)
2262 return;
2263
2264 struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &dirty);
2265
2266 if (dirty.fragment_control) {
2267 agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) {
2268 cfg.visibility_mode = gfx->occlusion.mode;
2269 cfg.stencil_test_enable = hk_stencil_test_enabled(cmd);
2270
2271 /* TODO: Consider optimizing this? */
2272 cfg.two_sided_stencil = cfg.stencil_test_enable;
2273
2274 cfg.depth_bias_enable = dyn->rs.depth_bias.enable &&
2275 gfx->object_type == AGX_OBJECT_TYPE_TRIANGLE;
2276
2277 /* Always enable scissoring so we may scissor to the viewport (TODO:
2278 * optimize this out if the viewport is the default and the app does
2279 * not use the scissor test)
2280 */
2281 cfg.scissor_enable = true;
2282
2283 /* This avoids broken derivatives along primitive edges */
2284 cfg.disable_tri_merging = gfx->object_type != AGX_OBJECT_TYPE_TRIANGLE;
2285 }
2286 }
2287
2288 if (dirty.fragment_control_2) {
2289 if (linked_fs) {
2290 /* Annoyingly, rasterizer_discard seems to be ignored (sometimes?) in the
2291 * main fragment control word and has to be combined into the secondary
2292 * word for reliable behaviour.
2293 */
2294 agx_ppp_push_merged(&ppp, FRAGMENT_CONTROL, cfg,
2295 linked_fs->b.fragment_control) {
2296
2297 cfg.tag_write_disable = dyn->rs.rasterizer_discard_enable;
2298 }
2299 } else {
2300 /* If there is no fragment shader, we must disable tag writes to avoid
2301 * executing the missing shader. This optimizes depth-only passes.
2302 */
2303 agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) {
2304 cfg.tag_write_disable = true;
2305 cfg.pass_type = AGX_PASS_TYPE_OPAQUE;
2306 }
2307 }
2308 }
2309
2310 struct agx_fragment_face_packed fragment_face = {};
2311 struct agx_fragment_face_2_packed fragment_face_2 = {};
2312
2313 if (dirty.fragment_front_face) {
2314 bool has_z = render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2315 bool z_test = has_z && dyn->ds.depth.test_enable;
2316
2317 agx_pack(&fragment_face, FRAGMENT_FACE, cfg) {
2318 cfg.line_width = agx_pack_line_width(dyn->rs.line.width);
2319 cfg.polygon_mode = translate_polygon_mode(dyn->rs.polygon_mode);
2320 cfg.disable_depth_write = !(z_test && dyn->ds.depth.write_enable);
2321
2322 if (z_test && !gfx->descriptors.root.draw.force_never_in_shader)
2323 cfg.depth_function = translate_compare_op(dyn->ds.depth.compare_op);
2324 else
2325 cfg.depth_function = AGX_ZS_FUNC_ALWAYS;
2326 };
2327
2328 agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, fragment_face) {
2329 cfg.stencil_reference = dyn->ds.stencil.front.reference;
2330 }
2331 }
2332
2333 if (dirty.fragment_front_face_2) {
2334 if (fs) {
2335 agx_pack(&fragment_face_2, FRAGMENT_FACE_2, cfg) {
2336 cfg.object_type = gfx->object_type;
2337 }
2338
2339 agx_merge(fragment_face_2, fs->frag_face, FRAGMENT_FACE_2);
2340 agx_ppp_push_packed(&ppp, &fragment_face_2, FRAGMENT_FACE_2);
2341 } else {
2342 agx_ppp_fragment_face_2(&ppp, gfx->object_type, NULL);
2343 }
2344 }
2345
2346 if (dirty.fragment_front_stencil) {
2347 hk_ppp_push_stencil_face(&ppp, dyn->ds.stencil.front,
2348 hk_stencil_test_enabled(cmd));
2349 }
2350
2351 if (dirty.fragment_back_face) {
2352 assert(dirty.fragment_front_face);
2353
2354 agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, fragment_face) {
2355 cfg.stencil_reference = dyn->ds.stencil.back.reference;
2356 }
2357 }
2358
2359 if (dirty.fragment_back_face_2) {
2360 assert(dirty.fragment_front_face_2);
2361
2362 agx_ppp_push_packed(&ppp, &fragment_face_2, FRAGMENT_FACE_2);
2363 }
2364
2365 if (dirty.fragment_back_stencil) {
2366 hk_ppp_push_stencil_face(&ppp, dyn->ds.stencil.back,
2367 hk_stencil_test_enabled(cmd));
2368 }
2369
2370 if (dirty.output_select) {
2371 struct agx_output_select_packed osel = hw_vs->info.uvs.osel;
2372
2373 if (linked_fs) {
2374 agx_ppp_push_merged_blobs(&ppp, AGX_OUTPUT_SELECT_LENGTH, &osel,
2375 &linked_fs->b.osel);
2376 } else {
2377 agx_ppp_push_packed(&ppp, &osel, OUTPUT_SELECT);
2378 }
2379 }
2380
2381 assert(dirty.varying_counts_32 == dirty.varying_counts_16);
2382
2383 if (dirty.varying_counts_32) {
2384 agx_ppp_push_packed(&ppp, &gfx->linked_varyings.counts_32,
2385 VARYING_COUNTS);
2386
2387 agx_ppp_push_packed(&ppp, &gfx->linked_varyings.counts_16,
2388 VARYING_COUNTS);
2389 }
2390
2391 if (dirty.cull) {
2392 agx_ppp_push(&ppp, CULL, cfg) {
2393 cfg.cull_front = dyn->rs.cull_mode & VK_CULL_MODE_FRONT_BIT;
2394 cfg.cull_back = dyn->rs.cull_mode & VK_CULL_MODE_BACK_BIT;
2395 cfg.front_face_ccw = dyn->rs.front_face != VK_FRONT_FACE_CLOCKWISE;
2396
2397 if (gfx->shaders[MESA_SHADER_TESS_CTRL] &&
2398 !gfx->shaders[MESA_SHADER_GEOMETRY]) {
2399 cfg.front_face_ccw ^= gfx->tess.info.ccw;
2400 cfg.front_face_ccw ^= dyn->ts.domain_origin ==
2401 VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT;
2402 }
2403
2404 cfg.flat_shading_vertex = translate_ppp_vertex(gfx->provoking);
2405 cfg.rasterizer_discard = dyn->rs.rasterizer_discard_enable;
2406
2407 /* We do not support unrestricted depth, so clamping is inverted from
2408 * clipping. This implementation seems to pass CTS without unrestricted
2409 * depth support.
2410 *
2411 * TODO: Make sure this is right with gl_FragDepth.
2412 */
2413 cfg.depth_clip = vk_rasterization_state_depth_clip_enable(&dyn->rs);
2414 cfg.depth_clamp = !cfg.depth_clip;
2415
2416 cfg.primitive_msaa =
2417 gfx->object_type == AGX_OBJECT_TYPE_LINE &&
2418 dyn->rs.line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR;
2419 }
2420 }
2421
2422 if (dirty.cull_2) {
2423 agx_ppp_push(&ppp, CULL_2, cfg) {
2424 cfg.needs_primitive_id = gfx->generate_primitive_id;
2425 cfg.clamp_w = true;
2426 }
2427 }
2428
2429 if (dirty.fragment_shader) {
2430 /* TODO: Do less often? */
2431 hk_reserve_scratch(cmd, cs, fs);
2432
2433 agx_ppp_push_packed(&ppp, &linked_fs->fs_counts, FRAGMENT_SHADER_WORD_0);
2434
2435 agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_1, cfg) {
2436 cfg.pipeline = hk_upload_usc_words(cmd, fs, linked_fs);
2437 }
2438
2439 agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_2, cfg) {
2440 cfg.cf_bindings = gfx->varyings;
2441 }
2442
2443 agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_3, cfg)
2444 ;
2445 }
2446
2447 if (dirty.occlusion_query) {
2448 agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY, cfg) {
2449 cfg.index = gfx->occlusion.index;
2450 }
2451 }
2452
2453 if (dirty.output_size) {
2454 agx_ppp_push(&ppp, OUTPUT_SIZE, cfg) {
2455 cfg.count = hw_vs->info.uvs.size;
2456 }
2457 }
2458
2459 agx_ppp_fini(out, &ppp);
2460 }
2461
2462 /*
2463 * Based somewhat on the calculation in the PowerVR driver, and mostly trial &
2464 * error to pass CTS. This is a mess.
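 *
 * The caller divides the API constant bias by this factor: for UNORM depth
 * formats Vulkan expresses the constant factor in units of the minimum
 * resolvable difference (roughly 2^-n for an n-bit format), whereas the
 * hardware appears to want an absolute depth value.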
2465 */
2466 static float
2467 hk_depth_bias_factor(VkFormat format, bool exact, bool force_unorm)
2468 {
2469 if (format == VK_FORMAT_D16_UNORM) {
2470 return exact ? (1 << 16) : (1 << 15);
2471 } else if (force_unorm) {
2472 return exact ? (1ull << 24) : (1ull << 23);
2473 } else {
2474 return 1.0;
2475 }
2476 }
2477
2478 static void
2479 hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
2480 uint32_t draw_id, struct agx_draw draw)
2481 {
2482 struct hk_device *dev = hk_cmd_buffer_device(cmd);
2483 const struct hk_rendering_state *render = &cmd->state.gfx.render;
2484 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
2485
2486 struct hk_graphics_state *gfx = &cmd->state.gfx;
2487
2488 struct hk_shader *hw_vs = hk_bound_hw_vs(gfx);
2489 struct hk_shader *sw_vs = hk_bound_sw_vs(gfx);
2490
2491 if (!vk_dynamic_graphics_state_any_dirty(dyn) && !gfx->dirty &&
2492 !gfx->descriptors.root_dirty && !gfx->shaders_dirty &&
2493 !sw_vs->b.info.uses_draw_id && !sw_vs->b.info.uses_base_param &&
2494 !(gfx->linked[MESA_SHADER_VERTEX] &&
2495 gfx->linked[MESA_SHADER_VERTEX]->b.uses_base_param))
2496 return;
2497
2498 struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
2499
2500 assert(cs->current + 0x1000 < cs->end && "already ensured space");
2501 uint8_t *out = cs->current;
2502
2503 struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]);
2504
2505 bool gt_dirty = IS_SHADER_DIRTY(TESS_CTRL) || IS_SHADER_DIRTY(TESS_EVAL) ||
2506 IS_SHADER_DIRTY(GEOMETRY);
2507 bool vgt_dirty = IS_SHADER_DIRTY(VERTEX) || gt_dirty;
2508 bool fs_dirty = IS_SHADER_DIRTY(FRAGMENT);
2509
2510 if (IS_DIRTY(CB_BLEND_CONSTANTS)) {
2511 static_assert(sizeof(desc->root.draw.blend_constant) ==
2512 sizeof(dyn->cb.blend_constants) &&
2513 "common size");
2514
2515 memcpy(desc->root.draw.blend_constant, dyn->cb.blend_constants,
2516 sizeof(dyn->cb.blend_constants));
2517 desc->root_dirty = true;
2518 }
2519
2520 if (IS_DIRTY(MS_SAMPLE_MASK)) {
2521 desc->root.draw.api_sample_mask = dyn->ms.sample_mask;
2522 desc->root_dirty = true;
2523 }
2524
2525 if (fs_dirty || IS_DIRTY(DS_DEPTH_TEST_ENABLE) ||
2526 IS_DIRTY(DS_DEPTH_COMPARE_OP)) {
2527
2528 const struct hk_rendering_state *render = &cmd->state.gfx.render;
2529 bool has_z = render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2530 bool z_test = has_z && dyn->ds.depth.test_enable;
2531
2532 desc->root.draw.force_never_in_shader =
2533 z_test && dyn->ds.depth.compare_op == VK_COMPARE_OP_NEVER && fs &&
2534 fs->info.fs.writes_memory;
2535
2536 desc->root_dirty = true;
2537 }
2538
2539 /* The main shader must not run tests if the epilog will. */
2540 bool nontrivial_force_early =
2541 fs && (fs->b.info.early_fragment_tests &&
2542 (fs->b.info.writes_sample_mask || fs->info.fs.writes_memory));
2543
2544 bool epilog_discards = dyn->ms.alpha_to_coverage_enable ||
2545 (fs && (fs->info.fs.epilog_key.write_z ||
2546 fs->info.fs.epilog_key.write_s));
2547 epilog_discards &= !nontrivial_force_early;
2548
2549 if (fs_dirty || IS_DIRTY(MS_ALPHA_TO_COVERAGE_ENABLE)) {
2550 desc->root.draw.no_epilog_discard = !epilog_discards ? ~0 : 0;
2551 desc->root_dirty = true;
2552 }
2553
2554 if (IS_DIRTY(VI) || IS_DIRTY(VI_BINDINGS_VALID) ||
2555 IS_DIRTY(VI_BINDING_STRIDES) || vgt_dirty || true /* TODO */) {
2556
2557 struct hk_fast_link_key_vs key = {
2558 .prolog.hw = (sw_vs == hw_vs),
2559
2560 /* FIXME: handle pipeline robustness "properly" */
2561 .prolog.robustness.level =
2562 (dev->vk.enabled_features.robustBufferAccess2 ||
2563 dev->vk.enabled_features.pipelineRobustness)
2564 ? AGX_ROBUSTNESS_D3D
2565 : dev->vk.enabled_features.robustBufferAccess
2566 ? AGX_ROBUSTNESS_GL
2567 : AGX_ROBUSTNESS_DISABLED,
2568
2569 .prolog.robustness.soft_fault = agx_has_soft_fault(&dev->dev),
2570 };
2571
2572 enum mesa_prim prim = vk_conv_topology(dyn->ia.primitive_topology);
2573
2574 if (mesa_prim_has_adjacency(prim)) {
2575 if (draw.restart) {
2576 prim = u_decomposed_prim(prim);
2577 }
2578
2579 key.prolog.adjacency = prim;
2580 }
2581
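      /* Adjacency decomposition and compute (non-hardware) vertex shaders
       * fetch indices in software, so the prolog needs the index size.
       */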
2582 if (key.prolog.adjacency || !key.prolog.hw) {
2583 key.prolog.sw_index_size_B =
2584 draw.indexed ? agx_index_size_to_B(draw.index_size) : 0;
2585 }
2586
2587 static_assert(sizeof(key.prolog.component_mask) ==
2588 sizeof(sw_vs->info.vs.attrib_components_read));
2589 BITSET_COPY(key.prolog.component_mask,
2590 sw_vs->info.vs.attrib_components_read);
2591
2592 u_foreach_bit(a, dyn->vi->attributes_valid) {
2593 struct vk_vertex_attribute_state attr = dyn->vi->attributes[a];
2594
2595 assert(dyn->vi->bindings_valid & BITFIELD_BIT(attr.binding));
2596 struct vk_vertex_binding_state binding =
2597 dyn->vi->bindings[attr.binding];
2598
2599 /* nir_assign_io_var_locations compacts vertex inputs, eliminating
2600 * unused inputs. We need to do the same here to match the locations.
2601 */
2602 unsigned slot =
2603 util_bitcount64(sw_vs->info.vs.attribs_read & BITFIELD_MASK(a));
2604
2605 key.prolog.attribs[slot] = (struct agx_velem_key){
2606 .format = hk_format_to_pipe_format(attr.format),
2607 .stride = dyn->vi_binding_strides[attr.binding],
2608 .divisor = binding.divisor,
2609 .instanced = binding.input_rate == VK_VERTEX_INPUT_RATE_INSTANCE,
2610 };
2611 }
2612
2613 hk_update_fast_linked(cmd, sw_vs, &key);
2614 }
2615
2616 if (IS_DIRTY(VI) || IS_DIRTY(VI_BINDINGS_VALID) || vgt_dirty ||
2617 (gfx->dirty & HK_DIRTY_VB)) {
2618
2619 uint64_t sink = dev->rodata.zero_sink;
2620
2621 unsigned slot = 0;
2622 u_foreach_bit(a, sw_vs->info.vs.attribs_read) {
2623 if (dyn->vi->attributes_valid & BITFIELD_BIT(a)) {
2624 struct vk_vertex_attribute_state attr = dyn->vi->attributes[a];
2625 struct hk_addr_range vb = gfx->vb[attr.binding];
2626
2627 desc->root.draw.attrib_clamps[slot] = agx_calculate_vbo_clamp(
2628 vb.addr, sink, hk_format_to_pipe_format(attr.format), vb.range,
2629 dyn->vi_binding_strides[attr.binding], attr.offset,
2630 &desc->root.draw.attrib_base[slot]);
2631 } else {
2632 desc->root.draw.attrib_base[slot] = sink;
2633 desc->root.draw.attrib_clamps[slot] = 0;
2634 }
2635
2636 ++slot;
2637 }
2638
2639 desc->root_dirty = true;
2640 }
2641
2642 if (vgt_dirty || IS_SHADER_DIRTY(FRAGMENT) ||
2643 IS_DIRTY(MS_RASTERIZATION_SAMPLES) || IS_DIRTY(MS_SAMPLE_MASK) ||
2644 IS_DIRTY(MS_ALPHA_TO_COVERAGE_ENABLE) ||
2645 IS_DIRTY(MS_ALPHA_TO_ONE_ENABLE) || IS_DIRTY(CB_LOGIC_OP) ||
2646 IS_DIRTY(CB_LOGIC_OP_ENABLE) || IS_DIRTY(CB_WRITE_MASKS) ||
2647 IS_DIRTY(CB_COLOR_WRITE_ENABLES) || IS_DIRTY(CB_ATTACHMENT_COUNT) ||
2648 IS_DIRTY(CB_BLEND_ENABLES) || IS_DIRTY(CB_BLEND_EQUATIONS) ||
2649 IS_DIRTY(CB_BLEND_CONSTANTS) ||
2650 desc->root_dirty /* for pipeline stats */ || true) {
2651
2652 unsigned tib_sample_mask = BITFIELD_MASK(dyn->ms.rasterization_samples);
2653 unsigned api_sample_mask = dyn->ms.sample_mask & tib_sample_mask;
2654 bool has_sample_mask = api_sample_mask != tib_sample_mask;
2655
2656 if (hw_vs->info.vs.cull_distance_array_size) {
2657 perf_debug(dev, "Emulating cull distance (size %u, %s a frag shader)",
2658 hw_vs->info.vs.cull_distance_array_size,
2659 fs ? "with" : "without");
2660 }
2661
2662 if (has_sample_mask) {
2663 perf_debug(dev, "Emulating sample mask (%s a frag shader)",
2664 fs ? "with" : "without");
2665 }
2666
2667 if (fs) {
2668 unsigned samples_shaded = 0;
2669 if (fs->info.fs.epilog_key.sample_shading)
2670 samples_shaded = dyn->ms.rasterization_samples;
2671
2672 struct hk_fast_link_key_fs key = {
2673 .prolog.statistics = hk_pipeline_stat_addr(
2674 cmd,
2675 VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT),
2676
2677 .prolog.cull_distance_size =
2678 hw_vs->info.vs.cull_distance_array_size,
2679 .prolog.api_sample_mask = has_sample_mask ? api_sample_mask : 0xff,
2680 .nr_samples_shaded = samples_shaded,
2681 };
2682
2683 bool prolog_discards =
2684 has_sample_mask || key.prolog.cull_distance_size;
2685
2686 bool needs_prolog = key.prolog.statistics || prolog_discards;
2687
2688 if (needs_prolog) {
2689 /* With late main shader tests, the prolog runs tests if neither the
2690 * main shader nor epilog will.
2691 *
2692 * With (nontrivial) early main shader tests, the prolog does not
2693 * run tests, the tests will run at the start of the main shader.
2694 * This ensures tests are after API sample mask and cull distance
2695 * discards.
2696 */
2697 key.prolog.run_zs_tests = !nontrivial_force_early &&
2698 !fs->b.info.writes_sample_mask &&
2699 !epilog_discards && prolog_discards;
2700
2701 if (key.prolog.cull_distance_size) {
2702 key.prolog.cf_base = fs->b.info.varyings.fs.nr_cf;
2703 }
2704 }
2705
2706 key.epilog = (struct agx_fs_epilog_key){
2707 .link = fs->info.fs.epilog_key,
2708 .nr_samples = MAX2(dyn->ms.rasterization_samples, 1),
2709 .blend.alpha_to_coverage = dyn->ms.alpha_to_coverage_enable,
2710 .blend.alpha_to_one = dyn->ms.alpha_to_one_enable,
2711 .blend.logicop_func = dyn->cb.logic_op_enable
2712 ? vk_logic_op_to_pipe(dyn->cb.logic_op)
2713 : PIPE_LOGICOP_COPY,
2714 };
2715
2716 for (unsigned rt = 0; rt < ARRAY_SIZE(dyn->cal.color_map); ++rt) {
2717 int map = dyn->cal.color_map[rt];
2718 key.epilog.remap[rt] = map == MESA_VK_ATTACHMENT_UNUSED ? -1 : map;
2719 }
2720
2721 if (dyn->ms.alpha_to_one_enable || dyn->ms.alpha_to_coverage_enable ||
2722 dyn->cb.logic_op_enable) {
2723
2724 perf_debug(
2725 dev, "Epilog with%s%s%s",
2726 dyn->ms.alpha_to_one_enable ? " alpha-to-one" : "",
2727 dyn->ms.alpha_to_coverage_enable ? " alpha-to-coverage" : "",
2728 dyn->cb.logic_op_enable ? " logic-op" : "");
2729 }
2730
2731 key.epilog.link.already_ran_zs |= nontrivial_force_early;
2732
2733 struct hk_rendering_state *render = &cmd->state.gfx.render;
2734 for (uint32_t i = 0; i < render->color_att_count; i++) {
2735 key.epilog.rt_formats[i] =
2736 hk_format_to_pipe_format(render->color_att[i].vk_format);
2737
2738 const struct vk_color_blend_attachment_state *cb =
2739 &dyn->cb.attachments[i];
2740
2741 bool write_enable = dyn->cb.color_write_enables & BITFIELD_BIT(i);
2742 unsigned write_mask = write_enable ? cb->write_mask : 0;
2743
2744 /* nir_lower_blend always blends, so use a default blend state when
2745 * blending is disabled at an API level.
2746 */
2747 if (!dyn->cb.attachments[i].blend_enable) {
2748 key.epilog.blend.rt[i] = (struct agx_blend_rt_key){
2749 .colormask = write_mask,
2750 .rgb_func = PIPE_BLEND_ADD,
2751 .alpha_func = PIPE_BLEND_ADD,
2752 .rgb_src_factor = PIPE_BLENDFACTOR_ONE,
2753 .alpha_src_factor = PIPE_BLENDFACTOR_ONE,
2754 .rgb_dst_factor = PIPE_BLENDFACTOR_ZERO,
2755 .alpha_dst_factor = PIPE_BLENDFACTOR_ZERO,
2756 };
2757 } else {
2758 key.epilog.blend.rt[i] = (struct agx_blend_rt_key){
2759 .colormask = write_mask,
2760
2761 .rgb_src_factor =
2762 vk_blend_factor_to_pipe(cb->src_color_blend_factor),
2763
2764 .rgb_dst_factor =
2765 vk_blend_factor_to_pipe(cb->dst_color_blend_factor),
2766
2767 .rgb_func = vk_blend_op_to_pipe(cb->color_blend_op),
2768
2769 .alpha_src_factor =
2770 vk_blend_factor_to_pipe(cb->src_alpha_blend_factor),
2771
2772 .alpha_dst_factor =
2773 vk_blend_factor_to_pipe(cb->dst_alpha_blend_factor),
2774
2775 .alpha_func = vk_blend_op_to_pipe(cb->alpha_blend_op),
2776 };
2777 }
2778 }
2779
2780 hk_update_fast_linked(cmd, fs, &key);
2781 } else {
2782 /* TODO: prolog without fs needs to work too... */
2783 if (cmd->state.gfx.linked[MESA_SHADER_FRAGMENT] != NULL) {
2784 cmd->state.gfx.linked_dirty |= BITFIELD_BIT(MESA_SHADER_FRAGMENT);
2785 cmd->state.gfx.linked[MESA_SHADER_FRAGMENT] = NULL;
2786 }
2787 }
2788 }
2789
2790 /* If the vertex shader uses draw parameters, vertex uniforms are dirty every
2791 * draw. Fragment uniforms are unaffected.
2792 *
2793 * For a direct draw, we upload the draw parameters as-if indirect to
2794 * avoid keying to indirectness.
2795 */
2796 if (gfx->linked[MESA_SHADER_VERTEX]->b.uses_base_param) {
2797 if (agx_is_indirect(draw.b)) {
2798 gfx->draw_params = draw.b.ptr;
2799
2800 if (draw.indexed) {
2801 gfx->draw_params +=
2802 offsetof(VkDrawIndexedIndirectCommand, vertexOffset);
2803 } else {
2804 gfx->draw_params += offsetof(VkDrawIndirectCommand, firstVertex);
2805 }
2806 } else {
2807 uint32_t params[] = {
2808 draw.indexed ? draw.index_bias : draw.start,
2809 draw.start_instance,
2810 };
2811
2812 gfx->draw_params = hk_pool_upload(cmd, params, sizeof(params), 4);
2813 }
2814 } else {
2815 gfx->draw_params = 0;
2816 }
2817
2818 if (sw_vs->b.info.uses_draw_id) {
2819 /* TODO: rodata? */
2820 gfx->draw_id_ptr = hk_pool_upload(cmd, &draw_id, 2, 4);
2821 } else {
2822 gfx->draw_id_ptr = 0;
2823 }
2824
2825 if (IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || gt_dirty) {
2826 enum mesa_prim prim = hk_rast_prim(cmd);
2827
2828 gfx->topology = translate_hw_primitive_topology(prim);
2829 gfx->object_type = translate_object_type(prim);
2830 }
2831
2832 if (IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || IS_DIRTY(RS_PROVOKING_VERTEX)) {
2833 unsigned provoking;
2834 if (dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT)
2835 provoking = 2;
2836 else if (gfx->topology == AGX_PRIMITIVE_TRIANGLE_FAN)
2837 provoking = 1;
2838 else
2839 provoking = 0;
2840
2841 if (provoking != gfx->provoking) {
2842 gfx->provoking = provoking;
2843 gfx->dirty |= HK_DIRTY_PROVOKING;
2844
2845 gfx->descriptors.root.draw.provoking = provoking;
2846 gfx->descriptors.root_dirty = true;
2847 }
2848 }
2849
2850 /* With attachmentless rendering, we don't know the sample count until draw
2851 * time, so we do a late tilebuffer fix up. But with rasterizer discard,
2852 * rasterization_samples might be 0.
2853 *
2854 * Note that we ignore dyn->ms.rasterization_samples when we do have a sample
2855 * count from an attachment. In Vulkan, these have to match anyway, but DX12
2856 * drivers are robust against this scenario and vkd3d-proton will go out of
2857 * spec here. No reason we can't be robust here too.
2858 */
2859 if (dyn->ms.rasterization_samples && !gfx->render.tilebuffer.nr_samples) {
2860 agx_tilebuffer_set_samples(&gfx->render.tilebuffer,
2861 dyn->ms.rasterization_samples);
2862
2863 cs->tib = gfx->render.tilebuffer;
2864 }
2865
2866 if (IS_DIRTY(MS_SAMPLE_LOCATIONS) || IS_DIRTY(MS_SAMPLE_LOCATIONS_ENABLE) ||
2867 IS_DIRTY(MS_RASTERIZATION_SAMPLES)) {
2868
2869 uint32_t ctrl;
2870 if (dyn->ms.sample_locations_enable) {
2871 ctrl = hk_pack_ppp_multisamplectrl(dyn->ms.sample_locations);
2872 } else {
2873 ctrl = hk_default_sample_positions(dyn->ms.rasterization_samples);
2874 }
2875
2876 bool dont_commit = cmd->in_meta || dyn->ms.rasterization_samples == 0;
2877
2878 if (!cs->has_sample_locations) {
2879 cs->ppp_multisamplectl = ctrl;
2880
2881 /* If we're in vk_meta, do not commit to the sample locations yet.
2882 * vk_meta doesn't care, but the app will!
2883 */
2884 cs->has_sample_locations |= !dont_commit;
2885 } else {
2886 assert(dont_commit || cs->ppp_multisamplectl == ctrl);
2887 }
2888
2889 gfx->descriptors.root.draw.ppp_multisamplectl = ctrl;
2890 gfx->descriptors.root_dirty = true;
2891 }
2892
2893 /* Link varyings before uploading tessellation state, because the
2894 * gfx->generate_primitive_id boolean needs to be plumbed.
2895 */
2896 struct hk_linked_shader *linked_vs = gfx->linked[MESA_SHADER_VERTEX];
2897 struct hk_linked_shader *linked_fs = gfx->linked[MESA_SHADER_FRAGMENT];
2898 bool linked_vs_dirty = IS_LINKED_DIRTY(VERTEX);
2899 bool linked_fs_dirty = IS_LINKED_DIRTY(FRAGMENT);
2900
2901 if ((gfx->dirty & HK_DIRTY_PROVOKING) || vgt_dirty || linked_fs_dirty) {
2902 unsigned bindings = linked_fs ? linked_fs->b.cf.nr_bindings : 0;
2903 if (bindings) {
2904 size_t linkage_size =
2905 AGX_CF_BINDING_HEADER_LENGTH + (bindings * AGX_CF_BINDING_LENGTH);
2906
2907 struct agx_ptr t = hk_pool_usc_alloc(cmd, linkage_size, 16);
2908 if (!t.cpu)
2909 return;
2910
2911 agx_link_varyings_vs_fs(
2912 t.cpu, &gfx->linked_varyings, hw_vs->info.uvs.user_size,
2913 &linked_fs->b.cf, gfx->provoking, 0, &gfx->generate_primitive_id);
2914
2915 gfx->varyings = agx_usc_addr(&dev->dev, t.gpu);
2916 } else {
2917 gfx->varyings = 0;
2918 }
2919
2920 gfx->dirty |= HK_DIRTY_VARYINGS;
2921 }
2922
2923 if (gfx->shaders[MESA_SHADER_TESS_EVAL] ||
2924 gfx->shaders[MESA_SHADER_GEOMETRY] || linked_vs->sw_indexing) {
2925 /* XXX: We should deduplicate this logic */
2926 bool indirect = agx_is_indirect(draw.b) || draw.restart;
2927
2928 desc->root.draw.input_assembly =
2929 indirect ? hk_pool_alloc(cmd, sizeof(struct agx_ia_state), 4).gpu
2930 : hk_upload_ia_params(cmd, draw);
2931 desc->root_dirty = true;
2932 }
2933
2934 if (gfx->shaders[MESA_SHADER_TESS_EVAL] ||
2935 gfx->shaders[MESA_SHADER_GEOMETRY]) {
2936
2937 struct hk_shader *vs = hk_bound_sw_vs(gfx);
2938 desc->root.draw.vertex_outputs = vs->b.info.outputs;
2939
2940 /* XXX: We should deduplicate this logic */
2941 bool indirect = agx_is_indirect(draw.b) || draw.restart;
2942
2943 if (!indirect) {
2944 uint32_t verts = draw.b.count[0], instances = draw.b.count[1];
2945 unsigned vb_size =
2946 libagx_tcs_in_size(verts * instances, vs->b.info.outputs);
2947
2948 /* Allocate if there are any outputs, or use the null sink to trap
2949 * reads if there aren't. Those reads are undefined but should not
2950 * fault. Affects:
2951 *
2952 * dEQP-VK.pipeline.monolithic.no_position.explicit_declarations.basic.single_view.v0_g1
2953 */
2954 desc->root.draw.vertex_output_buffer =
2955 vb_size ? hk_pool_alloc(cmd, vb_size, 4).gpu
2956 : dev->rodata.null_sink;
2957 }
2958 }
2959
2960 struct agx_ptr tess_args = {0};
2961 if (gfx->shaders[MESA_SHADER_TESS_EVAL]) {
2962 tess_args = hk_pool_alloc(cmd, sizeof(struct libagx_tess_args), 4);
2963 gfx->descriptors.root.draw.tess_params = tess_args.gpu;
2964 gfx->descriptors.root_dirty = true;
2965 }
2966
2967 if (gfx->shaders[MESA_SHADER_GEOMETRY]) {
2968 /* TODO: size */
2969 cmd->geom_indirect = hk_pool_alloc(cmd, 64, 4).gpu;
2970
2971 gfx->descriptors.root.draw.geometry_params =
2972 hk_upload_geometry_params(cmd, draw);
2973
2974 gfx->descriptors.root_dirty = true;
2975 }
2976
2977 /* Root must be uploaded after the above, which touch the root */
2978 if (gfx->descriptors.root_dirty) {
2979 gfx->root =
2980 hk_cmd_buffer_upload_root(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
2981
2982 /* Tess parameters depend on the root address, so we defer the upload
2983 * until after uploading root. But the root depends on the tess address,
2984 * so we allocate tess parameters before uploading root.
2985 *
2986 * This whole mechanism is a mess ported over from the GL driver. I'm
2987 * planning to do a massive rework of indirect geom/tess, so I'm not trying
2988 * to perfect it in the meantime.
2989 */
2990 if (tess_args.cpu) {
2991 hk_upload_tess_params(cmd, tess_args.cpu, draw);
2992 }
2993 }
2994
2995 /* Hardware dynamic state must be deferred until after the root and fast
2996 * linking, since it will use the root address and the linked shaders.
2997 */
2998 if ((gfx->dirty & (HK_DIRTY_PROVOKING | HK_DIRTY_VARYINGS)) ||
2999 IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || linked_vs_dirty || vgt_dirty ||
3000 gfx->descriptors.root_dirty || gfx->draw_id_ptr || gfx->draw_params) {
3001
3002 /* TODO: Do less often? */
3003 hk_reserve_scratch(cmd, cs, hw_vs);
3004
3005 agx_push(out, VDM_STATE, cfg) {
3006 cfg.vertex_shader_word_0_present = true;
3007 cfg.vertex_shader_word_1_present = true;
3008 cfg.vertex_outputs_present = true;
3009 cfg.vertex_unknown_present = true;
3010 }
3011
3012 agx_push_packed(out, hw_vs->counts, VDM_STATE_VERTEX_SHADER_WORD_0);
3013
3014 struct hk_linked_shader *linked_hw_vs =
3015 (hw_vs == sw_vs) ? linked_vs : hw_vs->only_linked;
3016
3017 agx_push(out, VDM_STATE_VERTEX_SHADER_WORD_1, cfg) {
3018 cfg.pipeline = hk_upload_usc_words(cmd, hw_vs, linked_hw_vs);
3019 }
3020
3021 agx_push_packed(out, hw_vs->info.uvs.vdm, VDM_STATE_VERTEX_OUTPUTS);
3022
3023 agx_push(out, VDM_STATE_VERTEX_UNKNOWN, cfg) {
3024 cfg.flat_shading_control = translate_vdm_vertex(gfx->provoking);
3025 cfg.unknown_4 = cfg.unknown_5 = dyn->rs.rasterizer_discard_enable;
3026 cfg.generate_primitive_id = gfx->generate_primitive_id;
3027 }
3028
3029 /* Pad up to a multiple of 8 bytes */
3030 memset(out, 0, 4);
3031 out += 4;
3032 }
3033
3034 if (IS_DIRTY(RS_DEPTH_BIAS_FACTORS)) {
3035 void *ptr =
3036 util_dynarray_grow_bytes(&cs->depth_bias, 1, AGX_DEPTH_BIAS_LENGTH);
3037
3038 bool exact = dyn->rs.depth_bias.exact;
3039 bool force_unorm =
3040 dyn->rs.depth_bias.representation ==
3041 VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT;
3042
3043 agx_pack(ptr, DEPTH_BIAS, cfg) {
3044 cfg.slope_scale = dyn->rs.depth_bias.slope_factor;
3045 cfg.clamp = dyn->rs.depth_bias.clamp;
3046 cfg.depth_bias = dyn->rs.depth_bias.constant_factor;
3047 cfg.depth_bias /= hk_depth_bias_factor(render->depth_att.vk_format,
3048 exact, force_unorm);
3049 }
3050 }
3051
3052 /* Hardware viewport/scissor state is entangled with depth bias. */
3053 if (IS_DIRTY(RS_DEPTH_BIAS_FACTORS) || IS_DIRTY(VP_SCISSORS) ||
3054 IS_DIRTY(VP_SCISSOR_COUNT) || IS_DIRTY(VP_VIEWPORTS) ||
3055 IS_DIRTY(VP_VIEWPORT_COUNT) ||
3056 IS_DIRTY(VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) ||
3057 IS_DIRTY(RS_DEPTH_CLIP_ENABLE) || IS_DIRTY(RS_DEPTH_CLAMP_ENABLE)) {
3058
3059 hk_flush_vp_state(cmd, cs, &out);
3060 }
3061
3062 hk_flush_ppp_state(cmd, cs, &out);
3063 cs->current = out;
3064
3065 vk_dynamic_graphics_state_clear_dirty(dyn);
3066 gfx->shaders_dirty = 0;
3067 gfx->linked_dirty = 0;
3068 gfx->dirty = 0;
3069 gfx->descriptors.root_dirty = false;
3070 }
3071
3072 static bool
3073 hk_needs_index_robustness(struct hk_cmd_buffer *cmd, struct agx_draw *draw)
3074 {
3075 struct hk_graphics_state *gfx = &cmd->state.gfx;
3076 struct hk_device *dev = hk_cmd_buffer_device(cmd);
3077
3078 if (!draw->indexed)
3079 return false;
3080
3081 /* Geometry or tessellation use robust software index buffer fetch anyway */
3082 if (gfx->shaders[MESA_SHADER_GEOMETRY] ||
3083 gfx->shaders[MESA_SHADER_TESS_EVAL])
3084 return false;
3085
3086 /* Soft fault does not cover the hardware index buffer fetch. So we can't
3087 * simply use index buffers. However, we can use our 16-byte zero sink
3088 * instead, using the hardware clamp. This does seem to work.
3089 */
3090 if (draw->index_buffer_range_B == 0) {
3091 draw->index_buffer = dev->rodata.zero_sink;
3092 draw->index_buffer_range_B = 4;
3093 draw->start = 0;
3094 return false;
3095 }
3096
3097 if (!(dev->vk.enabled_features.robustBufferAccess ||
3098 dev->vk.enabled_features.robustBufferAccess2 ||
3099 dev->vk.enabled_features.pipelineRobustness))
3100 return false;
3101
3102 if (agx_is_indirect(draw->b))
3103 return true;
3104
3105 return agx_direct_draw_overreads_indices(*draw);
3106 }
3107
3108 static void
3109 hk_handle_passthrough_gs(struct hk_cmd_buffer *cmd, struct agx_draw draw)
3110 {
3111 struct hk_graphics_state *gfx = &cmd->state.gfx;
3112 struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
3113
3114 /* If there's an application geometry shader, there's nothing to un/bind */
3115 if (gs && !gs->is_passthrough)
3116 return;
3117
3118 /* Determine if we need a geometry shader to emulate XFB or adjacency */
3119 struct hk_shader *last_sw = hk_bound_sw_vs_before_gs(gfx);
3120 uint32_t xfb_outputs = last_sw->info.xfb_info.output_count;
3121 bool needs_gs = xfb_outputs;
3122
3123 /* If we already have a matching GS configuration, we're done */
3124 if ((gs != NULL) == needs_gs)
3125 return;
3126
3127 /* If we don't need a GS but we do have a passthrough, unbind it */
3128 if (gs) {
3129 assert(!needs_gs && gs->is_passthrough);
3130 hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, NULL);
3131 return;
3132 }
3133
3134 /* Else, we need to bind a passthrough GS */
3135 size_t key_size =
3136 sizeof(struct hk_passthrough_gs_key) + nir_xfb_info_size(xfb_outputs);
3137 struct hk_passthrough_gs_key *key = alloca(key_size);
3138
3139 *key = (struct hk_passthrough_gs_key){
3140 .prim = u_decomposed_prim(hk_gs_in_prim(cmd)),
3141 .outputs = last_sw->b.info.outputs,
3142 .clip_distance_array_size = last_sw->info.clip_distance_array_size,
3143 .cull_distance_array_size = last_sw->info.cull_distance_array_size,
3144 };
3145
3146 if (xfb_outputs) {
3147 typed_memcpy(key->xfb_stride, last_sw->info.xfb_stride,
3148 ARRAY_SIZE(key->xfb_stride));
3149
3150 memcpy(&key->xfb_info, &last_sw->info.xfb_info,
3151 nir_xfb_info_size(xfb_outputs));
3152 }
3153
3154 struct hk_device *dev = hk_cmd_buffer_device(cmd);
3155 perf_debug(dev, "Binding passthrough GS for%s\n", xfb_outputs ? " XFB" : "");
3156
3157 gs = hk_meta_shader(dev, hk_nir_passthrough_gs, key, key_size);
3158 gs->is_passthrough = true;
3159 hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, gs);
3160 }
3161
3162 static struct hk_cs *
3163 hk_flush_gfx_state(struct hk_cmd_buffer *cmd, uint32_t draw_id,
3164 struct agx_draw draw)
3165 {
3166 struct hk_device *dev = hk_cmd_buffer_device(cmd);
3167 struct hk_graphics_state *gfx = &cmd->state.gfx;
3168 struct hk_descriptor_state *desc = &gfx->descriptors;
3169
3170 struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, false /* compute */);
3171 const struct vk_dynamic_graphics_state *dyn =
3172 &cmd->vk.dynamic_graphics_state;
3173
3174 if (!cs)
3175 return NULL;
3176
3177 /* Annoyingly,
3178 * VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT is
3179 * render pass state on Imaginapple but draw state in Vulkan. In practice,
3180 * Proton never changes it within a render pass, but we technically need to
3181 * handle the switch regardless. Do so early since `cs` will be invalidated
3182 * if we need to split the render pass to switch representation mid-frame.
3183 *
3184    * Note we only do this dance when depth bias is actually enabled, to avoid
3185    * senseless control stream splits with DXVK.
3186 */
3187 if ((IS_DIRTY(RS_DEPTH_BIAS_FACTORS) || IS_DIRTY(RS_DEPTH_BIAS_ENABLE)) &&
3188 dyn->rs.depth_bias.enable) {
3189
3190 bool dbias_is_int =
3191 (dyn->rs.depth_bias.representation ==
3192 VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT) ||
3193 (gfx->render.depth_att.vk_format == VK_FORMAT_D16_UNORM);
3194
3195 /* Attempt to set dbias_is_int per the draw requirement. If this fails,
3196 * flush the control stream and set it on the new control stream.
3197 */
3198 bool succ = u_tristate_set(&cs->cr.dbias_is_int, dbias_is_int);
3199 if (!succ) {
3200 perf_debug(dev, "Splitting control stream due to depth bias");
3201
3202 hk_cmd_buffer_end_graphics(cmd);
3203 cs = hk_cmd_buffer_get_cs(cmd, false /* compute */);
3204
3205 succ = u_tristate_set(&cs->cr.dbias_is_int, dbias_is_int);
3206 assert(succ && "can always set tri-state on a new control stream");
3207 }
3208 }
3209
3210 hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
3211
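   /* In debug builds, AGX_DBG_DIRTY forces all state to be re-emitted on every
    * draw, which helps isolate missing dirty tracking.
    */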
3212 #ifndef NDEBUG
3213 if (unlikely(dev->dev.debug & AGX_DBG_DIRTY)) {
3214 hk_cmd_buffer_dirty_all(cmd);
3215 }
3216 #endif
3217
3218 /* Merge tess info before GS construction since that depends on
3219 * gfx->tess.prim
3220 */
3221 if ((IS_SHADER_DIRTY(TESS_CTRL) || IS_SHADER_DIRTY(TESS_EVAL)) &&
3222 gfx->shaders[MESA_SHADER_TESS_CTRL]) {
3223 struct hk_api_shader *tcs = gfx->shaders[MESA_SHADER_TESS_CTRL];
3224 struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL];
3225 struct hk_shader *tese = hk_any_variant(tes);
3226 struct hk_shader *tesc = hk_only_variant(tcs);
3227
3228 gfx->tess.info =
3229 hk_tess_info_merge(tese->info.tess.info, tesc->info.tess.info);
3230
3231 /* Determine primitive based on the merged state */
3232 if (gfx->tess.info.points) {
3233 gfx->tess.prim = MESA_PRIM_POINTS;
3234 } else if (gfx->tess.info.mode == TESS_PRIMITIVE_ISOLINES) {
3235 gfx->tess.prim = MESA_PRIM_LINES;
3236 } else {
3237 gfx->tess.prim = MESA_PRIM_TRIANGLES;
3238 }
3239 }
3240
3241 /* TODO: Try to reduce draw overhead of this */
3242 hk_handle_passthrough_gs(cmd, draw);
3243
3244 hk_flush_shaders(cmd);
3245
3246 if (desc->push_dirty)
3247 hk_cmd_buffer_flush_push_descriptors(cmd, desc);
3248
3249 if (draw.restart || gfx->shaders[MESA_SHADER_GEOMETRY])
3250 hk_flush_index(cmd, cs);
3251
3252 hk_flush_dynamic_state(cmd, cs, draw_id, draw);
3253 return cs;
3254 }
3255
3256 VKAPI_ATTR void VKAPI_CALL
3257 hk_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3258 VkDeviceSize offset, VkDeviceSize size,
3259 VkIndexType indexType)
3260 {
3261 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3262 VK_FROM_HANDLE(hk_buffer, buffer, _buffer);
3263
3264 cmd->state.gfx.index = (struct hk_index_buffer_state){
3265 .buffer = hk_buffer_addr_range(buffer, offset, size),
3266 .size = agx_translate_index_size(vk_index_type_to_bytes(indexType)),
3267 .restart = vk_index_to_restart(indexType),
3268 };
3269
3270 /* TODO: check if necessary, blob does this */
3271 cmd->state.gfx.index.buffer.range =
3272 align(cmd->state.gfx.index.buffer.range, 4);
3273 }
3274
3275 void
3276 hk_cmd_bind_vertex_buffer(struct hk_cmd_buffer *cmd, uint32_t vb_idx,
3277 struct hk_addr_range addr_range)
3278 {
3279 cmd->state.gfx.vb[vb_idx] = addr_range;
3280 cmd->state.gfx.dirty |= HK_DIRTY_VB;
3281 }
3282
3283 VKAPI_ATTR void VKAPI_CALL
3284 hk_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, uint32_t firstBinding,
3285 uint32_t bindingCount, const VkBuffer *pBuffers,
3286 const VkDeviceSize *pOffsets,
3287 const VkDeviceSize *pSizes,
3288 const VkDeviceSize *pStrides)
3289 {
3290 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3291
3292 if (pStrides) {
3293 vk_cmd_set_vertex_binding_strides(&cmd->vk, firstBinding, bindingCount,
3294 pStrides);
3295 }
3296
3297 for (uint32_t i = 0; i < bindingCount; i++) {
3298 VK_FROM_HANDLE(hk_buffer, buffer, pBuffers[i]);
3299 uint32_t idx = firstBinding + i;
3300
3301 uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
3302 const struct hk_addr_range addr_range =
3303 hk_buffer_addr_range(buffer, pOffsets[i], size);
3304
3305 hk_cmd_bind_vertex_buffer(cmd, idx, addr_range);
3306 }
3307 }
3308
3309 static bool
3310 hk_set_view_index(struct hk_cmd_buffer *cmd, uint32_t view_idx)
3311 {
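   /* With multiview enabled, stash the view index in the root descriptor (and
    * flag it dirty) so the per-view draw replay below picks it up.
    */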
3312 if (cmd->state.gfx.render.view_mask) {
3313 cmd->state.gfx.descriptors.root.draw.view_index = view_idx;
3314 cmd->state.gfx.descriptors.root_dirty = true;
3315 }
3316
3317 return true;
3318 }
3319
3320 /*
3321 * Iterator macro to duplicate a draw for each enabled view (when multiview is
3322 * enabled, else always view 0). Along with hk_lower_multiview, this forms the
3323 * world's worst multiview lowering.
3324 */
3325 #define hk_foreach_view(cmd) \
3326 u_foreach_bit(view_idx, cmd->state.gfx.render.view_mask ?: 1) \
3327 if (hk_set_view_index(cmd, view_idx))
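/* For example, with view_mask == 0b101 the draw body runs twice, with view_idx
 * 0 and then 2. With multiview disabled (mask of 0), it runs once for view 0.
 */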
3328
3329 static void
3330 hk_ia_update(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct agx_draw draw,
3331 uint64_t ia_vertices, uint64_t ia_prims, uint64_t vs_invocations,
3332 uint64_t c_prims, uint64_t c_inv)
3333 {
3334 /* XXX: stream link needed? */
3335 struct hk_device *dev = hk_cmd_buffer_device(cmd);
3336 perf_debug(dev, "Input assembly counters");
3337
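   /* For direct draws, synthesize a small indirect-style descriptor (vertex
    * count, instance count) so the counter kernels consume a single format.
    */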
3338 uint64_t draw_ptr;
3339 if (agx_is_indirect(draw.b)) {
3340 draw_ptr = draw.b.ptr;
3341 } else {
3342 uint32_t desc[] = {draw.b.count[0], draw.b.count[1], 0};
3343 draw_ptr = hk_pool_upload(cmd, &desc, sizeof(desc), 4);
3344 }
3345
3346 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
3347 enum mesa_prim prim = vk_conv_topology(dyn->ia.primitive_topology);
3348
3349 bool geom = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY];
3350 bool tess = cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL];
3351
3352 /* Clipper counters depend on geom/tess outputs and must be written with the
3353 * geom/tess output. They are updated as IA counters only when geom/tess is
3354 * not used.
3355 *
3356 * TODO: Tessellation clipper counters not actually wired up, pending CTS.
3357 */
3358 if (geom || tess) {
3359 c_prims = 0;
3360 c_inv = 0;
3361 }
3362
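   /* With primitive restart, the primitive count depends on the index data, so
    * a wide grid scans the index buffer. Otherwise a single thread increments
    * the counters straight from the draw descriptor.
    */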
3363 if (draw.restart) {
3364 uint32_t index_size_B = agx_index_size_to_B(draw.index_size);
3365
3366 libagx_increment_ia_restart(
3367 cs, agx_1d(1024), AGX_BARRIER_ALL, ia_vertices, ia_prims,
3368 vs_invocations, c_prims, c_inv, draw_ptr, draw.index_buffer,
3369 agx_draw_index_range_el(draw), cmd->state.gfx.index.restart,
3370 index_size_B, prim);
3371 } else {
3372 libagx_increment_ia(cs, agx_1d(1), AGX_BARRIER_ALL, ia_vertices, ia_prims,
3373 vs_invocations, c_prims, c_inv, draw_ptr, prim);
3374 }
3375 }
3376
3377 static void
3378 hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct agx_draw draw_)
3379 {
3380 const struct vk_dynamic_graphics_state *dyn =
3381 &cmd->vk.dynamic_graphics_state;
3382
3383 /* Filter trivial draws so we don't need to worry about null index buffers */
3384 if (!agx_is_indirect(draw_.b) &&
3385 (draw_.b.count[0] == 0 || draw_.b.count[1] == 0))
3386 return;
3387
3388 draw_.restart = dyn->ia.primitive_restart_enable && draw_.indexed;
3389 draw_.index_size = cmd->state.gfx.index.size;
3390
3391 uint64_t stat_ia_verts = hk_pipeline_stat_addr(
3392 cmd, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT);
3393
3394 uint64_t stat_ia_prims = hk_pipeline_stat_addr(
3395 cmd, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT);
3396
3397 uint64_t stat_vs_inv = hk_pipeline_stat_addr(
3398 cmd, VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT);
3399
3400 uint64_t stat_c_inv = hk_pipeline_stat_addr(
3401 cmd, VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT);
3402
3403 uint64_t stat_c_prims = hk_pipeline_stat_addr(
3404 cmd, VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT);
3405
3406 bool ia_stats = stat_ia_verts || stat_ia_prims || stat_vs_inv ||
3407 stat_c_inv || stat_c_prims;
3408 struct hk_device *dev = hk_cmd_buffer_device(cmd);
3409
3410 hk_foreach_view(cmd) {
3411 struct agx_draw draw = draw_;
3412 struct hk_cs *cs = hk_flush_gfx_state(cmd, draw_id, draw);
3413 /* If we failed to allocate a control stream, we've already lost the
3414 * device. Just drop the draw so we don't crash.
3415 */
3416 if (!cs)
3417 return;
3418
3419 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
3420 bool geom = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY];
3421 bool tess = cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL];
3422 bool needs_idx_robust = hk_needs_index_robustness(cmd, &draw);
3423 bool adj =
3424 mesa_prim_has_adjacency(vk_conv_topology(dyn->ia.primitive_topology));
3425 adj &= !geom;
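      /* The adjacency lowering below fetches indices in software (like the
       * geometry/tessellation paths), so the hardware index robustness
       * workaround is not needed on top of it.
       */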
3426 needs_idx_robust &= !adj;
3427
3428 struct hk_cs *ccs = NULL;
3429 uint8_t *out = cs->current;
3430 assert(cs->current + 0x1000 < cs->end);
3431
3432 if (tess && HK_PERF(dev, NOTESS))
3433 continue;
3434
3435 cs->stats.calls++;
3436
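      /* Geometry, tessellation, IA statistics, index robustness lowering, and
       * non-trivial adjacency all need GPU-side preprocessing, so grab the
       * compute control stream that runs before this render pass.
       */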
3437 if (geom || tess || ia_stats || needs_idx_robust ||
3438 (adj && (agx_is_indirect(draw.b) || draw.restart))) {
3439
3440 ccs =
3441 hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true);
3442 if (!ccs)
3443 return;
3444 }
3445
3446 if (ia_stats) {
3447 hk_ia_update(cmd, ccs, draw, stat_ia_verts, stat_ia_prims, stat_vs_inv,
3448 stat_c_prims, stat_c_inv);
3449 }
3450
3451 if (tess) {
3452 draw = hk_launch_tess(cmd, ccs, draw);
3453 }
3454
3455 if (geom) {
3456 draw = hk_launch_gs_prerast(cmd, ccs, draw);
3457
3458 /* We must not draw if the app specified rasterizer discard. This is
3459        * required both for performance (it is pointless to rasterize and
3460        * there are no side effects) and for correctness (no indirect draw
3461        * descriptor will be filled out).
3462 */
3463 if (dyn->rs.rasterizer_discard_enable)
3464 continue;
3465 }
3466
3467 if (adj) {
3468 assert(!geom && "geometry shaders handle adj directly");
3469 enum mesa_prim prim = vk_conv_topology(dyn->ia.primitive_topology);
3470
3471 if (draw.restart) {
3472 draw = hk_draw_without_restart(cmd, ccs, draw, 1);
3473 prim = u_decomposed_prim(prim);
3474 }
3475
3476 if (agx_is_indirect(draw.b)) {
3477 const size_t size = sizeof(VkDrawIndexedIndirectCommand);
3478 static_assert(sizeof(VkDrawIndexedIndirectCommand) >
3479 sizeof(VkDrawIndirectCommand),
3480 "allocation size is conservative");
3481
3482 uint64_t out_draw = hk_pool_alloc(cmd, size, 4).gpu;
3483 struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
3484
3485 libagx_draw_without_adj(
3486 ccs, agx_1d(1), AGX_BARRIER_ALL, out_draw, draw.b.ptr,
3487 desc->root.draw.input_assembly, draw.index_buffer,
3488 draw.indexed ? agx_draw_index_range_el(draw) : 0,
3489 draw.indexed ? agx_index_size_to_B(draw.index_size) : 0, prim);
3490
3491 draw = agx_draw_indirect(out_draw);
3492 } else {
3493 unsigned count = libagx_remap_adj_count(draw.b.count[0], prim);
3494
3495 draw = (struct agx_draw){
3496 .b = agx_3d(count, draw.b.count[1], 1),
3497 };
3498 }
3499 }
3500
3501 enum agx_primitive topology = cmd->state.gfx.topology;
3502 if (needs_idx_robust) {
3503 assert(!geom && !tess && !adj);
3504 perf_debug(dev, "lowering robust index buffer");
3505
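      /* Instead of encoding the draw on the CPU, reserve VDM space and have a
       * compute kernel emit it with the index fetch bounded by the index
       * buffer size.
       */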
3506 cs->current = out;
3507
3508 draw = hk_draw_as_indexed_indirect(cmd, draw);
3509
3510 size_t size_B = libagx_draw_robust_index_vdm_size();
3511 uint64_t target = hk_cs_alloc_for_indirect(cs, size_B);
3512
3513 libagx_draw_robust_index(ccs, agx_1d(32), AGX_BARRIER_ALL, target,
3514 hk_geometry_state(cmd), draw.b.ptr,
3515 draw.index_buffer, draw.index_buffer_range_B,
3516 draw.restart, topology, draw.index_size);
3517 } else {
3518 cs->current = (void *)agx_vdm_draw((uint32_t *)out, dev->dev.chip,
3519 draw, topology);
3520 }
3521
3522 cs->stats.cmds++;
3523 }
3524 }
3525
3526 VKAPI_ATTR void VKAPI_CALL
3527 hk_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount,
3528 uint32_t instanceCount, uint32_t firstVertex, uint32_t firstInstance)
3529 {
3530 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3531 struct agx_draw draw;
3532
3533 if (HK_TEST_INDIRECTS) {
3534 uint32_t data[] = {
3535 vertexCount,
3536 instanceCount,
3537 firstVertex,
3538 firstInstance,
3539 };
3540
3541 draw = agx_draw_indirect(hk_pool_upload(cmd, data, sizeof(data), 4));
3542 } else {
3543 draw = (struct agx_draw){
3544 .b = agx_3d(vertexCount, instanceCount, 1),
3545 .start = firstVertex,
3546 .start_instance = firstInstance,
3547 };
3548 }
3549
3550 hk_draw(cmd, 0, draw);
3551 }
3552
3553 VKAPI_ATTR void VKAPI_CALL
3554 hk_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount,
3555 const VkMultiDrawInfoEXT *pVertexInfo,
3556 uint32_t instanceCount, uint32_t firstInstance,
3557 uint32_t stride)
3558 {
3559 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3560
3561 for (unsigned i = 0; i < drawCount; ++i) {
3562 struct agx_draw draw = {
3563 .b = agx_3d(pVertexInfo->vertexCount, instanceCount, 1),
3564 .start = pVertexInfo->firstVertex,
3565 .start_instance = firstInstance,
3566 };
3567
3568 hk_draw(cmd, i, draw);
3569 pVertexInfo = ((void *)pVertexInfo) + stride;
3570 }
3571 }
3572
3573 static void
3574 hk_draw_indexed(VkCommandBuffer commandBuffer, uint16_t draw_id,
3575 uint32_t indexCount, uint32_t instanceCount,
3576 uint32_t firstIndex, int32_t vertexOffset,
3577 uint32_t firstInstance)
3578 {
3579 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3580 struct agx_draw draw;
3581 struct hk_addr_range buf = cmd->state.gfx.index.buffer;
3582
3583 if (HK_TEST_INDIRECTS && draw_id == 0) {
3584 uint32_t data[] = {
3585 indexCount, instanceCount, firstIndex, vertexOffset, firstInstance,
3586 };
3587 uint64_t addr = hk_pool_upload(cmd, data, sizeof(data), 4);
3588
3589 draw = agx_draw_indexed_indirect(addr, buf.addr, buf.range, 0, 0);
3590 } else {
3591 draw =
3592 agx_draw_indexed(indexCount, instanceCount, firstIndex, vertexOffset,
3593 firstInstance, buf.addr, buf.range, 0, 0);
3594 }
3595
3596 hk_draw(cmd, draw_id, draw);
3597 }
3598
3599 VKAPI_ATTR void VKAPI_CALL
3600 hk_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount,
3601 uint32_t instanceCount, uint32_t firstIndex,
3602 int32_t vertexOffset, uint32_t firstInstance)
3603 {
3604 hk_draw_indexed(commandBuffer, 0, indexCount, instanceCount, firstIndex,
3605 vertexOffset, firstInstance);
3606 }
3607
3608 VKAPI_ATTR void VKAPI_CALL
3609 hk_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount,
3610 const VkMultiDrawIndexedInfoEXT *pIndexInfo,
3611 uint32_t instanceCount, uint32_t firstInstance,
3612 uint32_t stride, const int32_t *pVertexOffset)
3613 {
3614 for (unsigned i = 0; i < drawCount; ++i) {
3615 const uint32_t vertex_offset =
3616 pVertexOffset != NULL ? *pVertexOffset : pIndexInfo->vertexOffset;
3617
3618 hk_draw_indexed(commandBuffer, i, pIndexInfo->indexCount, instanceCount,
3619 pIndexInfo->firstIndex, vertex_offset, firstInstance);
3620
3621 pIndexInfo = ((void *)pIndexInfo) + stride;
3622 }
3623 }
3624
3625 static void
3626 hk_draw_indirect_inner(VkCommandBuffer commandBuffer, uint64_t base,
3627 uint32_t drawCount, uint32_t stride)
3628 {
3629 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3630
3631 /* From the Vulkan 1.3.238 spec:
3632 *
3633 * VUID-vkCmdDrawIndirect-drawCount-00476
3634 *
3635 * "If drawCount is greater than 1, stride must be a multiple of 4 and
3636 * must be greater than or equal to sizeof(VkDrawIndirectCommand)"
3637 *
3638 * and
3639 *
3640 * "If drawCount is less than or equal to one, stride is ignored."
3641 */
3642 if (drawCount > 1) {
3643 assert(stride % 4 == 0);
3644 assert(stride >= sizeof(VkDrawIndirectCommand));
3645 }
3646
3647 for (unsigned draw_id = 0; draw_id < drawCount; ++draw_id) {
3648 uint64_t addr = base + stride * draw_id;
3649 hk_draw(cmd, draw_id, agx_draw_indirect(addr));
3650 }
3651 }
3652
3653 VKAPI_ATTR void VKAPI_CALL
3654 hk_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3655 VkDeviceSize offset, uint32_t drawCount, uint32_t stride)
3656 {
3657 VK_FROM_HANDLE(hk_buffer, buffer, _buffer);
3658
3659 hk_draw_indirect_inner(commandBuffer, hk_buffer_address(buffer, offset),
3660 drawCount, stride);
3661 }
3662
3663 static void
3664 hk_draw_indexed_indirect_inner(VkCommandBuffer commandBuffer, uint64_t buffer,
3665 uint32_t drawCount, uint32_t stride)
3666 {
3667 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3668
3669 /* From the Vulkan 1.3.238 spec:
3670 *
3671 * VUID-vkCmdDrawIndexedIndirect-drawCount-00528
3672 *
3673 * "If drawCount is greater than 1, stride must be a multiple of 4 and
3674 * must be greater than or equal to
3675 * sizeof(VkDrawIndexedIndirectCommand)"
3676 *
3677 * and
3678 *
3679 * "If drawCount is less than or equal to one, stride is ignored."
3680 */
3681 if (drawCount > 1) {
3682 assert(stride % 4 == 0);
3683 assert(stride >= sizeof(VkDrawIndexedIndirectCommand));
3684 }
3685
3686 for (unsigned draw_id = 0; draw_id < drawCount; ++draw_id) {
3687 uint64_t addr = buffer + stride * draw_id;
3688 struct hk_addr_range buf = cmd->state.gfx.index.buffer;
3689
3690 hk_draw(cmd, draw_id,
3691 agx_draw_indexed_indirect(addr, buf.addr, buf.range, 0, 0));
3692 }
3693 }
3694
3695 VKAPI_ATTR void VKAPI_CALL
3696 hk_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3697 VkDeviceSize offset, uint32_t drawCount,
3698 uint32_t stride)
3699 {
3700 VK_FROM_HANDLE(hk_buffer, buffer, _buffer);
3701
3702 hk_draw_indexed_indirect_inner(
3703 commandBuffer, hk_buffer_address(buffer, offset), drawCount, stride);
3704 }
3705
3706 /*
3707 * To implement drawIndirectCount generically, we dispatch a compute kernel to
3708  * patch the indirect buffer, then emit maxDrawCount predicated indirect
3709  * draws.
3710 */
3711 static void
3712 hk_draw_indirect_count(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3713 VkDeviceSize offset, VkBuffer countBuffer,
3714 VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
3715 uint32_t stride, bool indexed)
3716 {
3717 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3718 VK_FROM_HANDLE(hk_buffer, buffer, _buffer);
3719 VK_FROM_HANDLE(hk_buffer, count_buffer, countBuffer);
3720
3721 struct hk_device *dev = hk_cmd_buffer_device(cmd);
3722 perf_debug(dev, "Draw indirect count");
3723
3724 struct hk_cs *cs =
3725 hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true);
3726 if (!cs)
3727 return;
3728
3729 hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
3730
3731 assert((stride % 4) == 0 && "aligned");
3732
3733 size_t out_stride = sizeof(uint32_t) * (indexed ? 5 : 4);
3734 uint64_t patched = hk_pool_alloc(cmd, out_stride * maxDrawCount, 4).gpu;
3735 uint64_t in = hk_buffer_address(buffer, offset);
3736 uint64_t count_addr = hk_buffer_address(count_buffer, countBufferOffset);
3737
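   /* The kernel copies up to maxDrawCount commands into the tightly packed
    * scratch buffer, neutralizing (presumably by zeroing the counts of) any
    * draws past the GPU-side count so the extra indirect draws below become
    * no-ops.
    */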
3738 libagx_predicate_indirect(cs, agx_1d(maxDrawCount), AGX_BARRIER_ALL, patched,
3739 in, count_addr, stride / 4, indexed);
3740
3741 if (indexed) {
3742 hk_draw_indexed_indirect_inner(commandBuffer, patched, maxDrawCount,
3743 out_stride);
3744 } else {
3745 hk_draw_indirect_inner(commandBuffer, patched, maxDrawCount, out_stride);
3746 }
3747 }
3748
3749 VKAPI_ATTR void VKAPI_CALL
3750 hk_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3751 VkDeviceSize offset, VkBuffer countBuffer,
3752 VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
3753 uint32_t stride)
3754 {
3755 hk_draw_indirect_count(commandBuffer, _buffer, offset, countBuffer,
3756 countBufferOffset, maxDrawCount, stride, false);
3757 }
3758
3759 VKAPI_ATTR void VKAPI_CALL
3760 hk_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3761 VkDeviceSize offset, VkBuffer countBuffer,
3762 VkDeviceSize countBufferOffset,
3763 uint32_t maxDrawCount, uint32_t stride)
3764 {
3765 hk_draw_indirect_count(commandBuffer, _buffer, offset, countBuffer,
3766 countBufferOffset, maxDrawCount, stride, true);
3767 }
3768
3769 VKAPI_ATTR void VKAPI_CALL
3770 hk_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
3771 uint32_t instanceCount, uint32_t firstInstance,
3772 VkBuffer counterBuffer,
3773 VkDeviceSize counterBufferOffset,
3774 uint32_t counterOffset, uint32_t vertexStride)
3775 {
3776 unreachable("TODO");
3777 }
3778
3779 VKAPI_ATTR void VKAPI_CALL
3780 hk_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
3781 uint32_t firstBinding,
3782 uint32_t bindingCount,
3783 const VkBuffer *pBuffers,
3784 const VkDeviceSize *pOffsets,
3785 const VkDeviceSize *pSizes)
3786 {
3787 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3788 struct hk_graphics_state *gfx = &cmd->state.gfx;
3789
3790 for (uint32_t i = 0; i < bindingCount; i++) {
3791 VK_FROM_HANDLE(hk_buffer, buffer, pBuffers[i]);
3792 uint32_t idx = firstBinding + i;
3793 uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
3794
3795 gfx->xfb[idx] = hk_buffer_addr_range(buffer, pOffsets[i], size);
3796 }
3797 }
3798
3799 static void
3800 hk_begin_end_xfb(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
3801 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
3802 const VkDeviceSize *pCounterBufferOffsets, bool begin)
3803
3804 {
3805 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3806 struct hk_device *dev = hk_cmd_buffer_device(cmd);
3807 struct hk_graphics_state *gfx = &cmd->state.gfx;
3808
3809 gfx->xfb_enabled = begin;
3810
3811 /* If we haven't reserved XFB offsets yet for the command buffer, do so. */
3812 if (!gfx->xfb_offsets) {
3813 gfx->xfb_offsets = hk_pool_alloc(cmd, 4 * sizeof(uint32_t), 4).gpu;
3814 }
3815
3816 struct hk_cs *cs =
3817 hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true);
3818 if (!cs)
3819 return;
3820 hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
3821
3822 struct libagx_xfb_counter_copy params = {};
3823 unsigned copies = 0;
3824
3825 /* For CmdBeginTransformFeedbackEXT, we need to initialize everything */
3826 if (begin) {
3827 for (copies = 0; copies < 4; ++copies) {
3828 params.dest[copies] = gfx->xfb_offsets + copies * sizeof(uint32_t);
3829 }
3830 }
3831
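   /* On begin, seed the internal offsets from the app's counter buffers; on
    * end, copy the final offsets back out so transform feedback can later be
    * resumed from them.
    */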
3832 for (unsigned i = 0; i < counterBufferCount; ++i) {
3833 if (pCounterBuffers[i] == VK_NULL_HANDLE)
3834 continue;
3835
3836 VK_FROM_HANDLE(hk_buffer, buffer, pCounterBuffers[i]);
3837
3838 uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
3839 uint64_t cb_addr = hk_buffer_address(buffer, offset);
3840 uint32_t cmd_idx = firstCounterBuffer + i;
3841
3842 if (begin) {
3843 params.src[cmd_idx] = cb_addr;
3844 } else {
3845 params.dest[copies] = cb_addr;
3846 params.src[copies] = gfx->xfb_offsets + cmd_idx * sizeof(uint32_t);
3847 ++copies;
3848 }
3849 }
3850
3851 if (begin)
3852 copies = 4;
3853
3854 if (copies > 0) {
3855 perf_debug(dev, "XFB counter copy");
3856
3857 libagx_copy_xfb_counters(cs, agx_1d(copies), AGX_BARRIER_ALL,
3858                              hk_pool_upload(cmd, &params, sizeof(params), 8));
3859 }
3860 }
3861
3862 VKAPI_ATTR void VKAPI_CALL
3863 hk_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
3864 uint32_t firstCounterBuffer,
3865 uint32_t counterBufferCount,
3866 const VkBuffer *pCounterBuffers,
3867 const VkDeviceSize *pCounterBufferOffsets)
3868 {
3869 hk_begin_end_xfb(commandBuffer, firstCounterBuffer, counterBufferCount,
3870 pCounterBuffers, pCounterBufferOffsets, true);
3871 }
3872
3873 VKAPI_ATTR void VKAPI_CALL
3874 hk_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
3875 uint32_t firstCounterBuffer,
3876 uint32_t counterBufferCount,
3877 const VkBuffer *pCounterBuffers,
3878 const VkDeviceSize *pCounterBufferOffsets)
3879 {
3880 hk_begin_end_xfb(commandBuffer, firstCounterBuffer, counterBufferCount,
3881 pCounterBuffers, pCounterBufferOffsets, false);
3882 }
3883
3884 VKAPI_ATTR void VKAPI_CALL
3885 hk_CmdBeginConditionalRenderingEXT(
3886 VkCommandBuffer commandBuffer,
3887 const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
3888 {
3889 unreachable("stub");
3890 }
3891
3892 VKAPI_ATTR void VKAPI_CALL
3893 hk_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
3894 {
3895 unreachable("stub");
3896 }
3897