1 /*
2  * Copyright 2024 Valve Corporation
3  * Copyright 2024 Alyssa Rosenzweig
4  * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
5  * SPDX-License-Identifier: MIT
6  */
7 #include <assert.h>
8 #include "agx_bg_eot.h"
9 #include "agx_bo.h"
10 #include "agx_compile.h"
11 #include "agx_compiler.h"
12 #include "agx_device.h"
13 #include "agx_helpers.h"
14 #include "agx_linker.h"
15 #include "agx_nir_lower_gs.h"
16 #include "agx_nir_lower_vbo.h"
17 #include "agx_ppp.h"
18 #include "agx_tilebuffer.h"
19 #include "agx_usc.h"
20 #include "agx_uvs.h"
21 #include "hk_buffer.h"
22 #include "hk_cmd_buffer.h"
23 #include "hk_device.h"
24 #include "hk_entrypoints.h"
25 #include "hk_image.h"
26 #include "hk_image_view.h"
27 #include "hk_physical_device.h"
28 #include "hk_private.h"
29 #include "hk_shader.h"
30 
31 #include "asahi/genxml/agx_pack.h"
32 #include "asahi/libagx/compression.h"
33 #include "asahi/libagx/geometry.h"
34 #include "asahi/libagx/libagx.h"
35 #include "asahi/libagx/query.h"
36 #include "asahi/libagx/tessellator.h"
37 #include "util/blend.h"
38 #include "util/format/format_utils.h"
39 #include "util/format/u_formats.h"
40 #include "util/macros.h"
41 #include "util/ralloc.h"
42 #include "util/u_prim.h"
43 #include "vulkan/vulkan_core.h"
44 #include "layout.h"
45 #include "libagx_dgc.h"
46 #include "libagx_shaders.h"
47 #include "nir.h"
48 #include "nir_builder.h"
49 #include "nir_lower_blend.h"
50 #include "nir_xfb_info.h"
51 #include "pool.h"
52 #include "shader_enums.h"
53 #include "vk_blend.h"
54 #include "vk_enum_to_str.h"
55 #include "vk_format.h"
56 #include "vk_graphics_state.h"
57 #include "vk_pipeline.h"
58 #include "vk_render_pass.h"
59 #include "vk_standard_sample_locations.h"
60 #include "vk_util.h"
61 
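/* Dirty-tracking helpers: IS_DIRTY tests the vk_dynamic_graphics_state dirty
 * bitset, while IS_SHADER_DIRTY and IS_LINKED_DIRTY test the per-stage dirty
 * masks on the command buffer's graphics state.
 */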
62 #define IS_DIRTY(bit) BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_##bit)
63 
64 #define IS_SHADER_DIRTY(bit)                                                   \
65    (cmd->state.gfx.shaders_dirty & BITFIELD_BIT(MESA_SHADER_##bit))
66 
67 #define IS_LINKED_DIRTY(bit)                                                   \
68    (cmd->state.gfx.linked_dirty & BITFIELD_BIT(MESA_SHADER_##bit))
69 
70 /* CTS coverage of indirect draws is pretty bad, so it's helpful to be able to
71  * get some extra smoke testing.
72  */
73 #define HK_TEST_INDIRECTS (0)
74 
75 UNUSED static inline void
76 print_draw(struct agx_draw d, FILE *fp)
77 {
78    if (agx_is_indirect(d.b))
79       fprintf(fp, "indirect (buffer %" PRIx64 "):", d.b.ptr);
80    else
81       fprintf(fp, "direct (%ux%u):", d.b.count[0], d.b.count[1]);
82 
83    if (d.index_size)
84       fprintf(fp, " index_size=%u", agx_index_size_to_B(d.index_size));
85    else
86       fprintf(fp, " non-indexed");
87 
88    if (d.restart)
89       fprintf(fp, " restart");
90 
91    if (d.index_bias)
92       fprintf(fp, " index_bias=%u", d.index_bias);
93 
94    if (d.start)
95       fprintf(fp, " start=%u", d.start);
96 
97    if (d.start_instance)
98       fprintf(fp, " start_instance=%u", d.start_instance);
99 
100    fprintf(fp, "\n");
101 }
102 
103 /* XXX: deduplicate */
104 static inline enum mesa_prim
105 vk_conv_topology(VkPrimitiveTopology topology)
106 {
107    switch (topology) {
108    case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
109       return MESA_PRIM_POINTS;
110    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
111       return MESA_PRIM_LINES;
112    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
113       return MESA_PRIM_LINE_STRIP;
114    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
115 #pragma GCC diagnostic push
116 #pragma GCC diagnostic ignored "-Wswitch"
117    case VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA:
118 #pragma GCC diagnostic pop
119       return MESA_PRIM_TRIANGLES;
120    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
121       return MESA_PRIM_TRIANGLE_STRIP;
122    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
123       return MESA_PRIM_TRIANGLE_FAN;
124    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
125       return MESA_PRIM_LINES_ADJACENCY;
126    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
127       return MESA_PRIM_LINE_STRIP_ADJACENCY;
128    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
129       return MESA_PRIM_TRIANGLES_ADJACENCY;
130    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
131       return MESA_PRIM_TRIANGLE_STRIP_ADJACENCY;
132    case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
133       return MESA_PRIM_PATCHES;
134    default:
135       unreachable("invalid");
136    }
137 }
138 
139 static void
140 hk_cmd_buffer_dirty_render_pass(struct hk_cmd_buffer *cmd)
141 {
142    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
143 
144    /* These depend on color attachment count */
145    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
146    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
147    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
148    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
149 
150    /* These depend on the depth/stencil format */
151    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
152    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
153    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
154    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
155    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS);
156 
157    /* This may depend on render targets for ESO */
158    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
159 
160    /* This may depend on render targets */
161    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP);
162 }
163 
164 void
165 hk_cmd_buffer_begin_graphics(struct hk_cmd_buffer *cmd,
166                              const VkCommandBufferBeginInfo *pBeginInfo)
167 {
168    if (cmd->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
169        (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
170       char gcbiar_data[VK_GCBIARR_DATA_SIZE(HK_MAX_RTS)];
171       const VkRenderingInfo *resume_info =
172          vk_get_command_buffer_inheritance_as_rendering_resume(
173             cmd->vk.level, pBeginInfo, gcbiar_data);
174       if (resume_info) {
175          hk_CmdBeginRendering(hk_cmd_buffer_to_handle(cmd), resume_info);
176       } else {
177          const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
178             vk_get_command_buffer_inheritance_rendering_info(cmd->vk.level,
179                                                              pBeginInfo);
180          assert(inheritance_info);
181 
182          struct hk_rendering_state *render = &cmd->state.gfx.render;
183          render->flags = inheritance_info->flags;
184          render->area = (VkRect2D){};
185          render->layer_count = 0;
186          render->view_mask = inheritance_info->viewMask;
187          render->tilebuffer.nr_samples = inheritance_info->rasterizationSamples;
188 
189          render->color_att_count = inheritance_info->colorAttachmentCount;
190          for (uint32_t i = 0; i < render->color_att_count; i++) {
191             render->color_att[i].vk_format =
192                inheritance_info->pColorAttachmentFormats[i];
193          }
194          render->depth_att.vk_format = inheritance_info->depthAttachmentFormat;
195          render->stencil_att.vk_format =
196             inheritance_info->stencilAttachmentFormat;
197 
198          const VkRenderingAttachmentLocationInfoKHR att_loc_info_default = {
199             .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
200             .colorAttachmentCount = inheritance_info->colorAttachmentCount,
201          };
202          const VkRenderingAttachmentLocationInfoKHR *att_loc_info =
203             vk_get_command_buffer_rendering_attachment_location_info(
204                cmd->vk.level, pBeginInfo);
205          if (att_loc_info == NULL)
206             att_loc_info = &att_loc_info_default;
207 
208          vk_cmd_set_rendering_attachment_locations(&cmd->vk, att_loc_info);
209 
210          hk_cmd_buffer_dirty_render_pass(cmd);
211       }
212    }
213 
214    hk_cmd_buffer_dirty_all(cmd);
215 
216    /* If multiview is disabled, always read 0. If multiview is enabled,
217     * hk_set_view_index will dirty the root each draw.
218     */
219    cmd->state.gfx.descriptors.root.draw.view_index = 0;
220    cmd->state.gfx.descriptors.root_dirty = true;
221 }
222 
223 void
224 hk_cmd_invalidate_graphics_state(struct hk_cmd_buffer *cmd)
225 {
226    hk_cmd_buffer_dirty_all(cmd);
227 
228    /* From the Vulkan 1.3.275 spec:
229     *
230     *    "...There is one exception to this rule - if the primary command
231     *    buffer is inside a render pass instance, then the render pass and
232     *    subpass state is not disturbed by executing secondary command
233     *    buffers."
234     *
235     * We need to reset everything EXCEPT the render pass state.
236     */
237    struct hk_rendering_state render_save = cmd->state.gfx.render;
238    memset(&cmd->state.gfx, 0, sizeof(cmd->state.gfx));
239    cmd->state.gfx.render = render_save;
240 }
241 
242 static void
243 hk_attachment_init(struct hk_attachment *att,
244                    const VkRenderingAttachmentInfo *info)
245 {
246    if (info == NULL || info->imageView == VK_NULL_HANDLE) {
247       *att = (struct hk_attachment){
248          .iview = NULL,
249       };
250       return;
251    }
252 
253    VK_FROM_HANDLE(hk_image_view, iview, info->imageView);
254    *att = (struct hk_attachment){
255       .vk_format = iview->vk.format,
256       .iview = iview,
257    };
258 
259    if (info->resolveMode != VK_RESOLVE_MODE_NONE) {
260       VK_FROM_HANDLE(hk_image_view, res_iview, info->resolveImageView);
261       att->resolve_mode = info->resolveMode;
262       att->resolve_iview = res_iview;
263    }
264 }
265 
266 VKAPI_ATTR void VKAPI_CALL
267 hk_GetRenderingAreaGranularityKHR(
268    VkDevice device, const VkRenderingAreaInfoKHR *pRenderingAreaInfo,
269    VkExtent2D *pGranularity)
270 {
271    *pGranularity = (VkExtent2D){.width = 1, .height = 1};
272 }
273 
274 static bool
275 is_attachment_stored(const VkRenderingAttachmentInfo *att)
276 {
277    /* When resolving, we store the intermediate multisampled image as the
278     * resolve is a separate control stream. This could be optimized.
279     */
280    return att->storeOp == VK_ATTACHMENT_STORE_OP_STORE ||
281           att->resolveMode != VK_RESOLVE_MODE_NONE;
282 }
283 
284 static struct hk_bg_eot
285 hk_build_bg_eot(struct hk_cmd_buffer *cmd, const VkRenderingInfo *info,
286                 bool store, bool partial_render, bool incomplete_render_area)
287 {
288    struct hk_device *dev = hk_cmd_buffer_device(cmd);
289    struct hk_rendering_state *render = &cmd->state.gfx.render;
290 
291    /* Construct the key */
292    struct agx_bg_eot_key key = {.tib = render->tilebuffer};
293    static_assert(AGX_BG_EOT_NONE == 0, "default initializer");
294 
295    key.tib.layered = (render->cr.layers > 1);
296 
297    bool needs_textures_for_spilled_rts =
298       agx_tilebuffer_spills(&render->tilebuffer) && !partial_render && !store;
299 
300    for (unsigned i = 0; i < info->colorAttachmentCount; ++i) {
301       const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[i];
302       if (att_info->imageView == VK_NULL_HANDLE)
303          continue;
304 
305       /* Partial render programs exist only to store/load the tilebuffer to
306        * main memory. When render targets are already spilled to main memory,
307        * there's nothing to do.
308        */
309       if (key.tib.spilled[i] && (partial_render || store))
310          continue;
311 
312       if (store) {
313          bool should_store = is_attachment_stored(att_info);
314 
315          /* Partial renders always need to flush to memory. */
316          should_store |= partial_render;
317 
318          if (should_store)
319             key.op[i] = AGX_EOT_STORE;
320       } else {
321          bool load = att_info->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD;
322          bool clear = att_info->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR;
323 
324          /* The background program used for partial renders must always load
325           * whatever was stored in the mid-frame end-of-tile program.
326           */
327          load |= partial_render;
328 
329          /* With an incomplete render area, we're forced to load back tiles and
330           * then use the 3D pipe for the clear.
331           */
332          load |= incomplete_render_area;
333 
334          /* Don't read back spilled render targets, they're already in memory */
335          load &= !key.tib.spilled[i];
336 
337          /* This is a very frustrating corner case. From the spec:
338           *
339           *     VK_ATTACHMENT_STORE_OP_NONE specifies the contents within the
340           *     render area are not accessed by the store operation as long as
341           *     no values are written to the attachment during the render pass.
342           *
343           * With VK_ATTACHMENT_STORE_OP_NONE, we suppress stores on the main
344           * end-of-tile program. Unfortunately, that's not enough: we also need
345           * to preserve the contents throughout partial renders. The easiest way
346           * to do that is forcing a load in the background program, so that
347           * partial stores for unused attachments will be no-op'd by writing
348           * existing contents.
349           *
350           * Optimizing this would require nontrivial tracking. Fortunately,
351           * this is all Android gunk and we don't have to care too much for
352           * desktop games. So do the simple thing.
353           */
354          bool no_store = (att_info->storeOp == VK_ATTACHMENT_STORE_OP_NONE);
355          bool no_store_wa = no_store && !load && !clear;
356          if (no_store_wa) {
357             perf_debug(dev, "STORE_OP_NONE workaround");
358          }
359 
360          load |= no_store_wa;
361 
362          /* Don't apply clears for spilled render targets when we clear the
363           * render area explicitly after.
364           */
365          if (key.tib.spilled[i] && incomplete_render_area)
366             continue;
367 
368          if (load)
369             key.op[i] = AGX_BG_LOAD;
370          else if (clear)
371             key.op[i] = AGX_BG_CLEAR;
372       }
373    }
374 
375    /* Begin building the pipeline */
376    size_t usc_size = agx_usc_size(3 + HK_MAX_RTS);
377    struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64);
378    if (!t.cpu)
379       return (struct hk_bg_eot){.usc = t.gpu};
380 
381    struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);
382 
383    bool uses_txf = false;
384    unsigned uniforms = 0;
385    unsigned nr_tex = 0;
386 
387    for (unsigned rt = 0; rt < HK_MAX_RTS; ++rt) {
388       const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[rt];
389       struct hk_image_view *iview = render->color_att[rt].iview;
390 
391       if (key.op[rt] == AGX_BG_LOAD) {
392          uses_txf = true;
393 
394          uint32_t index = key.tib.layered
395                              ? iview->planes[0].layered_background_desc_index
396                              : iview->planes[0].background_desc_index;
397 
398          agx_usc_pack(&b, TEXTURE, cfg) {
399             /* Shifted to match eMRT indexing, could be optimized */
400             cfg.start = rt * 2;
401             cfg.count = 1;
402             cfg.buffer = dev->images.bo->va->addr + index * AGX_TEXTURE_LENGTH;
403          }
404 
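         /* The texture state count must cover the highest slot bound, i.e.
          * rt * 2 with the shifted eMRT-style indexing above.
          */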
405          nr_tex = (rt * 2) + 1;
406       } else if (key.op[rt] == AGX_BG_CLEAR) {
407          static_assert(sizeof(att_info->clearValue.color) == 16, "fixed ABI");
408          uint64_t colour =
409             hk_pool_upload(cmd, &att_info->clearValue.color, 16, 16);
410 
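         /* The 16-byte clear value occupies 8 consecutive 16-bit uniform
          * registers starting at uniform 4 + 8 * rt, matching the layout the
          * clear background shader presumably expects.
          */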
411          agx_usc_uniform(&b, 4 + (8 * rt), 8, colour);
412          uniforms = MAX2(uniforms, 4 + (8 * rt) + 8);
413       } else if (key.op[rt] == AGX_EOT_STORE) {
414          uint32_t index = key.tib.layered
415                              ? iview->planes[0].layered_eot_pbe_desc_index
416                              : iview->planes[0].eot_pbe_desc_index;
417 
418          agx_usc_pack(&b, TEXTURE, cfg) {
419             cfg.start = rt;
420             cfg.count = 1;
421             cfg.buffer = dev->images.bo->va->addr + index * AGX_TEXTURE_LENGTH;
422          }
423 
424          nr_tex = rt + 1;
425       }
426    }
427 
428    if (needs_textures_for_spilled_rts) {
429       hk_usc_upload_spilled_rt_descs(&b, cmd);
430       uniforms = MAX2(uniforms, 4);
431    }
432 
433    if (uses_txf) {
434       agx_usc_push_packed(&b, SAMPLER, dev->dev.txf_sampler);
435    }
436 
437    /* For attachmentless rendering, we don't know the sample count until
438     * draw-time. But we have trivial bg/eot programs in that case too.
439     */
440    if (key.tib.nr_samples >= 1) {
441       agx_usc_push_packed(&b, SHARED, &key.tib.usc);
442    } else {
443       assert(key.tib.sample_size_B == 0);
444       agx_usc_shared_none(&b);
445 
446       key.tib.nr_samples = 1;
447    }
448 
449    /* Get the shader */
450    key.reserved_preamble = uniforms;
451    /* XXX: locking? */
452    struct agx_bg_eot_shader *shader = agx_get_bg_eot_shader(&dev->bg_eot, &key);
453 
454    agx_usc_pack(&b, SHADER, cfg) {
455       cfg.code = agx_usc_addr(&dev->dev, shader->ptr);
456       cfg.unk_2 = 0;
457    }
458 
459    agx_usc_pack(&b, REGISTERS, cfg)
460       cfg.register_count = shader->info.nr_gprs;
461 
462    if (shader->info.has_preamble) {
463       agx_usc_pack(&b, PRESHADER, cfg) {
464          cfg.code =
465             agx_usc_addr(&dev->dev, shader->ptr + shader->info.preamble_offset);
466       }
467    } else {
468       agx_usc_pack(&b, NO_PRESHADER, cfg)
469          ;
470    }
471 
472    struct hk_bg_eot ret = {.usc = t.gpu};
473 
474    agx_pack(&ret.counts, COUNTS, cfg) {
475       cfg.uniform_register_count = shader->info.push_count;
476       cfg.preshader_register_count = shader->info.nr_preamble_gprs;
477       cfg.texture_state_register_count = nr_tex;
478       cfg.sampler_state_register_count =
479          agx_translate_sampler_state_count(uses_txf ? 1 : 0, false);
480    }
481 
482    return ret;
483 }
484 
485 static bool
486 is_aligned(unsigned x, unsigned pot_alignment)
487 {
488    assert(util_is_power_of_two_nonzero(pot_alignment));
489    return (x & (pot_alignment - 1)) == 0;
490 }
491 
492 static void
493 hk_merge_render_iview(struct hk_rendering_state *render,
494                       struct hk_image_view *iview, bool zls)
495 {
496    if (iview) {
497       unsigned samples = iview->vk.image->samples;
498       /* TODO: is this right for ycbcr? */
499       unsigned level = iview->vk.base_mip_level;
500       unsigned width = u_minify(iview->vk.image->extent.width, level);
501       unsigned height = u_minify(iview->vk.image->extent.height, level);
502 
503       assert(render->tilebuffer.nr_samples == 0 ||
504              render->tilebuffer.nr_samples == samples);
505       render->tilebuffer.nr_samples = samples;
506 
507       /* TODO: Is this merging logic sound? Not sure how this is supposed to
508        * work conceptually.
509        */
510       render->cr.width = MAX2(render->cr.width, width);
511       render->cr.height = MAX2(render->cr.height, height);
512 
513       if (zls) {
514          render->cr.zls_width = width;
515          render->cr.zls_height = height;
516       }
517    }
518 }
519 
520 static void
521 hk_pack_zls_control(struct agx_zls_control_packed *packed,
522                     struct ail_layout *z_layout, struct ail_layout *s_layout,
523                     const VkRenderingAttachmentInfo *attach_z,
524                     const VkRenderingAttachmentInfo *attach_s,
525                     bool incomplete_render_area, bool partial_render)
526 {
527    agx_pack(packed, ZLS_CONTROL, zls_control) {
528       if (z_layout) {
529          /* XXX: Dropping Z stores is wrong if the render pass gets split into
530           * multiple control streams (can that ever happen?) We need more ZLS
531           * variants. Force || true for now.
532           */
533          zls_control.z_store_enable =
534             attach_z->storeOp == VK_ATTACHMENT_STORE_OP_STORE ||
535             attach_z->resolveMode != VK_RESOLVE_MODE_NONE || partial_render ||
536             true;
537 
538          zls_control.z_load_enable =
539             attach_z->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || partial_render ||
540             incomplete_render_area;
541 
542          if (ail_is_compressed(z_layout)) {
543             zls_control.z_compress_1 = true;
544             zls_control.z_compress_2 = true;
545          }
546 
547          if (z_layout->format == PIPE_FORMAT_Z16_UNORM) {
548             zls_control.z_format = AGX_ZLS_FORMAT_16;
549          } else {
550             zls_control.z_format = AGX_ZLS_FORMAT_32F;
551          }
552       }
553 
554       if (s_layout) {
555          /* TODO:
556           * dEQP-VK.renderpass.dedicated_allocation.formats.d32_sfloat_s8_uint.input.dont_care.store.self_dep_clear_draw_use_input_aspect
557           * fails without the force.
558           *
559           * Maybe a VkRenderPass emulation bug.
560           */
561          zls_control.s_store_enable =
562             attach_s->storeOp == VK_ATTACHMENT_STORE_OP_STORE ||
563             attach_s->resolveMode != VK_RESOLVE_MODE_NONE || partial_render ||
564             true;
565 
566          zls_control.s_load_enable =
567             attach_s->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || partial_render ||
568             incomplete_render_area;
569 
570          if (ail_is_compressed(s_layout)) {
571             zls_control.s_compress_1 = true;
572             zls_control.s_compress_2 = true;
573          }
574       }
575    }
576 }
577 
578 VKAPI_ATTR void VKAPI_CALL
579 hk_CmdBeginRendering(VkCommandBuffer commandBuffer,
580                      const VkRenderingInfo *pRenderingInfo)
581 {
582    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
583    struct hk_rendering_state *render = &cmd->state.gfx.render;
584    struct hk_device *dev = hk_cmd_buffer_device(cmd);
585 
586    memset(render, 0, sizeof(*render));
587 
588    render->flags = pRenderingInfo->flags;
589    render->area = pRenderingInfo->renderArea;
590    render->view_mask = pRenderingInfo->viewMask;
591    render->layer_count = pRenderingInfo->layerCount;
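   /* 0 means "not yet known"; hk_merge_render_iview fills this in from the
    * attachments, and attachmentless rendering infers it at draw time.
    */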
592    render->tilebuffer.nr_samples = 0;
593 
594    const uint32_t layer_count = render->view_mask
595                                    ? util_last_bit(render->view_mask)
596                                    : render->layer_count;
597 
598    render->color_att_count = pRenderingInfo->colorAttachmentCount;
599    for (uint32_t i = 0; i < render->color_att_count; i++) {
600       hk_attachment_init(&render->color_att[i],
601                          &pRenderingInfo->pColorAttachments[i]);
602    }
603 
604    hk_attachment_init(&render->depth_att, pRenderingInfo->pDepthAttachment);
605    hk_attachment_init(&render->stencil_att, pRenderingInfo->pStencilAttachment);
606 
607    for (uint32_t i = 0; i < render->color_att_count; i++) {
608       hk_merge_render_iview(render, render->color_att[i].iview, false);
609    }
610 
611    hk_merge_render_iview(
612       render, render->depth_att.iview ?: render->stencil_att.iview, true);
613 
614    /* Infer the extent for attachmentless rendering; samples are inferred at draw-time. */
615    render->cr.width =
616       MAX2(render->cr.width, render->area.offset.x + render->area.extent.width);
617 
618    render->cr.height = MAX2(render->cr.height,
619                             render->area.offset.y + render->area.extent.height);
620 
621    if (!render->cr.zls_width) {
622       render->cr.zls_width = render->cr.width;
623       render->cr.zls_height = render->cr.height;
624    }
625 
626    render->cr.layers = layer_count;
627 
628    /* Choose a tilebuffer layout given the framebuffer key */
629    enum pipe_format formats[HK_MAX_RTS] = {0};
630    for (unsigned i = 0; i < render->color_att_count; ++i) {
631       formats[i] = hk_format_to_pipe_format(render->color_att[i].vk_format);
632    }
633 
634    /* For now, we force layered=true since it makes compatibility problems way
635     * easier.
636     */
637    render->tilebuffer = agx_build_tilebuffer_layout(
638       formats, render->color_att_count, render->tilebuffer.nr_samples, true);
639 
640    const VkRenderingAttachmentLocationInfoKHR ral_info = {
641       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
642       .colorAttachmentCount = pRenderingInfo->colorAttachmentCount,
643    };
644    vk_cmd_set_rendering_attachment_locations(&cmd->vk, &ral_info);
645 
646    hk_cmd_buffer_dirty_render_pass(cmd);
647 
648    /* Determine whether the render area is complete, enabling us to use a
649     * fast-clear.
650     *
651     * TODO: If it is incomplete but tile aligned, it should be possible to fast
652     * clear with the appropriate settings. This is critical for performance.
653     */
654    bool incomplete_render_area =
655       render->area.offset.x > 0 || render->area.offset.y > 0 ||
656       render->area.extent.width < render->cr.width ||
657       render->area.extent.height < render->cr.height ||
658       (render->view_mask &&
659        render->view_mask != BITFIELD64_MASK(render->cr.layers));
660 
661    perf_debug(dev, "Rendering %ux%ux%u@%u %s%s", render->cr.width,
662               render->cr.height, render->cr.layers,
663               render->tilebuffer.nr_samples,
664               render->view_mask ? " multiview" : "",
665               incomplete_render_area ? " incomplete" : "");
666 
667    render->cr.bg.main = hk_build_bg_eot(cmd, pRenderingInfo, false, false,
668                                         incomplete_render_area);
669    render->cr.bg.partial =
670       hk_build_bg_eot(cmd, pRenderingInfo, false, true, incomplete_render_area);
671 
672    render->cr.eot.main =
673       hk_build_bg_eot(cmd, pRenderingInfo, true, false, incomplete_render_area);
674    render->cr.eot.partial =
675       hk_build_bg_eot(cmd, pRenderingInfo, true, true, incomplete_render_area);
676 
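   /* Magic background-object control value; the stencil clear value is OR'd
    * into its low bits further down when a stencil attachment is present.
    */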
677    render->cr.isp_bgobjvals = 0x300;
678 
679    const VkRenderingAttachmentInfo *attach_z = pRenderingInfo->pDepthAttachment;
680    const VkRenderingAttachmentInfo *attach_s =
681       pRenderingInfo->pStencilAttachment;
682 
683    render->cr.iogpu_unk_214 = 0xc000;
684 
685    struct ail_layout *z_layout = NULL, *s_layout = NULL;
686 
687    if (attach_z != NULL && attach_z != VK_NULL_HANDLE && attach_z->imageView) {
688       struct hk_image_view *view = render->depth_att.iview;
689       struct hk_image *image =
690          container_of(view->vk.image, struct hk_image, vk);
691 
692       z_layout = &image->planes[0].layout;
693 
694       unsigned level = view->vk.base_mip_level;
695       unsigned first_layer = view->vk.base_array_layer;
696 
697       const struct util_format_description *desc =
698          util_format_description(hk_format_to_pipe_format(view->vk.format));
699 
700       assert(desc->format == PIPE_FORMAT_Z32_FLOAT ||
701              desc->format == PIPE_FORMAT_Z16_UNORM ||
702              desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
703 
704       render->cr.depth.buffer =
705          hk_image_base_address(image, 0) +
706          ail_get_layer_level_B(z_layout, first_layer, level);
707 
708       /* Main stride in pages */
709       assert((z_layout->depth_px == 1 ||
710               is_aligned(z_layout->layer_stride_B, AIL_PAGESIZE)) &&
711              "Page aligned Z layers");
712 
713       unsigned stride_pages = z_layout->layer_stride_B / AIL_PAGESIZE;
714       render->cr.depth.stride = ((stride_pages - 1) << 14) | 1;
715 
716       assert(z_layout->tiling != AIL_TILING_LINEAR && "must tile");
717 
718       if (ail_is_compressed(z_layout)) {
719          render->cr.depth.meta =
720             hk_image_base_address(image, 0) + z_layout->metadata_offset_B +
721             (first_layer * z_layout->compression_layer_stride_B) +
722             z_layout->level_offsets_compressed_B[level];
723 
724          /* Meta stride in cache lines */
725          assert(
726             is_aligned(z_layout->compression_layer_stride_B, AIL_CACHELINE) &&
727             "Cacheline aligned Z meta layers");
728 
729          unsigned stride_lines =
730             z_layout->compression_layer_stride_B / AIL_CACHELINE;
731          render->cr.depth.meta_stride = (stride_lines - 1) << 14;
732       }
733 
734       float clear_depth = attach_z->clearValue.depthStencil.depth;
735 
736       if (z_layout->format == PIPE_FORMAT_Z16_UNORM) {
737          render->cr.isp_bgobjdepth = _mesa_float_to_unorm(clear_depth, 16);
738       } else {
739          render->cr.isp_bgobjdepth = fui(clear_depth);
740       }
741    }
742 
743    if (attach_s != NULL && attach_s != VK_NULL_HANDLE && attach_s->imageView) {
744       struct hk_image_view *view = render->stencil_att.iview;
745       struct hk_image *image =
746          container_of(view->vk.image, struct hk_image, vk);
747 
748       /* Stencil is always the last plane (possibly the only plane) */
749       unsigned plane = image->plane_count - 1;
750       s_layout = &image->planes[plane].layout;
751       assert(s_layout->format == PIPE_FORMAT_S8_UINT);
752 
753       unsigned level = view->vk.base_mip_level;
754       unsigned first_layer = view->vk.base_array_layer;
755 
756       render->cr.stencil.buffer =
757          hk_image_base_address(image, plane) +
758          ail_get_layer_level_B(s_layout, first_layer, level);
759 
760       /* Main stride in pages */
761       assert((s_layout->depth_px == 1 ||
762               is_aligned(s_layout->layer_stride_B, AIL_PAGESIZE)) &&
763              "Page aligned S layers");
764       unsigned stride_pages = s_layout->layer_stride_B / AIL_PAGESIZE;
765       render->cr.stencil.stride = ((stride_pages - 1) << 14) | 1;
766 
767       if (ail_is_compressed(s_layout)) {
768          render->cr.stencil.meta =
769             hk_image_base_address(image, plane) + s_layout->metadata_offset_B +
770             (first_layer * s_layout->compression_layer_stride_B) +
771             s_layout->level_offsets_compressed_B[level];
772 
773          /* Meta stride in cache lines */
774          assert(
775             is_aligned(s_layout->compression_layer_stride_B, AIL_CACHELINE) &&
776             "Cacheline aligned S meta layers");
777 
778          unsigned stride_lines =
779             s_layout->compression_layer_stride_B / AIL_CACHELINE;
780 
781          render->cr.stencil.meta_stride = (stride_lines - 1) << 14;
782       }
783 
784       render->cr.isp_bgobjvals |= attach_s->clearValue.depthStencil.stencil;
785    }
786 
787    hk_pack_zls_control(&render->cr.zls_control, z_layout, s_layout, attach_z,
788                        attach_s, incomplete_render_area, false);
789 
790    hk_pack_zls_control(&render->cr.zls_control_partial, z_layout, s_layout,
791                        attach_z, attach_s, incomplete_render_area, true);
792 
793    /* If multiview is disabled, always read 0. If multiview is enabled,
794     * hk_set_view_index will dirty the root each draw.
795     */
796    cmd->state.gfx.descriptors.root.draw.view_index = 0;
797    cmd->state.gfx.descriptors.root_dirty = true;
798 
799    if (render->flags & VK_RENDERING_RESUMING_BIT)
800       return;
801 
802    /* The first control stream of the render pass is special since it gets
803     * the clears. Create it and swap in the clear.
804     */
805    assert(!cmd->current_cs.gfx && "not already in a render pass");
806    struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, false /* compute */);
807    if (!cs)
808       return;
809 
810    cs->cr.bg.main = render->cr.bg.main;
811    cs->cr.zls_control = render->cr.zls_control;
812 
813    /* Reordering barrier for post-gfx, in case we had any. */
814    hk_cmd_buffer_end_compute_internal(cmd, &cmd->current_cs.post_gfx);
815 
816    /* Don't reorder compute across render passes.
817     *
818     * TODO: Check if this is necessary if the proper PipelineBarriers are
819     * handled... there may be CTS bugs...
820     */
821    hk_cmd_buffer_end_compute(cmd);
822 
823    /* If we spill colour attachments, we need to decompress them. This happens
824     * at the start of the render; it is not re-emitted when resuming
825     * secondaries. It could be hoisted to the start of the command buffer but
826     * we're not that clever yet.
827     */
828    if (agx_tilebuffer_spills(&render->tilebuffer)) {
829       perf_debug(dev, "eMRT render pass");
830 
831       for (unsigned i = 0; i < render->color_att_count; ++i) {
832          struct hk_image_view *view = render->color_att[i].iview;
833          if (view) {
834             struct hk_image *image =
835                container_of(view->vk.image, struct hk_image, vk);
836 
837             /* TODO: YCbCr interaction? */
838             uint8_t plane = 0;
839             uint8_t image_plane = view->planes[plane].image_plane;
840             struct ail_layout *layout = &image->planes[image_plane].layout;
841 
842             if (ail_is_level_compressed(layout, view->vk.base_mip_level)) {
843                struct hk_device *dev = hk_cmd_buffer_device(cmd);
844                perf_debug(dev, "Decompressing in-place");
845 
846                struct hk_cs *cs = hk_cmd_buffer_get_cs_general(
847                   cmd, &cmd->current_cs.pre_gfx, true);
848                if (!cs)
849                   return;
850 
851                unsigned level = view->vk.base_mip_level;
852                unsigned layer = view->vk.base_array_layer;
853                uint64_t base = hk_image_base_address(image, image_plane);
854 
855                struct libagx_decompress_images imgs = {
856                   .compressed = view->planes[plane].emrt_texture,
857                   .uncompressed = view->planes[plane].emrt_pbe,
858                };
859 
860                struct agx_grid grid =
861                   agx_3d(ail_metadata_width_tl(layout, level) * 32,
862                          ail_metadata_height_tl(layout, level), layer_count);
863 
864                libagx_decompress(cs, grid, AGX_BARRIER_ALL, layout, layer,
865                                  level, base,
866                                  hk_pool_upload(cmd, &imgs, sizeof(imgs), 64));
867             }
868          }
869       }
870    }
871 
872    uint32_t clear_count = 0;
873    VkClearAttachment clear_att[HK_MAX_RTS + 1];
874    bool resolved_clear = false;
875 
876    for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
877       const VkRenderingAttachmentInfo *att_info =
878          &pRenderingInfo->pColorAttachments[i];
879       if (att_info->imageView == VK_NULL_HANDLE ||
880           att_info->loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
881          continue;
882 
883       clear_att[clear_count++] = (VkClearAttachment){
884          .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
885          .colorAttachment = i,
886          .clearValue = att_info->clearValue,
887       };
888 
889       resolved_clear |= is_attachment_stored(att_info);
890    }
891 
892    clear_att[clear_count] = (VkClearAttachment){
893       .aspectMask = 0,
894    };
895 
896    if (attach_z && attach_z->imageView != VK_NULL_HANDLE &&
897        attach_z->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
898       clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
899       clear_att[clear_count].clearValue.depthStencil.depth =
900          attach_z->clearValue.depthStencil.depth;
901 
902       resolved_clear |= is_attachment_stored(attach_z);
903    }
904 
905    if (attach_s != NULL && attach_s->imageView != VK_NULL_HANDLE &&
906        attach_s->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
907       clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
908       clear_att[clear_count].clearValue.depthStencil.stencil =
909          attach_s->clearValue.depthStencil.stencil;
910 
911       resolved_clear |= is_attachment_stored(attach_s);
912    }
913 
914    if (clear_att[clear_count].aspectMask != 0)
915       clear_count++;
916 
917    if (clear_count > 0 && incomplete_render_area) {
918       const VkClearRect clear_rect = {
919          .rect = render->area,
920          .baseArrayLayer = 0,
921          .layerCount = render->view_mask ? 1 : render->layer_count,
922       };
923 
924       hk_CmdClearAttachments(hk_cmd_buffer_to_handle(cmd), clear_count,
925                              clear_att, 1, &clear_rect);
926    } else {
927       /* If a tile is empty, we do not want to process it, as the redundant
928        * roundtrip of memory-->tilebuffer-->memory wastes a tremendous amount of
929        * memory bandwidth. Any draw marks a tile as non-empty, so we only need
930        * to process empty tiles if the background+EOT programs have a side
931        * effect. This is the case exactly when there is an attachment we are
932        * fast clearing and then storing.
933        */
934       cs->cr.process_empty_tiles = resolved_clear;
935    }
936 }
937 
938 VKAPI_ATTR void VKAPI_CALL
939 hk_CmdEndRendering(VkCommandBuffer commandBuffer)
940 {
941    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
942    struct hk_rendering_state *render = &cmd->state.gfx.render;
943    struct hk_device *dev = hk_cmd_buffer_device(cmd);
944 
945    /* The last control stream of the render pass is special since it gets its
946     * stores dropped. Swap it in.
947     */
948    struct hk_cs *cs = cmd->current_cs.gfx;
949    if (cs) {
950       cs->cr.eot.main = render->cr.eot.main;
951    }
952 
953    perf_debug(dev, "End rendering");
954    hk_cmd_buffer_end_graphics(cmd);
955 
956    bool need_resolve = false;
957 
958    /* Translate render state back to VK for meta */
959    VkRenderingAttachmentInfo vk_color_att[HK_MAX_RTS];
960    for (uint32_t i = 0; i < render->color_att_count; i++) {
961       if (render->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE)
962          need_resolve = true;
963 
964       vk_color_att[i] = (VkRenderingAttachmentInfo){
965          .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
966          .imageView = hk_image_view_to_handle(render->color_att[i].iview),
967          .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
968          .resolveMode = render->color_att[i].resolve_mode,
969          .resolveImageView =
970             hk_image_view_to_handle(render->color_att[i].resolve_iview),
971          .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
972       };
973    }
974 
975    const VkRenderingAttachmentInfo vk_depth_att = {
976       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
977       .imageView = hk_image_view_to_handle(render->depth_att.iview),
978       .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
979       .resolveMode = render->depth_att.resolve_mode,
980       .resolveImageView =
981          hk_image_view_to_handle(render->depth_att.resolve_iview),
982       .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
983    };
984    if (render->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE)
985       need_resolve = true;
986 
987    const VkRenderingAttachmentInfo vk_stencil_att = {
988       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
989       .imageView = hk_image_view_to_handle(render->stencil_att.iview),
990       .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
991       .resolveMode = render->stencil_att.resolve_mode,
992       .resolveImageView =
993          hk_image_view_to_handle(render->stencil_att.resolve_iview),
994       .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
995    };
996    if (render->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE)
997       need_resolve = true;
998 
999    const VkRenderingInfo vk_render = {
1000       .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
1001       .renderArea = render->area,
1002       .layerCount = render->layer_count,
1003       .viewMask = render->view_mask,
1004       .colorAttachmentCount = render->color_att_count,
1005       .pColorAttachments = vk_color_att,
1006       .pDepthAttachment = &vk_depth_att,
1007       .pStencilAttachment = &vk_stencil_att,
1008    };
1009 
1010    if (render->flags & VK_RENDERING_SUSPENDING_BIT)
1011       need_resolve = false;
1012 
1013    memset(render, 0, sizeof(*render));
1014 
1015    if (need_resolve) {
1016       perf_debug(dev, "Resolving render pass, colour store op %u",
1017                  vk_color_att[0].storeOp);
1018 
1019       hk_meta_resolve_rendering(cmd, &vk_render);
1020    }
1021 }
1022 
1023 static uint64_t
1024 hk_geometry_state(struct hk_cmd_buffer *cmd)
1025 {
1026    struct hk_device *dev = hk_cmd_buffer_device(cmd);
1027 
1028    /* We tie heap allocation to geometry state allocation, so allocate now. */
1029    if (unlikely(!dev->heap)) {
1030       perf_debug(dev, "Allocating heap");
1031 
1032       size_t size = 128 * 1024 * 1024;
1033       dev->heap = agx_bo_create(&dev->dev, size, 0, 0, "Geometry heap");
1034 
1035       /* The geometry state buffer is initialized here and then is treated by
1036        * the CPU as rodata, even though the GPU uses it for scratch internally.
1037        */
1038       off_t off = dev->rodata.geometry_state - dev->rodata.bo->va->addr;
1039       struct agx_geometry_state *map = agx_bo_map(dev->rodata.bo) + off;
1040 
1041       *map = (struct agx_geometry_state){
1042          .heap = dev->heap->va->addr,
1043          .heap_size = size,
1044       };
1045    }
1046 
1047    /* We need to free all allocations after each command buffer execution */
1048    if (!cmd->uses_heap) {
1049       perf_debug(dev, "Freeing heap");
1050       uint64_t addr = dev->rodata.geometry_state;
1051 
1052       /* Zeroing the allocated index frees everything */
1053       hk_queue_write(cmd,
1054                      addr + offsetof(struct agx_geometry_state, heap_bottom), 0,
1055                      true /* after gfx */);
1056 
1057       cmd->uses_heap = true;
1058    }
1059 
1060    return dev->rodata.geometry_state;
1061 }
1062 
1063 static uint64_t
1064 hk_upload_ia_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
1065 {
1066    struct hk_device *dev = hk_cmd_buffer_device(cmd);
1067    assert(!agx_is_indirect(draw.b) && "indirect params written by GPU");
1068 
1069    struct agx_ia_state ia = {.verts_per_instance = draw.b.count[0]};
1070 
1071    if (draw.indexed) {
1072       unsigned index_size_B = agx_index_size_to_B(draw.index_size);
1073       unsigned range_el = agx_draw_index_range_el(draw);
1074 
1075       ia.index_buffer =
1076          libagx_index_buffer(agx_draw_index_buffer(draw), range_el, 0,
1077                              index_size_B, dev->rodata.zero_sink);
1078 
1079       ia.index_buffer_range_el = range_el;
1080    }
1081 
1082    return hk_pool_upload(cmd, &ia, sizeof(ia), 8);
1083 }
1084 
1085 static enum mesa_prim
1086 hk_gs_in_prim(struct hk_cmd_buffer *cmd)
1087 {
1088    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1089    struct hk_graphics_state *gfx = &cmd->state.gfx;
1090    struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL];
1091 
1092    if (tes != NULL)
1093       return gfx->tess.prim;
1094    else
1095       return vk_conv_topology(dyn->ia.primitive_topology);
1096 }
1097 
1098 static enum mesa_prim
1099 hk_rast_prim(struct hk_cmd_buffer *cmd)
1100 {
1101    struct hk_graphics_state *gfx = &cmd->state.gfx;
1102    struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
1103    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1104 
1105    if (gs != NULL) {
1106       return gs->variants[HK_GS_VARIANT_RAST].info.gs.out_prim;
1107    } else {
1108       switch (dyn->ia.primitive_topology) {
1109       case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
1110       case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
1111          return MESA_PRIM_LINES;
1112       case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
1113       case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
1114          return MESA_PRIM_TRIANGLES;
1115       default:
1116          return hk_gs_in_prim(cmd);
1117       }
1118    }
1119 }
1120 
1121 static uint64_t
1122 hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw)
1123 {
1124    struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
1125    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1126    struct hk_graphics_state *gfx = &cmd->state.gfx;
1127    struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
1128    struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]);
1129 
1130    bool rast_disc = dyn->rs.rasterizer_discard_enable;
1131    struct hk_shader *count = hk_count_gs_variant(gs, rast_disc);
1132 
1133    /* XXX: We should deduplicate this logic */
1134    bool indirect = agx_is_indirect(draw.b) ||
1135                    gfx->shaders[MESA_SHADER_TESS_EVAL] || draw.restart;
1136    enum mesa_prim mode = hk_gs_in_prim(cmd);
1137 
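   /* Primitive restart is unrolled (see hk_draw_without_restart), so the GS
    * sees the decomposed topology.
    */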
1138    if (draw.restart) {
1139       mode = u_decomposed_prim(mode);
1140    }
1141 
1142    struct agx_geometry_params params = {
1143       .state = hk_geometry_state(cmd),
1144       .indirect_desc = cmd->geom_indirect,
1145       .flat_outputs = fs ? fs->info.fs.interp.flat : 0,
1146       .input_topology = mode,
1147 
1148       /* Overridden by the indirect setup kernel. As tess->GS is always indirect,
1149        * we can assume here that we're VS->GS.
1150        */
1151       .input_buffer = desc->root.draw.vertex_output_buffer,
1152       .input_mask = desc->root.draw.vertex_outputs,
1153    };
1154 
1155    if (gfx->xfb_enabled) {
1156       for (unsigned i = 0; i < ARRAY_SIZE(gfx->xfb); ++i) {
1157          params.xfb_base_original[i] = gfx->xfb[i].addr;
1158          params.xfb_size[i] = gfx->xfb[i].range;
1159          params.xfb_offs_ptrs[i] = gfx->xfb_offsets + i * sizeof(uint32_t);
1160       }
1161    }
1162 
1163    for (unsigned i = 0; i < ARRAY_SIZE(gfx->xfb_query); ++i) {
1164       uint64_t q = gfx->xfb_query[i];
1165 
1166       if (q) {
1167          params.xfb_prims_generated_counter[i] = q;
1168          params.prims_generated_counter[i] = q + sizeof(uint64_t);
1169       }
1170    }
1171 
1172    /* Calculate input primitive count for direct draws, and allocate the vertex
1173     * & count buffers. GPU calculates and allocates for indirect draws.
1174     */
1175    params.count_buffer_stride = count->info.gs.count_words * 4;
1176 
1177    if (indirect) {
1178       params.vs_grid[2] = params.gs_grid[2] = 1;
1179    } else {
1180       uint32_t verts = draw.b.count[0], instances = draw.b.count[1];
1181 
1182       params.vs_grid[0] = verts;
1183       params.gs_grid[0] = u_decomposed_prims_for_vertices(mode, verts);
1184 
1185       params.primitives_log2 = util_logbase2_ceil(params.gs_grid[0]);
1186       params.input_primitives = params.gs_grid[0] * instances;
1187 
1188       unsigned size = params.input_primitives * params.count_buffer_stride;
1189       if (size) {
1190          params.count_buffer = hk_pool_alloc(cmd, size, 4).gpu;
1191       }
1192    }
1193 
1194    desc->root_dirty = true;
1195    return hk_pool_upload(cmd, &params, sizeof(params), 8);
1196 }
1197 
1198 static void
1199 hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct libagx_tess_args *out,
1200                       struct agx_draw draw)
1201 {
1202    struct hk_device *dev = hk_cmd_buffer_device(cmd);
1203    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1204    struct hk_graphics_state *gfx = &cmd->state.gfx;
1205    struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]);
1206 
1207    enum libagx_tess_partitioning partitioning =
1208       gfx->tess.info.spacing == TESS_SPACING_EQUAL
1209          ? LIBAGX_TESS_PARTITIONING_INTEGER
1210       : gfx->tess.info.spacing == TESS_SPACING_FRACTIONAL_ODD
1211          ? LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD
1212          : LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN;
1213 
1214    struct libagx_tess_args args = {
1215       .heap = hk_geometry_state(cmd),
1216       .tcs_stride_el = tcs->info.tess.tcs_output_stride / 4,
1217       .statistic = hk_pipeline_stat_addr(
1218          cmd,
1219          VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT),
1220 
1221       .input_patch_size = dyn->ts.patch_control_points,
1222       .output_patch_size = tcs->info.tess.tcs_output_patch_size,
1223       .tcs_patch_constants = tcs->info.tess.tcs_nr_patch_outputs,
1224       .tcs_per_vertex_outputs = tcs->info.tess.tcs_per_vertex_outputs,
1225       .partitioning = partitioning,
1226       .points_mode = gfx->tess.info.points,
1227    };
1228 
1229    if (!args.points_mode && gfx->tess.info.mode != TESS_PRIMITIVE_ISOLINES) {
1230       args.ccw = gfx->tess.info.ccw;
1231       args.ccw ^=
1232          dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT;
1233    }
1234 
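   /* Each generated draw is five 32-bit words, the size of an indexed
    * indirect draw descriptor (VkDrawIndexedIndirectCommand).
    */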
1235    uint32_t draw_stride_el = 5;
1236    size_t draw_stride_B = draw_stride_el * sizeof(uint32_t);
1237 
1238    /* heap is allocated by hk_geometry_state */
1239    args.patch_coord_buffer = dev->heap->va->addr;
1240 
1241    if (!agx_is_indirect(draw.b)) {
1242       unsigned in_patches = draw.b.count[0] / args.input_patch_size;
1243       unsigned unrolled_patches = in_patches * draw.b.count[1];
1244 
1245       uint32_t alloc = 0;
1246       uint32_t tcs_out_offs = alloc;
1247       alloc += unrolled_patches * args.tcs_stride_el * 4 * 32;
1248 
1249       uint32_t patch_coord_offs = alloc;
1250       alloc += unrolled_patches * 4 * 32;
1251 
1252       uint32_t count_offs = alloc;
1253       alloc += unrolled_patches * sizeof(uint32_t) * 32;
1254 
1255       /* Single API draw */
1256       uint32_t draw_offs = alloc;
1257       alloc += draw_stride_B;
1258 
1259       struct agx_ptr blob = hk_pool_alloc(cmd, alloc, 4);
1260       args.tcs_buffer = blob.gpu + tcs_out_offs;
1261       args.patches_per_instance = in_patches;
1262       args.coord_allocs = blob.gpu + patch_coord_offs;
1263       args.nr_patches = unrolled_patches;
1264       args.out_draws = blob.gpu + draw_offs;
1265       args.counts = blob.gpu + count_offs;
1266    } else {
1267       /* Allocate 3x indirect global+local grids for VS/TCS/tess */
1268       uint32_t grid_stride = sizeof(uint32_t) * 6;
1269       gfx->tess.grids = hk_pool_alloc(cmd, grid_stride * 3, 4).gpu;
1270 
1271       args.out_draws = hk_pool_alloc(cmd, draw_stride_B, 4).gpu;
1272    }
1273 
1274    gfx->tess.out_draws = args.out_draws;
1275    memcpy(out, &args, sizeof(args));
1276 }
1277 
1278 static struct hk_api_shader *
1279 hk_build_meta_shader_locked(struct hk_device *dev, struct hk_internal_key *key,
1280                             hk_internal_builder_t builder)
1281 {
1282    /* Try to get the cached shader */
1283    struct hash_entry *ent = _mesa_hash_table_search(dev->kernels.ht, key);
1284    if (ent)
1285       return ent->data;
1286 
1287    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
1288                                                   &agx_nir_options, NULL);
1289    builder(&b, key->key);
1290 
1291    const struct vk_pipeline_robustness_state rs = {
1292       .images = VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DISABLED_EXT,
1293       .storage_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
1294       .uniform_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
1295       .vertex_inputs = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
1296    };
1297 
1298    struct vk_shader_compile_info info = {
1299       .stage = b.shader->info.stage,
1300       .nir = b.shader,
1301       .robustness = &rs,
1302    };
1303 
1304    hk_preprocess_nir_internal(dev->vk.physical, b.shader);
1305 
1306    struct hk_api_shader *s;
1307    if (hk_compile_shader(dev, &info, NULL, NULL, &s) != VK_SUCCESS)
1308       return NULL;
1309 
1310    /* ..and cache it before we return. The key is on the stack right now, so
1311     * clone it before using it as a hash table key. The clone is logically owned
1312     * by the hash table.
1313     */
1314    size_t total_key_size = sizeof(*key) + key->key_size;
1315    void *cloned_key = ralloc_memdup(dev->kernels.ht, key, total_key_size);
1316 
1317    _mesa_hash_table_insert(dev->kernels.ht, cloned_key, s);
1318    return s;
1319 }
1320 
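/* Illustrative use (builder name hypothetical): callers pass a NIR builder
 * callback plus a small key blob, e.g.
 *
 *    struct hk_api_shader *s =
 *       hk_meta_shader(dev, hk_nir_build_my_kernel, &my_key, sizeof(my_key));
 *
 * The builder pointer and key blob together form the cache key, so repeated
 * requests reuse the shader cached in dev->kernels.ht.
 */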
1321 struct hk_api_shader *
1322 hk_meta_shader(struct hk_device *dev, hk_internal_builder_t builder, void *data,
1323                size_t data_size)
1324 {
1325    size_t total_key_size = sizeof(struct hk_internal_key) + data_size;
1326 
1327    struct hk_internal_key *key = alloca(total_key_size);
1328    key->builder = builder;
1329    key->key_size = data_size;
1330 
1331    if (data_size)
1332       memcpy(key->key, data, data_size);
1333 
1334    simple_mtx_lock(&dev->kernels.lock);
1335    struct hk_api_shader *s = hk_build_meta_shader_locked(dev, key, builder);
1336    simple_mtx_unlock(&dev->kernels.lock);
1337 
1338    return s;
1339 }
1340 
1341 static struct agx_draw
1342 hk_draw_as_indexed_indirect(struct hk_cmd_buffer *cmd, struct agx_draw draw)
1343 {
1344    assert(draw.indexed);
1345 
1346    if (agx_is_indirect(draw.b))
1347       return draw;
1348 
1349    VkDrawIndexedIndirectCommand desc = {
1350       .indexCount = draw.b.count[0],
1351       .instanceCount = draw.b.count[1],
1352       .firstIndex = draw.start,
1353       .vertexOffset = draw.index_bias,
1354       .firstInstance = draw.start_instance,
1355    };
1356 
1357    return agx_draw_indexed_indirect(
1358       hk_pool_upload(cmd, &desc, sizeof(desc), 4), draw.index_buffer,
1359       draw.index_buffer_range_B, draw.index_size, draw.restart);
1360 }
1361 
1362 static struct agx_draw
1363 hk_draw_without_restart(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
1364                         struct agx_draw draw, uint32_t draw_count)
1365 {
1366    struct hk_device *dev = hk_cmd_buffer_device(cmd);
1367    struct hk_graphics_state *gfx = &cmd->state.gfx;
1368    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1369 
1370    perf_debug(dev, "Unrolling primitive restart due to GS/XFB");
1371 
1372    /* The unroll kernel assumes an indirect draw. Synthesize one if needed */
1373    draw = hk_draw_as_indexed_indirect(cmd, draw);
1374 
1375    /* Next, we unroll the index buffer used by the indirect draw */
1376    enum mesa_prim prim = vk_conv_topology(dyn->ia.primitive_topology);
1377 
1378    assert(draw_count == 1 && "TODO: multidraw");
1379 
1380    struct libagx_unroll_restart_args ia = {
1381       .heap = hk_geometry_state(cmd),
1382       .index_buffer = draw.index_buffer,
1383       .in_draw = draw.b.ptr,
1384       .out_draw = hk_pool_alloc(cmd, 5 * sizeof(uint32_t) * draw_count, 4).gpu,
1385       .max_draws = 1 /* TODO: MDI */,
1386       .restart_index = gfx->index.restart,
1387       .index_buffer_size_el = agx_draw_index_range_el(draw),
1388       .flatshade_first =
1389          dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT,
1390       .zero_sink = dev->rodata.zero_sink,
1391    };
1392 
1393    libagx_unroll_restart_struct(cs, agx_1d(1024 * draw_count), AGX_BARRIER_ALL,
1394                                 ia, draw.index_size, libagx_compact_prim(prim));
1395 
1396    return agx_draw_indexed_indirect(ia.out_draw, dev->heap->va->addr,
1397                                     dev->heap->size, draw.index_size,
1398                                     false /* restart */);
1399 }
1400 
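/* Run the pre-rasterization part of a geometry shader pipeline as compute:
 * the software vertex shader, an optional count pass plus prefix sum, the
 * pre-GS pass, and the main GS variant. The GS writes indices into the heap,
 * so we return an indexed indirect draw of its output.
 */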
1401 static struct agx_draw
1402 hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
1403                      struct agx_draw draw)
1404 {
1405    struct hk_device *dev = hk_cmd_buffer_device(cmd);
1406    struct hk_graphics_state *gfx = &cmd->state.gfx;
1407    struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
1408    struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
1409    struct agx_grid grid_vs, grid_gs;
1410 
1411    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1412    bool rast_disc = dyn->rs.rasterizer_discard_enable;
1413 
1414    hk_ensure_cs_has_space(cmd, cs, 0x2000 /*XXX*/);
1415 
1416    struct hk_shader *vs = hk_bound_sw_vs_before_gs(gfx);
1417    struct hk_shader *main = hk_main_gs_variant(gs, rast_disc);
1418    struct hk_shader *count = hk_count_gs_variant(gs, rast_disc);
1419    struct hk_shader *pre_gs = hk_pre_gs_variant(gs, rast_disc);
1420 
1421    uint64_t geometry_params = desc->root.draw.geometry_params;
1422    unsigned count_words = count->info.gs.count_words;
1423 
1424    if (false /* TODO */)
1425       perf_debug(dev, "Transform feedbck");
1426    else if (count_words)
1427       perf_debug(dev, "Geometry shader with counts");
1428    else
1429       perf_debug(dev, "Geometry shader without counts");
1430 
1431    enum mesa_prim mode = hk_gs_in_prim(cmd);
1432 
1433    if (draw.restart) {
1434       draw = hk_draw_without_restart(cmd, cs, draw, 1);
1435       mode = u_decomposed_prim(mode);
1436    }
1437 
1438    /* Setup grids */
1439    if (agx_is_indirect(draw.b)) {
1440       struct libagx_gs_setup_indirect_args gsi = {
1441          .index_buffer = draw.index_buffer,
1442          .zero_sink = dev->rodata.zero_sink,
1443          .draw = draw.b.ptr,
1444          .ia = desc->root.draw.input_assembly,
1445          .p = desc->root.draw.geometry_params,
1446          .vs_outputs = vs->b.info.outputs,
1447          .prim = mode,
1448       };
1449 
1450       if (cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]) {
1451          gsi.vertex_buffer = desc->root.draw.tess_params +
1452                              offsetof(struct libagx_tess_args, tes_buffer);
1453       } else {
1454          gsi.vertex_buffer = desc->root.root_desc_addr +
1455                              offsetof(struct hk_root_descriptor_table,
1456                                       draw.vertex_output_buffer);
1457       }
1458 
1459       if (draw.indexed) {
1460          gsi.index_size_B = agx_index_size_to_B(draw.index_size);
1461          gsi.index_buffer_range_el = agx_draw_index_range_el(draw);
1462       }
1463 
1464       libagx_gs_setup_indirect_struct(cs, agx_1d(1), AGX_BARRIER_ALL, gsi);
1465 
1466       grid_vs = agx_grid_indirect(
1467          geometry_params + offsetof(struct agx_geometry_params, vs_grid));
1468 
1469       grid_gs = agx_grid_indirect(
1470          geometry_params + offsetof(struct agx_geometry_params, gs_grid));
1471    } else {
1472       grid_vs = grid_gs = draw.b;
1473       grid_gs.count[0] = u_decomposed_prims_for_vertices(mode, draw.b.count[0]);
1474    }
1475 
1476    /* Launch the vertex shader first */
1477    hk_reserve_scratch(cmd, cs, vs);
1478    hk_dispatch_with_usc(dev, cs, &vs->b.info,
1479                         hk_upload_usc_words(cmd, vs,
1480                                             vs->info.stage == MESA_SHADER_VERTEX
1481                                                ? gfx->linked[MESA_SHADER_VERTEX]
1482                                                : vs->only_linked),
1483                         grid_vs, agx_workgroup(1, 1, 1));
1484 
1485    /* If we need counts, launch the count shader and prefix sum the results. */
1486    if (count_words) {
1487       hk_dispatch_with_local_size(cmd, cs, count, grid_gs,
1488                                   agx_workgroup(1, 1, 1));
1489 
1490       libagx_prefix_sum_geom(cs, agx_1d(1024 * count_words), AGX_BARRIER_ALL,
1491                              geometry_params);
1492    }
1493 
1494    /* Pre-GS shader */
1495    hk_dispatch_with_local_size(cmd, cs, pre_gs, agx_1d(1),
1496                                agx_workgroup(1, 1, 1));
1497 
1498    /* Pre-rast geometry shader */
1499    hk_dispatch_with_local_size(cmd, cs, main, grid_gs, agx_workgroup(1, 1, 1));
1500 
1501    bool restart = cmd->state.gfx.topology != AGX_PRIMITIVE_POINTS;
1502    return agx_draw_indexed_indirect(cmd->geom_indirect, dev->heap->va->addr,
1503                                     dev->heap->size, AGX_INDEX_SIZE_U32,
1504                                     restart);
1505 }
1506 
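/* Run tessellation as compute: launch the VS and TCS, then tessellate in a
 * count pass, a prefix sum, and a final pass that uses the counts. The
 * tessellator writes indices into the heap and fills out indirect draw
 * descriptors, which we return for rasterization.
 */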
1507 static struct agx_draw
1508 hk_launch_tess(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
1509                struct agx_draw draw)
1510 {
1511    struct hk_device *dev = hk_cmd_buffer_device(cmd);
1512    struct hk_graphics_state *gfx = &cmd->state.gfx;
1513    struct agx_grid grid_vs, grid_tcs, grid_tess;
1514 
1515    struct hk_shader *vs = hk_bound_sw_vs(gfx);
1516    struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]);
1517 
1518    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1519    uint32_t input_patch_size = dyn->ts.patch_control_points;
1520    uint64_t state = gfx->descriptors.root.draw.tess_params;
1521    struct hk_tess_info info = gfx->tess.info;
1522 
1523    hk_ensure_cs_has_space(cmd, cs, 0x2000 /*XXX*/);
1524 
1525    perf_debug(dev, "Tessellation");
1526 
1527    uint64_t tcs_stat = hk_pipeline_stat_addr(
1528       cmd, VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT);
1529 
1530    /* Setup grids */
1531    if (agx_is_indirect(draw.b)) {
1532       perf_debug(dev, "Indirect tessellation");
1533 
1534       struct libagx_tess_setup_indirect_args args = {
1535          .p = state,
1536          .grids = gfx->tess.grids,
1537          .indirect = draw.b.ptr,
1538          .ia = gfx->descriptors.root.draw.input_assembly,
1539          .vertex_outputs = vs->b.info.outputs,
1540          .vertex_output_buffer_ptr =
1541             gfx->root + offsetof(struct hk_root_descriptor_table,
1542                                  draw.vertex_output_buffer),
1543          .tcs_statistic = hk_pipeline_stat_addr(
1544             cmd,
1545             VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT),
1546       };
1547 
1548       if (draw.indexed) {
1549          args.in_index_buffer = draw.index_buffer;
1550          args.in_index_size_B = agx_index_size_to_B(draw.index_size);
1551          args.in_index_buffer_range_el = agx_draw_index_range_el(draw);
1552       }
1553 
1554       libagx_tess_setup_indirect_struct(cs, agx_1d(1), AGX_BARRIER_ALL, args);
1555 
1556       uint32_t grid_stride = sizeof(uint32_t) * 6;
1557       grid_vs = agx_grid_indirect_local(gfx->tess.grids + 0 * grid_stride);
1558       grid_tcs = agx_grid_indirect_local(gfx->tess.grids + 1 * grid_stride);
1559       grid_tess = agx_grid_indirect_local(gfx->tess.grids + 2 * grid_stride);
1560    } else {
1561       uint32_t patches = draw.b.count[0] / input_patch_size;
1562       grid_vs = grid_tcs = draw.b;
1563 
1564       grid_tcs.count[0] = patches * tcs->info.tess.tcs_output_patch_size;
1565       grid_tess = agx_1d(patches * draw.b.count[1]);
1566 
1567       /* TCS invocation counter increments once per-patch */
1568       if (tcs_stat) {
1569          perf_debug(dev, "Direct TCS statistic");
1570          libagx_increment_statistic(cs, agx_1d(1), AGX_BARRIER_ALL, tcs_stat,
1571                                     patches);
1572       }
1573    }
1574 
1575    /* First launch the VS and TCS */
1576    hk_reserve_scratch(cmd, cs, vs);
1577    hk_reserve_scratch(cmd, cs, tcs);
1578 
1579    hk_dispatch_with_usc(
1580       dev, cs, &vs->b.info,
1581       hk_upload_usc_words(cmd, vs, gfx->linked[MESA_SHADER_VERTEX]), grid_vs,
1582       agx_workgroup(64, 1, 1));
1583 
1584    hk_dispatch_with_usc(
1585       dev, cs, &tcs->b.info, hk_upload_usc_words(cmd, tcs, tcs->only_linked),
1586       grid_tcs, agx_workgroup(tcs->info.tess.tcs_output_patch_size, 1, 1));
1587 
1588    /* First generate counts, then prefix sum them, and then tessellate. */
1589    libagx_tessellate(cs, grid_tess, AGX_BARRIER_ALL, info.mode,
1590                      LIBAGX_TESS_MODE_COUNT, state);
1591 
1592    libagx_prefix_sum_tess(cs, agx_1d(1024), AGX_BARRIER_ALL, state);
1593 
1594    libagx_tessellate(cs, grid_tess, AGX_BARRIER_ALL, info.mode,
1595                      LIBAGX_TESS_MODE_WITH_COUNTS, state);
1596 
1597    return agx_draw_indexed_indirect(gfx->tess.out_draws, dev->heap->va->addr,
1598                                     dev->heap->size, AGX_INDEX_SIZE_U32, false);
1599 }
1600 
1601 void
1602 hk_cmd_bind_graphics_shader(struct hk_cmd_buffer *cmd,
1603                             const gl_shader_stage stage,
1604                             struct hk_api_shader *shader)
1605 {
1606    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1607 
1608    assert(stage < ARRAY_SIZE(cmd->state.gfx.shaders));
1609    if (cmd->state.gfx.shaders[stage] == shader)
1610       return;
1611 
1612    cmd->state.gfx.shaders[stage] = shader;
1613    cmd->state.gfx.shaders_dirty |= BITFIELD_BIT(stage);
1614 
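   /* The fragment shader fast-link key depends on the rasterization sample
    * count (for sample shading and the epilog), so force that state to be
    * revalidated when the fragment shader changes.
    */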
1615    if (stage == MESA_SHADER_FRAGMENT) {
1616       BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
1617    }
1618 }
1619 
1620 static void
1621 hk_flush_shaders(struct hk_cmd_buffer *cmd)
1622 {
1623    if (cmd->state.gfx.shaders_dirty == 0)
1624       return;
1625 
1626    struct hk_graphics_state *gfx = &cmd->state.gfx;
1627    struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
1628    desc->root_dirty = true;
1629 
1630    /* Geometry shading overrides the restart index, so re-emit it on rebind */
1631    if (IS_SHADER_DIRTY(GEOMETRY)) {
1632       struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
1633 
1634       desc->root.draw.api_gs = gs && !gs->is_passthrough;
1635    }
1636 
1637    struct hk_shader *hw_vs = hk_bound_hw_vs(gfx);
1638    struct hk_api_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];
1639 
1640    /* If we have a new VS/FS pair, UVS locations may have changed so need to
1641     * relink. We do this here because there's no dependence on the fast linked
1642     * shaders.
1643     */
1644    agx_assign_uvs(&gfx->linked_varyings, &hw_vs->info.uvs,
1645                   fs ? hk_only_variant(fs)->info.fs.interp.flat : 0,
1646                   fs ? hk_only_variant(fs)->info.fs.interp.linear : 0);
1647 
1648    for (unsigned i = 0; i < VARYING_SLOT_MAX; ++i) {
1649       desc->root.draw.uvs_index[i] = gfx->linked_varyings.slots[i];
1650    }
1651 }
1652 
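/* Compile a prolog/epilog shader part from the given NIR builder callback, or
 * return the cached copy. Called with the prolog/epilog lock held; the part
 * and the cloned key are owned by the hash table.
 */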
1653 static struct agx_shader_part *
1654 hk_get_prolog_epilog_locked(struct hk_device *dev, struct hk_internal_key *key,
1655                             hk_internal_builder_t builder, bool preprocess_nir,
1656                             bool stop, unsigned cf_base)
1657 {
1658    /* Try to get the cached shader */
1659    struct hash_entry *ent = _mesa_hash_table_search(dev->prolog_epilog.ht, key);
1660    if (ent)
1661       return ent->data;
1662 
1663    nir_builder b = nir_builder_init_simple_shader(0, &agx_nir_options, NULL);
1664    builder(&b, key->key);
1665 
1666    if (preprocess_nir)
1667       agx_preprocess_nir(b.shader, dev->dev.libagx);
1668 
1669    struct agx_shader_key backend_key = {
1670       .dev = agx_gather_device_key(&dev->dev),
1671       .libagx = dev->dev.libagx,
1672       .secondary = true,
1673       .no_stop = !stop,
1674    };
1675 
1676    /* We always use dynamic sample shading in the GL driver. Indicate that. */
1677    if (b.shader->info.stage == MESA_SHADER_FRAGMENT) {
1678       backend_key.fs.cf_base = cf_base;
1679 
1680       if (b.shader->info.fs.uses_sample_shading)
1681          backend_key.fs.inside_sample_loop = true;
1682    }
1683 
1684    struct agx_shader_part *part =
1685       rzalloc(dev->prolog_epilog.ht, struct agx_shader_part);
1686 
1687    agx_compile_shader_nir(b.shader, &backend_key, NULL, part);
1688 
1689    ralloc_free(b.shader);
1690 
1691    /* ...and cache it before we return. The key is on the stack right now, so
1692     * clone it before using it as a hash table key. The clone is logically owned
1693     * by the hash table.
1694     */
1695    size_t total_key_size = sizeof(*key) + key->key_size;
1696    void *cloned_key = ralloc_memdup(dev->prolog_epilog.ht, key, total_key_size);
1697 
1698    _mesa_hash_table_insert(dev->prolog_epilog.ht, cloned_key, part);
1699    return part;
1700 }
1701 
1702 static struct agx_shader_part *
1703 hk_get_prolog_epilog(struct hk_device *dev, void *data, size_t data_size,
1704                      hk_internal_builder_t builder, bool preprocess_nir,
1705                      bool stop, unsigned cf_base)
1706 {
1707    /* Build the meta shader key */
1708    size_t total_key_size = sizeof(struct hk_internal_key) + data_size;
1709 
1710    struct hk_internal_key *key = alloca(total_key_size);
1711    key->builder = builder;
1712    key->key_size = data_size;
1713 
1714    if (data_size)
1715       memcpy(key->key, data, data_size);
1716 
1717    simple_mtx_lock(&dev->prolog_epilog.lock);
1718 
1719    struct agx_shader_part *part = hk_get_prolog_epilog_locked(
1720       dev, key, builder, preprocess_nir, stop, cf_base);
1721 
1722    simple_mtx_unlock(&dev->prolog_epilog.lock);
1723    return part;
1724 }
1725 
1726 static struct hk_linked_shader *
1727 hk_get_fast_linked_locked_vs(struct hk_device *dev, struct hk_shader *shader,
1728                              struct hk_fast_link_key_vs *key)
1729 {
1730    struct agx_shader_part *prolog =
1731       hk_get_prolog_epilog(dev, &key->prolog, sizeof(key->prolog),
1732                            agx_nir_vs_prolog, false, false, 0);
1733 
1734    struct hk_linked_shader *linked =
1735       hk_fast_link(dev, false, shader, prolog, NULL, 0);
1736 
1737    struct hk_fast_link_key *key_clone =
1738       ralloc_memdup(shader->linked.ht, key, sizeof(*key));
1739 
1740    /* XXX: Fix this higher up the stack */
1741    linked->sw_indexing = !key->prolog.hw || key->prolog.adjacency;
1742    linked->b.uses_base_param |= linked->sw_indexing;
1743 
1744    _mesa_hash_table_insert(shader->linked.ht, key_clone, linked);
1745    return linked;
1746 }
1747 
1748 static void
1749 build_fs_prolog(nir_builder *b, const void *key)
1750 {
1751    agx_nir_fs_prolog(b, key);
1752 
1753    /* Lower load_stat_query_address_agx, needed for FS statistics */
1754    NIR_PASS(_, b->shader, hk_lower_uvs_index, 0);
1755 }
1756 
1757 static struct hk_linked_shader *
1758 hk_get_fast_linked_locked_fs(struct hk_device *dev, struct hk_shader *shader,
1759                              struct hk_fast_link_key_fs *key)
1760 {
1761    /* TODO: prolog without fs needs to work too... */
1762    bool needs_prolog = key->prolog.statistics ||
1763                        key->prolog.cull_distance_size ||
1764                        key->prolog.api_sample_mask != 0xff;
1765 
1766    struct agx_shader_part *prolog = NULL;
1767    if (needs_prolog) {
1768       prolog = hk_get_prolog_epilog(dev, &key->prolog, sizeof(key->prolog),
1769                                     build_fs_prolog, false, false,
1770                                     key->prolog.cf_base);
1771    }
1772 
1773    /* If sample shading is used, don't stop at the epilog, there's a
1774     * footer that the fast linker will insert to stop.
1775     */
1776    bool epilog_stop = (key->nr_samples_shaded == 0);
1777 
1778    struct agx_shader_part *epilog =
1779       hk_get_prolog_epilog(dev, &key->epilog, sizeof(key->epilog),
1780                            agx_nir_fs_epilog, true, epilog_stop, 0);
1781 
1782    struct hk_linked_shader *linked =
1783       hk_fast_link(dev, true, shader, prolog, epilog, key->nr_samples_shaded);
1784 
1785    struct hk_fast_link_key *key_clone =
1786       ralloc_memdup(shader->linked.ht, key, sizeof(*key));
1787 
1788    _mesa_hash_table_insert(shader->linked.ht, key_clone, linked);
1789    return linked;
1790 }
1791 
1792 /*
1793  * First, look for a fully linked variant. Else, build the required shader
1794  * parts and link.
1795  */
1796 static struct hk_linked_shader *
1797 hk_get_fast_linked(struct hk_device *dev, struct hk_shader *shader, void *key)
1798 {
1799    struct hk_linked_shader *linked;
1800    simple_mtx_lock(&shader->linked.lock);
1801 
1802    struct hash_entry *ent = _mesa_hash_table_search(shader->linked.ht, key);
1803 
1804    if (ent)
1805       linked = ent->data;
1806    else if (shader->info.stage == MESA_SHADER_VERTEX)
1807       linked = hk_get_fast_linked_locked_vs(dev, shader, key);
1808    else if (shader->info.stage == MESA_SHADER_FRAGMENT)
1809       linked = hk_get_fast_linked_locked_fs(dev, shader, key);
1810    else
1811       unreachable("invalid stage");
1812 
1813    simple_mtx_unlock(&shader->linked.lock);
1814    return linked;
1815 }
1816 
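/* Look up (building if necessary) the fast-linked variant for this key and
 * bind it, marking the stage dirty only if the linked shader actually changed.
 */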
1817 static void
1818 hk_update_fast_linked(struct hk_cmd_buffer *cmd, struct hk_shader *shader,
1819                       void *key)
1820 {
1821    struct hk_device *dev = hk_cmd_buffer_device(cmd);
1822    struct hk_linked_shader *new = hk_get_fast_linked(dev, shader, key);
1823    gl_shader_stage stage = shader->info.stage;
1824 
1825    if (cmd->state.gfx.linked[stage] != new) {
1826       cmd->state.gfx.linked[stage] = new;
1827       cmd->state.gfx.linked_dirty |= BITFIELD_BIT(stage);
1828    }
1829 }
1830 
1831 static enum agx_polygon_mode
1832 translate_polygon_mode(VkPolygonMode vk_mode)
1833 {
1834    static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_FILL ==
1835                  AGX_POLYGON_MODE_FILL);
1836    static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_LINE ==
1837                  AGX_POLYGON_MODE_LINE);
1838    static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_POINT ==
1839                  AGX_POLYGON_MODE_POINT);
1840 
1841    assert(vk_mode <= VK_POLYGON_MODE_POINT);
1842    return (enum agx_polygon_mode)vk_mode;
1843 }
1844 
1845 static enum agx_zs_func
1846 translate_compare_op(VkCompareOp vk_mode)
1847 {
1848    static_assert((enum agx_zs_func)VK_COMPARE_OP_NEVER == AGX_ZS_FUNC_NEVER);
1849    static_assert((enum agx_zs_func)VK_COMPARE_OP_LESS == AGX_ZS_FUNC_LESS);
1850    static_assert((enum agx_zs_func)VK_COMPARE_OP_EQUAL == AGX_ZS_FUNC_EQUAL);
1851    static_assert((enum agx_zs_func)VK_COMPARE_OP_LESS_OR_EQUAL ==
1852                  AGX_ZS_FUNC_LEQUAL);
1853    static_assert((enum agx_zs_func)VK_COMPARE_OP_GREATER ==
1854                  AGX_ZS_FUNC_GREATER);
1855    static_assert((enum agx_zs_func)VK_COMPARE_OP_NOT_EQUAL ==
1856                  AGX_ZS_FUNC_NOT_EQUAL);
1857    static_assert((enum agx_zs_func)VK_COMPARE_OP_GREATER_OR_EQUAL ==
1858                  AGX_ZS_FUNC_GEQUAL);
1859    static_assert((enum agx_zs_func)VK_COMPARE_OP_ALWAYS == AGX_ZS_FUNC_ALWAYS);
1860 
1861    assert(vk_mode <= VK_COMPARE_OP_ALWAYS);
1862    return (enum agx_zs_func)vk_mode;
1863 }
1864 
1865 static enum agx_stencil_op
1866 translate_stencil_op(VkStencilOp vk_op)
1867 {
1868    static_assert((enum agx_stencil_op)VK_STENCIL_OP_KEEP ==
1869                  AGX_STENCIL_OP_KEEP);
1870    static_assert((enum agx_stencil_op)VK_STENCIL_OP_ZERO ==
1871                  AGX_STENCIL_OP_ZERO);
1872    static_assert((enum agx_stencil_op)VK_STENCIL_OP_REPLACE ==
1873                  AGX_STENCIL_OP_REPLACE);
1874    static_assert((enum agx_stencil_op)VK_STENCIL_OP_INCREMENT_AND_CLAMP ==
1875                  AGX_STENCIL_OP_INCR_SAT);
1876    static_assert((enum agx_stencil_op)VK_STENCIL_OP_DECREMENT_AND_CLAMP ==
1877                  AGX_STENCIL_OP_DECR_SAT);
1878    static_assert((enum agx_stencil_op)VK_STENCIL_OP_INVERT ==
1879                  AGX_STENCIL_OP_INVERT);
1880    static_assert((enum agx_stencil_op)VK_STENCIL_OP_INCREMENT_AND_WRAP ==
1881                  AGX_STENCIL_OP_INCR_WRAP);
1882    static_assert((enum agx_stencil_op)VK_STENCIL_OP_DECREMENT_AND_WRAP ==
1883                  AGX_STENCIL_OP_DECR_WRAP);
1884 
1885    return (enum agx_stencil_op)vk_op;
1886 }
1887 
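/* Push a FRAGMENT_STENCIL word for one face. When the stencil test is
 * disabled, program a pass-through configuration (ALWAYS compare, KEEP ops)
 * instead of the API state.
 */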
1888 static void
1889 hk_ppp_push_stencil_face(struct agx_ppp_update *ppp,
1890                          struct vk_stencil_test_face_state s, bool enabled)
1891 {
1892    if (enabled) {
1893       agx_ppp_push(ppp, FRAGMENT_STENCIL, cfg) {
1894          cfg.compare = translate_compare_op(s.op.compare);
1895          cfg.write_mask = s.write_mask;
1896          cfg.read_mask = s.compare_mask;
1897 
1898          cfg.depth_pass = translate_stencil_op(s.op.pass);
1899          cfg.depth_fail = translate_stencil_op(s.op.depth_fail);
1900          cfg.stencil_fail = translate_stencil_op(s.op.fail);
1901       }
1902    } else {
1903       agx_ppp_push(ppp, FRAGMENT_STENCIL, cfg) {
1904          cfg.compare = AGX_ZS_FUNC_ALWAYS;
1905          cfg.write_mask = 0xFF;
1906          cfg.read_mask = 0xFF;
1907 
1908          cfg.depth_pass = AGX_STENCIL_OP_KEEP;
1909          cfg.depth_fail = AGX_STENCIL_OP_KEEP;
1910          cfg.stencil_fail = AGX_STENCIL_OP_KEEP;
1911       }
1912    }
1913 }
1914 
1915 static bool
1916 hk_stencil_test_enabled(struct hk_cmd_buffer *cmd)
1917 {
1918    const struct hk_rendering_state *render = &cmd->state.gfx.render;
1919    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1920 
1921    return dyn->ds.stencil.test_enable &&
1922           render->stencil_att.vk_format != VK_FORMAT_UNDEFINED;
1923 }
1924 
1925 static void
1926 hk_flush_vp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out)
1927 {
1928    const struct vk_dynamic_graphics_state *dyn =
1929       &cmd->vk.dynamic_graphics_state;
1930 
1931    /* We always need at least 1 viewport for the hardware. With rasterizer
1932     * discard the app may not supply any, but we can just program garbage.
1933     */
1934    unsigned count = MAX2(dyn->vp.viewport_count, 1);
1935 
1936    unsigned minx[HK_MAX_VIEWPORTS] = {0}, miny[HK_MAX_VIEWPORTS] = {0};
1937    unsigned maxx[HK_MAX_VIEWPORTS] = {0}, maxy[HK_MAX_VIEWPORTS] = {0};
1938 
1939    /* We implicitly scissor to the viewport. We need to do a min/max dance to
1940     * handle inverted viewports.
1941     */
1942    for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
1943       const VkViewport *vp = &dyn->vp.viewports[i];
1944 
1945       minx[i] = MIN2(vp->x, vp->x + vp->width);
1946       miny[i] = MIN2(vp->y, vp->y + vp->height);
1947       maxx[i] = MAX2(vp->x, vp->x + vp->width);
1948       maxy[i] = MAX2(vp->y, vp->y + vp->height);
1949    }
1950 
1951    /* Additionally clamp to the framebuffer so we don't rasterize
1952     * off-screen pixels. TODO: Is this necessary? The GL driver does this, but
1953     * it might be cargo-culted at this point.
1954     */
1955    for (unsigned i = 0; i < count; ++i) {
1956       minx[i] = MIN2(minx[i], cmd->state.gfx.render.cr.width);
1957       maxx[i] = MIN2(maxx[i], cmd->state.gfx.render.cr.width);
1958       miny[i] = MIN2(miny[i], cmd->state.gfx.render.cr.height);
1959       maxy[i] = MIN2(maxy[i], cmd->state.gfx.render.cr.height);
1960    }
1961 
1962    /* We additionally apply any API scissors */
1963    for (unsigned i = 0; i < dyn->vp.scissor_count; ++i) {
1964       const VkRect2D *s = &dyn->vp.scissors[i];
1965 
1966       minx[i] = MAX2(minx[i], s->offset.x);
1967       miny[i] = MAX2(miny[i], s->offset.y);
1968       maxx[i] = MIN2(maxx[i], s->offset.x + s->extent.width);
1969       maxy[i] = MIN2(maxy[i], s->offset.y + s->extent.height);
1970    }
1971 
1972    /* Upload a hardware scissor for each viewport, whether there's a
1973     * corresponding API scissor or not.
1974     */
1975    unsigned index = cs->scissor.size / AGX_SCISSOR_LENGTH;
1976    struct agx_scissor_packed *scissors =
1977       util_dynarray_grow_bytes(&cs->scissor, count, AGX_SCISSOR_LENGTH);
1978 
1979    for (unsigned i = 0; i < count; ++i) {
1980       const VkViewport *vp = &dyn->vp.viewports[i];
1981 
1982       agx_pack(scissors + i, SCISSOR, cfg) {
1983          cfg.min_x = minx[i];
1984          cfg.min_y = miny[i];
1985          cfg.max_x = maxx[i];
1986          cfg.max_y = maxy[i];
1987 
1988          /* These settings in conjunction with the PPP control depth clip/clamp
1989           * settings implement depth clip/clamping. Properly setting them
1990           * together is required for conformant depth clip enable.
1991           *
1992           * TODO: Reverse-engineer the finer interactions here.
1993           */
1994          if (dyn->rs.depth_clamp_enable) {
1995             cfg.min_z = MIN2(vp->minDepth, vp->maxDepth);
1996             cfg.max_z = MAX2(vp->minDepth, vp->maxDepth);
1997          } else {
1998             cfg.min_z = 0.0;
1999             cfg.max_z = 1.0;
2000          }
2001       }
2002    }
2003 
2004    /* Upload state */
2005    struct AGX_PPP_HEADER present = {
2006       .depth_bias_scissor = true,
2007       .region_clip = true,
2008       .viewport = true,
2009       .viewport_count = count,
2010    };
2011 
2012    size_t size = agx_ppp_update_size(&present);
2013    struct agx_ptr T = hk_pool_alloc(cmd, size, 64);
2014    if (!T.cpu)
2015       return;
2016 
2017    struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present);
2018 
2019    agx_ppp_push(&ppp, DEPTH_BIAS_SCISSOR, cfg) {
2020       cfg.scissor = index;
2021 
2022       /* Use the current depth bias; we allocate linearly */
2023       unsigned count = cs->depth_bias.size / AGX_DEPTH_BIAS_LENGTH;
2024       cfg.depth_bias = count ? count - 1 : 0;
2025    };
2026 
2027    for (unsigned i = 0; i < count; ++i) {
2028       agx_ppp_push(&ppp, REGION_CLIP, cfg) {
2029          cfg.enable = true;
2030          cfg.min_x = minx[i] / 32;
2031          cfg.min_y = miny[i] / 32;
2032          cfg.max_x = DIV_ROUND_UP(MAX2(maxx[i], 1), 32);
2033          cfg.max_y = DIV_ROUND_UP(MAX2(maxy[i], 1), 32);
2034       }
2035    }
2036 
2037    agx_ppp_push(&ppp, VIEWPORT_CONTROL, cfg)
2038       ;
2039 
2040    /* Upload viewports */
2041    for (unsigned i = 0; i < count; ++i) {
2042       const VkViewport *vp = &dyn->vp.viewports[i];
2043 
2044       agx_ppp_push(&ppp, VIEWPORT, cfg) {
2045          cfg.translate_x = vp->x + 0.5f * vp->width;
2046          cfg.translate_y = vp->y + 0.5f * vp->height;
2047          cfg.translate_z = vp->minDepth;
2048 
2049          cfg.scale_x = vp->width * 0.5f;
2050          cfg.scale_y = vp->height * 0.5f;
2051          cfg.scale_z = vp->maxDepth - vp->minDepth;
2052       }
2053    }
2054 
2055    agx_ppp_fini(out, &ppp);
2056 }
2057 
2058 static enum agx_object_type
2059 translate_object_type(enum mesa_prim topology)
2060 {
2061    static_assert(MESA_PRIM_LINES < MESA_PRIM_LINE_STRIP);
2062    static_assert(MESA_PRIM_TRIANGLES >= MESA_PRIM_LINE_STRIP);
2063 
2064    if (topology == MESA_PRIM_POINTS)
2065       return AGX_OBJECT_TYPE_POINT_SPRITE_UV01;
2066    else if (topology <= MESA_PRIM_LINE_STRIP)
2067       return AGX_OBJECT_TYPE_LINE;
2068    else
2069       return AGX_OBJECT_TYPE_TRIANGLE;
2070 }
2071 
2072 static enum agx_primitive
2073 translate_hw_primitive_topology(enum mesa_prim prim)
2074 {
2075    switch (prim) {
2076    case MESA_PRIM_POINTS:
2077       return AGX_PRIMITIVE_POINTS;
2078    case MESA_PRIM_LINES:
2079       return AGX_PRIMITIVE_LINES;
2080    case MESA_PRIM_LINE_STRIP:
2081       return AGX_PRIMITIVE_LINE_STRIP;
2082    case MESA_PRIM_TRIANGLES:
2083       return AGX_PRIMITIVE_TRIANGLES;
2084    case MESA_PRIM_TRIANGLE_STRIP:
2085       return AGX_PRIMITIVE_TRIANGLE_STRIP;
2086    case MESA_PRIM_TRIANGLE_FAN:
2087       return AGX_PRIMITIVE_TRIANGLE_FAN;
2088    default:
2089       unreachable("Invalid hardware primitive topology");
2090    }
2091 }
2092 
2093 static inline enum agx_vdm_vertex
2094 translate_vdm_vertex(unsigned vtx)
2095 {
2096    static_assert(AGX_VDM_VERTEX_0 == 0);
2097    static_assert(AGX_VDM_VERTEX_1 == 1);
2098    static_assert(AGX_VDM_VERTEX_2 == 2);
2099 
2100    assert(vtx <= 2);
2101    return vtx;
2102 }
2103 
2104 static inline enum agx_ppp_vertex
2105 translate_ppp_vertex(unsigned vtx)
2106 {
2107    static_assert(AGX_PPP_VERTEX_0 == 0 + 1);
2108    static_assert(AGX_PPP_VERTEX_1 == 1 + 1);
2109    static_assert(AGX_PPP_VERTEX_2 == 2 + 1);
2110 
2111    assert(vtx <= 2);
2112    return vtx + 1;
2113 }
2114 
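/* Program the primitive restart index. With a geometry shader bound, the GS
 * lowering generates its own 32-bit index buffer, so an all-ones restart index
 * is used regardless of the API-provided one.
 */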
2115 static void
2116 hk_flush_index(struct hk_cmd_buffer *cmd, struct hk_cs *cs)
2117 {
2118    uint32_t index = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY]
2119                        ? BITFIELD_MASK(32)
2120                        : cmd->state.gfx.index.restart;
2121 
2122    /* VDM State updates are relatively expensive, so only emit them when the
2123     * restart index changes. This is simpler than accurate dirty tracking.
2124     */
2125    if (cs->restart_index != index) {
2126       uint8_t *out = cs->current;
2127       agx_push(out, VDM_STATE, cfg) {
2128          cfg.restart_index_present = true;
2129       }
2130 
2131       agx_push(out, VDM_STATE_RESTART_INDEX, cfg) {
2132          cfg.value = index;
2133       }
2134 
2135       cs->current = out;
2136       cs->restart_index = index;
2137    }
2138 }
2139 
2140 /*
2141  * Return the given sample positions, packed into a 32-bit word with fixed
2142  * point nibbles for each x/y component of the (at most 4) samples. This is
2143  * suitable for programming the PPP_MULTISAMPLECTL control register.
2144  */
2145 static uint32_t
2146 hk_pack_ppp_multisamplectrl(const struct vk_sample_locations_state *sl)
2147 {
2148    uint32_t ctrl = 0;
2149 
2150    for (int32_t i = sl->per_pixel - 1; i >= 0; i--) {
2151       VkSampleLocationEXT loc = sl->locations[i];
2152 
2153       uint32_t x = CLAMP(loc.x, 0.0f, 0.9375f) * 16.0;
2154       uint32_t y = CLAMP(loc.y, 0.0f, 0.9375f) * 16.0;
2155 
2156       assert(x <= 15);
2157       assert(y <= 15);
2158 
2159       /* Push bytes in reverse order so we can use constant shifts. */
2160       ctrl = (ctrl << 8) | (y << 4) | x;
2161    }
2162 
2163    return ctrl;
2164 }
2165 
2166 /*
2167  * Return the standard sample positions, prepacked as above for efficiency.
2168  */
2169 uint32_t
2170 hk_default_sample_positions(unsigned nr_samples)
2171 {
2172    switch (nr_samples) {
2173    case 0:
2174    case 1:
2175       return 0x88;
2176    case 2:
2177       return 0x44cc;
2178    case 4:
2179       return 0xeaa26e26;
2180    default:
2181       unreachable("Invalid sample count");
2182    }
2183 }
2184 
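/* Emit the PPP state words whose inputs are dirty. The header records which
 * words follow it; if nothing beyond the header would be written, skip the
 * update entirely.
 */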
2185 static void
2186 hk_flush_ppp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out)
2187 {
2188    const struct hk_rendering_state *render = &cmd->state.gfx.render;
2189    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
2190 
2191    struct hk_graphics_state *gfx = &cmd->state.gfx;
2192    struct hk_shader *hw_vs = hk_bound_hw_vs(gfx);
2193    struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]);
2194 
2195    bool hw_vs_dirty = IS_SHADER_DIRTY(VERTEX) || IS_SHADER_DIRTY(TESS_EVAL) ||
2196                       IS_SHADER_DIRTY(GEOMETRY);
2197    bool fs_dirty = IS_SHADER_DIRTY(FRAGMENT);
2198 
2199    struct hk_linked_shader *linked_fs = gfx->linked[MESA_SHADER_FRAGMENT];
2200    bool linked_fs_dirty = IS_LINKED_DIRTY(FRAGMENT);
2201 
2202    bool varyings_dirty = gfx->dirty & HK_DIRTY_VARYINGS;
2203 
2204    bool face_dirty =
2205       IS_DIRTY(DS_DEPTH_TEST_ENABLE) || IS_DIRTY(DS_DEPTH_WRITE_ENABLE) ||
2206       IS_DIRTY(DS_DEPTH_COMPARE_OP) || IS_DIRTY(DS_STENCIL_REFERENCE) ||
2207       IS_DIRTY(RS_LINE_WIDTH) || IS_DIRTY(RS_POLYGON_MODE) || fs_dirty;
2208 
2209    bool stencil_face_dirty =
2210       IS_DIRTY(DS_STENCIL_OP) || IS_DIRTY(DS_STENCIL_COMPARE_MASK) ||
2211       IS_DIRTY(DS_STENCIL_WRITE_MASK) || IS_DIRTY(DS_STENCIL_TEST_ENABLE);
2212 
2213    struct AGX_PPP_HEADER dirty = {
2214       .fragment_control =
2215          IS_DIRTY(DS_STENCIL_TEST_ENABLE) || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) ||
2216          IS_DIRTY(RS_DEPTH_BIAS_ENABLE) || gfx->dirty & HK_DIRTY_OCCLUSION,
2217 
2218       .fragment_control_2 =
2219          IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || linked_fs_dirty,
2220 
2221       .fragment_front_face = face_dirty,
2222       .fragment_front_face_2 = fs_dirty || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY),
2223       .fragment_front_stencil = stencil_face_dirty,
2224       .fragment_back_face = face_dirty,
2225       .fragment_back_face_2 = fs_dirty || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY),
2226       .fragment_back_stencil = stencil_face_dirty,
2227       .output_select = hw_vs_dirty || linked_fs_dirty || varyings_dirty,
2228       .varying_counts_32 = varyings_dirty,
2229       .varying_counts_16 = varyings_dirty,
2230       .cull = IS_DIRTY(RS_CULL_MODE) ||
2231               IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) ||
2232               IS_DIRTY(RS_FRONT_FACE) || IS_DIRTY(RS_DEPTH_CLIP_ENABLE) ||
2233               IS_DIRTY(RS_DEPTH_CLAMP_ENABLE) || IS_DIRTY(RS_LINE_MODE) ||
2234               IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) ||
2235               (gfx->dirty & HK_DIRTY_PROVOKING) || IS_SHADER_DIRTY(TESS_CTRL) ||
2236               IS_SHADER_DIRTY(TESS_EVAL) || IS_DIRTY(TS_DOMAIN_ORIGIN),
2237       .cull_2 = varyings_dirty,
2238 
2239       /* With a null FS, the fragment shader PPP word is ignored and doesn't
2240        * need to be present.
2241        */
2242       .fragment_shader = fs && (fs_dirty || linked_fs_dirty || varyings_dirty ||
2243                                 gfx->descriptors.root_dirty),
2244 
2245       .occlusion_query = gfx->dirty & HK_DIRTY_OCCLUSION,
2246       .output_size = hw_vs_dirty,
2247       .viewport_count = 1, /* irrelevant */
2248    };
2249 
2250    /* Calculate the update size. If it equals the header, there is nothing to
2251     * update so early-exit.
2252     */
2253    size_t size = agx_ppp_update_size(&dirty);
2254    if (size == AGX_PPP_HEADER_LENGTH)
2255       return;
2256 
2257    /* Otherwise, allocate enough space for the update and push it. */
2258    assert(size > AGX_PPP_HEADER_LENGTH);
2259 
2260    struct agx_ptr T = hk_pool_alloc(cmd, size, 64);
2261    if (!T.cpu)
2262       return;
2263 
2264    struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &dirty);
2265 
2266    if (dirty.fragment_control) {
2267       agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) {
2268          cfg.visibility_mode = gfx->occlusion.mode;
2269          cfg.stencil_test_enable = hk_stencil_test_enabled(cmd);
2270 
2271          /* TODO: Consider optimizing this? */
2272          cfg.two_sided_stencil = cfg.stencil_test_enable;
2273 
2274          cfg.depth_bias_enable = dyn->rs.depth_bias.enable &&
2275                                  gfx->object_type == AGX_OBJECT_TYPE_TRIANGLE;
2276 
2277          /* Always enable scissoring so we may scissor to the viewport (TODO:
2278           * optimize this out if the viewport is the default and the app does
2279           * not use the scissor test)
2280           */
2281          cfg.scissor_enable = true;
2282 
2283          /* This avoids broken derivatives along primitive edges */
2284          cfg.disable_tri_merging = gfx->object_type != AGX_OBJECT_TYPE_TRIANGLE;
2285       }
2286    }
2287 
2288    if (dirty.fragment_control_2) {
2289       if (linked_fs) {
2290          /* Annoyingly, rasterizer_discard seems to be ignored (sometimes?) in the
2291           * main fragment control word and has to be combined into the secondary
2292           * word for reliable behaviour.
2293           */
2294          agx_ppp_push_merged(&ppp, FRAGMENT_CONTROL, cfg,
2295                              linked_fs->b.fragment_control) {
2296 
2297             cfg.tag_write_disable = dyn->rs.rasterizer_discard_enable;
2298          }
2299       } else {
2300          /* If there is no fragment shader, we must disable tag writes to avoid
2301           * executing the missing shader. This optimizes depth-only passes.
2302           */
2303          agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) {
2304             cfg.tag_write_disable = true;
2305             cfg.pass_type = AGX_PASS_TYPE_OPAQUE;
2306          }
2307       }
2308    }
2309 
2310    struct agx_fragment_face_packed fragment_face = {};
2311    struct agx_fragment_face_2_packed fragment_face_2 = {};
2312 
2313    if (dirty.fragment_front_face) {
2314       bool has_z = render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2315       bool z_test = has_z && dyn->ds.depth.test_enable;
2316 
2317       agx_pack(&fragment_face, FRAGMENT_FACE, cfg) {
2318          cfg.line_width = agx_pack_line_width(dyn->rs.line.width);
2319          cfg.polygon_mode = translate_polygon_mode(dyn->rs.polygon_mode);
2320          cfg.disable_depth_write = !(z_test && dyn->ds.depth.write_enable);
2321 
2322          if (z_test && !gfx->descriptors.root.draw.force_never_in_shader)
2323             cfg.depth_function = translate_compare_op(dyn->ds.depth.compare_op);
2324          else
2325             cfg.depth_function = AGX_ZS_FUNC_ALWAYS;
2326       };
2327 
2328       agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, fragment_face) {
2329          cfg.stencil_reference = dyn->ds.stencil.front.reference;
2330       }
2331    }
2332 
2333    if (dirty.fragment_front_face_2) {
2334       if (fs) {
2335          agx_pack(&fragment_face_2, FRAGMENT_FACE_2, cfg) {
2336             cfg.object_type = gfx->object_type;
2337          }
2338 
2339          agx_merge(fragment_face_2, fs->frag_face, FRAGMENT_FACE_2);
2340          agx_ppp_push_packed(&ppp, &fragment_face_2, FRAGMENT_FACE_2);
2341       } else {
2342          agx_ppp_fragment_face_2(&ppp, gfx->object_type, NULL);
2343       }
2344    }
2345 
2346    if (dirty.fragment_front_stencil) {
2347       hk_ppp_push_stencil_face(&ppp, dyn->ds.stencil.front,
2348                                hk_stencil_test_enabled(cmd));
2349    }
2350 
2351    if (dirty.fragment_back_face) {
2352       assert(dirty.fragment_front_face);
2353 
2354       agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, fragment_face) {
2355          cfg.stencil_reference = dyn->ds.stencil.back.reference;
2356       }
2357    }
2358 
2359    if (dirty.fragment_back_face_2) {
2360       assert(dirty.fragment_front_face_2);
2361 
2362       agx_ppp_push_packed(&ppp, &fragment_face_2, FRAGMENT_FACE_2);
2363    }
2364 
2365    if (dirty.fragment_back_stencil) {
2366       hk_ppp_push_stencil_face(&ppp, dyn->ds.stencil.back,
2367                                hk_stencil_test_enabled(cmd));
2368    }
2369 
2370    if (dirty.output_select) {
2371       struct agx_output_select_packed osel = hw_vs->info.uvs.osel;
2372 
2373       if (linked_fs) {
2374          agx_ppp_push_merged_blobs(&ppp, AGX_OUTPUT_SELECT_LENGTH, &osel,
2375                                    &linked_fs->b.osel);
2376       } else {
2377          agx_ppp_push_packed(&ppp, &osel, OUTPUT_SELECT);
2378       }
2379    }
2380 
2381    assert(dirty.varying_counts_32 == dirty.varying_counts_16);
2382 
2383    if (dirty.varying_counts_32) {
2384       agx_ppp_push_packed(&ppp, &gfx->linked_varyings.counts_32,
2385                           VARYING_COUNTS);
2386 
2387       agx_ppp_push_packed(&ppp, &gfx->linked_varyings.counts_16,
2388                           VARYING_COUNTS);
2389    }
2390 
2391    if (dirty.cull) {
2392       agx_ppp_push(&ppp, CULL, cfg) {
2393          cfg.cull_front = dyn->rs.cull_mode & VK_CULL_MODE_FRONT_BIT;
2394          cfg.cull_back = dyn->rs.cull_mode & VK_CULL_MODE_BACK_BIT;
2395          cfg.front_face_ccw = dyn->rs.front_face != VK_FRONT_FACE_CLOCKWISE;
2396 
2397          if (gfx->shaders[MESA_SHADER_TESS_CTRL] &&
2398              !gfx->shaders[MESA_SHADER_GEOMETRY]) {
2399             cfg.front_face_ccw ^= gfx->tess.info.ccw;
2400             cfg.front_face_ccw ^= dyn->ts.domain_origin ==
2401                                   VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT;
2402          }
2403 
2404          cfg.flat_shading_vertex = translate_ppp_vertex(gfx->provoking);
2405          cfg.rasterizer_discard = dyn->rs.rasterizer_discard_enable;
2406 
2407          /* We do not support unrestricted depth, so clamping is inverted from
2408           * clipping. This implementation seems to pass CTS without unrestricted
2409           * depth support.
2410           *
2411           * TODO: Make sure this is right with gl_FragDepth.
2412           */
2413          cfg.depth_clip = vk_rasterization_state_depth_clip_enable(&dyn->rs);
2414          cfg.depth_clamp = !cfg.depth_clip;
2415 
2416          cfg.primitive_msaa =
2417             gfx->object_type == AGX_OBJECT_TYPE_LINE &&
2418             dyn->rs.line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR;
2419       }
2420    }
2421 
2422    if (dirty.cull_2) {
2423       agx_ppp_push(&ppp, CULL_2, cfg) {
2424          cfg.needs_primitive_id = gfx->generate_primitive_id;
2425          cfg.clamp_w = true;
2426       }
2427    }
2428 
2429    if (dirty.fragment_shader) {
2430       /* TODO: Do less often? */
2431       hk_reserve_scratch(cmd, cs, fs);
2432 
2433       agx_ppp_push_packed(&ppp, &linked_fs->fs_counts, FRAGMENT_SHADER_WORD_0);
2434 
2435       agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_1, cfg) {
2436          cfg.pipeline = hk_upload_usc_words(cmd, fs, linked_fs);
2437       }
2438 
2439       agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_2, cfg) {
2440          cfg.cf_bindings = gfx->varyings;
2441       }
2442 
2443       agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_3, cfg)
2444          ;
2445    }
2446 
2447    if (dirty.occlusion_query) {
2448       agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY, cfg) {
2449          cfg.index = gfx->occlusion.index;
2450       }
2451    }
2452 
2453    if (dirty.output_size) {
2454       agx_ppp_push(&ppp, OUTPUT_SIZE, cfg) {
2455          cfg.count = hw_vs->info.uvs.size;
2456       }
2457    }
2458 
2459    agx_ppp_fini(out, &ppp);
2460 }
2461 
2462 /*
2463  * Based somewhat on the calculation in the PowerVR driver, and mostly trial &
2464  * error to pass CTS. This is a mess.
2465  */
2466 static float
2467 hk_depth_bias_factor(VkFormat format, bool exact, bool force_unorm)
2468 {
2469    if (format == VK_FORMAT_D16_UNORM) {
2470       return exact ? (1 << 16) : (1 << 15);
2471    } else if (force_unorm) {
2472       return exact ? (1ull << 24) : (1ull << 23);
2473    } else {
2474       return 1.0;
2475    }
2476 }
2477 
2478 static void
2479 hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
2480                        uint32_t draw_id, struct agx_draw draw)
2481 {
2482    struct hk_device *dev = hk_cmd_buffer_device(cmd);
2483    const struct hk_rendering_state *render = &cmd->state.gfx.render;
2484    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
2485 
2486    struct hk_graphics_state *gfx = &cmd->state.gfx;
2487 
2488    struct hk_shader *hw_vs = hk_bound_hw_vs(gfx);
2489    struct hk_shader *sw_vs = hk_bound_sw_vs(gfx);
2490 
2491    if (!vk_dynamic_graphics_state_any_dirty(dyn) && !gfx->dirty &&
2492        !gfx->descriptors.root_dirty && !gfx->shaders_dirty &&
2493        !sw_vs->b.info.uses_draw_id && !sw_vs->b.info.uses_base_param &&
2494        !(gfx->linked[MESA_SHADER_VERTEX] &&
2495          gfx->linked[MESA_SHADER_VERTEX]->b.uses_base_param))
2496       return;
2497 
2498    struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
2499 
2500    assert(cs->current + 0x1000 < cs->end && "already ensured space");
2501    uint8_t *out = cs->current;
2502 
2503    struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]);
2504 
2505    bool gt_dirty = IS_SHADER_DIRTY(TESS_CTRL) || IS_SHADER_DIRTY(TESS_EVAL) ||
2506                    IS_SHADER_DIRTY(GEOMETRY);
2507    bool vgt_dirty = IS_SHADER_DIRTY(VERTEX) || gt_dirty;
2508    bool fs_dirty = IS_SHADER_DIRTY(FRAGMENT);
2509 
2510    if (IS_DIRTY(CB_BLEND_CONSTANTS)) {
2511       static_assert(sizeof(desc->root.draw.blend_constant) ==
2512                        sizeof(dyn->cb.blend_constants) &&
2513                     "common size");
2514 
2515       memcpy(desc->root.draw.blend_constant, dyn->cb.blend_constants,
2516              sizeof(dyn->cb.blend_constants));
2517       desc->root_dirty = true;
2518    }
2519 
2520    if (IS_DIRTY(MS_SAMPLE_MASK)) {
2521       desc->root.draw.api_sample_mask = dyn->ms.sample_mask;
2522       desc->root_dirty = true;
2523    }
2524 
2525    if (fs_dirty || IS_DIRTY(DS_DEPTH_TEST_ENABLE) ||
2526        IS_DIRTY(DS_DEPTH_COMPARE_OP)) {
2527 
2528       const struct hk_rendering_state *render = &cmd->state.gfx.render;
2529       bool has_z = render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2530       bool z_test = has_z && dyn->ds.depth.test_enable;
2531 
2532       desc->root.draw.force_never_in_shader =
2533          z_test && dyn->ds.depth.compare_op == VK_COMPARE_OP_NEVER && fs &&
2534          fs->info.fs.writes_memory;
2535 
2536       desc->root_dirty = true;
2537    }
2538 
2539    /* The main shader must not run tests if the epilog will. */
2540    bool nontrivial_force_early =
2541       fs && (fs->b.info.early_fragment_tests &&
2542              (fs->b.info.writes_sample_mask || fs->info.fs.writes_memory));
2543 
2544    bool epilog_discards = dyn->ms.alpha_to_coverage_enable ||
2545                           (fs && (fs->info.fs.epilog_key.write_z ||
2546                                   fs->info.fs.epilog_key.write_s));
2547    epilog_discards &= !nontrivial_force_early;
2548 
2549    if (fs_dirty || IS_DIRTY(MS_ALPHA_TO_COVERAGE_ENABLE)) {
2550       desc->root.draw.no_epilog_discard = !epilog_discards ? ~0 : 0;
2551       desc->root_dirty = true;
2552    }
2553 
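   /* Build the vertex shader fast-link key. The prolog fetches vertex
    * attributes, so it keys on the vertex input layout and robustness mode,
    * plus the index size when software index fetch or adjacency is needed.
    */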
2554    if (IS_DIRTY(VI) || IS_DIRTY(VI_BINDINGS_VALID) ||
2555        IS_DIRTY(VI_BINDING_STRIDES) || vgt_dirty || true /* TODO */) {
2556 
2557       struct hk_fast_link_key_vs key = {
2558          .prolog.hw = (sw_vs == hw_vs),
2559 
2560          /* FIXME: handle pipeline robustness "properly" */
2561          .prolog.robustness.level =
2562             (dev->vk.enabled_features.robustBufferAccess2 ||
2563              dev->vk.enabled_features.pipelineRobustness)
2564                ? AGX_ROBUSTNESS_D3D
2565             : dev->vk.enabled_features.robustBufferAccess
2566                ? AGX_ROBUSTNESS_GL
2567                : AGX_ROBUSTNESS_DISABLED,
2568 
2569          .prolog.robustness.soft_fault = agx_has_soft_fault(&dev->dev),
2570       };
2571 
2572       enum mesa_prim prim = vk_conv_topology(dyn->ia.primitive_topology);
2573 
2574       if (mesa_prim_has_adjacency(prim)) {
2575          if (draw.restart) {
2576             prim = u_decomposed_prim(prim);
2577          }
2578 
2579          key.prolog.adjacency = prim;
2580       }
2581 
2582       if (key.prolog.adjacency || !key.prolog.hw) {
2583          key.prolog.sw_index_size_B =
2584             draw.indexed ? agx_index_size_to_B(draw.index_size) : 0;
2585       }
2586 
2587       static_assert(sizeof(key.prolog.component_mask) ==
2588                     sizeof(sw_vs->info.vs.attrib_components_read));
2589       BITSET_COPY(key.prolog.component_mask,
2590                   sw_vs->info.vs.attrib_components_read);
2591 
2592       u_foreach_bit(a, dyn->vi->attributes_valid) {
2593          struct vk_vertex_attribute_state attr = dyn->vi->attributes[a];
2594 
2595          assert(dyn->vi->bindings_valid & BITFIELD_BIT(attr.binding));
2596          struct vk_vertex_binding_state binding =
2597             dyn->vi->bindings[attr.binding];
2598 
2599          /* nir_assign_io_var_locations compacts vertex inputs, eliminating
2600           * unused inputs. We need to do the same here to match the locations.
2601           */
2602          unsigned slot =
2603             util_bitcount64(sw_vs->info.vs.attribs_read & BITFIELD_MASK(a));
2604 
2605          key.prolog.attribs[slot] = (struct agx_velem_key){
2606             .format = hk_format_to_pipe_format(attr.format),
2607             .stride = dyn->vi_binding_strides[attr.binding],
2608             .divisor = binding.divisor,
2609             .instanced = binding.input_rate == VK_VERTEX_INPUT_RATE_INSTANCE,
2610          };
2611       }
2612 
2613       hk_update_fast_linked(cmd, sw_vs, &key);
2614    }
2615 
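   /* Refresh per-attribute base addresses and bounds clamps in the root
    * descriptor when vertex buffers or the vertex input layout change.
    * Attributes without a valid binding read from the zero sink.
    */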
2616    if (IS_DIRTY(VI) || IS_DIRTY(VI_BINDINGS_VALID) || vgt_dirty ||
2617        (gfx->dirty & HK_DIRTY_VB)) {
2618 
2619       uint64_t sink = dev->rodata.zero_sink;
2620 
2621       unsigned slot = 0;
2622       u_foreach_bit(a, sw_vs->info.vs.attribs_read) {
2623          if (dyn->vi->attributes_valid & BITFIELD_BIT(a)) {
2624             struct vk_vertex_attribute_state attr = dyn->vi->attributes[a];
2625             struct hk_addr_range vb = gfx->vb[attr.binding];
2626 
2627             desc->root.draw.attrib_clamps[slot] = agx_calculate_vbo_clamp(
2628                vb.addr, sink, hk_format_to_pipe_format(attr.format), vb.range,
2629                dyn->vi_binding_strides[attr.binding], attr.offset,
2630                &desc->root.draw.attrib_base[slot]);
2631          } else {
2632             desc->root.draw.attrib_base[slot] = sink;
2633             desc->root.draw.attrib_clamps[slot] = 0;
2634          }
2635 
2636          ++slot;
2637       }
2638 
2639       desc->root_dirty = true;
2640    }
2641 
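   /* Build the fragment shader fast-link key. The prolog covers pipeline
    * statistics, emulated cull distance, and the API sample mask; the epilog
    * covers render target formats, blending, logic ops, and multisample
    * state.
    */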
2642    if (vgt_dirty || IS_SHADER_DIRTY(FRAGMENT) ||
2643        IS_DIRTY(MS_RASTERIZATION_SAMPLES) || IS_DIRTY(MS_SAMPLE_MASK) ||
2644        IS_DIRTY(MS_ALPHA_TO_COVERAGE_ENABLE) ||
2645        IS_DIRTY(MS_ALPHA_TO_ONE_ENABLE) || IS_DIRTY(CB_LOGIC_OP) ||
2646        IS_DIRTY(CB_LOGIC_OP_ENABLE) || IS_DIRTY(CB_WRITE_MASKS) ||
2647        IS_DIRTY(CB_COLOR_WRITE_ENABLES) || IS_DIRTY(CB_ATTACHMENT_COUNT) ||
2648        IS_DIRTY(CB_BLEND_ENABLES) || IS_DIRTY(CB_BLEND_EQUATIONS) ||
2649        IS_DIRTY(CB_BLEND_CONSTANTS) ||
2650        desc->root_dirty /* for pipeline stats */ || true) {
2651 
2652       unsigned tib_sample_mask = BITFIELD_MASK(dyn->ms.rasterization_samples);
2653       unsigned api_sample_mask = dyn->ms.sample_mask & tib_sample_mask;
2654       bool has_sample_mask = api_sample_mask != tib_sample_mask;
2655 
2656       if (hw_vs->info.vs.cull_distance_array_size) {
2657          perf_debug(dev, "Emulating cull distance (size %u, %s a frag shader)",
2658                     hw_vs->info.vs.cull_distance_array_size,
2659                     fs ? "with" : "without");
2660       }
2661 
2662       if (has_sample_mask) {
2663          perf_debug(dev, "Emulating sample mask (%s a frag shader)",
2664                     fs ? "with" : "without");
2665       }
2666 
2667       if (fs) {
2668          unsigned samples_shaded = 0;
2669          if (fs->info.fs.epilog_key.sample_shading)
2670             samples_shaded = dyn->ms.rasterization_samples;
2671 
2672          struct hk_fast_link_key_fs key = {
2673             .prolog.statistics = hk_pipeline_stat_addr(
2674                cmd,
2675                VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT),
2676 
2677             .prolog.cull_distance_size =
2678                hw_vs->info.vs.cull_distance_array_size,
2679             .prolog.api_sample_mask = has_sample_mask ? api_sample_mask : 0xff,
2680             .nr_samples_shaded = samples_shaded,
2681          };
2682 
2683          bool prolog_discards =
2684             has_sample_mask || key.prolog.cull_distance_size;
2685 
2686          bool needs_prolog = key.prolog.statistics || prolog_discards;
2687 
2688          if (needs_prolog) {
2689             /* With late main shader tests, the prolog runs tests if neither the
2690              * main shader nor epilog will.
2691              *
2692              * With (nontrivial) early main shader tests, the prolog does not
2693              * run tests, the tests will run at the start of the main shader.
2694              * This ensures tests are after API sample mask and cull distance
2695              * discards.
2696              */
2697             key.prolog.run_zs_tests = !nontrivial_force_early &&
2698                                       !fs->b.info.writes_sample_mask &&
2699                                       !epilog_discards && prolog_discards;
2700 
2701             if (key.prolog.cull_distance_size) {
2702                key.prolog.cf_base = fs->b.info.varyings.fs.nr_cf;
2703             }
2704          }
2705 
2706          key.epilog = (struct agx_fs_epilog_key){
2707             .link = fs->info.fs.epilog_key,
2708             .nr_samples = MAX2(dyn->ms.rasterization_samples, 1),
2709             .blend.alpha_to_coverage = dyn->ms.alpha_to_coverage_enable,
2710             .blend.alpha_to_one = dyn->ms.alpha_to_one_enable,
2711             .blend.logicop_func = dyn->cb.logic_op_enable
2712                                      ? vk_logic_op_to_pipe(dyn->cb.logic_op)
2713                                      : PIPE_LOGICOP_COPY,
2714          };
2715 
2716          for (unsigned rt = 0; rt < ARRAY_SIZE(dyn->cal.color_map); ++rt) {
2717             int map = dyn->cal.color_map[rt];
2718             key.epilog.remap[rt] = map == MESA_VK_ATTACHMENT_UNUSED ? -1 : map;
2719          }
2720 
2721          if (dyn->ms.alpha_to_one_enable || dyn->ms.alpha_to_coverage_enable ||
2722              dyn->cb.logic_op_enable) {
2723 
2724             perf_debug(
2725                dev, "Epilog with%s%s%s",
2726                dyn->ms.alpha_to_one_enable ? " alpha-to-one" : "",
2727                dyn->ms.alpha_to_coverage_enable ? " alpha-to-coverage" : "",
2728                dyn->cb.logic_op_enable ? " logic-op" : "");
2729          }
2730 
2731          key.epilog.link.already_ran_zs |= nontrivial_force_early;
2732 
2733          struct hk_rendering_state *render = &cmd->state.gfx.render;
2734          for (uint32_t i = 0; i < render->color_att_count; i++) {
2735             key.epilog.rt_formats[i] =
2736                hk_format_to_pipe_format(render->color_att[i].vk_format);
2737 
2738             const struct vk_color_blend_attachment_state *cb =
2739                &dyn->cb.attachments[i];
2740 
2741             bool write_enable = dyn->cb.color_write_enables & BITFIELD_BIT(i);
2742             unsigned write_mask = write_enable ? cb->write_mask : 0;
2743 
2744             /* nir_lower_blend always blends, so use a default blend state when
2745              * blending is disabled at an API level.
2746              */
2747             if (!dyn->cb.attachments[i].blend_enable) {
2748                key.epilog.blend.rt[i] = (struct agx_blend_rt_key){
2749                   .colormask = write_mask,
2750                   .rgb_func = PIPE_BLEND_ADD,
2751                   .alpha_func = PIPE_BLEND_ADD,
2752                   .rgb_src_factor = PIPE_BLENDFACTOR_ONE,
2753                   .alpha_src_factor = PIPE_BLENDFACTOR_ONE,
2754                   .rgb_dst_factor = PIPE_BLENDFACTOR_ZERO,
2755                   .alpha_dst_factor = PIPE_BLENDFACTOR_ZERO,
2756                };
2757             } else {
2758                key.epilog.blend.rt[i] = (struct agx_blend_rt_key){
2759                   .colormask = write_mask,
2760 
2761                   .rgb_src_factor =
2762                      vk_blend_factor_to_pipe(cb->src_color_blend_factor),
2763 
2764                   .rgb_dst_factor =
2765                      vk_blend_factor_to_pipe(cb->dst_color_blend_factor),
2766 
2767                   .rgb_func = vk_blend_op_to_pipe(cb->color_blend_op),
2768 
2769                   .alpha_src_factor =
2770                      vk_blend_factor_to_pipe(cb->src_alpha_blend_factor),
2771 
2772                   .alpha_dst_factor =
2773                      vk_blend_factor_to_pipe(cb->dst_alpha_blend_factor),
2774 
2775                   .alpha_func = vk_blend_op_to_pipe(cb->alpha_blend_op),
2776                };
2777             }
2778          }
2779 
2780          hk_update_fast_linked(cmd, fs, &key);
2781       } else {
2782          /* TODO: prolog without fs needs to work too... */
2783          if (cmd->state.gfx.linked[MESA_SHADER_FRAGMENT] != NULL) {
2784             cmd->state.gfx.linked_dirty |= BITFIELD_BIT(MESA_SHADER_FRAGMENT);
2785             cmd->state.gfx.linked[MESA_SHADER_FRAGMENT] = NULL;
2786          }
2787       }
2788    }
2789 
2790    /* If the vertex shader uses draw parameters, vertex uniforms are dirty every
2791     * draw. Fragment uniforms are unaffected.
2792     *
2793     * For a direct draw, we upload the draw parameters as if the draw were
2794     * indirect, to avoid keying the vertex shader to indirectness.
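    *
    * For example, a direct indexed draw uploads { vertexOffset, firstInstance },
    * matching the two words the indirect path points at within
    * VkDrawIndexedIndirectCommand.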
2795     */
2796    if (gfx->linked[MESA_SHADER_VERTEX]->b.uses_base_param) {
2797       if (agx_is_indirect(draw.b)) {
2798          gfx->draw_params = draw.b.ptr;
2799 
2800          if (draw.indexed) {
2801             gfx->draw_params +=
2802                offsetof(VkDrawIndexedIndirectCommand, vertexOffset);
2803          } else {
2804             gfx->draw_params += offsetof(VkDrawIndirectCommand, firstVertex);
2805          }
2806       } else {
2807          uint32_t params[] = {
2808             draw.indexed ? draw.index_bias : draw.start,
2809             draw.start_instance,
2810          };
2811 
2812          gfx->draw_params = hk_pool_upload(cmd, params, sizeof(params), 4);
2813       }
2814    } else {
2815       gfx->draw_params = 0;
2816    }
2817 
2818    if (sw_vs->b.info.uses_draw_id) {
2819       /* TODO: rodata? */
2820       gfx->draw_id_ptr = hk_pool_upload(cmd, &draw_id, 2, 4);
2821    } else {
2822       gfx->draw_id_ptr = 0;
2823    }
2824 
2825    if (IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || gt_dirty) {
2826       enum mesa_prim prim = hk_rast_prim(cmd);
2827 
2828       gfx->topology = translate_hw_primitive_topology(prim);
2829       gfx->object_type = translate_object_type(prim);
2830    }
2831 
2832    if (IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || IS_DIRTY(RS_PROVOKING_VERTEX)) {
2833       unsigned provoking;
2834       if (dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT)
2835          provoking = 2;
2836       else if (gfx->topology == AGX_PRIMITIVE_TRIANGLE_FAN)
2837          provoking = 1;
2838       else
2839          provoking = 0;
2840 
2841       if (provoking != gfx->provoking) {
2842          gfx->provoking = provoking;
2843          gfx->dirty |= HK_DIRTY_PROVOKING;
2844 
2845          gfx->descriptors.root.draw.provoking = provoking;
2846          gfx->descriptors.root_dirty = true;
2847       }
2848    }
2849 
2850    /* With attachmentless rendering, we don't know the sample count until draw
2851     * time, so we do a late tilebuffer fix up. But with rasterizer discard,
2852     * rasterization_samples might be 0.
2853     *
2854     * Note that we ignore dyn->ms.rasterization_samples when we do have a sample
2855     * count from an attachment. In Vulkan, these have to match anyway, but DX12
2856     * drivers are robust against this scenario and vkd3d-proton will go out of
2857     * spec here. No reason we can't be robust here too.
2858     */
2859    if (dyn->ms.rasterization_samples && !gfx->render.tilebuffer.nr_samples) {
2860       agx_tilebuffer_set_samples(&gfx->render.tilebuffer,
2861                                  dyn->ms.rasterization_samples);
2862 
2863       cs->tib = gfx->render.tilebuffer;
2864    }
2865 
2866    if (IS_DIRTY(MS_SAMPLE_LOCATIONS) || IS_DIRTY(MS_SAMPLE_LOCATIONS_ENABLE) ||
2867        IS_DIRTY(MS_RASTERIZATION_SAMPLES)) {
2868 
2869       uint32_t ctrl;
2870       if (dyn->ms.sample_locations_enable) {
2871          ctrl = hk_pack_ppp_multisamplectrl(dyn->ms.sample_locations);
2872       } else {
2873          ctrl = hk_default_sample_positions(dyn->ms.rasterization_samples);
2874       }
2875 
2876       bool dont_commit = cmd->in_meta || dyn->ms.rasterization_samples == 0;
2877 
2878       if (!cs->has_sample_locations) {
2879          cs->ppp_multisamplectl = ctrl;
2880 
2881          /* If we're in vk_meta, do not commit to the sample locations yet.
2882           * vk_meta doesn't care, but the app will!
2883           */
2884          cs->has_sample_locations |= !dont_commit;
2885       } else {
2886          assert(dont_commit || cs->ppp_multisamplectl == ctrl);
2887       }
2888 
2889       gfx->descriptors.root.draw.ppp_multisamplectl = ctrl;
2890       gfx->descriptors.root_dirty = true;
2891    }
2892 
2893    /* Link varyings before uploading tessellation state, because the
2894     * gfx->generate_primitive_id boolean needs to be plumbed.
2895     */
2896    struct hk_linked_shader *linked_vs = gfx->linked[MESA_SHADER_VERTEX];
2897    struct hk_linked_shader *linked_fs = gfx->linked[MESA_SHADER_FRAGMENT];
2898    bool linked_vs_dirty = IS_LINKED_DIRTY(VERTEX);
2899    bool linked_fs_dirty = IS_LINKED_DIRTY(FRAGMENT);
2900 
2901    if ((gfx->dirty & HK_DIRTY_PROVOKING) || vgt_dirty || linked_fs_dirty) {
2902       unsigned bindings = linked_fs ? linked_fs->b.cf.nr_bindings : 0;
2903       if (bindings) {
2904          size_t linkage_size =
2905             AGX_CF_BINDING_HEADER_LENGTH + (bindings * AGX_CF_BINDING_LENGTH);
2906 
2907          struct agx_ptr t = hk_pool_usc_alloc(cmd, linkage_size, 16);
2908          if (!t.cpu)
2909             return;
2910 
2911          agx_link_varyings_vs_fs(
2912             t.cpu, &gfx->linked_varyings, hw_vs->info.uvs.user_size,
2913             &linked_fs->b.cf, gfx->provoking, 0, &gfx->generate_primitive_id);
2914 
2915          gfx->varyings = agx_usc_addr(&dev->dev, t.gpu);
2916       } else {
2917          gfx->varyings = 0;
2918       }
2919 
2920       gfx->dirty |= HK_DIRTY_VARYINGS;
2921    }
2922 
2923    if (gfx->shaders[MESA_SHADER_TESS_EVAL] ||
2924        gfx->shaders[MESA_SHADER_GEOMETRY] || linked_vs->sw_indexing) {
2925       /* XXX: We should deduplicate this logic */
2926       bool indirect = agx_is_indirect(draw.b) || draw.restart;
2927 
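      /* With indirect or primitive-restart draws, the input assembly state is not
       * known on the CPU, so just allocate space for it here rather than uploading
       * it directly.
       */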
2928       desc->root.draw.input_assembly =
2929          indirect ? hk_pool_alloc(cmd, sizeof(struct agx_ia_state), 4).gpu
2930                   : hk_upload_ia_params(cmd, draw);
2931       desc->root_dirty = true;
2932    }
2933 
2934    if (gfx->shaders[MESA_SHADER_TESS_EVAL] ||
2935        gfx->shaders[MESA_SHADER_GEOMETRY]) {
2936 
2937       struct hk_shader *vs = hk_bound_sw_vs(gfx);
2938       desc->root.draw.vertex_outputs = vs->b.info.outputs;
2939 
2940       /* XXX: We should deduplicate this logic */
2941       bool indirect = agx_is_indirect(draw.b) || draw.restart;
2942 
2943       if (!indirect) {
2944          uint32_t verts = draw.b.count[0], instances = draw.b.count[1];
2945          unsigned vb_size =
2946             libagx_tcs_in_size(verts * instances, vs->b.info.outputs);
2947 
2948          /* Allocate if there are any outputs, or use the null sink to trap
2949           * reads if there aren't. Those reads are undefined but should not
2950           * fault. Affects:
2951           *
2952           *    dEQP-VK.pipeline.monolithic.no_position.explicit_declarations.basic.single_view.v0_g1
2953           */
2954          desc->root.draw.vertex_output_buffer =
2955             vb_size ? hk_pool_alloc(cmd, vb_size, 4).gpu
2956                     : dev->rodata.null_sink;
2957       }
2958    }
2959 
2960    struct agx_ptr tess_args = {0};
2961    if (gfx->shaders[MESA_SHADER_TESS_EVAL]) {
2962       tess_args = hk_pool_alloc(cmd, sizeof(struct libagx_tess_args), 4);
2963       gfx->descriptors.root.draw.tess_params = tess_args.gpu;
2964       gfx->descriptors.root_dirty = true;
2965    }
2966 
2967    if (gfx->shaders[MESA_SHADER_GEOMETRY]) {
2968       /* TODO: size */
2969       cmd->geom_indirect = hk_pool_alloc(cmd, 64, 4).gpu;
2970 
2971       gfx->descriptors.root.draw.geometry_params =
2972          hk_upload_geometry_params(cmd, draw);
2973 
2974       gfx->descriptors.root_dirty = true;
2975    }
2976 
2977    /* Root must be uploaded after the above, which touch the root */
2978    if (gfx->descriptors.root_dirty) {
2979       gfx->root =
2980          hk_cmd_buffer_upload_root(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
2981 
2982       /* Tess parameters depend on the root address, so we defer the upload
2983        * until after uploading root. But the root depends on the tess address,
2984        * so we allocate tess parameters before uploading root.
2985        *
2986        * This whole mechanism is a mess ported over from the GL driver. I'm
2987        * planning to do a massive rework of indirect geom/tess, so I'm not
2988        * trying to perfect it in the meantime.
2989        */
2990       if (tess_args.cpu) {
2991          hk_upload_tess_params(cmd, tess_args.cpu, draw);
2992       }
2993    }
2994 
2995    /* Hardware dynamic state must be deferred until after the root and fast
2996     * linking, since it will use the root address and the linked shaders.
2997     */
2998    if ((gfx->dirty & (HK_DIRTY_PROVOKING | HK_DIRTY_VARYINGS)) ||
2999        IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || linked_vs_dirty || vgt_dirty ||
3000        gfx->descriptors.root_dirty || gfx->draw_id_ptr || gfx->draw_params) {
3001 
3002       /* TODO: Do less often? */
3003       hk_reserve_scratch(cmd, cs, hw_vs);
3004 
3005       agx_push(out, VDM_STATE, cfg) {
3006          cfg.vertex_shader_word_0_present = true;
3007          cfg.vertex_shader_word_1_present = true;
3008          cfg.vertex_outputs_present = true;
3009          cfg.vertex_unknown_present = true;
3010       }
3011 
3012       agx_push_packed(out, hw_vs->counts, VDM_STATE_VERTEX_SHADER_WORD_0);
3013 
3014       struct hk_linked_shader *linked_hw_vs =
3015          (hw_vs == sw_vs) ? linked_vs : hw_vs->only_linked;
3016 
3017       agx_push(out, VDM_STATE_VERTEX_SHADER_WORD_1, cfg) {
3018          cfg.pipeline = hk_upload_usc_words(cmd, hw_vs, linked_hw_vs);
3019       }
3020 
3021       agx_push_packed(out, hw_vs->info.uvs.vdm, VDM_STATE_VERTEX_OUTPUTS);
3022 
3023       agx_push(out, VDM_STATE_VERTEX_UNKNOWN, cfg) {
3024          cfg.flat_shading_control = translate_vdm_vertex(gfx->provoking);
3025          cfg.unknown_4 = cfg.unknown_5 = dyn->rs.rasterizer_discard_enable;
3026          cfg.generate_primitive_id = gfx->generate_primitive_id;
3027       }
3028 
3029       /* Pad up to a multiple of 8 bytes */
3030       memset(out, 0, 4);
3031       out += 4;
3032    }
3033 
3034    if (IS_DIRTY(RS_DEPTH_BIAS_FACTORS)) {
3035       void *ptr =
3036          util_dynarray_grow_bytes(&cs->depth_bias, 1, AGX_DEPTH_BIAS_LENGTH);
3037 
3038       bool exact = dyn->rs.depth_bias.exact;
3039       bool force_unorm =
3040          dyn->rs.depth_bias.representation ==
3041          VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT;
3042 
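      /* The API constant factor is expressed in format-dependent units (and may be
       * forced to the UNORM scale), so rescale it into the value the hardware
       * expects.
       */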
3043       agx_pack(ptr, DEPTH_BIAS, cfg) {
3044          cfg.slope_scale = dyn->rs.depth_bias.slope_factor;
3045          cfg.clamp = dyn->rs.depth_bias.clamp;
3046          cfg.depth_bias = dyn->rs.depth_bias.constant_factor;
3047          cfg.depth_bias /= hk_depth_bias_factor(render->depth_att.vk_format,
3048                                                 exact, force_unorm);
3049       }
3050    }
3051 
3052    /* Hardware viewport/scissor state is entangled with depth bias. */
3053    if (IS_DIRTY(RS_DEPTH_BIAS_FACTORS) || IS_DIRTY(VP_SCISSORS) ||
3054        IS_DIRTY(VP_SCISSOR_COUNT) || IS_DIRTY(VP_VIEWPORTS) ||
3055        IS_DIRTY(VP_VIEWPORT_COUNT) ||
3056        IS_DIRTY(VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) ||
3057        IS_DIRTY(RS_DEPTH_CLIP_ENABLE) || IS_DIRTY(RS_DEPTH_CLAMP_ENABLE)) {
3058 
3059       hk_flush_vp_state(cmd, cs, &out);
3060    }
3061 
3062    hk_flush_ppp_state(cmd, cs, &out);
3063    cs->current = out;
3064 
3065    vk_dynamic_graphics_state_clear_dirty(dyn);
3066    gfx->shaders_dirty = 0;
3067    gfx->linked_dirty = 0;
3068    gfx->dirty = 0;
3069    gfx->descriptors.root_dirty = false;
3070 }
3071 
3072 static bool
3073 hk_needs_index_robustness(struct hk_cmd_buffer *cmd, struct agx_draw *draw)
3074 {
3075    struct hk_graphics_state *gfx = &cmd->state.gfx;
3076    struct hk_device *dev = hk_cmd_buffer_device(cmd);
3077 
3078    if (!draw->indexed)
3079       return false;
3080 
3081    /* Geometry or tessellation use robust software index buffer fetch anyway */
3082    if (gfx->shaders[MESA_SHADER_GEOMETRY] ||
3083        gfx->shaders[MESA_SHADER_TESS_EVAL])
3084       return false;
3085 
3086    /* Soft fault does not cover the hardware index buffer fetch, so we can't
3087     * simply leave an empty index buffer bound. Instead, bind our 16-byte zero
3088     * sink and rely on the hardware clamp. This does seem to work.
3089     */
3090    if (draw->index_buffer_range_B == 0) {
3091       draw->index_buffer = dev->rodata.zero_sink;
3092       draw->index_buffer_range_B = 4;
3093       draw->start = 0;
3094       return false;
3095    }
3096 
3097    if (!(dev->vk.enabled_features.robustBufferAccess ||
3098          dev->vk.enabled_features.robustBufferAccess2 ||
3099          dev->vk.enabled_features.pipelineRobustness))
3100       return false;
3101 
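   /* An indirect draw may consume any number of indices, which we cannot bound on
    * the CPU, so conservatively treat it as potentially overreading.
    */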
3102    if (agx_is_indirect(draw->b))
3103       return true;
3104 
3105    return agx_direct_draw_overreads_indices(*draw);
3106 }
3107 
3108 static void
3109 hk_handle_passthrough_gs(struct hk_cmd_buffer *cmd, struct agx_draw draw)
3110 {
3111    struct hk_graphics_state *gfx = &cmd->state.gfx;
3112    struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
3113 
3114    /* If there's an application geometry shader, there's nothing to un/bind */
3115    if (gs && !gs->is_passthrough)
3116       return;
3117 
3118    /* Determine if we need a passthrough geometry shader to emulate XFB */
3119    struct hk_shader *last_sw = hk_bound_sw_vs_before_gs(gfx);
3120    uint32_t xfb_outputs = last_sw->info.xfb_info.output_count;
3121    bool needs_gs = xfb_outputs;
3122 
3123    /* If we already have a matching GS configuration, we're done */
3124    if ((gs != NULL) == needs_gs)
3125       return;
3126 
3127    /* If we don't need a GS but we do have a passthrough, unbind it */
3128    if (gs) {
3129       assert(!needs_gs && gs->is_passthrough);
3130       hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, NULL);
3131       return;
3132    }
3133 
3134    /* Else, we need to bind a passthrough GS */
3135    size_t key_size =
3136       sizeof(struct hk_passthrough_gs_key) + nir_xfb_info_size(xfb_outputs);
3137    struct hk_passthrough_gs_key *key = alloca(key_size);
3138 
3139    *key = (struct hk_passthrough_gs_key){
3140       .prim = u_decomposed_prim(hk_gs_in_prim(cmd)),
3141       .outputs = last_sw->b.info.outputs,
3142       .clip_distance_array_size = last_sw->info.clip_distance_array_size,
3143       .cull_distance_array_size = last_sw->info.cull_distance_array_size,
3144    };
3145 
3146    if (xfb_outputs) {
3147       typed_memcpy(key->xfb_stride, last_sw->info.xfb_stride,
3148                    ARRAY_SIZE(key->xfb_stride));
3149 
3150       memcpy(&key->xfb_info, &last_sw->info.xfb_info,
3151              nir_xfb_info_size(xfb_outputs));
3152    }
3153 
3154    struct hk_device *dev = hk_cmd_buffer_device(cmd);
3155    perf_debug(dev, "Binding passthrough GS for%s\n", xfb_outputs ? " XFB" : "");
3156 
3157    gs = hk_meta_shader(dev, hk_nir_passthrough_gs, key, key_size);
3158    gs->is_passthrough = true;
3159    hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, gs);
3160 }
3161 
3162 static struct hk_cs *
3163 hk_flush_gfx_state(struct hk_cmd_buffer *cmd, uint32_t draw_id,
3164                    struct agx_draw draw)
3165 {
3166    struct hk_device *dev = hk_cmd_buffer_device(cmd);
3167    struct hk_graphics_state *gfx = &cmd->state.gfx;
3168    struct hk_descriptor_state *desc = &gfx->descriptors;
3169 
3170    struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, false /* compute */);
3171    const struct vk_dynamic_graphics_state *dyn =
3172       &cmd->vk.dynamic_graphics_state;
3173 
3174    if (!cs)
3175       return NULL;
3176 
3177    /* Annoyingly,
3178     * VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT is
3179     * render pass state on Imaginapple but draw state in Vulkan. In practice,
3180     * Proton never changes it within a render pass, but we technically need to
3181     * handle the switch regardless. Do so early since `cs` will be invalidated
3182     * if we need to split the render pass to switch representation mid-frame.
3183     *
3184     * Note we only do this dance when depth bias is actually enabled, to avoid
3185     * senseless control stream splits with DXVK.
3186     */
3187    if ((IS_DIRTY(RS_DEPTH_BIAS_FACTORS) || IS_DIRTY(RS_DEPTH_BIAS_ENABLE)) &&
3188        dyn->rs.depth_bias.enable) {
3189 
3190       bool dbias_is_int =
3191          (dyn->rs.depth_bias.representation ==
3192           VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT) ||
3193          (gfx->render.depth_att.vk_format == VK_FORMAT_D16_UNORM);
3194 
3195       /* Attempt to set dbias_is_int per the draw requirement. If this fails,
3196        * flush the control stream and set it on the new control stream.
3197        */
3198       bool succ = u_tristate_set(&cs->cr.dbias_is_int, dbias_is_int);
3199       if (!succ) {
3200          perf_debug(dev, "Splitting control stream due to depth bias");
3201 
3202          hk_cmd_buffer_end_graphics(cmd);
3203          cs = hk_cmd_buffer_get_cs(cmd, false /* compute */);
3204 
3205          succ = u_tristate_set(&cs->cr.dbias_is_int, dbias_is_int);
3206          assert(succ && "can always set tri-state on a new control stream");
3207       }
3208    }
3209 
3210    hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
3211 
3212 #ifndef NDEBUG
3213    if (unlikely(dev->dev.debug & AGX_DBG_DIRTY)) {
3214       hk_cmd_buffer_dirty_all(cmd);
3215    }
3216 #endif
3217 
3218    /* Merge tess info before GS construction since that depends on
3219     * gfx->tess.prim
3220     */
3221    if ((IS_SHADER_DIRTY(TESS_CTRL) || IS_SHADER_DIRTY(TESS_EVAL)) &&
3222        gfx->shaders[MESA_SHADER_TESS_CTRL]) {
3223       struct hk_api_shader *tcs = gfx->shaders[MESA_SHADER_TESS_CTRL];
3224       struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL];
3225       struct hk_shader *tese = hk_any_variant(tes);
3226       struct hk_shader *tesc = hk_only_variant(tcs);
3227 
3228       gfx->tess.info =
3229          hk_tess_info_merge(tese->info.tess.info, tesc->info.tess.info);
3230 
3231       /* Determine primitive based on the merged state */
3232       if (gfx->tess.info.points) {
3233          gfx->tess.prim = MESA_PRIM_POINTS;
3234       } else if (gfx->tess.info.mode == TESS_PRIMITIVE_ISOLINES) {
3235          gfx->tess.prim = MESA_PRIM_LINES;
3236       } else {
3237          gfx->tess.prim = MESA_PRIM_TRIANGLES;
3238       }
3239    }
3240 
3241    /* TODO: Try to reduce draw overhead of this */
3242    hk_handle_passthrough_gs(cmd, draw);
3243 
3244    hk_flush_shaders(cmd);
3245 
3246    if (desc->push_dirty)
3247       hk_cmd_buffer_flush_push_descriptors(cmd, desc);
3248 
3249    if (draw.restart || gfx->shaders[MESA_SHADER_GEOMETRY])
3250       hk_flush_index(cmd, cs);
3251 
3252    hk_flush_dynamic_state(cmd, cs, draw_id, draw);
3253    return cs;
3254 }
3255 
3256 VKAPI_ATTR void VKAPI_CALL
3257 hk_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3258                           VkDeviceSize offset, VkDeviceSize size,
3259                           VkIndexType indexType)
3260 {
3261    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3262    VK_FROM_HANDLE(hk_buffer, buffer, _buffer);
3263 
3264    cmd->state.gfx.index = (struct hk_index_buffer_state){
3265       .buffer = hk_buffer_addr_range(buffer, offset, size),
3266       .size = agx_translate_index_size(vk_index_type_to_bytes(indexType)),
3267       .restart = vk_index_to_restart(indexType),
3268    };
3269 
3270    /* TODO: check if necessary, blob does this */
3271    cmd->state.gfx.index.buffer.range =
3272       align(cmd->state.gfx.index.buffer.range, 4);
3273 }
3274 
3275 void
3276 hk_cmd_bind_vertex_buffer(struct hk_cmd_buffer *cmd, uint32_t vb_idx,
3277                           struct hk_addr_range addr_range)
3278 {
3279    cmd->state.gfx.vb[vb_idx] = addr_range;
3280    cmd->state.gfx.dirty |= HK_DIRTY_VB;
3281 }
3282 
3283 VKAPI_ATTR void VKAPI_CALL
3284 hk_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, uint32_t firstBinding,
3285                          uint32_t bindingCount, const VkBuffer *pBuffers,
3286                          const VkDeviceSize *pOffsets,
3287                          const VkDeviceSize *pSizes,
3288                          const VkDeviceSize *pStrides)
3289 {
3290    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3291 
3292    if (pStrides) {
3293       vk_cmd_set_vertex_binding_strides(&cmd->vk, firstBinding, bindingCount,
3294                                         pStrides);
3295    }
3296 
3297    for (uint32_t i = 0; i < bindingCount; i++) {
3298       VK_FROM_HANDLE(hk_buffer, buffer, pBuffers[i]);
3299       uint32_t idx = firstBinding + i;
3300 
3301       uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
3302       const struct hk_addr_range addr_range =
3303          hk_buffer_addr_range(buffer, pOffsets[i], size);
3304 
3305       hk_cmd_bind_vertex_buffer(cmd, idx, addr_range);
3306    }
3307 }
3308 
3309 static bool
3310 hk_set_view_index(struct hk_cmd_buffer *cmd, uint32_t view_idx)
3311 {
3312    if (cmd->state.gfx.render.view_mask) {
3313       cmd->state.gfx.descriptors.root.draw.view_index = view_idx;
3314       cmd->state.gfx.descriptors.root_dirty = true;
3315    }
3316 
3317    return true;
3318 }
3319 
3320 /*
3321  * Iterator macro to duplicate a draw for each enabled view (when multiview is
3322  * enabled, else always view 0). Along with hk_lower_multiview, this forms the
3323  * world's worst multiview lowering.
3324  */
3325 #define hk_foreach_view(cmd)                                                   \
3326    u_foreach_bit(view_idx, cmd->state.gfx.render.view_mask ?: 1)               \
3327       if (hk_set_view_index(cmd, view_idx))
3328 
3329 static void
3330 hk_ia_update(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct agx_draw draw,
3331              uint64_t ia_vertices, uint64_t ia_prims, uint64_t vs_invocations,
3332              uint64_t c_prims, uint64_t c_inv)
3333 {
3334    /* XXX: stream link needed? */
3335    struct hk_device *dev = hk_cmd_buffer_device(cmd);
3336    perf_debug(dev, "Input assembly counters");
3337 
3338    uint64_t draw_ptr;
3339    if (agx_is_indirect(draw.b)) {
3340       draw_ptr = draw.b.ptr;
3341    } else {
3342       uint32_t desc[] = {draw.b.count[0], draw.b.count[1], 0};
3343       draw_ptr = hk_pool_upload(cmd, &desc, sizeof(desc), 4);
3344    }
3345 
3346    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
3347    enum mesa_prim prim = vk_conv_topology(dyn->ia.primitive_topology);
3348 
3349    bool geom = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY];
3350    bool tess = cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL];
3351 
3352    /* Clipper counters depend on geom/tess outputs and must be written
3353     * alongside them. They are updated here, with the IA counters, only when
3354     * geom/tess is not used.
3355     *
3356     * TODO: Tessellation clipper counters not actually wired up, pending CTS.
3357     */
3358    if (geom || tess) {
3359       c_prims = 0;
3360       c_inv = 0;
3361    }
3362 
3363    if (draw.restart) {
3364       uint32_t index_size_B = agx_index_size_to_B(draw.index_size);
3365 
3366       libagx_increment_ia_restart(
3367          cs, agx_1d(1024), AGX_BARRIER_ALL, ia_vertices, ia_prims,
3368          vs_invocations, c_prims, c_inv, draw_ptr, draw.index_buffer,
3369          agx_draw_index_range_el(draw), cmd->state.gfx.index.restart,
3370          index_size_B, prim);
3371    } else {
3372       libagx_increment_ia(cs, agx_1d(1), AGX_BARRIER_ALL, ia_vertices, ia_prims,
3373                           vs_invocations, c_prims, c_inv, draw_ptr, prim);
3374    }
3375 }
3376 
3377 static void
3378 hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct agx_draw draw_)
3379 {
3380    const struct vk_dynamic_graphics_state *dyn =
3381       &cmd->vk.dynamic_graphics_state;
3382 
3383    /* Filter trivial draws so we don't need to worry about null index buffers */
3384    if (!agx_is_indirect(draw_.b) &&
3385        (draw_.b.count[0] == 0 || draw_.b.count[1] == 0))
3386       return;
3387 
3388    draw_.restart = dyn->ia.primitive_restart_enable && draw_.indexed;
3389    draw_.index_size = cmd->state.gfx.index.size;
3390 
3391    uint64_t stat_ia_verts = hk_pipeline_stat_addr(
3392       cmd, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT);
3393 
3394    uint64_t stat_ia_prims = hk_pipeline_stat_addr(
3395       cmd, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT);
3396 
3397    uint64_t stat_vs_inv = hk_pipeline_stat_addr(
3398       cmd, VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT);
3399 
3400    uint64_t stat_c_inv = hk_pipeline_stat_addr(
3401       cmd, VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT);
3402 
3403    uint64_t stat_c_prims = hk_pipeline_stat_addr(
3404       cmd, VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT);
3405 
3406    bool ia_stats = stat_ia_verts || stat_ia_prims || stat_vs_inv ||
3407                    stat_c_inv || stat_c_prims;
3408    struct hk_device *dev = hk_cmd_buffer_device(cmd);
3409 
3410    hk_foreach_view(cmd) {
3411       struct agx_draw draw = draw_;
3412       struct hk_cs *cs = hk_flush_gfx_state(cmd, draw_id, draw);
3413       /* If we failed to allocate a control stream, we've already lost the
3414        * device. Just drop the draw so we don't crash.
3415        */
3416       if (!cs)
3417          return;
3418 
3419       struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
3420       bool geom = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY];
3421       bool tess = cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL];
3422       bool needs_idx_robust = hk_needs_index_robustness(cmd, &draw);
3423       bool adj =
3424          mesa_prim_has_adjacency(vk_conv_topology(dyn->ia.primitive_topology));
3425       adj &= !geom;
3426       needs_idx_robust &= !adj;
3427 
3428       struct hk_cs *ccs = NULL;
3429       uint8_t *out = cs->current;
3430       assert(cs->current + 0x1000 < cs->end);
3431 
3432       if (tess && HK_PERF(dev, NOTESS))
3433          continue;
3434 
3435       cs->stats.calls++;
3436 
3437       if (geom || tess || ia_stats || needs_idx_robust ||
3438           (adj && (agx_is_indirect(draw.b) || draw.restart))) {
3439 
3440          ccs =
3441             hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true);
3442          if (!ccs)
3443             return;
3444       }
3445 
3446       if (ia_stats) {
3447          hk_ia_update(cmd, ccs, draw, stat_ia_verts, stat_ia_prims, stat_vs_inv,
3448                       stat_c_prims, stat_c_inv);
3449       }
3450 
3451       if (tess) {
3452          draw = hk_launch_tess(cmd, ccs, draw);
3453       }
3454 
3455       if (geom) {
3456          draw = hk_launch_gs_prerast(cmd, ccs, draw);
3457 
3458          /* We must not draw if the app specified rasterizer discard. This is
3459           * required both for performance (it is pointless to rasterize and
3460           * there are no side effects) and for correctness (no indirect draw
3461           * descriptor will be filled out).
3462           */
3463          if (dyn->rs.rasterizer_discard_enable)
3464             continue;
3465       }
3466 
3467       if (adj) {
3468          assert(!geom && "geometry shaders handle adj directly");
3469          enum mesa_prim prim = vk_conv_topology(dyn->ia.primitive_topology);
3470 
3471          if (draw.restart) {
3472             draw = hk_draw_without_restart(cmd, ccs, draw, 1);
3473             prim = u_decomposed_prim(prim);
3474          }
3475 
3476          if (agx_is_indirect(draw.b)) {
3477             const size_t size = sizeof(VkDrawIndexedIndirectCommand);
3478             static_assert(sizeof(VkDrawIndexedIndirectCommand) >
3479                              sizeof(VkDrawIndirectCommand),
3480                           "allocation size is conservative");
3481 
3482             uint64_t out_draw = hk_pool_alloc(cmd, size, 4).gpu;
3483             struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
3484 
3485             libagx_draw_without_adj(
3486                ccs, agx_1d(1), AGX_BARRIER_ALL, out_draw, draw.b.ptr,
3487                desc->root.draw.input_assembly, draw.index_buffer,
3488                draw.indexed ? agx_draw_index_range_el(draw) : 0,
3489                draw.indexed ? agx_index_size_to_B(draw.index_size) : 0, prim);
3490 
3491             draw = agx_draw_indirect(out_draw);
3492          } else {
3493             unsigned count = libagx_remap_adj_count(draw.b.count[0], prim);
3494 
3495             draw = (struct agx_draw){
3496                .b = agx_3d(count, draw.b.count[1], 1),
3497             };
3498          }
3499       }
3500 
3501       enum agx_primitive topology = cmd->state.gfx.topology;
3502       if (needs_idx_robust) {
3503          assert(!geom && !tess && !adj);
3504          perf_debug(dev, "lowering robust index buffer");
3505 
3506          cs->current = out;
3507 
3508          draw = hk_draw_as_indexed_indirect(cmd, draw);
3509 
3510          size_t size_B = libagx_draw_robust_index_vdm_size();
3511          uint64_t target = hk_cs_alloc_for_indirect(cs, size_B);
3512 
3513          libagx_draw_robust_index(ccs, agx_1d(32), AGX_BARRIER_ALL, target,
3514                                   hk_geometry_state(cmd), draw.b.ptr,
3515                                   draw.index_buffer, draw.index_buffer_range_B,
3516                                   draw.restart, topology, draw.index_size);
3517       } else {
3518          cs->current = (void *)agx_vdm_draw((uint32_t *)out, dev->dev.chip,
3519                                             draw, topology);
3520       }
3521 
3522       cs->stats.cmds++;
3523    }
3524 }
3525 
3526 VKAPI_ATTR void VKAPI_CALL
3527 hk_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount,
3528            uint32_t instanceCount, uint32_t firstVertex, uint32_t firstInstance)
3529 {
3530    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3531    struct agx_draw draw;
3532 
3533    if (HK_TEST_INDIRECTS) {
3534       uint32_t data[] = {
3535          vertexCount,
3536          instanceCount,
3537          firstVertex,
3538          firstInstance,
3539       };
3540 
3541       draw = agx_draw_indirect(hk_pool_upload(cmd, data, sizeof(data), 4));
3542    } else {
3543       draw = (struct agx_draw){
3544          .b = agx_3d(vertexCount, instanceCount, 1),
3545          .start = firstVertex,
3546          .start_instance = firstInstance,
3547       };
3548    }
3549 
3550    hk_draw(cmd, 0, draw);
3551 }
3552 
3553 VKAPI_ATTR void VKAPI_CALL
3554 hk_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount,
3555                    const VkMultiDrawInfoEXT *pVertexInfo,
3556                    uint32_t instanceCount, uint32_t firstInstance,
3557                    uint32_t stride)
3558 {
3559    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3560 
3561    for (unsigned i = 0; i < drawCount; ++i) {
3562       struct agx_draw draw = {
3563          .b = agx_3d(pVertexInfo->vertexCount, instanceCount, 1),
3564          .start = pVertexInfo->firstVertex,
3565          .start_instance = firstInstance,
3566       };
3567 
3568       hk_draw(cmd, i, draw);
3569       pVertexInfo = ((void *)pVertexInfo) + stride;
3570    }
3571 }
3572 
3573 static void
3574 hk_draw_indexed(VkCommandBuffer commandBuffer, uint16_t draw_id,
3575                 uint32_t indexCount, uint32_t instanceCount,
3576                 uint32_t firstIndex, int32_t vertexOffset,
3577                 uint32_t firstInstance)
3578 {
3579    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3580    struct agx_draw draw;
3581    struct hk_addr_range buf = cmd->state.gfx.index.buffer;
3582 
3583    if (HK_TEST_INDIRECTS && draw_id == 0) {
3584       uint32_t data[] = {
3585          indexCount, instanceCount, firstIndex, vertexOffset, firstInstance,
3586       };
3587       uint64_t addr = hk_pool_upload(cmd, data, sizeof(data), 4);
3588 
3589       draw = agx_draw_indexed_indirect(addr, buf.addr, buf.range, 0, 0);
3590    } else {
3591       draw =
3592          agx_draw_indexed(indexCount, instanceCount, firstIndex, vertexOffset,
3593                           firstInstance, buf.addr, buf.range, 0, 0);
3594    }
3595 
3596    hk_draw(cmd, draw_id, draw);
3597 }
3598 
3599 VKAPI_ATTR void VKAPI_CALL
3600 hk_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount,
3601                   uint32_t instanceCount, uint32_t firstIndex,
3602                   int32_t vertexOffset, uint32_t firstInstance)
3603 {
3604    hk_draw_indexed(commandBuffer, 0, indexCount, instanceCount, firstIndex,
3605                    vertexOffset, firstInstance);
3606 }
3607 
3608 VKAPI_ATTR void VKAPI_CALL
3609 hk_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount,
3610                           const VkMultiDrawIndexedInfoEXT *pIndexInfo,
3611                           uint32_t instanceCount, uint32_t firstInstance,
3612                           uint32_t stride, const int32_t *pVertexOffset)
3613 {
3614    for (unsigned i = 0; i < drawCount; ++i) {
3615       const uint32_t vertex_offset =
3616          pVertexOffset != NULL ? *pVertexOffset : pIndexInfo->vertexOffset;
3617 
3618       hk_draw_indexed(commandBuffer, i, pIndexInfo->indexCount, instanceCount,
3619                       pIndexInfo->firstIndex, vertex_offset, firstInstance);
3620 
3621       pIndexInfo = ((void *)pIndexInfo) + stride;
3622    }
3623 }
3624 
3625 static void
3626 hk_draw_indirect_inner(VkCommandBuffer commandBuffer, uint64_t base,
3627                        uint32_t drawCount, uint32_t stride)
3628 {
3629    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3630 
3631    /* From the Vulkan 1.3.238 spec:
3632     *
3633     *    VUID-vkCmdDrawIndirect-drawCount-00476
3634     *
3635     *    "If drawCount is greater than 1, stride must be a multiple of 4 and
3636     *    must be greater than or equal to sizeof(VkDrawIndirectCommand)"
3637     *
3638     * and
3639     *
3640     *    "If drawCount is less than or equal to one, stride is ignored."
3641     */
3642    if (drawCount > 1) {
3643       assert(stride % 4 == 0);
3644       assert(stride >= sizeof(VkDrawIndirectCommand));
3645    }
3646 
3647    for (unsigned draw_id = 0; draw_id < drawCount; ++draw_id) {
3648       uint64_t addr = base + stride * draw_id;
3649       hk_draw(cmd, draw_id, agx_draw_indirect(addr));
3650    }
3651 }
3652 
3653 VKAPI_ATTR void VKAPI_CALL
3654 hk_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3655                    VkDeviceSize offset, uint32_t drawCount, uint32_t stride)
3656 {
3657    VK_FROM_HANDLE(hk_buffer, buffer, _buffer);
3658 
3659    hk_draw_indirect_inner(commandBuffer, hk_buffer_address(buffer, offset),
3660                           drawCount, stride);
3661 }
3662 
3663 static void
3664 hk_draw_indexed_indirect_inner(VkCommandBuffer commandBuffer, uint64_t buffer,
3665                                uint32_t drawCount, uint32_t stride)
3666 {
3667    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3668 
3669    /* From the Vulkan 1.3.238 spec:
3670     *
3671     *    VUID-vkCmdDrawIndexedIndirect-drawCount-00528
3672     *
3673     *    "If drawCount is greater than 1, stride must be a multiple of 4 and
3674     *    must be greater than or equal to
3675     * sizeof(VkDrawIndexedIndirectCommand)"
3676     *
3677     * and
3678     *
3679     *    "If drawCount is less than or equal to one, stride is ignored."
3680     */
3681    if (drawCount > 1) {
3682       assert(stride % 4 == 0);
3683       assert(stride >= sizeof(VkDrawIndexedIndirectCommand));
3684    }
3685 
3686    for (unsigned draw_id = 0; draw_id < drawCount; ++draw_id) {
3687       uint64_t addr = buffer + stride * draw_id;
3688       struct hk_addr_range buf = cmd->state.gfx.index.buffer;
3689 
3690       hk_draw(cmd, draw_id,
3691               agx_draw_indexed_indirect(addr, buf.addr, buf.range, 0, 0));
3692    }
3693 }
3694 
3695 VKAPI_ATTR void VKAPI_CALL
3696 hk_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3697                           VkDeviceSize offset, uint32_t drawCount,
3698                           uint32_t stride)
3699 {
3700    VK_FROM_HANDLE(hk_buffer, buffer, _buffer);
3701 
3702    hk_draw_indexed_indirect_inner(
3703       commandBuffer, hk_buffer_address(buffer, offset), drawCount, stride);
3704 }
3705 
3706 /*
3707  * To implement drawIndirectCount generically, we dispatch a compute kernel to
3708  * patch the indirect buffer and then we dispatch the predicated maxDrawCount
3709  * indirect draws.
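 *
 * Draws at or past the GPU-side count are expected to be patched to zero
 * counts, so the excess indirect draws become no-ops.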
3710  */
3711 static void
3712 hk_draw_indirect_count(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3713                        VkDeviceSize offset, VkBuffer countBuffer,
3714                        VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
3715                        uint32_t stride, bool indexed)
3716 {
3717    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3718    VK_FROM_HANDLE(hk_buffer, buffer, _buffer);
3719    VK_FROM_HANDLE(hk_buffer, count_buffer, countBuffer);
3720 
3721    struct hk_device *dev = hk_cmd_buffer_device(cmd);
3722    perf_debug(dev, "Draw indirect count");
3723 
3724    struct hk_cs *cs =
3725       hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true);
3726    if (!cs)
3727       return;
3728 
3729    hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
3730 
3731    assert((stride % 4) == 0 && "aligned");
3732 
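   /* The patched commands are tightly packed: VkDrawIndexedIndirectCommand is 5
    * dwords and VkDrawIndirectCommand is 4.
    */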
3733    size_t out_stride = sizeof(uint32_t) * (indexed ? 5 : 4);
3734    uint64_t patched = hk_pool_alloc(cmd, out_stride * maxDrawCount, 4).gpu;
3735    uint64_t in = hk_buffer_address(buffer, offset);
3736    uint64_t count_addr = hk_buffer_address(count_buffer, countBufferOffset);
3737 
3738    libagx_predicate_indirect(cs, agx_1d(maxDrawCount), AGX_BARRIER_ALL, patched,
3739                              in, count_addr, stride / 4, indexed);
3740 
3741    if (indexed) {
3742       hk_draw_indexed_indirect_inner(commandBuffer, patched, maxDrawCount,
3743                                      out_stride);
3744    } else {
3745       hk_draw_indirect_inner(commandBuffer, patched, maxDrawCount, out_stride);
3746    }
3747 }
3748 
3749 VKAPI_ATTR void VKAPI_CALL
3750 hk_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3751                         VkDeviceSize offset, VkBuffer countBuffer,
3752                         VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
3753                         uint32_t stride)
3754 {
3755    hk_draw_indirect_count(commandBuffer, _buffer, offset, countBuffer,
3756                           countBufferOffset, maxDrawCount, stride, false);
3757 }
3758 
3759 VKAPI_ATTR void VKAPI_CALL
3760 hk_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3761                                VkDeviceSize offset, VkBuffer countBuffer,
3762                                VkDeviceSize countBufferOffset,
3763                                uint32_t maxDrawCount, uint32_t stride)
3764 {
3765    hk_draw_indirect_count(commandBuffer, _buffer, offset, countBuffer,
3766                           countBufferOffset, maxDrawCount, stride, true);
3767 }
3768 
3769 VKAPI_ATTR void VKAPI_CALL
3770 hk_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
3771                                uint32_t instanceCount, uint32_t firstInstance,
3772                                VkBuffer counterBuffer,
3773                                VkDeviceSize counterBufferOffset,
3774                                uint32_t counterOffset, uint32_t vertexStride)
3775 {
3776    unreachable("TODO");
3777 }
3778 
3779 VKAPI_ATTR void VKAPI_CALL
3780 hk_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
3781                                       uint32_t firstBinding,
3782                                       uint32_t bindingCount,
3783                                       const VkBuffer *pBuffers,
3784                                       const VkDeviceSize *pOffsets,
3785                                       const VkDeviceSize *pSizes)
3786 {
3787    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3788    struct hk_graphics_state *gfx = &cmd->state.gfx;
3789 
3790    for (uint32_t i = 0; i < bindingCount; i++) {
3791       VK_FROM_HANDLE(hk_buffer, buffer, pBuffers[i]);
3792       uint32_t idx = firstBinding + i;
3793       uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
3794 
3795       gfx->xfb[idx] = hk_buffer_addr_range(buffer, pOffsets[i], size);
3796    }
3797 }
3798 
3799 static void
3800 hk_begin_end_xfb(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
3801                  uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
3802                  const VkDeviceSize *pCounterBufferOffsets, bool begin)
3803 
3804 {
3805    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3806    struct hk_device *dev = hk_cmd_buffer_device(cmd);
3807    struct hk_graphics_state *gfx = &cmd->state.gfx;
3808 
3809    gfx->xfb_enabled = begin;
3810 
3811    /* If we haven't reserved XFB offsets yet for the command buffer, do so. */
3812    if (!gfx->xfb_offsets) {
3813       gfx->xfb_offsets = hk_pool_alloc(cmd, 4 * sizeof(uint32_t), 4).gpu;
3814    }
3815 
3816    struct hk_cs *cs =
3817       hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true);
3818    if (!cs)
3819       return;
3820    hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
3821 
3822    struct libagx_xfb_counter_copy params = {};
3823    unsigned copies = 0;
3824 
3825    /* For CmdBeginTransformFeedbackEXT, we need to initialize everything */
3826    if (begin) {
3827       for (copies = 0; copies < 4; ++copies) {
3828          params.dest[copies] = gfx->xfb_offsets + copies * sizeof(uint32_t);
3829       }
3830    }
3831 
3832    for (unsigned i = 0; i < counterBufferCount; ++i) {
3833       if (pCounterBuffers[i] == VK_NULL_HANDLE)
3834          continue;
3835 
3836       VK_FROM_HANDLE(hk_buffer, buffer, pCounterBuffers[i]);
3837 
3838       uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
3839       uint64_t cb_addr = hk_buffer_address(buffer, offset);
3840       uint32_t cmd_idx = firstCounterBuffer + i;
3841 
3842       if (begin) {
3843          params.src[cmd_idx] = cb_addr;
3844       } else {
3845          params.dest[copies] = cb_addr;
3846          params.src[copies] = gfx->xfb_offsets + cmd_idx * sizeof(uint32_t);
3847          ++copies;
3848       }
3849    }
3850 
3851    if (begin)
3852       copies = 4;
3853 
3854    if (copies > 0) {
3855       perf_debug(dev, "XFB counter copy");
3856 
3857       libagx_copy_xfb_counters(cs, agx_1d(copies), AGX_BARRIER_ALL,
3858                                hk_pool_upload(cmd, &params, sizeof(params), 8));
3859    }
3860 }
3861 
3862 VKAPI_ATTR void VKAPI_CALL
3863 hk_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
3864                                 uint32_t firstCounterBuffer,
3865                                 uint32_t counterBufferCount,
3866                                 const VkBuffer *pCounterBuffers,
3867                                 const VkDeviceSize *pCounterBufferOffsets)
3868 {
3869    hk_begin_end_xfb(commandBuffer, firstCounterBuffer, counterBufferCount,
3870                     pCounterBuffers, pCounterBufferOffsets, true);
3871 }
3872 
3873 VKAPI_ATTR void VKAPI_CALL
3874 hk_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
3875                               uint32_t firstCounterBuffer,
3876                               uint32_t counterBufferCount,
3877                               const VkBuffer *pCounterBuffers,
3878                               const VkDeviceSize *pCounterBufferOffsets)
3879 {
3880    hk_begin_end_xfb(commandBuffer, firstCounterBuffer, counterBufferCount,
3881                     pCounterBuffers, pCounterBufferOffsets, false);
3882 }
3883 
3884 VKAPI_ATTR void VKAPI_CALL
3885 hk_CmdBeginConditionalRenderingEXT(
3886    VkCommandBuffer commandBuffer,
3887    const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
3888 {
3889    unreachable("stub");
3890 }
3891 
3892 VKAPI_ATTR void VKAPI_CALL
3893 hk_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
3894 {
3895    unreachable("stub");
3896 }
3897