/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "hk_cmd_buffer.h"

#include "agx_bo.h"
#include "agx_device.h"
#include "agx_linker.h"
#include "agx_tilebuffer.h"
#include "agx_usc.h"
#include "hk_buffer.h"
#include "hk_cmd_pool.h"
#include "hk_descriptor_set.h"
#include "hk_descriptor_set_layout.h"
#include "hk_device.h"
#include "hk_device_memory.h"
#include "hk_entrypoints.h"
#include "hk_image_view.h"
#include "hk_physical_device.h"
#include "hk_shader.h"

#include "libagx_dgc.h"
#include "pool.h"
#include "shader_enums.h"
#include "vk_pipeline_layout.h"
#include "vk_synchronization.h"

#include "util/list.h"
#include "util/macros.h"
#include "util/u_dynarray.h"
#include "vulkan/vulkan_core.h"

static void
hk_descriptor_state_fini(struct hk_cmd_buffer *cmd,
                         struct hk_descriptor_state *desc)
{
   struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd);

   for (unsigned i = 0; i < HK_MAX_SETS; i++) {
      vk_free(&pool->vk.alloc, desc->push[i]);
      desc->push[i] = NULL;
   }
}

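/* Release everything owned by a single recording: descriptor state, uploader
 * BO lists, recorded control streams, and any large dedicated BOs. Shared by
 * command buffer reset and destruction.
 */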
static void
hk_free_resettable_cmd_buffer(struct hk_cmd_buffer *cmd)
{
   struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd);
   struct hk_device *dev = hk_cmd_pool_device(pool);

   hk_descriptor_state_fini(cmd, &cmd->state.gfx.descriptors);
   hk_descriptor_state_fini(cmd, &cmd->state.cs.descriptors);

   hk_cmd_pool_free_bo_list(pool, &cmd->uploader.main.bos);
   hk_cmd_pool_free_usc_bo_list(pool, &cmd->uploader.usc.bos);

   list_for_each_entry_safe(struct hk_cs, it, &cmd->control_streams, node) {
      list_del(&it->node);
      hk_cs_destroy(it);
   }

   util_dynarray_foreach(&cmd->large_bos, struct agx_bo *, bo) {
      agx_bo_unreference(&dev->dev, *bo);
   }

   util_dynarray_clear(&cmd->large_bos);
}

static void
hk_destroy_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer)
{
   struct hk_cmd_buffer *cmd =
      container_of(vk_cmd_buffer, struct hk_cmd_buffer, vk);
   struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd);

   util_dynarray_fini(&cmd->large_bos);
   hk_free_resettable_cmd_buffer(cmd);
   vk_command_buffer_finish(&cmd->vk);
   vk_free(&pool->vk.alloc, cmd);
}

static VkResult
hk_create_cmd_buffer(struct vk_command_pool *vk_pool,
                     VkCommandBufferLevel level,
                     struct vk_command_buffer **cmd_buffer_out)
{
   struct hk_cmd_pool *pool = container_of(vk_pool, struct hk_cmd_pool, vk);
   struct hk_device *dev = hk_cmd_pool_device(pool);
   struct hk_cmd_buffer *cmd;
   VkResult result;

   cmd = vk_zalloc(&pool->vk.alloc, sizeof(*cmd), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cmd == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   result =
      vk_command_buffer_init(&pool->vk, &cmd->vk, &hk_cmd_buffer_ops, level);
   if (result != VK_SUCCESS) {
      vk_free(&pool->vk.alloc, cmd);
      return result;
   }

   util_dynarray_init(&cmd->large_bos, NULL);

   cmd->vk.dynamic_graphics_state.vi = &cmd->state.gfx._dynamic_vi;
   cmd->vk.dynamic_graphics_state.ms.sample_locations =
      &cmd->state.gfx._dynamic_sl;

   list_inithead(&cmd->uploader.main.bos);
   list_inithead(&cmd->uploader.usc.bos);
   list_inithead(&cmd->control_streams);

   *cmd_buffer_out = &cmd->vk;

   return VK_SUCCESS;
}

static void
hk_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
                    UNUSED VkCommandBufferResetFlags flags)
{
   struct hk_cmd_buffer *cmd =
      container_of(vk_cmd_buffer, struct hk_cmd_buffer, vk);

   vk_command_buffer_reset(&cmd->vk);
   hk_free_resettable_cmd_buffer(cmd);

   cmd->uploader.main.map = NULL;
   cmd->uploader.main.base = 0;
   cmd->uploader.main.offset = 0;
   cmd->uploader.usc.map = NULL;
   cmd->uploader.usc.base = 0;
   cmd->uploader.usc.offset = 0;

   cmd->current_cs.gfx = NULL;
   cmd->current_cs.cs = NULL;
   cmd->current_cs.post_gfx = NULL;
   cmd->current_cs.pre_gfx = NULL;

   /* TODO: clear pool! */

   memset(&cmd->state, 0, sizeof(cmd->state));
}

const struct vk_command_buffer_ops hk_cmd_buffer_ops = {
   .create = hk_create_cmd_buffer,
   .reset = hk_reset_cmd_buffer,
   .destroy = hk_destroy_cmd_buffer,
};

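/* Grab a BO from the command pool and track it on the command buffer's
 * uploader list (USC or main) so it can be reclaimed when the command buffer
 * is reset or destroyed.
 */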
static VkResult
hk_cmd_buffer_alloc_bo(struct hk_cmd_buffer *cmd, bool usc,
                       struct hk_cmd_bo **bo_out)
{
   VkResult result = hk_cmd_pool_alloc_bo(hk_cmd_buffer_pool(cmd), usc, bo_out);
   if (result != VK_SUCCESS)
      return result;

   if (usc)
      list_addtail(&(*bo_out)->link, &cmd->uploader.usc.bos);
   else
      list_addtail(&(*bo_out)->link, &cmd->uploader.main.bos);

   return VK_SUCCESS;
}

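/* Transient allocator for command buffer data. Small allocations are
 * sub-allocated from pooled BOs of HK_CMD_BO_SIZE bytes; anything larger gets
 * a dedicated BO that lives until the command buffer is reset or destroyed.
 * On allocation failure, the error is recorded on the command buffer and a
 * zeroed pointer pair is returned.
 */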
struct agx_ptr
hk_pool_alloc_internal(struct hk_cmd_buffer *cmd, uint32_t size,
                       uint32_t alignment, bool usc)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   struct hk_uploader *uploader =
      usc ? &cmd->uploader.usc : &cmd->uploader.main;

   /* Specially handle large allocations owned by the command buffer, e.g. used
    * for statically allocated vertex output buffers with geometry shaders.
    */
   if (size > HK_CMD_BO_SIZE) {
      uint32_t flags = usc ? AGX_BO_LOW_VA : 0;
      struct agx_bo *bo =
         agx_bo_create(&dev->dev, size, flags, 0, "Large pool allocation");

      util_dynarray_append(&cmd->large_bos, struct agx_bo *, bo);
      return (struct agx_ptr){
         .gpu = bo->va->addr,
         .cpu = agx_bo_map(bo),
      };
   }

   assert(size <= HK_CMD_BO_SIZE);
   assert(alignment > 0);

   uint32_t offset = align(uploader->offset, alignment);

   assert(offset <= HK_CMD_BO_SIZE);
   if (uploader->map != NULL && size <= HK_CMD_BO_SIZE - offset) {
      uploader->offset = offset + size;

      return (struct agx_ptr){
         .gpu = uploader->base + offset,
         .cpu = uploader->map + offset,
      };
   }

   struct hk_cmd_bo *bo;
   VkResult result = hk_cmd_buffer_alloc_bo(cmd, usc, &bo);
   if (unlikely(result != VK_SUCCESS)) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return (struct agx_ptr){0};
   }

   /* Pick whichever of the current upload BO and the new BO will have more
    * room left to be the BO for the next upload.  If our upload size is
    * bigger than the old offset, we're better off burning the whole new
    * upload BO on this one allocation and continuing on the current upload
    * BO.
    */
   if (uploader->map == NULL || size < uploader->offset) {
      uploader->map = agx_bo_map(bo->bo);
      uploader->base = bo->bo->va->addr;
      uploader->offset = size;
   }

   return (struct agx_ptr){
      .gpu = bo->bo->va->addr,
      .cpu = bo->map,
   };
}

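/* Convenience wrapper: allocate transient pool memory, copy `data` into it,
 * and return the GPU address (0 on failure). Hypothetical usage sketch,
 * staging a small constant block for the GPU to read later:
 *
 *    uint32_t params[2] = {base_vertex, base_instance};
 *    uint64_t va = hk_pool_upload(cmd, params, sizeof(params), 4);
 *    if (va) {
 *       ... reference va from the control stream or USC words ...
 *    }
 */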
uint64_t
hk_pool_upload(struct hk_cmd_buffer *cmd, const void *data, uint32_t size,
               uint32_t alignment)
{
   struct agx_ptr T = hk_pool_alloc(cmd, size, alignment);
   if (unlikely(T.cpu == NULL))
      return 0;

   memcpy(T.cpu, data, size);
   return T.gpu;
}

VKAPI_ATTR VkResult VKAPI_CALL
hk_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                      const VkCommandBufferBeginInfo *pBeginInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   hk_reset_cmd_buffer(&cmd->vk, 0);

   perf_debug(dev, "Begin command buffer");
   hk_cmd_buffer_begin_compute(cmd, pBeginInfo);
   hk_cmd_buffer_begin_graphics(cmd, pBeginInfo);

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
hk_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   assert(cmd->current_cs.gfx == NULL && cmd->current_cs.pre_gfx == NULL &&
          "must end rendering before ending the command buffer");

   perf_debug(dev, "End command buffer");
   hk_cmd_buffer_end_compute(cmd);
   hk_cmd_buffer_end_compute_internal(cmd, &cmd->current_cs.post_gfx);

   /* With rasterizer discard, we might end up with empty VDM batches.
    * It is difficult to avoid creating these empty batches, but it's easy to
    * optimize them out at record-time. Do so now.
    */
   list_for_each_entry_safe(struct hk_cs, cs, &cmd->control_streams, node) {
      if (cs->type == HK_CS_VDM && cs->stats.cmds == 0 &&
          !cs->cr.process_empty_tiles) {

         list_del(&cs->node);
         hk_cs_destroy(cs);
      }
   }

   return vk_command_buffer_get_record_result(&cmd->vk);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
                       const VkDependencyInfo *pDependencyInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   if (HK_PERF(dev, NOBARRIER))
      return;

   perf_debug(dev, "Pipeline barrier");

   /* The big hammer. We end both compute and graphics batches. Ending compute
    * here is necessary to properly handle graphics->compute dependencies.
    *
    * XXX: perf. */
   hk_cmd_buffer_end_compute(cmd);
   hk_cmd_buffer_end_graphics(cmd);
}

void
hk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count,
                    const gl_shader_stage *stages,
                    struct vk_shader **const shaders)
{
   struct hk_cmd_buffer *cmd = container_of(vk_cmd, struct hk_cmd_buffer, vk);

   for (uint32_t i = 0; i < stage_count; i++) {
      struct hk_api_shader *shader =
         container_of(shaders[i], struct hk_api_shader, vk);

      if (stages[i] == MESA_SHADER_COMPUTE || stages[i] == MESA_SHADER_KERNEL)
         hk_cmd_bind_compute_shader(cmd, shader);
      else
         hk_cmd_bind_graphics_shader(cmd, stages[i], shader);
   }
}

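/* Bind descriptor sets into the root table, tracking where each set's dynamic
 * buffers start. Worked example (hypothetical layouts): if set 0 declares two
 * dynamic buffers and set 1 declares one, set_dynamic_buffer_start becomes
 * {0, 2, 3, 3, ...}; rebinding set 1 with a compatible layout reuses start 2
 * and only the running total for sets numbered above it is recomputed.
 */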
static void
hk_bind_descriptor_sets(UNUSED struct hk_cmd_buffer *cmd,
                        struct hk_descriptor_state *desc,
                        const VkBindDescriptorSetsInfoKHR *info)
{
   VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, info->layout);

   /* From the Vulkan 1.3.275 spec:
    *
    *    "When binding a descriptor set (see Descriptor Set Binding) to
    *    set number N...
    *
    *    If, additionally, the previously bound descriptor set for set
    *    N was bound using a pipeline layout not compatible for set N,
    *    then all bindings in sets numbered greater than N are
    *    disturbed."
    *
    * This means that, if some earlier set gets bound in such a way that
    * it changes set_dynamic_buffer_start[s], this binding is implicitly
    * invalidated.  Therefore, we can always look at the current value
    * of set_dynamic_buffer_start[s] as the base of our dynamic buffer
    * range and it's only our responsibility to adjust all
    * set_dynamic_buffer_start[p] for p > s as needed.
    */
   uint8_t dyn_buffer_start =
      desc->root.set_dynamic_buffer_start[info->firstSet];

   uint32_t next_dyn_offset = 0;
   for (uint32_t i = 0; i < info->descriptorSetCount; ++i) {
      unsigned s = i + info->firstSet;
      VK_FROM_HANDLE(hk_descriptor_set, set, info->pDescriptorSets[i]);

      if (desc->sets[s] != set) {
         if (set != NULL) {
            desc->root.sets[s] = hk_descriptor_set_addr(set);
            desc->set_sizes[s] = set->size;
         } else {
            desc->root.sets[s] = 0;
            desc->set_sizes[s] = 0;
         }
         desc->sets[s] = set;
         desc->sets_dirty |= BITFIELD_BIT(s);

         /* Binding descriptors invalidates push descriptors */
         desc->push_dirty &= ~BITFIELD_BIT(s);
      }

      desc->root.set_dynamic_buffer_start[s] = dyn_buffer_start;

      if (pipeline_layout->set_layouts[s] != NULL) {
         const struct hk_descriptor_set_layout *set_layout =
            vk_to_hk_descriptor_set_layout(pipeline_layout->set_layouts[s]);

         if (set != NULL && set_layout->dynamic_buffer_count > 0) {
            for (uint32_t j = 0; j < set_layout->dynamic_buffer_count; j++) {
               struct hk_buffer_address addr = set->dynamic_buffers[j];
               addr.base_addr += info->pDynamicOffsets[next_dyn_offset + j];
               desc->root.dynamic_buffers[dyn_buffer_start + j] = addr;
            }
            next_dyn_offset += set->layout->dynamic_buffer_count;
         }

         dyn_buffer_start += set_layout->dynamic_buffer_count;
      } else {
         assert(set == NULL);
      }
   }
   assert(dyn_buffer_start <= HK_MAX_DYNAMIC_BUFFERS);
   assert(next_dyn_offset <= info->dynamicOffsetCount);

   for (uint32_t s = info->firstSet + info->descriptorSetCount; s < HK_MAX_SETS;
        s++)
      desc->root.set_dynamic_buffer_start[s] = dyn_buffer_start;

   desc->root_dirty = true;
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdBindDescriptorSets2KHR(
   VkCommandBuffer commandBuffer,
   const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) {
      hk_bind_descriptor_sets(cmd, &cmd->state.gfx.descriptors,
                              pBindDescriptorSetsInfo);
   }

   if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
      hk_bind_descriptor_sets(cmd, &cmd->state.cs.descriptors,
                              pBindDescriptorSetsInfo);
   }
}

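/* Push constants live inside the root descriptor table, so updating them is
 * just a memcpy into the CPU-side copy plus marking the root dirty; the new
 * contents reach the GPU the next time the root table is uploaded.
 */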
static void
hk_push_constants(UNUSED struct hk_cmd_buffer *cmd,
                  struct hk_descriptor_state *desc,
                  const VkPushConstantsInfoKHR *info)
{
   memcpy(desc->root.push + info->offset, info->pValues, info->size);
   desc->root_dirty = true;
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdPushConstants2KHR(VkCommandBuffer commandBuffer,
                        const VkPushConstantsInfoKHR *pPushConstantsInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS)
      hk_push_constants(cmd, &cmd->state.gfx.descriptors, pPushConstantsInfo);

   if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT)
      hk_push_constants(cmd, &cmd->state.cs.descriptors, pPushConstantsInfo);
}

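/* Return the host-side storage for push descriptors of the given set index,
 * allocating it lazily on first use. Pushing descriptors unbinds whatever
 * descriptor set was previously bound at that index and marks the push set
 * dirty so it gets uploaded on the next flush.
 */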
static struct hk_push_descriptor_set *
hk_cmd_push_descriptors(struct hk_cmd_buffer *cmd,
                        struct hk_descriptor_state *desc, uint32_t set)
{
   assert(set < HK_MAX_SETS);
   if (unlikely(desc->push[set] == NULL)) {
      desc->push[set] =
         vk_zalloc(&cmd->vk.pool->alloc, sizeof(*desc->push[set]), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
      if (unlikely(desc->push[set] == NULL)) {
         vk_command_buffer_set_error(&cmd->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
         return NULL;
      }
   }

   /* Pushing descriptors replaces whatever sets are bound */
   desc->sets[set] = NULL;
   desc->push_dirty |= BITFIELD_BIT(set);

   return desc->push[set];
}

static void
hk_push_descriptor_set(struct hk_cmd_buffer *cmd,
                       struct hk_descriptor_state *desc,
                       const VkPushDescriptorSetInfoKHR *info)
{
   VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, info->layout);

   struct hk_push_descriptor_set *push_set =
      hk_cmd_push_descriptors(cmd, desc, info->set);
   if (unlikely(push_set == NULL))
      return;

   struct hk_descriptor_set_layout *set_layout =
      vk_to_hk_descriptor_set_layout(pipeline_layout->set_layouts[info->set]);

   hk_push_descriptor_set_update(push_set, set_layout,
                                 info->descriptorWriteCount,
                                 info->pDescriptorWrites);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdPushDescriptorSet2KHR(
   VkCommandBuffer commandBuffer,
   const VkPushDescriptorSetInfoKHR *pPushDescriptorSetInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) {
      hk_push_descriptor_set(cmd, &cmd->state.gfx.descriptors,
                             pPushDescriptorSetInfo);
   }

   if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
      hk_push_descriptor_set(cmd, &cmd->state.cs.descriptors,
                             pPushDescriptorSetInfo);
   }
}

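/* Upload every dirty push descriptor set into transient pool memory and point
 * the root table's set address/size at the fresh copy, then clear the push
 * dirty mask and mark the root table itself dirty for re-upload.
 */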
void
hk_cmd_buffer_flush_push_descriptors(struct hk_cmd_buffer *cmd,
                                     struct hk_descriptor_state *desc)
{
   u_foreach_bit(set_idx, desc->push_dirty) {
      struct hk_push_descriptor_set *push_set = desc->push[set_idx];
      uint64_t push_set_addr = hk_pool_upload(
         cmd, push_set->data, sizeof(push_set->data), HK_MIN_UBO_ALIGNMENT);

      desc->root.sets[set_idx] = push_set_addr;
      desc->set_sizes[set_idx] = sizeof(push_set->data);
   }

   desc->root_dirty = true;
   desc->push_dirty = 0;
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdPushDescriptorSetWithTemplate2KHR(
   VkCommandBuffer commandBuffer, const VkPushDescriptorSetWithTemplateInfoKHR
                                     *pPushDescriptorSetWithTemplateInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(vk_descriptor_update_template, template,
                  pPushDescriptorSetWithTemplateInfo->descriptorUpdateTemplate);
   VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout,
                  pPushDescriptorSetWithTemplateInfo->layout);

   struct hk_descriptor_state *desc =
      hk_get_descriptors_state(cmd, template->bind_point);
   struct hk_push_descriptor_set *push_set = hk_cmd_push_descriptors(
      cmd, desc, pPushDescriptorSetWithTemplateInfo->set);
   if (unlikely(push_set == NULL))
      return;

   struct hk_descriptor_set_layout *set_layout = vk_to_hk_descriptor_set_layout(
      pipeline_layout->set_layouts[pPushDescriptorSetWithTemplateInfo->set]);

   hk_push_descriptor_set_update_template(
      push_set, set_layout, template,
      pPushDescriptorSetWithTemplateInfo->pData);
}

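/* Snapshot the CPU-side root descriptor table into transient pool memory and
 * return its GPU address (0 on failure). The table's first field is its own
 * GPU address, so it is patched before the copy; edits made to the CPU copy
 * afterwards do not affect snapshots that were already uploaded.
 */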
uint64_t
hk_cmd_buffer_upload_root(struct hk_cmd_buffer *cmd,
                          VkPipelineBindPoint bind_point)
{
   struct hk_descriptor_state *desc = hk_get_descriptors_state(cmd, bind_point);
   struct hk_root_descriptor_table *root = &desc->root;

   struct agx_ptr root_ptr = hk_pool_alloc(cmd, sizeof(*root), 8);
   if (!root_ptr.gpu)
      return 0;

   root->root_desc_addr = root_ptr.gpu;

   memcpy(root_ptr.cpu, root, sizeof(*root));
   return root_ptr.gpu;
}

void
hk_usc_upload_spilled_rt_descs(struct agx_usc_builder *b,
                               struct hk_cmd_buffer *cmd)
{
   struct hk_rendering_state *render = &cmd->state.gfx.render;

   /* Upload texture/PBE descriptors for each render target so we can clear
    * spilled render targets.
    */
   struct agx_ptr descs =
      hk_pool_alloc(cmd, AGX_TEXTURE_LENGTH * 2 * render->color_att_count, 64);
   struct agx_texture_packed *desc = descs.cpu;
   if (!desc)
      return;

   for (unsigned i = 0; i < render->color_att_count; ++i) {
      struct hk_image_view *iview = render->color_att[i].iview;
      if (!iview) {
         /* XXX: probably should emit a null descriptor here...? */
         continue;
      }

      memcpy(&desc[(i * 2) + 0], &iview->planes[0].emrt_texture, sizeof(*desc));
      memcpy(&desc[(i * 2) + 1], &iview->planes[0].emrt_pbe, sizeof(*desc));
   }

   desc = descs.cpu;

   /* Bind the base as u0_u1 for bindless access */
   agx_usc_uniform(b, 0, 4, hk_pool_upload(cmd, &descs.gpu, 8, 8));
}

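/* Reserve device scratch space for a shader's spills, keyed by hardware
 * stage, and record on the control stream which stages need main and
 * preamble scratch.
 */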
void
hk_reserve_scratch(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
                   struct hk_shader *s)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   uint32_t max_scratch_size =
      MAX2(s->b.info.scratch_size, s->b.info.preamble_scratch_size);

   if (max_scratch_size == 0)
      return;

   unsigned preamble_size = (s->b.info.preamble_scratch_size > 0) ? 1 : 0;

   /* Note: this uses the hardware stage, not the software stage */
   hk_device_alloc_scratch(dev, s->b.info.stage, max_scratch_size);
   perf_debug(dev, "Reserving %u (%u) bytes of scratch for stage %s",
              s->b.info.scratch_size, s->b.info.preamble_scratch_size,
              _mesa_shader_stage_to_abbrev(s->b.info.stage));

   switch (s->b.info.stage) {
   case PIPE_SHADER_FRAGMENT:
      cs->scratch.fs.main = true;
      cs->scratch.fs.preamble = MAX2(cs->scratch.fs.preamble, preamble_size);
      break;
   case PIPE_SHADER_VERTEX:
      cs->scratch.vs.main = true;
      cs->scratch.vs.preamble = MAX2(cs->scratch.vs.preamble, preamble_size);
      break;
   default:
      cs->scratch.cs.main = true;
      cs->scratch.cs.preamble = MAX2(cs->scratch.cs.preamble, preamble_size);
      break;
   }
}

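/* Build the USC (shader setup) words for a shader: bind the root descriptor
 * table uniform, add stage-specific uniforms (vertex attribute bases/clamps,
 * draw parameters, fragment blend constants and tilebuffer layout), then
 * append the shader's pre-linked USC blob. Returns the USC address, or 0 if
 * the transient allocation failed.
 */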
uint32_t
hk_upload_usc_words(struct hk_cmd_buffer *cmd, struct hk_shader *s,
                    struct hk_linked_shader *linked)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   enum pipe_shader_type sw_stage = s->info.stage;

   unsigned constant_push_ranges = DIV_ROUND_UP(s->b.info.rodata.size_16, 64);
   unsigned push_ranges = 2;
   unsigned stage_ranges = 3;

   size_t usc_size =
      agx_usc_size(constant_push_ranges + push_ranges + stage_ranges + 4);
   struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64);
   if (!t.cpu)
      return 0;

   struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);

   uint64_t root_ptr;

   if (sw_stage == PIPE_SHADER_COMPUTE)
      root_ptr = hk_cmd_buffer_upload_root(cmd, VK_PIPELINE_BIND_POINT_COMPUTE);
   else
      root_ptr = cmd->state.gfx.root;

   static_assert(offsetof(struct hk_root_descriptor_table, root_desc_addr) == 0,
                 "self-reflective");

   agx_usc_uniform(&b, HK_ROOT_UNIFORM, 4, root_ptr);

   if (sw_stage == MESA_SHADER_VERTEX) {
      unsigned count =
         DIV_ROUND_UP(BITSET_LAST_BIT(s->info.vs.attrib_components_read), 4);

      if (count) {
         agx_usc_uniform(
            &b, 0, 4 * count,
            root_ptr + hk_root_descriptor_offset(draw.attrib_base));

         agx_usc_uniform(
            &b, 4 * count, 2 * count,
            root_ptr + hk_root_descriptor_offset(draw.attrib_clamps));
      }

      if (cmd->state.gfx.draw_params)
         agx_usc_uniform(&b, 6 * count, 4, cmd->state.gfx.draw_params);

      if (cmd->state.gfx.draw_id_ptr)
         agx_usc_uniform(&b, (6 * count) + 4, 1, cmd->state.gfx.draw_id_ptr);

      if (linked->sw_indexing) {
         agx_usc_uniform(
            &b, (6 * count) + 8, 4,
            root_ptr + hk_root_descriptor_offset(draw.input_assembly));
      }
   } else if (sw_stage == MESA_SHADER_FRAGMENT) {
      if (agx_tilebuffer_spills(&cmd->state.gfx.render.tilebuffer)) {
         hk_usc_upload_spilled_rt_descs(&b, cmd);
      }

      agx_usc_uniform(
         &b, 4, 8, root_ptr + hk_root_descriptor_offset(draw.blend_constant));

      /* The SHARED state is baked into linked->usc for non-fragment shaders. We
       * don't pass around the information to bake the tilebuffer layout.
       *
       * TODO: We probably could with some refactor.
       */
      agx_usc_push_packed(&b, SHARED, &cmd->state.gfx.render.tilebuffer.usc);
   }

   agx_usc_push_blob(&b, linked->usc.data, linked->usc.size);
   return agx_usc_addr(&dev->dev, t.gpu);
}

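/* Dispatch one of the precompiled internal (libagx) compute programs on the
 * given control stream. The caller's argument block is uploaded to transient
 * memory and passed to the program through its USC words.
 */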
void
hk_dispatch_precomp(struct hk_cs *cs, struct agx_grid grid,
                    enum agx_barrier barrier, enum libagx_program idx,
                    void *data, size_t data_size)
{
   struct hk_device *dev = hk_cmd_buffer_device(cs->cmd);
   struct agx_precompiled_shader *prog = agx_get_precompiled(&dev->bg_eot, idx);

   struct agx_ptr t = hk_pool_usc_alloc(cs->cmd, agx_usc_size(15), 64);
   uint64_t uploaded_data = hk_pool_upload(cs->cmd, data, data_size, 4);

   agx_usc_words_precomp(t.cpu, &prog->b, uploaded_data, data_size);

   hk_dispatch_with_usc_launch(dev, cs, prog->b.launch,
                               agx_usc_addr(&dev->dev, t.gpu), grid,
                               prog->b.workgroup);
}

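/* Initialize a fresh VDM control stream for graphics: inherit the render
 * pass's tilebuffer and control registers (defaulting to the partial
 * background/EOT programs and partial ZLS control until Begin/EndRendering
 * overrides them), emit an initial VDM barrier and baseline PPP state, and
 * dirty all graphics state so it is re-emitted into this stream.
 */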
void
hk_cs_init_graphics(struct hk_cmd_buffer *cmd, struct hk_cs *cs)
{
   struct hk_rendering_state *render = &cmd->state.gfx.render;
   uint8_t *map = cs->current;

   cs->tib = render->tilebuffer;

   /* Assume this is not the first control stream of the render pass, so
    * initially use the partial background/EOT program and ZLS control.
    * hk_BeginRendering/hk_EndRendering will override.
    */
   cs->cr = render->cr;
   cs->cr.bg.main = render->cr.bg.partial;
   cs->cr.eot.main = render->cr.eot.partial;
   cs->cr.zls_control = render->cr.zls_control_partial;

   /* Barrier to enforce GPU-CPU coherency, in case this batch is back to back
    * with another that caused stale data to be cached and the CPU wrote to it
    * in the meantime.
    */
   agx_push(map, VDM_BARRIER, cfg) {
      cfg.usc_cache_inval = true;
   }

   struct AGX_PPP_HEADER present = {
      .w_clamp = true,
      .occlusion_query_2 = true,
      .output_unknown = true,
      .varying_word_2 = true,
      .viewport_count = 1, /* irrelevant */
   };

   size_t size = agx_ppp_update_size(&present);
   struct agx_ptr T = hk_pool_alloc(cmd, size, 64);
   if (!T.cpu)
      return;

   struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present);

   /* clang-format off */
   agx_ppp_push(&ppp, W_CLAMP, cfg) cfg.w_clamp = 1e-10;
   agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY_2, cfg);
   agx_ppp_push(&ppp, OUTPUT_UNKNOWN, cfg);
   agx_ppp_push(&ppp, VARYING_2, cfg);
   /* clang-format on */

   agx_ppp_fini(&map, &ppp);
   cs->current = map;

   util_dynarray_init(&cs->scissor, NULL);
   util_dynarray_init(&cs->depth_bias, NULL);

   /* All graphics state must be re-emitted in each control stream */
   hk_cmd_buffer_dirty_all(cmd);
}

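/* Ensure the control stream has `space` bytes free, always keeping headroom
 * for a stream link plus padding. If the current chunk is too small, allocate
 * a new 64 KiB chunk from the command buffer pool and emit a jump from the
 * old stream to the new one.
 */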
void
hk_ensure_cs_has_space(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
                       size_t space)
{
   bool vdm = cs->type == HK_CS_VDM;

   size_t link_length =
      vdm ? AGX_VDM_STREAM_LINK_LENGTH : AGX_CDM_STREAM_LINK_LENGTH;

   /* Assert that we have space for a link tag */
   assert((cs->current + link_length) <= cs->end && "Encoder overflowed");

   /* Always leave room for a link tag, in case we run out of space later,
    * plus padding because VDM apparently overreads?
    *
    * 0x200 is not enough. 0x400 seems to work. 0x800 for safety.
    */
   space += link_length + 0x800;

   /* If there is room in the command buffer, we're done */
   if (likely((cs->end - cs->current) >= space))
      return;

   /* Otherwise, we need to allocate a new command buffer. We use memory owned
    * by the batch to simplify lifetime management for the BO.
    */
   size_t size = 65536;
   struct agx_ptr T = hk_pool_alloc(cmd, size, 256);

   /* Jump from the old control stream to the new control stream */
   agx_cs_jump(cs->current, T.gpu, vdm);

   /* Swap out the control stream */
   cs->current = T.cpu;
   cs->end = cs->current + size;
   cs->chunk = T;
   cs->stream_linked = true;
}