/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "hk_cmd_buffer.h"

#include "agx_bo.h"
#include "agx_device.h"
#include "agx_linker.h"
#include "agx_tilebuffer.h"
#include "agx_usc.h"
#include "hk_buffer.h"
#include "hk_cmd_pool.h"
#include "hk_descriptor_set.h"
#include "hk_descriptor_set_layout.h"
#include "hk_device.h"
#include "hk_device_memory.h"
#include "hk_entrypoints.h"
#include "hk_image_view.h"
#include "hk_physical_device.h"
#include "hk_shader.h"

#include "libagx_dgc.h"
#include "pool.h"
#include "shader_enums.h"
#include "vk_pipeline_layout.h"
#include "vk_synchronization.h"

#include "util/list.h"
#include "util/macros.h"
#include "util/u_dynarray.h"
#include "vulkan/vulkan_core.h"

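/* Release the push descriptor set storage owned by a descriptor state. The
 * backing memory comes from the command pool's allocator, so it is returned
 * there rather than to the device.
 */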
static void
hk_descriptor_state_fini(struct hk_cmd_buffer *cmd,
                         struct hk_descriptor_state *desc)
{
   struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd);

   for (unsigned i = 0; i < HK_MAX_SETS; i++) {
      vk_free(&pool->vk.alloc, desc->push[i]);
      desc->push[i] = NULL;
   }
}

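/* Drop everything that must not survive a reset: descriptor state, uploader
 * BOs, recorded control streams, and any large BOs owned directly by the
 * command buffer.
 */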
static void
hk_free_resettable_cmd_buffer(struct hk_cmd_buffer *cmd)
{
   struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd);
   struct hk_device *dev = hk_cmd_pool_device(pool);

   hk_descriptor_state_fini(cmd, &cmd->state.gfx.descriptors);
   hk_descriptor_state_fini(cmd, &cmd->state.cs.descriptors);

   hk_cmd_pool_free_bo_list(pool, &cmd->uploader.main.bos);
   hk_cmd_pool_free_usc_bo_list(pool, &cmd->uploader.usc.bos);

   list_for_each_entry_safe(struct hk_cs, it, &cmd->control_streams, node) {
      list_del(&it->node);
      hk_cs_destroy(it);
   }

   util_dynarray_foreach(&cmd->large_bos, struct agx_bo *, bo) {
      agx_bo_unreference(&dev->dev, *bo);
   }

   util_dynarray_clear(&cmd->large_bos);
}

static void
hk_destroy_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer)
{
   struct hk_cmd_buffer *cmd =
      container_of(vk_cmd_buffer, struct hk_cmd_buffer, vk);
   struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd);

   /* Unreference the large BOs before tearing down the dynarray that tracks
    * them, otherwise they would leak.
    */
   hk_free_resettable_cmd_buffer(cmd);
   util_dynarray_fini(&cmd->large_bos);
   vk_command_buffer_finish(&cmd->vk);
   vk_free(&pool->vk.alloc, cmd);
}

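/* vk_command_buffer_ops::create. Allocates the command buffer from the pool
 * allocator and wires up dynamic graphics state storage, the uploader BO
 * lists, and the control stream list.
 */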
static VkResult
hk_create_cmd_buffer(struct vk_command_pool *vk_pool,
                     VkCommandBufferLevel level,
                     struct vk_command_buffer **cmd_buffer_out)
{
   struct hk_cmd_pool *pool = container_of(vk_pool, struct hk_cmd_pool, vk);
   struct hk_device *dev = hk_cmd_pool_device(pool);
   struct hk_cmd_buffer *cmd;
   VkResult result;

   cmd = vk_zalloc(&pool->vk.alloc, sizeof(*cmd), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cmd == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   result =
      vk_command_buffer_init(&pool->vk, &cmd->vk, &hk_cmd_buffer_ops, level);
   if (result != VK_SUCCESS) {
      vk_free(&pool->vk.alloc, cmd);
      return result;
   }

   util_dynarray_init(&cmd->large_bos, NULL);

   cmd->vk.dynamic_graphics_state.vi = &cmd->state.gfx._dynamic_vi;
   cmd->vk.dynamic_graphics_state.ms.sample_locations =
      &cmd->state.gfx._dynamic_sl;

   list_inithead(&cmd->uploader.main.bos);
   list_inithead(&cmd->uploader.usc.bos);
   list_inithead(&cmd->control_streams);

   *cmd_buffer_out = &cmd->vk;

   return VK_SUCCESS;
}

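/* vk_command_buffer_ops::reset. Frees resettable resources and clears the
 * uploader cursors, the current control stream pointers, and all recorded
 * command buffer state.
 */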
static void
hk_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
                    UNUSED VkCommandBufferResetFlags flags)
{
   struct hk_cmd_buffer *cmd =
      container_of(vk_cmd_buffer, struct hk_cmd_buffer, vk);

   vk_command_buffer_reset(&cmd->vk);
   hk_free_resettable_cmd_buffer(cmd);

   cmd->uploader.main.map = NULL;
   cmd->uploader.main.base = 0;
   cmd->uploader.main.offset = 0;
   cmd->uploader.usc.map = NULL;
   cmd->uploader.usc.base = 0;
   cmd->uploader.usc.offset = 0;

   cmd->current_cs.gfx = NULL;
   cmd->current_cs.cs = NULL;
   cmd->current_cs.post_gfx = NULL;
   cmd->current_cs.pre_gfx = NULL;

   /* TODO: clear pool! */

   memset(&cmd->state, 0, sizeof(cmd->state));
}

const struct vk_command_buffer_ops hk_cmd_buffer_ops = {
   .create = hk_create_cmd_buffer,
   .reset = hk_reset_cmd_buffer,
   .destroy = hk_destroy_cmd_buffer,
};

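/* Allocate a BO from the command pool and track it on the command buffer's
 * main or USC uploader list, so it is returned to the pool on reset/destroy.
 */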
static VkResult
hk_cmd_buffer_alloc_bo(struct hk_cmd_buffer *cmd, bool usc,
                       struct hk_cmd_bo **bo_out)
{
   VkResult result = hk_cmd_pool_alloc_bo(hk_cmd_buffer_pool(cmd), usc, bo_out);
   if (result != VK_SUCCESS)
      return result;

   if (usc)
      list_addtail(&(*bo_out)->link, &cmd->uploader.usc.bos);
   else
      list_addtail(&(*bo_out)->link, &cmd->uploader.main.bos);

   return VK_SUCCESS;
}

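/* Transient allocator for the command buffer. Small allocations are
 * bump-allocated out of fixed-size pool BOs; allocations larger than
 * HK_CMD_BO_SIZE get a dedicated BO owned by the command buffer. If a pool BO
 * cannot be allocated, the error is recorded on the command buffer and a
 * zeroed pointer pair is returned.
 */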
struct agx_ptr
hk_pool_alloc_internal(struct hk_cmd_buffer *cmd, uint32_t size,
                       uint32_t alignment, bool usc)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   struct hk_uploader *uploader =
      usc ? &cmd->uploader.usc : &cmd->uploader.main;

   /* Specially handle large allocations owned by the command buffer, e.g. used
    * for statically allocated vertex output buffers with geometry shaders.
    */
   if (size > HK_CMD_BO_SIZE) {
      uint32_t flags = usc ? AGX_BO_LOW_VA : 0;
      struct agx_bo *bo =
         agx_bo_create(&dev->dev, size, flags, 0, "Large pool allocation");

      util_dynarray_append(&cmd->large_bos, struct agx_bo *, bo);
      return (struct agx_ptr){
         .gpu = bo->va->addr,
         .cpu = agx_bo_map(bo),
      };
   }

   assert(size <= HK_CMD_BO_SIZE);
   assert(alignment > 0);

   uint32_t offset = align(uploader->offset, alignment);

   assert(offset <= HK_CMD_BO_SIZE);
   if (uploader->map != NULL && size <= HK_CMD_BO_SIZE - offset) {
      uploader->offset = offset + size;

      return (struct agx_ptr){
         .gpu = uploader->base + offset,
         .cpu = uploader->map + offset,
      };
   }

   struct hk_cmd_bo *bo;
   VkResult result = hk_cmd_buffer_alloc_bo(cmd, usc, &bo);
   if (unlikely(result != VK_SUCCESS)) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return (struct agx_ptr){0};
   }

   /* Pick whichever of the current upload BO and the new BO will have more
    * room left to be the BO for the next upload. If our upload size is
    * bigger than the old offset, we're better off burning the whole new
    * upload BO on this one allocation and continuing on the current upload
    * BO.
    */
   if (uploader->map == NULL || size < uploader->offset) {
      uploader->map = agx_bo_map(bo->bo);
      uploader->base = bo->bo->va->addr;
      uploader->offset = size;
   }

   return (struct agx_ptr){
      .gpu = bo->bo->va->addr,
      .cpu = bo->map,
   };
}

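/* Copy data into freshly allocated transient memory and return its GPU
 * address, or 0 if the allocation failed.
 */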
uint64_t
hk_pool_upload(struct hk_cmd_buffer *cmd, const void *data, uint32_t size,
               uint32_t alignment)
{
   struct agx_ptr T = hk_pool_alloc(cmd, size, alignment);
   if (unlikely(T.cpu == NULL))
      return 0;

   memcpy(T.cpu, data, size);
   return T.gpu;
}

VKAPI_ATTR VkResult VKAPI_CALL
hk_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                      const VkCommandBufferBeginInfo *pBeginInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   hk_reset_cmd_buffer(&cmd->vk, 0);

   perf_debug(dev, "Begin command buffer");
   hk_cmd_buffer_begin_compute(cmd, pBeginInfo);
   hk_cmd_buffer_begin_graphics(cmd, pBeginInfo);

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
hk_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   assert(cmd->current_cs.gfx == NULL && cmd->current_cs.pre_gfx == NULL &&
          "must end rendering before ending the command buffer");

   perf_debug(dev, "End command buffer");
   hk_cmd_buffer_end_compute(cmd);
   hk_cmd_buffer_end_compute_internal(cmd, &cmd->current_cs.post_gfx);

   /* With rasterizer discard, we might end up with empty VDM batches.
    * It is difficult to avoid creating these empty batches, but it's easy to
    * optimize them out at record-time. Do so now.
    */
   list_for_each_entry_safe(struct hk_cs, cs, &cmd->control_streams, node) {
      if (cs->type == HK_CS_VDM && cs->stats.cmds == 0 &&
          !cs->cr.process_empty_tiles) {

         list_del(&cs->node);
         hk_cs_destroy(cs);
      }
   }

   return vk_command_buffer_get_record_result(&cmd->vk);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
                       const VkDependencyInfo *pDependencyInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   if (HK_PERF(dev, NOBARRIER))
      return;

   perf_debug(dev, "Pipeline barrier");

   /* The big hammer. We end both compute and graphics batches. Ending compute
    * here is necessary to properly handle graphics->compute dependencies.
    *
    * XXX: perf. */
   hk_cmd_buffer_end_compute(cmd);
   hk_cmd_buffer_end_graphics(cmd);
}

void
hk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count,
                    const gl_shader_stage *stages,
                    struct vk_shader **const shaders)
{
   struct hk_cmd_buffer *cmd = container_of(vk_cmd, struct hk_cmd_buffer, vk);

   for (uint32_t i = 0; i < stage_count; i++) {
      struct hk_api_shader *shader =
         container_of(shaders[i], struct hk_api_shader, vk);

      if (stages[i] == MESA_SHADER_COMPUTE || stages[i] == MESA_SHADER_KERNEL)
         hk_cmd_bind_compute_shader(cmd, shader);
      else
         hk_cmd_bind_graphics_shader(cmd, stages[i], shader);
   }
}

static void
hk_bind_descriptor_sets(UNUSED struct hk_cmd_buffer *cmd,
                        struct hk_descriptor_state *desc,
                        const VkBindDescriptorSetsInfoKHR *info)
{
   VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, info->layout);

   /* From the Vulkan 1.3.275 spec:
    *
    *    "When binding a descriptor set (see Descriptor Set Binding) to
    *    set number N...
    *
    *    If, additionally, the previously bound descriptor set for set
    *    N was bound using a pipeline layout not compatible for set N,
    *    then all bindings in sets numbered greater than N are
    *    disturbed."
    *
    * This means that, if some earlier set gets bound in such a way that
    * it changes set_dynamic_buffer_start[s], this binding is implicitly
    * invalidated. Therefore, we can always look at the current value
    * of set_dynamic_buffer_start[s] as the base of our dynamic buffer
    * range and it's only our responsibility to adjust all
    * set_dynamic_buffer_start[p] for p > s as needed.
    */
   uint8_t dyn_buffer_start =
      desc->root.set_dynamic_buffer_start[info->firstSet];

   uint32_t next_dyn_offset = 0;
   for (uint32_t i = 0; i < info->descriptorSetCount; ++i) {
      unsigned s = i + info->firstSet;
      VK_FROM_HANDLE(hk_descriptor_set, set, info->pDescriptorSets[i]);

      if (desc->sets[s] != set) {
         if (set != NULL) {
            desc->root.sets[s] = hk_descriptor_set_addr(set);
            desc->set_sizes[s] = set->size;
         } else {
            desc->root.sets[s] = 0;
            desc->set_sizes[s] = 0;
         }
         desc->sets[s] = set;
         desc->sets_dirty |= BITFIELD_BIT(s);

         /* Binding descriptors invalidates push descriptors */
         desc->push_dirty &= ~BITFIELD_BIT(s);
      }

      desc->root.set_dynamic_buffer_start[s] = dyn_buffer_start;

      if (pipeline_layout->set_layouts[s] != NULL) {
         const struct hk_descriptor_set_layout *set_layout =
            vk_to_hk_descriptor_set_layout(pipeline_layout->set_layouts[s]);

         if (set != NULL && set_layout->dynamic_buffer_count > 0) {
            for (uint32_t j = 0; j < set_layout->dynamic_buffer_count; j++) {
               struct hk_buffer_address addr = set->dynamic_buffers[j];
               addr.base_addr += info->pDynamicOffsets[next_dyn_offset + j];
               desc->root.dynamic_buffers[dyn_buffer_start + j] = addr;
            }
            next_dyn_offset += set->layout->dynamic_buffer_count;
         }

         dyn_buffer_start += set_layout->dynamic_buffer_count;
      } else {
         assert(set == NULL);
      }
   }
   assert(dyn_buffer_start <= HK_MAX_DYNAMIC_BUFFERS);
   assert(next_dyn_offset <= info->dynamicOffsetCount);

   for (uint32_t s = info->firstSet + info->descriptorSetCount; s < HK_MAX_SETS;
        s++)
      desc->root.set_dynamic_buffer_start[s] = dyn_buffer_start;

   desc->root_dirty = true;
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdBindDescriptorSets2KHR(
   VkCommandBuffer commandBuffer,
   const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) {
      hk_bind_descriptor_sets(cmd, &cmd->state.gfx.descriptors,
                              pBindDescriptorSetsInfo);
   }

   if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
      hk_bind_descriptor_sets(cmd, &cmd->state.cs.descriptors,
                              pBindDescriptorSetsInfo);
   }
}

static void
hk_push_constants(UNUSED struct hk_cmd_buffer *cmd,
                  struct hk_descriptor_state *desc,
                  const VkPushConstantsInfoKHR *info)
{
   memcpy(desc->root.push + info->offset, info->pValues, info->size);
   desc->root_dirty = true;
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdPushConstants2KHR(VkCommandBuffer commandBuffer,
                        const VkPushConstantsInfoKHR *pPushConstantsInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS)
      hk_push_constants(cmd, &cmd->state.gfx.descriptors, pPushConstantsInfo);

   if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT)
      hk_push_constants(cmd, &cmd->state.cs.descriptors, pPushConstantsInfo);
}

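/* Lazily allocate storage for a push descriptor set and mark it dirty. Since
 * push descriptors replace whatever set was previously bound at this index,
 * the bound set pointer is cleared.
 */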
static struct hk_push_descriptor_set *
hk_cmd_push_descriptors(struct hk_cmd_buffer *cmd,
                        struct hk_descriptor_state *desc, uint32_t set)
{
   assert(set < HK_MAX_SETS);
   if (unlikely(desc->push[set] == NULL)) {
      desc->push[set] =
         vk_zalloc(&cmd->vk.pool->alloc, sizeof(*desc->push[set]), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
      if (unlikely(desc->push[set] == NULL)) {
         vk_command_buffer_set_error(&cmd->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
         return NULL;
      }
   }

   /* Pushing descriptors replaces whatever sets are bound */
   desc->sets[set] = NULL;
   desc->push_dirty |= BITFIELD_BIT(set);

   return desc->push[set];
}

static void
hk_push_descriptor_set(struct hk_cmd_buffer *cmd,
                       struct hk_descriptor_state *desc,
                       const VkPushDescriptorSetInfoKHR *info)
{
   VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, info->layout);

   struct hk_push_descriptor_set *push_set =
      hk_cmd_push_descriptors(cmd, desc, info->set);
   if (unlikely(push_set == NULL))
      return;

   struct hk_descriptor_set_layout *set_layout =
      vk_to_hk_descriptor_set_layout(pipeline_layout->set_layouts[info->set]);

   hk_push_descriptor_set_update(push_set, set_layout,
                                 info->descriptorWriteCount,
                                 info->pDescriptorWrites);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdPushDescriptorSet2KHR(
   VkCommandBuffer commandBuffer,
   const VkPushDescriptorSetInfoKHR *pPushDescriptorSetInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) {
      hk_push_descriptor_set(cmd, &cmd->state.gfx.descriptors,
                             pPushDescriptorSetInfo);
   }

   if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
      hk_push_descriptor_set(cmd, &cmd->state.cs.descriptors,
                             pPushDescriptorSetInfo);
   }
}

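/* Upload any dirty push descriptor sets to transient GPU memory and point the
 * root table at the uploaded copies.
 */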
void
hk_cmd_buffer_flush_push_descriptors(struct hk_cmd_buffer *cmd,
                                     struct hk_descriptor_state *desc)
{
   u_foreach_bit(set_idx, desc->push_dirty) {
      struct hk_push_descriptor_set *push_set = desc->push[set_idx];
      uint64_t push_set_addr = hk_pool_upload(
         cmd, push_set->data, sizeof(push_set->data), HK_MIN_UBO_ALIGNMENT);

      desc->root.sets[set_idx] = push_set_addr;
      desc->set_sizes[set_idx] = sizeof(push_set->data);
   }

   desc->root_dirty = true;
   desc->push_dirty = 0;
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdPushDescriptorSetWithTemplate2KHR(
   VkCommandBuffer commandBuffer, const VkPushDescriptorSetWithTemplateInfoKHR
                                     *pPushDescriptorSetWithTemplateInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(vk_descriptor_update_template, template,
                  pPushDescriptorSetWithTemplateInfo->descriptorUpdateTemplate);
   VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout,
                  pPushDescriptorSetWithTemplateInfo->layout);

   struct hk_descriptor_state *desc =
      hk_get_descriptors_state(cmd, template->bind_point);
   struct hk_push_descriptor_set *push_set = hk_cmd_push_descriptors(
      cmd, desc, pPushDescriptorSetWithTemplateInfo->set);
   if (unlikely(push_set == NULL))
      return;

   struct hk_descriptor_set_layout *set_layout = vk_to_hk_descriptor_set_layout(
      pipeline_layout->set_layouts[pPushDescriptorSetWithTemplateInfo->set]);

   hk_push_descriptor_set_update_template(
      push_set, set_layout, template,
      pPushDescriptorSetWithTemplateInfo->pData);
}

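/* Snapshot the CPU-side root descriptor table into transient GPU memory and
 * return its GPU address. The table is self-referential: its first field
 * holds its own GPU address (see the "self-reflective" static_assert in
 * hk_upload_usc_words).
 */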
uint64_t
hk_cmd_buffer_upload_root(struct hk_cmd_buffer *cmd,
                          VkPipelineBindPoint bind_point)
{
   struct hk_descriptor_state *desc = hk_get_descriptors_state(cmd, bind_point);
   struct hk_root_descriptor_table *root = &desc->root;

   struct agx_ptr root_ptr = hk_pool_alloc(cmd, sizeof(*root), 8);
   if (!root_ptr.gpu)
      return 0;

   root->root_desc_addr = root_ptr.gpu;

   memcpy(root_ptr.cpu, root, sizeof(*root));
   return root_ptr.gpu;
}

void
hk_usc_upload_spilled_rt_descs(struct agx_usc_builder *b,
                               struct hk_cmd_buffer *cmd)
{
   struct hk_rendering_state *render = &cmd->state.gfx.render;

   /* Upload texture/PBE descriptors for each render target so we can clear
    * spilled render targets.
    */
   struct agx_ptr descs =
      hk_pool_alloc(cmd, AGX_TEXTURE_LENGTH * 2 * render->color_att_count, 64);
   struct agx_texture_packed *desc = descs.cpu;
   if (!desc)
      return;

   for (unsigned i = 0; i < render->color_att_count; ++i) {
      struct hk_image_view *iview = render->color_att[i].iview;
      if (!iview) {
         /* XXX: probably should emit a null descriptor here...? */
         continue;
      }

      memcpy(&desc[(i * 2) + 0], &iview->planes[0].emrt_texture, sizeof(*desc));
      memcpy(&desc[(i * 2) + 1], &iview->planes[0].emrt_pbe, sizeof(*desc));
   }

   desc = descs.cpu;

   /* Bind the base as u0_u1 for bindless access */
   agx_usc_uniform(b, 0, 4, hk_pool_upload(cmd, &descs.gpu, 8, 8));
}

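/* Grow the device scratch allocation to cover a shader's main and preamble
 * spill requirements, and record on the control stream which hardware stages
 * need scratch.
 */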
void
hk_reserve_scratch(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
                   struct hk_shader *s)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   uint32_t max_scratch_size =
      MAX2(s->b.info.scratch_size, s->b.info.preamble_scratch_size);

   if (max_scratch_size == 0)
      return;

   unsigned preamble_size = (s->b.info.preamble_scratch_size > 0) ? 1 : 0;

   /* Note: this uses the hardware stage, not the software stage */
   hk_device_alloc_scratch(dev, s->b.info.stage, max_scratch_size);
   perf_debug(dev, "Reserving %u (%u) bytes of scratch for stage %s",
              s->b.info.scratch_size, s->b.info.preamble_scratch_size,
              _mesa_shader_stage_to_abbrev(s->b.info.stage));

   switch (s->b.info.stage) {
   case PIPE_SHADER_FRAGMENT:
      cs->scratch.fs.main = true;
      cs->scratch.fs.preamble = MAX2(cs->scratch.fs.preamble, preamble_size);
      break;
   case PIPE_SHADER_VERTEX:
      cs->scratch.vs.main = true;
      cs->scratch.vs.preamble = MAX2(cs->scratch.vs.preamble, preamble_size);
      break;
   default:
      cs->scratch.cs.main = true;
      cs->scratch.cs.preamble = MAX2(cs->scratch.cs.preamble, preamble_size);
      break;
   }
}

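/* Build the USC words for a shader: bind the root descriptor table pointer
 * plus per-stage uniforms (vertex attribute bases/clamps, draw parameters,
 * blend constant, spilled render target descriptors, tilebuffer layout), then
 * append the prelinked USC blob. Returns the address of the USC words, or 0
 * on allocation failure.
 */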
uint32_t
hk_upload_usc_words(struct hk_cmd_buffer *cmd, struct hk_shader *s,
                    struct hk_linked_shader *linked)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   enum pipe_shader_type sw_stage = s->info.stage;

   unsigned constant_push_ranges = DIV_ROUND_UP(s->b.info.rodata.size_16, 64);
   unsigned push_ranges = 2;
   unsigned stage_ranges = 3;

   size_t usc_size =
      agx_usc_size(constant_push_ranges + push_ranges + stage_ranges + 4);
   struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64);
   if (!t.cpu)
      return 0;

   struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);

   uint64_t root_ptr;

   if (sw_stage == PIPE_SHADER_COMPUTE)
      root_ptr = hk_cmd_buffer_upload_root(cmd, VK_PIPELINE_BIND_POINT_COMPUTE);
   else
      root_ptr = cmd->state.gfx.root;

   static_assert(offsetof(struct hk_root_descriptor_table, root_desc_addr) == 0,
                 "self-reflective");

   agx_usc_uniform(&b, HK_ROOT_UNIFORM, 4, root_ptr);

   if (sw_stage == MESA_SHADER_VERTEX) {
      unsigned count =
         DIV_ROUND_UP(BITSET_LAST_BIT(s->info.vs.attrib_components_read), 4);

      if (count) {
         agx_usc_uniform(
            &b, 0, 4 * count,
            root_ptr + hk_root_descriptor_offset(draw.attrib_base));

         agx_usc_uniform(
            &b, 4 * count, 2 * count,
            root_ptr + hk_root_descriptor_offset(draw.attrib_clamps));
      }

      if (cmd->state.gfx.draw_params)
         agx_usc_uniform(&b, 6 * count, 4, cmd->state.gfx.draw_params);

      if (cmd->state.gfx.draw_id_ptr)
         agx_usc_uniform(&b, (6 * count) + 4, 1, cmd->state.gfx.draw_id_ptr);

      if (linked->sw_indexing) {
         agx_usc_uniform(
            &b, (6 * count) + 8, 4,
            root_ptr + hk_root_descriptor_offset(draw.input_assembly));
      }
   } else if (sw_stage == MESA_SHADER_FRAGMENT) {
      if (agx_tilebuffer_spills(&cmd->state.gfx.render.tilebuffer)) {
         hk_usc_upload_spilled_rt_descs(&b, cmd);
      }

      agx_usc_uniform(
         &b, 4, 8, root_ptr + hk_root_descriptor_offset(draw.blend_constant));

      /* The SHARED state is baked into linked->usc for non-fragment shaders. We
       * don't pass around the information to bake the tilebuffer layout.
       *
       * TODO: We probably could with some refactor.
       */
      agx_usc_push_packed(&b, SHARED, &cmd->state.gfx.render.tilebuffer.usc);
   }

   agx_usc_push_blob(&b, linked->usc.data, linked->usc.size);
   return agx_usc_addr(&dev->dev, t.gpu);
}

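/* Dispatch one of the precompiled internal (libagx) compute programs on the
 * given control stream, uploading its argument block to transient memory
 * first.
 */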
void
hk_dispatch_precomp(struct hk_cs *cs, struct agx_grid grid,
                    enum agx_barrier barrier, enum libagx_program idx,
                    void *data, size_t data_size)
{
   struct hk_device *dev = hk_cmd_buffer_device(cs->cmd);
   struct agx_precompiled_shader *prog = agx_get_precompiled(&dev->bg_eot, idx);

   struct agx_ptr t = hk_pool_usc_alloc(cs->cmd, agx_usc_size(15), 64);
   uint64_t uploaded_data = hk_pool_upload(cs->cmd, data, data_size, 4);

   agx_usc_words_precomp(t.cpu, &prog->b, uploaded_data, data_size);

   hk_dispatch_with_usc_launch(dev, cs, prog->b.launch,
                               agx_usc_addr(&dev->dev, t.gpu), grid,
                               prog->b.workgroup);
}

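/* Initialize a newly allocated VDM control stream for graphics: inherit the
 * render pass state, emit the initial barrier and baseline PPP state, and
 * dirty all graphics state so it gets re-emitted into this stream.
 */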
void
hk_cs_init_graphics(struct hk_cmd_buffer *cmd, struct hk_cs *cs)
{
   struct hk_rendering_state *render = &cmd->state.gfx.render;
   uint8_t *map = cs->current;

   cs->tib = render->tilebuffer;

   /* Assume this is not the first control stream of the render pass, so
    * initially use the partial background/EOT program and ZLS control.
    * hk_BeginRendering/hk_EndRendering will override.
    */
   cs->cr = render->cr;
   cs->cr.bg.main = render->cr.bg.partial;
   cs->cr.eot.main = render->cr.eot.partial;
   cs->cr.zls_control = render->cr.zls_control_partial;

   /* Barrier to enforce GPU-CPU coherency, in case this batch is back to back
    * with another that caused stale data to be cached and the CPU wrote to it
    * in the meantime.
    */
   agx_push(map, VDM_BARRIER, cfg) {
      cfg.usc_cache_inval = true;
   }

   struct AGX_PPP_HEADER present = {
      .w_clamp = true,
      .occlusion_query_2 = true,
      .output_unknown = true,
      .varying_word_2 = true,
      .viewport_count = 1, /* irrelevant */
   };

   size_t size = agx_ppp_update_size(&present);
   struct agx_ptr T = hk_pool_alloc(cmd, size, 64);
   if (!T.cpu)
      return;

   struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present);

   /* clang-format off */
   agx_ppp_push(&ppp, W_CLAMP, cfg) cfg.w_clamp = 1e-10;
   agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY_2, cfg);
   agx_ppp_push(&ppp, OUTPUT_UNKNOWN, cfg);
   agx_ppp_push(&ppp, VARYING_2, cfg);
   /* clang-format on */

   agx_ppp_fini(&map, &ppp);
   cs->current = map;

   util_dynarray_init(&cs->scissor, NULL);
   util_dynarray_init(&cs->depth_bias, NULL);

   /* All graphics state must be re-emitted in each control stream */
   hk_cmd_buffer_dirty_all(cmd);
}

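/* Ensure the control stream has at least `space` bytes left, plus room for a
 * stream link and some slop for hardware overreads. If not, allocate a new
 * chunk from the command buffer pool and chain to it with a jump.
 */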
void
hk_ensure_cs_has_space(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
                       size_t space)
{
   bool vdm = cs->type == HK_CS_VDM;

   size_t link_length =
      vdm ? AGX_VDM_STREAM_LINK_LENGTH : AGX_CDM_STREAM_LINK_LENGTH;

   /* Assert that we have space for a link tag */
   assert((cs->current + link_length) <= cs->end && "Encoder overflowed");

   /* Always leave room for a link tag, in case we run out of space later,
    * plus padding because VDM apparently overreads?
    *
    * 0x200 is not enough. 0x400 seems to work. 0x800 for safety.
    */
   space += link_length + 0x800;

   /* If there is room in the command buffer, we're done */
   if (likely((cs->end - cs->current) >= space))
      return;

   /* Otherwise, we need to allocate a new command buffer. We use memory owned
    * by the batch to simplify lifetime management for the BO.
    */
   size_t size = 65536;
   struct agx_ptr T = hk_pool_alloc(cmd, size, 256);

   /* Jump from the old control stream to the new control stream */
   agx_cs_jump(cs->current, T.gpu, vdm);

   /* Swap out the control stream */
   cs->current = T.cpu;
   cs->end = cs->current + size;
   cs->chunk = T;
   cs->stream_linked = true;
}