/*
 * Copyright © 2021 Collabora Ltd.
 *
 * Derived from tu_cmd_buffer.c which is:
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "genxml/gen_macros.h"

#include "panvk_buffer.h"
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_pool.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_instance.h"
#include "panvk_physical_device.h"
#include "panvk_priv_bo.h"
#include "panvk_tracepoints.h"
#include "panvk_utrace.h"

#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_props.h"
#include "pan_samples.h"

#include "util/bitscan.h"
#include "vk_descriptor_update_template.h"
#include "vk_format.h"
#include "vk_synchronization.h"

static void
emit_tls(struct panvk_cmd_buffer *cmdbuf)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(dev->vk.physical);
   unsigned core_id_range;

   panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);

   if (cmdbuf->state.tls.info.tls.size) {
      unsigned thread_tls_alloc =
         panfrost_query_thread_tls_alloc(&phys_dev->kmod.props);
      unsigned size = panfrost_get_total_stack_size(
         cmdbuf->state.tls.info.tls.size, thread_tls_alloc, core_id_range);

      cmdbuf->state.tls.info.tls.ptr =
         panvk_cmd_alloc_dev_mem(cmdbuf, tls, size, 4096).gpu;
   }

   assert(!cmdbuf->state.tls.info.wls.size);

   if (cmdbuf->state.tls.desc.cpu) {
      GENX(pan_emit_tls)(&cmdbuf->state.tls.info, cmdbuf->state.tls.desc.cpu);
   }
}

/**
 * Write all sync point updates to seqno registers and reset the relative sync
 * points to 0.
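 *
 * Each subqueue accumulates a relative_sync_point counter as sync operations
 * are recorded; this helper folds those deltas into the per-subqueue progress
 * seqno registers on every command stream so later waits compare against
 * up-to-date values.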
 */
static void
flush_sync_points(struct panvk_cmd_buffer *cmdbuf)
{
   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
      struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i);

      if (!cs_is_valid(b)) {
         vk_command_buffer_set_error(&cmdbuf->vk,
                                     VK_ERROR_OUT_OF_DEVICE_MEMORY);
         return;
      }

      cs_update_progress_seqno(b) {
         for (uint32_t j = 0; j < PANVK_SUBQUEUE_COUNT; j++) {
            uint32_t rel_sync_point = cmdbuf->state.cs[j].relative_sync_point;

            if (!rel_sync_point)
               continue;

            cs_add64(b, cs_progress_seqno_reg(b, j),
                     cs_progress_seqno_reg(b, j), rel_sync_point);
         }
      }
   }

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++)
      cmdbuf->state.cs[i].relative_sync_point = 0;
}

static void
finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_instance *instance =
      to_panvk_instance(dev->vk.physical->instance);
   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);

   /* We need a clean because descriptor/CS memory can be returned to the
    * command pool where they get recycled. If we don't clean dirty cache
    * lines, those cache lines might get evicted asynchronously and their
    * content pushed back to main memory after the CPU has written new stuff
    * there.
    */
   struct cs_index flush_id = cs_scratch_reg32(b, 0);

   cs_move32_to(b, flush_id, 0);
   cs_wait_slots(b, SB_ALL_MASK, false);
   cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN,
                   false, flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH)));
   cs_wait_slot(b, SB_ID(IMM_FLUSH), false);

   /* If we're in sync/trace mode, we signal the debug object. */
   if (instance->debug_flags & (PANVK_DEBUG_SYNC | PANVK_DEBUG_TRACE)) {
      struct cs_index debug_sync_addr = cs_scratch_reg64(b, 0);
      struct cs_index one = cs_scratch_reg32(b, 2);
      struct cs_index error = cs_scratch_reg32(b, 3);
      struct cs_index cmp_scratch = cs_scratch_reg32(b, 2);

      cs_move32_to(b, one, 1);
      cs_load64_to(b, debug_sync_addr, cs_subqueue_ctx_reg(b),
                   offsetof(struct panvk_cs_subqueue_context, debug.syncobjs));
      cs_wait_slot(b, SB_ID(LS), false);
      cs_add64(b, debug_sync_addr, debug_sync_addr,
               sizeof(struct panvk_cs_sync32) * subqueue);
      cs_load32_to(b, error, debug_sync_addr,
                   offsetof(struct panvk_cs_sync32, error));
      cs_wait_slots(b, SB_ALL_MASK, false);

      if (cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
         cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_CSG, one, debug_sync_addr,
                       cs_now());

      cs_match(b, error, cmp_scratch) {
         cs_case(b, 0) {
            /* Do nothing. */
         }

         cs_default(b) {
            /* Overwrite the sync error with the first error we encountered. */
            cs_store32(b, error, debug_sync_addr,
                       offsetof(struct panvk_cs_sync32, error));
            cs_wait_slot(b, SB_ID(LS), false);
         }
      }
   }

   /* If this is a secondary command buffer, we don't poison the reg file to
    * preserve the render pass context. We also don't poison the reg file if
    * the last render pass was suspended. In practice we could preserve only
    * the registers that matter, but this is a debug feature so let's keep
    * things simple with this all-or-nothing approach.
    */
   if ((instance->debug_flags & PANVK_DEBUG_CS) &&
       cmdbuf->vk.level != VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
       !(cmdbuf->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT)) {
      cs_update_cmdbuf_regs(b) {
         /* Poison all cmdbuf registers to make sure we don't inherit state
          * from a previously executed cmdbuf.
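          *
          * The poison value encodes the register index in its top byte
          * (0xdead | i << 24), so a stale read is easy to spot when dumping
          * the register file.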
          */
         for (uint32_t i = 0; i <= PANVK_CS_REG_SCRATCH_END; i++)
            cs_move32_to(b, cs_reg32(b, i), 0xdead | i << 24);
      }
   }

   trace_end_cmdbuf(&cmdbuf->utrace.uts[subqueue], cmdbuf, cmdbuf->flags);

   cs_finish(&cmdbuf->state.cs[subqueue].builder);
}

VKAPI_ATTR VkResult VKAPI_CALL
panvk_per_arch(EndCommandBuffer)(VkCommandBuffer commandBuffer)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);

   emit_tls(cmdbuf);
   flush_sync_points(cmdbuf);

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
      struct cs_builder *b = &cmdbuf->state.cs[i].builder;

      if (!cs_is_valid(b)) {
         vk_command_buffer_set_error(&cmdbuf->vk,
                                     VK_ERROR_OUT_OF_DEVICE_MEMORY);
      } else {
         finish_cs(cmdbuf, i);
      }
   }

   cmdbuf->flush_id = panthor_kmod_get_flush_id(dev->kmod.dev);

   return vk_command_buffer_end(&cmdbuf->vk);
}

static VkPipelineStageFlags2
get_subqueue_stages(enum panvk_subqueue_id subqueue)
{
   switch (subqueue) {
   case PANVK_SUBQUEUE_VERTEX_TILER:
      return VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
             VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
             VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
             VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT;
   case PANVK_SUBQUEUE_FRAGMENT:
      return VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
             VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
             VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT |
             VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
             VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_RESOLVE_BIT |
             VK_PIPELINE_STAGE_2_BLIT_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT;
   case PANVK_SUBQUEUE_COMPUTE:
      return VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
             VK_PIPELINE_STAGE_2_COPY_BIT;
   default:
      unreachable("Invalid subqueue id");
   }
}

static void
add_execution_dependency(uint32_t wait_masks[static PANVK_SUBQUEUE_COUNT],
                         VkPipelineStageFlags2 src_stages,
                         VkPipelineStageFlags2 dst_stages)
{
   /* convert stages to subqueues */
   uint32_t src_subqueues = 0;
   uint32_t dst_subqueues = 0;
   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      const VkPipelineStageFlags2 subqueue_stages = get_subqueue_stages(i);
      if (src_stages & subqueue_stages)
         src_subqueues |= BITFIELD_BIT(i);
      if (dst_stages & subqueue_stages)
         dst_subqueues |= BITFIELD_BIT(i);
   }

   const bool dst_host = dst_stages & VK_PIPELINE_STAGE_2_HOST_BIT;

   /* nothing to wait for */
   if (!src_subqueues || (!dst_subqueues && !dst_host))
      return;

   u_foreach_bit(i, dst_subqueues) {
      /* each dst subqueue should wait for all src subqueues */
      uint32_t wait_mask = src_subqueues;

      switch (i) {
      case PANVK_SUBQUEUE_VERTEX_TILER:
         /* Indirect draw buffers are read from the command stream, and
          * load/store operations are synchronized with the LS scoreboard
          * immediately after the read, so no need to wait in that case.
          */
         if ((src_stages & get_subqueue_stages(i)) ==
             VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT)
            wait_mask &= ~BITFIELD_BIT(i);
         break;
      case PANVK_SUBQUEUE_FRAGMENT:
         /* The fragment subqueue always waits for the tiler subqueue already.
          * Explicit waits can be skipped.
          */
         wait_mask &= ~BITFIELD_BIT(PANVK_SUBQUEUE_VERTEX_TILER);
         break;
      default:
         break;
      }

      wait_masks[i] |= wait_mask;
   }

   /* The host does not wait for src subqueues. All src subqueues should
    * self-wait instead.
    *
    * Also, our callers currently expect src subqueues to self-wait when there
    * are dst subqueues. Until that changes, make all src subqueues self-wait.
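    *
    * A COMPUTE_SHADER -> HOST dependency, for instance, ends up as the
    * compute subqueue waiting on its own iterator scoreboards rather than
    * the host waiting on a subqueue sync object.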
    */
   if (dst_host || dst_subqueues) {
      u_foreach_bit(i, src_subqueues)
         wait_masks[i] |= BITFIELD_BIT(i);
   }
}

static void
add_memory_dependency(struct panvk_cache_flush_info *cache_flush,
                      VkAccessFlags2 src_access, VkAccessFlags2 dst_access)
{
   /* Note on the cache organization:
    *
    * - L2 cache is unified, so all changes to this cache are automatically
    *   visible to all GPU sub-components (shader cores, tiler, ...). This
    *   means we only need to flush when the host (AKA CPU) is involved.
    * - LS caches (which are basically just read-write L1 caches) are coherent
    *   with each other and with the L2 cache, so again, we only need to flush
    *   when the host is involved.
    * - Other read-only L1 caches (like the ones in front of the texture unit)
    *   are not coherent with the LS or L2 caches, and thus need to be
    *   invalidated any time a write happens.
    *
    * Translating to the Vulkan memory model:
    *
    * - The device domain is the L2 cache.
    * - An availability operation from device writes to the device domain is
    *   a no-op.
    * - A visibility operation from the device domain to device accesses that
    *   are coherent with L2/LS is a no-op.
    * - A visibility operation from the device domain to device accesses that
    *   are incoherent with L2/LS invalidates the other RO L1 caches.
    * - A host-to-device domain operation invalidates all caches.
    * - A device-to-host domain operation flushes L2/LS.
    */
   const VkAccessFlags2 ro_l1_access =
      VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT |
      VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
      VK_ACCESS_2_TRANSFER_READ_BIT | VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
      VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT;

   /* visibility op */
   if (dst_access & ro_l1_access)
      cache_flush->others |= true;

   /* host-to-device domain op */
   if (src_access & VK_ACCESS_2_HOST_WRITE_BIT) {
      cache_flush->l2 |= MALI_CS_FLUSH_MODE_CLEAN_AND_INVALIDATE;
      cache_flush->lsc |= MALI_CS_FLUSH_MODE_CLEAN_AND_INVALIDATE;
      cache_flush->others |= true;
   }

   /* device-to-host domain op */
   if (dst_access & (VK_ACCESS_2_HOST_READ_BIT | VK_ACCESS_2_HOST_WRITE_BIT)) {
      cache_flush->l2 |= MALI_CS_FLUSH_MODE_CLEAN;
      cache_flush->lsc |= MALI_CS_FLUSH_MODE_CLEAN;
   }
}

static bool
should_split_render_pass(const uint32_t wait_masks[static PANVK_SUBQUEUE_COUNT],
                         VkAccessFlags2 src_access, VkAccessFlags2 dst_access)
{
   /* From the Vulkan 1.3.301 spec:
    *
    *    VUID-vkCmdPipelineBarrier-None-07892
    *
    *    "If vkCmdPipelineBarrier is called within a render pass instance,
    *    the source and destination stage masks of any memory barriers must
    *    only include graphics pipeline stages"
    *
    * We only consider the tiler and the fragment subqueues here.
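    * Graphics pipeline stages only map to those two subqueues (see
    * get_subqueue_stages()), so a barrier that is legal inside a render pass
    * should never involve the compute subqueue.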
    */

   /* split if the tiler subqueue waits for the fragment subqueue */
   if (wait_masks[PANVK_SUBQUEUE_VERTEX_TILER] &
       BITFIELD_BIT(PANVK_SUBQUEUE_FRAGMENT))
      return true;

   /* split if the fragment subqueue self-waits with a feedback loop, because
    * we lower subpassLoad to texelFetch
    */
   if ((wait_masks[PANVK_SUBQUEUE_FRAGMENT] &
        BITFIELD_BIT(PANVK_SUBQUEUE_FRAGMENT)) &&
       (src_access & (VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
                      VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT)) &&
       (dst_access & VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT))
      return true;

   return false;
}

static void
collect_cache_flush_info(enum panvk_subqueue_id subqueue,
                         struct panvk_cache_flush_info *cache_flush,
                         VkAccessFlags2 src_access, VkAccessFlags2 dst_access)
{
   /* limit access to the subqueue and host */
   const VkPipelineStageFlags2 subqueue_stages =
      get_subqueue_stages(subqueue) | VK_PIPELINE_STAGE_2_HOST_BIT;
   src_access = vk_filter_src_access_flags2(subqueue_stages, src_access);
   dst_access = vk_filter_dst_access_flags2(subqueue_stages, dst_access);

   add_memory_dependency(cache_flush, src_access, dst_access);
}

static void
collect_cs_deps(struct panvk_cmd_buffer *cmdbuf,
                VkPipelineStageFlags2 src_stages,
                VkPipelineStageFlags2 dst_stages, VkAccessFlags2 src_access,
                VkAccessFlags2 dst_access, struct panvk_cs_deps *deps)
{
   uint32_t wait_masks[PANVK_SUBQUEUE_COUNT] = {0};
   add_execution_dependency(wait_masks, src_stages, dst_stages);

   /* within a render pass */
   if (cmdbuf->state.gfx.render.tiler) {
      if (should_split_render_pass(wait_masks, src_access, dst_access)) {
         deps->needs_draw_flush = true;
      } else {
         /* skip the tiler subqueue self-wait because we use the same
          * scoreboard slot for the idvs jobs
          */
         wait_masks[PANVK_SUBQUEUE_VERTEX_TILER] &=
            ~BITFIELD_BIT(PANVK_SUBQUEUE_VERTEX_TILER);

         /* skip the fragment subqueue self-wait because we emit the fragment
          * job at the end of the render pass and there is nothing to wait on
          * yet
          */
         wait_masks[PANVK_SUBQUEUE_FRAGMENT] &=
            ~BITFIELD_BIT(PANVK_SUBQUEUE_FRAGMENT);
      }
   }

   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      if (wait_masks[i] & BITFIELD_BIT(i)) {
         /* We need to self-wait for all previously submitted jobs, and given
          * the iterator scoreboard is a moving target, we just wait for the
          * whole dynamic scoreboard range.
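          * Waiting on SB_ALL_ITERS_MASK is coarser than strictly needed, but
          * it saves us from tracking which iterator slot the last job used.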
          */
         deps->src[i].wait_sb_mask |= SB_ALL_ITERS_MASK;
      }

      collect_cache_flush_info(i, &deps->src[i].cache_flush, src_access,
                               dst_access);

      deps->dst[i].wait_subqueue_mask |= wait_masks[i];
   }
}

static void
normalize_dependency(VkPipelineStageFlags2 *src_stages,
                     VkPipelineStageFlags2 *dst_stages,
                     VkAccessFlags2 *src_access, VkAccessFlags2 *dst_access,
                     uint32_t src_qfi, uint32_t dst_qfi)
{
   /* queue family acquire operation */
   switch (src_qfi) {
   case VK_QUEUE_FAMILY_EXTERNAL:
      /* no execution dependency and no availability operation */
      *src_stages = VK_PIPELINE_STAGE_2_NONE;
      *src_access = VK_ACCESS_2_NONE;
      break;
   case VK_QUEUE_FAMILY_FOREIGN_EXT:
      /* treat the foreign queue as the host */
      *src_stages = VK_PIPELINE_STAGE_2_HOST_BIT;
      *src_access = VK_ACCESS_2_HOST_WRITE_BIT;
      break;
   default:
      break;
   }

   /* queue family release operation */
   switch (dst_qfi) {
   case VK_QUEUE_FAMILY_EXTERNAL:
      /* no execution dependency and no visibility operation */
      *dst_stages = VK_PIPELINE_STAGE_2_NONE;
      *dst_access = VK_ACCESS_2_NONE;
      break;
   case VK_QUEUE_FAMILY_FOREIGN_EXT:
      /* treat the foreign queue as the host */
      *dst_stages = VK_PIPELINE_STAGE_2_HOST_BIT;
      *dst_access = VK_ACCESS_2_HOST_WRITE_BIT;
      break;
   default:
      break;
   }

   *src_stages = vk_expand_src_stage_flags2(*src_stages);
   *dst_stages = vk_expand_dst_stage_flags2(*dst_stages);
   *src_access = vk_filter_src_access_flags2(*src_stages, *src_access);
   *dst_access = vk_filter_dst_access_flags2(*dst_stages, *dst_access);
}

void
panvk_per_arch(get_cs_deps)(struct panvk_cmd_buffer *cmdbuf,
                            const VkDependencyInfo *in,
                            struct panvk_cs_deps *out)
{
   memset(out, 0, sizeof(*out));

   for (uint32_t i = 0; i < in->memoryBarrierCount; i++) {
      const VkMemoryBarrier2 *barrier = &in->pMemoryBarriers[i];
      VkPipelineStageFlags2 src_stages = barrier->srcStageMask;
      VkPipelineStageFlags2 dst_stages = barrier->dstStageMask;
      VkAccessFlags2 src_access = barrier->srcAccessMask;
      VkAccessFlags2 dst_access = barrier->dstAccessMask;

      normalize_dependency(&src_stages, &dst_stages, &src_access, &dst_access,
                           VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED);
      collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
                      out);
   }

   for (uint32_t i = 0; i < in->bufferMemoryBarrierCount; i++) {
      const VkBufferMemoryBarrier2 *barrier = &in->pBufferMemoryBarriers[i];
      VkPipelineStageFlags2 src_stages = barrier->srcStageMask;
      VkPipelineStageFlags2 dst_stages = barrier->dstStageMask;
      VkAccessFlags2 src_access = barrier->srcAccessMask;
      VkAccessFlags2 dst_access = barrier->dstAccessMask;

      normalize_dependency(&src_stages, &dst_stages, &src_access, &dst_access,
                           barrier->srcQueueFamilyIndex,
                           barrier->dstQueueFamilyIndex);
      collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
                      out);
   }

   for (uint32_t i = 0; i < in->imageMemoryBarrierCount; i++) {
      const VkImageMemoryBarrier2 *barrier = &in->pImageMemoryBarriers[i];
      VkPipelineStageFlags2 src_stages = barrier->srcStageMask;
      VkPipelineStageFlags2 dst_stages = barrier->dstStageMask;
      VkAccessFlags2 src_access = barrier->srcAccessMask;
      VkAccessFlags2 dst_access = barrier->dstAccessMask;

      normalize_dependency(&src_stages, &dst_stages, &src_access, &dst_access,
                           barrier->srcQueueFamilyIndex,
                           barrier->dstQueueFamilyIndex);
      collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
                      out);
   }
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
                                    const VkDependencyInfo *pDependencyInfo)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   struct panvk_cs_deps deps;

   panvk_per_arch(get_cs_deps)(cmdbuf,
                               pDependencyInfo, &deps);

   if (deps.needs_draw_flush)
      panvk_per_arch(cmd_flush_draws)(cmdbuf);

   uint32_t wait_subqueue_mask = 0;
   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      /* no need to perform both types of waits on the same subqueue */
      if (deps.src[i].wait_sb_mask)
         deps.dst[i].wait_subqueue_mask &= ~BITFIELD_BIT(i);
      assert(!(deps.dst[i].wait_subqueue_mask & BITFIELD_BIT(i)));

      wait_subqueue_mask |= deps.dst[i].wait_subqueue_mask;
   }

   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i);
      struct panvk_cs_state *cs_state = &cmdbuf->state.cs[i];

      if (deps.src[i].wait_sb_mask)
         cs_wait_slots(b, deps.src[i].wait_sb_mask, false);

      struct panvk_cache_flush_info cache_flush = deps.src[i].cache_flush;
      if (cache_flush.l2 != MALI_CS_FLUSH_MODE_NONE ||
          cache_flush.lsc != MALI_CS_FLUSH_MODE_NONE || cache_flush.others) {
         struct cs_index flush_id = cs_scratch_reg32(b, 0);

         cs_move32_to(b, flush_id, 0);
         cs_flush_caches(b, cache_flush.l2, cache_flush.lsc,
                         cache_flush.others, flush_id,
                         cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH)));
         cs_wait_slot(b, SB_ID(IMM_FLUSH), false);
      }

      /* If no one waits on us, there's no point signaling the sync object. */
      if (wait_subqueue_mask & BITFIELD_BIT(i)) {
         struct cs_index sync_addr = cs_scratch_reg64(b, 0);
         struct cs_index add_val = cs_scratch_reg64(b, 2);

         assert(deps.src[i].wait_sb_mask);

         cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
                      offsetof(struct panvk_cs_subqueue_context, syncobjs));
         cs_wait_slot(b, SB_ID(LS), false);
         cs_add64(b, sync_addr, sync_addr,
                  sizeof(struct panvk_cs_sync64) * i);
         cs_move64_to(b, add_val, 1);
         cs_sync64_add(b, false, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,
                       cs_now());

         ++cs_state->relative_sync_point;
      }
   }

   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i);

      u_foreach_bit(j, deps.dst[i].wait_subqueue_mask) {
         struct panvk_cs_state *cs_state = &cmdbuf->state.cs[j];
         struct cs_index sync_addr = cs_scratch_reg64(b, 0);
         struct cs_index wait_val = cs_scratch_reg64(b, 2);

         cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
                      offsetof(struct panvk_cs_subqueue_context, syncobjs));
         cs_wait_slot(b, SB_ID(LS), false);
         cs_add64(b, sync_addr, sync_addr,
                  sizeof(struct panvk_cs_sync64) * j);
         cs_add64(b, wait_val, cs_progress_seqno_reg(b, j),
                  cs_state->relative_sync_point);
         cs_sync64_wait(b, false, MALI_CS_CONDITION_GREATER, wait_val,
                        sync_addr);
      }
   }
}

void
panvk_per_arch(cs_pick_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
                                enum panvk_subqueue_id subqueue)
{
   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
   struct cs_index iter_sb = cs_scratch_reg32(b, 0);
   struct cs_index cmp_scratch = cs_scratch_reg32(b, 1);

   cs_load32_to(b, iter_sb, cs_subqueue_ctx_reg(b),
                offsetof(struct panvk_cs_subqueue_context, iter_sb));
   cs_wait_slot(b, SB_ID(LS), false);

   cs_match(b, iter_sb, cmp_scratch) {
#define CASE(x)                                                                \
   cs_case(b, x) {                                                             \
      cs_wait_slot(b, SB_ITER(x), false);                                      \
      cs_set_scoreboard_entry(b, SB_ITER(x), SB_ID(LS));                       \
   }

      CASE(0)
      CASE(1)
      CASE(2)
      CASE(3)
      CASE(4)
#undef CASE
   }
}

static struct cs_buffer
alloc_cs_buffer(void *cookie)
{
   struct panvk_cmd_buffer *cmdbuf = cookie;
   const unsigned capacity = 64 * 1024 / sizeof(uint64_t);

   struct panfrost_ptr ptr =
      panvk_cmd_alloc_dev_mem(cmdbuf, cs, capacity * 8, 64);

   return (struct cs_buffer){
      .cpu = ptr.cpu,
      .gpu = ptr.gpu,
      .capacity = capacity,
   };
}

static enum cs_reg_perm
cs_reg_perm(struct cs_builder *b, unsigned reg)
{
   struct panvk_cs_state *cs_state =
      container_of(b, struct panvk_cs_state, builder);
   struct panvk_cs_reg_upd_context *upd_ctx;

   for (upd_ctx = cs_state->reg_access.upd_ctx_stack; upd_ctx;
        upd_ctx = upd_ctx->next) {
      if (upd_ctx->reg_perm(b, reg) == CS_REG_RW)
         return CS_REG_RW;
   }

   return cs_state->reg_access.base_perm(b, reg);
}

static void
init_cs_builders(struct panvk_cmd_buffer *cmdbuf)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_instance *instance =
      to_panvk_instance(dev->vk.physical->instance);
   const reg_perm_cb_t base_reg_perms[PANVK_SUBQUEUE_COUNT] = {
      [PANVK_SUBQUEUE_VERTEX_TILER] = panvk_cs_vt_reg_perm,
      [PANVK_SUBQUEUE_FRAGMENT] = panvk_cs_frag_reg_perm,
      [PANVK_SUBQUEUE_COMPUTE] = panvk_cs_compute_reg_perm,
   };

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
      struct cs_builder *b = &cmdbuf->state.cs[i].builder;

      /* Lazy allocation of the root CS. */
      struct cs_buffer root_cs = {0};
      struct cs_builder_conf conf = {
         .nr_registers = 96,
         .nr_kernel_registers = 4,
         .alloc_buffer = alloc_cs_buffer,
         .cookie = cmdbuf,
      };

      if (instance->debug_flags & PANVK_DEBUG_CS) {
         cmdbuf->state.cs[i].ls_tracker = (struct cs_load_store_tracker){
            .sb_slot = SB_ID(LS),
         };

         conf.ls_tracker = &cmdbuf->state.cs[i].ls_tracker;

         cmdbuf->state.cs[i].reg_access.upd_ctx_stack = NULL;
         cmdbuf->state.cs[i].reg_access.base_perm = base_reg_perms[i];
         conf.reg_perm = cs_reg_perm;
      }

      cs_builder_init(b, &conf, root_cs);

      if (instance->debug_flags & PANVK_DEBUG_TRACE) {
         cmdbuf->state.cs[i].tracing = (struct cs_tracing_ctx){
            .enabled = true,
            .ctx_reg = cs_subqueue_ctx_reg(b),
            .tracebuf_addr_offset =
               offsetof(struct panvk_cs_subqueue_context, debug.tracebuf.cs),
            .ls_sb_slot = SB_ID(LS),
         };
      }
   }
}

static void
panvk_reset_cmdbuf(struct vk_command_buffer *vk_cmdbuf,
                   VkCommandBufferResetFlags flags)
{
   struct panvk_cmd_buffer *cmdbuf =
      container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);
   struct panvk_cmd_pool *pool =
      container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk);
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);

   vk_command_buffer_reset(&cmdbuf->vk);

   panvk_pool_reset(&cmdbuf->cs_pool);
   panvk_pool_reset(&cmdbuf->desc_pool);
   panvk_pool_reset(&cmdbuf->tls_pool);
   list_splicetail(&cmdbuf->push_sets, &pool->push_sets);
   list_inithead(&cmdbuf->push_sets);

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->utrace.uts); i++) {
      struct u_trace *ut = &cmdbuf->utrace.uts[i];

      u_trace_fini(ut);
      u_trace_init(ut, &dev->utrace.utctx);
   }

   memset(&cmdbuf->state, 0, sizeof(cmdbuf->state));

   init_cs_builders(cmdbuf);
}

static void
panvk_destroy_cmdbuf(struct vk_command_buffer *vk_cmdbuf)
{
   struct panvk_cmd_buffer *cmdbuf =
      container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);
   struct panvk_cmd_pool *pool =
      container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk);
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->utrace.uts); i++)
      u_trace_fini(&cmdbuf->utrace.uts[i]);

   panvk_pool_cleanup(&cmdbuf->cs_pool);
   panvk_pool_cleanup(&cmdbuf->desc_pool);
   panvk_pool_cleanup(&cmdbuf->tls_pool);
   list_splicetail(&cmdbuf->push_sets, &pool->push_sets);
   vk_command_buffer_finish(&cmdbuf->vk);
   vk_free(&dev->vk.alloc, cmdbuf);
}

static VkResult
panvk_create_cmdbuf(struct vk_command_pool *vk_pool, VkCommandBufferLevel level,
                    struct vk_command_buffer **cmdbuf_out)
{
   struct panvk_device *device =
      container_of(vk_pool->base.device, struct panvk_device, vk);
   struct panvk_cmd_pool *pool =
      container_of(vk_pool, struct panvk_cmd_pool, vk);
   struct panvk_cmd_buffer *cmdbuf;

   cmdbuf = vk_zalloc(&device->vk.alloc, sizeof(*cmdbuf), 8,
                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!cmdbuf)
      return panvk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = vk_command_buffer_init(
      &pool->vk, &cmdbuf->vk, &panvk_per_arch(cmd_buffer_ops), level);
   if (result != VK_SUCCESS) {
      vk_free(&device->vk.alloc, cmdbuf);
      return result;
   }

   list_inithead(&cmdbuf->push_sets);
   cmdbuf->vk.dynamic_graphics_state.vi = &cmdbuf->state.gfx.dynamic.vi;
   cmdbuf->vk.dynamic_graphics_state.ms.sample_locations =
      &cmdbuf->state.gfx.dynamic.sl;

   struct panvk_pool_properties cs_pool_props = {
      .create_flags = 0,
      .slab_size = 64 * 1024,
      .label = "Command buffer CS pool",
      .prealloc = false,
      .owns_bos = true,
      .needs_locking = false,
   };
   panvk_pool_init(&cmdbuf->cs_pool, device, &pool->cs_bo_pool,
                   &cs_pool_props);

   struct panvk_pool_properties desc_pool_props = {
      .create_flags = 0,
      .slab_size = 64 * 1024,
      .label = "Command buffer descriptor pool",
      .prealloc = false,
      .owns_bos = true,
      .needs_locking = false,
   };
   panvk_pool_init(&cmdbuf->desc_pool, device, &pool->desc_bo_pool,
                   &desc_pool_props);

   struct panvk_pool_properties tls_pool_props = {
      .create_flags =
         panvk_device_adjust_bo_flags(device, PAN_KMOD_BO_FLAG_NO_MMAP),
      .slab_size = 64 * 1024,
      .label = "TLS pool",
      .prealloc = false,
      .owns_bos = true,
      .needs_locking = false,
   };
   panvk_pool_init(&cmdbuf->tls_pool, device, &pool->tls_bo_pool,
                   &tls_pool_props);

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->utrace.uts); i++)
      u_trace_init(&cmdbuf->utrace.uts[i], &device->utrace.utctx);

   init_cs_builders(cmdbuf);

   *cmdbuf_out = &cmdbuf->vk;

   return VK_SUCCESS;
}

const struct vk_command_buffer_ops panvk_per_arch(cmd_buffer_ops) = {
   .create = panvk_create_cmdbuf,
   .reset = panvk_reset_cmdbuf,
   .destroy = panvk_destroy_cmdbuf,
};

VKAPI_ATTR VkResult VKAPI_CALL
panvk_per_arch(BeginCommandBuffer)(VkCommandBuffer commandBuffer,
                                   const VkCommandBufferBeginInfo *pBeginInfo)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   struct panvk_instance *instance =
      to_panvk_instance(cmdbuf->vk.base.device->physical->instance);

   vk_command_buffer_begin(&cmdbuf->vk, pBeginInfo);

   cmdbuf->flags = pBeginInfo->flags;

   if (instance->debug_flags & PANVK_DEBUG_FORCE_SIMULTANEOUS) {
      cmdbuf->flags |= VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
      cmdbuf->flags &= ~VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
   }

   panvk_per_arch(cmd_inherit_render_state)(cmdbuf, pBeginInfo);

   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++)
      trace_begin_cmdbuf(&cmdbuf->utrace.uts[i], cmdbuf);

   return VK_SUCCESS;
}

static void
panvk_cmd_invalidate_state(struct panvk_cmd_buffer *cmdbuf)
{
   /* From the Vulkan 1.3.275 spec:
    *
    *    "...There is one exception to this rule - if the primary command
    *    buffer is inside a render pass instance, then the render pass and
    *    subpass state is not disturbed by executing secondary command
    *    buffers."
    *
    * We need to reset everything EXCEPT the render pass state.
    */
   struct panvk_rendering_state render_save = cmdbuf->state.gfx.render;
   memset(&cmdbuf->state.gfx, 0, sizeof(cmdbuf->state.gfx));
   cmdbuf->state.gfx.render = render_save;

   vk_dynamic_graphics_state_dirty_all(&cmdbuf->vk.dynamic_graphics_state);
   gfx_state_set_all_dirty(cmdbuf);
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdExecuteCommands)(VkCommandBuffer commandBuffer,
                                   uint32_t commandBufferCount,
                                   const VkCommandBuffer *pCommandBuffers)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, primary, commandBuffer);

   if (commandBufferCount == 0)
      return;

   /* Write out any pending seqno changes to registers before calling
    * secondary command buffers.
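    *
    * The waits recorded in the secondary CS are computed from the progress
    * seqno registers, so those registers must already account for every
    * signal the primary has queued so far.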
    */
   flush_sync_points(primary);

   for (uint32_t i = 0; i < commandBufferCount; i++) {
      VK_FROM_HANDLE(panvk_cmd_buffer, secondary, pCommandBuffers[i]);

      /* make sure the CS context is set up properly to inherit the primary
       * command buffer state
       */
      primary->state.tls.info.tls.size =
         MAX2(primary->state.tls.info.tls.size,
              secondary->state.tls.info.tls.size);

      panvk_per_arch(cmd_prepare_exec_cmd_for_draws)(primary, secondary);

      for (uint32_t j = 0; j < ARRAY_SIZE(primary->state.cs); j++) {
         struct cs_builder *sec_b = panvk_get_cs_builder(secondary, j);

         assert(cs_is_valid(sec_b));

         if (!cs_is_empty(sec_b)) {
            struct cs_builder *prim_b = panvk_get_cs_builder(primary, j);
            struct cs_index addr = cs_scratch_reg64(prim_b, 0);
            struct cs_index size = cs_scratch_reg32(prim_b, 2);

            cs_move64_to(prim_b, addr, cs_root_chunk_gpu_addr(sec_b));
            cs_move32_to(prim_b, size, cs_root_chunk_size(sec_b));
            cs_call(prim_b, addr, size);

            struct u_trace *prim_ut = &primary->utrace.uts[j];
            struct u_trace *sec_ut = &secondary->utrace.uts[j];
            u_trace_clone_append(u_trace_begin_iterator(sec_ut),
                                 u_trace_end_iterator(sec_ut), prim_ut, prim_b,
                                 panvk_per_arch(utrace_copy_buffer));
         }
      }

      /* We need to propagate the suspending state of the secondary command
       * buffer if we want to avoid poisoning the reg file when the secondary
       * command buffer suspended the render pass.
       */
      if (secondary->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT)
         primary->state.gfx.render.flags = secondary->state.gfx.render.flags;

      /* If the render context we passed to the secondary command buffer got
       * invalidated, reset the FB/tiler descs and treat things as if we
       * suspended the render pass, since those descriptors have been
       * re-emitted by the secondary command buffer already.
       */
      if (secondary->state.gfx.render.invalidate_inherited_ctx) {
         memset(&primary->state.gfx.render.fbds, 0,
                sizeof(primary->state.gfx.render.fbds));
         primary->state.gfx.render.tiler = 0;
         primary->state.gfx.render.flags |= VK_RENDERING_RESUMING_BIT;
      }
   }

   /* From the Vulkan 1.3.275 spec:
    *
    *    "When secondary command buffer(s) are recorded to execute on a
    *    primary command buffer, the secondary command buffer inherits no
    *    state from the primary command buffer, and all state of the primary
    *    command buffer is undefined after an execute secondary command
    *    buffer command is recorded. There is one exception to this rule -
    *    if the primary command buffer is inside a render pass instance, then
    *    the render pass and subpass state is not disturbed by executing
    *    secondary command buffers. For state dependent commands (such as
    *    draws and dispatches), any state consumed by those commands must not
    *    be undefined."
    *
    * Therefore, it's the client's job to reset all the state in the primary
    * after the secondary executes. However, if we're doing any internal
    * dirty tracking, we may miss the fact that a secondary has messed with
    * GPU state if we don't invalidate all our internal tracking.
    */
   panvk_cmd_invalidate_state(primary);
}