/*
 * Copyright © 2021 Collabora Ltd.
 *
 * Derived from tu_cmd_buffer.c which is:
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "genxml/gen_macros.h"

#include "panvk_buffer.h"
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_pool.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_instance.h"
#include "panvk_physical_device.h"
#include "panvk_priv_bo.h"
#include "panvk_tracepoints.h"
#include "panvk_utrace.h"

#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_props.h"
#include "pan_samples.h"

#include "util/bitscan.h"
#include "vk_descriptor_update_template.h"
#include "vk_format.h"
#include "vk_synchronization.h"

static void
emit_tls(struct panvk_cmd_buffer *cmdbuf)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(dev->vk.physical);
   unsigned core_id_range;
   panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);

   if (cmdbuf->state.tls.info.tls.size) {
      unsigned thread_tls_alloc =
         panfrost_query_thread_tls_alloc(&phys_dev->kmod.props);
      unsigned size = panfrost_get_total_stack_size(
         cmdbuf->state.tls.info.tls.size, thread_tls_alloc, core_id_range);

      cmdbuf->state.tls.info.tls.ptr =
         panvk_cmd_alloc_dev_mem(cmdbuf, tls, size, 4096).gpu;
   }

   assert(!cmdbuf->state.tls.info.wls.size);

   if (cmdbuf->state.tls.desc.cpu) {
      GENX(pan_emit_tls)(&cmdbuf->state.tls.info, cmdbuf->state.tls.desc.cpu);
   }
}

/**
 * Write all sync point updates to seqno registers and reset the relative sync
 * points to 0.
 */
static void
flush_sync_points(struct panvk_cmd_buffer *cmdbuf)
{
   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
      struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i);

      if (!cs_is_valid(b)) {
         vk_command_buffer_set_error(&cmdbuf->vk,
                                     VK_ERROR_OUT_OF_DEVICE_MEMORY);
         return;
      }

      cs_update_progress_seqno(b) {
         for (uint32_t j = 0; j < PANVK_SUBQUEUE_COUNT; j++) {
            uint32_t rel_sync_point = cmdbuf->state.cs[j].relative_sync_point;

            if (!rel_sync_point)
               continue;

            cs_add64(b, cs_progress_seqno_reg(b, j), cs_progress_seqno_reg(b, j),
                     rel_sync_point);
         }
      }
   }

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++)
      cmdbuf->state.cs[i].relative_sync_point = 0;
}
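
/*
 * Worked example (illustrative, derived from the code above): if the barriers
 * recorded so far made the vertex/tiler subqueue signal its syncobj twice,
 * state.cs[PANVK_SUBQUEUE_VERTEX_TILER].relative_sync_point is 2, and every
 * valid CS builder emits one cs_add64() bumping that subqueue's progress
 * seqno register by 2 before the counter is reset to 0.
 */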

static void
finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_instance *instance =
      to_panvk_instance(dev->vk.physical->instance);
   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);

   /* We need a cache clean because descriptor/CS memory can be returned to
    * the command pool, where it gets recycled. If we don't clean dirty cache
    * lines, those cache lines might get evicted asynchronously and their
    * content pushed back to main memory after the CPU has written new data
    * there. */
   struct cs_index flush_id = cs_scratch_reg32(b, 0);

   cs_move32_to(b, flush_id, 0);
   cs_wait_slots(b, SB_ALL_MASK, false);
   cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN,
                   false, flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH)));
   cs_wait_slot(b, SB_ID(IMM_FLUSH), false);

   /* If we're in sync/trace mode, we signal the debug object. */
   if (instance->debug_flags & (PANVK_DEBUG_SYNC | PANVK_DEBUG_TRACE)) {
      struct cs_index debug_sync_addr = cs_scratch_reg64(b, 0);
      struct cs_index one = cs_scratch_reg32(b, 2);
      struct cs_index error = cs_scratch_reg32(b, 3);
      struct cs_index cmp_scratch = cs_scratch_reg32(b, 2);

      cs_move32_to(b, one, 1);
      cs_load64_to(b, debug_sync_addr, cs_subqueue_ctx_reg(b),
                   offsetof(struct panvk_cs_subqueue_context, debug.syncobjs));
      cs_wait_slot(b, SB_ID(LS), false);
      cs_add64(b, debug_sync_addr, debug_sync_addr,
               sizeof(struct panvk_cs_sync32) * subqueue);
      cs_load32_to(b, error, debug_sync_addr,
                   offsetof(struct panvk_cs_sync32, error));
      cs_wait_slots(b, SB_ALL_MASK, false);
      if (cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
         cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_CSG, one,
                       debug_sync_addr, cs_now());
      cs_match(b, error, cmp_scratch) {
         cs_case(b, 0) {
            /* Do nothing. */
         }

         cs_default(b) {
            /* Overwrite the sync error with the first error we encountered. */
            cs_store32(b, error, debug_sync_addr,
                       offsetof(struct panvk_cs_sync32, error));
            cs_wait_slot(b, SB_ID(LS), false);
         }
      }
   }

   /* If this is a secondary command buffer, we don't poison the reg file, to
    * preserve the render pass context. We also don't poison the reg file if
    * the last render pass was suspended. In practice we could preserve only
    * the registers that matter, but this is a debug feature so let's keep
    * things simple with this all-or-nothing approach. */
   if ((instance->debug_flags & PANVK_DEBUG_CS) &&
       cmdbuf->vk.level != VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
       !(cmdbuf->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT)) {
      cs_update_cmdbuf_regs(b) {
         /* Poison all cmdbuf registers to make sure we don't inherit state
          * from a previously executed cmdbuf. */
         for (uint32_t i = 0; i <= PANVK_CS_REG_SCRATCH_END; i++)
            cs_move32_to(b, cs_reg32(b, i), 0xdead | i << 24);
      }
   }

   trace_end_cmdbuf(&cmdbuf->utrace.uts[subqueue], cmdbuf, cmdbuf->flags);

   cs_finish(&cmdbuf->state.cs[subqueue].builder);
}

VKAPI_ATTR VkResult VKAPI_CALL
panvk_per_arch(EndCommandBuffer)(VkCommandBuffer commandBuffer)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);

   emit_tls(cmdbuf);
   flush_sync_points(cmdbuf);

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
      struct cs_builder *b = &cmdbuf->state.cs[i].builder;

      if (!cs_is_valid(b)) {
         vk_command_buffer_set_error(&cmdbuf->vk,
                                     VK_ERROR_OUT_OF_DEVICE_MEMORY);
      } else {
         finish_cs(cmdbuf, i);
      }
   }

   cmdbuf->flush_id = panthor_kmod_get_flush_id(dev->kmod.dev);

   return vk_command_buffer_end(&cmdbuf->vk);
}

static VkPipelineStageFlags2
get_subqueue_stages(enum panvk_subqueue_id subqueue)
{
   switch (subqueue) {
   case PANVK_SUBQUEUE_VERTEX_TILER:
      return VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
             VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
             VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
             VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT;
   case PANVK_SUBQUEUE_FRAGMENT:
      return VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
             VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
             VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT |
             VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
             VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_RESOLVE_BIT |
             VK_PIPELINE_STAGE_2_BLIT_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT;
   case PANVK_SUBQUEUE_COMPUTE:
      return VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
             VK_PIPELINE_STAGE_2_COPY_BIT;
   default:
      unreachable("Invalid subqueue id");
   }
}

static void
add_execution_dependency(uint32_t wait_masks[static PANVK_SUBQUEUE_COUNT],
                         VkPipelineStageFlags2 src_stages,
                         VkPipelineStageFlags2 dst_stages)
{
   /* convert stages to subqueues */
   uint32_t src_subqueues = 0;
   uint32_t dst_subqueues = 0;
   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      const VkPipelineStageFlags2 subqueue_stages = get_subqueue_stages(i);
      if (src_stages & subqueue_stages)
         src_subqueues |= BITFIELD_BIT(i);
      if (dst_stages & subqueue_stages)
         dst_subqueues |= BITFIELD_BIT(i);
   }

   const bool dst_host = dst_stages & VK_PIPELINE_STAGE_2_HOST_BIT;

   /* nothing to wait for */
   if (!src_subqueues || (!dst_subqueues && !dst_host))
      return;

   u_foreach_bit(i, dst_subqueues) {
      /* each dst subqueue should wait for all src subqueues */
      uint32_t wait_mask = src_subqueues;

      switch (i) {
      case PANVK_SUBQUEUE_VERTEX_TILER:
         /* Indirect draw buffers are read from the command stream, and
          * load/store operations are synchronized with the LS scoreboard
          * immediately after the read, so no need to wait in that case.
          */
         if ((src_stages & get_subqueue_stages(i)) ==
             VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT)
            wait_mask &= ~BITFIELD_BIT(i);
         break;
      case PANVK_SUBQUEUE_FRAGMENT:
         /* The fragment subqueue already waits for the tiler subqueue, so
          * explicit waits can be skipped.
          */
         wait_mask &= ~BITFIELD_BIT(PANVK_SUBQUEUE_VERTEX_TILER);
         break;
      default:
         break;
      }

      wait_masks[i] |= wait_mask;
   }

   /* The host does not wait for src subqueues.  All src subqueues should
    * self-wait instead.
    *
    * Also, our callers currently expect src subqueues to self-wait when there
    * are dst subqueues.  Until that changes, make all src subqueues self-wait.
    */
   if (dst_host || dst_subqueues) {
      u_foreach_bit(i, src_subqueues)
         wait_masks[i] |= BITFIELD_BIT(i);
   }
}
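
/*
 * Illustrative example for add_execution_dependency() (hypothetical barrier,
 * not from the spec): srcStageMask=COLOR_ATTACHMENT_OUTPUT with
 * dstStageMask=COMPUTE_SHADER maps to src_subqueues={FRAGMENT} and
 * dst_subqueues={COMPUTE}, so wait_masks[COMPUTE] gains the FRAGMENT bit and
 * the FRAGMENT subqueue is made to self-wait so its syncobj signal only lands
 * once the work the compute subqueue waits on has finished.
 */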

static void
add_memory_dependency(struct panvk_cache_flush_info *cache_flush,
                      VkAccessFlags2 src_access, VkAccessFlags2 dst_access)
{
   /* Note on the cache organization:
    *
    * - L2 cache is unified, so all changes to this cache are automatically
    *   visible to all GPU sub-components (shader cores, tiler, ...). This
    *   means we only need to flush when the host (AKA CPU) is involved.
    * - LS caches (which are basically just read-write L1 caches) are coherent
    *   with each other and with the L2 cache, so again, we only need to flush
    *   when the host is involved.
    * - Other read-only L1 caches (like the ones in front of the texture unit)
    *   are not coherent with the LS or L2 caches, and thus need to be
    *   invalidated any time a write happens.
    *
    * Translating to the Vulkan memory model:
    *
    * - The device domain is the L2 cache.
    * - An availability operation from device writes to the device domain is a
    *   no-op.
    * - A visibility operation from the device domain to device accesses that
    *   are coherent with L2/LS is a no-op.
    * - A visibility operation from the device domain to device accesses that
    *   are incoherent with L2/LS invalidates the other RO L1 caches.
    * - A host-to-device domain operation invalidates all caches.
    * - A device-to-host domain operation flushes L2/LS.
    */
   const VkAccessFlags2 ro_l1_access =
      VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT |
      VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
      VK_ACCESS_2_TRANSFER_READ_BIT | VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
      VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT;

   /* visibility op */
   if (dst_access & ro_l1_access)
      cache_flush->others |= true;

   /* host-to-device domain op */
   if (src_access & VK_ACCESS_2_HOST_WRITE_BIT) {
      cache_flush->l2 |= MALI_CS_FLUSH_MODE_CLEAN_AND_INVALIDATE;
      cache_flush->lsc |= MALI_CS_FLUSH_MODE_CLEAN_AND_INVALIDATE;
      cache_flush->others |= true;
   }

   /* device-to-host domain op */
   if (dst_access & (VK_ACCESS_2_HOST_READ_BIT | VK_ACCESS_2_HOST_WRITE_BIT)) {
      cache_flush->l2 |= MALI_CS_FLUSH_MODE_CLEAN;
      cache_flush->lsc |= MALI_CS_FLUSH_MODE_CLEAN;
   }
}
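
/*
 * Illustrative example (derived from the rules above): a barrier with
 * srcAccessMask=HOST_WRITE and dstAccessMask=SHADER_SAMPLED_READ is a
 * host-to-device domain operation followed by a visibility operation to an
 * incoherent RO L1 cache, so it requests CLEAN_AND_INVALIDATE on L2/LSC plus
 * an "others" invalidation. A device-to-device dependency between
 * L2/LS-coherent accesses (e.g. SHADER_STORAGE_WRITE -> SHADER_STORAGE_READ)
 * requests no cache maintenance at all.
 */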

static bool
should_split_render_pass(const uint32_t wait_masks[static PANVK_SUBQUEUE_COUNT],
                         VkAccessFlags2 src_access, VkAccessFlags2 dst_access)
{
   /* From the Vulkan 1.3.301 spec:
    *
    *    VUID-vkCmdPipelineBarrier-None-07892
    *
    *    "If vkCmdPipelineBarrier is called within a render pass instance, the
    *    source and destination stage masks of any memory barriers must only
    *    include graphics pipeline stages"
    *
    * We only consider the tiler and the fragment subqueues here.
    */

   /* split if the tiler subqueue waits for the fragment subqueue */
   if (wait_masks[PANVK_SUBQUEUE_VERTEX_TILER] &
       BITFIELD_BIT(PANVK_SUBQUEUE_FRAGMENT))
      return true;

   /* split if the fragment subqueue self-waits with a feedback loop, because
    * we lower subpassLoad to texelFetch
    */
   if ((wait_masks[PANVK_SUBQUEUE_FRAGMENT] &
        BITFIELD_BIT(PANVK_SUBQUEUE_FRAGMENT)) &&
       (src_access & (VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
                      VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT)) &&
       (dst_access & VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT))
      return true;

   return false;
}
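
/*
 * Illustrative example: inside a render pass, a self-dependency with
 * srcAccessMask=COLOR_ATTACHMENT_WRITE and dstAccessMask=INPUT_ATTACHMENT_READ
 * (the classic subpassLoad feedback loop) makes the fragment subqueue
 * self-wait and thus returns true, which the caller turns into a draw flush
 * before the barrier takes effect.
 */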

static void
collect_cache_flush_info(enum panvk_subqueue_id subqueue,
                         struct panvk_cache_flush_info *cache_flush,
                         VkAccessFlags2 src_access, VkAccessFlags2 dst_access)
{
   /* limit access to the subqueue and host */
   const VkPipelineStageFlags2 subqueue_stages =
      get_subqueue_stages(subqueue) | VK_PIPELINE_STAGE_2_HOST_BIT;
   src_access = vk_filter_src_access_flags2(subqueue_stages, src_access);
   dst_access = vk_filter_dst_access_flags2(subqueue_stages, dst_access);

   add_memory_dependency(cache_flush, src_access, dst_access);
}

static void
collect_cs_deps(struct panvk_cmd_buffer *cmdbuf,
                VkPipelineStageFlags2 src_stages,
                VkPipelineStageFlags2 dst_stages, VkAccessFlags2 src_access,
                VkAccessFlags2 dst_access, struct panvk_cs_deps *deps)
{
   uint32_t wait_masks[PANVK_SUBQUEUE_COUNT] = {0};
   add_execution_dependency(wait_masks, src_stages, dst_stages);

   /* within a render pass */
   if (cmdbuf->state.gfx.render.tiler) {
      if (should_split_render_pass(wait_masks, src_access, dst_access)) {
         deps->needs_draw_flush = true;
      } else {
         /* skip the tiler subqueue self-wait because we use the same
          * scoreboard slot for the idvs jobs
          */
         wait_masks[PANVK_SUBQUEUE_VERTEX_TILER] &=
            ~BITFIELD_BIT(PANVK_SUBQUEUE_VERTEX_TILER);

         /* skip the fragment subqueue self-wait because we emit the fragment
          * job at the end of the render pass and there is nothing to wait on
          * yet
          */
         wait_masks[PANVK_SUBQUEUE_FRAGMENT] &=
            ~BITFIELD_BIT(PANVK_SUBQUEUE_FRAGMENT);
      }
   }

   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      if (wait_masks[i] & BITFIELD_BIT(i)) {
         /* We need to self-wait for all previously submitted jobs, and given
          * the iterator scoreboard is a moving target, we just wait for the
          * whole dynamic scoreboard range.
          */
         deps->src[i].wait_sb_mask |= SB_ALL_ITERS_MASK;
      }

      collect_cache_flush_info(i, &deps->src[i].cache_flush, src_access,
                               dst_access);

      deps->dst[i].wait_subqueue_mask |= wait_masks[i];
   }
}
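
/*
 * Illustrative summary of the resulting panvk_cs_deps for the feedback-loop
 * example above, assuming we are outside a render pass: the fragment subqueue
 * self-waits, so deps->src[FRAGMENT].wait_sb_mask covers the whole iterator
 * scoreboard range and deps->src[FRAGMENT].cache_flush.others requests an RO
 * L1 invalidation; the matching self-bit in deps->dst[FRAGMENT] is later
 * dropped by CmdPipelineBarrier2() because the scoreboard wait already covers
 * it.
 */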

static void
normalize_dependency(VkPipelineStageFlags2 *src_stages,
                     VkPipelineStageFlags2 *dst_stages,
                     VkAccessFlags2 *src_access, VkAccessFlags2 *dst_access,
                     uint32_t src_qfi, uint32_t dst_qfi)
{
   /* queue family acquire operation */
   switch (src_qfi) {
   case VK_QUEUE_FAMILY_EXTERNAL:
      /* no execution dependency and no availability operation */
      *src_stages = VK_PIPELINE_STAGE_2_NONE;
      *src_access = VK_ACCESS_2_NONE;
      break;
   case VK_QUEUE_FAMILY_FOREIGN_EXT:
      /* treat the foreign queue as the host */
      *src_stages = VK_PIPELINE_STAGE_2_HOST_BIT;
      *src_access = VK_ACCESS_2_HOST_WRITE_BIT;
      break;
   default:
      break;
   }

   /* queue family release operation */
   switch (dst_qfi) {
   case VK_QUEUE_FAMILY_EXTERNAL:
      /* no execution dependency and no visibility operation */
      *dst_stages = VK_PIPELINE_STAGE_2_NONE;
      *dst_access = VK_ACCESS_2_NONE;
      break;
   case VK_QUEUE_FAMILY_FOREIGN_EXT:
      /* treat the foreign queue as the host */
      *dst_stages = VK_PIPELINE_STAGE_2_HOST_BIT;
      *dst_access = VK_ACCESS_2_HOST_WRITE_BIT;
      break;
   default:
      break;
   }

   *src_stages = vk_expand_src_stage_flags2(*src_stages);
   *dst_stages = vk_expand_dst_stage_flags2(*dst_stages);

   *src_access = vk_filter_src_access_flags2(*src_stages, *src_access);
   *dst_access = vk_filter_dst_access_flags2(*dst_stages, *dst_access);
}
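
/*
 * Illustrative example: acquiring a resource from VK_QUEUE_FAMILY_FOREIGN_EXT
 * (src_qfi) is treated as if the host had written it, i.e. src_stages becomes
 * HOST and src_access becomes HOST_WRITE, which add_memory_dependency() turns
 * into a full cache invalidation. Acquiring from VK_QUEUE_FAMILY_EXTERNAL
 * instead drops the source half entirely, since the availability operation is
 * the releasing queue's responsibility.
 */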

void
panvk_per_arch(get_cs_deps)(struct panvk_cmd_buffer *cmdbuf,
                            const VkDependencyInfo *in,
                            struct panvk_cs_deps *out)
{
   memset(out, 0, sizeof(*out));

   for (uint32_t i = 0; i < in->memoryBarrierCount; i++) {
      const VkMemoryBarrier2 *barrier = &in->pMemoryBarriers[i];
      VkPipelineStageFlags2 src_stages = barrier->srcStageMask;
      VkPipelineStageFlags2 dst_stages = barrier->dstStageMask;
      VkAccessFlags2 src_access = barrier->srcAccessMask;
      VkAccessFlags2 dst_access = barrier->dstAccessMask;
      normalize_dependency(&src_stages, &dst_stages, &src_access, &dst_access,
                           VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED);

      collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
                      out);
   }

   for (uint32_t i = 0; i < in->bufferMemoryBarrierCount; i++) {
      const VkBufferMemoryBarrier2 *barrier = &in->pBufferMemoryBarriers[i];
      VkPipelineStageFlags2 src_stages = barrier->srcStageMask;
      VkPipelineStageFlags2 dst_stages = barrier->dstStageMask;
      VkAccessFlags2 src_access = barrier->srcAccessMask;
      VkAccessFlags2 dst_access = barrier->dstAccessMask;
      normalize_dependency(&src_stages, &dst_stages, &src_access, &dst_access,
                           barrier->srcQueueFamilyIndex,
                           barrier->dstQueueFamilyIndex);

      collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
                      out);
   }

   for (uint32_t i = 0; i < in->imageMemoryBarrierCount; i++) {
      const VkImageMemoryBarrier2 *barrier = &in->pImageMemoryBarriers[i];
      VkPipelineStageFlags2 src_stages = barrier->srcStageMask;
      VkPipelineStageFlags2 dst_stages = barrier->dstStageMask;
      VkAccessFlags2 src_access = barrier->srcAccessMask;
      VkAccessFlags2 dst_access = barrier->dstAccessMask;
      normalize_dependency(&src_stages, &dst_stages, &src_access, &dst_access,
                           barrier->srcQueueFamilyIndex,
                           barrier->dstQueueFamilyIndex);

      collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
                      out);
   }
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
                                    const VkDependencyInfo *pDependencyInfo)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   struct panvk_cs_deps deps;

   panvk_per_arch(get_cs_deps)(cmdbuf, pDependencyInfo, &deps);

   if (deps.needs_draw_flush)
      panvk_per_arch(cmd_flush_draws)(cmdbuf);

   uint32_t wait_subqueue_mask = 0;
   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      /* no need to perform both types of waits on the same subqueue */
      if (deps.src[i].wait_sb_mask)
         deps.dst[i].wait_subqueue_mask &= ~BITFIELD_BIT(i);
      assert(!(deps.dst[i].wait_subqueue_mask & BITFIELD_BIT(i)));

      wait_subqueue_mask |= deps.dst[i].wait_subqueue_mask;
   }

   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i);
      struct panvk_cs_state *cs_state = &cmdbuf->state.cs[i];

      if (deps.src[i].wait_sb_mask)
         cs_wait_slots(b, deps.src[i].wait_sb_mask, false);

      struct panvk_cache_flush_info cache_flush = deps.src[i].cache_flush;
      if (cache_flush.l2 != MALI_CS_FLUSH_MODE_NONE ||
          cache_flush.lsc != MALI_CS_FLUSH_MODE_NONE || cache_flush.others) {
         struct cs_index flush_id = cs_scratch_reg32(b, 0);

         cs_move32_to(b, flush_id, 0);
         cs_flush_caches(b, cache_flush.l2, cache_flush.lsc, cache_flush.others,
                         flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH)));
         cs_wait_slot(b, SB_ID(IMM_FLUSH), false);
      }

      /* If no one waits on us, there's no point signaling the sync object. */
      if (wait_subqueue_mask & BITFIELD_BIT(i)) {
         struct cs_index sync_addr = cs_scratch_reg64(b, 0);
         struct cs_index add_val = cs_scratch_reg64(b, 2);

         assert(deps.src[i].wait_sb_mask);

         cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
                      offsetof(struct panvk_cs_subqueue_context, syncobjs));
         cs_wait_slot(b, SB_ID(LS), false);
         cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * i);
         cs_move64_to(b, add_val, 1);
         cs_sync64_add(b, false, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,
                       cs_now());
         ++cs_state->relative_sync_point;
      }
   }

   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i);
      u_foreach_bit(j, deps.dst[i].wait_subqueue_mask) {
         struct panvk_cs_state *cs_state = &cmdbuf->state.cs[j];
         struct cs_index sync_addr = cs_scratch_reg64(b, 0);
         struct cs_index wait_val = cs_scratch_reg64(b, 2);

         cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
                      offsetof(struct panvk_cs_subqueue_context, syncobjs));
         cs_wait_slot(b, SB_ID(LS), false);
         cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * j);

         cs_add64(b, wait_val, cs_progress_seqno_reg(b, j),
                  cs_state->relative_sync_point);
         cs_sync64_wait(b, false, MALI_CS_CONDITION_GREATER, wait_val,
                        sync_addr);
      }
   }
}
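
/*
 * Illustrative flow for a cross-subqueue barrier (e.g. compute -> vertex
 * shader): the compute subqueue waits on its iterator scoreboards, performs
 * any requested cache maintenance, then adds 1 to its syncobj and bumps its
 * relative sync point, while the vertex/tiler subqueue computes the expected
 * seqno from cs_progress_seqno_reg() plus that pending relative sync point
 * and waits on the compute syncobj to reach it.
 */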

void
panvk_per_arch(cs_pick_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
                                enum panvk_subqueue_id subqueue)
{
   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
   struct cs_index iter_sb = cs_scratch_reg32(b, 0);
   struct cs_index cmp_scratch = cs_scratch_reg32(b, 1);

   cs_load32_to(b, iter_sb, cs_subqueue_ctx_reg(b),
                offsetof(struct panvk_cs_subqueue_context, iter_sb));
   cs_wait_slot(b, SB_ID(LS), false);

   cs_match(b, iter_sb, cmp_scratch) {
#define CASE(x)                                                                \
      cs_case(b, x) {                                                          \
         cs_wait_slot(b, SB_ITER(x), false);                                   \
         cs_set_scoreboard_entry(b, SB_ITER(x), SB_ID(LS));                    \
      }

      CASE(0)
      CASE(1)
      CASE(2)
      CASE(3)
      CASE(4)
#undef CASE
   }
}
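
/*
 * Sketch of what the match above expands to (assuming the subqueue context
 * rotates through the five iterator scoreboards listed in the CASE() calls):
 * if iter_sb currently reads 2, the builder waits for SB_ITER(2) to drain and
 * then selects it via cs_set_scoreboard_entry() for the jobs that follow.
 */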

static struct cs_buffer
alloc_cs_buffer(void *cookie)
{
   struct panvk_cmd_buffer *cmdbuf = cookie;
   const unsigned capacity = 64 * 1024 / sizeof(uint64_t);

   struct panfrost_ptr ptr =
      panvk_cmd_alloc_dev_mem(cmdbuf, cs, capacity * 8, 64);

   return (struct cs_buffer){
      .cpu = ptr.cpu,
      .gpu = ptr.gpu,
      .capacity = capacity,
   };
}
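
/*
 * Sizing note (derived from the constants above): capacity is expressed in
 * 64-bit CS instructions, so each chunk holds 64 * 1024 / 8 = 8192
 * instructions, and the allocation passed to panvk_cmd_alloc_dev_mem()
 * (capacity * 8) is the same 64 KiB, aligned to 64 bytes.
 */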

static enum cs_reg_perm
cs_reg_perm(struct cs_builder *b, unsigned reg)
{
   struct panvk_cs_state *cs_state =
      container_of(b, struct panvk_cs_state, builder);
   struct panvk_cs_reg_upd_context *upd_ctx;

   for (upd_ctx = cs_state->reg_access.upd_ctx_stack; upd_ctx;
        upd_ctx = upd_ctx->next) {
      if (upd_ctx->reg_perm(b, reg) == CS_REG_RW)
         return CS_REG_RW;
   }

   return cs_state->reg_access.base_perm(b, reg);
}

static void
init_cs_builders(struct panvk_cmd_buffer *cmdbuf)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_instance *instance =
      to_panvk_instance(dev->vk.physical->instance);
   const reg_perm_cb_t base_reg_perms[PANVK_SUBQUEUE_COUNT] = {
      [PANVK_SUBQUEUE_VERTEX_TILER] = panvk_cs_vt_reg_perm,
      [PANVK_SUBQUEUE_FRAGMENT] = panvk_cs_frag_reg_perm,
      [PANVK_SUBQUEUE_COMPUTE] = panvk_cs_compute_reg_perm,
   };

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
      struct cs_builder *b = &cmdbuf->state.cs[i].builder;
      /* Lazy allocation of the root CS. */
      struct cs_buffer root_cs = {0};

      struct cs_builder_conf conf = {
         .nr_registers = 96,
         .nr_kernel_registers = 4,
         .alloc_buffer = alloc_cs_buffer,
         .cookie = cmdbuf,
      };

      if (instance->debug_flags & PANVK_DEBUG_CS) {
         cmdbuf->state.cs[i].ls_tracker = (struct cs_load_store_tracker){
            .sb_slot = SB_ID(LS),
         };

         conf.ls_tracker = &cmdbuf->state.cs[i].ls_tracker;

         cmdbuf->state.cs[i].reg_access.upd_ctx_stack = NULL;
         cmdbuf->state.cs[i].reg_access.base_perm = base_reg_perms[i];
         conf.reg_perm = cs_reg_perm;
      }

      cs_builder_init(b, &conf, root_cs);

      if (instance->debug_flags & PANVK_DEBUG_TRACE) {
         cmdbuf->state.cs[i].tracing = (struct cs_tracing_ctx){
            .enabled = true,
            .ctx_reg = cs_subqueue_ctx_reg(b),
            .tracebuf_addr_offset =
               offsetof(struct panvk_cs_subqueue_context, debug.tracebuf.cs),
            .ls_sb_slot = SB_ID(LS),
         };
      }
   }
}

static void
panvk_reset_cmdbuf(struct vk_command_buffer *vk_cmdbuf,
                   VkCommandBufferResetFlags flags)
{
   struct panvk_cmd_buffer *cmdbuf =
      container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);
   struct panvk_cmd_pool *pool =
      container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk);
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);

   vk_command_buffer_reset(&cmdbuf->vk);

   panvk_pool_reset(&cmdbuf->cs_pool);
   panvk_pool_reset(&cmdbuf->desc_pool);
   panvk_pool_reset(&cmdbuf->tls_pool);
   list_splicetail(&cmdbuf->push_sets, &pool->push_sets);
   list_inithead(&cmdbuf->push_sets);

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->utrace.uts); i++) {
      struct u_trace *ut = &cmdbuf->utrace.uts[i];
      u_trace_fini(ut);
      u_trace_init(ut, &dev->utrace.utctx);
   }

   memset(&cmdbuf->state, 0, sizeof(cmdbuf->state));
   init_cs_builders(cmdbuf);
}

static void
panvk_destroy_cmdbuf(struct vk_command_buffer *vk_cmdbuf)
{
   struct panvk_cmd_buffer *cmdbuf =
      container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);
   struct panvk_cmd_pool *pool =
      container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk);
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->utrace.uts); i++)
      u_trace_fini(&cmdbuf->utrace.uts[i]);

   panvk_pool_cleanup(&cmdbuf->cs_pool);
   panvk_pool_cleanup(&cmdbuf->desc_pool);
   panvk_pool_cleanup(&cmdbuf->tls_pool);
   list_splicetail(&cmdbuf->push_sets, &pool->push_sets);
   vk_command_buffer_finish(&cmdbuf->vk);
   vk_free(&dev->vk.alloc, cmdbuf);
}

static VkResult
panvk_create_cmdbuf(struct vk_command_pool *vk_pool, VkCommandBufferLevel level,
                    struct vk_command_buffer **cmdbuf_out)
{
   struct panvk_device *device =
      container_of(vk_pool->base.device, struct panvk_device, vk);
   struct panvk_cmd_pool *pool =
      container_of(vk_pool, struct panvk_cmd_pool, vk);
   struct panvk_cmd_buffer *cmdbuf;

   cmdbuf = vk_zalloc(&device->vk.alloc, sizeof(*cmdbuf), 8,
                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!cmdbuf)
      return panvk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = vk_command_buffer_init(
      &pool->vk, &cmdbuf->vk, &panvk_per_arch(cmd_buffer_ops), level);
   if (result != VK_SUCCESS) {
      vk_free(&device->vk.alloc, cmdbuf);
      return result;
   }

   list_inithead(&cmdbuf->push_sets);
   cmdbuf->vk.dynamic_graphics_state.vi = &cmdbuf->state.gfx.dynamic.vi;
   cmdbuf->vk.dynamic_graphics_state.ms.sample_locations =
      &cmdbuf->state.gfx.dynamic.sl;

   struct panvk_pool_properties cs_pool_props = {
      .create_flags = 0,
      .slab_size = 64 * 1024,
      .label = "Command buffer CS pool",
      .prealloc = false,
      .owns_bos = true,
      .needs_locking = false,
   };
   panvk_pool_init(&cmdbuf->cs_pool, device, &pool->cs_bo_pool, &cs_pool_props);

   struct panvk_pool_properties desc_pool_props = {
      .create_flags = 0,
      .slab_size = 64 * 1024,
      .label = "Command buffer descriptor pool",
      .prealloc = false,
      .owns_bos = true,
      .needs_locking = false,
   };
   panvk_pool_init(&cmdbuf->desc_pool, device, &pool->desc_bo_pool,
                   &desc_pool_props);

   struct panvk_pool_properties tls_pool_props = {
      .create_flags =
         panvk_device_adjust_bo_flags(device, PAN_KMOD_BO_FLAG_NO_MMAP),
      .slab_size = 64 * 1024,
      .label = "TLS pool",
      .prealloc = false,
      .owns_bos = true,
      .needs_locking = false,
   };
   panvk_pool_init(&cmdbuf->tls_pool, device, &pool->tls_bo_pool,
                   &tls_pool_props);

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->utrace.uts); i++)
      u_trace_init(&cmdbuf->utrace.uts[i], &device->utrace.utctx);

   init_cs_builders(cmdbuf);
   *cmdbuf_out = &cmdbuf->vk;
   return VK_SUCCESS;
}

const struct vk_command_buffer_ops panvk_per_arch(cmd_buffer_ops) = {
   .create = panvk_create_cmdbuf,
   .reset = panvk_reset_cmdbuf,
   .destroy = panvk_destroy_cmdbuf,
};

VKAPI_ATTR VkResult VKAPI_CALL
panvk_per_arch(BeginCommandBuffer)(VkCommandBuffer commandBuffer,
                                   const VkCommandBufferBeginInfo *pBeginInfo)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   struct panvk_instance *instance =
      to_panvk_instance(cmdbuf->vk.base.device->physical->instance);

   vk_command_buffer_begin(&cmdbuf->vk, pBeginInfo);
   cmdbuf->flags = pBeginInfo->flags;

   if (instance->debug_flags & PANVK_DEBUG_FORCE_SIMULTANEOUS) {
      cmdbuf->flags |= VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
      cmdbuf->flags &= ~VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
   }

   panvk_per_arch(cmd_inherit_render_state)(cmdbuf, pBeginInfo);

   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++)
      trace_begin_cmdbuf(&cmdbuf->utrace.uts[i], cmdbuf);

   return VK_SUCCESS;
}

static void
panvk_cmd_invalidate_state(struct panvk_cmd_buffer *cmdbuf)
{
   /* From the Vulkan 1.3.275 spec:
    *
    *    "...There is one exception to this rule - if the primary command
    *    buffer is inside a render pass instance, then the render pass and
    *    subpass state is not disturbed by executing secondary command
    *    buffers."
    *
    * We need to reset everything EXCEPT the render pass state.
    */
   struct panvk_rendering_state render_save = cmdbuf->state.gfx.render;
   memset(&cmdbuf->state.gfx, 0, sizeof(cmdbuf->state.gfx));
   cmdbuf->state.gfx.render = render_save;

   vk_dynamic_graphics_state_dirty_all(&cmdbuf->vk.dynamic_graphics_state);
   gfx_state_set_all_dirty(cmdbuf);
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdExecuteCommands)(VkCommandBuffer commandBuffer,
                                   uint32_t commandBufferCount,
                                   const VkCommandBuffer *pCommandBuffers)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, primary, commandBuffer);

   if (commandBufferCount == 0)
      return;

   /* Write out any pending seqno changes to registers before calling
    * secondary command buffers. */
   flush_sync_points(primary);

   for (uint32_t i = 0; i < commandBufferCount; i++) {
      VK_FROM_HANDLE(panvk_cmd_buffer, secondary, pCommandBuffers[i]);

      /* make sure the CS context is set up properly
       * to inherit the primary command buffer state
       */
      primary->state.tls.info.tls.size =
         MAX2(primary->state.tls.info.tls.size,
              secondary->state.tls.info.tls.size);
      panvk_per_arch(cmd_prepare_exec_cmd_for_draws)(primary, secondary);

      for (uint32_t j = 0; j < ARRAY_SIZE(primary->state.cs); j++) {
         struct cs_builder *sec_b = panvk_get_cs_builder(secondary, j);
         assert(cs_is_valid(sec_b));
         if (!cs_is_empty(sec_b)) {
            struct cs_builder *prim_b = panvk_get_cs_builder(primary, j);
            struct cs_index addr = cs_scratch_reg64(prim_b, 0);
            struct cs_index size = cs_scratch_reg32(prim_b, 2);
            cs_move64_to(prim_b, addr, cs_root_chunk_gpu_addr(sec_b));
            cs_move32_to(prim_b, size, cs_root_chunk_size(sec_b));
            cs_call(prim_b, addr, size);

            struct u_trace *prim_ut = &primary->utrace.uts[j];
            struct u_trace *sec_ut = &secondary->utrace.uts[j];
            u_trace_clone_append(u_trace_begin_iterator(sec_ut),
                                 u_trace_end_iterator(sec_ut), prim_ut, prim_b,
                                 panvk_per_arch(utrace_copy_buffer));
         }
      }

      /* We need to propagate the suspending state of the secondary command
       * buffer if we want to avoid poisoning the reg file when the secondary
       * command buffer suspended the render pass. */
      if (secondary->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT)
         primary->state.gfx.render.flags = secondary->state.gfx.render.flags;

      /* If the render context we passed to the secondary command buffer got
       * invalidated, reset the FB/tiler descs and treat things as if we
       * suspended the render pass, since those descriptors have been
       * re-emitted by the secondary command buffer already. */
      if (secondary->state.gfx.render.invalidate_inherited_ctx) {
         memset(&primary->state.gfx.render.fbds, 0,
                sizeof(primary->state.gfx.render.fbds));
         primary->state.gfx.render.tiler = 0;
         primary->state.gfx.render.flags |= VK_RENDERING_RESUMING_BIT;
      }
   }

   /* From the Vulkan 1.3.275 spec:
    *
    *    "When secondary command buffer(s) are recorded to execute on a
    *    primary command buffer, the secondary command buffer inherits no
    *    state from the primary command buffer, and all state of the primary
    *    command buffer is undefined after an execute secondary command buffer
    *    command is recorded. There is one exception to this rule - if the
    *    primary command buffer is inside a render pass instance, then the
    *    render pass and subpass state is not disturbed by executing secondary
    *    command buffers. For state dependent commands (such as draws and
    *    dispatches), any state consumed by those commands must not be
    *    undefined."
    *
    * Therefore, it's the client's job to reset all the state in the primary
    * after the secondary executes.  However, if we're doing any internal
    * dirty tracking, we may miss the fact that a secondary has messed with
    * GPU state if we don't invalidate all our internal tracking.
    */
   panvk_cmd_invalidate_state(primary);
}
972