/*
 * Copyright © 2021 Collabora Ltd.
 *
 * Derived from tu_cmd_buffer.c which is:
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "genxml/gen_macros.h"

#include "panvk_buffer.h"
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_pool.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_instance.h"
#include "panvk_physical_device.h"
#include "panvk_priv_bo.h"
#include "panvk_tracepoints.h"
#include "panvk_utrace.h"

#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_props.h"
#include "pan_samples.h"

#include "util/bitscan.h"
#include "vk_descriptor_update_template.h"
#include "vk_format.h"
#include "vk_synchronization.h"

static void
emit_tls(struct panvk_cmd_buffer *cmdbuf)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(dev->vk.physical);
   unsigned core_id_range;
   panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);

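   /* panvk_get_total_stack_size() scales the per-thread TLS requirement by
    * the thread and core counts queried above, so a single allocation covers
    * every thread that may run concurrently. */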
   if (cmdbuf->state.tls.info.tls.size) {
      unsigned thread_tls_alloc =
         panfrost_query_thread_tls_alloc(&phys_dev->kmod.props);
      unsigned size = panfrost_get_total_stack_size(
         cmdbuf->state.tls.info.tls.size, thread_tls_alloc, core_id_range);

      cmdbuf->state.tls.info.tls.ptr =
         panvk_cmd_alloc_dev_mem(cmdbuf, tls, size, 4096).gpu;
   }

   assert(!cmdbuf->state.tls.info.wls.size);

   if (cmdbuf->state.tls.desc.cpu) {
      GENX(pan_emit_tls)(&cmdbuf->state.tls.info, cmdbuf->state.tls.desc.cpu);
   }
}

/**
 * Write all sync point updates to seqno registers and reset the relative sync
 * points to 0.
 */
static void
flush_sync_points(struct panvk_cmd_buffer *cmdbuf)
{
   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
      struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i);

      if (!cs_is_valid(b)) {
         vk_command_buffer_set_error(&cmdbuf->vk,
                                     VK_ERROR_OUT_OF_DEVICE_MEMORY);
         return;
      }

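      /* Each subqueue's command stream tracks every subqueue's progress seqno
       * in registers; add the accumulated relative sync points to those
       * registers. */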
      cs_update_progress_seqno(b) {
         for (uint32_t j = 0; j < PANVK_SUBQUEUE_COUNT; j++) {
            uint32_t rel_sync_point = cmdbuf->state.cs[j].relative_sync_point;

            if (!rel_sync_point)
               continue;

            cs_add64(b, cs_progress_seqno_reg(b, j), cs_progress_seqno_reg(b, j),
                     rel_sync_point);
         }
      }
   }

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++)
      cmdbuf->state.cs[i].relative_sync_point = 0;
}

static void
finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_instance *instance =
      to_panvk_instance(dev->vk.physical->instance);
   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);

   /* We need a clean because descriptor/CS memory can be returned to the
    * command pool where it gets recycled. If we don't clean dirty cache lines,
    * those cache lines might get evicted asynchronously and their content
    * pushed back to main memory after the CPU has written new data there. */
   struct cs_index flush_id = cs_scratch_reg32(b, 0);

   cs_move32_to(b, flush_id, 0);
   cs_wait_slots(b, SB_ALL_MASK, false);
   cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN,
                   false, flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH)));
   cs_wait_slot(b, SB_ID(IMM_FLUSH), false);

   /* If we're in sync/trace mode, we signal the debug object. */
   if (instance->debug_flags & (PANVK_DEBUG_SYNC | PANVK_DEBUG_TRACE)) {
      struct cs_index debug_sync_addr = cs_scratch_reg64(b, 0);
      struct cs_index one = cs_scratch_reg32(b, 2);
      struct cs_index error = cs_scratch_reg32(b, 3);
      struct cs_index cmp_scratch = cs_scratch_reg32(b, 2);

      cs_move32_to(b, one, 1);
      cs_load64_to(b, debug_sync_addr, cs_subqueue_ctx_reg(b),
                   offsetof(struct panvk_cs_subqueue_context, debug.syncobjs));
      cs_wait_slot(b, SB_ID(LS), false);
      cs_add64(b, debug_sync_addr, debug_sync_addr,
               sizeof(struct panvk_cs_sync32) * subqueue);
      cs_load32_to(b, error, debug_sync_addr,
                   offsetof(struct panvk_cs_sync32, error));
      cs_wait_slots(b, SB_ALL_MASK, false);
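      /* Secondary command buffers execute from within a primary, so only
       * primaries bump the debug sync object. */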
      if (cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
         cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_CSG, one,
                       debug_sync_addr, cs_now());
      cs_match(b, error, cmp_scratch) {
         cs_case(b, 0) {
            /* Do nothing. */
         }

         cs_default(b) {
            /* Overwrite the sync error with the first error we encountered. */
            cs_store32(b, error, debug_sync_addr,
                       offsetof(struct panvk_cs_sync32, error));
            cs_wait_slot(b, SB_ID(LS), false);
         }
      }
   }

   /* If this is a secondary command buffer, we don't poison the reg file to
    * preserve the render pass context. We also don't poison the reg file if
    * the last render pass was suspended. In practice we could preserve only
    * the registers that matter, but this is a debug feature so let's keep
    * things simple with this all-or-nothing approach. */
   if ((instance->debug_flags & PANVK_DEBUG_CS) &&
       cmdbuf->vk.level != VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
       !(cmdbuf->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT)) {
      cs_update_cmdbuf_regs(b) {
         /* Poison all cmdbuf registers to make sure we don't inherit state
          * from a previously executed cmdbuf. */
         for (uint32_t i = 0; i <= PANVK_CS_REG_SCRATCH_END; i++)
            cs_move32_to(b, cs_reg32(b, i), 0xdead | i << 24);
      }
   }

   trace_end_cmdbuf(&cmdbuf->utrace.uts[subqueue], cmdbuf, cmdbuf->flags);

   cs_finish(&cmdbuf->state.cs[subqueue].builder);
}

VKAPI_ATTR VkResult VKAPI_CALL
panvk_per_arch(EndCommandBuffer)(VkCommandBuffer commandBuffer)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);

   emit_tls(cmdbuf);
   flush_sync_points(cmdbuf);

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
      struct cs_builder *b = &cmdbuf->state.cs[i].builder;

      if (!cs_is_valid(b)) {
         vk_command_buffer_set_error(&cmdbuf->vk,
                                     VK_ERROR_OUT_OF_DEVICE_MEMORY);
      } else {
         finish_cs(cmdbuf, i);
      }
   }

   cmdbuf->flush_id = panthor_kmod_get_flush_id(dev->kmod.dev);

   return vk_command_buffer_end(&cmdbuf->vk);
}

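/*
 * Map Vulkan pipeline stages to the panvk subqueue that executes them. Note
 * that the copy stage shows up on both the fragment and compute subqueues,
 * since transfers can be executed on either.
 */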
static VkPipelineStageFlags2
get_subqueue_stages(enum panvk_subqueue_id subqueue)
{
   switch (subqueue) {
   case PANVK_SUBQUEUE_VERTEX_TILER:
      return VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
             VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
             VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
             VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT;
   case PANVK_SUBQUEUE_FRAGMENT:
      return VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
             VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
             VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT |
             VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
             VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_RESOLVE_BIT |
             VK_PIPELINE_STAGE_2_BLIT_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT;
   case PANVK_SUBQUEUE_COMPUTE:
      return VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
             VK_PIPELINE_STAGE_2_COPY_BIT;
   default:
      unreachable("Invalid subqueue id");
   }
}

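/*
 * Turn a src/dst stage pair into per-subqueue wait masks: bit j of
 * wait_masks[i] means subqueue i must wait for subqueue j to reach its sync
 * point before the destination work can start.
 */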
static void
add_execution_dependency(uint32_t wait_masks[static PANVK_SUBQUEUE_COUNT],
                         VkPipelineStageFlags2 src_stages,
                         VkPipelineStageFlags2 dst_stages)
{
   /* convert stages to subqueues */
   uint32_t src_subqueues = 0;
   uint32_t dst_subqueues = 0;
   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      const VkPipelineStageFlags2 subqueue_stages = get_subqueue_stages(i);
      if (src_stages & subqueue_stages)
         src_subqueues |= BITFIELD_BIT(i);
      if (dst_stages & subqueue_stages)
         dst_subqueues |= BITFIELD_BIT(i);
   }

   const bool dst_host = dst_stages & VK_PIPELINE_STAGE_2_HOST_BIT;

   /* nothing to wait for */
   if (!src_subqueues || (!dst_subqueues && !dst_host))
      return;

   u_foreach_bit(i, dst_subqueues) {
      /* each dst subqueue should wait for all src subqueues */
      uint32_t wait_mask = src_subqueues;

      switch (i) {
      case PANVK_SUBQUEUE_VERTEX_TILER:
         /* Indirect draw buffers are read from the command stream, and
          * load/store operations are synchronized with the LS scoreboard
          * immediately after the read, so no need to wait in that case.
          */
         if ((src_stages & get_subqueue_stages(i)) ==
             VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT)
            wait_mask &= ~BITFIELD_BIT(i);
         break;
      case PANVK_SUBQUEUE_FRAGMENT:
         /* The fragment subqueue always waits for the tiler subqueue already.
          * Explicit waits can be skipped.
          */
         wait_mask &= ~BITFIELD_BIT(PANVK_SUBQUEUE_VERTEX_TILER);
         break;
      default:
         break;
      }

      wait_masks[i] |= wait_mask;
   }

   /* The host does not wait for src subqueues. All src subqueues should
    * self-wait instead.
    *
    * Also, our callers currently expect src subqueues to self-wait when there
    * are dst subqueues. Until that changes, make all src subqueues self-wait.
    */
   if (dst_host || dst_subqueues) {
      u_foreach_bit(i, src_subqueues)
         wait_masks[i] |= BITFIELD_BIT(i);
   }
}

static void
add_memory_dependency(struct panvk_cache_flush_info *cache_flush,
                      VkAccessFlags2 src_access, VkAccessFlags2 dst_access)
{
   /* Note on the cache organization:
    *
    * - The L2 cache is unified, so all changes to this cache are automatically
    *   visible to all GPU sub-components (shader cores, tiler, ...). This
    *   means we only need to flush when the host (AKA CPU) is involved.
    * - LS caches (which are basically just read-write L1 caches) are coherent
    *   with each other and with the L2 cache, so again, we only need to flush
    *   when the host is involved.
    * - Other read-only L1 caches (like the ones in front of the texture unit)
    *   are not coherent with the LS or L2 caches, and thus need to be
    *   invalidated any time a write happens.
    *
    * Translating to the Vulkan memory model:
    *
    * - The device domain is the L2 cache.
    * - An availability operation from device writes to the device domain is a
    *   no-op.
    * - A visibility operation from the device domain to device accesses that
    *   are coherent with L2/LS is a no-op.
    * - A visibility operation from the device domain to device accesses that
    *   are incoherent with L2/LS invalidates the other RO L1 caches.
    * - A host-to-device domain operation invalidates all caches.
    * - A device-to-host domain operation flushes L2/LS.
    */
   const VkAccessFlags2 ro_l1_access =
      VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT |
      VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
      VK_ACCESS_2_TRANSFER_READ_BIT | VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
      VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT;

   /* visibility op */
   if (dst_access & ro_l1_access)
      cache_flush->others |= true;

   /* host-to-device domain op */
   if (src_access & VK_ACCESS_2_HOST_WRITE_BIT) {
      cache_flush->l2 |= MALI_CS_FLUSH_MODE_CLEAN_AND_INVALIDATE;
      cache_flush->lsc |= MALI_CS_FLUSH_MODE_CLEAN_AND_INVALIDATE;
      cache_flush->others |= true;
   }

   /* device-to-host domain op */
   if (dst_access & (VK_ACCESS_2_HOST_READ_BIT | VK_ACCESS_2_HOST_WRITE_BIT)) {
      cache_flush->l2 |= MALI_CS_FLUSH_MODE_CLEAN;
      cache_flush->lsc |= MALI_CS_FLUSH_MODE_CLEAN;
   }
}

static bool
should_split_render_pass(const uint32_t wait_masks[static PANVK_SUBQUEUE_COUNT],
                         VkAccessFlags2 src_access, VkAccessFlags2 dst_access)
{
   /* From the Vulkan 1.3.301 spec:
    *
    *    VUID-vkCmdPipelineBarrier-None-07892
    *
    *    "If vkCmdPipelineBarrier is called within a render pass instance, the
    *    source and destination stage masks of any memory barriers must only
    *    include graphics pipeline stages"
    *
    * We only consider the tiler and the fragment subqueues here.
    */

   /* split if the tiler subqueue waits for the fragment subqueue */
   if (wait_masks[PANVK_SUBQUEUE_VERTEX_TILER] &
       BITFIELD_BIT(PANVK_SUBQUEUE_FRAGMENT))
      return true;

   /* split if the fragment subqueue self-waits with a feedback loop, because
    * we lower subpassLoad to texelFetch
    */
   if ((wait_masks[PANVK_SUBQUEUE_FRAGMENT] &
        BITFIELD_BIT(PANVK_SUBQUEUE_FRAGMENT)) &&
       (src_access & (VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
                      VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT)) &&
       (dst_access & VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT))
      return true;

   return false;
}

static void
collect_cache_flush_info(enum panvk_subqueue_id subqueue,
                         struct panvk_cache_flush_info *cache_flush,
                         VkAccessFlags2 src_access, VkAccessFlags2 dst_access)
{
   /* limit access to the subqueue and host */
   const VkPipelineStageFlags2 subqueue_stages =
      get_subqueue_stages(subqueue) | VK_PIPELINE_STAGE_2_HOST_BIT;
   src_access = vk_filter_src_access_flags2(subqueue_stages, src_access);
   dst_access = vk_filter_dst_access_flags2(subqueue_stages, dst_access);

   add_memory_dependency(cache_flush, src_access, dst_access);
}

static void
collect_cs_deps(struct panvk_cmd_buffer *cmdbuf,
                VkPipelineStageFlags2 src_stages,
                VkPipelineStageFlags2 dst_stages, VkAccessFlags2 src_access,
                VkAccessFlags2 dst_access, struct panvk_cs_deps *deps)
{
   uint32_t wait_masks[PANVK_SUBQUEUE_COUNT] = {0};
   add_execution_dependency(wait_masks, src_stages, dst_stages);

   /* within a render pass */
   if (cmdbuf->state.gfx.render.tiler) {
      if (should_split_render_pass(wait_masks, src_access, dst_access)) {
         deps->needs_draw_flush = true;
      } else {
         /* skip the tiler subqueue self-wait because we use the same
          * scoreboard slot for the idvs jobs
          */
         wait_masks[PANVK_SUBQUEUE_VERTEX_TILER] &=
            ~BITFIELD_BIT(PANVK_SUBQUEUE_VERTEX_TILER);

         /* skip the fragment subqueue self-wait because we emit the fragment
          * job at the end of the render pass and there is nothing to wait on
          * yet
          */
         wait_masks[PANVK_SUBQUEUE_FRAGMENT] &=
            ~BITFIELD_BIT(PANVK_SUBQUEUE_FRAGMENT);
      }
   }

   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      if (wait_masks[i] & BITFIELD_BIT(i)) {
         /* We need to self-wait for all previously submitted jobs, and given
          * the iterator scoreboard is a moving target, we just wait for the
          * whole dynamic scoreboard range.
          */
         deps->src[i].wait_sb_mask |= SB_ALL_ITERS_MASK;
      }

      collect_cache_flush_info(i, &deps->src[i].cache_flush, src_access,
                               dst_access);

      deps->dst[i].wait_subqueue_mask |= wait_masks[i];
   }
}

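/*
 * Rewrite queue family acquire/release barriers so the rest of the code only
 * has to deal with device and host stages/accesses, then expand and filter
 * the stage and access masks.
 */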
static void
normalize_dependency(VkPipelineStageFlags2 *src_stages,
                     VkPipelineStageFlags2 *dst_stages,
                     VkAccessFlags2 *src_access, VkAccessFlags2 *dst_access,
                     uint32_t src_qfi, uint32_t dst_qfi)
{
   /* queue family acquire operation */
   switch (src_qfi) {
   case VK_QUEUE_FAMILY_EXTERNAL:
      /* no execution dependency and no availability operation */
      *src_stages = VK_PIPELINE_STAGE_2_NONE;
      *src_access = VK_ACCESS_2_NONE;
      break;
   case VK_QUEUE_FAMILY_FOREIGN_EXT:
      /* treat the foreign queue as the host */
      *src_stages = VK_PIPELINE_STAGE_2_HOST_BIT;
      *src_access = VK_ACCESS_2_HOST_WRITE_BIT;
      break;
   default:
      break;
   }

   /* queue family release operation */
   switch (dst_qfi) {
   case VK_QUEUE_FAMILY_EXTERNAL:
      /* no execution dependency and no visibility operation */
      *dst_stages = VK_PIPELINE_STAGE_2_NONE;
      *dst_access = VK_ACCESS_2_NONE;
      break;
   case VK_QUEUE_FAMILY_FOREIGN_EXT:
      /* treat the foreign queue as the host */
      *dst_stages = VK_PIPELINE_STAGE_2_HOST_BIT;
      *dst_access = VK_ACCESS_2_HOST_WRITE_BIT;
      break;
   default:
      break;
   }

   *src_stages = vk_expand_src_stage_flags2(*src_stages);
   *dst_stages = vk_expand_dst_stage_flags2(*dst_stages);

   *src_access = vk_filter_src_access_flags2(*src_stages, *src_access);
   *dst_access = vk_filter_dst_access_flags2(*dst_stages, *dst_access);
}

void
panvk_per_arch(get_cs_deps)(struct panvk_cmd_buffer *cmdbuf,
                            const VkDependencyInfo *in,
                            struct panvk_cs_deps *out)
{
   memset(out, 0, sizeof(*out));

   for (uint32_t i = 0; i < in->memoryBarrierCount; i++) {
      const VkMemoryBarrier2 *barrier = &in->pMemoryBarriers[i];
      VkPipelineStageFlags2 src_stages = barrier->srcStageMask;
      VkPipelineStageFlags2 dst_stages = barrier->dstStageMask;
      VkAccessFlags2 src_access = barrier->srcAccessMask;
      VkAccessFlags2 dst_access = barrier->dstAccessMask;
      normalize_dependency(&src_stages, &dst_stages, &src_access, &dst_access,
                           VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED);

      collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
                      out);
   }

   for (uint32_t i = 0; i < in->bufferMemoryBarrierCount; i++) {
      const VkBufferMemoryBarrier2 *barrier = &in->pBufferMemoryBarriers[i];
      VkPipelineStageFlags2 src_stages = barrier->srcStageMask;
      VkPipelineStageFlags2 dst_stages = barrier->dstStageMask;
      VkAccessFlags2 src_access = barrier->srcAccessMask;
      VkAccessFlags2 dst_access = barrier->dstAccessMask;
      normalize_dependency(&src_stages, &dst_stages, &src_access, &dst_access,
                           barrier->srcQueueFamilyIndex,
                           barrier->dstQueueFamilyIndex);

      collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
                      out);
   }

   for (uint32_t i = 0; i < in->imageMemoryBarrierCount; i++) {
      const VkImageMemoryBarrier2 *barrier = &in->pImageMemoryBarriers[i];
      VkPipelineStageFlags2 src_stages = barrier->srcStageMask;
      VkPipelineStageFlags2 dst_stages = barrier->dstStageMask;
      VkAccessFlags2 src_access = barrier->srcAccessMask;
      VkAccessFlags2 dst_access = barrier->dstAccessMask;
      normalize_dependency(&src_stages, &dst_stages, &src_access, &dst_access,
                           barrier->srcQueueFamilyIndex,
                           barrier->dstQueueFamilyIndex);

      collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
                      out);
   }
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
                                    const VkDependencyInfo *pDependencyInfo)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   struct panvk_cs_deps deps;

   panvk_per_arch(get_cs_deps)(cmdbuf, pDependencyInfo, &deps);

   if (deps.needs_draw_flush)
      panvk_per_arch(cmd_flush_draws)(cmdbuf);

   uint32_t wait_subqueue_mask = 0;
   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      /* no need to perform both types of waits on the same subqueue */
      if (deps.src[i].wait_sb_mask)
         deps.dst[i].wait_subqueue_mask &= ~BITFIELD_BIT(i);
      assert(!(deps.dst[i].wait_subqueue_mask & BITFIELD_BIT(i)));

      wait_subqueue_mask |= deps.dst[i].wait_subqueue_mask;
   }

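   /* First pass: on each source subqueue, wait for the relevant scoreboard
    * slots, flush caches as required, and bump the subqueue sync object so
    * destination subqueues have something to wait on. */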
   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i);
      struct panvk_cs_state *cs_state = &cmdbuf->state.cs[i];

      if (deps.src[i].wait_sb_mask)
         cs_wait_slots(b, deps.src[i].wait_sb_mask, false);

      struct panvk_cache_flush_info cache_flush = deps.src[i].cache_flush;
      if (cache_flush.l2 != MALI_CS_FLUSH_MODE_NONE ||
          cache_flush.lsc != MALI_CS_FLUSH_MODE_NONE || cache_flush.others) {
         struct cs_index flush_id = cs_scratch_reg32(b, 0);

         cs_move32_to(b, flush_id, 0);
         cs_flush_caches(b, cache_flush.l2, cache_flush.lsc, cache_flush.others,
                         flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH)));
         cs_wait_slot(b, SB_ID(IMM_FLUSH), false);
      }

      /* If no one waits on us, there's no point signaling the sync object. */
      if (wait_subqueue_mask & BITFIELD_BIT(i)) {
         struct cs_index sync_addr = cs_scratch_reg64(b, 0);
         struct cs_index add_val = cs_scratch_reg64(b, 2);

         assert(deps.src[i].wait_sb_mask);

         cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
                      offsetof(struct panvk_cs_subqueue_context, syncobjs));
         cs_wait_slot(b, SB_ID(LS), false);
         cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * i);
         cs_move64_to(b, add_val, 1);
         cs_sync64_add(b, false, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,
                       cs_now());
         ++cs_state->relative_sync_point;
      }
   }

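   /* Second pass: make each destination subqueue wait until the source
    * subqueues it depends on have reached the sync points recorded above. */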
   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i);
      u_foreach_bit(j, deps.dst[i].wait_subqueue_mask) {
         struct panvk_cs_state *cs_state = &cmdbuf->state.cs[j];
         struct cs_index sync_addr = cs_scratch_reg64(b, 0);
         struct cs_index wait_val = cs_scratch_reg64(b, 2);

         cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
                      offsetof(struct panvk_cs_subqueue_context, syncobjs));
         cs_wait_slot(b, SB_ID(LS), false);
         cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * j);

         cs_add64(b, wait_val, cs_progress_seqno_reg(b, j),
                  cs_state->relative_sync_point);
         cs_sync64_wait(b, false, MALI_CS_CONDITION_GREATER, wait_val,
                        sync_addr);
      }
   }
}

void
panvk_per_arch(cs_pick_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
                                enum panvk_subqueue_id subqueue)
{
   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
   struct cs_index iter_sb = cs_scratch_reg32(b, 0);
   struct cs_index cmp_scratch = cs_scratch_reg32(b, 1);

   cs_load32_to(b, iter_sb, cs_subqueue_ctx_reg(b),
                offsetof(struct panvk_cs_subqueue_context, iter_sb));
   cs_wait_slot(b, SB_ID(LS), false);

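   /* The current iterator scoreboard index lives in the subqueue context and
    * changes at runtime, so match on its value to wait on and select the
    * corresponding slot. */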
   cs_match(b, iter_sb, cmp_scratch) {
#define CASE(x)                                                                \
   cs_case(b, x) {                                                             \
      cs_wait_slot(b, SB_ITER(x), false);                                      \
      cs_set_scoreboard_entry(b, SB_ITER(x), SB_ID(LS));                       \
   }

      CASE(0)
      CASE(1)
      CASE(2)
      CASE(3)
      CASE(4)
#undef CASE
   }
}

static struct cs_buffer
alloc_cs_buffer(void *cookie)
{
   struct panvk_cmd_buffer *cmdbuf = cookie;
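   /* The CS buffer capacity is counted in 64-bit units; a 64 KiB chunk gives
    * room for 8192 of them. */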
   const unsigned capacity = 64 * 1024 / sizeof(uint64_t);

   struct panfrost_ptr ptr =
      panvk_cmd_alloc_dev_mem(cmdbuf, cs, capacity * 8, 64);

   return (struct cs_buffer){
      .cpu = ptr.cpu,
      .gpu = ptr.gpu,
      .capacity = capacity,
   };
}

static enum cs_reg_perm
cs_reg_perm(struct cs_builder *b, unsigned reg)
{
   struct panvk_cs_state *cs_state =
      container_of(b, struct panvk_cs_state, builder);
   struct panvk_cs_reg_upd_context *upd_ctx;

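   /* A register is writable if any update context on the stack grants RW
    * access; otherwise fall back to the subqueue's base permission table. */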
   for (upd_ctx = cs_state->reg_access.upd_ctx_stack; upd_ctx;
        upd_ctx = upd_ctx->next) {
      if (upd_ctx->reg_perm(b, reg) == CS_REG_RW)
         return CS_REG_RW;
   }

   return cs_state->reg_access.base_perm(b, reg);
}

static void
init_cs_builders(struct panvk_cmd_buffer *cmdbuf)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_instance *instance =
      to_panvk_instance(dev->vk.physical->instance);
   const reg_perm_cb_t base_reg_perms[PANVK_SUBQUEUE_COUNT] = {
      [PANVK_SUBQUEUE_VERTEX_TILER] = panvk_cs_vt_reg_perm,
      [PANVK_SUBQUEUE_FRAGMENT] = panvk_cs_frag_reg_perm,
      [PANVK_SUBQUEUE_COMPUTE] = panvk_cs_compute_reg_perm,
   };

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
      struct cs_builder *b = &cmdbuf->state.cs[i].builder;
      /* Lazy allocation of the root CS. */
      struct cs_buffer root_cs = {0};

      struct cs_builder_conf conf = {
         .nr_registers = 96,
         .nr_kernel_registers = 4,
         .alloc_buffer = alloc_cs_buffer,
         .cookie = cmdbuf,
      };

      if (instance->debug_flags & PANVK_DEBUG_CS) {
         cmdbuf->state.cs[i].ls_tracker = (struct cs_load_store_tracker){
            .sb_slot = SB_ID(LS),
         };

         conf.ls_tracker = &cmdbuf->state.cs[i].ls_tracker;

         cmdbuf->state.cs[i].reg_access.upd_ctx_stack = NULL;
         cmdbuf->state.cs[i].reg_access.base_perm = base_reg_perms[i];
         conf.reg_perm = cs_reg_perm;
      }

      cs_builder_init(b, &conf, root_cs);

      if (instance->debug_flags & PANVK_DEBUG_TRACE) {
         cmdbuf->state.cs[i].tracing = (struct cs_tracing_ctx){
            .enabled = true,
            .ctx_reg = cs_subqueue_ctx_reg(b),
            .tracebuf_addr_offset =
               offsetof(struct panvk_cs_subqueue_context, debug.tracebuf.cs),
            .ls_sb_slot = SB_ID(LS),
         };
      }
   }
}

static void
panvk_reset_cmdbuf(struct vk_command_buffer *vk_cmdbuf,
                   VkCommandBufferResetFlags flags)
{
   struct panvk_cmd_buffer *cmdbuf =
      container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);
   struct panvk_cmd_pool *pool =
      container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk);
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);

   vk_command_buffer_reset(&cmdbuf->vk);

   panvk_pool_reset(&cmdbuf->cs_pool);
   panvk_pool_reset(&cmdbuf->desc_pool);
   panvk_pool_reset(&cmdbuf->tls_pool);
   list_splicetail(&cmdbuf->push_sets, &pool->push_sets);
   list_inithead(&cmdbuf->push_sets);

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->utrace.uts); i++) {
      struct u_trace *ut = &cmdbuf->utrace.uts[i];
      u_trace_fini(ut);
      u_trace_init(ut, &dev->utrace.utctx);
   }

   memset(&cmdbuf->state, 0, sizeof(cmdbuf->state));
   init_cs_builders(cmdbuf);
}

static void
panvk_destroy_cmdbuf(struct vk_command_buffer *vk_cmdbuf)
{
   struct panvk_cmd_buffer *cmdbuf =
      container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);
   struct panvk_cmd_pool *pool =
      container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk);
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->utrace.uts); i++)
      u_trace_fini(&cmdbuf->utrace.uts[i]);

   panvk_pool_cleanup(&cmdbuf->cs_pool);
   panvk_pool_cleanup(&cmdbuf->desc_pool);
   panvk_pool_cleanup(&cmdbuf->tls_pool);
   list_splicetail(&cmdbuf->push_sets, &pool->push_sets);
   vk_command_buffer_finish(&cmdbuf->vk);
   vk_free(&dev->vk.alloc, cmdbuf);
}

static VkResult
panvk_create_cmdbuf(struct vk_command_pool *vk_pool, VkCommandBufferLevel level,
                    struct vk_command_buffer **cmdbuf_out)
{
   struct panvk_device *device =
      container_of(vk_pool->base.device, struct panvk_device, vk);
   struct panvk_cmd_pool *pool =
      container_of(vk_pool, struct panvk_cmd_pool, vk);
   struct panvk_cmd_buffer *cmdbuf;

   cmdbuf = vk_zalloc(&device->vk.alloc, sizeof(*cmdbuf), 8,
                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!cmdbuf)
      return panvk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = vk_command_buffer_init(
      &pool->vk, &cmdbuf->vk, &panvk_per_arch(cmd_buffer_ops), level);
   if (result != VK_SUCCESS) {
      vk_free(&device->vk.alloc, cmdbuf);
      return result;
   }

   list_inithead(&cmdbuf->push_sets);
   cmdbuf->vk.dynamic_graphics_state.vi = &cmdbuf->state.gfx.dynamic.vi;
   cmdbuf->vk.dynamic_graphics_state.ms.sample_locations =
      &cmdbuf->state.gfx.dynamic.sl;

   struct panvk_pool_properties cs_pool_props = {
      .create_flags = 0,
      .slab_size = 64 * 1024,
      .label = "Command buffer CS pool",
      .prealloc = false,
      .owns_bos = true,
      .needs_locking = false,
   };
   panvk_pool_init(&cmdbuf->cs_pool, device, &pool->cs_bo_pool, &cs_pool_props);

   struct panvk_pool_properties desc_pool_props = {
      .create_flags = 0,
      .slab_size = 64 * 1024,
      .label = "Command buffer descriptor pool",
      .prealloc = false,
      .owns_bos = true,
      .needs_locking = false,
   };
   panvk_pool_init(&cmdbuf->desc_pool, device, &pool->desc_bo_pool,
                   &desc_pool_props);

   struct panvk_pool_properties tls_pool_props = {
      .create_flags =
         panvk_device_adjust_bo_flags(device, PAN_KMOD_BO_FLAG_NO_MMAP),
      .slab_size = 64 * 1024,
      .label = "TLS pool",
      .prealloc = false,
      .owns_bos = true,
      .needs_locking = false,
   };
   panvk_pool_init(&cmdbuf->tls_pool, device, &pool->tls_bo_pool,
                   &tls_pool_props);

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->utrace.uts); i++)
      u_trace_init(&cmdbuf->utrace.uts[i], &device->utrace.utctx);

   init_cs_builders(cmdbuf);
   *cmdbuf_out = &cmdbuf->vk;
   return VK_SUCCESS;
}

const struct vk_command_buffer_ops panvk_per_arch(cmd_buffer_ops) = {
   .create = panvk_create_cmdbuf,
   .reset = panvk_reset_cmdbuf,
   .destroy = panvk_destroy_cmdbuf,
};

VKAPI_ATTR VkResult VKAPI_CALL
panvk_per_arch(BeginCommandBuffer)(VkCommandBuffer commandBuffer,
                                   const VkCommandBufferBeginInfo *pBeginInfo)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   struct panvk_instance *instance =
      to_panvk_instance(cmdbuf->vk.base.device->physical->instance);

   vk_command_buffer_begin(&cmdbuf->vk, pBeginInfo);
   cmdbuf->flags = pBeginInfo->flags;

   if (instance->debug_flags & PANVK_DEBUG_FORCE_SIMULTANEOUS) {
      cmdbuf->flags |= VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
      cmdbuf->flags &= ~VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
   }

   panvk_per_arch(cmd_inherit_render_state)(cmdbuf, pBeginInfo);

   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++)
      trace_begin_cmdbuf(&cmdbuf->utrace.uts[i], cmdbuf);

   return VK_SUCCESS;
}

static void
panvk_cmd_invalidate_state(struct panvk_cmd_buffer *cmdbuf)
{
   /* From the Vulkan 1.3.275 spec:
    *
    *    "...There is one exception to this rule - if the primary command
    *    buffer is inside a render pass instance, then the render pass and
    *    subpass state is not disturbed by executing secondary command
    *    buffers."
    *
    * We need to reset everything EXCEPT the render pass state.
    */
   struct panvk_rendering_state render_save = cmdbuf->state.gfx.render;
   memset(&cmdbuf->state.gfx, 0, sizeof(cmdbuf->state.gfx));
   cmdbuf->state.gfx.render = render_save;

   vk_dynamic_graphics_state_dirty_all(&cmdbuf->vk.dynamic_graphics_state);
   gfx_state_set_all_dirty(cmdbuf);
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdExecuteCommands)(VkCommandBuffer commandBuffer,
                                   uint32_t commandBufferCount,
                                   const VkCommandBuffer *pCommandBuffers)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, primary, commandBuffer);

   if (commandBufferCount == 0)
      return;

   /* Write out any pending seqno changes to registers before calling
    * secondary command buffers. */
   flush_sync_points(primary);

   for (uint32_t i = 0; i < commandBufferCount; i++) {
      VK_FROM_HANDLE(panvk_cmd_buffer, secondary, pCommandBuffers[i]);

      /* Make sure the CS context is set up properly to inherit the primary
       * command buffer state. */
      primary->state.tls.info.tls.size =
         MAX2(primary->state.tls.info.tls.size,
              secondary->state.tls.info.tls.size);
      panvk_per_arch(cmd_prepare_exec_cmd_for_draws)(primary, secondary);

      for (uint32_t j = 0; j < ARRAY_SIZE(primary->state.cs); j++) {
         struct cs_builder *sec_b = panvk_get_cs_builder(secondary, j);
         assert(cs_is_valid(sec_b));
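         /* Call into the secondary's root CS chunk from the primary stream,
          * then clone its utrace events into the primary's trace. */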
         if (!cs_is_empty(sec_b)) {
            struct cs_builder *prim_b = panvk_get_cs_builder(primary, j);
            struct cs_index addr = cs_scratch_reg64(prim_b, 0);
            struct cs_index size = cs_scratch_reg32(prim_b, 2);
            cs_move64_to(prim_b, addr, cs_root_chunk_gpu_addr(sec_b));
            cs_move32_to(prim_b, size, cs_root_chunk_size(sec_b));
            cs_call(prim_b, addr, size);

            struct u_trace *prim_ut = &primary->utrace.uts[j];
            struct u_trace *sec_ut = &secondary->utrace.uts[j];
            u_trace_clone_append(u_trace_begin_iterator(sec_ut),
                                 u_trace_end_iterator(sec_ut), prim_ut, prim_b,
                                 panvk_per_arch(utrace_copy_buffer));
         }
      }

      /* We need to propagate the suspending state of the secondary command
       * buffer if we want to avoid poisoning the reg file when the secondary
       * command buffer suspended the render pass. */
      if (secondary->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT)
         primary->state.gfx.render.flags = secondary->state.gfx.render.flags;

      /* If the render context we passed to the secondary command buffer got
       * invalidated, reset the FB/tiler descs and treat things as if we
       * suspended the render pass, since those descriptors have been
       * re-emitted by the secondary command buffer already. */
      if (secondary->state.gfx.render.invalidate_inherited_ctx) {
         memset(&primary->state.gfx.render.fbds, 0,
                sizeof(primary->state.gfx.render.fbds));
         primary->state.gfx.render.tiler = 0;
         primary->state.gfx.render.flags |= VK_RENDERING_RESUMING_BIT;
      }
   }

   /* From the Vulkan 1.3.275 spec:
    *
    *    "When secondary command buffer(s) are recorded to execute on a
    *    primary command buffer, the secondary command buffer inherits no
    *    state from the primary command buffer, and all state of the primary
    *    command buffer is undefined after an execute secondary command buffer
    *    command is recorded. There is one exception to this rule - if the
    *    primary command buffer is inside a render pass instance, then the
    *    render pass and subpass state is not disturbed by executing secondary
    *    command buffers. For state dependent commands (such as draws and
    *    dispatches), any state consumed by those commands must not be
    *    undefined."
    *
    * Therefore, it's the client's job to reset all the state in the primary
    * after the secondary executes. However, if we're doing any internal
    * dirty tracking, we may miss the fact that a secondary has messed with
    * GPU state if we don't invalidate all our internal tracking.
    */
   panvk_cmd_invalidate_state(primary);
}
972