1 /*
2 * Copyright © 2024 Collabora Ltd.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "drm-uapi/panthor_drm.h"
8
9 #include "genxml/cs_builder.h"
10 #include "genxml/decode.h"
11
12 #include "panvk_cmd_buffer.h"
13 #include "panvk_macros.h"
14 #include "panvk_queue.h"
15 #include "panvk_utrace.h"
16
17 #include "util/bitscan.h"
18 #include "vk_drm_syncobj.h"
19 #include "vk_log.h"
20
21 #define MIN_DESC_TRACEBUF_SIZE (128 * 1024)
22 #define DEFAULT_DESC_TRACEBUF_SIZE (2 * 1024 * 1024)
23 #define MIN_CS_TRACEBUF_SIZE (512 * 1024)
24 #define DEFAULT_CS_TRACEBUF_SIZE (2 * 1024 * 1024)
25
26 static void
27 finish_render_desc_ringbuf(struct panvk_queue *queue)
28 {
29 struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
30 struct panvk_instance *instance =
31 to_panvk_instance(dev->vk.physical->instance);
32 bool tracing_enabled = instance->debug_flags & PANVK_DEBUG_TRACE;
33 struct panvk_desc_ringbuf *ringbuf = &queue->render_desc_ringbuf;
34
35 panvk_pool_free_mem(&ringbuf->syncobj);
36
37 if (dev->debug.decode_ctx && ringbuf->addr.dev) {
38 pandecode_inject_free(dev->debug.decode_ctx, ringbuf->addr.dev,
39 ringbuf->size);
40 if (!tracing_enabled)
41 pandecode_inject_free(dev->debug.decode_ctx,
42 ringbuf->addr.dev + ringbuf->size,
43 ringbuf->size);
44 }
45
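   /* Tear down the GPU mapping and give the doubled VA range (data plus alias
    * or guard region) back to the device address-space heap. */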
46 if (ringbuf->addr.dev) {
47 struct pan_kmod_vm_op op = {
48 .type = PAN_KMOD_VM_OP_TYPE_UNMAP,
49 .va = {
50 .start = ringbuf->addr.dev,
51 .size = ringbuf->size * (tracing_enabled ? 2 : 1),
52 },
53 };
54
55 ASSERTED int ret =
56 pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, &op, 1);
57 assert(!ret);
58
59 simple_mtx_lock(&dev->as.lock);
60 util_vma_heap_free(&dev->as.heap, ringbuf->addr.dev, ringbuf->size * 2);
61 simple_mtx_unlock(&dev->as.lock);
62 }
63
64 if (ringbuf->addr.host) {
65 ASSERTED int ret =
66 os_munmap(ringbuf->addr.host, ringbuf->size);
67 assert(!ret);
68 }
69
70 pan_kmod_bo_put(ringbuf->bo);
71 }
72
73 static VkResult
74 init_render_desc_ringbuf(struct panvk_queue *queue)
75 {
76 struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
77 struct panvk_instance *instance =
78 to_panvk_instance(dev->vk.physical->instance);
79 bool tracing_enabled = instance->debug_flags & PANVK_DEBUG_TRACE;
80 uint32_t flags = panvk_device_adjust_bo_flags(dev, PAN_KMOD_BO_FLAG_NO_MMAP);
81 struct panvk_desc_ringbuf *ringbuf = &queue->render_desc_ringbuf;
82 uint64_t dev_addr = 0;
83 int ret;
84
85 if (tracing_enabled) {
86 ringbuf->size = debug_get_num_option("PANVK_DESC_TRACEBUF_SIZE",
87 DEFAULT_DESC_TRACEBUF_SIZE);
88 flags |= PAN_KMOD_BO_FLAG_GPU_UNCACHED;
89 assert(ringbuf->size >= MIN_DESC_TRACEBUF_SIZE &&
90 util_is_power_of_two_nonzero(ringbuf->size));
91 } else {
92 ringbuf->size = RENDER_DESC_RINGBUF_SIZE;
93 }
94
95 ringbuf->bo =
96 pan_kmod_bo_alloc(dev->kmod.dev, dev->kmod.vm, ringbuf->size, flags);
97 if (!ringbuf->bo)
98 return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
99 "Failed to create a descriptor ring buffer context");
100
101 if (!(flags & PAN_KMOD_BO_FLAG_NO_MMAP)) {
102 ringbuf->addr.host =
103 pan_kmod_bo_mmap(ringbuf->bo, 0, ringbuf->size, PROT_READ | PROT_WRITE,
104 MAP_SHARED, NULL);
105 if (ringbuf->addr.host == MAP_FAILED)
106 return panvk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY,
107 "Failed to CPU map ringbuf BO");
108 }
109
110 /* We choose the alignment to guarantee that we won't ever cross a 4G
111 * boundary when accessing the mapping. This way we can encode the wraparound
112 * using 32-bit operations. */
113 simple_mtx_lock(&dev->as.lock);
114 dev_addr =
115 util_vma_heap_alloc(&dev->as.heap, ringbuf->size * 2, ringbuf->size * 2);
116 simple_mtx_unlock(&dev->as.lock);
117
118 if (!dev_addr)
119 return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
120 "Failed to allocate virtual address for ringbuf BO");
121
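   /* Map the BO twice, back to back, so accesses that wrap around the end of
    * the ring buffer land in the alias and the buffer can be treated as
    * contiguous. */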
122 struct pan_kmod_vm_op vm_ops[] = {
123 {
124 .type = PAN_KMOD_VM_OP_TYPE_MAP,
125 .va = {
126 .start = dev_addr,
127 .size = ringbuf->size,
128 },
129 .map = {
130 .bo = ringbuf->bo,
131 .bo_offset = 0,
132 },
133 },
134 {
135 .type = PAN_KMOD_VM_OP_TYPE_MAP,
136 .va = {
137 .start = dev_addr + ringbuf->size,
138 .size = ringbuf->size,
139 },
140 .map = {
141 .bo = ringbuf->bo,
142 .bo_offset = 0,
143 },
144 },
145 };
146
147 /* If tracing is enabled, we keep the second part of the mapping unmapped
148 * to serve as a guard region. */
149 ret = pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, vm_ops,
150 tracing_enabled ? 1 : ARRAY_SIZE(vm_ops));
151 if (ret) {
152 simple_mtx_lock(&dev->as.lock);
153 util_vma_heap_free(&dev->as.heap, dev_addr, ringbuf->size * 2);
154 simple_mtx_unlock(&dev->as.lock);
155 return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
156 "Failed to GPU map ringbuf BO");
157 }
158
159 ringbuf->addr.dev = dev_addr;
160
161 if (dev->debug.decode_ctx) {
162 pandecode_inject_mmap(dev->debug.decode_ctx, ringbuf->addr.dev,
163 ringbuf->addr.host, ringbuf->size, NULL);
164 if (!tracing_enabled)
165 pandecode_inject_mmap(dev->debug.decode_ctx,
166 ringbuf->addr.dev + ringbuf->size,
167 ringbuf->addr.host, ringbuf->size, NULL);
168 }
169
170 struct panvk_pool_alloc_info alloc_info = {
171 .size = sizeof(struct panvk_cs_sync32),
172 .alignment = 64,
173 };
174
175 ringbuf->syncobj = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
176
177 struct panvk_cs_sync32 *syncobj = panvk_priv_mem_host_addr(ringbuf->syncobj);
178
179 if (!syncobj)
180 return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
181 "Failed to create the render desc ringbuf context");
182
183 *syncobj = (struct panvk_cs_sync32){
184 .seqno = RENDER_DESC_RINGBUF_SIZE,
185 };
186
187 return VK_SUCCESS;
188 }
189
190 static void
191 finish_subqueue_tracing(struct panvk_queue *queue,
192 enum panvk_subqueue_id subqueue)
193 {
194 struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
195 struct panvk_subqueue *subq = &queue->subqueues[subqueue];
196
197 if (subq->tracebuf.addr.dev) {
198 size_t pgsize = getpagesize();
199
200 pandecode_inject_free(dev->debug.decode_ctx, subq->tracebuf.addr.dev,
201 subq->tracebuf.size);
202
203 struct pan_kmod_vm_op op = {
204 .type = PAN_KMOD_VM_OP_TYPE_UNMAP,
205 .va = {
206 .start = subq->tracebuf.addr.dev,
207 .size = subq->tracebuf.size,
208 },
209 };
210
211 ASSERTED int ret =
212 pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, &op, 1);
213 assert(!ret);
214
215 simple_mtx_lock(&dev->as.lock);
216 util_vma_heap_free(&dev->as.heap, subq->tracebuf.addr.dev,
217 subq->tracebuf.size + pgsize);
218 simple_mtx_unlock(&dev->as.lock);
219 }
220
221 if (subq->tracebuf.addr.host) {
222 ASSERTED int ret =
223 os_munmap(subq->tracebuf.addr.host, subq->tracebuf.size);
224 assert(!ret);
225 }
226
227 pan_kmod_bo_put(subq->tracebuf.bo);
228
229 vk_free(&dev->vk.alloc, subq->reg_file);
230 }
231
232 static VkResult
233 init_subqueue_tracing(struct panvk_queue *queue,
234 enum panvk_subqueue_id subqueue)
235 {
236 struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
237 struct panvk_subqueue *subq = &queue->subqueues[subqueue];
238 struct panvk_instance *instance =
239 to_panvk_instance(dev->vk.physical->instance);
240 unsigned debug = instance->debug_flags;
241 uint64_t dev_addr;
242
243 if (!(debug & PANVK_DEBUG_TRACE))
244 return VK_SUCCESS;
245
246 subq->reg_file =
247 vk_zalloc(&dev->vk.alloc, sizeof(uint32_t) * 256, sizeof(uint64_t),
248 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
249 if (!subq->reg_file)
250 return panvk_errorf(dev->vk.physical, VK_ERROR_OUT_OF_HOST_MEMORY,
251 "Failed to allocate reg file cache");
252
253 subq->tracebuf.size = debug_get_num_option("PANVK_CS_TRACEBUF_SIZE",
254 DEFAULT_CS_TRACEBUF_SIZE);
255 assert(subq->tracebuf.size >= MIN_CS_TRACEBUF_SIZE &&
256 util_is_power_of_two_nonzero(subq->tracebuf.size));
257
258 subq->tracebuf.bo =
259 pan_kmod_bo_alloc(dev->kmod.dev, dev->kmod.vm, subq->tracebuf.size,
260 PAN_KMOD_BO_FLAG_GPU_UNCACHED);
261 if (!subq->tracebuf.bo)
262 return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
263 "Failed to create a CS tracebuf");
264
265 subq->tracebuf.addr.host =
266 pan_kmod_bo_mmap(subq->tracebuf.bo, 0, subq->tracebuf.size,
267 PROT_READ | PROT_WRITE, MAP_SHARED, NULL);
268 if (subq->tracebuf.addr.host == MAP_FAILED) {
269 subq->tracebuf.addr.host = NULL;
270 return panvk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY,
271 "Failed to CPU map tracebuf");
272 }
273
274 /* Add a guard page. */
275 size_t pgsize = getpagesize();
276 simple_mtx_lock(&dev->as.lock);
277 dev_addr =
278 util_vma_heap_alloc(&dev->as.heap, subq->tracebuf.size + pgsize, pgsize);
279 simple_mtx_unlock(&dev->as.lock);
280
281 if (!dev_addr)
282 return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
283 "Failed to allocate virtual address for tracebuf");
284
285 struct pan_kmod_vm_op vm_op = {
286 .type = PAN_KMOD_VM_OP_TYPE_MAP,
287 .va = {
288 .start = dev_addr,
289 .size = subq->tracebuf.size,
290 },
291 .map = {
292 .bo = subq->tracebuf.bo,
293 .bo_offset = 0,
294 },
295 };
296
297 /* The extra page we allocated above is intentionally left unmapped so it
298 * acts as a guard page at the end of the tracebuf. */
299 int ret =
300 pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, &vm_op, 1);
301 if (ret) {
302 simple_mtx_lock(&dev->as.lock);
303 util_vma_heap_free(&dev->as.heap, dev_addr, subq->tracebuf.size + pgsize);
304 simple_mtx_unlock(&dev->as.lock);
305 return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
306 "Failed to GPU map ringbuf BO");
307 }
308
309 subq->tracebuf.addr.dev = dev_addr;
310
311 if (dev->debug.decode_ctx) {
312 pandecode_inject_mmap(dev->debug.decode_ctx, subq->tracebuf.addr.dev,
313 subq->tracebuf.addr.host, subq->tracebuf.size,
314 NULL);
315 }
316
317 return VK_SUCCESS;
318 }
319
320 static void
321 finish_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
322 {
323 panvk_pool_free_mem(&queue->subqueues[subqueue].context);
324 finish_subqueue_tracing(queue, subqueue);
325 }
326
327 static VkResult
328 init_utrace(struct panvk_queue *queue)
329 {
330 struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
331 const struct panvk_physical_device *phys_dev =
332 to_panvk_physical_device(dev->vk.physical);
333 VkResult result;
334
335 const struct vk_sync_type *sync_type = phys_dev->sync_types[0];
336 assert(sync_type && vk_sync_type_is_drm_syncobj(sync_type) &&
337 (sync_type->features & VK_SYNC_FEATURE_TIMELINE));
338
339 result = vk_sync_create(&dev->vk, sync_type, VK_SYNC_IS_TIMELINE, 0,
340 &queue->utrace.sync);
341 if (result != VK_SUCCESS)
342 return result;
343
344 queue->utrace.next_value = 1;
345
346 return VK_SUCCESS;
347 }
348
349 static VkResult
350 init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
351 {
352 struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
353 struct panvk_subqueue *subq = &queue->subqueues[subqueue];
354 const struct panvk_physical_device *phys_dev =
355 to_panvk_physical_device(queue->vk.base.device->physical);
356 struct panvk_instance *instance =
357 to_panvk_instance(dev->vk.physical->instance);
358 unsigned debug = instance->debug_flags;
359 struct panvk_cs_sync64 *syncobjs = panvk_priv_mem_host_addr(queue->syncobjs);
360
361 VkResult result = init_subqueue_tracing(queue, subqueue);
362 if (result != VK_SUCCESS)
363 return result;
364
365 struct panvk_pool_alloc_info alloc_info = {
366 .size = sizeof(struct panvk_cs_subqueue_context),
367 .alignment = 64,
368 };
369
370 /* When tracing is enabled, we want to use a non-cached pool, so we can get
371 * an up-to-date context even if the CS crashed in the middle. */
372 struct panvk_pool *mempool =
373 (debug & PANVK_DEBUG_TRACE) ? &dev->mempools.rw_nc : &dev->mempools.rw;
374
375 subq->context = panvk_pool_alloc_mem(mempool, alloc_info);
376 if (!panvk_priv_mem_host_addr(subq->context))
377 return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
378 "Failed to create a queue context");
379
380 struct panvk_cs_subqueue_context *cs_ctx =
381 panvk_priv_mem_host_addr(subq->context);
382
383 *cs_ctx = (struct panvk_cs_subqueue_context){
384 .syncobjs = panvk_priv_mem_dev_addr(queue->syncobjs),
385 .debug.syncobjs = panvk_priv_mem_dev_addr(queue->debug_syncobjs),
386 .debug.tracebuf.cs = subq->tracebuf.addr.dev,
387 .iter_sb = 0,
388 .tiler_oom_ctx.reg_dump_addr =
389 panvk_priv_mem_dev_addr(queue->tiler_oom_regs_save),
390 };
391
392 /* We use the geometry buffer for our temporary CS buffer. */
393 struct cs_buffer root_cs = {
394 .cpu = panvk_priv_mem_host_addr(queue->tiler_heap.desc) + 4096,
395 .gpu = panvk_priv_mem_dev_addr(queue->tiler_heap.desc) + 4096,
396 .capacity = 64 * 1024 / sizeof(uint64_t),
397 };
398 const struct cs_builder_conf conf = {
399 .nr_registers = 96,
400 .nr_kernel_registers = 4,
401 };
402 struct cs_builder b;
403
404 assert(panvk_priv_mem_dev_addr(queue->tiler_heap.desc) != 0);
405
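   /* Build a small initialization CS: it loads the subqueue context pointer
    * into its dedicated register, sets up the scoreboard slot used for
    * asynchronous operations and, on the vertex-tiler/fragment subqueues,
    * registers the tiler heap context. */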
406 cs_builder_init(&b, &conf, root_cs);
407 /* Pass the context. */
408 cs_move64_to(&b, cs_subqueue_ctx_reg(&b),
409 panvk_priv_mem_dev_addr(subq->context));
410
411 /* Initialize scoreboard slots used for asynchronous operations. */
412 cs_set_scoreboard_entry(&b, SB_ITER(0), SB_ID(LS));
413
414 /* We do a greater-than test on sync objects, and since the reference seqno
415 * registers are all zero at init time, we need to initialize all syncobjs
416 * with a seqno of one. */
417 syncobjs[subqueue].seqno = 1;
418
419 if (subqueue != PANVK_SUBQUEUE_COMPUTE) {
420 cs_ctx->render.tiler_heap =
421 panvk_priv_mem_dev_addr(queue->tiler_heap.desc);
422 /* Our geometry buffer comes 4k after the tiler heap descriptor, and we
423 * encode its size in the lower 12 bits so the address can be copied
424 * directly into the tiler descriptors. */
425 cs_ctx->render.geom_buf =
426 (cs_ctx->render.tiler_heap + 4096) | ((64 * 1024) >> 12);
427
428 /* Initialize the ringbuf */
429 cs_ctx->render.desc_ringbuf = (struct panvk_cs_desc_ringbuf){
430 .syncobj = panvk_priv_mem_dev_addr(queue->render_desc_ringbuf.syncobj),
431 .ptr = queue->render_desc_ringbuf.addr.dev,
432 .pos = 0,
433 };
434
435 struct cs_index heap_ctx_addr = cs_scratch_reg64(&b, 0);
436
437 /* Pre-set the heap context on the vertex-tiler/fragment queues. */
438 cs_move64_to(&b, heap_ctx_addr, queue->tiler_heap.context.dev_addr);
439 cs_heap_set(&b, heap_ctx_addr);
440 }
441
442 cs_finish(&b);
443
444 assert(cs_is_valid(&b));
445
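   /* Submit the init CS and wait for its completion synchronously, so the
    * subqueue is fully set up before any user command buffer is submitted. */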
446 struct drm_panthor_sync_op syncop = {
447 .flags =
448 DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ | DRM_PANTHOR_SYNC_OP_SIGNAL,
449 .handle = queue->syncobj_handle,
450 .timeline_value = 0,
451 };
452 struct drm_panthor_queue_submit qsubmit = {
453 .queue_index = subqueue,
454 .stream_size = cs_root_chunk_size(&b),
455 .stream_addr = cs_root_chunk_gpu_addr(&b),
456 .latest_flush = panthor_kmod_get_flush_id(dev->kmod.dev),
457 .syncs = DRM_PANTHOR_OBJ_ARRAY(1, &syncop),
458 };
459 struct drm_panthor_group_submit gsubmit = {
460 .group_handle = queue->group_handle,
461 .queue_submits = DRM_PANTHOR_OBJ_ARRAY(1, &qsubmit),
462 };
463
464 int ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_SUBMIT, &gsubmit);
465 if (ret)
466 return panvk_errorf(dev->vk.physical, VK_ERROR_INITIALIZATION_FAILED,
467 "Failed to initialized subqueue: %m");
468
469 ret = drmSyncobjWait(dev->vk.drm_fd, &queue->syncobj_handle, 1, INT64_MAX, 0,
470 NULL);
471 if (ret)
472 return panvk_errorf(dev->vk.physical, VK_ERROR_INITIALIZATION_FAILED,
473 "SyncobjWait failed: %m");
474
475 if (debug & PANVK_DEBUG_TRACE) {
476 pandecode_user_msg(dev->debug.decode_ctx, "Init subqueue %d binary\n\n",
477 subqueue);
478 pandecode_cs_binary(dev->debug.decode_ctx, qsubmit.stream_addr,
479 qsubmit.stream_size,
480 phys_dev->kmod.props.gpu_prod_id);
481 }
482
483 return VK_SUCCESS;
484 }
485
486 static void
487 cleanup_queue(struct panvk_queue *queue)
488 {
489 struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
490
491 for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++)
492 finish_subqueue(queue, i);
493
494 if (queue->utrace.sync)
495 vk_sync_destroy(&dev->vk, queue->utrace.sync);
496
497 finish_render_desc_ringbuf(queue);
498
499 panvk_pool_free_mem(&queue->tiler_oom_regs_save);
500 panvk_pool_free_mem(&queue->debug_syncobjs);
501 panvk_pool_free_mem(&queue->syncobjs);
502 }
503
504 static VkResult
505 init_queue(struct panvk_queue *queue)
506 {
507 struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
508 struct panvk_instance *instance =
509 to_panvk_instance(dev->vk.physical->instance);
510 unsigned debug = instance->debug_flags;
511 VkResult result;
512
513 struct panvk_pool_alloc_info alloc_info = {
514 .size =
515 ALIGN_POT(sizeof(struct panvk_cs_sync64), 64) * PANVK_SUBQUEUE_COUNT,
516 .alignment = 64,
517 };
518
519 queue->syncobjs = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
520 if (!panvk_priv_mem_host_addr(queue->syncobjs))
521 return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
522 "Failed to allocate subqueue sync objects");
523
524 if (instance->debug_flags & (PANVK_DEBUG_SYNC | PANVK_DEBUG_TRACE)) {
525 alloc_info.size =
526 ALIGN_POT(sizeof(struct panvk_cs_sync32), 64) * PANVK_SUBQUEUE_COUNT;
527 queue->debug_syncobjs =
528 panvk_pool_alloc_mem(&dev->mempools.rw_nc, alloc_info);
529 if (!panvk_priv_mem_host_addr(queue->debug_syncobjs)) {
530 result = panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
531 "Failed to allocate subqueue sync objects");
532 goto err_cleanup_queue;
533 }
534 }
535
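   /* Register dump area used by the tiler OOM handler; its address is passed
    * through tiler_oom_ctx.reg_dump_addr in the subqueue context. */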
536 alloc_info.size = dev->tiler_oom.dump_region_size;
537 alloc_info.alignment = sizeof(uint32_t);
538 queue->tiler_oom_regs_save =
539 panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
540 if (!panvk_priv_mem_host_addr(queue->tiler_oom_regs_save)) {
541 result = panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
542 "Failed to allocate tiler oom register save area");
543 goto err_cleanup_queue;
544 }
545
546 result = init_render_desc_ringbuf(queue);
547 if (result != VK_SUCCESS)
548 goto err_cleanup_queue;
549
550 result = init_utrace(queue);
551 if (result != VK_SUCCESS)
552 goto err_cleanup_queue;
553
554 for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
555 result = init_subqueue(queue, i);
556 if (result != VK_SUCCESS)
557 goto err_cleanup_queue;
558 }
559
560 if (debug & PANVK_DEBUG_TRACE)
561 pandecode_next_frame(dev->debug.decode_ctx);
562
563 return VK_SUCCESS;
564
565 err_cleanup_queue:
566 cleanup_queue(queue);
567 return result;
568 }
569
570 static VkResult
571 create_group(struct panvk_queue *queue,
572 enum drm_panthor_group_priority group_priority)
573 {
574 const struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
575 const struct panvk_physical_device *phys_dev =
576 to_panvk_physical_device(queue->vk.base.device->physical);
577
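   /* One panthor queue per panvk subqueue, each with its own 64k CS ring
    * buffer and the same in-group priority. */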
578 struct drm_panthor_queue_create qc[] = {
579 [PANVK_SUBQUEUE_VERTEX_TILER] =
580 {
581 .priority = 1,
582 .ringbuf_size = 64 * 1024,
583 },
584 [PANVK_SUBQUEUE_FRAGMENT] =
585 {
586 .priority = 1,
587 .ringbuf_size = 64 * 1024,
588 },
589 [PANVK_SUBQUEUE_COMPUTE] =
590 {
591 .priority = 1,
592 .ringbuf_size = 64 * 1024,
593 },
594 };
595
596 struct drm_panthor_group_create gc = {
597 .compute_core_mask = phys_dev->kmod.props.shader_present,
598 .fragment_core_mask = phys_dev->kmod.props.shader_present,
599 .tiler_core_mask = 1,
600 .max_compute_cores = util_bitcount64(phys_dev->kmod.props.shader_present),
601 .max_fragment_cores =
602 util_bitcount64(phys_dev->kmod.props.shader_present),
603 .max_tiler_cores = 1,
604 .priority = group_priority,
605 .queues = DRM_PANTHOR_OBJ_ARRAY(ARRAY_SIZE(qc), qc),
606 .vm_id = pan_kmod_vm_handle(dev->kmod.vm),
607 };
608
609 int ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_CREATE, &gc);
610 if (ret)
611 return panvk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
612 "Failed to create a scheduling group");
613
614 queue->group_handle = gc.group_handle;
615 return VK_SUCCESS;
616 }
617
618 static void
619 destroy_group(struct panvk_queue *queue)
620 {
621 const struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
622 struct drm_panthor_group_destroy gd = {
623 .group_handle = queue->group_handle,
624 };
625
626 ASSERTED int ret =
627 drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_DESTROY, &gd);
628 assert(!ret);
629 }
630
631 static VkResult
632 init_tiler(struct panvk_queue *queue)
633 {
634 struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
635 struct panvk_tiler_heap *tiler_heap = &queue->tiler_heap;
636 VkResult result;
637
638 /* We allocate the tiler heap descriptor and geometry buffer in one go,
639 * so we can pass them through a single 64-bit register to the VERTEX_TILER
640 * command streams. */
641 struct panvk_pool_alloc_info alloc_info = {
642 .size = (64 * 1024) + 4096,
643 .alignment = 4096,
644 };
645
646 tiler_heap->desc = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
647 if (!panvk_priv_mem_host_addr(tiler_heap->desc)) {
648 result = panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
649 "Failed to create a tiler heap context");
650 goto err_free_desc;
651 }
652
653 tiler_heap->chunk_size = 2 * 1024 * 1024;
654
655 struct drm_panthor_tiler_heap_create thc = {
656 .vm_id = pan_kmod_vm_handle(dev->kmod.vm),
657 .chunk_size = tiler_heap->chunk_size,
658 .initial_chunk_count = 5,
659 .max_chunks = 64,
660 .target_in_flight = 65535,
661 };
662
663 int ret =
664 drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE, &thc);
665 if (ret) {
666 result = panvk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
667 "Failed to create a tiler heap context");
668 goto err_free_desc;
669 }
670
671 tiler_heap->context.handle = thc.handle;
672 tiler_heap->context.dev_addr = thc.tiler_heap_ctx_gpu_va;
673
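   /* Fill the TILER_HEAP descriptor from the heap context returned by the
    * kernel: base points at the first chunk, bottom starts 64 bytes in, and
    * top is the end of the first chunk. */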
674 pan_cast_and_pack(panvk_priv_mem_host_addr(tiler_heap->desc), TILER_HEAP,
675 cfg) {
676 cfg.size = tiler_heap->chunk_size;
677 cfg.base = thc.first_heap_chunk_gpu_va;
678 cfg.bottom = cfg.base + 64;
679 cfg.top = cfg.base + cfg.size;
680 }
681
682 return VK_SUCCESS;
683
684 err_free_desc:
685 panvk_pool_free_mem(&tiler_heap->desc);
686 return result;
687 }
688
689 static void
690 cleanup_tiler(struct panvk_queue *queue)
691 {
692 struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
693 struct panvk_tiler_heap *tiler_heap = &queue->tiler_heap;
694 struct drm_panthor_tiler_heap_destroy thd = {
695 .handle = tiler_heap->context.handle,
696 };
697 ASSERTED int ret =
698 drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY, &thd);
699 assert(!ret);
700
701 panvk_pool_free_mem(&tiler_heap->desc);
702 }
703
704 struct panvk_queue_submit {
705 const struct panvk_instance *instance;
706 const struct panvk_physical_device *phys_dev;
707 struct panvk_device *dev;
708 struct panvk_queue *queue;
709
710 bool process_utrace;
711 bool force_sync;
712
713 uint32_t used_queue_mask;
714
715 uint32_t qsubmit_count;
716 bool needs_waits;
717 bool needs_signals;
718
719 struct drm_panthor_queue_submit *qsubmits;
720 struct drm_panthor_sync_op *wait_ops;
721 struct drm_panthor_sync_op *signal_ops;
722
723 struct {
724 uint32_t queue_mask;
725 enum panvk_subqueue_id first_subqueue;
726 enum panvk_subqueue_id last_subqueue;
727 bool needs_clone;
728 const struct u_trace *last_ut;
729 struct panvk_utrace_flush_data *data_storage;
730
731 struct panvk_utrace_flush_data *data[PANVK_SUBQUEUE_COUNT];
732 } utrace;
733 };
734
735 struct panvk_queue_submit_stack_storage {
736 struct drm_panthor_queue_submit qsubmits[8];
737 struct drm_panthor_sync_op syncops[8];
738 };
739
740 static void
741 panvk_queue_submit_init(struct panvk_queue_submit *submit,
742 struct vk_queue *vk_queue)
743 {
744 struct vk_device *vk_dev = vk_queue->base.device;
745
746 *submit = (struct panvk_queue_submit){
747 .instance = to_panvk_instance(vk_dev->physical->instance),
748 .phys_dev = to_panvk_physical_device(vk_dev->physical),
749 .dev = to_panvk_device(vk_dev),
750 .queue = container_of(vk_queue, struct panvk_queue, vk),
751 };
752
753 submit->process_utrace =
754 u_trace_should_process(&submit->dev->utrace.utctx) &&
755 submit->phys_dev->kmod.props.timestamp_frequency;
756
757 submit->force_sync =
758 submit->instance->debug_flags & (PANVK_DEBUG_TRACE | PANVK_DEBUG_SYNC);
759 }
760
761 static void
762 panvk_queue_submit_init_storage(
763 struct panvk_queue_submit *submit, const struct vk_queue_submit *vk_submit,
764 struct panvk_queue_submit_stack_storage *stack_storage)
765 {
766 submit->utrace.first_subqueue = PANVK_SUBQUEUE_COUNT;
767 for (uint32_t i = 0; i < vk_submit->command_buffer_count; i++) {
768 struct panvk_cmd_buffer *cmdbuf = container_of(
769 vk_submit->command_buffers[i], struct panvk_cmd_buffer, vk);
770
771 for (uint32_t j = 0; j < ARRAY_SIZE(cmdbuf->state.cs); j++) {
772 struct cs_builder *b = panvk_get_cs_builder(cmdbuf, j);
773 assert(cs_is_valid(b));
774 if (cs_is_empty(b))
775 continue;
776
777 submit->used_queue_mask |= BITFIELD_BIT(j);
778 submit->qsubmit_count++;
779
780 struct u_trace *ut = &cmdbuf->utrace.uts[j];
781 if (submit->process_utrace && u_trace_has_points(ut)) {
782 submit->utrace.queue_mask |= BITFIELD_BIT(j);
783 if (submit->utrace.first_subqueue == PANVK_SUBQUEUE_COUNT)
784 submit->utrace.first_subqueue = j;
785 submit->utrace.last_subqueue = j;
786 submit->utrace.last_ut = ut;
787
788 if (!(cmdbuf->flags &
789 VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) {
790 /* we will follow the user cs with a timestamp copy cs */
791 submit->qsubmit_count++;
792 submit->utrace.needs_clone = true;
793 }
794 }
795 }
796 }
797
798 /* Synchronize all subqueues if we have no command buffer submitted. */
799 if (!submit->qsubmit_count)
800 submit->used_queue_mask = BITFIELD_MASK(PANVK_SUBQUEUE_COUNT);
801
802 uint32_t syncop_count = 0;
803
804 submit->needs_waits = vk_submit->wait_count > 0;
805 submit->needs_signals = vk_submit->signal_count > 0 || submit->force_sync ||
806 submit->utrace.queue_mask;
807
808 /* We add sync-only queue submits to place our wait/signal operations. */
809 if (submit->needs_waits) {
810 submit->qsubmit_count += util_bitcount(submit->used_queue_mask);
811 syncop_count += vk_submit->wait_count;
812 }
813 if (submit->needs_signals) {
814 submit->qsubmit_count += util_bitcount(submit->used_queue_mask);
815 syncop_count += util_bitcount(submit->used_queue_mask);
816 }
817
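   /* Small submits fit in the caller-provided stack storage; larger ones fall
    * back to heap allocations released in panvk_queue_submit_cleanup_storage(). */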
818 submit->qsubmits =
819 submit->qsubmit_count <= ARRAY_SIZE(stack_storage->qsubmits)
820 ? stack_storage->qsubmits
821 : malloc(sizeof(*submit->qsubmits) * submit->qsubmit_count);
822
823 submit->wait_ops = syncop_count <= ARRAY_SIZE(stack_storage->syncops)
824 ? stack_storage->syncops
825 : malloc(sizeof(*submit->wait_ops) * syncop_count);
826 submit->signal_ops = submit->wait_ops + vk_submit->wait_count;
827
828 /* reset so that we can initialize submit->qsubmits incrementally */
829 submit->qsubmit_count = 0;
830
831 if (submit->utrace.queue_mask) {
832 submit->utrace.data_storage =
833 malloc(sizeof(*submit->utrace.data_storage) *
834 util_bitcount(submit->utrace.queue_mask));
835 }
836 }
837
838 static void
839 panvk_queue_submit_cleanup_storage(
840 struct panvk_queue_submit *submit,
841 const struct panvk_queue_submit_stack_storage *stack_storage)
842 {
843 if (submit->qsubmits != stack_storage->qsubmits)
844 free(submit->qsubmits);
845 if (submit->wait_ops != stack_storage->syncops)
846 free(submit->wait_ops);
847
848 /* either no utrace flush data or the data has been transferred to u_trace */
849 assert(!submit->utrace.data_storage);
850 }
851
852 static void
853 panvk_queue_submit_init_utrace(struct panvk_queue_submit *submit,
854 const struct vk_queue_submit *vk_submit)
855 {
856 struct panvk_device *dev = submit->dev;
857
858 if (!submit->utrace.queue_mask)
859 return;
860
861 /* u_trace_context processes trace events in order. We want to make sure
862 * it waits for the timestamp writes before processing the first event and
863 * it can free the flush data after processing the last event.
864 */
865 struct panvk_utrace_flush_data *next = submit->utrace.data_storage;
866 submit->utrace.data[submit->utrace.last_subqueue] = next++;
867
868 u_foreach_bit(i, submit->utrace.queue_mask) {
869 if (i != submit->utrace.last_subqueue)
870 submit->utrace.data[i] = next++;
871
872 const bool wait = i == submit->utrace.first_subqueue;
873 *submit->utrace.data[i] = (struct panvk_utrace_flush_data){
874 .subqueue = i,
875 .sync = wait ? submit->queue->utrace.sync : NULL,
876 .wait_value = wait ? submit->queue->utrace.next_value : 0,
877 };
878 }
879
880 if (submit->utrace.needs_clone) {
881 struct panvk_pool *clone_pool = &submit->utrace.data_storage->clone_pool;
882 panvk_per_arch(utrace_clone_init_pool)(clone_pool, dev);
883 }
884 }
885
886 static void
887 panvk_queue_submit_init_waits(struct panvk_queue_submit *submit,
888 const struct vk_queue_submit *vk_submit)
889 {
890 if (!submit->needs_waits)
891 return;
892
893 for (uint32_t i = 0; i < vk_submit->wait_count; i++) {
894 const struct vk_sync_wait *wait = &vk_submit->waits[i];
895 const struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(wait->sync);
896 assert(syncobj);
897
898 submit->wait_ops[i] = (struct drm_panthor_sync_op){
899 .flags = (syncobj->base.flags & VK_SYNC_IS_TIMELINE
900 ? DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ
901 : DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ) |
902 DRM_PANTHOR_SYNC_OP_WAIT,
903 .handle = syncobj->syncobj,
904 .timeline_value = wait->wait_value,
905 };
906 }
907
908 u_foreach_bit(i, submit->used_queue_mask) {
909 submit->qsubmits[submit->qsubmit_count++] =
910 (struct drm_panthor_queue_submit){
911 .queue_index = i,
912 .syncs =
913 DRM_PANTHOR_OBJ_ARRAY(vk_submit->wait_count, submit->wait_ops),
914 };
915 }
916 }
917
918 static void
919 panvk_queue_submit_init_cmdbufs(struct panvk_queue_submit *submit,
920 const struct vk_queue_submit *vk_submit)
921 {
922 struct panvk_device *dev = submit->dev;
923
924 for (uint32_t i = 0; i < vk_submit->command_buffer_count; i++) {
925 struct panvk_cmd_buffer *cmdbuf = container_of(
926 vk_submit->command_buffers[i], struct panvk_cmd_buffer, vk);
927
928 for (uint32_t j = 0; j < ARRAY_SIZE(cmdbuf->state.cs); j++) {
929 struct cs_builder *b = panvk_get_cs_builder(cmdbuf, j);
930 if (cs_is_empty(b))
931 continue;
932
933 submit->qsubmits[submit->qsubmit_count++] =
934 (struct drm_panthor_queue_submit){
935 .queue_index = j,
936 .stream_size = cs_root_chunk_size(b),
937 .stream_addr = cs_root_chunk_gpu_addr(b),
938 .latest_flush = cmdbuf->flush_id,
939 };
940 }
941
942 u_foreach_bit(j, submit->utrace.queue_mask) {
943 struct u_trace *ut = &cmdbuf->utrace.uts[j];
944
945 if (!u_trace_has_points(ut))
946 continue;
947
948 const bool free_data = ut == submit->utrace.last_ut;
949
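         /* Command buffers that may be resubmitted get their timestamps copied
          * into transient storage by an extra CS, so the utrace events stay
          * valid after a resubmit. */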
950 struct u_trace clone_ut;
951 if (!(cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) {
952 u_trace_init(&clone_ut, &dev->utrace.utctx);
953
954 struct panvk_pool *clone_pool =
955 &submit->utrace.data_storage->clone_pool;
956 struct cs_builder clone_builder;
957 panvk_per_arch(utrace_clone_init_builder)(&clone_builder,
958 clone_pool);
959
960 u_trace_clone_append(
961 u_trace_begin_iterator(ut), u_trace_end_iterator(ut), &clone_ut,
962 &clone_builder, panvk_per_arch(utrace_copy_buffer));
963
964 panvk_per_arch(utrace_clone_finish_builder)(&clone_builder);
965
966 submit->qsubmits[submit->qsubmit_count++] =
967 (struct drm_panthor_queue_submit){
968 .queue_index = j,
969 .stream_size = cs_root_chunk_size(&clone_builder),
970 .stream_addr = cs_root_chunk_gpu_addr(&clone_builder),
971 .latest_flush = panthor_kmod_get_flush_id(dev->kmod.dev),
972 };
973
974 ut = &clone_ut;
975 }
976
977 u_trace_flush(ut, submit->utrace.data[j], dev->vk.current_frame,
978 free_data);
979 }
980 }
981
982 /* we've transferred the data ownership to utrace, if any */
983 submit->utrace.data_storage = NULL;
984 }
985
986 static void
987 panvk_queue_submit_init_signals(struct panvk_queue_submit *submit,
988 const struct vk_queue_submit *vk_submit)
989 {
990 struct panvk_queue *queue = submit->queue;
991
992 if (!submit->needs_signals)
993 return;
994
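   /* Make each used subqueue signal its own point on the queue's internal
    * timeline syncobj; those points are then waited on and/or transferred to
    * the user sync objects in panvk_queue_submit_process_signals(). */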
995 uint32_t signal_op = 0;
996 u_foreach_bit(i, submit->used_queue_mask) {
997 submit->signal_ops[signal_op] = (struct drm_panthor_sync_op){
998 .flags = DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ |
999 DRM_PANTHOR_SYNC_OP_SIGNAL,
1000 .handle = queue->syncobj_handle,
1001 .timeline_value = signal_op + 1,
1002 };
1003
1004 submit->qsubmits[submit->qsubmit_count++] =
1005 (struct drm_panthor_queue_submit){
1006 .queue_index = i,
1007 .syncs = DRM_PANTHOR_OBJ_ARRAY(1, &submit->signal_ops[signal_op++]),
1008 };
1009 }
1010
1011 if (submit->force_sync) {
1012 struct panvk_cs_sync32 *debug_syncs =
1013 panvk_priv_mem_host_addr(queue->debug_syncobjs);
1014
1015 assert(debug_syncs);
1016 memset(debug_syncs, 0, sizeof(*debug_syncs) * PANVK_SUBQUEUE_COUNT);
1017 }
1018 }
1019
1020 static VkResult
1021 panvk_queue_submit_ioctl(struct panvk_queue_submit *submit)
1022 {
1023 const struct panvk_device *dev = submit->dev;
1024 const struct panvk_instance *instance = submit->instance;
1025 struct panvk_queue *queue = submit->queue;
1026 int ret;
1027
1028 if (instance->debug_flags & PANVK_DEBUG_TRACE) {
1029 /* If we're tracing, we need to reset the desc ringbufs and the CS
1030 * tracebuf. */
1031 for (uint32_t i = 0; i < ARRAY_SIZE(queue->subqueues); i++) {
1032 struct panvk_cs_subqueue_context *ctx =
1033 panvk_priv_mem_host_addr(queue->subqueues[i].context);
1034
1035 if (ctx->render.desc_ringbuf.ptr) {
1036 ctx->render.desc_ringbuf.ptr = queue->render_desc_ringbuf.addr.dev;
1037 ctx->render.desc_ringbuf.pos = 0;
1038 }
1039
1040 if (ctx->debug.tracebuf.cs)
1041 ctx->debug.tracebuf.cs = queue->subqueues[i].tracebuf.addr.dev;
1042 }
1043 }
1044
1045 struct drm_panthor_group_submit gsubmit = {
1046 .group_handle = queue->group_handle,
1047 .queue_submits =
1048 DRM_PANTHOR_OBJ_ARRAY(submit->qsubmit_count, submit->qsubmits),
1049 };
1050
1051 ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_SUBMIT, &gsubmit);
1052 if (ret)
1053 return vk_queue_set_lost(&queue->vk, "GROUP_SUBMIT: %m");
1054
1055 return VK_SUCCESS;
1056 }
1057
1058 static void
1059 panvk_queue_submit_process_signals(struct panvk_queue_submit *submit,
1060 const struct vk_queue_submit *vk_submit)
1061 {
1062 struct panvk_device *dev = submit->dev;
1063 struct panvk_queue *queue = submit->queue;
1064 int ret;
1065
1066 if (!submit->needs_signals)
1067 return;
1068
1069 if (submit->force_sync) {
1070 uint64_t point = util_bitcount(submit->used_queue_mask);
1071 ret = drmSyncobjTimelineWait(dev->vk.drm_fd, &queue->syncobj_handle,
1072 &point, 1, INT64_MAX,
1073 DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
1074 assert(!ret);
1075 }
1076
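   /* Forward the completion fence from our internal syncobj to every
    * user-provided signal object. */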
1077 for (uint32_t i = 0; i < vk_submit->signal_count; i++) {
1078 const struct vk_sync_signal *signal = &vk_submit->signals[i];
1079 const struct vk_drm_syncobj *syncobj =
1080 vk_sync_as_drm_syncobj(signal->sync);
1081 assert(syncobj);
1082
1083 drmSyncobjTransfer(dev->vk.drm_fd, syncobj->syncobj, signal->signal_value,
1084 queue->syncobj_handle, 0, 0);
1085 }
1086
1087 if (submit->utrace.queue_mask) {
1088 const struct vk_drm_syncobj *syncobj =
1089 vk_sync_as_drm_syncobj(queue->utrace.sync);
1090
1091 drmSyncobjTransfer(dev->vk.drm_fd, syncobj->syncobj,
1092 queue->utrace.next_value++, queue->syncobj_handle, 0,
1093 0);
1094
1095 /* process flushed events after the syncobj is set up */
1096 u_trace_context_process(&dev->utrace.utctx, false);
1097 }
1098
1099 drmSyncobjReset(dev->vk.drm_fd, &queue->syncobj_handle, 1);
1100 }
1101
1102 static void
1103 panvk_queue_submit_process_debug(const struct panvk_queue_submit *submit)
1104 {
1105 const struct panvk_instance *instance = submit->instance;
1106 struct panvk_queue *queue = submit->queue;
1107 struct pandecode_context *decode_ctx = submit->dev->debug.decode_ctx;
1108
1109 if (instance->debug_flags & PANVK_DEBUG_TRACE) {
1110 const struct pan_kmod_dev_props *props = &submit->phys_dev->kmod.props;
1111
1112 for (uint32_t i = 0; i < submit->qsubmit_count; i++) {
1113 const struct drm_panthor_queue_submit *qsubmit = &submit->qsubmits[i];
1114 if (!qsubmit->stream_size)
1115 continue;
1116
1117 pandecode_user_msg(decode_ctx, "CS %d on subqueue %d binaries\n\n", i,
1118 qsubmit->queue_index);
1119 pandecode_cs_binary(decode_ctx, qsubmit->stream_addr,
1120 qsubmit->stream_size, props->gpu_prod_id);
1121 pandecode_user_msg(decode_ctx, "\n");
1122 }
1123
1124 for (uint32_t i = 0; i < ARRAY_SIZE(queue->subqueues); i++) {
1125 struct panvk_cs_subqueue_context *ctx =
1126 panvk_priv_mem_host_addr(queue->subqueues[i].context);
1127
1128 size_t trace_size =
1129 ctx->debug.tracebuf.cs - queue->subqueues[i].tracebuf.addr.dev;
1130 if (!trace_size)
1131 continue;
1132
1133 assert(
1134 trace_size <= queue->subqueues[i].tracebuf.size ||
1135 !"OOB access on the CS tracebuf, pass a bigger PANVK_CS_TRACEBUF_SIZE");
1136
1137 assert(
1138 !ctx->render.desc_ringbuf.ptr ||
1139 ctx->render.desc_ringbuf.pos <= queue->render_desc_ringbuf.size ||
1140 !"OOB access on the desc tracebuf, pass a bigger PANVK_DESC_TRACEBUF_SIZE");
1141
1142 uint64_t trace = queue->subqueues[i].tracebuf.addr.dev;
1143
1144 pandecode_user_msg(decode_ctx, "\nCS traces on subqueue %d\n\n", i);
1145 pandecode_cs_trace(decode_ctx, trace, trace_size, props->gpu_prod_id);
1146 pandecode_user_msg(decode_ctx, "\n");
1147 }
1148 }
1149
1150 if (instance->debug_flags & PANVK_DEBUG_DUMP)
1151 pandecode_dump_mappings(decode_ctx);
1152
1153 if (instance->debug_flags & PANVK_DEBUG_TRACE)
1154 pandecode_next_frame(decode_ctx);
1155
1156 /* validate last after the command streams are dumped */
1157 if (submit->force_sync) {
1158 struct panvk_cs_sync32 *debug_syncs =
1159 panvk_priv_mem_host_addr(queue->debug_syncobjs);
1160 uint32_t debug_sync_points[PANVK_SUBQUEUE_COUNT] = {0};
1161
1162 for (uint32_t i = 0; i < submit->qsubmit_count; i++) {
1163 const struct drm_panthor_queue_submit *qsubmit = &submit->qsubmits[i];
1164 if (qsubmit->stream_size)
1165 debug_sync_points[qsubmit->queue_index]++;
1166 }
1167
1168 for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
1169 if (debug_syncs[i].seqno != debug_sync_points[i] ||
1170 debug_syncs[i].error != 0)
1171 vk_queue_set_lost(&queue->vk, "Incomplete job or timeout");
1172 }
1173 }
1174 }
1175
1176 static VkResult
1177 panvk_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *vk_submit)
1178 {
1179 struct panvk_queue_submit_stack_storage stack_storage;
1180 struct panvk_queue_submit submit;
1181 VkResult result = VK_SUCCESS;
1182
1183 if (vk_queue_is_lost(vk_queue))
1184 return VK_ERROR_DEVICE_LOST;
1185
1186 panvk_queue_submit_init(&submit, vk_queue);
1187 panvk_queue_submit_init_storage(&submit, vk_submit, &stack_storage);
1188 panvk_queue_submit_init_utrace(&submit, vk_submit);
1189 panvk_queue_submit_init_waits(&submit, vk_submit);
1190 panvk_queue_submit_init_cmdbufs(&submit, vk_submit);
1191 panvk_queue_submit_init_signals(&submit, vk_submit);
1192
1193 result = panvk_queue_submit_ioctl(&submit);
1194 if (result != VK_SUCCESS)
1195 goto out;
1196
1197 panvk_queue_submit_process_signals(&submit, vk_submit);
1198 panvk_queue_submit_process_debug(&submit);
1199
1200 out:
1201 panvk_queue_submit_cleanup_storage(&submit, &stack_storage);
1202 return result;
1203 }
1204
1205 static enum drm_panthor_group_priority
1206 get_panthor_group_priority(const VkDeviceQueueCreateInfo *create_info)
1207 {
1208 const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info =
1209 vk_find_struct_const(create_info->pNext,
1210 DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
1211 const VkQueueGlobalPriorityKHR priority =
1212 priority_info ? priority_info->globalPriority
1213 : VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR;
1214
1215 switch (priority) {
1216 case VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR:
1217 return PANTHOR_GROUP_PRIORITY_LOW;
1218 case VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR:
1219 return PANTHOR_GROUP_PRIORITY_MEDIUM;
1220 case VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR:
1221 return PANTHOR_GROUP_PRIORITY_HIGH;
1222 case VK_QUEUE_GLOBAL_PRIORITY_REALTIME_KHR:
1223 return PANTHOR_GROUP_PRIORITY_REALTIME;
1224 default:
1225 unreachable("Invalid global priority");
1226 }
1227 }
1228
1229 VkResult
1230 panvk_per_arch(queue_init)(struct panvk_device *dev, struct panvk_queue *queue,
1231 int idx, const VkDeviceQueueCreateInfo *create_info)
1232 {
1233 VkResult result = vk_queue_init(&queue->vk, &dev->vk, create_info, idx);
1234 if (result != VK_SUCCESS)
1235 return result;
1236
1237 int ret = drmSyncobjCreate(dev->vk.drm_fd, 0, &queue->syncobj_handle);
1238 if (ret) {
1239 result = panvk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
1240 "Failed to create our internal sync object");
1241 goto err_finish_queue;
1242 }
1243
1244 result = init_tiler(queue);
1245 if (result != VK_SUCCESS)
1246 goto err_destroy_syncobj;
1247
1248 result = create_group(queue, get_panthor_group_priority(create_info));
1249 if (result != VK_SUCCESS)
1250 goto err_cleanup_tiler;
1251
1252 result = init_queue(queue);
1253 if (result != VK_SUCCESS)
1254 goto err_destroy_group;
1255
1256 queue->vk.driver_submit = panvk_queue_submit;
1257 return VK_SUCCESS;
1258
1259 err_destroy_group:
1260 destroy_group(queue);
1261
1262 err_cleanup_tiler:
1263 cleanup_tiler(queue);
1264
1265 err_destroy_syncobj:
1266 drmSyncobjDestroy(dev->vk.drm_fd, queue->syncobj_handle);
1267
1268 err_finish_queue:
1269 vk_queue_finish(&queue->vk);
1270 return result;
1271 }
1272
1273 void
1274 panvk_per_arch(queue_finish)(struct panvk_queue *queue)
1275 {
1276 struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
1277
1278 cleanup_queue(queue);
1279 destroy_group(queue);
1280 cleanup_tiler(queue);
1281 drmSyncobjDestroy(dev->vk.drm_fd, queue->syncobj_handle);
1282 vk_queue_finish(&queue->vk);
1283 }
1284
1285 VkResult
1286 panvk_per_arch(queue_check_status)(struct panvk_queue *queue)
1287 {
1288 struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
1289 struct drm_panthor_group_get_state state = {
1290 .group_handle = queue->group_handle,
1291 };
1292
1293 int ret =
1294 drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_GET_STATE, &state);
1295 if (!ret && !state.state)
1296 return VK_SUCCESS;
1297
1298 vk_queue_set_lost(&queue->vk,
1299 "group state: err=%d, state=0x%x, fatal_queues=0x%x", ret,
1300 state.state, state.fatal_queues);
1301
1302 return VK_ERROR_DEVICE_LOST;
1303 }
1304