1 /*
2  * Copyright © 2024 Collabora Ltd.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "drm-uapi/panthor_drm.h"
8 
9 #include "genxml/cs_builder.h"
10 #include "genxml/decode.h"
11 
12 #include "panvk_cmd_buffer.h"
13 #include "panvk_macros.h"
14 #include "panvk_queue.h"
15 #include "panvk_utrace.h"
16 
17 #include "util/bitscan.h"
18 #include "vk_drm_syncobj.h"
19 #include "vk_log.h"
20 
21 #define MIN_DESC_TRACEBUF_SIZE (128 * 1024)
22 #define DEFAULT_DESC_TRACEBUF_SIZE (2 * 1024 * 1024)
23 #define MIN_CS_TRACEBUF_SIZE (512 * 1024)
24 #define DEFAULT_CS_TRACEBUF_SIZE (2 * 1024 * 1024)
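/* A minimal usage sketch (assuming the instance reads its debug flags from a
 * PANVK_DEBUG environment variable with a "trace" keyword): the defaults
 * above only apply when tracing is enabled, and both sizes can be overridden
 * through the environment variables read with debug_get_num_option() below,
 * as long as the values stay powers of two above the MIN_* thresholds.
 *
 *    PANVK_DEBUG=trace \
 *    PANVK_DESC_TRACEBUF_SIZE=$((4 * 1024 * 1024)) \
 *    PANVK_CS_TRACEBUF_SIZE=$((4 * 1024 * 1024)) ./my_vulkan_app
 */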
25 
26 static void
27 finish_render_desc_ringbuf(struct panvk_queue *queue)
28 {
29    struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
30    struct panvk_instance *instance =
31       to_panvk_instance(dev->vk.physical->instance);
32    bool tracing_enabled = instance->debug_flags & PANVK_DEBUG_TRACE;
33    struct panvk_desc_ringbuf *ringbuf = &queue->render_desc_ringbuf;
34 
35    panvk_pool_free_mem(&ringbuf->syncobj);
36 
37    if (dev->debug.decode_ctx && ringbuf->addr.dev) {
38       pandecode_inject_free(dev->debug.decode_ctx, ringbuf->addr.dev,
39                             ringbuf->size);
40       if (!tracing_enabled)
41          pandecode_inject_free(dev->debug.decode_ctx,
42                                ringbuf->addr.dev + ringbuf->size,
43                                ringbuf->size);
44    }
45 
46    if (ringbuf->addr.dev) {
47       struct pan_kmod_vm_op op = {
48          .type = PAN_KMOD_VM_OP_TYPE_UNMAP,
49          .va = {
50             .start = ringbuf->addr.dev,
51             .size = ringbuf->size * (tracing_enabled ? 2 : 1),
52          },
53       };
54 
55       ASSERTED int ret =
56          pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, &op, 1);
57       assert(!ret);
58 
59       simple_mtx_lock(&dev->as.lock);
60       util_vma_heap_free(&dev->as.heap, ringbuf->addr.dev, ringbuf->size * 2);
61       simple_mtx_unlock(&dev->as.lock);
62    }
63 
64    if (ringbuf->addr.host) {
65       ASSERTED int ret =
66          os_munmap(ringbuf->addr.host, ringbuf->size);
67       assert(!ret);
68    }
69 
70    pan_kmod_bo_put(ringbuf->bo);
71 }
72 
73 static VkResult
74 init_render_desc_ringbuf(struct panvk_queue *queue)
75 {
76    struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
77    struct panvk_instance *instance =
78       to_panvk_instance(dev->vk.physical->instance);
79    bool tracing_enabled = instance->debug_flags & PANVK_DEBUG_TRACE;
80    uint32_t flags = panvk_device_adjust_bo_flags(dev, PAN_KMOD_BO_FLAG_NO_MMAP);
81    struct panvk_desc_ringbuf *ringbuf = &queue->render_desc_ringbuf;
82    uint64_t dev_addr = 0;
83    int ret;
84 
85    if (tracing_enabled) {
86       ringbuf->size = debug_get_num_option("PANVK_DESC_TRACEBUF_SIZE",
87                                            DEFAULT_DESC_TRACEBUF_SIZE);
88       flags |= PAN_KMOD_BO_FLAG_GPU_UNCACHED;
89       assert(ringbuf->size > MIN_DESC_TRACEBUF_SIZE &&
90              util_is_power_of_two_nonzero(ringbuf->size));
91    } else {
92       ringbuf->size = RENDER_DESC_RINGBUF_SIZE;
93    }
94 
95    ringbuf->bo =
96       pan_kmod_bo_alloc(dev->kmod.dev, dev->kmod.vm, ringbuf->size, flags);
97    if (!ringbuf->bo)
98       return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
99                           "Failed to create a descriptor ring buffer context");
100 
101    if (!(flags & PAN_KMOD_BO_FLAG_NO_MMAP)) {
102       ringbuf->addr.host =
103          pan_kmod_bo_mmap(ringbuf->bo, 0, ringbuf->size, PROT_READ | PROT_WRITE,
104                           MAP_SHARED, NULL);
105       if (ringbuf->addr.host == MAP_FAILED)
106          return panvk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY,
107                              "Failed to CPU map ringbuf BO");
108    }
109 
110    /* We choose the alignment to guarantee that we won't ever cross a 4G
111     * boundary when accessing the mapping. This way we can encode the wraparound
112     * using 32-bit operations. */
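   /* Worked example of the claim above (illustrative, "pos" is a hypothetical
    * ring offset): the size is a power of two, so a 2 * size allocation
    * aligned on 2 * size can only start at a multiple of 2 * size, and 4G is
    * itself a multiple of 2 * size for any reasonable size. The double
    * mapping therefore never straddles a 4G boundary, which means adding a
    * ring offset to the low 32 bits of the base can never carry into the
    * upper 32 bits:
    *
    *    uint32_t lo  = (uint32_t)dev_addr + pos;          // pos < 2 * size
    *    uint64_t gpu = (dev_addr & ~0xffffffffull) | lo;  // == dev_addr + pos
    */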
113    simple_mtx_lock(&dev->as.lock);
114    dev_addr =
115       util_vma_heap_alloc(&dev->as.heap, ringbuf->size * 2, ringbuf->size * 2);
116    simple_mtx_unlock(&dev->as.lock);
117 
118    if (!dev_addr)
119       return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
120                           "Failed to allocate virtual address for ringbuf BO");
121 
122    struct pan_kmod_vm_op vm_ops[] = {
123       {
124          .type = PAN_KMOD_VM_OP_TYPE_MAP,
125          .va = {
126             .start = dev_addr,
127             .size = ringbuf->size,
128          },
129          .map = {
130             .bo = ringbuf->bo,
131             .bo_offset = 0,
132          },
133       },
134       {
135          .type = PAN_KMOD_VM_OP_TYPE_MAP,
136          .va = {
137             .start = dev_addr + ringbuf->size,
138             .size = ringbuf->size,
139          },
140          .map = {
141             .bo = ringbuf->bo,
142             .bo_offset = 0,
143          },
144       },
145    };
146 
147    /* If tracing is enabled, we keep the second part of the mapping unmapped
148     * to serve as a guard region. */
149    ret = pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, vm_ops,
150                           tracing_enabled ? 1 : ARRAY_SIZE(vm_ops));
151    if (ret) {
152       simple_mtx_lock(&dev->as.lock);
153       util_vma_heap_free(&dev->as.heap, dev_addr, ringbuf->size * 2);
154       simple_mtx_unlock(&dev->as.lock);
155       return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
156                           "Failed to GPU map ringbuf BO");
157    }
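   /* Sketch of what the two back-to-back mappings buy us when tracing is
    * disabled (illustrative, "pos", "len" and write_descs() are hypothetical):
    * the second mapping aliases the start of the BO, so a descriptor block
    * that runs past the end of the ring spills onto valid pages that alias
    * offset 0, and writers never have to split a copy in two.
    *
    *    uint64_t va = ringbuf->addr.dev + (pos & (ringbuf->size - 1));
    *    write_descs(va, len);   // may cross ringbuf->size without wrapping
    *
    * With tracing enabled the second half stays unmapped instead, turning
    * overruns into faults.
    */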
158 
159    ringbuf->addr.dev = dev_addr;
160 
161    if (dev->debug.decode_ctx) {
162       pandecode_inject_mmap(dev->debug.decode_ctx, ringbuf->addr.dev,
163                             ringbuf->addr.host, ringbuf->size, NULL);
164       if (!tracing_enabled)
165          pandecode_inject_mmap(dev->debug.decode_ctx,
166                                ringbuf->addr.dev + ringbuf->size,
167                                ringbuf->addr.host, ringbuf->size, NULL);
168    }
169 
170    struct panvk_pool_alloc_info alloc_info = {
171       .size = sizeof(struct panvk_cs_sync32),
172       .alignment = 64,
173    };
174 
175    ringbuf->syncobj = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
176 
177    struct panvk_cs_sync32 *syncobj = panvk_priv_mem_host_addr(ringbuf->syncobj);
178 
179    if (!syncobj)
180       return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
181                           "Failed to create the render desc ringbuf context");
182 
183    *syncobj = (struct panvk_cs_sync32){
184       .seqno = RENDER_DESC_RINGBUF_SIZE,
185    };
186 
187    return VK_SUCCESS;
188 }
189 
190 static void
191 finish_subqueue_tracing(struct panvk_queue *queue,
192                         enum panvk_subqueue_id subqueue)
193 {
194    struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
195    struct panvk_subqueue *subq = &queue->subqueues[subqueue];
196 
197    if (subq->tracebuf.addr.dev) {
198       size_t pgsize = getpagesize();
199 
200       pandecode_inject_free(dev->debug.decode_ctx, subq->tracebuf.addr.dev,
201                             subq->tracebuf.size);
202 
203       struct pan_kmod_vm_op op = {
204          .type = PAN_KMOD_VM_OP_TYPE_UNMAP,
205          .va = {
206             .start = subq->tracebuf.addr.dev,
207             .size = subq->tracebuf.size,
208          },
209       };
210 
211       ASSERTED int ret =
212          pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, &op, 1);
213       assert(!ret);
214 
215       simple_mtx_lock(&dev->as.lock);
216       util_vma_heap_free(&dev->as.heap, subq->tracebuf.addr.dev,
217                          subq->tracebuf.size + pgsize);
218       simple_mtx_unlock(&dev->as.lock);
219    }
220 
221    if (subq->tracebuf.addr.host) {
222       ASSERTED int ret =
223          os_munmap(subq->tracebuf.addr.host, subq->tracebuf.size);
224       assert(!ret);
225    }
226 
227    pan_kmod_bo_put(subq->tracebuf.bo);
228 
229    vk_free(&dev->vk.alloc, subq->reg_file);
230 }
231 
232 static VkResult
233 init_subqueue_tracing(struct panvk_queue *queue,
234                       enum panvk_subqueue_id subqueue)
235 {
236    struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
237    struct panvk_subqueue *subq = &queue->subqueues[subqueue];
238    struct panvk_instance *instance =
239       to_panvk_instance(dev->vk.physical->instance);
240    unsigned debug = instance->debug_flags;
241    uint64_t dev_addr;
242 
243    if (!(debug & PANVK_DEBUG_TRACE))
244       return VK_SUCCESS;
245 
246    subq->reg_file =
247       vk_zalloc(&dev->vk.alloc, sizeof(uint32_t) * 256, sizeof(uint64_t),
248                 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
249    if (!subq->reg_file)
250       return panvk_errorf(dev->vk.physical, VK_ERROR_OUT_OF_HOST_MEMORY,
251                           "Failed to allocate reg file cache");
252 
253    subq->tracebuf.size = debug_get_num_option("PANVK_CS_TRACEBUF_SIZE",
254                                               DEFAULT_CS_TRACEBUF_SIZE);
255    assert(subq->tracebuf.size > MIN_CS_TRACEBUF_SIZE &&
256           util_is_power_of_two_nonzero(subq->tracebuf.size));
257 
258    subq->tracebuf.bo =
259       pan_kmod_bo_alloc(dev->kmod.dev, dev->kmod.vm, subq->tracebuf.size,
260                         PAN_KMOD_BO_FLAG_GPU_UNCACHED);
261    if (!subq->tracebuf.bo)
262       return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
263                           "Failed to create a CS tracebuf");
264 
265    subq->tracebuf.addr.host =
266       pan_kmod_bo_mmap(subq->tracebuf.bo, 0, subq->tracebuf.size,
267                        PROT_READ | PROT_WRITE, MAP_SHARED, NULL);
268    if (subq->tracebuf.addr.host == MAP_FAILED) {
269       subq->tracebuf.addr.host = NULL;
270       return panvk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY,
271                           "Failed to CPU map tracebuf");
272    }
273 
274    /* Add a guard page. */
275    size_t pgsize = getpagesize();
276    simple_mtx_lock(&dev->as.lock);
277    dev_addr =
278       util_vma_heap_alloc(&dev->as.heap, subq->tracebuf.size + pgsize, pgsize);
279    simple_mtx_unlock(&dev->as.lock);
280 
281    if (!dev_addr)
282       return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
283                           "Failed to allocate virtual address for tracebuf");
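   /* Resulting VA layout (illustrative):
    *
    *    dev_addr                       dev_addr + tracebuf.size
    *    |-------- tracebuf (mapped) --------|-- guard page (unmapped) --|
    *
    * Only the tracebuf itself gets bound below; the trailing page stays
    * unmapped so a CS overrunning its trace buffer faults instead of
    * silently corrupting a neighbouring allocation.
    */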
284 
285    struct pan_kmod_vm_op vm_op = {
286       .type = PAN_KMOD_VM_OP_TYPE_MAP,
287       .va = {
288          .start = dev_addr,
289          .size = subq->tracebuf.size,
290       },
291       .map = {
292          .bo = subq->tracebuf.bo,
293          .bo_offset = 0,
294       },
295    };
296 
297    /* Map only the tracebuf itself; the extra page allocated above is left
298     * unmapped to act as a guard page. */
299    int ret =
300       pan_kmod_vm_bind(dev->kmod.vm, PAN_KMOD_VM_OP_MODE_IMMEDIATE, &vm_op, 1);
301    if (ret) {
302       simple_mtx_lock(&dev->as.lock);
303       util_vma_heap_free(&dev->as.heap, dev_addr, subq->tracebuf.size + pgsize);
304       simple_mtx_unlock(&dev->as.lock);
305       return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
306                           "Failed to GPU map tracebuf BO");
307    }
308 
309    subq->tracebuf.addr.dev = dev_addr;
310 
311    if (dev->debug.decode_ctx) {
312       pandecode_inject_mmap(dev->debug.decode_ctx, subq->tracebuf.addr.dev,
313                             subq->tracebuf.addr.host, subq->tracebuf.size,
314                             NULL);
315    }
316 
317    return VK_SUCCESS;
318 }
319 
320 static void
321 finish_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
322 {
323    panvk_pool_free_mem(&queue->subqueues[subqueue].context);
324    finish_subqueue_tracing(queue, subqueue);
325 }
326 
327 static VkResult
328 init_utrace(struct panvk_queue *queue)
329 {
330    struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
331    const struct panvk_physical_device *phys_dev =
332       to_panvk_physical_device(dev->vk.physical);
333    VkResult result;
334 
335    const struct vk_sync_type *sync_type = phys_dev->sync_types[0];
336    assert(sync_type && vk_sync_type_is_drm_syncobj(sync_type) &&
337           (sync_type->features & VK_SYNC_FEATURE_TIMELINE));
338 
339    result = vk_sync_create(&dev->vk, sync_type, VK_SYNC_IS_TIMELINE, 0,
340                            &queue->utrace.sync);
341    if (result != VK_SUCCESS)
342       return result;
343 
344    queue->utrace.next_value = 1;
345 
346    return VK_SUCCESS;
347 }
348 
349 static VkResult
350 init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue)
351 {
352    struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
353    struct panvk_subqueue *subq = &queue->subqueues[subqueue];
354    const struct panvk_physical_device *phys_dev =
355       to_panvk_physical_device(queue->vk.base.device->physical);
356    struct panvk_instance *instance =
357       to_panvk_instance(dev->vk.physical->instance);
358    unsigned debug = instance->debug_flags;
359    struct panvk_cs_sync64 *syncobjs = panvk_priv_mem_host_addr(queue->syncobjs);
360 
361    VkResult result = init_subqueue_tracing(queue, subqueue);
362    if (result != VK_SUCCESS)
363       return result;
364 
365    struct panvk_pool_alloc_info alloc_info = {
366       .size = sizeof(struct panvk_cs_subqueue_context),
367       .alignment = 64,
368    };
369 
370    /* When tracing is enabled, we want to use a non-cached pool, so we can
371     * get an up-to-date context even if the CS crashes in the middle. */
372    struct panvk_pool *mempool =
373       (debug & PANVK_DEBUG_TRACE) ? &dev->mempools.rw_nc : &dev->mempools.rw;
374 
375    subq->context = panvk_pool_alloc_mem(mempool, alloc_info);
376    if (!panvk_priv_mem_host_addr(subq->context))
377       return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
378                           "Failed to create a queue context");
379 
380    struct panvk_cs_subqueue_context *cs_ctx =
381       panvk_priv_mem_host_addr(subq->context);
382 
383    *cs_ctx = (struct panvk_cs_subqueue_context){
384       .syncobjs = panvk_priv_mem_dev_addr(queue->syncobjs),
385       .debug.syncobjs = panvk_priv_mem_dev_addr(queue->debug_syncobjs),
386       .debug.tracebuf.cs = subq->tracebuf.addr.dev,
387       .iter_sb = 0,
388       .tiler_oom_ctx.reg_dump_addr =
389          panvk_priv_mem_dev_addr(queue->tiler_oom_regs_save),
390    };
391 
392    /* We use the geometry buffer for our temporary CS buffer. */
393    struct cs_buffer root_cs = {
394       .cpu = panvk_priv_mem_host_addr(queue->tiler_heap.desc) + 4096,
395       .gpu = panvk_priv_mem_dev_addr(queue->tiler_heap.desc) + 4096,
396       .capacity = 64 * 1024 / sizeof(uint64_t),
397    };
398    const struct cs_builder_conf conf = {
399       .nr_registers = 96,
400       .nr_kernel_registers = 4,
401    };
402    struct cs_builder b;
403 
404    assert(panvk_priv_mem_dev_addr(queue->tiler_heap.desc) != 0);
405 
406    cs_builder_init(&b, &conf, root_cs);
407    /* Pass the context. */
408    cs_move64_to(&b, cs_subqueue_ctx_reg(&b),
409                 panvk_priv_mem_dev_addr(subq->context));
410 
411    /* Initialize scoreboard slots used for asynchronous operations. */
412    cs_set_scoreboard_entry(&b, SB_ITER(0), SB_ID(LS));
413 
414    /* We do a greater-than test on sync objects, and since the reference seqno
415     * registers are all zero at init time, we need to initialize all syncobjs
416     * with a seqno of one. */
417    syncobjs[subqueue].seqno = 1;
418 
419    if (subqueue != PANVK_SUBQUEUE_COMPUTE) {
420       cs_ctx->render.tiler_heap =
421          panvk_priv_mem_dev_addr(queue->tiler_heap.desc);
422       /* Our geometry buffer comes 4k after the tiler heap, and we encode the
423        * size in the lower 12 bits so the address can be copied directly
424        * to the tiler descriptors. */
425       cs_ctx->render.geom_buf =
426          (cs_ctx->render.tiler_heap + 4096) | ((64 * 1024) >> 12);
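      /* Worked example of the encoding above (illustrative): the tiler heap
       * descriptor is allocated with 4 KiB alignment, so the low 12 bits of
       * the geometry buffer address are known to be zero and can carry the
       * encoded size instead:
       *
       *    (64 * 1024) >> 12 == 16
       *    geom_buf = (tiler_heap + 0x1000) | 16
       *
       * which lets the value be copied into the tiler descriptors verbatim,
       * as the comment above notes.
       */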
427 
428       /* Initialize the ringbuf */
429       cs_ctx->render.desc_ringbuf = (struct panvk_cs_desc_ringbuf){
430          .syncobj = panvk_priv_mem_dev_addr(queue->render_desc_ringbuf.syncobj),
431          .ptr = queue->render_desc_ringbuf.addr.dev,
432          .pos = 0,
433       };
434 
435       struct cs_index heap_ctx_addr = cs_scratch_reg64(&b, 0);
436 
437       /* Pre-set the heap context on the vertex-tiler/fragment queues. */
438       cs_move64_to(&b, heap_ctx_addr, queue->tiler_heap.context.dev_addr);
439       cs_heap_set(&b, heap_ctx_addr);
440    }
441 
442    cs_finish(&b);
443 
444    assert(cs_is_valid(&b));
445 
446    struct drm_panthor_sync_op syncop = {
447       .flags =
448          DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ | DRM_PANTHOR_SYNC_OP_SIGNAL,
449       .handle = queue->syncobj_handle,
450       .timeline_value = 0,
451    };
452    struct drm_panthor_queue_submit qsubmit = {
453       .queue_index = subqueue,
454       .stream_size = cs_root_chunk_size(&b),
455       .stream_addr = cs_root_chunk_gpu_addr(&b),
456       .latest_flush = panthor_kmod_get_flush_id(dev->kmod.dev),
457       .syncs = DRM_PANTHOR_OBJ_ARRAY(1, &syncop),
458    };
459    struct drm_panthor_group_submit gsubmit = {
460       .group_handle = queue->group_handle,
461       .queue_submits = DRM_PANTHOR_OBJ_ARRAY(1, &qsubmit),
462    };
463 
464    int ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_SUBMIT, &gsubmit);
465    if (ret)
466       return panvk_errorf(dev->vk.physical, VK_ERROR_INITIALIZATION_FAILED,
467                           "Failed to initialize subqueue: %m");
468 
469    ret = drmSyncobjWait(dev->vk.drm_fd, &queue->syncobj_handle, 1, INT64_MAX, 0,
470                         NULL);
471    if (ret)
472       return panvk_errorf(dev->vk.physical, VK_ERROR_INITIALIZATION_FAILED,
473                           "SyncobjWait failed: %m");
474 
475    if (debug & PANVK_DEBUG_TRACE) {
476       pandecode_user_msg(dev->debug.decode_ctx, "Init subqueue %d binary\n\n",
477                          subqueue);
478       pandecode_cs_binary(dev->debug.decode_ctx, qsubmit.stream_addr,
479                           qsubmit.stream_size,
480                           phys_dev->kmod.props.gpu_prod_id);
481    }
482 
483    return VK_SUCCESS;
484 }
485 
486 static void
487 cleanup_queue(struct panvk_queue *queue)
488 {
489    struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
490 
491    for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++)
492       finish_subqueue(queue, i);
493 
494    if (queue->utrace.sync)
495       vk_sync_destroy(&dev->vk, queue->utrace.sync);
496 
497    finish_render_desc_ringbuf(queue);
498 
499    panvk_pool_free_mem(&queue->tiler_oom_regs_save);
500    panvk_pool_free_mem(&queue->debug_syncobjs);
501    panvk_pool_free_mem(&queue->syncobjs);
502 }
503 
504 static VkResult
505 init_queue(struct panvk_queue *queue)
506 {
507    struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
508    struct panvk_instance *instance =
509       to_panvk_instance(dev->vk.physical->instance);
510    unsigned debug = instance->debug_flags;
511    VkResult result;
512 
513    struct panvk_pool_alloc_info alloc_info = {
514       .size =
515          ALIGN_POT(sizeof(struct panvk_cs_sync64), 64) * PANVK_SUBQUEUE_COUNT,
516       .alignment = 64,
517    };
518 
519    queue->syncobjs = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
520    if (!panvk_priv_mem_host_addr(queue->syncobjs))
521       return panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
522                           "Failed to allocate subqueue sync objects");
523 
524    if (instance->debug_flags & (PANVK_DEBUG_SYNC | PANVK_DEBUG_TRACE)) {
525       alloc_info.size =
526          ALIGN_POT(sizeof(struct panvk_cs_sync32), 64) * PANVK_SUBQUEUE_COUNT;
527       queue->debug_syncobjs =
528          panvk_pool_alloc_mem(&dev->mempools.rw_nc, alloc_info);
529       if (!panvk_priv_mem_host_addr(queue->debug_syncobjs)) {
530          result = panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
531                                "Failed to allocate subqueue sync objects");
532          goto err_cleanup_queue;
533       }
534    }
535 
536    alloc_info.size = dev->tiler_oom.dump_region_size;
537    alloc_info.alignment = sizeof(uint32_t);
538    queue->tiler_oom_regs_save =
539       panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
540    if (!panvk_priv_mem_host_addr(queue->tiler_oom_regs_save)) {
541       result = panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
542                             "Failed to allocate tiler oom register save area");
543       goto err_cleanup_queue;
544    }
545 
546    result = init_render_desc_ringbuf(queue);
547    if (result != VK_SUCCESS)
548       goto err_cleanup_queue;
549 
550    result = init_utrace(queue);
551    if (result != VK_SUCCESS)
552       goto err_cleanup_queue;
553 
554    for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
555       result = init_subqueue(queue, i);
556       if (result != VK_SUCCESS)
557          goto err_cleanup_queue;
558    }
559 
560    if (debug & PANVK_DEBUG_TRACE)
561       pandecode_next_frame(dev->debug.decode_ctx);
562 
563    return VK_SUCCESS;
564 
565 err_cleanup_queue:
566    cleanup_queue(queue);
567    return result;
568 }
569 
570 static VkResult
571 create_group(struct panvk_queue *queue,
572              enum drm_panthor_group_priority group_priority)
573 {
574    const struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
575    const struct panvk_physical_device *phys_dev =
576       to_panvk_physical_device(queue->vk.base.device->physical);
577 
578    struct drm_panthor_queue_create qc[] = {
579       [PANVK_SUBQUEUE_VERTEX_TILER] =
580          {
581             .priority = 1,
582             .ringbuf_size = 64 * 1024,
583          },
584       [PANVK_SUBQUEUE_FRAGMENT] =
585          {
586             .priority = 1,
587             .ringbuf_size = 64 * 1024,
588          },
589       [PANVK_SUBQUEUE_COMPUTE] =
590          {
591             .priority = 1,
592             .ringbuf_size = 64 * 1024,
593          },
594    };
595 
596    struct drm_panthor_group_create gc = {
597       .compute_core_mask = phys_dev->kmod.props.shader_present,
598       .fragment_core_mask = phys_dev->kmod.props.shader_present,
599       .tiler_core_mask = 1,
600       .max_compute_cores = util_bitcount64(phys_dev->kmod.props.shader_present),
601       .max_fragment_cores =
602          util_bitcount64(phys_dev->kmod.props.shader_present),
603       .max_tiler_cores = 1,
604       .priority = group_priority,
605       .queues = DRM_PANTHOR_OBJ_ARRAY(ARRAY_SIZE(qc), qc),
606       .vm_id = pan_kmod_vm_handle(dev->kmod.vm),
607    };
608 
609    int ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_CREATE, &gc);
610    if (ret)
611       return panvk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
612                           "Failed to create a scheduling group");
613 
614    queue->group_handle = gc.group_handle;
615    return VK_SUCCESS;
616 }
617 
618 static void
619 destroy_group(struct panvk_queue *queue)
620 {
621    const struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
622    struct drm_panthor_group_destroy gd = {
623       .group_handle = queue->group_handle,
624    };
625 
626    ASSERTED int ret =
627       drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_DESTROY, &gd);
628    assert(!ret);
629 }
630 
631 static VkResult
632 init_tiler(struct panvk_queue *queue)
633 {
634    struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
635    struct panvk_tiler_heap *tiler_heap = &queue->tiler_heap;
636    VkResult result;
637 
638    /* We allocate the tiler heap descriptor and geometry buffer in one go,
639     * so both can be passed to the VERTEX_TILER command streams through a
640     * single 64-bit register. */
641    struct panvk_pool_alloc_info alloc_info = {
642       .size = (64 * 1024) + 4096,
643       .alignment = 4096,
644    };
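   /* Layout of the combined allocation (illustrative):
    *
    *    +0x0000  tiler heap descriptor (TILER_HEAP), 4 KiB reserved
    *    +0x1000  geometry buffer, 64 KiB, also borrowed as the temporary CS
    *             buffer in init_subqueue()
    *
    * A single 4 KiB-aligned base address is enough to derive both pointers.
    */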
645 
646    tiler_heap->desc = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
647    if (!panvk_priv_mem_host_addr(tiler_heap->desc)) {
648       result = panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
649                             "Failed to create a tiler heap context");
650       goto err_free_desc;
651    }
652 
653    tiler_heap->chunk_size = 2 * 1024 * 1024;
654 
655    struct drm_panthor_tiler_heap_create thc = {
656       .vm_id = pan_kmod_vm_handle(dev->kmod.vm),
657       .chunk_size = tiler_heap->chunk_size,
658       .initial_chunk_count = 5,
659       .max_chunks = 64,
660       .target_in_flight = 65535,
661    };
662 
663    int ret =
664       drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE, &thc);
665    if (ret) {
666       result = panvk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
667                             "Failed to create a tiler heap context");
668       goto err_free_desc;
669    }
670 
671    tiler_heap->context.handle = thc.handle;
672    tiler_heap->context.dev_addr = thc.tiler_heap_ctx_gpu_va;
673 
674    pan_cast_and_pack(panvk_priv_mem_host_addr(tiler_heap->desc), TILER_HEAP,
675                      cfg) {
676       cfg.size = tiler_heap->chunk_size;
677       cfg.base = thc.first_heap_chunk_gpu_va;
678       cfg.bottom = cfg.base + 64;
679       cfg.top = cfg.base + cfg.size;
680    }
681 
682    return VK_SUCCESS;
683 
684 err_free_desc:
685    panvk_pool_free_mem(&tiler_heap->desc);
686    return result;
687 }
688 
689 static void
690 cleanup_tiler(struct panvk_queue *queue)
691 {
692    struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
693    struct panvk_tiler_heap *tiler_heap = &queue->tiler_heap;
694    struct drm_panthor_tiler_heap_destroy thd = {
695       .handle = tiler_heap->context.handle,
696    };
697    ASSERTED int ret =
698       drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY, &thd);
699    assert(!ret);
700 
701    panvk_pool_free_mem(&tiler_heap->desc);
702 }
703 
704 struct panvk_queue_submit {
705    const struct panvk_instance *instance;
706    const struct panvk_physical_device *phys_dev;
707    struct panvk_device *dev;
708    struct panvk_queue *queue;
709 
710    bool process_utrace;
711    bool force_sync;
712 
713    uint32_t used_queue_mask;
714 
715    uint32_t qsubmit_count;
716    bool needs_waits;
717    bool needs_signals;
718 
719    struct drm_panthor_queue_submit *qsubmits;
720    struct drm_panthor_sync_op *wait_ops;
721    struct drm_panthor_sync_op *signal_ops;
722 
723    struct {
724       uint32_t queue_mask;
725       enum panvk_subqueue_id first_subqueue;
726       enum panvk_subqueue_id last_subqueue;
727       bool needs_clone;
728       const struct u_trace *last_ut;
729       struct panvk_utrace_flush_data *data_storage;
730 
731       struct panvk_utrace_flush_data *data[PANVK_SUBQUEUE_COUNT];
732    } utrace;
733 };
734 
735 struct panvk_queue_submit_stack_storage {
736    struct drm_panthor_queue_submit qsubmits[8];
737    struct drm_panthor_sync_op syncops[8];
738 };
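/* The fixed-size arrays above are a small-submission fast path: if a submit
 * needs more queue submits or sync ops than fit here, the init_storage helper
 * below falls back to malloc(), and cleanup_storage only frees what was not
 * taken from the stack. A minimal sketch of the pattern, with hypothetical
 * names:
 *
 *    struct foo stack[8];
 *    struct foo *p =
 *       n <= ARRAY_SIZE(stack) ? stack : malloc(n * sizeof(*p));
 *    ...
 *    if (p != stack)
 *       free(p);
 */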
739 
740 static void
741 panvk_queue_submit_init(struct panvk_queue_submit *submit,
742                         struct vk_queue *vk_queue)
743 {
744    struct vk_device *vk_dev = vk_queue->base.device;
745 
746    *submit = (struct panvk_queue_submit){
747       .instance = to_panvk_instance(vk_dev->physical->instance),
748       .phys_dev = to_panvk_physical_device(vk_dev->physical),
749       .dev = to_panvk_device(vk_dev),
750       .queue = container_of(vk_queue, struct panvk_queue, vk),
751    };
752 
753    submit->process_utrace =
754       u_trace_should_process(&submit->dev->utrace.utctx) &&
755       submit->phys_dev->kmod.props.timestamp_frequency;
756 
757    submit->force_sync =
758       submit->instance->debug_flags & (PANVK_DEBUG_TRACE | PANVK_DEBUG_SYNC);
759 }
760 
761 static void
762 panvk_queue_submit_init_storage(
763    struct panvk_queue_submit *submit, const struct vk_queue_submit *vk_submit,
764    struct panvk_queue_submit_stack_storage *stack_storage)
765 {
766    submit->utrace.first_subqueue = PANVK_SUBQUEUE_COUNT;
767    for (uint32_t i = 0; i < vk_submit->command_buffer_count; i++) {
768       struct panvk_cmd_buffer *cmdbuf = container_of(
769          vk_submit->command_buffers[i], struct panvk_cmd_buffer, vk);
770 
771       for (uint32_t j = 0; j < ARRAY_SIZE(cmdbuf->state.cs); j++) {
772          struct cs_builder *b = panvk_get_cs_builder(cmdbuf, j);
773          assert(cs_is_valid(b));
774          if (cs_is_empty(b))
775             continue;
776 
777          submit->used_queue_mask |= BITFIELD_BIT(j);
778          submit->qsubmit_count++;
779 
780          struct u_trace *ut = &cmdbuf->utrace.uts[j];
781          if (submit->process_utrace && u_trace_has_points(ut)) {
782             submit->utrace.queue_mask |= BITFIELD_BIT(j);
783             if (submit->utrace.first_subqueue == PANVK_SUBQUEUE_COUNT)
784                submit->utrace.first_subqueue = j;
785             submit->utrace.last_subqueue = j;
786             submit->utrace.last_ut = ut;
787 
788             if (!(cmdbuf->flags &
789                   VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) {
790                /* we will follow the user cs with a timestamp copy cs */
791                submit->qsubmit_count++;
792                submit->utrace.needs_clone = true;
793             }
794          }
795       }
796    }
797 
798    /* Synchronize all subqueues if no command buffers were submitted. */
799    if (!submit->qsubmit_count)
800       submit->used_queue_mask = BITFIELD_MASK(PANVK_SUBQUEUE_COUNT);
801 
802    uint32_t syncop_count = 0;
803 
804    submit->needs_waits = vk_submit->wait_count > 0;
805    submit->needs_signals = vk_submit->signal_count > 0 || submit->force_sync ||
806                            submit->utrace.queue_mask;
807 
808    /* We add sync-only queue submits to place our wait/signal operations. */
809    if (submit->needs_waits) {
810       submit->qsubmit_count += util_bitcount(submit->used_queue_mask);
811       syncop_count += vk_submit->wait_count;
812    }
813    if (submit->needs_signals) {
814       submit->qsubmit_count += util_bitcount(submit->used_queue_mask);
815       syncop_count += util_bitcount(submit->used_queue_mask);
816    }
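   /* Worked example of the counts above (illustrative): two non-empty command
    * streams, one on the vertex-tiler subqueue and one on the fragment
    * subqueue, plus one wait and one signal semaphore. used_queue_mask has
    * two bits set, so:
    *
    *    qsubmit_count = 2 (user CS) + 2 (wait-only) + 2 (signal-only) = 6
    *    syncop_count  = 1 (wait)    + 2 (per-subqueue signals)        = 3
    *
    * Both fit in the stack storage; wait_ops points at syncops[0] and
    * signal_ops at syncops[1].
    */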
817 
818    submit->qsubmits =
819       submit->qsubmit_count <= ARRAY_SIZE(stack_storage->qsubmits)
820          ? stack_storage->qsubmits
821          : malloc(sizeof(*submit->qsubmits) * submit->qsubmit_count);
822 
823    submit->wait_ops = syncop_count <= ARRAY_SIZE(stack_storage->syncops)
824                          ? stack_storage->syncops
825                          : malloc(sizeof(*submit->wait_ops) * syncop_count);
826    submit->signal_ops = submit->wait_ops + vk_submit->wait_count;
827 
828    /* reset so that we can initialize submit->qsubmits incrementally */
829    submit->qsubmit_count = 0;
830 
831    if (submit->utrace.queue_mask) {
832       submit->utrace.data_storage =
833          malloc(sizeof(*submit->utrace.data_storage) *
834                 util_bitcount(submit->utrace.queue_mask));
835    }
836 }
837 
838 static void
839 panvk_queue_submit_cleanup_storage(
840    struct panvk_queue_submit *submit,
841    const struct panvk_queue_submit_stack_storage *stack_storage)
842 {
843    if (submit->qsubmits != stack_storage->qsubmits)
844       free(submit->qsubmits);
845    if (submit->wait_ops != stack_storage->syncops)
846       free(submit->wait_ops);
847 
848    /* either no utrace flush data or the data has been transferred to u_trace */
849    assert(!submit->utrace.data_storage);
850 }
851 
852 static void
853 panvk_queue_submit_init_utrace(struct panvk_queue_submit *submit,
854                                const struct vk_queue_submit *vk_submit)
855 {
856    struct panvk_device *dev = submit->dev;
857 
858    if (!submit->utrace.queue_mask)
859       return;
860 
861    /* u_trace_context processes trace events in order. We want to make sure
862     * it waits for the timestamp writes before processing the first event,
863     * and that it only frees the flush data after processing the last event.
864     */
865    struct panvk_utrace_flush_data *next = submit->utrace.data_storage;
866    submit->utrace.data[submit->utrace.last_subqueue] = next++;
867 
868    u_foreach_bit(i, submit->utrace.queue_mask) {
869       if (i != submit->utrace.last_subqueue)
870          submit->utrace.data[i] = next++;
871 
872       const bool wait = i == submit->utrace.first_subqueue;
873       *submit->utrace.data[i] = (struct panvk_utrace_flush_data){
874          .subqueue = i,
875          .sync = wait ? submit->queue->utrace.sync : NULL,
876          .wait_value = wait ? submit->queue->utrace.next_value : 0,
877       };
878    }
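   /* Example of the assignments above (illustrative): if the vertex-tiler and
    * fragment subqueues both have trace points, first_subqueue is
    * vertex-tiler and last_subqueue is fragment. Only the vertex-tiler entry
    * carries the timeline sync and wait_value, so u_trace waits for the
    * timestamp writes before touching any event, and the fragment entry is
    * placed at the start of data_storage, presumably so that releasing it
    * after the last event frees the whole allocation (and the clone pool it
    * embeds).
    */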
879 
880    if (submit->utrace.needs_clone) {
881       struct panvk_pool *clone_pool = &submit->utrace.data_storage->clone_pool;
882       panvk_per_arch(utrace_clone_init_pool)(clone_pool, dev);
883    }
884 }
885 
886 static void
887 panvk_queue_submit_init_waits(struct panvk_queue_submit *submit,
888                               const struct vk_queue_submit *vk_submit)
889 {
890    if (!submit->needs_waits)
891       return;
892 
893    for (uint32_t i = 0; i < vk_submit->wait_count; i++) {
894       const struct vk_sync_wait *wait = &vk_submit->waits[i];
895       const struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(wait->sync);
896       assert(syncobj);
897 
898       submit->wait_ops[i] = (struct drm_panthor_sync_op){
899          .flags = (syncobj->base.flags & VK_SYNC_IS_TIMELINE
900                       ? DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ
901                       : DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ) |
902                   DRM_PANTHOR_SYNC_OP_WAIT,
903          .handle = syncobj->syncobj,
904          .timeline_value = wait->wait_value,
905       };
906    }
907 
908    u_foreach_bit(i, submit->used_queue_mask) {
909       submit->qsubmits[submit->qsubmit_count++] =
910          (struct drm_panthor_queue_submit){
911             .queue_index = i,
912             .syncs =
913                DRM_PANTHOR_OBJ_ARRAY(vk_submit->wait_count, submit->wait_ops),
914          };
915    }
916 }
917 
918 static void
919 panvk_queue_submit_init_cmdbufs(struct panvk_queue_submit *submit,
920                                 const struct vk_queue_submit *vk_submit)
921 {
922    struct panvk_device *dev = submit->dev;
923 
924    for (uint32_t i = 0; i < vk_submit->command_buffer_count; i++) {
925       struct panvk_cmd_buffer *cmdbuf = container_of(
926          vk_submit->command_buffers[i], struct panvk_cmd_buffer, vk);
927 
928       for (uint32_t j = 0; j < ARRAY_SIZE(cmdbuf->state.cs); j++) {
929          struct cs_builder *b = panvk_get_cs_builder(cmdbuf, j);
930          if (cs_is_empty(b))
931             continue;
932 
933          submit->qsubmits[submit->qsubmit_count++] =
934             (struct drm_panthor_queue_submit){
935                .queue_index = j,
936                .stream_size = cs_root_chunk_size(b),
937                .stream_addr = cs_root_chunk_gpu_addr(b),
938                .latest_flush = cmdbuf->flush_id,
939             };
940       }
941 
942       u_foreach_bit(j, submit->utrace.queue_mask) {
943          struct u_trace *ut = &cmdbuf->utrace.uts[j];
944 
945          if (!u_trace_has_points(ut))
946             continue;
947 
948          const bool free_data = ut == submit->utrace.last_ut;
949 
950          struct u_trace clone_ut;
951          if (!(cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)) {
952             u_trace_init(&clone_ut, &dev->utrace.utctx);
953 
954             struct panvk_pool *clone_pool =
955                &submit->utrace.data_storage->clone_pool;
956             struct cs_builder clone_builder;
957             panvk_per_arch(utrace_clone_init_builder)(&clone_builder,
958                                                       clone_pool);
959 
960             u_trace_clone_append(
961                u_trace_begin_iterator(ut), u_trace_end_iterator(ut), &clone_ut,
962                &clone_builder, panvk_per_arch(utrace_copy_buffer));
963 
964             panvk_per_arch(utrace_clone_finish_builder)(&clone_builder);
965 
966             submit->qsubmits[submit->qsubmit_count++] =
967                (struct drm_panthor_queue_submit){
968                   .queue_index = j,
969                   .stream_size = cs_root_chunk_size(&clone_builder),
970                   .stream_addr = cs_root_chunk_gpu_addr(&clone_builder),
971                   .latest_flush = panthor_kmod_get_flush_id(dev->kmod.dev),
972                };
973 
974             ut = &clone_ut;
975          }
976 
977          u_trace_flush(ut, submit->utrace.data[j], dev->vk.current_frame,
978                        free_data);
979       }
980    }
981 
982    /* we've transferred ownership of the flush data to u_trace, if any */
983    submit->utrace.data_storage = NULL;
984 }
985 
986 static void
987 panvk_queue_submit_init_signals(struct panvk_queue_submit *submit,
988                                 const struct vk_queue_submit *vk_submit)
989 {
990    struct panvk_queue *queue = submit->queue;
991 
992    if (!submit->needs_signals)
993       return;
994 
995    uint32_t signal_op = 0;
996    u_foreach_bit(i, submit->used_queue_mask) {
997       submit->signal_ops[signal_op] = (struct drm_panthor_sync_op){
998          .flags = DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ |
999                   DRM_PANTHOR_SYNC_OP_SIGNAL,
1000          .handle = queue->syncobj_handle,
1001          .timeline_value = signal_op + 1,
1002       };
1003 
1004       submit->qsubmits[submit->qsubmit_count++] =
1005          (struct drm_panthor_queue_submit){
1006             .queue_index = i,
1007             .syncs = DRM_PANTHOR_OBJ_ARRAY(1, &submit->signal_ops[signal_op++]),
1008          };
1009    }
1010 
1011    if (submit->force_sync) {
1012       struct panvk_cs_sync32 *debug_syncs =
1013          panvk_priv_mem_host_addr(queue->debug_syncobjs);
1014 
1015       assert(debug_syncs);
1016       memset(debug_syncs, 0, sizeof(*debug_syncs) * PANVK_SUBQUEUE_COUNT);
1017    }
1018 }
1019 
1020 static VkResult
1021 panvk_queue_submit_ioctl(struct panvk_queue_submit *submit)
1022 {
1023    const struct panvk_device *dev = submit->dev;
1024    const struct panvk_instance *instance = submit->instance;
1025    struct panvk_queue *queue = submit->queue;
1026    int ret;
1027 
1028    if (instance->debug_flags & PANVK_DEBUG_TRACE) {
1029       /* If we're tracing, we need to reset the desc ringbufs and the CS
1030        * tracebuf. */
1031       for (uint32_t i = 0; i < ARRAY_SIZE(queue->subqueues); i++) {
1032          struct panvk_cs_subqueue_context *ctx =
1033             panvk_priv_mem_host_addr(queue->subqueues[i].context);
1034 
1035          if (ctx->render.desc_ringbuf.ptr) {
1036             ctx->render.desc_ringbuf.ptr = queue->render_desc_ringbuf.addr.dev;
1037             ctx->render.desc_ringbuf.pos = 0;
1038          }
1039 
1040          if (ctx->debug.tracebuf.cs)
1041             ctx->debug.tracebuf.cs = queue->subqueues[i].tracebuf.addr.dev;
1042       }
1043    }
1044 
1045    struct drm_panthor_group_submit gsubmit = {
1046       .group_handle = queue->group_handle,
1047       .queue_submits =
1048          DRM_PANTHOR_OBJ_ARRAY(submit->qsubmit_count, submit->qsubmits),
1049    };
1050 
1051    ret = drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_SUBMIT, &gsubmit);
1052    if (ret)
1053       return vk_queue_set_lost(&queue->vk, "GROUP_SUBMIT: %m");
1054 
1055    return VK_SUCCESS;
1056 }
1057 
1058 static void
1059 panvk_queue_submit_process_signals(struct panvk_queue_submit *submit,
1060                                    const struct vk_queue_submit *vk_submit)
1061 {
1062    struct panvk_device *dev = submit->dev;
1063    struct panvk_queue *queue = submit->queue;
1064    int ret;
1065 
1066    if (!submit->needs_signals)
1067       return;
1068 
1069    if (submit->force_sync) {
1070       uint64_t point = util_bitcount(submit->used_queue_mask);
1071       ret = drmSyncobjTimelineWait(dev->vk.drm_fd, &queue->syncobj_handle,
1072                                    &point, 1, INT64_MAX,
1073                                    DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL, NULL);
1074       assert(!ret);
1075    }
1076 
1077    for (uint32_t i = 0; i < vk_submit->signal_count; i++) {
1078       const struct vk_sync_signal *signal = &vk_submit->signals[i];
1079       const struct vk_drm_syncobj *syncobj =
1080          vk_sync_as_drm_syncobj(signal->sync);
1081       assert(syncobj);
1082 
1083       drmSyncobjTransfer(dev->vk.drm_fd, syncobj->syncobj, signal->signal_value,
1084                          queue->syncobj_handle, 0, 0);
1085    }
1086 
1087    if (submit->utrace.queue_mask) {
1088       const struct vk_drm_syncobj *syncobj =
1089          vk_sync_as_drm_syncobj(queue->utrace.sync);
1090 
1091       drmSyncobjTransfer(dev->vk.drm_fd, syncobj->syncobj,
1092                          queue->utrace.next_value++, queue->syncobj_handle, 0,
1093                          0);
1094 
1095       /* process flushed events after the syncobj is set up */
1096       u_trace_context_process(&dev->utrace.utctx, false);
1097    }
1098 
1099    drmSyncobjReset(dev->vk.drm_fd, &queue->syncobj_handle, 1);
1100 }
1101 
1102 static void
1103 panvk_queue_submit_process_debug(const struct panvk_queue_submit *submit)
1104 {
1105    const struct panvk_instance *instance = submit->instance;
1106    struct panvk_queue *queue = submit->queue;
1107    struct pandecode_context *decode_ctx = submit->dev->debug.decode_ctx;
1108 
1109    if (instance->debug_flags & PANVK_DEBUG_TRACE) {
1110       const struct pan_kmod_dev_props *props = &submit->phys_dev->kmod.props;
1111 
1112       for (uint32_t i = 0; i < submit->qsubmit_count; i++) {
1113          const struct drm_panthor_queue_submit *qsubmit = &submit->qsubmits[i];
1114          if (!qsubmit->stream_size)
1115             continue;
1116 
1117          pandecode_user_msg(decode_ctx, "CS %d on subqueue %d binaries\n\n", i,
1118                             qsubmit->queue_index);
1119          pandecode_cs_binary(decode_ctx, qsubmit->stream_addr,
1120                              qsubmit->stream_size, props->gpu_prod_id);
1121          pandecode_user_msg(decode_ctx, "\n");
1122       }
1123 
1124       for (uint32_t i = 0; i < ARRAY_SIZE(queue->subqueues); i++) {
1125          struct panvk_cs_subqueue_context *ctx =
1126             panvk_priv_mem_host_addr(queue->subqueues[i].context);
1127 
1128          size_t trace_size =
1129             ctx->debug.tracebuf.cs - queue->subqueues[i].tracebuf.addr.dev;
1130          if (!trace_size)
1131             continue;
1132 
1133          assert(
1134             trace_size <= queue->subqueues[i].tracebuf.size ||
1135             !"OOB access on the CS tracebuf, pass a bigger PANVK_CS_TRACEBUF_SIZE");
1136 
1137          assert(
1138             !ctx->render.desc_ringbuf.ptr ||
1139             ctx->render.desc_ringbuf.pos <= queue->render_desc_ringbuf.size ||
1140             !"OOB access on the desc tracebuf, pass a bigger PANVK_DESC_TRACEBUF_SIZE");
1141 
1142          uint64_t trace = queue->subqueues[i].tracebuf.addr.dev;
1143 
1144          pandecode_user_msg(decode_ctx, "\nCS traces on subqueue %d\n\n", i);
1145          pandecode_cs_trace(decode_ctx, trace, trace_size, props->gpu_prod_id);
1146          pandecode_user_msg(decode_ctx, "\n");
1147       }
1148    }
1149 
1150    if (instance->debug_flags & PANVK_DEBUG_DUMP)
1151       pandecode_dump_mappings(decode_ctx);
1152 
1153    if (instance->debug_flags & PANVK_DEBUG_TRACE)
1154       pandecode_next_frame(decode_ctx);
1155 
1156    /* validate last after the command streams are dumped */
1157    if (submit->force_sync) {
1158       struct panvk_cs_sync32 *debug_syncs =
1159          panvk_priv_mem_host_addr(queue->debug_syncobjs);
1160       uint32_t debug_sync_points[PANVK_SUBQUEUE_COUNT] = {0};
1161 
1162       for (uint32_t i = 0; i < submit->qsubmit_count; i++) {
1163          const struct drm_panthor_queue_submit *qsubmit = &submit->qsubmits[i];
1164          if (qsubmit->stream_size)
1165             debug_sync_points[qsubmit->queue_index]++;
1166       }
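      /* Rationale (illustrative): every submit carrying a command stream is
       * expected to bump the per-subqueue debug syncobj by one, so after the
       * forced synchronous wait the observed seqno should match the count
       * above. E.g. three vertex-tiler streams and one fragment stream give
       * debug_sync_points = {3, 1, 0}; a mismatch, or a non-zero error field,
       * indicates a CS that did not run to completion.
       */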
1167 
1168       for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
1169          if (debug_syncs[i].seqno != debug_sync_points[i] ||
1170              debug_syncs[i].error != 0)
1171             vk_queue_set_lost(&queue->vk, "Incomplete job or timeout");
1172       }
1173    }
1174 }
1175 
1176 static VkResult
1177 panvk_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *vk_submit)
1178 {
1179    struct panvk_queue_submit_stack_storage stack_storage;
1180    struct panvk_queue_submit submit;
1181    VkResult result = VK_SUCCESS;
1182 
1183    if (vk_queue_is_lost(vk_queue))
1184       return VK_ERROR_DEVICE_LOST;
1185 
1186    panvk_queue_submit_init(&submit, vk_queue);
1187    panvk_queue_submit_init_storage(&submit, vk_submit, &stack_storage);
1188    panvk_queue_submit_init_utrace(&submit, vk_submit);
1189    panvk_queue_submit_init_waits(&submit, vk_submit);
1190    panvk_queue_submit_init_cmdbufs(&submit, vk_submit);
1191    panvk_queue_submit_init_signals(&submit, vk_submit);
1192 
1193    result = panvk_queue_submit_ioctl(&submit);
1194    if (result != VK_SUCCESS)
1195       goto out;
1196 
1197    panvk_queue_submit_process_signals(&submit, vk_submit);
1198    panvk_queue_submit_process_debug(&submit);
1199 
1200 out:
1201    panvk_queue_submit_cleanup_storage(&submit, &stack_storage);
1202    return result;
1203 }
1204 
1205 static enum drm_panthor_group_priority
1206 get_panthor_group_priority(const VkDeviceQueueCreateInfo *create_info)
1207 {
1208    const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info =
1209       vk_find_struct_const(create_info->pNext,
1210                            DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
1211    const VkQueueGlobalPriorityKHR priority =
1212       priority_info ? priority_info->globalPriority
1213                     : VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR;
1214 
1215    switch (priority) {
1216    case VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR:
1217       return PANTHOR_GROUP_PRIORITY_LOW;
1218    case VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR:
1219       return PANTHOR_GROUP_PRIORITY_MEDIUM;
1220    case VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR:
1221       return PANTHOR_GROUP_PRIORITY_HIGH;
1222    case VK_QUEUE_GLOBAL_PRIORITY_REALTIME_KHR:
1223       return PANTHOR_GROUP_PRIORITY_REALTIME;
1224    default:
1225       unreachable("Invalid global priority");
1226    }
1227 }
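/* For context, a hedged sketch of the application-side structures that feed
 * this function (illustrative, standard VK_KHR_global_priority usage):
 *
 *    VkDeviceQueueGlobalPriorityCreateInfoKHR prio = {
 *       .sType =
 *          VK_STRUCTURE_TYPE_DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR,
 *       .globalPriority = VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR,
 *    };
 *    VkDeviceQueueCreateInfo qinfo = {
 *       .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
 *       .pNext = &prio,
 *       .queueFamilyIndex = 0,
 *       .queueCount = 1,
 *       .pQueuePriorities = (float[]){1.0f},
 *    };
 *
 * Without the pNext chain, the switch above falls back to
 * PANTHOR_GROUP_PRIORITY_MEDIUM.
 */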
1228 
1229 VkResult
1230 panvk_per_arch(queue_init)(struct panvk_device *dev, struct panvk_queue *queue,
1231                            int idx, const VkDeviceQueueCreateInfo *create_info)
1232 {
1233    VkResult result = vk_queue_init(&queue->vk, &dev->vk, create_info, idx);
1234    if (result != VK_SUCCESS)
1235       return result;
1236 
1237    int ret = drmSyncobjCreate(dev->vk.drm_fd, 0, &queue->syncobj_handle);
1238    if (ret) {
1239       result = panvk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
1240                             "Failed to create our internal sync object");
1241       goto err_finish_queue;
1242    }
1243 
1244    result = init_tiler(queue);
1245    if (result != VK_SUCCESS)
1246       goto err_destroy_syncobj;
1247 
1248    result = create_group(queue, get_panthor_group_priority(create_info));
1249    if (result != VK_SUCCESS)
1250       goto err_cleanup_tiler;
1251 
1252    result = init_queue(queue);
1253    if (result != VK_SUCCESS)
1254       goto err_destroy_group;
1255 
1256    queue->vk.driver_submit = panvk_queue_submit;
1257    return VK_SUCCESS;
1258 
1259 err_destroy_group:
1260    destroy_group(queue);
1261 
1262 err_cleanup_tiler:
1263    cleanup_tiler(queue);
1264 
1265 err_destroy_syncobj:
1266    drmSyncobjDestroy(dev->vk.drm_fd, queue->syncobj_handle);
1267 
1268 err_finish_queue:
1269    vk_queue_finish(&queue->vk);
1270    return result;
1271 }
1272 
1273 void
1274 panvk_per_arch(queue_finish)(struct panvk_queue *queue)
1275 {
1276    struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
1277 
1278    cleanup_queue(queue);
1279    destroy_group(queue);
1280    cleanup_tiler(queue);
1281    drmSyncobjDestroy(dev->vk.drm_fd, queue->syncobj_handle);
1282    vk_queue_finish(&queue->vk);
1283 }
1284 
1285 VkResult
1286 panvk_per_arch(queue_check_status)(struct panvk_queue *queue)
1287 {
1288    struct panvk_device *dev = to_panvk_device(queue->vk.base.device);
1289    struct drm_panthor_group_get_state state = {
1290       .group_handle = queue->group_handle,
1291    };
1292 
1293    int ret =
1294       drmIoctl(dev->vk.drm_fd, DRM_IOCTL_PANTHOR_GROUP_GET_STATE, &state);
1295    if (!ret && !state.state)
1296       return VK_SUCCESS;
1297 
1298    vk_queue_set_lost(&queue->vk,
1299                      "group state: err=%d, state=0x%x, fatal_queues=0x%x", ret,
1300                      state.state, state.fatal_queues);
1301 
1302    return VK_ERROR_DEVICE_LOST;
1303 }
1304