1 /*
2  * Copyright © 2018 Google, Inc.
3  * Copyright © 2015 Intel Corporation
4  * SPDX-License-Identifier: MIT
5  *
6  * Kernel interface layer for turnip running on virtio_gpu (aka virtgpu)
7  */
8 
9 #include "tu_knl.h"
10 
11 #include <errno.h>
12 #include <fcntl.h>
13 #include <sys/ioctl.h>
14 #include <sys/mman.h>
15 #include <xf86drm.h>
16 
17 #include "vk_util.h"
18 
19 #include "drm-uapi/msm_drm.h"
20 #include "drm-uapi/virtgpu_drm.h"
21 #include "util/u_debug.h"
22 #include "util/hash_table.h"
23 #include "util/libsync.h"
24 #include "util/u_process.h"
25 
26 #include "tu_cmd_buffer.h"
27 #include "tu_cs.h"
28 #include "tu_device.h"
29 #include "tu_dynamic_rendering.h"
30 #include "tu_knl_drm.h"
31 
32 #define VIRGL_RENDERER_UNSTABLE_APIS 1
33 #include "virglrenderer_hw.h"
34 #include "msm_proto.h"
35 
36 #include "vdrm.h"
37 
38 struct tu_userspace_fence_cmd {
39    uint32_t pkt[4];    /* first 4 dwords of packet */
40    uint32_t fence;     /* fifth dword is fence value which is plugged in at runtime */
41    uint32_t _pad[11];
42 };
43 
44 struct tu_userspace_fence_cmds {
45    struct tu_userspace_fence_cmd cmds[64];
46 };
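/* A sketch of how this table is used (see setup_fence_cmds() and
 * tu_queue_build_msm_gem_submit_cmds() below): each slot is a 64-byte
 * (16-dword) entry holding a pre-built CP_EVENT_WRITE packet; only the
 * fifth dword, the fence value, is patched at submit time, and the 64
 * slots are recycled round-robin, indexed by fence % 64.
 */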
47 
48 struct tu_queue_submit {
49    struct vk_queue_submit *vk_submit;
50    struct tu_u_trace_submission_data *u_trace_submission_data;
51 
52    struct tu_cmd_buffer **cmd_buffers;
53    struct drm_msm_gem_submit_cmd *cmds;
54    struct drm_virtgpu_execbuffer_syncobj *in_syncobjs;
55    struct drm_virtgpu_execbuffer_syncobj *out_syncobjs;
56 
57    uint32_t nr_cmd_buffers;
58    uint32_t nr_in_syncobjs;
59    uint32_t nr_out_syncobjs;
60    uint32_t entry_count;
61    uint32_t perf_pass_index;
62 
63    bool     autotune_fence;
64 };
65 
66 struct tu_u_trace_syncobj {
67    uint32_t msm_queue_id;
68    uint32_t fence;
69 };
70 
71 struct tu_virtio_device {
72    struct vdrm_device *vdrm;
73    struct msm_shmem *shmem;
74    uint32_t next_blob_id;
75 
76    struct tu_userspace_fence_cmds *fence_cmds;
77    struct tu_bo *fence_cmds_mem;
78 
79    /**
80     * Processing zombie VMAs is a two-step process: first we clear the iova,
81     * then we close the handles.  But to minimize waste of virtqueue space
82     * (and the associated stalling and ping-ponging between guest and host)
83     * we want to batch up all the GEM_SET_IOVA ccmds before we flush them to
84     * the host and start closing handles.
85     *
86     * This gives us a place to stash the VMAs between the two steps.
87     */
88    struct u_vector zombie_vmas_stage_2;
89 };
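/* vdev->shmem is the guest's view of state the host shares over virtio;
 * query_faults() below reads its async_error and global_faults fields when
 * the host provides them (vdrm_shmem_has_field()), avoiding a guest<->host
 * round trip for device-status checks.
 */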
90 
91 static int tu_drm_get_param(struct tu_device *dev, uint32_t param, uint64_t *value);
92 
93 /**
94  * Helper for simple pass-thru ioctls
95  */
96 static int
97 virtio_simple_ioctl(struct tu_device *dev, unsigned cmd, void *_req)
98 {
99    MESA_TRACE_FUNC();
100    struct vdrm_device *vdrm = dev->vdev->vdrm;
101    unsigned req_len = sizeof(struct msm_ccmd_ioctl_simple_req);
102    unsigned rsp_len = sizeof(struct msm_ccmd_ioctl_simple_rsp);
103 
104    req_len += _IOC_SIZE(cmd);
105    if (cmd & IOC_OUT)
106       rsp_len += _IOC_SIZE(cmd);
107 
108    uint8_t buf[req_len];
109    struct msm_ccmd_ioctl_simple_req *req = (struct msm_ccmd_ioctl_simple_req *)buf;
110    struct msm_ccmd_ioctl_simple_rsp *rsp;
111 
112    req->hdr = MSM_CCMD(IOCTL_SIMPLE, req_len);
113    req->cmd = cmd;
114    memcpy(req->payload, _req, _IOC_SIZE(cmd));
115 
116    rsp = (struct msm_ccmd_ioctl_simple_rsp *)
117          vdrm_alloc_rsp(vdrm, &req->hdr, rsp_len);
118 
119    int ret = vdrm_send_req(vdrm, &req->hdr, true);
120 
121    if (cmd & IOC_OUT)
122       memcpy(_req, rsp->payload, _IOC_SIZE(cmd));
123 
124    ret = rsp->ret;
125 
126    return ret;
127 }
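/* A minimal usage sketch (the actual callers are tu_drm_get_param() and the
 * submitqueue helpers below): the ioctl payload is tunneled to the host
 * verbatim, and any IOC_OUT data is copied back from the response, e.g.
 *
 *    struct drm_msm_submitqueue req = { .prio = prio };
 *    int ret = virtio_simple_ioctl(dev, DRM_IOCTL_MSM_SUBMITQUEUE_NEW, &req);
 *    // on success, req.id holds the new queue id
 */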
128 
129 static int
130 set_iova(struct tu_device *device, uint32_t res_id, uint64_t iova)
131 {
132    struct msm_ccmd_gem_set_iova_req req = {
133          .hdr = MSM_CCMD(GEM_SET_IOVA, sizeof(req)),
134          .iova = iova,
135          .res_id = res_id,
136    };
137 
138    return vdrm_send_req(device->vdev->vdrm, &req.hdr, false);
139 }
140 
141 static int
142 query_faults(struct tu_device *dev, uint64_t *value)
143 {
144    struct tu_virtio_device *vdev = dev->vdev;
145    uint32_t async_error = 0;
146    uint64_t global_faults;
147 
148    if (vdrm_shmem_has_field(vdev->shmem, async_error))
149       async_error = vdev->shmem->async_error;
150 
151    if (vdrm_shmem_has_field(vdev->shmem, global_faults)) {
152       global_faults = vdev->shmem->global_faults;
153    } else {
154       int ret = tu_drm_get_param(dev, MSM_PARAM_FAULTS, &global_faults);
155       if (ret)
156          return ret;
157    }
158 
159    *value = global_faults + async_error;
160 
161    return 0;
162 }
163 
164 static void
165 set_debuginfo(struct tu_device *dev)
166 {
167    const char *comm = util_get_process_name();
168    static char cmdline[0x1000+1];
169    int fd = open("/proc/self/cmdline", O_RDONLY);
170    if (fd < 0)
171       return;
172 
173    int n = read(fd, cmdline, sizeof(cmdline) - 1);
174    if (n < 0)
175       return;
176 
177    /* arguments are separated by NUL bytes, convert them to spaces: */
178    for (int i = 0; i < n; i++) {
179       if (cmdline[i] == '\0') {
180          cmdline[i] = ' ';
181       }
182    }
183 
184    cmdline[n] = '\0';
185 
186    unsigned comm_len = strlen(comm) + 1;
187    unsigned cmdline_len = strlen(cmdline) + 1;
188 
189    struct msm_ccmd_set_debuginfo_req *req;
190 
191    unsigned req_len = align(sizeof(*req) + comm_len + cmdline_len, 4);
192 
193    req = (struct msm_ccmd_set_debuginfo_req *)malloc(req_len);
194 
195    req->hdr         = MSM_CCMD(SET_DEBUGINFO, req_len);
196    req->comm_len    = comm_len;
197    req->cmdline_len = cmdline_len;
198 
199    memcpy(&req->payload[0], comm, comm_len);
200    memcpy(&req->payload[comm_len], cmdline, cmdline_len);
201 
202    vdrm_send_req(dev->vdev->vdrm, &req->hdr, false);
203 
204    free(req);
205 }
206 
207 static VkResult
208 virtio_device_init(struct tu_device *dev)
209 {
210    struct tu_instance *instance = dev->physical_device->instance;
211    int fd;
212 
213    fd = open(dev->physical_device->fd_path, O_RDWR | O_CLOEXEC);
214    if (fd < 0) {
215       return vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
216                                "failed to open device %s", dev->physical_device->fd_path);
217    }
218 
219    struct tu_virtio_device *vdev = (struct tu_virtio_device *)
220             vk_zalloc(&instance->vk.alloc, sizeof(*vdev), 8,
221                       VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
222    if (!vdev) {
223       close(fd);
224       return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
225    }
226 
227    u_vector_init(&vdev->zombie_vmas_stage_2, 64, sizeof(struct tu_zombie_vma));
228 
229    dev->vdev = vdev;
230    dev->fd = fd;
231 
232    vdev->vdrm = vdrm_device_connect(fd, VIRTGPU_DRM_CONTEXT_MSM);
233 
234    p_atomic_set(&vdev->next_blob_id, 1);
235    vdev->shmem = to_msm_shmem(vdev->vdrm->shmem);
236 
237    query_faults(dev, &dev->fault_count);
238 
239    set_debuginfo(dev);
240 
241    return VK_SUCCESS;
242 }
243 
244 static void
245 virtio_device_finish(struct tu_device *dev)
246 {
247    struct tu_instance *instance = dev->physical_device->instance;
248    struct tu_virtio_device *vdev = dev->vdev;
249 
250    u_vector_finish(&vdev->zombie_vmas_stage_2);
251 
252    vdrm_device_close(vdev->vdrm);
253 
254    vk_free(&instance->vk.alloc, vdev);
255    dev->vdev = NULL;
256 
257    close(dev->fd);
258 }
259 
260 static int
261 tu_drm_get_param(struct tu_device *dev, uint32_t param, uint64_t *value)
262 {
263    /* Technically this requires a pipe, but the kernel only supports one pipe
264     * anyway at the time of writing, and most of these params are clearly
265     * pipe-independent. */
266    struct drm_msm_param req = {
267       .pipe = MSM_PIPE_3D0,
268       .param = param,
269    };
270 
271    int ret = virtio_simple_ioctl(dev, DRM_IOCTL_MSM_GET_PARAM, &req);
272    if (ret)
273       return ret;
274 
275    *value = req.value;
276 
277    return 0;
278 }
279 
280 static int
281 virtio_device_get_gpu_timestamp(struct tu_device *dev, uint64_t *ts)
282 {
283    return tu_drm_get_param(dev, MSM_PARAM_TIMESTAMP, ts);
284 }
285 
286 static int
287 virtio_device_get_suspend_count(struct tu_device *dev, uint64_t *suspend_count)
288 {
289    int ret = tu_drm_get_param(dev, MSM_PARAM_SUSPENDS, suspend_count);
290    return ret;
291 }
292 
293 static VkResult
294 virtio_device_check_status(struct tu_device *device)
295 {
296    uint64_t last_fault_count = device->fault_count;
297 
298    query_faults(device, &device->fault_count);
299 
300    if (last_fault_count != device->fault_count)
301       return vk_device_set_lost(&device->vk, "GPU faulted or hung");
302 
303    return VK_SUCCESS;
304 }
305 
306 static int
307 virtio_submitqueue_new(struct tu_device *dev,
308                        int priority,
309                        uint32_t *queue_id)
310 {
311    assert(priority >= 0 &&
312           priority < dev->physical_device->submitqueue_priority_count);
313 
314    struct drm_msm_submitqueue req = {
315       .flags = 0,
316       .prio = priority,
317    };
318 
319    int ret = virtio_simple_ioctl(dev, DRM_IOCTL_MSM_SUBMITQUEUE_NEW, &req);
320    if (ret)
321       return ret;
322 
323    *queue_id = req.id;
324    return 0;
325 }
326 
327 static void
328 virtio_submitqueue_close(struct tu_device *dev, uint32_t queue_id)
329 {
330    virtio_simple_ioctl(dev, DRM_IOCTL_MSM_SUBMITQUEUE_CLOSE, &queue_id);
331 }
332 
333 static VkResult
334 tu_wait_fence(struct tu_device *dev,
335               uint32_t queue_id,
336               int fence,
337               uint64_t timeout_ns)
338 {
339    struct vdrm_device *vdrm = dev->vdev->vdrm;
340 
341    if (!fence_before(dev->global_bo_map->userspace_fence, fence))
342       return VK_SUCCESS;
343 
344    if (!timeout_ns)
345       return VK_TIMEOUT;
346 
347    MESA_TRACE_FUNC();
348 
349    struct msm_ccmd_wait_fence_req req = {
350          .hdr = MSM_CCMD(WAIT_FENCE, sizeof(req)),
351          .queue_id = queue_id,
352          .fence = fence,
353    };
354    struct msm_ccmd_submitqueue_query_rsp *rsp;
355    int64_t end_time = os_time_get_nano() + timeout_ns;
356    int ret;
357 
358    do {
359       rsp = (struct msm_ccmd_submitqueue_query_rsp *)
360             vdrm_alloc_rsp(vdrm, &req.hdr, sizeof(*rsp));
361 
362       ret = vdrm_send_req(vdrm, &req.hdr, true);
363       if (ret)
364          goto out;
365 
366       if (os_time_get_nano() >= end_time)
367          break;
368 
369       ret = rsp->ret;
370    } while (ret == -ETIMEDOUT);
371 
372 out:
373    if (!ret) return VK_SUCCESS;
374    if (ret == -ETIMEDOUT) return VK_TIMEOUT;
375    return VK_ERROR_UNKNOWN;
376 }
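/* Note on the fast path above: the GPU writes each completed fence value to
 * dev->global_bo_map->userspace_fence via the CP_EVENT_WRITE packets built in
 * setup_fence_cmds() below, so fence_before() lets us return without a host
 * round trip when the fence has already signaled.  The WAIT_FENCE ccmd is
 * only sent (and re-sent on -ETIMEDOUT) when we actually have to block.
 */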
377 
378 static VkResult
379 tu_free_zombie_vma_locked(struct tu_device *dev, bool wait)
380 {
381    struct tu_virtio_device *vdev = dev->vdev;
382 
383    if (!u_vector_length(&dev->zombie_vmas))
384       return VK_SUCCESS;
385 
386    if (wait) {
387       struct tu_zombie_vma *vma = (struct tu_zombie_vma *)
388             u_vector_head(&dev->zombie_vmas);
389       /* Wait for 3s (arbitrary timeout) */
390       VkResult ret = tu_wait_fence(dev, dev->queues[0]->msm_queue_id,
391                                    vma->fence, 3000000000);
392 
393       if (ret != VK_SUCCESS)
394          return ret;
395    }
396 
397    /* Clear the iova of all finished objects in the first pass so the SET_IOVA
398     * ccmds can be buffered and sent together to the host.  *Then* delete
399     * the handles.  This avoids filling up the virtqueue with tiny messages,
400     * since each execbuf ends up needing to be page-aligned.
401     */
402    int last_signaled_fence = -1;
403    while (u_vector_length(&dev->zombie_vmas) > 0) {
404       struct tu_zombie_vma *vma = (struct tu_zombie_vma *)
405             u_vector_tail(&dev->zombie_vmas);
406       if (vma->fence > last_signaled_fence) {
407          VkResult ret =
408             tu_wait_fence(dev, dev->queues[0]->msm_queue_id, vma->fence, 0);
409          if (ret != VK_SUCCESS)
410             break;
411 
412          last_signaled_fence = vma->fence;
413       }
414 
415       set_iova(dev, vma->res_id, 0);
416 
417       u_vector_remove(&dev->zombie_vmas);
418 
419       struct tu_zombie_vma *vma2 = (struct tu_zombie_vma *)
420             u_vector_add(&vdev->zombie_vmas_stage_2);
421 
422       *vma2 = *vma;
423    }
424 
425    /* And _then_ close the GEM handles: */
426    while (u_vector_length(&vdev->zombie_vmas_stage_2) > 0) {
427       struct tu_zombie_vma *vma = (struct tu_zombie_vma *)
428             u_vector_remove(&vdev->zombie_vmas_stage_2);
429 
430       util_vma_heap_free(&dev->vma, vma->iova, vma->size);
431       vdrm_bo_close(dev->vdev->vdrm, vma->gem_handle);
432    }
433 
434    return VK_SUCCESS;
435 }
436 
437 static VkResult
438 virtio_allocate_userspace_iova(struct tu_device *dev,
439                                uint64_t size,
440                                uint64_t client_iova,
441                                enum tu_bo_alloc_flags flags,
442                                uint64_t *iova)
443 {
444    VkResult result;
445 
446    mtx_lock(&dev->vma_mutex);
447 
448    *iova = 0;
449 
450    tu_free_zombie_vma_locked(dev, false);
451 
452    result = tu_allocate_userspace_iova(dev, size, client_iova, flags, iova);
453    if (result == VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS) {
454       /* The address may already have been freed by us, but not yet
455        * considered freed by the kernel. We have to wait until all work
456        * that may hold the address is done. Since addresses are meant to
457        * be replayed only by debug tooling, it should be OK to wait.
458        */
459       tu_free_zombie_vma_locked(dev, true);
460       result = tu_allocate_userspace_iova(dev, size, client_iova, flags, iova);
461    }
462 
463    mtx_unlock(&dev->vma_mutex);
464 
465    return result;
466 }
467 
468 static VkResult
469 tu_bo_init(struct tu_device *dev,
470            struct tu_bo *bo,
471            uint32_t gem_handle,
472            uint64_t size,
473            uint64_t iova,
474            enum tu_bo_alloc_flags flags,
475            const char *name)
476 {
477    assert(dev->physical_device->has_set_iova);
478 
479    set_iova(dev, bo->res_id, iova);
480 
481    name = tu_debug_bos_add(dev, size, name);
482 
483    mtx_lock(&dev->bo_mutex);
484    uint32_t idx = dev->bo_count++;
485 
486    /* grow the bo list if needed */
487    if (idx >= dev->bo_list_size) {
488       uint32_t new_len = idx + 64;
489       struct drm_msm_gem_submit_bo *new_ptr = (struct drm_msm_gem_submit_bo *)
490          vk_realloc(&dev->vk.alloc, dev->bo_list, new_len * sizeof(*dev->bo_list),
491                     8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
492       if (!new_ptr) {
493          dev->bo_count--;
494          mtx_unlock(&dev->bo_mutex);
495          vdrm_bo_close(dev->vdev->vdrm, bo->gem_handle);
496          return VK_ERROR_OUT_OF_HOST_MEMORY;
497       }
498 
499       dev->bo_list = new_ptr;
500       dev->bo_list_size = new_len;
501    }
502 
503    bool dump = flags & TU_BO_ALLOC_ALLOW_DUMP;
504    dev->bo_list[idx] = (struct drm_msm_gem_submit_bo) {
505       .flags = MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE |
506                COND(dump, MSM_SUBMIT_BO_DUMP),
507       .handle = bo->res_id,
508       .presumed = iova,
509    };
510 
511    *bo = (struct tu_bo) {
512       .gem_handle = gem_handle,
513       .res_id = bo->res_id,
514       .size = size,
515       .iova = iova,
516       .name = name,
517       .refcnt = 1,
518       .bo_list_idx = idx,
519    };
520 
521    mtx_unlock(&dev->bo_mutex);
522 
523    return VK_SUCCESS;
524 }
525 
526 /**
527  * Sets the name in the kernel so that the contents of /debug/dri/0/gem are more
528  * useful.
529  *
530  * We skip this on release builds (when we're also not doing BO debugging) to
531  * reduce overhead.
532  */
533 static void
534 tu_bo_set_kernel_name(struct tu_device *dev, struct tu_bo *bo, const char *name)
535 {
536    bool kernel_bo_names = dev->bo_sizes != NULL;
537 #ifdef DEBUG
538    kernel_bo_names = true;
539 #endif
540    if (!kernel_bo_names)
541       return;
542 
543    size_t sz = strlen(name);
544 
545    unsigned req_len = sizeof(struct msm_ccmd_gem_set_name_req) + align(sz, 4);
546 
547    uint8_t buf[req_len];
548    struct msm_ccmd_gem_set_name_req *req = (struct msm_ccmd_gem_set_name_req *)buf;
549 
550    req->hdr = MSM_CCMD(GEM_SET_NAME, req_len);
551    req->res_id = bo->res_id;
552    req->len = sz;
553 
554    memcpy(req->payload, name, sz);
555 
556    vdrm_send_req(dev->vdev->vdrm, &req->hdr, false);
557 }
558 
559 static VkResult
560 virtio_bo_init(struct tu_device *dev,
561             struct tu_bo **out_bo,
562             uint64_t size,
563             uint64_t client_iova,
564             VkMemoryPropertyFlags mem_property,
565             enum tu_bo_alloc_flags flags,
566             const char *name)
567 {
568    struct tu_virtio_device *vdev = dev->vdev;
569    struct msm_ccmd_gem_new_req req = {
570          .hdr = MSM_CCMD(GEM_NEW, sizeof(req)),
571          .size = size,
572    };
573    VkResult result;
574 
575    result = virtio_allocate_userspace_iova(dev, size, client_iova,
576                                            flags, &req.iova);
577    if (result != VK_SUCCESS) {
578       return result;
579    }
580 
581    if (mem_property & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) {
582       if (mem_property & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) {
583          req.flags |= MSM_BO_CACHED_COHERENT;
584       } else {
585          req.flags |= MSM_BO_CACHED;
586       }
587    } else {
588       req.flags |= MSM_BO_WC;
589    }
590 
591    uint32_t blob_flags = 0;
592    if (mem_property & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
593       blob_flags |= VIRTGPU_BLOB_FLAG_USE_MAPPABLE;
594    }
595 
596    if (!(mem_property & VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT)) {
597       blob_flags |= VIRTGPU_BLOB_FLAG_USE_CROSS_DEVICE |
598             VIRTGPU_BLOB_FLAG_USE_SHAREABLE;
599    }
600 
601    if (flags & TU_BO_ALLOC_GPU_READ_ONLY)
602       req.flags |= MSM_BO_GPU_READONLY;
603 
604    /* Tunneled cmds are processed separately on the host side, before
605     * the renderer->get_blob() callback.  The blob_id is used to link
606     * the created bo to the get_blob() call.
607     */
608    req.blob_id = p_atomic_inc_return(&vdev->next_blob_id);
609 
610    uint32_t handle =
611       vdrm_bo_create(vdev->vdrm, size, blob_flags, req.blob_id, &req.hdr);
612 
613    if (!handle) {
614       util_vma_heap_free(&dev->vma, req.iova, size);
615       return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
616    }
617 
618    uint32_t res_id = vdrm_handle_to_res_id(vdev->vdrm, handle);
619    struct tu_bo* bo = tu_device_lookup_bo(dev, res_id);
620    assert(bo && bo->gem_handle == 0);
621 
622    bo->res_id = res_id;
623 
624    result = tu_bo_init(dev, bo, handle, size, req.iova, flags, name);
625    if (result != VK_SUCCESS)
626       memset(bo, 0, sizeof(*bo));
627    else
628       *out_bo = bo;
629 
630    /* We don't use bo->name here because for the !TU_DEBUG=bo case bo->name is NULL. */
631    tu_bo_set_kernel_name(dev, bo, name);
632 
633    if (result == VK_SUCCESS &&
634        (mem_property & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) &&
635        !(mem_property & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
636       tu_bo_map(dev, bo);
637 
638       /* Cached non-coherent memory may already have dirty cache lines;
639        * we should clean those cache lines before the GPU gets a chance to
640        * write into this memory.
641        *
642        * MSM already does this automatically for uncached (MSM_BO_WC) memory.
643        */
644       tu_sync_cache_bo(dev, bo, 0, VK_WHOLE_SIZE, TU_MEM_SYNC_CACHE_TO_GPU);
645    }
646 
647    return result;
648 }
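/* Rough allocation flow, as implemented above: the guest reserves an iova
 * from its own vma heap, the GEM_NEW ccmd carrying that iova and the MSM
 * flags rides along with vdrm_bo_create(), and blob_id ties the host's
 * get_blob() callback back to this request.  Note that it is the res_id,
 * not the local GEM handle, that identifies the BO in submit bo lists.
 */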
649 
650 static VkResult
651 virtio_bo_init_dmabuf(struct tu_device *dev,
652                    struct tu_bo **out_bo,
653                    uint64_t size,
654                    int prime_fd)
655 {
656    struct vdrm_device *vdrm = dev->vdev->vdrm;
657    VkResult result;
658    struct tu_bo* bo = NULL;
659 
660    /* lseek() to get the real size */
661    off_t real_size = lseek(prime_fd, 0, SEEK_END);
662    lseek(prime_fd, 0, SEEK_SET);
663    if (real_size < 0 || (uint64_t) real_size < size)
664       return vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE);
665 
666    /* iova allocation needs to consider the object's *real* size: */
667    size = real_size;
668 
669    uint64_t iova;
670    result = virtio_allocate_userspace_iova(dev, size, 0, TU_BO_ALLOC_NO_FLAGS, &iova);
671    if (result != VK_SUCCESS)
672       return result;
673 
674    /* Importing the same dmabuf several times yields the same gem_handle,
675     * so there could be a race between destroying a BO and importing the
676     * same dmabuf from a different thread.
677     * We must not permit the creation of a dmabuf BO and its release
678     * to happen in parallel.
679     */
680    u_rwlock_wrlock(&dev->dma_bo_lock);
681 
682    uint32_t handle, res_id;
683 
684    handle = vdrm_dmabuf_to_handle(vdrm, prime_fd);
685    if (!handle) {
686       result = vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE);
687       goto out_unlock;
688    }
689 
690    res_id = vdrm_handle_to_res_id(vdrm, handle);
691    if (!res_id) {
692       result = vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE);
693       goto out_unlock;
694    }
695 
696    bo = tu_device_lookup_bo(dev, res_id);
697 
698    if (bo->refcnt != 0) {
699       p_atomic_inc(&bo->refcnt);
700       assert(bo->res_id == res_id);
701       *out_bo = bo;
702       result = VK_SUCCESS;
703       goto out_unlock;
704    }
705 
706    bo->res_id = res_id;
707 
708    result = tu_bo_init(dev, bo, handle, size, iova,
709                        TU_BO_ALLOC_NO_FLAGS, "dmabuf");
710    if (result != VK_SUCCESS)
711       memset(bo, 0, sizeof(*bo));
712    else
713       *out_bo = bo;
714 
715 out_unlock:
716    u_rwlock_wrunlock(&dev->dma_bo_lock);
717    if (result != VK_SUCCESS) {
718       mtx_lock(&dev->vma_mutex);
719       util_vma_heap_free(&dev->vma, iova, size);
720       mtx_unlock(&dev->vma_mutex);
721    }
722 
723    return result;
724 }
725 
726 static VkResult
727 virtio_bo_map(struct tu_device *dev, struct tu_bo *bo)
728 {
729    if (bo->map)
730       return VK_SUCCESS;
731 
732    bo->map = vdrm_bo_map(dev->vdev->vdrm, bo->gem_handle, bo->size);
733    if (bo->map == MAP_FAILED)
734       return vk_error(dev, VK_ERROR_MEMORY_MAP_FAILED);
735 
736    return VK_SUCCESS;
737 }
738 
739 static void
740 virtio_bo_allow_dump(struct tu_device *dev, struct tu_bo *bo)
741 {
742    mtx_lock(&dev->bo_mutex);
743    dev->bo_list[bo->bo_list_idx].flags |= MSM_SUBMIT_BO_DUMP;
744    mtx_unlock(&dev->bo_mutex);
745 }
746 
747 static VkResult
748 tu_queue_submit_create_locked(struct tu_queue *queue,
749                               struct vk_queue_submit *vk_submit,
750                               const uint32_t nr_in_syncobjs,
751                               const uint32_t nr_out_syncobjs,
752                               uint32_t perf_pass_index,
753                               struct tu_queue_submit *new_submit)
754 {
755    VkResult result;
756 
757    bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context);
758    bool has_trace_points = false;
759 
760    struct vk_command_buffer **vk_cmd_buffers = vk_submit->command_buffers;
761 
762    memset(new_submit, 0, sizeof(struct tu_queue_submit));
763 
764    new_submit->cmd_buffers = (struct tu_cmd_buffer **) vk_cmd_buffers;
765    new_submit->nr_cmd_buffers = vk_submit->command_buffer_count;
766    tu_insert_dynamic_cmdbufs(queue->device, &new_submit->cmd_buffers,
767                              &new_submit->nr_cmd_buffers);
768 
769    uint32_t entry_count = 0;
770    for (uint32_t j = 0; j < new_submit->nr_cmd_buffers; ++j) {
771       struct tu_cmd_buffer *cmdbuf = new_submit->cmd_buffers[j];
772 
773       if (perf_pass_index != ~0)
774          entry_count++;
775 
776       entry_count += cmdbuf->cs.entry_count;
777 
778       if (u_trace_enabled && u_trace_has_points(&cmdbuf->trace)) {
779          if (!(cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT))
780             entry_count++;
781 
782          has_trace_points = true;
783       }
784    }
785 
786    new_submit->autotune_fence =
787       tu_autotune_submit_requires_fence(new_submit->cmd_buffers, new_submit->nr_cmd_buffers);
788    if (new_submit->autotune_fence)
789       entry_count++;
790 
791    /* Add one for the userspace fence cmd: */
792    entry_count += 1;
793 
794    new_submit->cmds = (struct drm_msm_gem_submit_cmd *) vk_zalloc(
795       &queue->device->vk.alloc, entry_count * sizeof(*new_submit->cmds), 8,
796       VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
797 
798    if (new_submit->cmds == NULL) {
799       result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
800       goto fail_cmds;
801    }
802 
803    if (has_trace_points) {
804       result =
805          tu_u_trace_submission_data_create(
806             queue->device, new_submit->cmd_buffers,
807             new_submit->nr_cmd_buffers,
808             &new_submit->u_trace_submission_data);
809 
810       if (result != VK_SUCCESS) {
811          goto fail_u_trace_submission_data;
812       }
813    }
814 
815    /* Allocate without wait timeline semaphores */
816    new_submit->in_syncobjs = (struct drm_virtgpu_execbuffer_syncobj *) vk_zalloc(
817       &queue->device->vk.alloc,
818       nr_in_syncobjs * sizeof(*new_submit->in_syncobjs), 8,
819       VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
820 
821    if (new_submit->in_syncobjs == NULL) {
822       result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
823       goto fail_in_syncobjs;
824    }
825 
826    /* Allocate with signal timeline semaphores considered */
827    new_submit->out_syncobjs = (struct drm_virtgpu_execbuffer_syncobj *) vk_zalloc(
828       &queue->device->vk.alloc,
829       nr_out_syncobjs * sizeof(*new_submit->out_syncobjs), 8,
830       VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
831 
832    if (new_submit->out_syncobjs == NULL) {
833       result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
834       goto fail_out_syncobjs;
835    }
836 
837    new_submit->entry_count = entry_count;
838    new_submit->nr_in_syncobjs = nr_in_syncobjs;
839    new_submit->nr_out_syncobjs = nr_out_syncobjs;
840    new_submit->perf_pass_index = perf_pass_index;
841    new_submit->vk_submit = vk_submit;
842 
843    return VK_SUCCESS;
844 
845 fail_out_syncobjs:
846    vk_free(&queue->device->vk.alloc, new_submit->in_syncobjs);
847 fail_in_syncobjs:
848    if (new_submit->u_trace_submission_data)
849       tu_u_trace_submission_data_finish(queue->device,
850                                         new_submit->u_trace_submission_data);
851 fail_u_trace_submission_data:
852    vk_free(&queue->device->vk.alloc, new_submit->cmds);
853 fail_cmds:
854    return result;
855 }
856 
857 static void
858 tu_queue_submit_finish(struct tu_queue *queue, struct tu_queue_submit *submit)
859 {
860    vk_free(&queue->device->vk.alloc, submit->cmds);
861    vk_free(&queue->device->vk.alloc, submit->in_syncobjs);
862    vk_free(&queue->device->vk.alloc, submit->out_syncobjs);
863    if (submit->cmd_buffers != (void *) submit->vk_submit->command_buffers)
864       vk_free(&queue->device->vk.alloc, submit->cmd_buffers);
865 }
866 
867 static void
868 tu_fill_msm_gem_submit(struct tu_device *dev,
869                        struct drm_msm_gem_submit_cmd *cmd,
870                        struct tu_cs_entry *cs_entry)
871 {
872    cmd->type = MSM_SUBMIT_CMD_BUF;
873    cmd->submit_idx = cs_entry->bo->bo_list_idx;
874    cmd->submit_offset = cs_entry->offset;
875    cmd->size = cs_entry->size;
876    cmd->pad = 0;
877    cmd->nr_relocs = 0;
878    cmd->relocs = 0;
879 }
880 
881 static void
882 tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue,
883                                    struct tu_queue_submit *submit,
884                                    struct tu_cs *autotune_cs)
885 {
886    struct tu_device *dev = queue->device;
887    struct tu_virtio_device *vdev = dev->vdev;
888    struct drm_msm_gem_submit_cmd *cmds = submit->cmds;
889 
890    uint32_t entry_idx = 0;
891    for (uint32_t j = 0; j < submit->nr_cmd_buffers; ++j) {
892       struct tu_device *dev = queue->device;
893       struct tu_cmd_buffer *cmdbuf = submit->cmd_buffers[j];
894       struct tu_cs *cs = &cmdbuf->cs;
895 
896       if (submit->perf_pass_index != ~0) {
897          struct tu_cs_entry *perf_cs_entry =
898             &dev->perfcntrs_pass_cs_entries[submit->perf_pass_index];
899 
900          tu_fill_msm_gem_submit(dev, &cmds[entry_idx], perf_cs_entry);
901          entry_idx++;
902       }
903 
904       for (unsigned i = 0; i < cs->entry_count; ++i, ++entry_idx) {
905          tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &cs->entries[i]);
906       }
907 
908       if (submit->u_trace_submission_data) {
909          struct tu_cs *ts_cs =
910             submit->u_trace_submission_data->cmd_trace_data[j].timestamp_copy_cs;
911          if (ts_cs) {
912             tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &ts_cs->entries[0]);
913             entry_idx++;
914          }
915       }
916    }
917 
918    if (autotune_cs) {
919       assert(autotune_cs->entry_count == 1);
920       tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &autotune_cs->entries[0]);
921       entry_idx++;
922    }
923 
924    /* Last, add the userspace fence cmd: */
925    struct tu_userspace_fence_cmds *fcmds = vdev->fence_cmds;
926    if (queue->fence <= 0)
927       queue->fence = 0;
928    uint32_t fence = ++queue->fence;
929    int idx = fence % ARRAY_SIZE(fcmds->cmds);
930 
931    /* Wait for the previous use of this fence cmd slot to be idle.  In practice
932     * the table of recycled cmds should be big enough to never stall here:
933     */
934    tu_wait_fence(dev, dev->queues[0]->msm_queue_id, fcmds->cmds[idx].fence, 3000000000);
935 
936    fcmds->cmds[idx].fence = fence;
937 
938    cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF;
939    cmds[entry_idx].submit_idx = vdev->fence_cmds_mem->bo_list_idx;
940    cmds[entry_idx].submit_offset = ((intptr_t)&fcmds->cmds[idx]) - (intptr_t)fcmds;
941    cmds[entry_idx].size = 5 * 4;
942    cmds[entry_idx].pad = 0;
943    cmds[entry_idx].nr_relocs = 0;
944    cmds[entry_idx].relocs = 0;
945 }
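/* The final entry added above points into fence_cmds_mem at the chosen slot
 * (submit_offset is the byte offset of fcmds->cmds[idx]) and covers only the
 * five dwords that matter: the four pre-built packet dwords plus the patched
 * fence value; the trailing _pad dwords are never executed.
 */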
946 
947 static VkResult
948 setup_fence_cmds(struct tu_device *dev)
949 {
950    struct tu_virtio_device *vdev = dev->vdev;
951    VkResult result;
952 
953    result = tu_bo_init_new(dev, &vdev->fence_cmds_mem, sizeof(*vdev->fence_cmds),
954                            (enum tu_bo_alloc_flags)
955                               (TU_BO_ALLOC_ALLOW_DUMP | TU_BO_ALLOC_GPU_READ_ONLY),
956                            "fence_cmds");
957    if (result != VK_SUCCESS)
958       return result;
959 
960    result = tu_bo_map(dev, vdev->fence_cmds_mem);
961    if (result != VK_SUCCESS)
962       return result;
963 
964    vdev->fence_cmds = (struct tu_userspace_fence_cmds *)vdev->fence_cmds_mem->map;
965 
966    uint64_t fence_iova = dev->global_bo->iova + gb_offset(userspace_fence);
967    for (int i = 0; i < ARRAY_SIZE(vdev->fence_cmds->cmds); i++) {
968       struct tu_userspace_fence_cmd *c = &vdev->fence_cmds->cmds[i];
969 
970       memset(c, 0, sizeof(*c));
971 
972       c->pkt[0] = pm4_pkt7_hdr((uint8_t)CP_EVENT_WRITE, 4);
973       c->pkt[1] = CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS);
974       c->pkt[2] = fence_iova;
975       c->pkt[3] = fence_iova >> 32;
976    }
977 
978    return result;
979 }
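/* A sketch of the resulting packet, assuming the usual CP_EVENT_WRITE
 * encoding (pkt7 header, event id, 64-bit address, value):
 *
 *    pkt[0] = pm4_pkt7_hdr(CP_EVENT_WRITE, 4)
 *    pkt[1] = CACHE_FLUSH_TS               (flush caches + write timestamp)
 *    pkt[2] = fence_iova & 0xffffffff      (global_bo userspace_fence, low)
 *    pkt[3] = fence_iova >> 32             (high bits)
 *    fence  = seqno, patched per submit in tu_queue_build_msm_gem_submit_cmds()
 */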
980 
981 static VkResult
982 tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
983 {
984    struct tu_virtio_device *vdev = queue->device->vdev;
985 
986    queue->device->submit_count++;
987 
988    /* It would be nice to not need to defer this, but virtio_device_init()
989     * happens before the device is initialized enough to allocate normal
990     * GEM buffers
991     */
992    if (!vdev->fence_cmds) {
993       VkResult result = setup_fence_cmds(queue->device);
994       if (result != VK_SUCCESS)
995          return result;
996    }
997 
998    struct tu_cs *autotune_cs = NULL;
999    if (submit->autotune_fence) {
1000       autotune_cs = tu_autotune_on_submit(queue->device,
1001                                           &queue->device->autotune,
1002                                           submit->cmd_buffers,
1003                                           submit->nr_cmd_buffers);
1004    }
1005 
1006    uint32_t flags = MSM_PIPE_3D0;
1007 
1008    if (submit->vk_submit->wait_count)
1009       flags |= MSM_SUBMIT_SYNCOBJ_IN;
1010 
1011    if (submit->vk_submit->signal_count)
1012       flags |= MSM_SUBMIT_SYNCOBJ_OUT;
1013 
1014    mtx_lock(&queue->device->bo_mutex);
1015 
1016    if (queue->device->implicit_sync_bo_count == 0)
1017       flags |= MSM_SUBMIT_NO_IMPLICIT;
1018 
1019    /* drm_msm_gem_submit_cmd requires the index of the bo, which could change
1020     * at any time while bo_mutex is not held.  So we build the submit cmds
1021     * here, at the actual point of submission.
1022     */
1023    tu_queue_build_msm_gem_submit_cmds(queue, submit, autotune_cs);
1024 
1025    /* TODO: avoid the extra memcpy and populate the bos and cmds directly
1026     * into the req msg.
1027     */
1028    unsigned nr_cmds = submit->entry_count;
1029    unsigned nr_bos = nr_cmds ? queue->device->bo_count : 0;
1030    unsigned bos_len = nr_bos * sizeof(struct drm_msm_gem_submit_bo);
1031    unsigned cmd_len = nr_cmds * sizeof(struct drm_msm_gem_submit_cmd);
1032    unsigned req_len = sizeof(struct msm_ccmd_gem_submit_req) + bos_len + cmd_len;
1033    struct msm_ccmd_gem_submit_req *req = (struct msm_ccmd_gem_submit_req *)vk_alloc(
1034          &queue->device->vk.alloc, req_len, 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1035 
1036    if (!req) {
1037       mtx_unlock(&queue->device->bo_mutex);
1038       return vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
1039    }
1040 
1041    req->hdr      = MSM_CCMD(GEM_SUBMIT, req_len);
1042    req->flags    = flags;
1043    req->queue_id = queue->msm_queue_id;
1044    req->nr_bos   = nr_bos;
1045    req->nr_cmds  = nr_cmds;
1046 
1047    /* Use same kernel fence and userspace fence seqno to avoid having
1048     * to track both:
1049     */
1050    req->fence    = queue->fence;
1051 
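   /* Request payload layout: nr_bos drm_msm_gem_submit_bo entries followed
    * immediately by nr_cmds drm_msm_gem_submit_cmd entries:
    */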
1052    memcpy(req->payload, queue->device->bo_list, bos_len);
1053    memcpy(req->payload + bos_len, submit->cmds, cmd_len);
1054 
1055    int ring_idx = queue->priority + 1;
1056    int ret;
1057 
1058    struct vdrm_execbuf_params p = {
1059       .ring_idx = ring_idx,
1060       .req = &req->hdr,
1061       .in_syncobjs = submit->in_syncobjs,
1062       .out_syncobjs = submit->out_syncobjs,
1063       .num_in_syncobjs = submit->nr_in_syncobjs,
1064       .num_out_syncobjs = submit->nr_out_syncobjs,
1065    };
1066 
1067    ret = vdrm_execbuf(vdev->vdrm, &p);
1068 
1069    mtx_unlock(&queue->device->bo_mutex);
1070 
1071    tu_debug_bos_print_stats(queue->device);
1072 
1073    if (ret)
1074       return vk_device_set_lost(&queue->device->vk, "submit failed: %m");
1075 
1076    uint64_t gpu_offset = 0;
1077 #if HAVE_PERFETTO
1078    struct tu_perfetto_clocks clocks =
1079       tu_perfetto_submit(queue->device, queue->device->submit_count, NULL);
1080    gpu_offset = clocks.gpu_ts_offset;
1081 #endif
1082 
1083    if (submit->u_trace_submission_data) {
1084       struct tu_u_trace_submission_data *submission_data =
1085          submit->u_trace_submission_data;
1086       submission_data->submission_id = queue->device->submit_count;
1087       submission_data->gpu_ts_offset = gpu_offset;
1088       /* We have to allocate it here since it is different between drm/kgsl */
1089       submission_data->syncobj = (struct tu_u_trace_syncobj *)
1090          vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj),
1091                8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1092       submission_data->syncobj->fence = req->fence;
1093       submission_data->syncobj->msm_queue_id = queue->msm_queue_id;
1094 
1095       submit->u_trace_submission_data = NULL;
1096 
1097       for (uint32_t i = 0; i < submission_data->cmd_buffer_count; i++) {
1098          bool free_data = i == submission_data->last_buffer_with_tracepoints;
1099          if (submission_data->cmd_trace_data[i].trace)
1100             u_trace_flush(submission_data->cmd_trace_data[i].trace,
1101                           submission_data, free_data);
1102 
1103          if (!submission_data->cmd_trace_data[i].timestamp_copy_cs) {
1104             /* u_trace is owned by cmd_buffer */
1105             submission_data->cmd_trace_data[i].trace = NULL;
1106          }
1107       }
1108    }
1109 
1110    for (uint32_t i = 0; i < submit->vk_submit->wait_count; i++) {
1111       if (!vk_sync_is_tu_timeline_sync(submit->vk_submit->waits[i].sync))
1112          continue;
1113 
1114       struct tu_timeline_sync *sync =
1115          container_of(submit->vk_submit->waits[i].sync, struct tu_timeline_sync, base);
1116 
1117       assert(sync->state != TU_TIMELINE_SYNC_STATE_RESET);
1118 
1119       /* Set the state of the wait timeline sync to SIGNALED since the syncobj
1120        * is done and ready again, so it can be garbage-collected later.
1121        */
1122       sync->state = TU_TIMELINE_SYNC_STATE_SIGNALED;
1123    }
1124 
1125    for (uint32_t i = 0; i < submit->vk_submit->signal_count; i++) {
1126       if (!vk_sync_is_tu_timeline_sync(submit->vk_submit->signals[i].sync))
1127          continue;
1128 
1129       struct tu_timeline_sync *sync =
1130          container_of(submit->vk_submit->signals[i].sync, struct tu_timeline_sync, base);
1131 
1132       assert(sync->state == TU_TIMELINE_SYNC_STATE_RESET);
1133       /* Set the state of the signal timeline sync to SUBMITTED so we can wait
1134        * on this timeline sync until it completes, if necessary.
1135        */
1136       sync->state = TU_TIMELINE_SYNC_STATE_SUBMITTED;
1137    }
1138 
1139    pthread_cond_broadcast(&queue->device->timeline_cond);
1140 
1141    return VK_SUCCESS;
1142 }
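/* In short, the submit path above: build the drm_msm_gem_submit cmds while
 * holding bo_mutex, wrap the bo list and cmds into a single GEM_SUBMIT ccmd,
 * and hand it to vdrm_execbuf() together with the guest-side syncobjs; the
 * kernel fence and the userspace fence share the same seqno (queue->fence).
 */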
1143 
1144 static VkResult
1145 virtio_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj)
1146 {
1147    return tu_wait_fence(dev, syncobj->msm_queue_id, syncobj->fence, 1000000000);
1148 }
1149 
1150 static VkResult
1151 virtio_queue_submit(struct tu_queue *queue, struct vk_queue_submit *submit)
1152 {
1153    MESA_TRACE_FUNC();
1154    uint32_t perf_pass_index = queue->device->perfcntrs_pass_cs ?
1155                               submit->perf_pass_index : ~0;
1156    struct tu_queue_submit submit_req;
1157 
1158    if (TU_DEBUG(LOG_SKIP_GMEM_OPS)) {
1159       tu_dbg_log_gmem_load_store_skips(queue->device);
1160    }
1161 
1162    pthread_mutex_lock(&queue->device->submit_mutex);
1163 
1164    VkResult ret = tu_queue_submit_create_locked(queue, submit,
1165          submit->wait_count, submit->signal_count,
1166          perf_pass_index, &submit_req);
1167 
1168    if (ret != VK_SUCCESS) {
1169       pthread_mutex_unlock(&queue->device->submit_mutex);
1170       return ret;
1171    }
1172 
1173    /* note: assuming there won't be any very large semaphore counts */
1174    struct drm_virtgpu_execbuffer_syncobj *in_syncobjs = submit_req.in_syncobjs;
1175    struct drm_virtgpu_execbuffer_syncobj *out_syncobjs = submit_req.out_syncobjs;
1176 
1177    uint32_t nr_in_syncobjs = 0, nr_out_syncobjs = 0;
1178 
1179    for (uint32_t i = 0; i < submit->wait_count; i++) {
1180       struct vk_sync *sync = submit->waits[i].sync;
1181 
1182       in_syncobjs[nr_in_syncobjs++] = (struct drm_virtgpu_execbuffer_syncobj) {
1183          .handle = tu_syncobj_from_vk_sync(sync),
1184          .flags = 0,
1185          .point = submit->waits[i].wait_value,
1186       };
1187    }
1188 
1189    for (uint32_t i = 0; i < submit->signal_count; i++) {
1190       struct vk_sync *sync = submit->signals[i].sync;
1191 
1192       out_syncobjs[nr_out_syncobjs++] = (struct drm_virtgpu_execbuffer_syncobj) {
1193          .handle = tu_syncobj_from_vk_sync(sync),
1194          .flags = 0,
1195          .point = submit->signals[i].signal_value,
1196       };
1197    }
1198 
1199    ret = tu_queue_submit_locked(queue, &submit_req);
1200 
1201    pthread_mutex_unlock(&queue->device->submit_mutex);
1202    tu_queue_submit_finish(queue, &submit_req);
1203 
1204    if (ret != VK_SUCCESS)
1205        return ret;
1206 
1207    u_trace_context_process(&queue->device->trace_context, true);
1208 
1209    return VK_SUCCESS;
1210 }
1211 
1212 static const struct tu_knl virtio_knl_funcs = {
1213       .name = "virtgpu",
1214 
1215       .device_init = virtio_device_init,
1216       .device_finish = virtio_device_finish,
1217       .device_get_gpu_timestamp = virtio_device_get_gpu_timestamp,
1218       .device_get_suspend_count = virtio_device_get_suspend_count,
1219       .device_check_status = virtio_device_check_status,
1220       .submitqueue_new = virtio_submitqueue_new,
1221       .submitqueue_close = virtio_submitqueue_close,
1222       .bo_init = virtio_bo_init,
1223       .bo_init_dmabuf = virtio_bo_init_dmabuf,
1224       .bo_export_dmabuf = tu_drm_export_dmabuf,
1225       .bo_map = virtio_bo_map,
1226       .bo_allow_dump = virtio_bo_allow_dump,
1227       .bo_finish = tu_drm_bo_finish,
1228       .device_wait_u_trace = virtio_device_wait_u_trace,
1229       .queue_submit = virtio_queue_submit,
1230 };
1231 
1232 VkResult
1233 tu_knl_drm_virtio_load(struct tu_instance *instance,
1234                        int fd, struct _drmVersion *version,
1235                        struct tu_physical_device **out)
1236 {
1237    struct virgl_renderer_capset_drm caps;
1238    struct vdrm_device *vdrm;
1239    VkResult result = VK_SUCCESS;
1240    uint64_t val;
1241 
1242    /* Debug option to force fallback to venus: */
1243    if (debug_get_bool_option("TU_NO_VIRTIO", false))
1244       return VK_ERROR_INCOMPATIBLE_DRIVER;
1245 
1246    if (drmGetCap(fd, DRM_CAP_SYNCOBJ, &val) || !val) {
1247       return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
1248                                "kernel driver for device %s does not support DRM_CAP_SYNCOBJ",
1249                                version->name);
1250    }
1251 
1252    vdrm = vdrm_device_connect(fd, VIRTGPU_DRM_CONTEXT_MSM);
1253    if (!vdrm) {
1254       return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
1255                                "could not connect to vdrm: %s", strerror(errno));
1256    }
1257 
1258    caps = vdrm->caps;
1259 
1260    vdrm_device_close(vdrm);
1261 
1262    mesa_logd("wire_format_version: %u", caps.wire_format_version);
1263    mesa_logd("version_major:       %u", caps.version_major);
1264    mesa_logd("version_minor:       %u", caps.version_minor);
1265    mesa_logd("version_patchlevel:  %u", caps.version_patchlevel);
1266    mesa_logd("has_cached_coherent: %u", caps.u.msm.has_cached_coherent);
1267    mesa_logd("va_start:            0x%0" PRIx64, caps.u.msm.va_start);
1268    mesa_logd("va_size:             0x%0" PRIx64, caps.u.msm.va_size);
1269    mesa_logd("gpu_id:              %u", caps.u.msm.gpu_id);
1270    mesa_logd("gmem_size:           %u", caps.u.msm.gmem_size);
1271    mesa_logd("gmem_base:           0x%0" PRIx64, caps.u.msm.gmem_base);
1272    mesa_logd("chip_id:             0x%0" PRIx64, caps.u.msm.chip_id);
1273    mesa_logd("max_freq:            %u", caps.u.msm.max_freq);
1274 
1275    if (caps.wire_format_version != 2) {
1276       return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
1277                                "Unsupported protocol version: %u",
1278                                caps.wire_format_version);
1279    }
1280 
1281    if ((caps.version_major != 1) || (caps.version_minor < 9)) {
1282       return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
1283                                "unsupported version: %u.%u.%u",
1284                                caps.version_major,
1285                                caps.version_minor,
1286                                caps.version_patchlevel);
1287    }
1288 
1289    if (!caps.u.msm.va_size) {
1290       return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
1291                                "No address space");
1292    }
1293 
1294    struct tu_physical_device *device = (struct tu_physical_device *)
1295       vk_zalloc(&instance->vk.alloc, sizeof(*device), 8,
1296                 VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
1297    if (!device) {
1298       result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1299       goto fail;
1300    }
1301 
1302    device->msm_major_version = caps.version_major;
1303    device->msm_minor_version = caps.version_minor;
1304 
1305    device->instance = instance;
1306    device->local_fd = fd;
1307 
1308    device->dev_id.gpu_id  = caps.u.msm.gpu_id;
1309    device->dev_id.chip_id = caps.u.msm.chip_id;
1310    device->gmem_size      = caps.u.msm.gmem_size;
1311    device->gmem_base      = caps.u.msm.gmem_base;
1312    device->va_start       = caps.u.msm.va_start;
1313    device->va_size        = caps.u.msm.va_size;
1314    device->has_set_iova   = true;
1315 
1316    device->gmem_size = debug_get_num_option("TU_GMEM", device->gmem_size);
1317 
1318    device->has_cached_coherent_memory = caps.u.msm.has_cached_coherent;
1319 
1320    device->submitqueue_priority_count = caps.u.msm.priorities;
1321 
1322    device->syncobj_type = vk_drm_syncobj_get_type(fd);
1323    /* we don't support DRM_CAP_SYNCOBJ_TIMELINE, but drm-shim does */
1324    if (!(device->syncobj_type.features & VK_SYNC_FEATURE_TIMELINE))
1325       device->timeline_type = vk_sync_timeline_get_type(&tu_timeline_sync_type);
1326 
1327    device->sync_types[0] = &device->syncobj_type;
1328    device->sync_types[1] = &device->timeline_type.sync;
1329    device->sync_types[2] = NULL;
1330 
1331    device->heap.size = tu_get_system_heap_size(device);
1332    device->heap.used = 0u;
1333    device->heap.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT;
1334 
1335    instance->knl = &virtio_knl_funcs;
1336 
1337    *out = device;
1338 
1339    return VK_SUCCESS;
1340 
1341 fail:
1342    vk_free(&instance->vk.alloc, device);
1343    return result;
1344 }
1345