/*
 * Copyright © 2018 Google, Inc.
 * Copyright © 2015 Intel Corporation
 * SPDX-License-Identifier: MIT
 *
 * Kernel interface layer for turnip running on virtio_gpu (aka virtgpu)
 */

#include "tu_knl.h"

#include <errno.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <xf86drm.h>

#include "vk_util.h"

#include "drm-uapi/msm_drm.h"
#include "drm-uapi/virtgpu_drm.h"
#include "util/u_debug.h"
#include "util/hash_table.h"
#include "util/libsync.h"
#include "util/u_process.h"

#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_device.h"
#include "tu_dynamic_rendering.h"
#include "tu_knl_drm.h"

#define VIRGL_RENDERER_UNSTABLE_APIS 1
#include "virglrenderer_hw.h"
#include "msm_proto.h"

#include "vdrm.h"

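/* Each slot below holds one pre-built CP_EVENT_WRITE(CACHE_FLUSH_TS) packet
 * that writes the submit's fence value to the global userspace fence address.
 * The header and address are pre-filled in setup_fence_cmds(); only the fence
 * dword is patched per submit, and slots are recycled round-robin (see
 * tu_queue_build_msm_gem_submit_cmds()).  The padding keeps each slot at a
 * fixed 16-dword (64-byte) stride.
 */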
struct tu_userspace_fence_cmd {
   uint32_t pkt[4];    /* first 4 dwords of packet */
   uint32_t fence;     /* fifth dword is fence value which is plugged in at runtime */
   uint32_t _pad[11];
};

struct tu_userspace_fence_cmds {
   struct tu_userspace_fence_cmd cmds[64];
};

struct tu_queue_submit {
   struct vk_queue_submit *vk_submit;
   struct tu_u_trace_submission_data *u_trace_submission_data;

   struct tu_cmd_buffer **cmd_buffers;
   struct drm_msm_gem_submit_cmd *cmds;
   struct drm_virtgpu_execbuffer_syncobj *in_syncobjs;
   struct drm_virtgpu_execbuffer_syncobj *out_syncobjs;

   uint32_t nr_cmd_buffers;
   uint32_t nr_in_syncobjs;
   uint32_t nr_out_syncobjs;
   uint32_t entry_count;
   uint32_t perf_pass_index;

   bool autotune_fence;
};

struct tu_u_trace_syncobj {
   uint32_t msm_queue_id;
   uint32_t fence;
};

struct tu_virtio_device {
   struct vdrm_device *vdrm;
   struct msm_shmem *shmem;
   uint32_t next_blob_id;

   struct tu_userspace_fence_cmds *fence_cmds;
   struct tu_bo *fence_cmds_mem;

   /**
    * Processing zombie VMAs is a two step process: first we clear the iova
    * and then we close the handles. But to minimize waste of virtqueue
    * space (and associated stalling and ping-ponging between guest and host)
    * we want to batch up all the GEM_SET_IOVA ccmds before we flush them to
    * the host and start closing handles.
    *
    * This gives us a place to stash the VMAs between the two steps.
    */
   struct u_vector zombie_vmas_stage_2;
};

static int tu_drm_get_param(struct tu_device *dev, uint32_t param, uint64_t *value);

/**
 * Helper for simple pass-thru ioctls
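 *
 * For example (sketch), a pass-thru GET_PARAM query is issued roughly as:
 *
 *    struct drm_msm_param req = {
 *       .pipe = MSM_PIPE_3D0,
 *       .param = MSM_PARAM_TIMESTAMP,
 *    };
 *    int ret = virtio_simple_ioctl(dev, DRM_IOCTL_MSM_GET_PARAM, &req);
 *
 * The caller's struct is copied into a single IOCTL_SIMPLE ccmd, and for
 * ioctls with an OUT direction the response payload is copied back into it
 * (see tu_drm_get_param() below).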
 */
static int
virtio_simple_ioctl(struct tu_device *dev, unsigned cmd, void *_req)
{
   MESA_TRACE_FUNC();
   struct vdrm_device *vdrm = dev->vdev->vdrm;
   unsigned req_len = sizeof(struct msm_ccmd_ioctl_simple_req);
   unsigned rsp_len = sizeof(struct msm_ccmd_ioctl_simple_rsp);

   req_len += _IOC_SIZE(cmd);
   if (cmd & IOC_OUT)
      rsp_len += _IOC_SIZE(cmd);

   uint8_t buf[req_len];
   struct msm_ccmd_ioctl_simple_req *req = (struct msm_ccmd_ioctl_simple_req *)buf;
   struct msm_ccmd_ioctl_simple_rsp *rsp;

   req->hdr = MSM_CCMD(IOCTL_SIMPLE, req_len);
   req->cmd = cmd;
   memcpy(req->payload, _req, _IOC_SIZE(cmd));

   rsp = (struct msm_ccmd_ioctl_simple_rsp *)
         vdrm_alloc_rsp(vdrm, &req->hdr, rsp_len);

   int ret = vdrm_send_req(vdrm, &req->hdr, true);
   if (ret)
      return ret;

   if (cmd & IOC_OUT)
      memcpy(_req, rsp->payload, _IOC_SIZE(cmd));

   ret = rsp->ret;

   return ret;
}

static int
set_iova(struct tu_device *device, uint32_t res_id, uint64_t iova)
{
   struct msm_ccmd_gem_set_iova_req req = {
      .hdr = MSM_CCMD(GEM_SET_IOVA, sizeof(req)),
      .iova = iova,
      .res_id = res_id,
   };

   return vdrm_send_req(device->vdev->vdrm, &req.hdr, false);
}

static int
query_faults(struct tu_device *dev, uint64_t *value)
{
   struct tu_virtio_device *vdev = dev->vdev;
   uint32_t async_error = 0;
   uint64_t global_faults;

   if (vdrm_shmem_has_field(vdev->shmem, async_error))
      async_error = vdev->shmem->async_error;

   if (vdrm_shmem_has_field(vdev->shmem, global_faults)) {
      global_faults = vdev->shmem->global_faults;
   } else {
      int ret = tu_drm_get_param(dev, MSM_PARAM_FAULTS, &global_faults);
      if (ret)
         return ret;
   }

   *value = global_faults + async_error;

   return 0;
}

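/**
 * Send the guest process name and cmdline to the host, so that host-side
 * logging and debug output can identify which guest process this virtgpu
 * context belongs to.
 */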
static void
set_debuginfo(struct tu_device *dev)
{
   const char *comm = util_get_process_name();
   static char cmdline[0x1000+1];
   int fd = open("/proc/self/cmdline", O_RDONLY);
   if (fd < 0)
      return;

   int n = read(fd, cmdline, sizeof(cmdline) - 1);
   close(fd);
   if (n < 0)
      return;

   /* arguments are separated by NUL, convert to spaces: */
   for (int i = 0; i < n; i++) {
      if (cmdline[i] == '\0') {
         cmdline[i] = ' ';
      }
   }

   cmdline[n] = '\0';

   unsigned comm_len = strlen(comm) + 1;
   unsigned cmdline_len = strlen(cmdline) + 1;

   struct msm_ccmd_set_debuginfo_req *req;

   unsigned req_len = align(sizeof(*req) + comm_len + cmdline_len, 4);

   req = (struct msm_ccmd_set_debuginfo_req *)malloc(req_len);

   req->hdr = MSM_CCMD(SET_DEBUGINFO, req_len);
   req->comm_len = comm_len;
   req->cmdline_len = cmdline_len;

   memcpy(&req->payload[0], comm, comm_len);
   memcpy(&req->payload[comm_len], cmdline, cmdline_len);

   vdrm_send_req(dev->vdev->vdrm, &req->hdr, false);

   free(req);
}

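/**
 * Per-logical-device init: open our own fd for the virtgpu device, connect a
 * vdrm context, grab the host shmem region, and seed the fault count and
 * debug info.
 */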
static VkResult
virtio_device_init(struct tu_device *dev)
{
   struct tu_instance *instance = dev->physical_device->instance;
   int fd;

   fd = open(dev->physical_device->fd_path, O_RDWR | O_CLOEXEC);
   if (fd < 0) {
      return vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
                               "failed to open device %s", dev->physical_device->fd_path);
   }

   struct tu_virtio_device *vdev = (struct tu_virtio_device *)
      vk_zalloc(&instance->vk.alloc, sizeof(*vdev), 8,
                VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
   if (!vdev) {
      close(fd);
      return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   u_vector_init(&vdev->zombie_vmas_stage_2, 64, sizeof(struct tu_zombie_vma));

   dev->vdev = vdev;
   dev->fd = fd;

   vdev->vdrm = vdrm_device_connect(fd, VIRTGPU_DRM_CONTEXT_MSM);

   p_atomic_set(&vdev->next_blob_id, 1);
   vdev->shmem = to_msm_shmem(vdev->vdrm->shmem);

   query_faults(dev, &dev->fault_count);

   set_debuginfo(dev);

   return VK_SUCCESS;
}

static void
virtio_device_finish(struct tu_device *dev)
{
   struct tu_instance *instance = dev->physical_device->instance;
   struct tu_virtio_device *vdev = dev->vdev;

   u_vector_finish(&vdev->zombie_vmas_stage_2);

   vdrm_device_close(vdev->vdrm);

   vk_free(&instance->vk.alloc, vdev);
   dev->vdev = NULL;

   close(dev->fd);
}

static int
tu_drm_get_param(struct tu_device *dev, uint32_t param, uint64_t *value)
{
   /* Technically this requires a pipe, but the kernel only supports one pipe
    * anyway at the time of writing and most of these are clearly pipe
    * independent. */
   struct drm_msm_param req = {
      .pipe = MSM_PIPE_3D0,
      .param = param,
   };

   int ret = virtio_simple_ioctl(dev, DRM_IOCTL_MSM_GET_PARAM, &req);
   if (ret)
      return ret;

   *value = req.value;

   return 0;
}

static int
virtio_device_get_gpu_timestamp(struct tu_device *dev, uint64_t *ts)
{
   return tu_drm_get_param(dev, MSM_PARAM_TIMESTAMP, ts);
}

static int
virtio_device_get_suspend_count(struct tu_device *dev, uint64_t *suspend_count)
{
   int ret = tu_drm_get_param(dev, MSM_PARAM_SUSPENDS, suspend_count);
   return ret;
}

static VkResult
virtio_device_check_status(struct tu_device *device)
{
   uint64_t last_fault_count = device->fault_count;

   query_faults(device, &device->fault_count);

   if (last_fault_count != device->fault_count)
      return vk_device_set_lost(&device->vk, "GPU faulted or hung");

   return VK_SUCCESS;
}

static int
virtio_submitqueue_new(struct tu_device *dev,
                       int priority,
                       uint32_t *queue_id)
{
   assert(priority >= 0 &&
          priority < dev->physical_device->submitqueue_priority_count);

   struct drm_msm_submitqueue req = {
      .flags = 0,
      .prio = priority,
   };

   int ret = virtio_simple_ioctl(dev, DRM_IOCTL_MSM_SUBMITQUEUE_NEW, &req);
   if (ret)
      return ret;

   *queue_id = req.id;
   return 0;
}

static void
virtio_submitqueue_close(struct tu_device *dev, uint32_t queue_id)
{
   virtio_simple_ioctl(dev, DRM_IOCTL_MSM_SUBMITQUEUE_CLOSE, &queue_id);
}

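/**
 * Wait for the userspace fence of a queue to reach (or pass) the requested
 * value.  The current value is visible in the shared global BO, so the common
 * case is a cheap guest-side check; otherwise we fall back to WAIT_FENCE
 * ccmds until the fence is reached or the timeout expires.
 */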
static VkResult
tu_wait_fence(struct tu_device *dev,
              uint32_t queue_id,
              int fence,
              uint64_t timeout_ns)
{
   struct vdrm_device *vdrm = dev->vdev->vdrm;

   if (!fence_before(dev->global_bo_map->userspace_fence, fence))
      return VK_SUCCESS;

   if (!timeout_ns)
      return VK_TIMEOUT;

   MESA_TRACE_FUNC();

   struct msm_ccmd_wait_fence_req req = {
      .hdr = MSM_CCMD(WAIT_FENCE, sizeof(req)),
      .queue_id = queue_id,
      .fence = fence,
   };
   struct msm_ccmd_submitqueue_query_rsp *rsp;
   int64_t end_time = os_time_get_nano() + timeout_ns;
   int ret;

   do {
      rsp = (struct msm_ccmd_submitqueue_query_rsp *)
            vdrm_alloc_rsp(vdrm, &req.hdr, sizeof(*rsp));

      ret = vdrm_send_req(vdrm, &req.hdr, true);
      if (ret)
         goto out;

      if (os_time_get_nano() >= end_time)
         break;

      ret = rsp->ret;
   } while (ret == -ETIMEDOUT);

out:
   if (!ret) return VK_SUCCESS;
   if (ret == -ETIMEDOUT) return VK_TIMEOUT;
   return VK_ERROR_UNKNOWN;
}

static VkResult
tu_free_zombie_vma_locked(struct tu_device *dev, bool wait)
{
   struct tu_virtio_device *vdev = dev->vdev;

   if (!u_vector_length(&dev->zombie_vmas))
      return VK_SUCCESS;

   if (wait) {
      struct tu_zombie_vma *vma = (struct tu_zombie_vma *)
         u_vector_head(&dev->zombie_vmas);
      /* Wait for 3s (arbitrary timeout) */
      VkResult ret = tu_wait_fence(dev, dev->queues[0]->msm_queue_id,
                                   vma->fence, 3000000000);

      if (ret != VK_SUCCESS)
         return ret;
   }

   /* Clear the iova of all finished objects in first pass so the SET_IOVA
    * ccmd's can be buffered and sent together to the host. *Then* delete
    * the handles. This avoids filling up the virtqueue with tiny messages,
    * since each execbuf ends up needing to be page aligned.
    */
   int last_signaled_fence = -1;
   while (u_vector_length(&dev->zombie_vmas) > 0) {
      struct tu_zombie_vma *vma = (struct tu_zombie_vma *)
         u_vector_tail(&dev->zombie_vmas);
      if (vma->fence > last_signaled_fence) {
         VkResult ret =
            tu_wait_fence(dev, dev->queues[0]->msm_queue_id, vma->fence, 0);
         if (ret != VK_SUCCESS)
            break;

         last_signaled_fence = vma->fence;
      }

      set_iova(dev, vma->res_id, 0);

      u_vector_remove(&dev->zombie_vmas);

      struct tu_zombie_vma *vma2 = (struct tu_zombie_vma *)
         u_vector_add(&vdev->zombie_vmas_stage_2);

      *vma2 = *vma;
   }

   /* And _then_ close the GEM handles: */
   while (u_vector_length(&vdev->zombie_vmas_stage_2) > 0) {
      struct tu_zombie_vma *vma = (struct tu_zombie_vma *)
         u_vector_remove(&vdev->zombie_vmas_stage_2);

      util_vma_heap_free(&dev->vma, vma->iova, vma->size);
      vdrm_bo_close(dev->vdev->vdrm, vma->gem_handle);
   }

   return VK_SUCCESS;
}

static VkResult
virtio_allocate_userspace_iova(struct tu_device *dev,
                               uint64_t size,
                               uint64_t client_iova,
                               enum tu_bo_alloc_flags flags,
                               uint64_t *iova)
{
   VkResult result;

   mtx_lock(&dev->vma_mutex);

   *iova = 0;

   tu_free_zombie_vma_locked(dev, false);

   result = tu_allocate_userspace_iova(dev, size, client_iova, flags, iova);
   if (result == VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS) {
      /* Address may be already freed by us, but not considered as
       * freed by the kernel. We have to wait until all work that
       * may hold the address is done. Since addresses are meant to
       * be replayed only by debug tooling, it should be ok to wait.
       */
      tu_free_zombie_vma_locked(dev, true);
      result = tu_allocate_userspace_iova(dev, size, client_iova, flags, iova);
   }

   mtx_unlock(&dev->vma_mutex);

   return result;
}

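/**
 * Common BO setup: bind the iova on the host side and add the buffer to the
 * device-global bo_list that is passed along with every GEM_SUBMIT.
 */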
static VkResult
tu_bo_init(struct tu_device *dev,
           struct tu_bo *bo,
           uint32_t gem_handle,
           uint64_t size,
           uint64_t iova,
           enum tu_bo_alloc_flags flags,
           const char *name)
{
   assert(dev->physical_device->has_set_iova);

   set_iova(dev, bo->res_id, iova);

   name = tu_debug_bos_add(dev, size, name);

   mtx_lock(&dev->bo_mutex);
   uint32_t idx = dev->bo_count++;

   /* grow the bo list if needed */
   if (idx >= dev->bo_list_size) {
      uint32_t new_len = idx + 64;
      struct drm_msm_gem_submit_bo *new_ptr = (struct drm_msm_gem_submit_bo *)
         vk_realloc(&dev->vk.alloc, dev->bo_list, new_len * sizeof(*dev->bo_list),
                    8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
      if (!new_ptr) {
         dev->bo_count--;
         mtx_unlock(&dev->bo_mutex);
         vdrm_bo_close(dev->vdev->vdrm, bo->gem_handle);
         return VK_ERROR_OUT_OF_HOST_MEMORY;
      }

      dev->bo_list = new_ptr;
      dev->bo_list_size = new_len;
   }

   bool dump = flags & TU_BO_ALLOC_ALLOW_DUMP;
   dev->bo_list[idx] = (struct drm_msm_gem_submit_bo) {
      .flags = MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE |
               COND(dump, MSM_SUBMIT_BO_DUMP),
      .handle = bo->res_id,
      .presumed = iova,
   };

   *bo = (struct tu_bo) {
      .gem_handle = gem_handle,
      .res_id = bo->res_id,
      .size = size,
      .iova = iova,
      .name = name,
      .refcnt = 1,
      .bo_list_idx = idx,
   };

   mtx_unlock(&dev->bo_mutex);

   return VK_SUCCESS;
}

/**
 * Sets the name in the kernel so that the contents of /debug/dri/0/gem are more
 * useful.
 *
 * We skip this on release builds (when we're also not doing BO debugging) to
 * reduce overhead.
 */
static void
tu_bo_set_kernel_name(struct tu_device *dev, struct tu_bo *bo, const char *name)
{
   bool kernel_bo_names = dev->bo_sizes != NULL;
#ifdef DEBUG
   kernel_bo_names = true;
#endif
   if (!kernel_bo_names)
      return;

   size_t sz = strlen(name);

   unsigned req_len = sizeof(struct msm_ccmd_gem_set_name_req) + align(sz, 4);

   uint8_t buf[req_len];
   struct msm_ccmd_gem_set_name_req *req = (struct msm_ccmd_gem_set_name_req *)buf;

   req->hdr = MSM_CCMD(GEM_SET_NAME, req_len);
   req->res_id = bo->res_id;
   req->len = sz;

   memcpy(req->payload, name, sz);

   vdrm_send_req(dev->vdev->vdrm, &req->hdr, false);
}

static VkResult
virtio_bo_init(struct tu_device *dev,
               struct tu_bo **out_bo,
               uint64_t size,
               uint64_t client_iova,
               VkMemoryPropertyFlags mem_property,
               enum tu_bo_alloc_flags flags,
               const char *name)
{
   struct tu_virtio_device *vdev = dev->vdev;
   struct msm_ccmd_gem_new_req req = {
      .hdr = MSM_CCMD(GEM_NEW, sizeof(req)),
      .size = size,
   };
   VkResult result;

   result = virtio_allocate_userspace_iova(dev, size, client_iova,
                                           flags, &req.iova);
   if (result != VK_SUCCESS) {
      return result;
   }

   if (mem_property & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) {
      if (mem_property & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) {
         req.flags |= MSM_BO_CACHED_COHERENT;
      } else {
         req.flags |= MSM_BO_CACHED;
      }
   } else {
      req.flags |= MSM_BO_WC;
   }

   uint32_t blob_flags = 0;
   if (mem_property & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
      blob_flags |= VIRTGPU_BLOB_FLAG_USE_MAPPABLE;
   }

   if (!(mem_property & VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT)) {
      blob_flags |= VIRTGPU_BLOB_FLAG_USE_CROSS_DEVICE |
                    VIRTGPU_BLOB_FLAG_USE_SHAREABLE;
   }

   if (flags & TU_BO_ALLOC_GPU_READ_ONLY)
      req.flags |= MSM_BO_GPU_READONLY;

   /* tunneled cmds are processed separately on host side,
    * before the renderer->get_blob() callback.. the blob_id
    * is used to link the created bo to the get_blob() call
    */
   req.blob_id = p_atomic_inc_return(&vdev->next_blob_id);

   uint32_t handle =
      vdrm_bo_create(vdev->vdrm, size, blob_flags, req.blob_id, &req.hdr);

   if (!handle) {
      util_vma_heap_free(&dev->vma, req.iova, size);
      return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
   }

   uint32_t res_id = vdrm_handle_to_res_id(vdev->vdrm, handle);
   struct tu_bo *bo = tu_device_lookup_bo(dev, res_id);
   assert(bo && bo->gem_handle == 0);

   bo->res_id = res_id;

   result = tu_bo_init(dev, bo, handle, size, req.iova, flags, name);
   if (result != VK_SUCCESS)
      memset(bo, 0, sizeof(*bo));
   else
      *out_bo = bo;

   /* We don't use bo->name here because for the !TU_DEBUG=bo case bo->name is NULL. */
   tu_bo_set_kernel_name(dev, bo, name);

   if (result == VK_SUCCESS &&
       (mem_property & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) &&
       !(mem_property & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
      tu_bo_map(dev, bo);

      /* Cached non-coherent memory may already have dirty cache lines,
       * we should clean the cache lines before GPU got the chance to
       * write into this memory.
       *
       * MSM already does this automatically for uncached (MSM_BO_WC) memory.
       */
      tu_sync_cache_bo(dev, bo, 0, VK_WHOLE_SIZE, TU_MEM_SYNC_CACHE_TO_GPU);
   }

   return result;
}

static VkResult
virtio_bo_init_dmabuf(struct tu_device *dev,
                      struct tu_bo **out_bo,
                      uint64_t size,
                      int prime_fd)
{
   struct vdrm_device *vdrm = dev->vdev->vdrm;
   VkResult result;
   struct tu_bo *bo = NULL;

   /* lseek() to get the real size */
   off_t real_size = lseek(prime_fd, 0, SEEK_END);
   lseek(prime_fd, 0, SEEK_SET);
   if (real_size < 0 || (uint64_t) real_size < size)
      return vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE);

   /* iova allocation needs to consider the object's *real* size: */
   size = real_size;

   uint64_t iova;
   result = virtio_allocate_userspace_iova(dev, size, 0, TU_BO_ALLOC_NO_FLAGS, &iova);
   if (result != VK_SUCCESS)
      return result;

   /* Importing the same dmabuf several times would yield the same
    * gem_handle. Thus there could be a race when destroying
    * BO and importing the same dmabuf from different threads.
    * We must not permit the creation of dmabuf BO and its release
    * to happen in parallel.
    */
   u_rwlock_wrlock(&dev->dma_bo_lock);

   uint32_t handle, res_id;

   handle = vdrm_dmabuf_to_handle(vdrm, prime_fd);
   if (!handle) {
      result = vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE);
      goto out_unlock;
   }

   res_id = vdrm_handle_to_res_id(vdrm, handle);
   if (!res_id) {
      result = vk_error(dev, VK_ERROR_INVALID_EXTERNAL_HANDLE);
      goto out_unlock;
   }

   bo = tu_device_lookup_bo(dev, res_id);

   if (bo->refcnt != 0) {
      p_atomic_inc(&bo->refcnt);
      assert(bo->res_id == res_id);
      *out_bo = bo;
      result = VK_SUCCESS;
      goto out_unlock;
   }

   bo->res_id = res_id;

   result = tu_bo_init(dev, bo, handle, size, iova,
                       TU_BO_ALLOC_NO_FLAGS, "dmabuf");
   if (result != VK_SUCCESS)
      memset(bo, 0, sizeof(*bo));
   else
      *out_bo = bo;

out_unlock:
   u_rwlock_wrunlock(&dev->dma_bo_lock);
   if (result != VK_SUCCESS) {
      mtx_lock(&dev->vma_mutex);
      util_vma_heap_free(&dev->vma, iova, size);
      mtx_unlock(&dev->vma_mutex);
   }

   return result;
}

static VkResult
virtio_bo_map(struct tu_device *dev, struct tu_bo *bo)
{
   if (bo->map)
      return VK_SUCCESS;

   bo->map = vdrm_bo_map(dev->vdev->vdrm, bo->gem_handle, bo->size);
   if (bo->map == MAP_FAILED)
      return vk_error(dev, VK_ERROR_MEMORY_MAP_FAILED);

   return VK_SUCCESS;
}

static void
virtio_bo_allow_dump(struct tu_device *dev, struct tu_bo *bo)
{
   mtx_lock(&dev->bo_mutex);
   dev->bo_list[bo->bo_list_idx].flags |= MSM_SUBMIT_BO_DUMP;
   mtx_unlock(&dev->bo_mutex);
}

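/**
 * Build the temporary tu_queue_submit for a vk_queue_submit: expand dynamic
 * command buffers, count the gem_submit cmd entries we will need (perf pass,
 * per-cmdbuf IBs, timestamp copies, autotune and the trailing userspace
 * fence cmd), and allocate the cmd and syncobj arrays.
 */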
static VkResult
tu_queue_submit_create_locked(struct tu_queue *queue,
                              struct vk_queue_submit *vk_submit,
                              const uint32_t nr_in_syncobjs,
                              const uint32_t nr_out_syncobjs,
                              uint32_t perf_pass_index,
                              struct tu_queue_submit *new_submit)
{
   VkResult result;

   bool u_trace_enabled = u_trace_should_process(&queue->device->trace_context);
   bool has_trace_points = false;

   struct vk_command_buffer **vk_cmd_buffers = vk_submit->command_buffers;

   memset(new_submit, 0, sizeof(struct tu_queue_submit));

   new_submit->cmd_buffers = (struct tu_cmd_buffer **) vk_cmd_buffers;
   new_submit->nr_cmd_buffers = vk_submit->command_buffer_count;
   tu_insert_dynamic_cmdbufs(queue->device, &new_submit->cmd_buffers,
                             &new_submit->nr_cmd_buffers);

   uint32_t entry_count = 0;
   for (uint32_t j = 0; j < new_submit->nr_cmd_buffers; ++j) {
      struct tu_cmd_buffer *cmdbuf = new_submit->cmd_buffers[j];

      if (perf_pass_index != ~0)
         entry_count++;

      entry_count += cmdbuf->cs.entry_count;

      if (u_trace_enabled && u_trace_has_points(&cmdbuf->trace)) {
         if (!(cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT))
            entry_count++;

         has_trace_points = true;
      }
   }

   new_submit->autotune_fence =
      tu_autotune_submit_requires_fence(new_submit->cmd_buffers, new_submit->nr_cmd_buffers);
   if (new_submit->autotune_fence)
      entry_count++;

   /* Add one for the userspace fence cmd: */
   entry_count += 1;

   new_submit->cmds = (struct drm_msm_gem_submit_cmd *) vk_zalloc(
      &queue->device->vk.alloc, entry_count * sizeof(*new_submit->cmds), 8,
      VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);

   if (new_submit->cmds == NULL) {
      result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto fail_cmds;
   }

   if (has_trace_points) {
      result =
         tu_u_trace_submission_data_create(
            queue->device, new_submit->cmd_buffers,
            new_submit->nr_cmd_buffers,
            &new_submit->u_trace_submission_data);

      if (result != VK_SUCCESS) {
         goto fail_u_trace_submission_data;
      }
   }

   /* Allocate without wait timeline semaphores */
   new_submit->in_syncobjs = (struct drm_virtgpu_execbuffer_syncobj *) vk_zalloc(
      &queue->device->vk.alloc,
      nr_in_syncobjs * sizeof(*new_submit->in_syncobjs), 8,
      VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);

   if (new_submit->in_syncobjs == NULL) {
      result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto fail_in_syncobjs;
   }

   /* Allocate with signal timeline semaphores considered */
   new_submit->out_syncobjs = (struct drm_virtgpu_execbuffer_syncobj *) vk_zalloc(
      &queue->device->vk.alloc,
      nr_out_syncobjs * sizeof(*new_submit->out_syncobjs), 8,
      VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);

   if (new_submit->out_syncobjs == NULL) {
      result = vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto fail_out_syncobjs;
   }

   new_submit->entry_count = entry_count;
   new_submit->nr_in_syncobjs = nr_in_syncobjs;
   new_submit->nr_out_syncobjs = nr_out_syncobjs;
   new_submit->perf_pass_index = perf_pass_index;
   new_submit->vk_submit = vk_submit;

   return VK_SUCCESS;

fail_out_syncobjs:
   vk_free(&queue->device->vk.alloc, new_submit->in_syncobjs);
fail_in_syncobjs:
   if (new_submit->u_trace_submission_data)
      tu_u_trace_submission_data_finish(queue->device,
                                        new_submit->u_trace_submission_data);
fail_u_trace_submission_data:
   vk_free(&queue->device->vk.alloc, new_submit->cmds);
fail_cmds:
   return result;
}

static void
tu_queue_submit_finish(struct tu_queue *queue, struct tu_queue_submit *submit)
{
   vk_free(&queue->device->vk.alloc, submit->cmds);
   vk_free(&queue->device->vk.alloc, submit->in_syncobjs);
   vk_free(&queue->device->vk.alloc, submit->out_syncobjs);
   if (submit->cmd_buffers != (void *) submit->vk_submit->command_buffers)
      vk_free(&queue->device->vk.alloc, submit->cmd_buffers);
}

static void
tu_fill_msm_gem_submit(struct tu_device *dev,
                       struct drm_msm_gem_submit_cmd *cmd,
                       struct tu_cs_entry *cs_entry)
{
   cmd->type = MSM_SUBMIT_CMD_BUF;
   cmd->submit_idx = cs_entry->bo->bo_list_idx;
   cmd->submit_offset = cs_entry->offset;
   cmd->size = cs_entry->size;
   cmd->pad = 0;
   cmd->nr_relocs = 0;
   cmd->relocs = 0;
}

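/**
 * Fill in the drm_msm_gem_submit_cmd array in submit order: optional perf
 * counter pass IB, each command buffer's IBs, optional timestamp-copy and
 * autotune IBs, and finally the userspace fence write.  Must be called with
 * bo_mutex held since the entries bake in bo_list indices.
 */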
static void
tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue,
                                   struct tu_queue_submit *submit,
                                   struct tu_cs *autotune_cs)
{
   struct tu_device *dev = queue->device;
   struct tu_virtio_device *vdev = dev->vdev;
   struct drm_msm_gem_submit_cmd *cmds = submit->cmds;

   uint32_t entry_idx = 0;
   for (uint32_t j = 0; j < submit->nr_cmd_buffers; ++j) {
      struct tu_device *dev = queue->device;
      struct tu_cmd_buffer *cmdbuf = submit->cmd_buffers[j];
      struct tu_cs *cs = &cmdbuf->cs;

      if (submit->perf_pass_index != ~0) {
         struct tu_cs_entry *perf_cs_entry =
            &dev->perfcntrs_pass_cs_entries[submit->perf_pass_index];

         tu_fill_msm_gem_submit(dev, &cmds[entry_idx], perf_cs_entry);
         entry_idx++;
      }

      for (unsigned i = 0; i < cs->entry_count; ++i, ++entry_idx) {
         tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &cs->entries[i]);
      }

      if (submit->u_trace_submission_data) {
         struct tu_cs *ts_cs =
            submit->u_trace_submission_data->cmd_trace_data[j].timestamp_copy_cs;
         if (ts_cs) {
            tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &ts_cs->entries[0]);
            entry_idx++;
         }
      }
   }

   if (autotune_cs) {
      assert(autotune_cs->entry_count == 1);
      tu_fill_msm_gem_submit(dev, &cmds[entry_idx], &autotune_cs->entries[0]);
      entry_idx++;
   }

   /* Last, add the userspace fence cmd: */
   struct tu_userspace_fence_cmds *fcmds = vdev->fence_cmds;
   if (queue->fence <= 0)
      queue->fence = 0;
   uint32_t fence = ++queue->fence;
   int idx = fence % ARRAY_SIZE(fcmds->cmds);

   /* Wait for previous usage of fence cmd to be idle.. in practice the table
    * of recycled cmds should be big enough to never stall here:
    */
   tu_wait_fence(dev, dev->queues[0]->msm_queue_id, fcmds->cmds[idx].fence, 3000000000);

   fcmds->cmds[idx].fence = fence;

   cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF;
   cmds[entry_idx].submit_idx = vdev->fence_cmds_mem->bo_list_idx;
   cmds[entry_idx].submit_offset = ((intptr_t)&fcmds->cmds[idx]) - (intptr_t)fcmds;
   cmds[entry_idx].size = 5 * 4;
   cmds[entry_idx].pad = 0;
   cmds[entry_idx].nr_relocs = 0;
   cmds[entry_idx].relocs = 0;
}

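/**
 * Lazily allocate the BO holding the table of recycled userspace fence
 * packets and pre-fill each slot with a CP_EVENT_WRITE(CACHE_FLUSH_TS)
 * pointing at the global userspace_fence address; only the fence value
 * itself is patched per submit.
 */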
static VkResult
setup_fence_cmds(struct tu_device *dev)
{
   struct tu_virtio_device *vdev = dev->vdev;
   VkResult result;

   result = tu_bo_init_new(dev, &vdev->fence_cmds_mem, sizeof(*vdev->fence_cmds),
                           (enum tu_bo_alloc_flags)
                           (TU_BO_ALLOC_ALLOW_DUMP | TU_BO_ALLOC_GPU_READ_ONLY),
                           "fence_cmds");
   if (result != VK_SUCCESS)
      return result;

   result = tu_bo_map(dev, vdev->fence_cmds_mem);
   if (result != VK_SUCCESS)
      return result;

   vdev->fence_cmds = (struct tu_userspace_fence_cmds *)vdev->fence_cmds_mem->map;

   uint64_t fence_iova = dev->global_bo->iova + gb_offset(userspace_fence);
   for (int i = 0; i < ARRAY_SIZE(vdev->fence_cmds->cmds); i++) {
      struct tu_userspace_fence_cmd *c = &vdev->fence_cmds->cmds[i];

      memset(c, 0, sizeof(*c));

      c->pkt[0] = pm4_pkt7_hdr((uint8_t)CP_EVENT_WRITE, 4);
      c->pkt[1] = CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS);
      c->pkt[2] = fence_iova;
      c->pkt[3] = fence_iova >> 32;
   }

   return result;
}

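/**
 * Execute one queue submission: build the GEM_SUBMIT ccmd (bo list plus cmd
 * entries), hand it to the host ring via vdrm_execbuf() together with the
 * in/out syncobjs, then flush u_trace data and update timeline sync states.
 */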
static VkResult
tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
{
   struct tu_virtio_device *vdev = queue->device->vdev;

   queue->device->submit_count++;

   /* It would be nice to not need to defer this, but virtio_device_init()
    * happens before the device is initialized enough to allocate normal
    * GEM buffers
    */
   if (!vdev->fence_cmds) {
      VkResult result = setup_fence_cmds(queue->device);
      if (result != VK_SUCCESS)
         return result;
   }

   struct tu_cs *autotune_cs = NULL;
   if (submit->autotune_fence) {
      autotune_cs = tu_autotune_on_submit(queue->device,
                                          &queue->device->autotune,
                                          submit->cmd_buffers,
                                          submit->nr_cmd_buffers);
   }

   uint32_t flags = MSM_PIPE_3D0;

   if (submit->vk_submit->wait_count)
      flags |= MSM_SUBMIT_SYNCOBJ_IN;

   if (submit->vk_submit->signal_count)
      flags |= MSM_SUBMIT_SYNCOBJ_OUT;

   mtx_lock(&queue->device->bo_mutex);

   if (queue->device->implicit_sync_bo_count == 0)
      flags |= MSM_SUBMIT_NO_IMPLICIT;

   /* drm_msm_gem_submit_cmd requires the index of the bo, which could change
    * at any time while bo_mutex is not locked. So we build the submit cmds
    * here, at the real place of submission.
    */
   tu_queue_build_msm_gem_submit_cmds(queue, submit, autotune_cs);

   /* TODO avoid extra memcpy, and populate bo's and cmds directly
    * into the req msg
    */
   unsigned nr_cmds = submit->entry_count;
   unsigned nr_bos = nr_cmds ? queue->device->bo_count : 0;
   unsigned bos_len = nr_bos * sizeof(struct drm_msm_gem_submit_bo);
   unsigned cmd_len = nr_cmds * sizeof(struct drm_msm_gem_submit_cmd);
   unsigned req_len = sizeof(struct msm_ccmd_gem_submit_req) + bos_len + cmd_len;
   struct msm_ccmd_gem_submit_req *req = (struct msm_ccmd_gem_submit_req *)vk_alloc(
      &queue->device->vk.alloc, req_len, 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);

   if (!req) {
      mtx_unlock(&queue->device->bo_mutex);
      return vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   req->hdr = MSM_CCMD(GEM_SUBMIT, req_len);
   req->flags = flags;
   req->queue_id = queue->msm_queue_id;
   req->nr_bos = nr_bos;
   req->nr_cmds = nr_cmds;

   /* Use same kernel fence and userspace fence seqno to avoid having
    * to track both:
    */
   req->fence = queue->fence;

   memcpy(req->payload, queue->device->bo_list, bos_len);
   memcpy(req->payload + bos_len, submit->cmds, cmd_len);

   int ring_idx = queue->priority + 1;
   int ret;

   struct vdrm_execbuf_params p = {
      .ring_idx = ring_idx,
      .req = &req->hdr,
      .in_syncobjs = submit->in_syncobjs,
      .out_syncobjs = submit->out_syncobjs,
      .num_in_syncobjs = submit->nr_in_syncobjs,
      .num_out_syncobjs = submit->nr_out_syncobjs,
   };

   ret = vdrm_execbuf(vdev->vdrm, &p);

   mtx_unlock(&queue->device->bo_mutex);

   tu_debug_bos_print_stats(queue->device);

   if (ret)
      return vk_device_set_lost(&queue->device->vk, "submit failed: %m");

   uint64_t gpu_offset = 0;
#if HAVE_PERFETTO
   struct tu_perfetto_clocks clocks =
      tu_perfetto_submit(queue->device, queue->device->submit_count, NULL);
   gpu_offset = clocks.gpu_ts_offset;
#endif

   if (submit->u_trace_submission_data) {
      struct tu_u_trace_submission_data *submission_data =
         submit->u_trace_submission_data;
      submission_data->submission_id = queue->device->submit_count;
      submission_data->gpu_ts_offset = gpu_offset;
      /* We have to allocate it here since it is different between drm/kgsl */
      submission_data->syncobj = (struct tu_u_trace_syncobj *)
         vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj),
                  8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
      submission_data->syncobj->fence = req->fence;
      submission_data->syncobj->msm_queue_id = queue->msm_queue_id;

      submit->u_trace_submission_data = NULL;

      for (uint32_t i = 0; i < submission_data->cmd_buffer_count; i++) {
         bool free_data = i == submission_data->last_buffer_with_tracepoints;
         if (submission_data->cmd_trace_data[i].trace)
            u_trace_flush(submission_data->cmd_trace_data[i].trace,
                          submission_data, free_data);

         if (!submission_data->cmd_trace_data[i].timestamp_copy_cs) {
            /* u_trace is owned by cmd_buffer */
            submission_data->cmd_trace_data[i].trace = NULL;
         }
      }
   }

   for (uint32_t i = 0; i < submit->vk_submit->wait_count; i++) {
      if (!vk_sync_is_tu_timeline_sync(submit->vk_submit->waits[i].sync))
         continue;

      struct tu_timeline_sync *sync =
         container_of(submit->vk_submit->waits[i].sync, struct tu_timeline_sync, base);

      assert(sync->state != TU_TIMELINE_SYNC_STATE_RESET);

      /* Set the state of the wait timeline sync to SIGNALED since this means
       * the syncobj is done and ready again, so it can be garbage-collected
       * later.
       */
      sync->state = TU_TIMELINE_SYNC_STATE_SIGNALED;
   }

   for (uint32_t i = 0; i < submit->vk_submit->signal_count; i++) {
      if (!vk_sync_is_tu_timeline_sync(submit->vk_submit->signals[i].sync))
         continue;

      struct tu_timeline_sync *sync =
         container_of(submit->vk_submit->signals[i].sync, struct tu_timeline_sync, base);

      assert(sync->state == TU_TIMELINE_SYNC_STATE_RESET);
      /* Set the state of the signal timeline sync to SUBMITTED so we can wait
       * on this timeline sync until completed if necessary.
       */
      sync->state = TU_TIMELINE_SYNC_STATE_SUBMITTED;
   }

   pthread_cond_broadcast(&queue->device->timeline_cond);

   return VK_SUCCESS;
}

static VkResult
virtio_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj)
{
   return tu_wait_fence(dev, syncobj->msm_queue_id, syncobj->fence, 1000000000);
}

static VkResult
virtio_queue_submit(struct tu_queue *queue, struct vk_queue_submit *submit)
{
   MESA_TRACE_FUNC();
   uint32_t perf_pass_index = queue->device->perfcntrs_pass_cs ?
                              submit->perf_pass_index : ~0;
   struct tu_queue_submit submit_req;

   if (TU_DEBUG(LOG_SKIP_GMEM_OPS)) {
      tu_dbg_log_gmem_load_store_skips(queue->device);
   }

   pthread_mutex_lock(&queue->device->submit_mutex);

   VkResult ret = tu_queue_submit_create_locked(queue, submit,
                                                submit->wait_count, submit->signal_count,
                                                perf_pass_index, &submit_req);

   if (ret != VK_SUCCESS) {
      pthread_mutex_unlock(&queue->device->submit_mutex);
      return ret;
   }

   /* note: assuming there won't be any very large semaphore counts */
   struct drm_virtgpu_execbuffer_syncobj *in_syncobjs = submit_req.in_syncobjs;
   struct drm_virtgpu_execbuffer_syncobj *out_syncobjs = submit_req.out_syncobjs;

   uint32_t nr_in_syncobjs = 0, nr_out_syncobjs = 0;

   for (uint32_t i = 0; i < submit->wait_count; i++) {
      struct vk_sync *sync = submit->waits[i].sync;

      in_syncobjs[nr_in_syncobjs++] = (struct drm_virtgpu_execbuffer_syncobj) {
         .handle = tu_syncobj_from_vk_sync(sync),
         .flags = 0,
         .point = submit->waits[i].wait_value,
      };
   }

   for (uint32_t i = 0; i < submit->signal_count; i++) {
      struct vk_sync *sync = submit->signals[i].sync;

      out_syncobjs[nr_out_syncobjs++] = (struct drm_virtgpu_execbuffer_syncobj) {
         .handle = tu_syncobj_from_vk_sync(sync),
         .flags = 0,
         .point = submit->signals[i].signal_value,
      };
   }

   ret = tu_queue_submit_locked(queue, &submit_req);

   pthread_mutex_unlock(&queue->device->submit_mutex);
   tu_queue_submit_finish(queue, &submit_req);

   if (ret != VK_SUCCESS)
      return ret;

   u_trace_context_process(&queue->device->trace_context, true);

   return VK_SUCCESS;
}

static const struct tu_knl virtio_knl_funcs = {
   .name = "virtgpu",

   .device_init = virtio_device_init,
   .device_finish = virtio_device_finish,
   .device_get_gpu_timestamp = virtio_device_get_gpu_timestamp,
   .device_get_suspend_count = virtio_device_get_suspend_count,
   .device_check_status = virtio_device_check_status,
   .submitqueue_new = virtio_submitqueue_new,
   .submitqueue_close = virtio_submitqueue_close,
   .bo_init = virtio_bo_init,
   .bo_init_dmabuf = virtio_bo_init_dmabuf,
   .bo_export_dmabuf = tu_drm_export_dmabuf,
   .bo_map = virtio_bo_map,
   .bo_allow_dump = virtio_bo_allow_dump,
   .bo_finish = tu_drm_bo_finish,
   .device_wait_u_trace = virtio_device_wait_u_trace,
   .queue_submit = virtio_queue_submit,
};

VkResult
tu_knl_drm_virtio_load(struct tu_instance *instance,
                       int fd, struct _drmVersion *version,
                       struct tu_physical_device **out)
{
   struct virgl_renderer_capset_drm caps;
   struct vdrm_device *vdrm;
   VkResult result = VK_SUCCESS;
   uint64_t val;

   /* Debug option to force fallback to venus: */
   if (debug_get_bool_option("TU_NO_VIRTIO", false))
      return VK_ERROR_INCOMPATIBLE_DRIVER;

   if (drmGetCap(fd, DRM_CAP_SYNCOBJ, &val) || !val) {
      return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
                               "kernel driver for device %s does not support DRM_CAP_SYNCOBJ",
                               version->name);
   }

   vdrm = vdrm_device_connect(fd, VIRTGPU_DRM_CONTEXT_MSM);
   if (!vdrm) {
      return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
                               "could not connect vdrm: %s", strerror(errno));
   }

   caps = vdrm->caps;

   vdrm_device_close(vdrm);

   mesa_logd("wire_format_version: %u", caps.wire_format_version);
   mesa_logd("version_major: %u", caps.version_major);
   mesa_logd("version_minor: %u", caps.version_minor);
   mesa_logd("version_patchlevel: %u", caps.version_patchlevel);
   mesa_logd("has_cached_coherent: %u", caps.u.msm.has_cached_coherent);
   mesa_logd("va_start: 0x%0" PRIx64, caps.u.msm.va_start);
   mesa_logd("va_size: 0x%0" PRIx64, caps.u.msm.va_size);
   mesa_logd("gpu_id: %u", caps.u.msm.gpu_id);
   mesa_logd("gmem_size: %u", caps.u.msm.gmem_size);
   mesa_logd("gmem_base: 0x%0" PRIx64, caps.u.msm.gmem_base);
   mesa_logd("chip_id: 0x%0" PRIx64, caps.u.msm.chip_id);
   mesa_logd("max_freq: %u", caps.u.msm.max_freq);

   if (caps.wire_format_version != 2) {
      return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
                               "Unsupported protocol version: %u",
                               caps.wire_format_version);
   }

   if ((caps.version_major != 1) || (caps.version_minor < 9)) {
      return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
                               "unsupported version: %u.%u.%u",
                               caps.version_major,
                               caps.version_minor,
                               caps.version_patchlevel);
   }

   if (!caps.u.msm.va_size) {
      return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
                               "No address space");
   }

   struct tu_physical_device *device = (struct tu_physical_device *)
      vk_zalloc(&instance->vk.alloc, sizeof(*device), 8,
                VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
   if (!device) {
      result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto fail;
   }

   device->msm_major_version = caps.version_major;
   device->msm_minor_version = caps.version_minor;

   device->instance = instance;
   device->local_fd = fd;

   device->dev_id.gpu_id = caps.u.msm.gpu_id;
   device->dev_id.chip_id = caps.u.msm.chip_id;
   device->gmem_size = caps.u.msm.gmem_size;
   device->gmem_base = caps.u.msm.gmem_base;
   device->va_start = caps.u.msm.va_start;
   device->va_size = caps.u.msm.va_size;
   device->has_set_iova = true;

   device->gmem_size = debug_get_num_option("TU_GMEM", device->gmem_size);

   device->has_cached_coherent_memory = caps.u.msm.has_cached_coherent;

   device->submitqueue_priority_count = caps.u.msm.priorities;

   device->syncobj_type = vk_drm_syncobj_get_type(fd);
   /* we don't support DRM_CAP_SYNCOBJ_TIMELINE, but drm-shim does */
   if (!(device->syncobj_type.features & VK_SYNC_FEATURE_TIMELINE))
      device->timeline_type = vk_sync_timeline_get_type(&tu_timeline_sync_type);

   device->sync_types[0] = &device->syncobj_type;
   device->sync_types[1] = &device->timeline_type.sync;
   device->sync_types[2] = NULL;

   device->heap.size = tu_get_system_heap_size(device);
   device->heap.used = 0u;
   device->heap.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT;

   instance->knl = &virtio_knl_funcs;

   *out = device;

   return VK_SUCCESS;

fail:
   vk_free(&instance->vk.alloc, device);
   return result;
}