/*
 * Copyright © 2019 Raspberry Pi
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "drm-uapi/v3d_drm.h"

#include "broadcom/clif/clif_dump.h"

#include <errno.h>
#include <time.h>

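/* Dumps the job's command lists in CLIF format to stderr when one of the
 * V3D_DEBUG CL/CLIF debug flags is enabled. Every BO referenced by the job
 * is CPU-mapped so its contents can be included in the dump.
 */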
static void
v3dv_clif_dump(struct v3dv_device *device,
               struct v3dv_job *job,
               struct drm_v3d_submit_cl *submit)
{
   if (!(unlikely(V3D_DEBUG & (V3D_DEBUG_CL |
                               V3D_DEBUG_CL_NO_BIN |
                               V3D_DEBUG_CLIF))))
      return;

   struct clif_dump *clif = clif_dump_init(&device->devinfo,
                                           stderr,
                                           V3D_DEBUG & (V3D_DEBUG_CL |
                                                        V3D_DEBUG_CL_NO_BIN),
                                           V3D_DEBUG & V3D_DEBUG_CL_NO_BIN);

   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (void *)entry->key;
      char *name = ralloc_asprintf(NULL, "%s_0x%x",
                                   bo->name, bo->offset);

      bool ok = v3dv_bo_map(device, bo, bo->size);
      if (!ok) {
         fprintf(stderr, "failed to map BO for clif_dump.\n");
         ralloc_free(name);
         goto free_clif;
      }
      clif_dump_add_bo(clif, name, bo->offset, bo->size, bo->map);

      ralloc_free(name);
   }

   clif_dump(clif, submit);

free_clif:
   clif_dump_destroy(clif);
}

static uint64_t
gettime_ns()
{
   struct timespec current;
   clock_gettime(CLOCK_MONOTONIC, &current);
   return (uint64_t)current.tv_sec * NSEC_PER_SEC + current.tv_nsec;
}

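/* Converts a relative timeout in nanoseconds into an absolute deadline on
 * CLOCK_MONOTONIC, clamping the addition so it cannot overflow INT64_MAX.
 */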
static uint64_t
get_absolute_timeout(uint64_t timeout)
{
   uint64_t current_time = gettime_ns();
   uint64_t max_timeout = (uint64_t) INT64_MAX - current_time;

   timeout = MIN2(max_timeout, timeout);

   return (current_time + timeout);
}

static VkResult
queue_submit_job(struct v3dv_queue *queue,
                 struct v3dv_job *job,
                 bool do_sem_wait,
                 pthread_t *wait_thread);

/* Waits for active CPU wait threads spawned before the current thread to
 * complete and submit all their GPU jobs.
 */
static void
cpu_queue_wait_idle(struct v3dv_queue *queue)
{
   const pthread_t this_thread = pthread_self();

retry:
   mtx_lock(&queue->mutex);
   list_for_each_entry(struct v3dv_queue_submit_wait_info, info,
                       &queue->submit_wait_list, list_link) {
      for (uint32_t i = 0; i < info->wait_thread_count; i++) {
         if (info->wait_threads[i].finished)
            continue;

         /* Because we are testing this against the list of spawned threads
          * it will never match for the main thread, so when we call this from
          * the main thread we are effectively waiting for all active threads
          * to complete, and otherwise we are only waiting for work submitted
          * before the wait thread that called this (a wait thread should never
          * be waiting for work submitted after it).
          */
         if (info->wait_threads[i].thread == this_thread)
            goto done;

         /* Wait and try again */
         mtx_unlock(&queue->mutex);
         usleep(500); /* 0.5 ms */
         goto retry;
      }
   }

done:
   mtx_unlock(&queue->mutex);
}

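/* Waits for the last GPU job submitted on this device to complete by waiting
 * on the device's last_job_sync DRM syncobj.
 */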
static VkResult
gpu_queue_wait_idle(struct v3dv_queue *queue)
{
   struct v3dv_device *device = queue->device;

   mtx_lock(&device->mutex);
   uint32_t last_job_sync = device->last_job_sync;
   mtx_unlock(&device->mutex);

   int ret = drmSyncobjWait(device->pdevice->render_fd,
                            &last_job_sync, 1, INT64_MAX, 0, NULL);
   if (ret)
      return VK_ERROR_DEVICE_LOST;

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_QueueWaitIdle(VkQueue _queue)
{
   V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);

   /* Check that we don't have any wait threads running in the CPU first,
    * as these can spawn new GPU jobs.
    */
   cpu_queue_wait_idle(queue);

   /* Check we don't have any GPU jobs running */
   return gpu_queue_wait_idle(queue);
}

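/* CPU job: implements vkCmdResetQueryPool by clearing each query's occlusion
 * counter in its BO (or its cached timestamp value) and marking the queries
 * as not available.
 */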
static VkResult
handle_reset_query_cpu_job(struct v3dv_job *job)
{
   struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset;
   assert(info->pool);

   /* We are about to reset query counters so we need to make sure that
    * the GPU is not using them. The exception is timestamp queries, since
    * we handle those on the CPU.
    *
    * FIXME: we could avoid blocking the main thread for this if we used a
    * submission thread.
    */
   if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION)
      v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE);

   for (uint32_t i = info->first; i < info->first + info->count; i++) {
      assert(i < info->pool->query_count);
      struct v3dv_query *q = &info->pool->queries[i];
      q->maybe_available = false;
      switch (info->pool->query_type) {
      case VK_QUERY_TYPE_OCCLUSION: {
         const uint8_t *q_addr = ((uint8_t *) q->bo->map) + q->offset;
         uint32_t *counter = (uint32_t *) q_addr;
         *counter = 0;
         break;
      }
      case VK_QUERY_TYPE_TIMESTAMP:
         q->value = 0;
         break;
      default:
         unreachable("Unsupported query type");
      }
   }

   return VK_SUCCESS;
}

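/* CPU job: flags the queries recorded by vkCmdEndQuery as 'maybe available'
 * once this job runs on the queue.
 */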
static VkResult
handle_end_query_cpu_job(struct v3dv_job *job)
{
   struct v3dv_end_query_cpu_job_info *info = &job->cpu.query_end;
   for (uint32_t i = 0; i < info->count; i++) {
      assert(info->query + i < info->pool->query_count);
      struct v3dv_query *query = &info->pool->queries[info->query + i];
      query->maybe_available = true;
   }

   return VK_SUCCESS;
}

static VkResult
handle_copy_query_results_cpu_job(struct v3dv_job *job)
{
   struct v3dv_copy_query_results_cpu_job_info *info =
      &job->cpu.query_copy_results;

   assert(info->dst && info->dst->mem && info->dst->mem->bo);
   struct v3dv_bo *bo = info->dst->mem->bo;

   /* Map the entire dst buffer for the CPU copy if needed */
   assert(!bo->map || bo->map_size == bo->size);
   if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* FIXME: if flags includes VK_QUERY_RESULT_WAIT_BIT this could trigger a
    * sync wait on the CPU for the corresponding GPU jobs to finish. We might
    * want to use a submission thread to avoid blocking on the main thread.
    */
   uint8_t *offset = ((uint8_t *) bo->map) +
                     info->offset + info->dst->mem_offset;
   v3dv_get_query_pool_results_cpu(job->device,
                                   info->pool,
                                   info->first,
                                   info->count,
                                   offset,
                                   info->stride,
                                   info->flags);

   return VK_SUCCESS;
}

static VkResult
handle_set_event_cpu_job(struct v3dv_job *job, bool is_wait_thread)
{
   /* From the Vulkan 1.0 spec:
    *
    *    "When vkCmdSetEvent is submitted to a queue, it defines an execution
    *     dependency on commands that were submitted before it, and defines an
    *     event signal operation which sets the event to the signaled state.
    *     The first synchronization scope includes every command previously
    *     submitted to the same queue, including those in the same command
    *     buffer and batch".
    *
    * So we should wait for all prior work to be completed before signaling
    * the event. This includes all active CPU wait threads spawned for any
    * command buffer submitted *before* this.
    *
    * FIXME: we could avoid blocking the main thread for this if we used a
    * submission thread.
    */

   /* If we are calling this from a wait thread it will only wait for wait
    * threads spawned before it, otherwise it will wait for all active
    * threads to complete.
    */
   cpu_queue_wait_idle(&job->device->queue);

   VkResult result = gpu_queue_wait_idle(&job->device->queue);
   if (result != VK_SUCCESS)
      return result;

   struct v3dv_event_set_cpu_job_info *info = &job->cpu.event_set;
   p_atomic_set(&info->event->state, info->state);

   return VK_SUCCESS;
}

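/* Returns true if every event this wait job depends on has been signaled. */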
static bool
check_wait_events_complete(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);

   struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait;
   for (uint32_t i = 0; i < info->event_count; i++) {
      if (!p_atomic_read(&info->events[i]->state))
         return false;
   }
   return true;
}

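/* Flags the given wait thread as finished in the queue's submit wait list so
 * cpu_queue_wait_idle() stops waiting for it.
 */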
static void
wait_thread_finish(struct v3dv_queue *queue, pthread_t thread)
{
   mtx_lock(&queue->mutex);
   list_for_each_entry(struct v3dv_queue_submit_wait_info, info,
                       &queue->submit_wait_list, list_link) {
      for (uint32_t i = 0; i < info->wait_thread_count; i++) {
         if (info->wait_threads[i].thread == thread) {
            info->wait_threads[i].finished = true;
            goto done;
         }
      }
   }

   unreachable(!"Failed to finish wait thread: not found");

done:
   mtx_unlock(&queue->mutex);
}

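/* Entry point for a per-command-buffer event wait thread: polls until all
 * events the wait job depends on are signaled, then submits the remaining
 * jobs of the command buffer and marks itself as finished.
 */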
static void *
event_wait_thread_func(void *_job)
{
   struct v3dv_job *job = (struct v3dv_job *) _job;
   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
   struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait;

   /* Wait for events to be signaled */
   const useconds_t wait_interval_ms = 1;
   while (!check_wait_events_complete(job))
      usleep(wait_interval_ms * 1000);

   /* Now continue submitting pending jobs for the same command buffer after
    * the wait job.
    */
   struct v3dv_queue *queue = &job->device->queue;
   list_for_each_entry_from(struct v3dv_job, pjob, job->list_link.next,
                            &job->cmd_buffer->jobs, list_link) {
      /* We don't want to spawn more than one wait thread per command buffer.
       * If this job also requires a wait for events, we will do the wait here.
       */
      VkResult result = queue_submit_job(queue, pjob, info->sem_wait, NULL);
      if (result == VK_NOT_READY) {
         while (!check_wait_events_complete(pjob)) {
            usleep(wait_interval_ms * 1000);
         }
         result = VK_SUCCESS;
      }

      if (result != VK_SUCCESS) {
         fprintf(stderr, "Wait thread job execution failed.\n");
         goto done;
      }
   }

done:
   wait_thread_finish(queue, pthread_self());
   return NULL;
}

static VkResult
spawn_event_wait_thread(struct v3dv_job *job, pthread_t *wait_thread)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
   assert(job->cmd_buffer);
   assert(wait_thread != NULL);

   if (pthread_create(wait_thread, NULL, event_wait_thread_func, job))
      return vk_error(job->device, VK_ERROR_DEVICE_LOST);

   return VK_NOT_READY;
}

static VkResult
handle_wait_events_cpu_job(struct v3dv_job *job,
                           bool sem_wait,
                           pthread_t *wait_thread)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_WAIT_EVENTS);
   struct v3dv_event_wait_cpu_job_info *info = &job->cpu.event_wait;

   /* If all events are signaled then we are done and can continue submitting
    * the rest of the command buffer normally.
    */
   if (check_wait_events_complete(job))
      return VK_SUCCESS;

   /* Otherwise, we put the rest of the command buffer on a wait thread until
    * all events are signaled. We only spawn a new thread on the first
    * wait job we see for a command buffer, any additional wait jobs in the
    * same command buffer will run in that same wait thread and will get here
    * with a NULL wait_thread pointer.
    *
    * Also, whether we spawn a wait thread or not, we always return
    * VK_NOT_READY (unless an error happened), so we stop trying to submit
    * any jobs in the same command buffer after the wait job. The wait thread
    * will attempt to submit them after the wait completes.
    */
   info->sem_wait = sem_wait;
   if (wait_thread)
      return spawn_event_wait_thread(job, wait_thread);
   else
      return VK_NOT_READY;
}

static VkResult
handle_copy_buffer_to_image_cpu_job(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE);
   struct v3dv_copy_buffer_to_image_cpu_job_info *info =
      &job->cpu.copy_buffer_to_image;

   /* Wait for all GPU work to finish first, since we may be accessing
    * the BOs involved in the operation.
    */
   v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue));

   /* Map BOs */
   struct v3dv_bo *dst_bo = info->image->mem->bo;
   assert(!dst_bo->map || dst_bo->map_size == dst_bo->size);
   if (!dst_bo->map && !v3dv_bo_map(job->device, dst_bo, dst_bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
   void *dst_ptr = dst_bo->map;

   struct v3dv_bo *src_bo = info->buffer->mem->bo;
   assert(!src_bo->map || src_bo->map_size == src_bo->size);
   if (!src_bo->map && !v3dv_bo_map(job->device, src_bo, src_bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
   void *src_ptr = src_bo->map;

   const struct v3d_resource_slice *slice =
      &info->image->slices[info->mip_level];

   const struct pipe_box box = {
      info->image_offset.x, info->image_offset.y, info->base_layer,
      info->image_extent.width, info->image_extent.height, info->layer_count,
   };

   /* Copy each layer */
   for (uint32_t i = 0; i < info->layer_count; i++) {
      const uint32_t dst_offset =
         v3dv_layer_offset(info->image, info->mip_level, info->base_layer + i);
      const uint32_t src_offset =
         info->buffer->mem_offset + info->buffer_offset +
         info->buffer_layer_stride * i;
      v3d_store_tiled_image(
         dst_ptr + dst_offset, slice->stride,
         src_ptr + src_offset, info->buffer_stride,
         slice->tiling, info->image->cpp, slice->padded_height, &box);
   }

   return VK_SUCCESS;
}

static VkResult
handle_timestamp_query_cpu_job(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY);
   struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp;

   /* Wait for completion of all work queued before the timestamp query */
   v3dv_QueueWaitIdle(v3dv_queue_to_handle(&job->device->queue));

   /* Compute timestamp */
   struct timespec t;
   clock_gettime(CLOCK_MONOTONIC, &t);

   for (uint32_t i = 0; i < info->count; i++) {
      assert(info->query + i < info->pool->query_count);
      struct v3dv_query *query = &info->pool->queries[info->query + i];
      query->maybe_available = true;
      if (i == 0)
         query->value = t.tv_sec * 1000000000ull + t.tv_nsec;
   }

   return VK_SUCCESS;
}

static VkResult
handle_csd_job(struct v3dv_queue *queue,
               struct v3dv_job *job,
               bool do_sem_wait);

static VkResult
handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
                            struct v3dv_job *job,
                            bool do_sem_wait)
{
   assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
   struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect;
   assert(info->csd_job);

   /* Make sure the GPU is no longer using the indirect buffer */
   assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
   v3dv_bo_wait(queue->device, info->buffer->mem->bo, PIPE_TIMEOUT_INFINITE);

   /* Map the indirect buffer and read the dispatch parameters */
   assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
   struct v3dv_bo *bo = info->buffer->mem->bo;
   if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
   assert(bo->map);

   const uint32_t offset = info->buffer->mem_offset + info->offset;
   const uint32_t *group_counts = (uint32_t *) (bo->map + offset);
   if (group_counts[0] == 0 || group_counts[1] == 0 || group_counts[2] == 0)
      return VK_SUCCESS;

   if (memcmp(group_counts, info->csd_job->csd.wg_count,
              sizeof(info->csd_job->csd.wg_count)) != 0) {
      v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts);
   }

   handle_csd_job(queue, info->csd_job, do_sem_wait);

   return VK_SUCCESS;
}

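/* Signals the given semaphores by exporting the device's last_job_sync
 * syncobj as a sync file and importing that fence into each semaphore's
 * syncobj (the temporary payload if one is installed, otherwise the
 * permanent one).
 */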
static VkResult
process_semaphores_to_signal(struct v3dv_device *device,
                             uint32_t count, const VkSemaphore *sems)
{
   if (count == 0)
      return VK_SUCCESS;

   int render_fd = device->pdevice->render_fd;

   int fd;
   mtx_lock(&device->mutex);
   drmSyncobjExportSyncFile(render_fd, device->last_job_sync, &fd);
   mtx_unlock(&device->mutex);
   if (fd == -1)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < count; i++) {
      struct v3dv_semaphore *sem = v3dv_semaphore_from_handle(sems[i]);

      int ret;
      if (!sem->temp_sync)
         ret = drmSyncobjImportSyncFile(render_fd, sem->sync, fd);
      else
         ret = drmSyncobjImportSyncFile(render_fd, sem->temp_sync, fd);

      if (ret) {
         result = VK_ERROR_OUT_OF_HOST_MEMORY;
         break;
      }
   }

   assert(fd >= 0);
   close(fd);

   return result;
}

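/* Signals the submit fence, if any, by importing the sync file exported from
 * the device's last_job_sync syncobj into the fence's syncobj (temporary
 * payload if present, permanent otherwise).
 */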
static VkResult
process_fence_to_signal(struct v3dv_device *device, VkFence _fence)
{
   if (_fence == VK_NULL_HANDLE)
      return VK_SUCCESS;

   struct v3dv_fence *fence = v3dv_fence_from_handle(_fence);

   int render_fd = device->pdevice->render_fd;

   int fd;
   mtx_lock(&device->mutex);
   drmSyncobjExportSyncFile(render_fd, device->last_job_sync, &fd);
   mtx_unlock(&device->mutex);
   if (fd == -1)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   int ret;
   if (!fence->temp_sync)
      ret = drmSyncobjImportSyncFile(render_fd, fence->sync, fd);
   else
      ret = drmSyncobjImportSyncFile(render_fd, fence->temp_sync, fd);

   assert(fd >= 0);
   close(fd);

   return ret ? VK_ERROR_OUT_OF_HOST_MEMORY : VK_SUCCESS;
}

static VkResult
handle_cl_job(struct v3dv_queue *queue,
              struct v3dv_job *job,
              bool do_sem_wait)
{
   struct v3dv_device *device = queue->device;

   struct drm_v3d_submit_cl submit = { 0 };

   /* Sanity check: we should only flag a bcl sync on a job that needs to be
    * serialized.
    */
   assert(job->serialize || !job->needs_bcl_sync);

   /* We expect to have just one RCL per job, which should fit in just one BO.
    * Our BCL could chain multiple BOs together, though.
    */
   assert(list_length(&job->rcl.bo_list) == 1);
   assert(list_length(&job->bcl.bo_list) >= 1);
   struct v3dv_bo *bcl_first_bo =
      list_first_entry(&job->bcl.bo_list, struct v3dv_bo, list_link);
   submit.bcl_start = bcl_first_bo->offset;
   submit.bcl_end = job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
   submit.rcl_start = job->rcl.bo->offset;
   submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);

   submit.qma = job->tile_alloc->offset;
   submit.qms = job->tile_alloc->size;
   submit.qts = job->tile_state->offset;

   submit.flags = 0;
   if (job->tmu_dirty_rcl)
      submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;

   submit.bo_handle_count = job->bo_count;
   uint32_t *bo_handles =
      (uint32_t *) malloc(sizeof(uint32_t) * submit.bo_handle_count);
   uint32_t bo_idx = 0;
   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
      bo_handles[bo_idx++] = bo->handle;
   }
   assert(bo_idx == submit.bo_handle_count);
   submit.bo_handles = (uintptr_t)(void *)bo_handles;

   /* We need a binning sync if we are waiting on a semaphore (do_sem_wait) or
    * if the job comes after a pipeline barrier that involves geometry stages
    * (needs_bcl_sync).
    *
    * We need a render sync if the job doesn't need a binning sync but has
    * still been flagged for serialization. It should be noted that RCL jobs
    * don't start until the previous RCL job has finished so we don't really
    * need to add a fence for those, however, we might need to wait on a CSD or
    * TFU job, which are not automatically serialized with CL jobs.
    *
    * FIXME: for now, if we are asked to wait on any semaphores, we just wait
    * on the last job we submitted. In the future we might want to pass the
    * actual syncobj of the wait semaphores so we don't block on the last RCL
    * if we only need to wait for a previous CSD or TFU, for example, but
    * we would have to extend our kernel interface to support the case where
    * we have more than one semaphore to wait on.
    */
   const bool needs_bcl_sync = do_sem_wait || job->needs_bcl_sync;
   const bool needs_rcl_sync = job->serialize && !needs_bcl_sync;

   mtx_lock(&queue->device->mutex);
   submit.in_sync_bcl = needs_bcl_sync ? device->last_job_sync : 0;
   submit.in_sync_rcl = needs_rcl_sync ? device->last_job_sync : 0;
   submit.out_sync = device->last_job_sync;
   v3dv_clif_dump(device, job, &submit);
   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_CL, &submit);
   mtx_unlock(&queue->device->mutex);

   static bool warned = false;
   if (ret && !warned) {
      fprintf(stderr, "Draw call returned %s. Expect corruption.\n",
              strerror(errno));
      warned = true;
   }

   free(bo_handles);

   if (ret)
      return vk_error(device, VK_ERROR_DEVICE_LOST);

   return VK_SUCCESS;
}

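/* Submits a TFU job to the kernel. The job waits on the device's last
 * submitted job when a semaphore wait or serialization requires it, and
 * updates last_job_sync as its out fence.
 */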
static VkResult
handle_tfu_job(struct v3dv_queue *queue,
               struct v3dv_job *job,
               bool do_sem_wait)
{
   struct v3dv_device *device = queue->device;

   const bool needs_sync = do_sem_wait || job->serialize;

   mtx_lock(&device->mutex);
   job->tfu.in_sync = needs_sync ? device->last_job_sync : 0;
   job->tfu.out_sync = device->last_job_sync;
   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);
   mtx_unlock(&device->mutex);

   if (ret != 0) {
      fprintf(stderr, "Failed to submit TFU job: %d\n", ret);
      return vk_error(device, VK_ERROR_DEVICE_LOST);
   }

   return VK_SUCCESS;
}

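/* Submits a CSD (compute shader dispatch) job to the kernel, gathering the
 * BO handles referenced by the job and wiring in_sync/out_sync to the
 * device's last_job_sync syncobj.
 */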
static VkResult
handle_csd_job(struct v3dv_queue *queue,
               struct v3dv_job *job,
               bool do_sem_wait)
{
   struct v3dv_device *device = queue->device;

   struct drm_v3d_submit_csd *submit = &job->csd.submit;

   submit->bo_handle_count = job->bo_count;
   uint32_t *bo_handles =
      (uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2));
   uint32_t bo_idx = 0;
   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
      bo_handles[bo_idx++] = bo->handle;
   }
   assert(bo_idx == submit->bo_handle_count);
   submit->bo_handles = (uintptr_t)(void *)bo_handles;

   const bool needs_sync = do_sem_wait || job->serialize;

   mtx_lock(&queue->device->mutex);
   submit->in_sync = needs_sync ? device->last_job_sync : 0;
   submit->out_sync = device->last_job_sync;
   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_CSD, submit);
   mtx_unlock(&queue->device->mutex);

   static bool warned = false;
   if (ret && !warned) {
      fprintf(stderr, "Compute dispatch returned %s. Expect corruption.\n",
              strerror(errno));
      warned = true;
   }

   free(bo_handles);

   if (ret)
      return vk_error(device, VK_ERROR_DEVICE_LOST);

   return VK_SUCCESS;
}

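/* Dispatches a single job to the handler that matches its type: GPU jobs
 * (CL, TFU, CSD) go to the kernel, CPU jobs are executed here on the host.
 */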
static VkResult
queue_submit_job(struct v3dv_queue *queue,
                 struct v3dv_job *job,
                 bool do_sem_wait,
                 pthread_t *wait_thread)
{
   assert(job);

   switch (job->type) {
   case V3DV_JOB_TYPE_GPU_CL:
      return handle_cl_job(queue, job, do_sem_wait);
   case V3DV_JOB_TYPE_GPU_TFU:
      return handle_tfu_job(queue, job, do_sem_wait);
   case V3DV_JOB_TYPE_GPU_CSD:
      return handle_csd_job(queue, job, do_sem_wait);
   case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
      return handle_reset_query_cpu_job(job);
   case V3DV_JOB_TYPE_CPU_END_QUERY:
      return handle_end_query_cpu_job(job);
   case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
      return handle_copy_query_results_cpu_job(job);
   case V3DV_JOB_TYPE_CPU_SET_EVENT:
      return handle_set_event_cpu_job(job, wait_thread != NULL);
   case V3DV_JOB_TYPE_CPU_WAIT_EVENTS:
      return handle_wait_events_cpu_job(job, do_sem_wait, wait_thread);
   case V3DV_JOB_TYPE_CPU_COPY_BUFFER_TO_IMAGE:
      return handle_copy_buffer_to_image_cpu_job(job);
   case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
      return handle_csd_indirect_cpu_job(queue, job, do_sem_wait);
   case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY:
      return handle_timestamp_query_cpu_job(job);
   default:
      unreachable("Unhandled job type");
   }
}

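/* Lazily creates a reusable no-op GPU job for this queue. It is submitted
 * when a batch has no actual work but still needs to honor its wait and
 * signal semaphores and fence.
 */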
static VkResult
queue_create_noop_job(struct v3dv_queue *queue)
{
   struct v3dv_device *device = queue->device;
   queue->noop_job = vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_job), 8,
                               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!queue->noop_job)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   v3dv_job_init(queue->noop_job, V3DV_JOB_TYPE_GPU_CL, device, NULL, -1);

   v3dv_X(device, job_emit_noop)(queue->noop_job);

   return VK_SUCCESS;
}

static VkResult
queue_submit_noop_job(struct v3dv_queue *queue, const VkSubmitInfo *pSubmit)
{
   /* VkQueue host access is externally synchronized so we don't need to lock
    * here for the static variable.
    */
   if (!queue->noop_job) {
      VkResult result = queue_create_noop_job(queue);
      if (result != VK_SUCCESS)
         return result;
   }

   return queue_submit_job(queue, queue->noop_job,
                           pSubmit->waitSemaphoreCount > 0, NULL);
}

static VkResult
queue_submit_cmd_buffer(struct v3dv_queue *queue,
                        struct v3dv_cmd_buffer *cmd_buffer,
                        const VkSubmitInfo *pSubmit,
                        pthread_t *wait_thread)
{
   assert(cmd_buffer);
   assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_EXECUTABLE);

   if (list_is_empty(&cmd_buffer->jobs))
      return queue_submit_noop_job(queue, pSubmit);

   list_for_each_entry_safe(struct v3dv_job, job,
                            &cmd_buffer->jobs, list_link) {
      VkResult result = queue_submit_job(queue, job,
                                         pSubmit->waitSemaphoreCount > 0,
                                         wait_thread);
      if (result != VK_SUCCESS)
         return result;
   }

   return VK_SUCCESS;
}

static void
add_wait_thread_to_list(struct v3dv_device *device,
                        pthread_t thread,
                        struct v3dv_queue_submit_wait_info **wait_info)
{
   /* If this is the first time we spawn a wait thread for this queue
    * submission create a v3dv_queue_submit_wait_info to track this and
    * any other threads in the same submission and add it to the global list
    * in the queue.
    */
   if (*wait_info == NULL) {
      *wait_info =
         vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_queue_submit_wait_info), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
      (*wait_info)->device = device;
   }

   /* And add the thread to the list of wait threads for this submission */
   const uint32_t thread_idx = (*wait_info)->wait_thread_count;
   assert(thread_idx < 16);
   (*wait_info)->wait_threads[thread_idx].thread = thread;
   (*wait_info)->wait_threads[thread_idx].finished = false;
   (*wait_info)->wait_thread_count++;
}

static void
add_signal_semaphores_to_wait_list(struct v3dv_device *device,
                                   const VkSubmitInfo *pSubmit,
                                   struct v3dv_queue_submit_wait_info *wait_info)
{
   assert(wait_info);

   if (pSubmit->signalSemaphoreCount == 0)
      return;

   /* FIXME: We put all the semaphores in a list and we signal all of them
    * together from the submit master thread when the last wait thread in the
    * submit completes. We could do better though: group the semaphores per
    * submit and signal them as soon as all wait threads for a particular
    * submit complete. Not sure if the extra work would be worth it though,
    * since we only spawn wait threads for event waits and only when the
    * event is set from the host after the queue submission.
    */

   /* Check the size of the current semaphore list */
   const uint32_t prev_count = wait_info->signal_semaphore_count;
   const uint32_t prev_alloc_size = prev_count * sizeof(VkSemaphore);
   VkSemaphore *prev_list = wait_info->signal_semaphores;

   /* Resize the list to hold the additional semaphores */
   const uint32_t extra_alloc_size =
      pSubmit->signalSemaphoreCount * sizeof(VkSemaphore);
   wait_info->signal_semaphore_count += pSubmit->signalSemaphoreCount;
   wait_info->signal_semaphores =
      vk_alloc(&device->vk.alloc, prev_alloc_size + extra_alloc_size, 8,
               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   /* Copy the old list to the new allocation and free the old list */
   if (prev_count > 0) {
      memcpy(wait_info->signal_semaphores, prev_list, prev_alloc_size);
      vk_free(&device->vk.alloc, prev_list);
   }

   /* Add the new semaphores to the list */
   memcpy(wait_info->signal_semaphores + prev_count,
          pSubmit->pSignalSemaphores, extra_alloc_size);
}

static VkResult
queue_submit_cmd_buffer_batch(struct v3dv_queue *queue,
                              const VkSubmitInfo *pSubmit,
                              struct v3dv_queue_submit_wait_info **wait_info)
{
   VkResult result = VK_SUCCESS;
   bool has_wait_threads = false;

   /* Even if we don't have any actual work to submit we still need to wait
    * on the wait semaphores and signal the signal semaphores and fence, so
    * in this scenario we just submit a trivial no-op job so we don't have
    * to do anything special, it should not be a common case anyway.
    */
   if (pSubmit->commandBufferCount == 0) {
      result = queue_submit_noop_job(queue, pSubmit);
   } else {
      for (uint32_t i = 0; i < pSubmit->commandBufferCount; i++) {
         pthread_t wait_thread;
         struct v3dv_cmd_buffer *cmd_buffer =
            v3dv_cmd_buffer_from_handle(pSubmit->pCommandBuffers[i]);
         result = queue_submit_cmd_buffer(queue, cmd_buffer, pSubmit,
                                          &wait_thread);

         /* We get VK_NOT_READY if we had to spawn a wait thread for the
          * command buffer. In that scenario, we want to continue submitting
          * any pending command buffers in the batch, but we don't want to
          * process any signal semaphores for the batch until we know we have
          * submitted every job for every command buffer in the batch.
          */
         if (result == VK_NOT_READY) {
            result = VK_SUCCESS;
            add_wait_thread_to_list(queue->device, wait_thread, wait_info);
            has_wait_threads = true;
         }

         if (result != VK_SUCCESS)
            break;
      }
   }

   if (result != VK_SUCCESS)
      return result;

   /* If we had to spawn any wait threads in this submit we need to wait for
    * all of them to complete before we can signal any semaphores.
    */
   if (!has_wait_threads) {
      return process_semaphores_to_signal(queue->device,
                                          pSubmit->signalSemaphoreCount,
                                          pSubmit->pSignalSemaphores);
   } else {
      assert(*wait_info);
      add_signal_semaphores_to_wait_list(queue->device, pSubmit, *wait_info);
      return VK_NOT_READY;
   }
}

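/* Master wait thread for a vkQueueSubmit call that spawned event wait
 * threads: joins all of them, then signals the submit's semaphores and
 * fence and releases the wait info.
 */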
static void *
master_wait_thread_func(void *_wait_info)
{
   struct v3dv_queue_submit_wait_info *wait_info =
      (struct v3dv_queue_submit_wait_info *) _wait_info;

   struct v3dv_queue *queue = &wait_info->device->queue;

   /* Wait for all command buffer wait threads to complete */
   for (uint32_t i = 0; i < wait_info->wait_thread_count; i++) {
      int res = pthread_join(wait_info->wait_threads[i].thread, NULL);
      if (res != 0)
         fprintf(stderr, "Wait thread failed to join.\n");
   }

   /* Signal semaphores and fences */
   VkResult result;
   result = process_semaphores_to_signal(wait_info->device,
                                         wait_info->signal_semaphore_count,
                                         wait_info->signal_semaphores);
   if (result != VK_SUCCESS)
      fprintf(stderr, "Wait thread semaphore signaling failed.");

   result = process_fence_to_signal(wait_info->device, wait_info->fence);
   if (result != VK_SUCCESS)
      fprintf(stderr, "Wait thread fence signaling failed.");

   /* Release wait_info */
   mtx_lock(&queue->mutex);
   list_del(&wait_info->list_link);
   mtx_unlock(&queue->mutex);

   vk_free(&wait_info->device->vk.alloc, wait_info->signal_semaphores);
   vk_free(&wait_info->device->vk.alloc, wait_info);

   return NULL;
}

static VkResult
spawn_master_wait_thread(struct v3dv_queue *queue,
                         struct v3dv_queue_submit_wait_info *wait_info)
{
   VkResult result = VK_SUCCESS;

   mtx_lock(&queue->mutex);
   if (pthread_create(&wait_info->master_wait_thread, NULL,
                      master_wait_thread_func, wait_info)) {
      result = vk_error(queue, VK_ERROR_DEVICE_LOST);
      goto done;
   }

   list_addtail(&wait_info->list_link, &queue->submit_wait_list);

done:
   mtx_unlock(&queue->mutex);
   return result;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_QueueSubmit(VkQueue _queue,
                 uint32_t submitCount,
                 const VkSubmitInfo *pSubmits,
                 VkFence fence)
{
   V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);

   struct v3dv_queue_submit_wait_info *wait_info = NULL;

   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < submitCount; i++) {
      result = queue_submit_cmd_buffer_batch(queue, &pSubmits[i], &wait_info);
      if (result != VK_SUCCESS && result != VK_NOT_READY)
         goto done;
   }

   if (!wait_info) {
      assert(result != VK_NOT_READY);
      result = process_fence_to_signal(queue->device, fence);
      goto done;
   }

   /* We spawned wait threads, so we have to spawn a master thread for this
    * queue submission that waits for all other threads to complete and then
    * signals any semaphores and fences.
    */
   assert(wait_info);
   wait_info->fence = fence;
   result = spawn_master_wait_thread(queue, wait_info);

done:
   return result;
}

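/* Destroys a DRM syncobj and clears the handle so it is safe to call this
 * again on the same variable (e.g. for the optional temporary payloads).
 */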
static void
destroy_syncobj(uint32_t device_fd, uint32_t *sync)
{
   assert(sync);
   drmSyncobjDestroy(device_fd, *sync);
   *sync = 0;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_CreateSemaphore(VkDevice _device,
                     const VkSemaphoreCreateInfo *pCreateInfo,
                     const VkAllocationCallbacks *pAllocator,
                     VkSemaphore *pSemaphore)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO);

   struct v3dv_semaphore *sem =
      vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_semaphore),
                       VK_OBJECT_TYPE_SEMAPHORE);
   if (sem == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   int ret = drmSyncobjCreate(device->pdevice->render_fd, 0, &sem->sync);
   if (ret) {
      vk_object_free(&device->vk, pAllocator, sem);
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   *pSemaphore = v3dv_semaphore_to_handle(sem);

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_GetPhysicalDeviceExternalSemaphoreProperties(
   VkPhysicalDevice physicalDevice,
   const VkPhysicalDeviceExternalSemaphoreInfo *pExternalSemaphoreInfo,
   VkExternalSemaphoreProperties *pExternalSemaphoreProperties)
{
   switch (pExternalSemaphoreInfo->handleType) {
   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT:
   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT:
      pExternalSemaphoreProperties->exportFromImportedHandleTypes =
         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT |
         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
      pExternalSemaphoreProperties->compatibleHandleTypes =
         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT |
         VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;

      /* FIXME: we can't import external semaphores until we improve the kernel
       * submit interface to handle multiple in syncobjs, because once we have
       * an imported semaphore in our list of semaphores to wait on, we can no
       * longer use the workaround of waiting on the last syncobj fence produced
       * from the device, since the imported semaphore may not (and in fact, it
       * would typically not) have been produced from the same device.
       *
       * This behavior is exercised via dEQP-VK.synchronization.cross_instance.*.
       * Particularly, this test:
       * dEQP-VK.synchronization.cross_instance.dedicated.
       * write_ssbo_compute_read_vertex_input.buffer_16384_binary_semaphore_fd
       * fails consistently because of this, so it'll be a good reference to
       * verify the implementation when the kernel bits are in place.
       */
      pExternalSemaphoreProperties->externalSemaphoreFeatures = 0;

      /* FIXME: See comment in GetPhysicalDeviceExternalFenceProperties
       * for details on why we can't export to SYNC_FD.
       */
      if (pExternalSemaphoreInfo->handleType !=
          VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) {
         pExternalSemaphoreProperties->externalSemaphoreFeatures |=
            VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT;
      }
      break;
   default:
      pExternalSemaphoreProperties->exportFromImportedHandleTypes = 0;
      pExternalSemaphoreProperties->compatibleHandleTypes = 0;
      pExternalSemaphoreProperties->externalSemaphoreFeatures = 0;
      break;
   }
}

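/* Imports a semaphore payload from a file descriptor. SYNC_FD imports (and
 * any import with the TEMPORARY flag) install the payload as a temporary
 * syncobj; OPAQUE_FD imports without the flag replace the permanent syncobj.
 */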
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_ImportSemaphoreFdKHR(
   VkDevice _device,
   const VkImportSemaphoreFdInfoKHR *pImportSemaphoreFdInfo)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_semaphore, sem, pImportSemaphoreFdInfo->semaphore);

   assert(pImportSemaphoreFdInfo->sType ==
          VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR);

   int fd = pImportSemaphoreFdInfo->fd;
   int render_fd = device->pdevice->render_fd;

   bool is_temporary =
      pImportSemaphoreFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT ||
      (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT);

   uint32_t new_sync;
   switch (pImportSemaphoreFdInfo->handleType) {
   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: {
      /* "If handleType is VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT, the
       *  special value -1 for fd is treated like a valid sync file descriptor
       *  referring to an object that has already signaled. The import
       *  operation will succeed and the VkSemaphore will have a temporarily
       *  imported payload as if a valid file descriptor had been provided."
       */
      unsigned flags = fd == -1 ? DRM_SYNCOBJ_CREATE_SIGNALED : 0;
      if (drmSyncobjCreate(render_fd, flags, &new_sync))
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

      if (fd != -1) {
         if (drmSyncobjImportSyncFile(render_fd, new_sync, fd)) {
            drmSyncobjDestroy(render_fd, new_sync);
            return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
         }
      }
      break;
   }
   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: {
      if (drmSyncobjFDToHandle(render_fd, fd, &new_sync))
         return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
      break;
   }
   default:
      return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
   }

   destroy_syncobj(render_fd, &sem->temp_sync);
   if (is_temporary) {
      sem->temp_sync = new_sync;
   } else {
      destroy_syncobj(render_fd, &sem->sync);
      sem->sync = new_sync;
   }

   /* From the Vulkan 1.0.53 spec:
    *
    *    "Importing a semaphore payload from a file descriptor transfers
    *     ownership of the file descriptor from the application to the
    *     Vulkan implementation. The application must not perform any
    *     operations on the file descriptor after a successful import."
    *
    * If the import fails, we leave the file descriptor open.
    */
   if (fd != -1)
      close(fd);

   return VK_SUCCESS;
}

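/* Exports the semaphore's permanent syncobj, either as a sync file (SYNC_FD)
 * or as a syncobj file descriptor (OPAQUE_FD).
 */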
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_GetSemaphoreFdKHR(VkDevice _device,
                       const VkSemaphoreGetFdInfoKHR *pGetFdInfo,
                       int *pFd)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_semaphore, sem, pGetFdInfo->semaphore);

   assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR);

   *pFd = -1;
   int render_fd = device->pdevice->render_fd;
   switch (pGetFdInfo->handleType) {
   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT:
      drmSyncobjExportSyncFile(render_fd, sem->sync, pFd);
      if (*pFd == -1)
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      break;
   case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT:
      drmSyncobjHandleToFD(render_fd, sem->sync, pFd);
      if (*pFd == -1)
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      break;
   default:
      unreachable("Unsupported external semaphore handle type");
   }

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_DestroySemaphore(VkDevice _device,
                      VkSemaphore semaphore,
                      const VkAllocationCallbacks *pAllocator)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_semaphore, sem, semaphore);

   if (sem == NULL)
      return;

   destroy_syncobj(device->pdevice->render_fd, &sem->sync);
   destroy_syncobj(device->pdevice->render_fd, &sem->temp_sync);

   vk_object_free(&device->vk, pAllocator, sem);
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_CreateFence(VkDevice _device,
                 const VkFenceCreateInfo *pCreateInfo,
                 const VkAllocationCallbacks *pAllocator,
                 VkFence *pFence)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO);

   struct v3dv_fence *fence =
      vk_object_zalloc(&device->vk, pAllocator, sizeof(struct v3dv_fence),
                       VK_OBJECT_TYPE_FENCE);
   if (fence == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   unsigned flags = 0;
   if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT)
      flags |= DRM_SYNCOBJ_CREATE_SIGNALED;
   int ret = drmSyncobjCreate(device->pdevice->render_fd, flags, &fence->sync);
   if (ret) {
      vk_object_free(&device->vk, pAllocator, fence);
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   *pFence = v3dv_fence_to_handle(fence);

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_GetPhysicalDeviceExternalFenceProperties(
   VkPhysicalDevice physicalDevice,
   const VkPhysicalDeviceExternalFenceInfo *pExternalFenceInfo,
   VkExternalFenceProperties *pExternalFenceProperties)
{
   switch (pExternalFenceInfo->handleType) {
   case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT:
   case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT:
      pExternalFenceProperties->exportFromImportedHandleTypes =
         VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT |
         VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT;
      pExternalFenceProperties->compatibleHandleTypes =
         VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT |
         VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT;
      pExternalFenceProperties->externalFenceFeatures =
         VK_EXTERNAL_FENCE_FEATURE_IMPORTABLE_BIT;

      /* FIXME: SYNC_FD exports the actual fence referenced by the syncobj, not
       * the syncobj itself, and that fence is only created after we have
       * submitted to the kernel and updated the syncobj for the fence to import
       * the actual DRM fence created with the submission. Unfortunately, if the
       * queue submission has a 'wait for events' we may hold any jobs after the
       * wait in a user-space thread until the events are signaled, and in that
       * case we don't update the out fence of the submit until the events are
       * signaled and we can submit all the jobs involved with the vkQueueSubmit
       * call. This means that if the application submits with an out fence and
       * a wait for events, trying to export the out fence to a SYNC_FD right
       * after the submission and before the events are signaled will fail,
       * because the actual DRM fence won't exist yet. This is not a problem
       * with OPAQUE_FD because in this case we export the entire syncobj, not
       * the underlying DRM fence. To fix this we need to rework our kernel
       * interface to be more flexible and accept multiple in/out syncobjs so
       * we can implement event waits as regular fence waits on the kernel side;
       * until then, we can only reliably export OPAQUE_FD.
       */
      if (pExternalFenceInfo->handleType !=
          VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT) {
         pExternalFenceProperties->externalFenceFeatures |=
            VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT;
      }
      break;
   default:
      pExternalFenceProperties->exportFromImportedHandleTypes = 0;
      pExternalFenceProperties->compatibleHandleTypes = 0;
      pExternalFenceProperties->externalFenceFeatures = 0;
      break;
   }
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_ImportFenceFdKHR(VkDevice _device,
                      const VkImportFenceFdInfoKHR *pImportFenceFdInfo)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_fence, fence, pImportFenceFdInfo->fence);

   assert(pImportFenceFdInfo->sType ==
          VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR);

   int fd = pImportFenceFdInfo->fd;
   int render_fd = device->pdevice->render_fd;

   bool is_temporary =
      pImportFenceFdInfo->handleType == VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT ||
      (pImportFenceFdInfo->flags & VK_FENCE_IMPORT_TEMPORARY_BIT);

   uint32_t new_sync;
   switch (pImportFenceFdInfo->handleType) {
   case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: {
      /* "If handleType is VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT, the
       *  special value -1 for fd is treated like a valid sync file descriptor
       *  referring to an object that has already signaled. The import
       *  operation will succeed and the VkFence will have a temporarily
       *  imported payload as if a valid file descriptor had been provided."
       */
      unsigned flags = fd == -1 ? DRM_SYNCOBJ_CREATE_SIGNALED : 0;
      if (drmSyncobjCreate(render_fd, flags, &new_sync))
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

      if (fd != -1) {
         if (drmSyncobjImportSyncFile(render_fd, new_sync, fd)) {
            drmSyncobjDestroy(render_fd, new_sync);
            return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
         }
      }
      break;
   }
   case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: {
      if (drmSyncobjFDToHandle(render_fd, fd, &new_sync))
         return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
      break;
   }
   default:
      return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
   }

   destroy_syncobj(render_fd, &fence->temp_sync);
   if (is_temporary) {
      fence->temp_sync = new_sync;
   } else {
      destroy_syncobj(render_fd, &fence->sync);
      fence->sync = new_sync;
   }

   /* From the Vulkan 1.0.53 spec:
    *
    *    "Importing a fence payload from a file descriptor transfers
    *     ownership of the file descriptor from the application to the
    *     Vulkan implementation. The application must not perform any
    *     operations on the file descriptor after a successful import."
    *
    * If the import fails, we leave the file descriptor open.
    */
   if (fd != -1)
      close(fd);

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_DestroyFence(VkDevice _device,
                  VkFence _fence,
                  const VkAllocationCallbacks *pAllocator)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_fence, fence, _fence);

   if (fence == NULL)
      return;

   destroy_syncobj(device->pdevice->render_fd, &fence->sync);
   destroy_syncobj(device->pdevice->render_fd, &fence->temp_sync);

   vk_object_free(&device->vk, pAllocator, fence);
}

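/* Returns VK_NOT_READY if the fence's syncobj has no signaled fence yet
 * (a zero-timeout wait that returns -ETIME), VK_SUCCESS once it has signaled.
 */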
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_GetFenceStatus(VkDevice _device, VkFence _fence)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_fence, fence, _fence);

   int ret = drmSyncobjWait(device->pdevice->render_fd, &fence->sync, 1,
                            0, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, NULL);
   if (ret == -ETIME)
      return VK_NOT_READY;
   else if (ret)
      return vk_error(device, VK_ERROR_DEVICE_LOST);
   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_GetFenceFdKHR(VkDevice _device,
                   const VkFenceGetFdInfoKHR *pGetFdInfo,
                   int *pFd)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_fence, fence, pGetFdInfo->fence);

   assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR);

   *pFd = -1;
   int render_fd = device->pdevice->render_fd;
   switch (pGetFdInfo->handleType) {
   case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT:
      drmSyncobjExportSyncFile(render_fd, fence->sync, pFd);
      if (*pFd == -1)
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      break;
   case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT:
      drmSyncobjHandleToFD(render_fd, fence->sync, pFd);
      if (*pFd == -1)
         return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      break;
   default:
      unreachable("Unsupported external fence handle type");
   }

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_ResetFences(VkDevice _device, uint32_t fenceCount, const VkFence *pFences)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);

   uint32_t *syncobjs = vk_alloc(&device->vk.alloc,
                                 sizeof(*syncobjs) * fenceCount, 8,
                                 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!syncobjs)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   int render_fd = device->pdevice->render_fd;
   uint32_t reset_count = 0;
   for (uint32_t i = 0; i < fenceCount; i++) {
      struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[i]);
      /* From the Vulkan spec, section 'Importing Fence Payloads':
       *
       *    "If the import is temporary, the fence will be restored to its
       *     permanent state the next time that fence is passed to
       *     vkResetFences.
       *
       *     Note: Restoring a fence to its prior permanent payload is a
       *     distinct operation from resetting a fence payload."
       *
       * To restore the previous state, we just need to destroy the temporary.
       */
      if (fence->temp_sync)
         destroy_syncobj(render_fd, &fence->temp_sync);
      else
         syncobjs[reset_count++] = fence->sync;
   }

   int ret = 0;
   if (reset_count > 0)
      ret = drmSyncobjReset(render_fd, syncobjs, reset_count);

   vk_free(&device->vk.alloc, syncobjs);

   if (ret)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   return VK_SUCCESS;
}

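/* Waits on the fences' syncobjs (temporary payload if installed), retrying
 * while the syncobj wait times out before the absolute deadline computed
 * from the caller's timeout.
 */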
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_WaitForFences(VkDevice _device,
                   uint32_t fenceCount,
                   const VkFence *pFences,
                   VkBool32 waitAll,
                   uint64_t timeout)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);

   const uint64_t abs_timeout = get_absolute_timeout(timeout);

   uint32_t *syncobjs = vk_alloc(&device->vk.alloc,
                                 sizeof(*syncobjs) * fenceCount, 8,
                                 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!syncobjs)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   for (uint32_t i = 0; i < fenceCount; i++) {
      struct v3dv_fence *fence = v3dv_fence_from_handle(pFences[i]);
      syncobjs[i] = fence->temp_sync ? fence->temp_sync : fence->sync;
   }

   unsigned flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT;
   if (waitAll)
      flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL;

   int ret;
   do {
      ret = drmSyncobjWait(device->pdevice->render_fd, syncobjs, fenceCount,
                           timeout, flags, NULL);
   } while (ret == -ETIME && gettime_ns() < abs_timeout);

   vk_free(&device->vk.alloc, syncobjs);

   if (ret == -ETIME)
      return VK_TIMEOUT;
   else if (ret)
      return vk_error(device, VK_ERROR_DEVICE_LOST);
   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_QueueBindSparse(VkQueue _queue,
                     uint32_t bindInfoCount,
                     const VkBindSparseInfo *pBindInfo,
                     VkFence fence)
{
   V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
   return vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT);
}