/*
 * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "drm-uapi/v3d_drm.h"

#include "broadcom/clif/clif_dump.h"
#include "util/libsync.h"
#include "util/os_time.h"
#include "vk_drm_syncobj.h"

#include <errno.h>
#include <time.h>

static void
v3dv_clif_dump(struct v3dv_device *device,
               struct v3dv_job *job,
               struct drm_v3d_submit_cl *submit)
{
   if (!(V3D_DBG(CL) ||
         V3D_DBG(CL_NO_BIN) ||
         V3D_DBG(CLIF)))
      return;

   struct clif_dump *clif = clif_dump_init(&device->devinfo,
                                           stderr,
                                           V3D_DBG(CL) ||
                                           V3D_DBG(CL_NO_BIN),
                                           V3D_DBG(CL_NO_BIN));

   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (void *)entry->key;
      char *name = ralloc_asprintf(NULL, "%s_0x%x",
                                   bo->name, bo->offset);

      bool ok = v3dv_bo_map(device, bo, bo->size);
      if (!ok) {
         fprintf(stderr, "failed to map BO for clif_dump.\n");
         ralloc_free(name);
         goto free_clif;
      }
      clif_dump_add_bo(clif, name, bo->offset, bo->size, bo->map);

      ralloc_free(name);
   }

   clif_dump(clif, submit);

free_clif:
   clif_dump_destroy(clif);
}

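/* Waits for all jobs previously submitted from this queue to complete: with
 * multisync this waits on every per-queue-type syncobj (and on the wait
 * semaphores directly if no job has consumed them yet); without multisync it
 * waits on the single QUEUE_ANY syncobj.
 */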
static VkResult
queue_wait_idle(struct v3dv_queue *queue,
                struct v3dv_submit_sync_info *sync_info)
{
   if (queue->device->pdevice->caps.multisync) {
      int ret = drmSyncobjWait(queue->device->pdevice->render_fd,
                               queue->last_job_syncs.syncs, 4,
                               INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
                               NULL);
      if (ret) {
         return vk_errorf(queue, VK_ERROR_DEVICE_LOST,
                          "syncobj wait failed: %m");
      }

      bool first = true;
      for (int i = 0; i < 4; i++) {
         if (!queue->last_job_syncs.first[i])
            first = false;
      }

      /* If we're not the first job, that means we're waiting on some
       * per-queue-type syncobj which transitively waited on the semaphores,
       * so we can skip the semaphore wait.
       */
      if (first) {
         VkResult result = vk_sync_wait_many(&queue->device->vk,
                                             sync_info->wait_count,
                                             sync_info->waits,
                                             VK_SYNC_WAIT_COMPLETE,
                                             UINT64_MAX);
         if (result != VK_SUCCESS)
            return result;
      }
   } else {
      /* Without multisync, all the semaphores are baked into the one syncobj
       * at the start of each submit, so we only need to wait on the one.
       */
      int ret = drmSyncobjWait(queue->device->pdevice->render_fd,
                               &queue->last_job_syncs.syncs[V3DV_QUEUE_ANY], 1,
                               INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
                               NULL);
      if (ret) {
         return vk_errorf(queue, VK_ERROR_DEVICE_LOST,
                          "syncobj wait failed: %m");
      }
   }

   for (int i = 0; i < 4; i++)
      queue->last_job_syncs.first[i] = false;

   return VK_SUCCESS;
}

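/* Frees the in/out sync arrays that were attached to a multisync extension. */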
static void
multisync_free(struct v3dv_device *device,
               struct drm_v3d_multi_sync *ms)
{
   vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->out_syncs);
   vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->in_syncs);
}

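/* Builds the array of syncobjs a job must wait on before it runs: the wait
 * semaphores (only for the first job submitted to a given GPU queue in the
 * batch), any extra waits passed by the caller, and the per-queue-type
 * syncobjs required by the job's serialization flags.
 */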
static struct drm_v3d_sem *
set_in_syncs(struct v3dv_queue *queue,
             struct v3dv_job *job,
             enum v3dv_queue_type queue_sync,
             uint32_t *count,
             struct vk_sync_wait *waits,
             unsigned wait_count,
             struct v3dv_submit_sync_info *sync_info)
{
   struct v3dv_device *device = queue->device;
   uint32_t n_syncs = 0;

   /* If this is the first job submitted to a given GPU queue in this cmd buf
    * batch, it has to wait on wait semaphores (if any) before running.
    */
   if (queue->last_job_syncs.first[queue_sync])
      n_syncs = sync_info->wait_count;

   /* If the serialize flag is set the job needs to be serialized in the
    * corresponding queues. Notice that we may implement transfer operations
    * as either CL or TFU jobs.
    *
    * FIXME: maybe we could track more precisely if the source of a transfer
    * barrier is a CL and/or a TFU job.
    */
   bool sync_csd = job->serialize & V3DV_BARRIER_COMPUTE_BIT;
   bool sync_tfu = job->serialize & V3DV_BARRIER_TRANSFER_BIT;
   bool sync_cl = job->serialize & (V3DV_BARRIER_GRAPHICS_BIT |
                                    V3DV_BARRIER_TRANSFER_BIT);
   bool sync_cpu = job->serialize & V3DV_BARRIER_CPU_BIT;

   *count = n_syncs;
   if (sync_cl)
      (*count)++;
   if (sync_tfu)
      (*count)++;
   if (sync_csd)
      (*count)++;
   if (sync_cpu)
      (*count)++;

   *count += wait_count;

   if (!*count)
      return NULL;

   struct drm_v3d_sem *syncs =
      vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   if (!syncs)
      return NULL;

   for (int i = 0; i < n_syncs; i++) {
      syncs[i].handle =
         vk_sync_as_drm_syncobj(sync_info->waits[i].sync)->syncobj;
   }

   for (int i = 0; i < wait_count; i++) {
      syncs[n_syncs++].handle =
         vk_sync_as_drm_syncobj(waits[i].sync)->syncobj;
   }

   if (sync_cl)
      syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CL];

   if (sync_csd)
      syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CSD];

   if (sync_tfu)
      syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_TFU];

   if (sync_cpu)
      syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CPU];

   assert(n_syncs == *count);
   return syncs;
}

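/* Builds the array of syncobjs a job will signal on completion: the signal
 * semaphores (when requested) plus the per-queue-type syncobj used to track
 * the last job submitted to this queue.
 */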
static struct drm_v3d_sem *
set_out_syncs(struct v3dv_queue *queue,
              struct v3dv_job *job,
              enum v3dv_queue_type queue_sync,
              uint32_t *count,
              struct v3dv_submit_sync_info *sync_info,
              bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   uint32_t n_vk_syncs = signal_syncs ? sync_info->signal_count : 0;

   /* We always signal the syncobj from `queue->last_job_syncs` related to
    * this v3dv_queue_type to track the last job submitted to this queue.
    */
   (*count) = n_vk_syncs + 1;

   struct drm_v3d_sem *syncs =
      vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   if (!syncs)
      return NULL;

   if (n_vk_syncs) {
      for (unsigned i = 0; i < n_vk_syncs; i++) {
         syncs[i].handle =
            vk_sync_as_drm_syncobj(sync_info->signals[i].sync)->syncobj;
      }
   }

   syncs[n_vk_syncs].handle = queue->last_job_syncs.syncs[queue_sync];

   return syncs;
}

static void
set_ext(struct drm_v3d_extension *ext,
        struct drm_v3d_extension *next,
        uint32_t id,
        uintptr_t flags)
{
   ext->next = (uintptr_t)(void *)next;
   ext->id = id;
   ext->flags = flags;
}

/* This function sets up the extension for multiple in/out syncobjs. When it
 * succeeds, it sets the extension id to DRM_V3D_EXT_ID_MULTI_SYNC.
 * Otherwise, the extension id is left as 0, which means an out-of-memory
 * error occurred.
 */
static void
set_multisync(struct drm_v3d_multi_sync *ms,
              struct v3dv_submit_sync_info *sync_info,
              struct vk_sync_wait *waits,
              unsigned wait_count,
              struct drm_v3d_extension *next,
              struct v3dv_device *device,
              struct v3dv_job *job,
              enum v3dv_queue_type in_queue_sync,
              enum v3dv_queue_type out_queue_sync,
              enum v3d_queue wait_stage,
              bool signal_syncs)
{
   struct v3dv_queue *queue = &device->queue;
   uint32_t out_sync_count = 0, in_sync_count = 0;
   struct drm_v3d_sem *out_syncs = NULL, *in_syncs = NULL;

   in_syncs = set_in_syncs(queue, job, in_queue_sync,
                           &in_sync_count, waits, wait_count, sync_info);
   if (!in_syncs && in_sync_count)
      goto fail;

   out_syncs = set_out_syncs(queue, job, out_queue_sync,
                             &out_sync_count, sync_info, signal_syncs);

   assert(out_sync_count > 0);

   if (!out_syncs)
      goto fail;

   set_ext(&ms->base, next, DRM_V3D_EXT_ID_MULTI_SYNC, 0);
   ms->wait_stage = wait_stage;
   ms->out_sync_count = out_sync_count;
   ms->out_syncs = (uintptr_t)(void *)out_syncs;
   ms->in_sync_count = in_sync_count;
   ms->in_syncs = (uintptr_t)(void *)in_syncs;

   return;

fail:
   if (in_syncs)
      vk_free(&device->vk.alloc, in_syncs);
   assert(!out_syncs);

   return;
}

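/* Resets timestamp or performance queries. With a kernel CPU queue this is
 * submitted as a DRM_IOCTL_V3D_SUBMIT_CPU job so it executes in submission
 * order; otherwise we wait for the affected queries to become idle and reset
 * them from user space.
 */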
static VkResult
handle_reset_query_cpu_job(struct v3dv_queue *queue,
                           struct v3dv_job *job,
                           struct v3dv_submit_sync_info *sync_info,
                           bool signal_syncs)
{
   struct v3dv_device *device = queue->device;
   struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset;
   assert(info->pool);

   assert(info->pool->query_type != VK_QUERY_TYPE_OCCLUSION);

   if (device->pdevice->caps.cpu_queue) {
      assert(info->first + info->count <= info->pool->query_count);

      struct drm_v3d_submit_cpu submit = {0};
      struct drm_v3d_multi_sync ms = {0};

      uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
      uintptr_t *kperfmon_ids = NULL;

      if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
         submit.bo_handle_count = 1;
         submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle;

         struct drm_v3d_reset_timestamp_query reset = {0};

         set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_TIMESTAMP_QUERY, 0);

         reset.count = info->count;
         reset.offset = info->pool->queries[info->first].timestamp.offset;

         for (uint32_t i = 0; i < info->count; i++) {
            struct v3dv_query *query = &info->pool->queries[info->first + i];
            syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
         }

         reset.syncs = (uintptr_t)(void *)syncs;

         set_multisync(&ms, sync_info, NULL, 0, (void *)&reset, device, job,
                       V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
         if (!ms.base.id)
            return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
      } else {
         assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
         struct drm_v3d_reset_performance_query reset = {0};

         set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_PERFORMANCE_QUERY, 0);

         struct vk_sync_wait waits[info->count];
         unsigned wait_count = 0;
         for (int i = 0; i < info->count; i++) {
            struct v3dv_query *query = &info->pool->queries[info->first + i];
            /* Only wait for a query if we've used it, otherwise we will be
             * waiting forever for the fence to become signaled.
             */
            if (query->maybe_available) {
               waits[wait_count] = (struct vk_sync_wait){
                  .sync = query->perf.last_job_sync
               };
               wait_count++;
            }
         }

         reset.count = info->count;
         reset.nperfmons = info->pool->perfmon.nperfmons;

         kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count);

         for (uint32_t i = 0; i < info->count; i++) {
            struct v3dv_query *query = &info->pool->queries[info->first + i];

            syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
            kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids;
         }

         reset.syncs = (uintptr_t)(void *)syncs;
         reset.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids;

         set_multisync(&ms, sync_info, waits, wait_count, (void *)&reset, device, job,
                       V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
         if (!ms.base.id)
            return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
      }

      submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
      submit.extensions = (uintptr_t)(void *)&ms;

      /* From the Vulkan spec for vkCmdResetQueryPool:
       *
       *    "This command defines an execution dependency between other query
       *    commands that reference the same query.
       *    ...
       *    The second synchronization scope includes all commands which
       *    reference the queries in queryPool indicated by firstQuery and
       *    queryCount that occur later in submission order."
       *
       * This means we should ensure that any timestamps after a reset don't
       * execute before the reset. However, for timestamp queries in
       * particular we don't have to do anything special, because timestamp
       * queries have to wait for all previously submitted work to complete
       * before executing (which we accomplish by using V3DV_BARRIER_ALL on
       * them) and that includes reset jobs submitted to the CPU queue.
       */
      int ret = v3dv_ioctl(device->pdevice->render_fd,
                           DRM_IOCTL_V3D_SUBMIT_CPU, &submit);

      free(syncs);
      free(kperfmon_ids);
      multisync_free(device, &ms);

      queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;

      if (ret)
         return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");

      return VK_SUCCESS;
   }

   /* We are about to reset query counters in user-space so we need to make
    * sure that the GPU is not using them.
    */
   if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
      VkResult result = queue_wait_idle(queue, sync_info);
      if (result != VK_SUCCESS)
         return result;

      v3dv_bo_wait(job->device, info->pool->timestamp.bo, OS_TIMEOUT_INFINITE);
   }

   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      struct vk_sync_wait waits[info->count];
      unsigned wait_count = 0;
      for (int i = 0; i < info->count; i++) {
         struct v3dv_query *query = &info->pool->queries[info->first + i];
         /* Only wait for a query if we've used it, otherwise we will be
          * waiting forever for the fence to become signaled.
          */
         if (query->maybe_available) {
            waits[wait_count] = (struct vk_sync_wait){
               .sync = query->perf.last_job_sync
            };
            wait_count++;
         }
      }

      VkResult result = vk_sync_wait_many(&job->device->vk, wait_count, waits,
                                          VK_SYNC_WAIT_COMPLETE, UINT64_MAX);

      if (result != VK_SUCCESS)
         return result;
   }

   v3dv_reset_query_pool_cpu(job->device, info->pool, info->first, info->count);

   return VK_SUCCESS;
}

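/* Exports the last-job syncobjs relevant to performance query results (CL
 * and CSD with multisync, QUEUE_ANY otherwise) and accumulates them into a
 * single sync file fd.
 */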
static VkResult
export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job, int *fd)
{
   int err;
   if (job->device->pdevice->caps.multisync) {
      static const enum v3dv_queue_type queues_to_sync[] = {
         V3DV_QUEUE_CL,
         V3DV_QUEUE_CSD,
      };

      for (uint32_t i = 0; i < ARRAY_SIZE(queues_to_sync); i++) {
         enum v3dv_queue_type queue_type = queues_to_sync[i];
         int tmp_fd = -1;

         err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
                                        queue->last_job_syncs.syncs[queue_type],
                                        &tmp_fd);

         if (err) {
            close(*fd);
            return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
                             "sync file export failed: %m");
         }

         err = sync_accumulate("v3dv", fd, tmp_fd);

         if (err) {
            close(tmp_fd);
            close(*fd);
            return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
                             "failed to accumulate sync files: %m");
         }
      }
   } else {
      err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
                                     queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
                                     fd);

      if (err) {
         return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
                          "sync file export failed: %m");
      }
   }
   return VK_SUCCESS;
}

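/* Marks the queries affected by an end-query command as maybe available and,
 * for performance queries, imports the sync file for the last submitted GPU
 * work into each query's last_job_sync so we know when results are ready.
 */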
static VkResult
handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx)
{
   VkResult result = VK_SUCCESS;

   mtx_lock(&job->device->query_mutex);

   struct v3dv_end_query_info *info = &job->cpu.query_end;
   struct v3dv_queue *queue = &job->device->queue;

   int err = 0;
   int fd = -1;

   assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);

   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      result = export_perfmon_last_job_sync(queue, job, &fd);

      if (result != VK_SUCCESS)
         goto fail;

      assert(fd >= 0);
   }

   for (uint32_t i = 0; i < info->count; i++) {
      assert(info->query + i < info->pool->query_count);
      struct v3dv_query *query = &info->pool->queries[info->query + i];

      if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
         uint32_t syncobj = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
         err = drmSyncobjImportSyncFile(job->device->pdevice->render_fd,
                                        syncobj, fd);

         if (err) {
            result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                               "sync file import failed: %m");
            goto fail;
         }
      }

      query->maybe_available = true;
   }

fail:
   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
      close(fd);

   cnd_broadcast(&job->device->query_ended);
   mtx_unlock(&job->device->query_mutex);

   return result;
}

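/* Copies query results into a destination buffer. With a kernel CPU queue
 * this is submitted as a DRM_IOCTL_V3D_SUBMIT_CPU job so it respects
 * submission order; otherwise the copy is done on the CPU after mapping the
 * destination BO.
 */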
static VkResult
handle_copy_query_results_cpu_job(struct v3dv_queue *queue,
                                  struct v3dv_job *job,
                                  struct v3dv_submit_sync_info *sync_info,
                                  bool signal_syncs)
{
   struct v3dv_device *device = queue->device;
   struct v3dv_copy_query_results_cpu_job_info *info =
      &job->cpu.query_copy_results;

   assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
          info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP);

   assert(info->dst && info->dst->mem && info->dst->mem->bo);
   struct v3dv_bo *bo = info->dst->mem->bo;

   if (device->pdevice->caps.cpu_queue) {
      struct drm_v3d_submit_cpu submit = {0};
      struct drm_v3d_multi_sync ms = {0};

      uint32_t *offsets = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
      uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
      uint32_t *bo_handles = NULL;
      uintptr_t *kperfmon_ids = NULL;

      if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
         submit.bo_handle_count = 2;

         bo_handles = (uint32_t *)
            malloc(sizeof(uint32_t) * submit.bo_handle_count);

         bo_handles[0] = bo->handle;
         bo_handles[1] = info->pool->timestamp.bo->handle;
         submit.bo_handles = (uintptr_t)(void *)bo_handles;

         struct drm_v3d_copy_timestamp_query copy = {0};

         set_ext(&copy.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_TIMESTAMP_QUERY, 0);

         copy.do_64bit = info->flags & VK_QUERY_RESULT_64_BIT;
         copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT;
         copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
         copy.offset = info->offset + info->dst->mem_offset;
         copy.stride = info->stride;
         copy.count = info->count;

         for (uint32_t i = 0; i < info->count; i++) {
            assert(info->first < info->pool->query_count);
            assert(info->first + info->count <= info->pool->query_count);
            struct v3dv_query *query = &info->pool->queries[info->first + i];

            offsets[i] = query->timestamp.offset;
            syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
         }

         copy.offsets = (uintptr_t)(void *)offsets;
         copy.syncs = (uintptr_t)(void *)syncs;

         set_multisync(&ms, sync_info, NULL, 0, (void *)&copy, device, job,
                       V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
         if (!ms.base.id)
            return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
      } else {
         assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);

         submit.bo_handle_count = 1;
         submit.bo_handles = (uintptr_t)(void *)&bo->handle;

         struct drm_v3d_copy_performance_query copy = {0};

         set_ext(&copy.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_PERFORMANCE_QUERY, 0);

         /* If the queryPool was created with VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR,
          * results for each query are written as an array of the type indicated
          * by VkPerformanceCounterKHR::storage for the counter being queried.
          * For v3dv, VkPerformanceCounterKHR::storage is
          * VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR.
          */
         copy.do_64bit = true;
         copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT;
         copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
         copy.offset = info->offset + info->dst->mem_offset;
         copy.stride = info->stride;
         copy.count = info->count;
         copy.nperfmons = info->pool->perfmon.nperfmons;
         copy.ncounters = info->pool->perfmon.ncounters;

         kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count);

         struct vk_sync_wait waits[info->count];
         unsigned wait_count = 0;

         for (uint32_t i = 0; i < info->count; i++) {
            assert(info->first < info->pool->query_count);
            assert(info->first + info->count <= info->pool->query_count);
            struct v3dv_query *query = &info->pool->queries[info->first + i];

            syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
            kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids;

            if (info->flags & VK_QUERY_RESULT_WAIT_BIT) {
               waits[wait_count] = (struct vk_sync_wait){
                  .sync = query->perf.last_job_sync
               };
               wait_count++;
            }
         }

         copy.syncs = (uintptr_t)(void *)syncs;
         copy.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids;

         set_multisync(&ms, sync_info, waits, wait_count, (void *)&copy, device, job,
                       V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
         if (!ms.base.id)
            return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
      }

      submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
      submit.extensions = (uintptr_t)(void *)&ms;

      int ret = v3dv_ioctl(device->pdevice->render_fd,
                           DRM_IOCTL_V3D_SUBMIT_CPU, &submit);

      free(kperfmon_ids);
      free(bo_handles);
      free(offsets);
      free(syncs);
      multisync_free(device, &ms);

      queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;

      if (ret)
         return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");

      return VK_SUCCESS;
   }

   /* Map the entire dst buffer for the CPU copy if needed */
   assert(!bo->map || bo->map_size == bo->size);
   if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);

   uint8_t *offset = ((uint8_t *) bo->map) +
                     info->offset + info->dst->mem_offset;
   v3dv_get_query_pool_results_cpu(job->device,
                                   info->pool,
                                   info->first,
                                   info->count,
                                   offset,
                                   info->stride,
                                   info->flags);

   return VK_SUCCESS;
}

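/* Writes timestamp query values. Without a kernel CPU queue we wait for the
 * queue to go idle and write the timestamp from user space; with one we
 * submit a DRM_IOCTL_V3D_SUBMIT_CPU job that does it in submission order.
 */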
static VkResult
handle_timestamp_query_cpu_job(struct v3dv_queue *queue,
                               struct v3dv_job *job,
                               struct v3dv_submit_sync_info *sync_info,
                               bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY);
   struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp;

   if (!device->pdevice->caps.cpu_queue) {
      /* Wait for completion of all work queued before the timestamp query */
      VkResult result = queue_wait_idle(queue, sync_info);
      if (result != VK_SUCCESS)
         return result;

      mtx_lock(&job->device->query_mutex);

      /* Compute timestamp */
      struct timespec t;
      clock_gettime(CLOCK_MONOTONIC, &t);

      for (uint32_t i = 0; i < info->count; i++) {
         assert(info->query + i < info->pool->query_count);
         struct v3dv_query *query = &info->pool->queries[info->query + i];
         query->maybe_available = true;

         /* Value */
         uint8_t *value_addr =
            ((uint8_t *) info->pool->timestamp.bo->map) + query->timestamp.offset;
         *((uint64_t *)value_addr) =
            (i == 0) ? t.tv_sec * 1000000000ull + t.tv_nsec : 0ull;

         /* Availability */
         result = vk_sync_signal(&job->device->vk, query->timestamp.sync, 0);
      }

      cnd_broadcast(&job->device->query_ended);
      mtx_unlock(&job->device->query_mutex);

      return result;
   }

   struct drm_v3d_submit_cpu submit = {0};

   submit.bo_handle_count = 1;
   submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle;

   struct drm_v3d_timestamp_query timestamp = {0};

   set_ext(&timestamp.base, NULL, DRM_V3D_EXT_ID_CPU_TIMESTAMP_QUERY, 0);

   timestamp.count = info->count;

   uint32_t *offsets =
      (uint32_t *) malloc(sizeof(uint32_t) * info->count);
   uint32_t *syncs =
      (uint32_t *) malloc(sizeof(uint32_t) * info->count);

   for (uint32_t i = 0; i < info->count; i++) {
      assert(info->query + i < info->pool->query_count);
      struct v3dv_query *query = &info->pool->queries[info->query + i];
      query->maybe_available = true;

      offsets[i] = query->timestamp.offset;
      syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
   }

   timestamp.offsets = (uintptr_t)(void *)offsets;
   timestamp.syncs = (uintptr_t)(void *)syncs;

   struct drm_v3d_multi_sync ms = {0};

   /* The CPU job should be serialized so it only executes after all
    * previously submitted work has completed.
    */
   job->serialize = V3DV_BARRIER_ALL;

   set_multisync(&ms, sync_info, NULL, 0, (void *)&timestamp, device, job,
                 V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
   if (!ms.base.id)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
   submit.extensions = (uintptr_t)(void *)&ms;

   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_CPU, &submit);

   free(offsets);
   free(syncs);
   multisync_free(device, &ms);

   queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;

   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");

   return VK_SUCCESS;
}

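/* Handles an indirect compute dispatch: reads the workgroup counts from the
 * indirect buffer and patches the CSD job accordingly, either on the CPU
 * (after waiting for the buffer) or via a kernel CPU-queue job that chains
 * into the CSD submission.
 */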
static VkResult
handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
                            struct v3dv_job *job,
                            struct v3dv_submit_sync_info *sync_info,
                            bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
   struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect;
   assert(info->csd_job);

   assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
   struct v3dv_bo *bo = info->buffer->mem->bo;

   if (!device->pdevice->caps.cpu_queue) {
      /* Make sure the GPU is no longer using the indirect buffer */
      v3dv_bo_wait(queue->device, bo, OS_TIMEOUT_INFINITE);

      /* Map the indirect buffer and read the dispatch parameters */
      if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
         return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
      assert(bo->map);

      const uint32_t offset = info->buffer->mem_offset + info->offset;
      const uint32_t *group_counts = (uint32_t *) (bo->map + offset);
      if (group_counts[0] == 0 || group_counts[1] == 0 || group_counts[2] == 0)
         return VK_SUCCESS;

      if (memcmp(group_counts, info->csd_job->csd.wg_count,
                 sizeof(info->csd_job->csd.wg_count)) != 0) {
         v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts);
      }

      return VK_SUCCESS;
   }

   struct v3dv_job *csd_job = info->csd_job;

   struct drm_v3d_submit_cpu submit = {0};

   submit.bo_handle_count = 1;
   submit.bo_handles = (uintptr_t)(void *)&bo->handle;

   csd_job->csd.submit.bo_handle_count = csd_job->bo_count;
   uint32_t *bo_handles = (uint32_t *) malloc(sizeof(uint32_t) * csd_job->bo_count);
   uint32_t bo_idx = 0;
   set_foreach(csd_job->bos, entry) {
      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
      bo_handles[bo_idx++] = bo->handle;
   }
   csd_job->csd.submit.bo_handles = (uintptr_t)(void *)bo_handles;

   struct drm_v3d_indirect_csd indirect = {0};

   set_ext(&indirect.base, NULL, DRM_V3D_EXT_ID_CPU_INDIRECT_CSD, 0);

   indirect.submit = csd_job->csd.submit;
   indirect.offset = info->buffer->mem_offset + info->offset;
   indirect.wg_size = info->wg_size;

   for (int i = 0; i < 3; i++) {
      if (info->wg_uniform_offsets[i]) {
         assert(info->wg_uniform_offsets[i] >= (uint32_t *) csd_job->indirect.base);
         indirect.wg_uniform_offsets[i] =
            info->wg_uniform_offsets[i] - (uint32_t *) csd_job->indirect.base;
      } else {
         indirect.wg_uniform_offsets[i] = 0xffffffff; /* No rewrite */
      }
   }

   indirect.indirect = csd_job->indirect.bo->handle;

   struct drm_v3d_multi_sync ms = {0};

   /* We need to configure the semaphores of this job with the indirect
    * CSD job, as the CPU job must obey the CSD job's synchronization
    * demands, such as barriers.
    */
   set_multisync(&ms, sync_info, NULL, 0, (void *)&indirect, device, csd_job,
                 V3DV_QUEUE_CPU, V3DV_QUEUE_CSD, V3D_CPU, signal_syncs);
   if (!ms.base.id)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
   submit.extensions = (uintptr_t)(void *)&ms;

   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_CPU, &submit);

   free(bo_handles);
   multisync_free(device, &ms);

   queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
   queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;

   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");

   return VK_SUCCESS;
}

/**
 * This handles semaphore waits for the single-sync path by accumulating
 * wait semaphores into the QUEUE_ANY syncobj. Notice this is only required
 * to ensure we accumulate any *external* semaphores (since for anything else
 * we are already accumulating out syncs with each submission to the kernel).
 */
static VkResult
process_singlesync_waits(struct v3dv_queue *queue,
                         uint32_t count, struct vk_sync_wait *waits)
{
   struct v3dv_device *device = queue->device;
   assert(!device->pdevice->caps.multisync);

   if (count == 0)
      return VK_SUCCESS;

   VkResult result = VK_SUCCESS;

   int err = 0;
   int fd = -1;
   err = drmSyncobjExportSyncFile(device->pdevice->render_fd,
                                  queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
                                  &fd);
   if (err) {
      result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                         "sync file export failed: %m");
      goto fail;
   }

   for (uint32_t i = 0; i < count; i++) {
      uint32_t syncobj = vk_sync_as_drm_syncobj(waits[i].sync)->syncobj;
      int wait_fd = -1;

      err = drmSyncobjExportSyncFile(device->pdevice->render_fd,
                                     syncobj, &wait_fd);
      if (err) {
         result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                            "sync file export failed: %m");
         goto fail;
      }

      err = sync_accumulate("v3dv", &fd, wait_fd);
      close(wait_fd);
      if (err) {
         result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                            "sync file merge failed: %m");
         goto fail;
      }
   }

   err = drmSyncobjImportSyncFile(device->pdevice->render_fd,
                                  queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
                                  fd);
   if (err) {
      result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                         "sync file import failed: %m");
   }

fail:
   close(fd);
   return result;
}

/**
 * This handles signaling for the single-sync path by importing the QUEUE_ANY
 * syncobj payload into all the syncs to be signaled.
 */
static VkResult
process_singlesync_signals(struct v3dv_queue *queue,
                           uint32_t count, struct vk_sync_signal *signals)
{
   struct v3dv_device *device = queue->device;
   assert(!device->pdevice->caps.multisync && count > 0);

   if (device->pdevice->caps.multisync)
      return VK_SUCCESS;

   int fd = -1;
   drmSyncobjExportSyncFile(device->pdevice->render_fd,
                            queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
                            &fd);
   if (fd == -1) {
      return vk_errorf(queue, VK_ERROR_UNKNOWN,
                       "sync file export failed: %m");
   }

   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < count; i++) {
      uint32_t syncobj = vk_sync_as_drm_syncobj(signals[i].sync)->syncobj;
      int err = drmSyncobjImportSyncFile(device->pdevice->render_fd,
                                         syncobj, fd);
      if (err) {
         result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                            "sync file import failed: %m");
         break;
      }
   }

   assert(fd >= 0);
   close(fd);

   return result;
}

/* This must be called after every submission in the single-sync path to
 * accumulate the out_sync into the QUEUE_ANY sync so we can serialize
 * jobs by waiting on the QUEUE_ANY sync.
 */
static int
update_any_queue_sync(struct v3dv_queue *queue, uint32_t out_sync)
{
   struct v3dv_device *device = queue->device;
   assert(!device->pdevice->caps.multisync);

   int render_fd = device->pdevice->render_fd;
   int fd_any = -1, fd_out_sync = -1;
   int err;
   err = drmSyncobjExportSyncFile(render_fd,
                                  queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
                                  &fd_any);
   if (err)
      goto fail;

   err = drmSyncobjExportSyncFile(render_fd, out_sync, &fd_out_sync);
   if (err)
      goto fail;

   err = sync_accumulate("v3dv", &fd_any, fd_out_sync);
   if (err)
      goto fail;

   err = drmSyncobjImportSyncFile(render_fd,
                                  queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
                                  fd_any);

fail:
   close(fd_any);
   close(fd_out_sync);
   return err;
}

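/* Submits a binning/rendering command list job to the kernel through
 * DRM_IOCTL_V3D_SUBMIT_CL, configuring binning and render syncs according to
 * the job's serialization requirements and the submit's wait semaphores.
 */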
static VkResult
handle_cl_job(struct v3dv_queue *queue,
              struct v3dv_job *job,
              uint32_t counter_pass_idx,
              struct v3dv_submit_sync_info *sync_info,
              bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   struct drm_v3d_submit_cl submit = { 0 };

   /* Sanity check: we should only flag a bcl sync on a job that needs to be
    * serialized.
    */
   assert(job->serialize || !job->needs_bcl_sync);

   /* We expect to have just one RCL per job, which should fit in just one BO.
    * Our BCL, however, could chain multiple BOs together.
    */
   assert(list_length(&job->rcl.bo_list) == 1);
   assert(list_length(&job->bcl.bo_list) >= 1);
   struct v3dv_bo *bcl_first_bo =
      list_first_entry(&job->bcl.bo_list, struct v3dv_bo, list_link);
   submit.bcl_start = bcl_first_bo->offset;
   submit.bcl_end = job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
   submit.rcl_start = job->rcl.bo->offset;
   submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);

   submit.qma = job->tile_alloc->offset;
   submit.qms = job->tile_alloc->size;
   submit.qts = job->tile_state->offset;

   submit.flags = 0;
   if (job->tmu_dirty_rcl)
      submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;

   /* If the job uses VK_KHR_buffer_device_address we need to ensure all
    * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
    * are included.
    */
   if (job->uses_buffer_device_address) {
      util_dynarray_foreach(&queue->device->device_address_bo_list,
                            struct v3dv_bo *, bo) {
         v3dv_job_add_bo(job, *bo);
      }
   }

   submit.bo_handle_count = job->bo_count;
   uint32_t *bo_handles =
      (uint32_t *) malloc(sizeof(uint32_t) * submit.bo_handle_count);
   uint32_t bo_idx = 0;
   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
      bo_handles[bo_idx++] = bo->handle;
   }
   assert(bo_idx == submit.bo_handle_count);
   submit.bo_handles = (uintptr_t)(void *)bo_handles;

   submit.perfmon_id = job->perf ?
      job->perf->kperfmon_ids[counter_pass_idx] : 0;
   const bool needs_perf_sync = queue->last_perfmon_id != submit.perfmon_id;
   queue->last_perfmon_id = submit.perfmon_id;

   /* We need a binning sync if we are the first CL job waiting on a semaphore
    * with a wait stage that involves the geometry pipeline, if the job comes
    * after a pipeline barrier that involves geometry stages (needs_bcl_sync),
    * or when performance queries are in use.
    *
    * We need a render sync if the job doesn't need a binning sync but has
    * still been flagged for serialization. It should be noted that RCL jobs
    * don't start until the previous RCL job has finished, so we don't really
    * need to add a fence for those; however, we might need to wait on a CSD
    * or TFU job, which are not automatically serialized with CL jobs.
    */
   bool needs_bcl_sync = job->needs_bcl_sync || needs_perf_sync;
   if (queue->last_job_syncs.first[V3DV_QUEUE_CL]) {
      for (int i = 0; !needs_bcl_sync && i < sync_info->wait_count; i++) {
         needs_bcl_sync = sync_info->waits[i].stage_mask &
            (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
             VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
             VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
             VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
             VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
             VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
             VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
             VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
             VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
             VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
             VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
             VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT);
      }
   }

   bool needs_rcl_sync = job->serialize && !needs_bcl_sync;

   /* Replace the single-semaphore settings whenever our kernel driver
    * supports the multiple semaphores extension.
    */
   struct drm_v3d_multi_sync ms = { 0 };
   if (device->pdevice->caps.multisync) {
      enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN;
      set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
                    V3DV_QUEUE_CL, V3DV_QUEUE_CL, wait_stage, signal_syncs);
      if (!ms.base.id)
         return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

      submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
      submit.extensions = (uintptr_t)(void *)&ms;
      /* Disable legacy sync interface when multisync extension is used */
      submit.in_sync_rcl = 0;
      submit.in_sync_bcl = 0;
      submit.out_sync = 0;
   } else {
      uint32_t last_job_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_ANY];
      submit.in_sync_bcl = needs_bcl_sync ? last_job_sync : 0;
      submit.in_sync_rcl = needs_rcl_sync ? last_job_sync : 0;
      submit.out_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_CL];
   }

   v3dv_clif_dump(device, job, &submit);
   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_CL, &submit);

   static bool warned = false;
   if (ret && !warned) {
      fprintf(stderr, "Draw call returned %s. Expect corruption.\n",
              strerror(errno));
      warned = true;
   }

   if (!device->pdevice->caps.multisync && ret == 0)
      ret = update_any_queue_sync(queue, submit.out_sync);

   free(bo_handles);
   multisync_free(device, &ms);

   queue->last_job_syncs.first[V3DV_QUEUE_CL] = false;

   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CL failed: %m");

   return VK_SUCCESS;
}

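/* Submits a TFU (texture formatting unit) transfer job through
 * DRM_IOCTL_V3D_SUBMIT_TFU.
 */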
static VkResult
handle_tfu_job(struct v3dv_queue *queue,
               struct v3dv_job *job,
               struct v3dv_submit_sync_info *sync_info,
               bool signal_syncs)
{
   assert(!V3D_DBG(DISABLE_TFU));

   struct v3dv_device *device = queue->device;

   const bool needs_sync = sync_info->wait_count || job->serialize;

   /* Replace the single-semaphore settings whenever our kernel driver
    * supports the multiple semaphores extension.
    */
   struct drm_v3d_multi_sync ms = { 0 };
   if (device->pdevice->caps.multisync) {
      set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
                    V3DV_QUEUE_TFU, V3DV_QUEUE_TFU, V3D_TFU, signal_syncs);
      if (!ms.base.id)
         return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

      job->tfu.flags |= DRM_V3D_SUBMIT_EXTENSION;
      job->tfu.extensions = (uintptr_t)(void *)&ms;
      /* Disable legacy sync interface when multisync extension is used */
      job->tfu.in_sync = 0;
      job->tfu.out_sync = 0;
   } else {
      uint32_t last_job_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_ANY];
      job->tfu.in_sync = needs_sync ? last_job_sync : 0;
      job->tfu.out_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_TFU];
   }
   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);

   if (!device->pdevice->caps.multisync && ret == 0)
      ret = update_any_queue_sync(queue, job->tfu.out_sync);

   multisync_free(device, &ms);
   queue->last_job_syncs.first[V3DV_QUEUE_TFU] = false;

   if (ret != 0)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_TFU failed: %m");

   return VK_SUCCESS;
}

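/* Submits a compute (CSD) dispatch job through DRM_IOCTL_V3D_SUBMIT_CSD. */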
static VkResult
handle_csd_job(struct v3dv_queue *queue,
               struct v3dv_job *job,
               uint32_t counter_pass_idx,
               struct v3dv_submit_sync_info *sync_info,
               bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   struct drm_v3d_submit_csd *submit = &job->csd.submit;

   /* If the job uses VK_KHR_buffer_device_address we need to ensure all
    * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
    * are included.
    */
   if (job->uses_buffer_device_address) {
      util_dynarray_foreach(&queue->device->device_address_bo_list,
                            struct v3dv_bo *, bo) {
         v3dv_job_add_bo(job, *bo);
      }
   }

   submit->bo_handle_count = job->bo_count;
   uint32_t *bo_handles =
      (uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2));
   uint32_t bo_idx = 0;
   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
      bo_handles[bo_idx++] = bo->handle;
   }
   assert(bo_idx == submit->bo_handle_count);
   submit->bo_handles = (uintptr_t)(void *)bo_handles;

   const bool needs_sync = sync_info->wait_count || job->serialize;

   /* Replace the single-semaphore settings whenever our kernel driver
    * supports the multiple semaphores extension.
    */
   struct drm_v3d_multi_sync ms = { 0 };
   if (device->pdevice->caps.multisync) {
      set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
                    V3DV_QUEUE_CSD, V3DV_QUEUE_CSD, V3D_CSD, signal_syncs);
      if (!ms.base.id)
         return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

      submit->flags |= DRM_V3D_SUBMIT_EXTENSION;
      submit->extensions = (uintptr_t)(void *)&ms;
      /* Disable legacy sync interface when multisync extension is used */
      submit->in_sync = 0;
      submit->out_sync = 0;
   } else {
      uint32_t last_job_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_ANY];
      submit->in_sync = needs_sync ? last_job_sync : 0;
      submit->out_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_CSD];
   }
   submit->perfmon_id = job->perf ?
      job->perf->kperfmon_ids[counter_pass_idx] : 0;
   queue->last_perfmon_id = submit->perfmon_id;
   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_CSD, submit);

   static bool warned = false;
   if (ret && !warned) {
      fprintf(stderr, "Compute dispatch returned %s. Expect corruption.\n",
              strerror(errno));
      warned = true;
   }

   if (!device->pdevice->caps.multisync && ret == 0)
      ret = update_any_queue_sync(queue, submit->out_sync);

   free(bo_handles);

   multisync_free(device, &ms);
   queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;

   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CSD failed: %m");

   return VK_SUCCESS;
}

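/* Dispatches a job to the appropriate handler based on its type. */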
static VkResult
queue_handle_job(struct v3dv_queue *queue,
                 struct v3dv_job *job,
                 uint32_t counter_pass_idx,
                 struct v3dv_submit_sync_info *sync_info,
                 bool signal_syncs)
{
   switch (job->type) {
   case V3DV_JOB_TYPE_GPU_CL:
      return handle_cl_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_GPU_TFU:
      return handle_tfu_job(queue, job, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_GPU_CSD:
      return handle_csd_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
      return handle_reset_query_cpu_job(queue, job, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_CPU_END_QUERY:
      return handle_end_query_cpu_job(job, counter_pass_idx);
   case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
      return handle_copy_query_results_cpu_job(queue, job, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
      return handle_csd_indirect_cpu_job(queue, job, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY:
      return handle_timestamp_query_cpu_job(queue, job, sync_info, signal_syncs);
   default:
      unreachable("Unhandled job type");
   }
}

static VkResult
queue_create_noop_job(struct v3dv_queue *queue)
{
   struct v3dv_device *device = queue->device;
   queue->noop_job = vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_job), 8,
                               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!queue->noop_job)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   v3dv_job_init(queue->noop_job, V3DV_JOB_TYPE_GPU_CL, device, NULL, -1);

   v3dv_X(device, job_emit_noop)(queue->noop_job);

   /* We use no-op jobs to signal semaphores/fences. These jobs need to be
    * serialized across all hw queues to comply with Vulkan's signal operation
    * order requirements, which basically require that signal operations occur
    * in submission order.
    */
   queue->noop_job->serialize = V3DV_BARRIER_ALL;

   return VK_SUCCESS;
}

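/* Submits the queue's no-op job, creating it lazily on first use. */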
static VkResult
queue_submit_noop_job(struct v3dv_queue *queue,
                      uint32_t counter_pass_idx,
                      struct v3dv_submit_sync_info *sync_info,
                      bool signal_syncs)
{
   if (!queue->noop_job) {
      VkResult result = queue_create_noop_job(queue);
      if (result != VK_SUCCESS)
         return result;
   }

   assert(queue->noop_job);
   return queue_handle_job(queue, queue->noop_job, counter_pass_idx,
                           sync_info, signal_syncs);
}

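/* Entry point for vk_queue submissions: walks the jobs of each command
 * buffer, consumes pending barriers, and handles the submit's wait/signal
 * semaphores for both the multisync and single-sync paths.
 */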
VkResult
v3dv_queue_driver_submit(struct vk_queue *vk_queue,
                         struct vk_queue_submit *submit)
{
   struct v3dv_queue *queue = container_of(vk_queue, struct v3dv_queue, vk);
   VkResult result;

   struct v3dv_submit_sync_info sync_info = {
      .wait_count = submit->wait_count,
      .waits = submit->waits,
      .signal_count = submit->signal_count,
      .signals = submit->signals,
   };

   for (int i = 0; i < V3DV_QUEUE_COUNT; i++)
      queue->last_job_syncs.first[i] = true;

   /* If we do not have multisync we need to ensure we accumulate any wait
    * semaphores into our QUEUE_ANY syncobj so we can handle waiting on
    * external semaphores.
    */
   if (!queue->device->pdevice->caps.multisync) {
      result =
         process_singlesync_waits(queue, sync_info.wait_count, sync_info.waits);
      if (result != VK_SUCCESS)
         return result;
   }

   for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
      struct v3dv_cmd_buffer *cmd_buffer =
         container_of(submit->command_buffers[i], struct v3dv_cmd_buffer, vk);
      list_for_each_entry_safe(struct v3dv_job, job,
                               &cmd_buffer->jobs, list_link) {
         result = queue_handle_job(queue, job, submit->perf_pass_index,
                                   &sync_info, false);
         if (result != VK_SUCCESS)
            return result;
      }

      /* If the command buffer ends with a barrier we need to consume it now.
       *
       * FIXME: this will drain all hw queues. Instead, we could use the
       * pending barrier state to limit the queues we serialize against.
       */
      if (cmd_buffer->state.barrier.dst_mask) {
         result = queue_submit_noop_job(queue, submit->perf_pass_index,
                                        &sync_info, false);
         if (result != VK_SUCCESS)
            return result;
      }
   }

   /* Handle signaling now */
   if (submit->signal_count > 0) {
      if (queue->device->pdevice->caps.multisync) {
         /* Finish by submitting a no-op job that synchronizes across all
          * queues. This will ensure that the signal semaphores don't get
          * triggered until all work on any queue completes. See Vulkan's
          * signal operation order requirements.
          */
         return queue_submit_noop_job(queue, submit->perf_pass_index,
                                      &sync_info, true);
      } else {
         return process_singlesync_signals(queue, sync_info.signal_count,
                                           sync_info.signals);
      }
   }

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_QueueBindSparse(VkQueue _queue,
                     uint32_t bindInfoCount,
                     const VkBindSparseInfo *pBindInfo,
                     VkFence fence)
{
   V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
   return vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT);
}