1 /*
2 * Copyright © 2019 Raspberry Pi Ltd
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "v3dv_private.h"
25 #include "drm-uapi/v3d_drm.h"
26
27 #include "broadcom/clif/clif_dump.h"
28 #include "util/libsync.h"
29 #include "util/os_time.h"
30 #include "util/perf/cpu_trace.h"
31 #include "vk_drm_syncobj.h"
32
33 #include <errno.h>
34 #include <time.h>
35
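/* Dumps the job's command lists in CLIF format to stderr when one of the
 * V3D_DEBUG CL/CLIF flags is enabled. Each BO in the job is mapped so the
 * dumper can read its contents.
 */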
36 static void
37 v3dv_clif_dump(struct v3dv_device *device,
38 struct v3dv_job *job,
39 struct drm_v3d_submit_cl *submit)
40 {
41 if (!(V3D_DBG(CL) ||
42 V3D_DBG(CL_NO_BIN) ||
43 V3D_DBG(CLIF)))
44 return;
45
46 struct clif_dump *clif = clif_dump_init(&device->devinfo,
47 stderr,
48 V3D_DBG(CL) ||
49 V3D_DBG(CL_NO_BIN),
50 V3D_DBG(CL_NO_BIN));
51
52 set_foreach(job->bos, entry) {
53 struct v3dv_bo *bo = (void *)entry->key;
54 char *name = ralloc_asprintf(NULL, "%s_0x%x",
55 bo->name, bo->offset);
56
57 bool ok = v3dv_bo_map(device, bo, bo->size);
58 if (!ok) {
59 mesa_loge("failed to map BO for clif_dump.\n");
60 ralloc_free(name);
61 goto free_clif;
62 }
63 clif_dump_add_bo(clif, name, bo->offset, bo->size, bo->map);
64
65 ralloc_free(name);
66 }
67
68 clif_dump(clif, submit);
69
70 free_clif:
71 clif_dump_destroy(clif);
72 }
73
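/* Waits for all work previously submitted to any of the queue types to
 * complete. If this is the first job batch submitted, we also wait on the
 * wait semaphores, since no per-queue syncobj has waited on them yet.
 */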
74 static VkResult
75 queue_wait_idle(struct v3dv_queue *queue,
76 struct v3dv_submit_sync_info *sync_info)
77 {
78 int ret = drmSyncobjWait(queue->device->pdevice->render_fd,
79 queue->last_job_syncs.syncs, 4,
80 INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
81 NULL);
82 if (ret)
83 return vk_errorf(queue, VK_ERROR_DEVICE_LOST, "syncobj wait failed: %m");
84
85 bool first = true;
86 for (int i = 0; i < 4; i++) {
87 if (!queue->last_job_syncs.first[i])
88 first = false;
89 }
90
91 /* If we're not the first job, that means we're waiting on some
92 * per-queue-type syncobj which transitively waited on the semaphores
93 * so we can skip the semaphore wait.
94 */
95 if (first) {
96 VkResult result = vk_sync_wait_many(&queue->device->vk,
97 sync_info->wait_count,
98 sync_info->waits,
99 VK_SYNC_WAIT_COMPLETE,
100 UINT64_MAX);
101 if (result != VK_SUCCESS)
102 return result;
103 }
104
105 for (int i = 0; i < 4; i++)
106 queue->last_job_syncs.first[i] = false;
107
108 return VK_SUCCESS;
109 }
110
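/* Releases the in/out syncobj arrays referenced by a multisync extension. */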
111 static void
112 multisync_free(struct v3dv_device *device,
113 struct drm_v3d_multi_sync *ms)
114 {
115 vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->out_syncs);
116 vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->in_syncs);
117 }
118
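/* Builds the array of syncobjs the job must wait on before it runs: the
 * wait semaphores (only when this is the first job submitted to the given
 * queue type), any extra waits passed by the caller, and the last-job
 * syncobjs of the queues this job serializes against.
 */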
119 static struct drm_v3d_sem *
120 set_in_syncs(struct v3dv_queue *queue,
121 struct v3dv_job *job,
122 enum v3dv_queue_type queue_sync,
123 uint32_t *count,
124 struct vk_sync_wait *waits,
125 unsigned wait_count,
126 struct v3dv_submit_sync_info *sync_info)
127 {
128 struct v3dv_device *device = queue->device;
129 uint32_t n_syncs = 0;
130
131 /* If this is the first job submitted to a given GPU queue in this cmd buf
132 * batch, it has to wait on wait semaphores (if any) before running.
133 */
134 if (queue->last_job_syncs.first[queue_sync])
135 n_syncs = sync_info->wait_count;
136
137 /* If the serialize flag is set, the job needs to be serialized in the
138 * corresponding queues. Notice that we may implement transfer operations
139 * as either CL or TFU jobs.
140 *
141 * FIXME: maybe we could track more precisely if the source of a transfer
142 * barrier is a CL and/or a TFU job.
143 */
144 bool sync_csd = job->serialize & V3DV_BARRIER_COMPUTE_BIT;
145 bool sync_tfu = job->serialize & V3DV_BARRIER_TRANSFER_BIT;
146 bool sync_cl = job->serialize & (V3DV_BARRIER_GRAPHICS_BIT |
147 V3DV_BARRIER_TRANSFER_BIT);
148 bool sync_cpu = job->serialize & V3DV_BARRIER_CPU_BIT;
149
150 *count = n_syncs;
151 if (sync_cl)
152 (*count)++;
153 if (sync_tfu)
154 (*count)++;
155 if (sync_csd)
156 (*count)++;
157 if (sync_cpu)
158 (*count)++;
159
160 *count += wait_count;
161
162 if (!*count)
163 return NULL;
164
165 struct drm_v3d_sem *syncs =
166 vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
167 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
168
169 if (!syncs)
170 return NULL;
171
172 for (int i = 0; i < n_syncs; i++) {
173 syncs[i].handle =
174 vk_sync_as_drm_syncobj(sync_info->waits[i].sync)->syncobj;
175 }
176
177 for (int i = 0; i < wait_count; i++) {
178 syncs[n_syncs++].handle =
179 vk_sync_as_drm_syncobj(waits[i].sync)->syncobj;
180 }
181
182 if (sync_cl)
183 syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CL];
184
185 if (sync_csd)
186 syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CSD];
187
188 if (sync_tfu)
189 syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_TFU];
190
191 if (sync_cpu)
192 syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CPU];
193
194 assert(n_syncs == *count);
195 return syncs;
196 }
197
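/* Builds the array of syncobjs the job will signal on completion: the
 * signal semaphores (when signal_syncs is set) plus the last-job syncobj
 * for the given queue type, which we always signal to track queue progress.
 */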
198 static struct drm_v3d_sem *
199 set_out_syncs(struct v3dv_queue *queue,
200 struct v3dv_job *job,
201 enum v3dv_queue_type queue_sync,
202 uint32_t *count,
203 struct v3dv_submit_sync_info *sync_info,
204 bool signal_syncs)
205 {
206 struct v3dv_device *device = queue->device;
207
208 uint32_t n_vk_syncs = signal_syncs ? sync_info->signal_count : 0;
209
210 /* We always signal the syncobj from `queue->last_job_syncs` related to
211 * this v3dv_queue_type to track the last job submitted to this queue.
212 */
213 (*count) = n_vk_syncs + 1;
214
215 struct drm_v3d_sem *syncs =
216 vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
217 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
218
219 if (!syncs)
220 return NULL;
221
222 if (n_vk_syncs) {
223 for (unsigned i = 0; i < n_vk_syncs; i++) {
224 syncs[i].handle =
225 vk_sync_as_drm_syncobj(sync_info->signals[i].sync)->syncobj;
226 }
227 }
228
229 syncs[n_vk_syncs].handle = queue->last_job_syncs.syncs[queue_sync];
230
231 return syncs;
232 }
233
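/* Initializes a generic submit extension header and chains it to 'next'. */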
234 static void
235 set_ext(struct drm_v3d_extension *ext,
236 struct drm_v3d_extension *next,
237 uint32_t id,
238 uintptr_t flags)
239 {
240 ext->next = (uintptr_t)(void *)next;
241 ext->id = id;
242 ext->flags = flags;
243 }
244
245 /* This function sets the extension for multiple in/out syncobjs. When it is
246 * successful, it sets the extension id to DRM_V3D_EXT_ID_MULTI_SYNC.
247 * Otherwise, the extension id is 0, which means an out-of-memory error.
248 */
249 static void
250 set_multisync(struct drm_v3d_multi_sync *ms,
251 struct v3dv_submit_sync_info *sync_info,
252 struct vk_sync_wait *waits,
253 unsigned wait_count,
254 struct drm_v3d_extension *next,
255 struct v3dv_device *device,
256 struct v3dv_job *job,
257 enum v3dv_queue_type in_queue_sync,
258 enum v3dv_queue_type out_queue_sync,
259 enum v3d_queue wait_stage,
260 bool signal_syncs)
261 {
262 struct v3dv_queue *queue = &device->queue;
263 uint32_t out_sync_count = 0, in_sync_count = 0;
264 struct drm_v3d_sem *out_syncs = NULL, *in_syncs = NULL;
265
266 in_syncs = set_in_syncs(queue, job, in_queue_sync,
267 &in_sync_count, waits, wait_count, sync_info);
268 if (!in_syncs && in_sync_count)
269 goto fail;
270
271 out_syncs = set_out_syncs(queue, job, out_queue_sync,
272 &out_sync_count, sync_info, signal_syncs);
273
274 assert(out_sync_count > 0);
275
276 if (!out_syncs)
277 goto fail;
278
279 set_ext(&ms->base, next, DRM_V3D_EXT_ID_MULTI_SYNC, 0);
280 ms->wait_stage = wait_stage;
281 ms->out_sync_count = out_sync_count;
282 ms->out_syncs = (uintptr_t)(void *)out_syncs;
283 ms->in_sync_count = in_sync_count;
284 ms->in_syncs = (uintptr_t)(void *)in_syncs;
285
286 return;
287
288 fail:
289 if (in_syncs)
290 vk_free(&device->vk.alloc, in_syncs);
291 assert(!out_syncs);
292
293 return;
294 }
295
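/* Resets timestamp or performance queries. If the kernel exposes a CPU
 * queue we submit the reset as a CPU job so it is ordered with the rest of
 * the submission; otherwise we wait for the relevant work to complete and
 * reset the queries from user-space.
 */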
296 static VkResult
297 handle_reset_query_cpu_job(struct v3dv_queue *queue,
298 struct v3dv_job *job,
299 struct v3dv_submit_sync_info *sync_info,
300 bool signal_syncs)
301 {
302 MESA_TRACE_FUNC();
303 struct v3dv_device *device = queue->device;
304 struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset;
305 assert(info->pool);
306
307 assert(info->pool->query_type != VK_QUERY_TYPE_OCCLUSION);
308
309 if (device->pdevice->caps.cpu_queue) {
310 assert(info->first + info->count <= info->pool->query_count);
311
312 struct drm_v3d_submit_cpu submit = {0};
313 struct drm_v3d_multi_sync ms = {0};
314
315 uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
316 uintptr_t *kperfmon_ids = NULL;
317
318 if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
319 submit.bo_handle_count = 1;
320 submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle;
321
322 struct drm_v3d_reset_timestamp_query reset = {0};
323
324 set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_TIMESTAMP_QUERY, 0);
325
326 reset.count = info->count;
327 reset.offset = info->pool->queries[info->first].timestamp.offset;
328
329 for (uint32_t i = 0; i < info->count; i++) {
330 struct v3dv_query *query = &info->pool->queries[info->first + i];
331 syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
332 }
333
334 reset.syncs = (uintptr_t)(void *)syncs;
335
336 set_multisync(&ms, sync_info, NULL, 0, (void *)&reset, device, job,
337 V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
338 if (!ms.base.id) {
339 free(syncs);
340 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
341 }
342 } else {
343 assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
344 struct drm_v3d_reset_performance_query reset = {0};
345
346 set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_PERFORMANCE_QUERY, 0);
347
348 struct vk_sync_wait waits[info->count];
349 unsigned wait_count = 0;
350 for (int i = 0; i < info->count; i++) {
351 struct v3dv_query *query = &info->pool->queries[info->first + i];
352 /* Only wait for a query if we've used it, otherwise we will be
353 * waiting forever for the fence to become signaled.
354 */
355 if (query->maybe_available) {
356 waits[wait_count] = (struct vk_sync_wait){
357 .sync = query->perf.last_job_sync
358 };
359 wait_count++;
360 };
361 }
362
363 reset.count = info->count;
364 reset.nperfmons = info->pool->perfmon.nperfmons;
365
366 kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count);
367
368 for (uint32_t i = 0; i < info->count; i++) {
369 struct v3dv_query *query = &info->pool->queries[info->first + i];
370
371 syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
372 kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids;
373 }
374
375 reset.syncs = (uintptr_t)(void *)syncs;
376 reset.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids;
377
378 set_multisync(&ms, sync_info, waits, wait_count, (void *)&reset, device, job,
379 V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
380 if (!ms.base.id) {
381 free(syncs);
382 free(kperfmon_ids);
383 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
384 }
385 }
386
387 submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
388 submit.extensions = (uintptr_t)(void *)&ms;
389
390 /* From the Vulkan spec for vkCmdResetQueryPool:
391 *
392 * "This command defines an execution dependency between other query commands
393 * that reference the same query.
394 * ...
395 * The second synchronization scope includes all commands which reference the
396 * queries in queryPool indicated by firstQuery and queryCount that occur later
397 * in submission order."
398 *
399 * This means we should ensure that any timestamps after a reset don't execute before
400 * the reset. However, for timestamp queries in particular we don't have to do
401 * anything special because timestamp queries have to wait for all previously
402 * submitted work to complete before executing (which we accomplish by using
403 * V3DV_BARRIER_ALL on them) and that includes reset jobs submitted to the CPU queue.
404 */
405 int ret = v3d_ioctl(device->pdevice->render_fd,
406 DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
407
408 free(syncs);
409 free(kperfmon_ids);
410 multisync_free(device, &ms);
411
412 queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
413
414 if (ret)
415 return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
416
417 return VK_SUCCESS;
418 }
419
420 /* We are about to reset query counters in user-space so we need to make
421 * sure that the GPU is not using them.
422 */
423 if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
424 VkResult result = queue_wait_idle(queue, sync_info);
425 if (result != VK_SUCCESS)
426 return result;
427
428 v3dv_bo_wait(job->device, info->pool->timestamp.bo, OS_TIMEOUT_INFINITE);
429 }
430
431 if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
432 struct vk_sync_wait waits[info->count];
433 unsigned wait_count = 0;
434 for (int i = 0; i < info->count; i++) {
435 struct v3dv_query *query = &info->pool->queries[info->first + i];
436 /* Only wait for a query if we've used it, otherwise we will be
437 * waiting forever for the fence to become signaled.
438 */
439 if (query->maybe_available) {
440 waits[wait_count] = (struct vk_sync_wait){
441 .sync = query->perf.last_job_sync
442 };
443 wait_count++;
444 };
445 }
446
447 VkResult result = vk_sync_wait_many(&job->device->vk, wait_count, waits,
448 VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
449
450 if (result != VK_SUCCESS)
451 return result;
452 }
453
454 v3dv_reset_query_pool_cpu(job->device, info->pool, info->first, info->count);
455
456 return VK_SUCCESS;
457 }
458
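/* Exports the last CL and CSD job syncobjs as sync files and accumulates
 * them into a single fd that can later be imported into per-query syncobjs.
 */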
459 static VkResult
460 export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job, int *fd)
461 {
462 int err;
463 static const enum v3dv_queue_type queues_to_sync[] = {
464 V3DV_QUEUE_CL,
465 V3DV_QUEUE_CSD,
466 };
467
468 for (uint32_t i = 0; i < ARRAY_SIZE(queues_to_sync); i++) {
469 enum v3dv_queue_type queue_type = queues_to_sync[i];
470 int tmp_fd = -1;
471
472 err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
473 queue->last_job_syncs.syncs[queue_type],
474 &tmp_fd);
475
476 if (err) {
477 close(*fd);
478 return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
479 "sync file export failed: %m");
480 }
481
482 err = sync_accumulate("v3dv", fd, tmp_fd);
483
484 if (err) {
485 close(tmp_fd);
486 close(*fd);
487 return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
488 "failed to accumulate sync files: %m");
489 }
490 }
491
492 return VK_SUCCESS;
493 }
494
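/* Finishes a performance query: imports the accumulated last-job sync file
 * into each query's syncobj and flags the queries as maybe available.
 */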
495 static VkResult
496 handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx)
497 {
498 MESA_TRACE_FUNC();
499 VkResult result = VK_SUCCESS;
500
501 mtx_lock(&job->device->query_mutex);
502
503 struct v3dv_end_query_info *info = &job->cpu.query_end;
504 struct v3dv_queue *queue = &job->device->queue;
505
506 int err = 0;
507 int fd = -1;
508
509 assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
510
511 if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
512 result = export_perfmon_last_job_sync(queue, job, &fd);
513
514 if (result != VK_SUCCESS)
515 goto fail;
516
517 assert(fd >= 0);
518 }
519
520 for (uint32_t i = 0; i < info->count; i++) {
521 assert(info->query + i < info->pool->query_count);
522 struct v3dv_query *query = &info->pool->queries[info->query + i];
523
524 if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
525 uint32_t syncobj = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
526 err = drmSyncobjImportSyncFile(job->device->pdevice->render_fd,
527 syncobj, fd);
528
529 if (err) {
530 result = vk_errorf(queue, VK_ERROR_UNKNOWN,
531 "sync file import failed: %m");
532 goto fail;
533 }
534 }
535
536 query->maybe_available = true;
537 }
538
539 fail:
540 if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
541 close(fd);
542
543 cnd_broadcast(&job->device->query_ended);
544 mtx_unlock(&job->device->query_mutex);
545
546 return result;
547 }
548
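/* Copies timestamp or performance query results into a destination buffer,
 * either through a kernel CPU-queue job when available, or with a CPU-side
 * copy after waiting for the queries to complete.
 */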
549 static VkResult
550 handle_copy_query_results_cpu_job(struct v3dv_queue *queue,
551 struct v3dv_job *job,
552 struct v3dv_submit_sync_info *sync_info,
553 bool signal_syncs)
554 {
555 MESA_TRACE_FUNC();
556 struct v3dv_device *device = queue->device;
557 struct v3dv_copy_query_results_cpu_job_info *info =
558 &job->cpu.query_copy_results;
559
560 assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
561 info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
562
563 assert(info->dst && info->dst->mem && info->dst->mem->bo);
564 struct v3dv_bo *bo = info->dst->mem->bo;
565
566 if (device->pdevice->caps.cpu_queue) {
567 struct drm_v3d_submit_cpu submit = {0};
568 struct drm_v3d_multi_sync ms = {0};
569
570 uint32_t *offsets = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
571 uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
572 uint32_t *bo_handles = NULL;
573 uintptr_t *kperfmon_ids = NULL;
574
575 if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
576 submit.bo_handle_count = 2;
577
578 bo_handles = (uint32_t *)
579 malloc(sizeof(uint32_t) * submit.bo_handle_count);
580
581 bo_handles[0] = bo->handle;
582 bo_handles[1] = info->pool->timestamp.bo->handle;
583 submit.bo_handles = (uintptr_t)(void *)bo_handles;
584
585 struct drm_v3d_copy_timestamp_query copy = {0};
586
587 set_ext(&copy.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_TIMESTAMP_QUERY, 0);
588
589 copy.do_64bit = info->flags & VK_QUERY_RESULT_64_BIT;
590 copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT;
591 copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
592 copy.offset = info->offset + info->dst->mem_offset;
593 copy.stride = info->stride;
594 copy.count = info->count;
595
596 for (uint32_t i = 0; i < info->count; i++) {
597 assert(info->first < info->pool->query_count);
598 assert(info->first + info->count <= info->pool->query_count);
599 struct v3dv_query *query = &info->pool->queries[info->first + i];
600
601 offsets[i] = query->timestamp.offset;
602 syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
603 }
604
605 copy.offsets = (uintptr_t)(void *)offsets;
606 copy.syncs = (uintptr_t)(void *)syncs;
607
608 set_multisync(&ms, sync_info, NULL, 0, (void *)&copy, device, job,
609 V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
610 if (!ms.base.id) {
611 free(bo_handles);
612 free(offsets);
613 free(syncs);
614 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
615 }
616 } else {
617 assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
618
619 submit.bo_handle_count = 1;
620 submit.bo_handles = (uintptr_t)(void *)&bo->handle;
621
622 struct drm_v3d_copy_performance_query copy = {0};
623
624 set_ext(&copy.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_PERFORMANCE_QUERY, 0);
625
626 /* If the queryPool was created with VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR,
627 * results for each query are written as an array of the type indicated
628 * by VkPerformanceCounterKHR::storage for the counter being queried.
629 * For v3dv, VkPerformanceCounterKHR::storage is
630 * VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR.
631 */
632 copy.do_64bit = true;
633 copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT;
634 copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
635 copy.offset = info->offset + info->dst->mem_offset;
636 copy.stride = info->stride;
637 copy.count = info->count;
638 copy.nperfmons = info->pool->perfmon.nperfmons;
639 copy.ncounters = info->pool->perfmon.ncounters;
640
641 kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count);
642
643 struct vk_sync_wait waits[info->count];
644 unsigned wait_count = 0;
645
646 for (uint32_t i = 0; i < info->count; i++) {
647 assert(info->first < info->pool->query_count);
648 assert(info->first + info->count <= info->pool->query_count);
649 struct v3dv_query *query = &info->pool->queries[info->first + i];
650
651 syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
652 kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids;
653
654 if (info->flags & VK_QUERY_RESULT_WAIT_BIT) {
655 waits[wait_count] = (struct vk_sync_wait){
656 .sync = query->perf.last_job_sync
657 };
658 wait_count++;
659 }
660 }
661
662 copy.syncs = (uintptr_t)(void *)syncs;
663 copy.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids;
664
665 set_multisync(&ms, sync_info, waits, wait_count, (void *)&copy, device, job,
666 V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
667 if (!ms.base.id) {
668 free(kperfmon_ids);
669 free(bo_handles);
670 free(offsets);
671 free(syncs);
672 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
673 }
674 }
675
676 submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
677 submit.extensions = (uintptr_t)(void *)&ms;
678
679 int ret = v3d_ioctl(device->pdevice->render_fd,
680 DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
681
682 free(kperfmon_ids);
683 free(bo_handles);
684 free(offsets);
685 free(syncs);
686 multisync_free(device, &ms);
687
688 queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
689
690 if (ret)
691 return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
692
693 return VK_SUCCESS;
694 }
695
696 /* Map the entire dst buffer for the CPU copy if needed */
697 assert(!bo->map || bo->map_size == bo->size);
698 if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
699 return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
700
701 uint8_t *offset = ((uint8_t *) bo->map) +
702 info->offset + info->dst->mem_offset;
703 v3dv_get_query_pool_results_cpu(job->device,
704 info->pool,
705 info->first,
706 info->count,
707 offset,
708 info->stride,
709 info->flags);
710
711 return VK_SUCCESS;
712 }
713
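/* Writes timestamp query results. Without a kernel CPU queue we wait for
 * idle and write the timestamp from user-space; otherwise we submit a
 * serialized CPU job that records the timestamp on the kernel side.
 */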
714 static VkResult
715 handle_timestamp_query_cpu_job(struct v3dv_queue *queue,
716 struct v3dv_job *job,
717 struct v3dv_submit_sync_info *sync_info,
718 bool signal_syncs)
719 {
720 MESA_TRACE_FUNC();
721 struct v3dv_device *device = queue->device;
722
723 assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY);
724 struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp;
725
726 if (!device->pdevice->caps.cpu_queue) {
727 /* Wait for completion of all work queued before the timestamp query */
728 VkResult result = queue_wait_idle(queue, sync_info);
729 if (result != VK_SUCCESS)
730 return result;
731
732 mtx_lock(&job->device->query_mutex);
733
734 /* Compute timestamp */
735 struct timespec t;
736 clock_gettime(CLOCK_MONOTONIC, &t);
737
738 for (uint32_t i = 0; i < info->count; i++) {
739 assert(info->query + i < info->pool->query_count);
740 struct v3dv_query *query = &info->pool->queries[info->query + i];
741 query->maybe_available = true;
742
743 /* Value */
744 uint8_t *value_addr =
745 ((uint8_t *) info->pool->timestamp.bo->map) + query->timestamp.offset;
746 *((uint64_t*)value_addr) = (i == 0) ? t.tv_sec * 1000000000ull + t.tv_nsec : 0ull;
747
748 /* Availability */
749 result = vk_sync_signal(&job->device->vk, query->timestamp.sync, 0);
750 }
751
752 cnd_broadcast(&job->device->query_ended);
753 mtx_unlock(&job->device->query_mutex);
754
755 return result;
756 }
757
758 struct drm_v3d_submit_cpu submit = {0};
759
760 submit.bo_handle_count = 1;
761 submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle;
762
763 struct drm_v3d_timestamp_query timestamp = {0};
764
765 set_ext(&timestamp.base, NULL, DRM_V3D_EXT_ID_CPU_TIMESTAMP_QUERY, 0);
766
767 timestamp.count = info->count;
768
769 uint32_t *offsets =
770 (uint32_t *) malloc(sizeof(uint32_t) * info->count);
771 uint32_t *syncs =
772 (uint32_t *) malloc(sizeof(uint32_t) * info->count);
773
774 for (uint32_t i = 0; i < info->count; i++) {
775 assert(info->query + i < info->pool->query_count);
776 struct v3dv_query *query = &info->pool->queries[info->query + i];
777 query->maybe_available = true;
778
779 offsets[i] = query->timestamp.offset;
780 syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
781 }
782
783 timestamp.offsets = (uintptr_t)(void *)offsets;
784 timestamp.syncs = (uintptr_t)(void *)syncs;
785
786 struct drm_v3d_multi_sync ms = {0};
787
788 /* The CPU job should be serialized so it only executes after all previously
789 * submitted work has completed
790 */
791 job->serialize = V3DV_BARRIER_ALL;
792
793 set_multisync(&ms, sync_info, NULL, 0, (void *)&timestamp, device, job,
794 V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
795 if (!ms.base.id) {
796 free(offsets);
797 free(syncs);
798 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
799 }
800
801 submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
802 submit.extensions = (uintptr_t)(void *)&ms;
803
804 int ret = v3d_ioctl(device->pdevice->render_fd,
805 DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
806
807 free(offsets);
808 free(syncs);
809 multisync_free(device, &ms);
810
811 queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
812
813 if (ret)
814 return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
815
816 return VK_SUCCESS;
817 }
818
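/* Handles an indirect compute dispatch. Without a kernel CPU queue we read
 * the dispatch parameters from the indirect buffer on the CPU and patch the
 * CSD job; otherwise we submit a CPU job that lets the kernel rewrite the
 * workgroup counts before launching the compute job.
 */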
819 static VkResult
820 handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
821 struct v3dv_job *job,
822 struct v3dv_submit_sync_info *sync_info,
823 bool signal_syncs)
824 {
825 MESA_TRACE_FUNC();
826 struct v3dv_device *device = queue->device;
827
828 assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
829 struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect;
830 assert(info->csd_job);
831
832 assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
833 struct v3dv_bo *bo = info->buffer->mem->bo;
834
835 if (!device->pdevice->caps.cpu_queue) {
836 /* Make sure the GPU is no longer using the indirect buffer */
837 v3dv_bo_wait(queue->device, bo, OS_TIMEOUT_INFINITE);
838
839 /* Map the indirect buffer and read the dispatch parameters */
840 if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
841 return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
842 assert(bo->map);
843
844 const uint32_t offset = info->buffer->mem_offset + info->offset;
845 const uint32_t *group_counts = (uint32_t *) (bo->map + offset);
846 if (group_counts[0] == 0 || group_counts[1] == 0 || group_counts[2] == 0)
847 return VK_SUCCESS;
848
849 if (memcmp(group_counts, info->csd_job->csd.wg_count,
850 sizeof(info->csd_job->csd.wg_count)) != 0) {
851 v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts);
852 }
853
854 return VK_SUCCESS;
855 }
856
857 struct v3dv_job *csd_job = info->csd_job;
858
859 struct drm_v3d_submit_cpu submit = {0};
860
861 submit.bo_handle_count = 1;
862 submit.bo_handles = (uintptr_t)(void *)&bo->handle;
863
864 csd_job->csd.submit.bo_handle_count = csd_job->bo_count;
865 uint32_t *bo_handles = (uint32_t *) malloc(sizeof(uint32_t) * csd_job->bo_count);
866 uint32_t bo_idx = 0;
867 set_foreach (csd_job->bos, entry) {
868 struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
869 bo_handles[bo_idx++] = bo->handle;
870 }
871 csd_job->csd.submit.bo_handles = (uintptr_t)(void *)bo_handles;
872
873 struct drm_v3d_indirect_csd indirect = {0};
874
875 set_ext(&indirect.base, NULL, DRM_V3D_EXT_ID_CPU_INDIRECT_CSD, 0);
876
877 indirect.submit = csd_job->csd.submit;
878 indirect.offset = info->buffer->mem_offset + info->offset;
879 indirect.wg_size = info->wg_size;
880
881 for (int i = 0; i < 3; i++) {
882 if (info->wg_uniform_offsets[i]) {
883 assert(info->wg_uniform_offsets[i] >= (uint32_t *) csd_job->indirect.base);
884 indirect.wg_uniform_offsets[i] = info->wg_uniform_offsets[i] - (uint32_t *) csd_job->indirect.base;
885 } else {
886 indirect.wg_uniform_offsets[i] = 0xffffffff; /* No rewrite */
887 }
888 }
889
890 indirect.indirect = csd_job->indirect.bo->handle;
891
892 struct drm_v3d_multi_sync ms = {0};
893
894 /* We need to configure the semaphores of this job with the indirect
895 * CSD job, as the CPU job must obey the CSD job's synchronization
896 * demands, such as barriers.
897 */
898 set_multisync(&ms, sync_info, NULL, 0, (void *)&indirect, device, csd_job,
899 V3DV_QUEUE_CPU, V3DV_QUEUE_CSD, V3D_CPU, signal_syncs);
900 if (!ms.base.id)
901 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
902
903 submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
904 submit.extensions = (uintptr_t)(void *)&ms;
905
906 int ret = v3d_ioctl(device->pdevice->render_fd,
907 DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
908
909 free(bo_handles);
910 multisync_free(device, &ms);
911
912 queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
913 queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;
914
915 if (ret)
916 return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
917
918 return VK_SUCCESS;
919 }
920
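/* Submits a binning/render (CL) job: sets up the BO list, decides whether
 * the job needs to sync at the binning or render stage, and submits it
 * with the multisync extension.
 */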
921 static VkResult
922 handle_cl_job(struct v3dv_queue *queue,
923 struct v3dv_job *job,
924 uint32_t counter_pass_idx,
925 struct v3dv_submit_sync_info *sync_info,
926 bool signal_syncs)
927 {
928 MESA_TRACE_FUNC();
929 struct v3dv_device *device = queue->device;
930
931 struct drm_v3d_submit_cl submit = { 0 };
932
933 /* Sanity check: we should only flag a bcl sync on a job that needs to be
934 * serialized.
935 */
936 assert(job->serialize || !job->needs_bcl_sync);
937
938 /* We expect to have just one RCL per job which should fit in just one BO.
939 * Our BCL, however, could chain multiple BOs together.
940 */
941 assert(list_length(&job->rcl.bo_list) == 1);
942 assert(list_length(&job->bcl.bo_list) >= 1);
943 struct v3dv_bo *bcl_first_bo =
944 list_first_entry(&job->bcl.bo_list, struct v3dv_bo, list_link);
945 submit.bcl_start = bcl_first_bo->offset;
946 submit.bcl_end = job->suspending ? job->suspended_bcl_end :
947 job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
948 submit.rcl_start = job->rcl.bo->offset;
949 submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);
950
951 submit.qma = job->tile_alloc->offset;
952 submit.qms = job->tile_alloc->size;
953 submit.qts = job->tile_state->offset;
954
955 submit.flags = 0;
956 if (job->tmu_dirty_rcl)
957 submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;
958
959 /* If the job uses VK_KHR_buffer_device_address we need to ensure all
960 * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
961 * are included.
962 */
963 if (job->uses_buffer_device_address) {
964 util_dynarray_foreach(&queue->device->device_address_bo_list,
965 struct v3dv_bo *, bo) {
966 v3dv_job_add_bo(job, *bo);
967 }
968 }
969
970 submit.bo_handle_count = job->bo_count;
971 uint32_t *bo_handles =
972 (uint32_t *) malloc(sizeof(uint32_t) * submit.bo_handle_count);
973 uint32_t bo_idx = 0;
974 set_foreach(job->bos, entry) {
975 struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
976 bo_handles[bo_idx++] = bo->handle;
977 }
978 assert(bo_idx == submit.bo_handle_count);
979 submit.bo_handles = (uintptr_t)(void *)bo_handles;
980
981 submit.perfmon_id = job->perf ?
982 job->perf->kperfmon_ids[counter_pass_idx] : 0;
983 const bool needs_perf_sync = queue->last_perfmon_id != submit.perfmon_id;
984 queue->last_perfmon_id = submit.perfmon_id;
985
986 /* We need a binning sync if we are the first CL job waiting on a semaphore
987 * with a wait stage that involves the geometry pipeline, or if the job
988 * comes after a pipeline barrier that involves geometry stages
989 * (needs_bcl_sync) or when performance queries are in use.
990 *
991 * We need a render sync if the job doesn't need a binning sync but has
992 * still been flagged for serialization. It should be noted that RCL jobs
993 * don't start until the previous RCL job has finished so we don't really
994 * need to add a fence for those, however, we might need to wait on a CSD or
995 * TFU job, which are not automatically serialized with CL jobs.
996 */
997 bool needs_bcl_sync = job->needs_bcl_sync || needs_perf_sync;
998 if (queue->last_job_syncs.first[V3DV_QUEUE_CL]) {
999 for (int i = 0; !needs_bcl_sync && i < sync_info->wait_count; i++) {
1000 needs_bcl_sync = sync_info->waits[i].stage_mask &
1001 (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
1002 VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
1003 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
1004 VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
1005 VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
1006 VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
1007 VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
1008 VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
1009 VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
1010 VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
1011 VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
1012 VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT);
1013 }
1014 }
1015
1016 bool needs_rcl_sync = job->serialize && !needs_bcl_sync;
1017
1018 /* Replace single semaphore settings whenever our kernel driver supports
1019 * the multiple semaphores extension.
1020 */
1021 struct drm_v3d_multi_sync ms = { 0 };
1022 enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN;
1023 set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
1024 V3DV_QUEUE_CL, V3DV_QUEUE_CL, wait_stage, signal_syncs);
1025 if (!ms.base.id) {
1026 free(bo_handles);
1027 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1028 }
1029
1030 submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
1031 submit.extensions = (uintptr_t)(void *)&ms;
1032
1033 /* We are using multisync so disable legacy single-sync interface */
1034 submit.in_sync_rcl = 0;
1035 submit.in_sync_bcl = 0;
1036 submit.out_sync = 0;
1037
1038 v3dv_clif_dump(device, job, &submit);
1039 int ret = v3d_ioctl(device->pdevice->render_fd,
1040 DRM_IOCTL_V3D_SUBMIT_CL, &submit);
1041
1042 static bool warned = false;
1043 if (ret && !warned) {
1044 mesa_loge("Draw call returned %s. Expect corruption.\n",
1045 strerror(errno));
1046 warned = true;
1047 }
1048
1049 free(bo_handles);
1050 multisync_free(device, &ms);
1051
1052 queue->last_job_syncs.first[V3DV_QUEUE_CL] = false;
1053
1054 if (ret)
1055 return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CL failed: %m");
1056
1057 return VK_SUCCESS;
1058 }
1059
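/* Submits a TFU (texture formatting unit) job with the multisync extension. */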
1060 static VkResult
1061 handle_tfu_job(struct v3dv_queue *queue,
1062 struct v3dv_job *job,
1063 struct v3dv_submit_sync_info *sync_info,
1064 bool signal_syncs)
1065 {
1066 MESA_TRACE_FUNC();
1067 assert(!V3D_DBG(DISABLE_TFU));
1068
1069 struct v3dv_device *device = queue->device;
1070
1071 /* Replace single semaphore settings whenever our kernel driver supports
1072 * the multiple semaphores extension.
1073 */
1074 struct drm_v3d_multi_sync ms = { 0 };
1075 set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
1076 V3DV_QUEUE_TFU, V3DV_QUEUE_TFU, V3D_TFU, signal_syncs);
1077 if (!ms.base.id)
1078 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1079
1080 job->tfu.flags |= DRM_V3D_SUBMIT_EXTENSION;
1081 job->tfu.extensions = (uintptr_t)(void *)&ms;
1082
1083 /* We are using multisync so disable legacy single-sync interface */
1084 job->tfu.in_sync = 0;
1085 job->tfu.out_sync = 0;
1086
1087 int ret = v3d_ioctl(device->pdevice->render_fd,
1088 DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);
1089
1090 multisync_free(device, &ms);
1091 queue->last_job_syncs.first[V3DV_QUEUE_TFU] = false;
1092
1093 if (ret != 0)
1094 return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_TFU failed: %m");
1095
1096 return VK_SUCCESS;
1097 }
1098
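/* Submits a CSD (compute) job: collects the job's BOs, fills in the
 * multisync extension and perfmon id, and calls the CSD submit ioctl.
 */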
1099 static VkResult
1100 handle_csd_job(struct v3dv_queue *queue,
1101 struct v3dv_job *job,
1102 uint32_t counter_pass_idx,
1103 struct v3dv_submit_sync_info *sync_info,
1104 bool signal_syncs)
1105 {
1106 MESA_TRACE_FUNC();
1107 struct v3dv_device *device = queue->device;
1108
1109 struct drm_v3d_submit_csd *submit = &job->csd.submit;
1110
1111 /* If the job uses VK_KHR_buffer_device_address we need to ensure all
1112 * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
1113 * are included.
1114 */
1115 if (job->uses_buffer_device_address) {
1116 util_dynarray_foreach(&queue->device->device_address_bo_list,
1117 struct v3dv_bo *, bo) {
1118 v3dv_job_add_bo(job, *bo);
1119 }
1120 }
1121
1122 submit->bo_handle_count = job->bo_count;
1123 uint32_t *bo_handles =
1124 (uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2));
1125 uint32_t bo_idx = 0;
1126 set_foreach(job->bos, entry) {
1127 struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
1128 bo_handles[bo_idx++] = bo->handle;
1129 }
1130 assert(bo_idx == submit->bo_handle_count);
1131 submit->bo_handles = (uintptr_t)(void *)bo_handles;
1132
1133 /* Replace single semaphore settings whenever our kernel driver supports
1134 * the multiple semaphores extension.
1135 */
1136 struct drm_v3d_multi_sync ms = { 0 };
1137 set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
1138 V3DV_QUEUE_CSD, V3DV_QUEUE_CSD, V3D_CSD, signal_syncs);
1139 if (!ms.base.id)
1140 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1141
1142 submit->flags |= DRM_V3D_SUBMIT_EXTENSION;
1143 submit->extensions = (uintptr_t)(void *)&ms;
1144
1145 /* We are using multisync so disable legacy single-sync interface */
1146 submit->in_sync = 0;
1147 submit->out_sync = 0;
1148
1149 submit->perfmon_id = job->perf ?
1150 job->perf->kperfmon_ids[counter_pass_idx] : 0;
1151 queue->last_perfmon_id = submit->perfmon_id;
1152
1153 int ret = v3d_ioctl(device->pdevice->render_fd,
1154 DRM_IOCTL_V3D_SUBMIT_CSD, submit);
1155
1156 static bool warned = false;
1157 if (ret && !warned) {
1158 mesa_loge("Compute dispatch returned %s. Expect corruption.\n",
1159 strerror(errno));
1160 warned = true;
1161 }
1162
1163 free(bo_handles);
1164
1165 multisync_free(device, &ms);
1166 queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;
1167
1168 if (ret)
1169 return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CSD failed: %m");
1170
1171 return VK_SUCCESS;
1172 }
1173
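/* Dispatches a job to the handler matching its type. */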
1174 static VkResult
1175 queue_handle_job(struct v3dv_queue *queue,
1176 struct v3dv_job *job,
1177 uint32_t counter_pass_idx,
1178 struct v3dv_submit_sync_info *sync_info,
1179 bool signal_syncs)
1180 {
1181 switch (job->type) {
1182 case V3DV_JOB_TYPE_GPU_CL:
1183 return handle_cl_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
1184 case V3DV_JOB_TYPE_GPU_TFU:
1185 return handle_tfu_job(queue, job, sync_info, signal_syncs);
1186 case V3DV_JOB_TYPE_GPU_CSD:
1187 return handle_csd_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
1188 case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
1189 return handle_reset_query_cpu_job(queue, job, sync_info, signal_syncs);
1190 case V3DV_JOB_TYPE_CPU_END_QUERY:
1191 return handle_end_query_cpu_job(job, counter_pass_idx);
1192 case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
1193 return handle_copy_query_results_cpu_job(queue, job, sync_info, signal_syncs);
1194 case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
1195 return handle_csd_indirect_cpu_job(queue, job, sync_info, signal_syncs);
1196 case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY:
1197 return handle_timestamp_query_cpu_job(queue, job, sync_info, signal_syncs);
1198 default:
1199 unreachable("Unhandled job type");
1200 }
1201 }
1202
1203 static VkResult
1204 queue_create_noop_job(struct v3dv_queue *queue)
1205 {
1206 struct v3dv_device *device = queue->device;
1207 queue->noop_job = vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_job), 8,
1208 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1209 if (!queue->noop_job)
1210 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1211 v3dv_job_init(queue->noop_job, V3DV_JOB_TYPE_GPU_CL, device, NULL, -1);
1212
1213 v3d_X((&device->devinfo), job_emit_noop)(queue->noop_job);
1214
1215 /* We use no-op jobs to signal semaphores/fences. These jobs need to be
1216 * serialized across all hw queues to comply with Vulkan's signal operation
1217 * order requirements, which basically require that signal operations occur
1218 * in submission order.
1219 */
1220 queue->noop_job->serialize = V3DV_BARRIER_ALL;
1221
1222 return VK_SUCCESS;
1223 }
1224
1225 static VkResult
1226 queue_submit_noop_job(struct v3dv_queue *queue,
1227 uint32_t counter_pass_idx,
1228 struct v3dv_submit_sync_info *sync_info,
1229 bool signal_syncs)
1230 {
1231 if (!queue->noop_job) {
1232 VkResult result = queue_create_noop_job(queue);
1233 if (result != VK_SUCCESS)
1234 return result;
1235 }
1236
1237 assert(queue->noop_job);
1238 return queue_handle_job(queue, queue->noop_job, counter_pass_idx,
1239 sync_info, signal_syncs);
1240 }
1241
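/* Main queue submission entry point: walks the jobs in each command buffer,
 * stitching suspend/resume jobs together, and submits a final no-op job to
 * signal semaphores in submission order when needed.
 */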
1242 VkResult
1243 v3dv_queue_driver_submit(struct vk_queue *vk_queue,
1244 struct vk_queue_submit *submit)
1245 {
1246 MESA_TRACE_FUNC();
1247 struct v3dv_queue *queue = container_of(vk_queue, struct v3dv_queue, vk);
1248 VkResult result;
1249
1250 struct v3dv_submit_sync_info sync_info = {
1251 .wait_count = submit->wait_count,
1252 .waits = submit->waits,
1253 .signal_count = submit->signal_count,
1254 .signals = submit->signals,
1255 };
1256
1257 for (int i = 0; i < V3DV_QUEUE_COUNT; i++)
1258 queue->last_job_syncs.first[i] = true;
1259
1260 struct v3dv_job *first_suspend_job = NULL;
1261 struct v3dv_job *current_suspend_job = NULL;
1262 for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
1263 struct v3dv_cmd_buffer *cmd_buffer =
1264 container_of(submit->command_buffers[i], struct v3dv_cmd_buffer, vk);
1265 list_for_each_entry_safe(struct v3dv_job, job,
1266 &cmd_buffer->jobs, list_link) {
1267 if (job->suspending) {
1268 job = v3d_X((&job->device->devinfo),
1269 cmd_buffer_prepare_suspend_job_for_submit)(job);
1270 if (!job)
1271 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1272 }
1273
1274 if (job->suspending && !job->resuming) {
1275 assert(!first_suspend_job);
1276 assert(!current_suspend_job);
1277 first_suspend_job = job;
1278 }
1279
1280 if (job->resuming) {
1281 assert(first_suspend_job);
1282 assert(current_suspend_job);
1283 v3d_X((&job->device->devinfo), job_patch_resume_address)(first_suspend_job,
1284 current_suspend_job,
1285 job);
1286 current_suspend_job = NULL;
1287 }
1288
1289 if (job->suspending) {
1290 current_suspend_job = job;
1291 } else {
1292 assert(!current_suspend_job);
1293 struct v3dv_job *submit_job = first_suspend_job ?
1294 first_suspend_job : job;
1295 result =
1296 queue_handle_job(queue, submit_job, submit->perf_pass_index,
1297 &sync_info, false);
1298
1299 if (result != VK_SUCCESS)
1300 return result;
1301
1302 first_suspend_job = NULL;
1303 }
1304 }
1305
1306 /* If the command buffer ends with a barrier we need to consume it now.
1307 *
1308 * FIXME: this will drain all hw queues. Instead, we could use the pending
1309 * barrier state to limit the queues we serialize against.
1310 */
1311 if (cmd_buffer->state.barrier.dst_mask) {
1312 result = queue_submit_noop_job(queue, submit->perf_pass_index,
1313 &sync_info, false);
1314 if (result != VK_SUCCESS)
1315 return result;
1316 }
1317 }
1318
1319 assert(!first_suspend_job);
1320 assert(!current_suspend_job);
1321
1322 /* Handle signaling now */
1323 if (submit->signal_count > 0) {
1324 /* Finish by submitting a no-op job that synchronizes across all queues.
1325 * This will ensure that the signal semaphores don't get triggered until
1326 * all work on any queue completes. See Vulkan's signal operation order
1327 * requirements.
1328 */
1329 return queue_submit_noop_job(queue, submit->perf_pass_index,
1330 &sync_info, true);
1331 }
1332
1333 return VK_SUCCESS;
1334 }
1335
1336 VKAPI_ATTR VkResult VKAPI_CALL
1337 v3dv_QueueBindSparse(VkQueue _queue,
1338 uint32_t bindInfoCount,
1339 const VkBindSparseInfo *pBindInfo,
1340 VkFence fence)
1341 {
1342 V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
1343 return vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT);
1344 }
1345