/*
 * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "drm-uapi/v3d_drm.h"

#include "broadcom/clif/clif_dump.h"
#include "util/libsync.h"
#include "util/os_time.h"
#include "util/perf/cpu_trace.h"
#include "vk_drm_syncobj.h"

#include <errno.h>
#include <time.h>

static void
v3dv_clif_dump(struct v3dv_device *device,
               struct v3dv_job *job,
               struct drm_v3d_submit_cl *submit)
{
   if (!(V3D_DBG(CL) ||
         V3D_DBG(CL_NO_BIN) ||
         V3D_DBG(CLIF)))
      return;

   struct clif_dump *clif = clif_dump_init(&device->devinfo,
                                           stderr,
                                           V3D_DBG(CL) ||
                                           V3D_DBG(CL_NO_BIN),
                                           V3D_DBG(CL_NO_BIN));

   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (void *)entry->key;
      char *name = ralloc_asprintf(NULL, "%s_0x%x",
                                   bo->name, bo->offset);

      bool ok = v3dv_bo_map(device, bo, bo->size);
      if (!ok) {
         mesa_loge("failed to map BO for clif_dump.\n");
         ralloc_free(name);
         goto free_clif;
      }
      clif_dump_add_bo(clif, name, bo->offset, bo->size, bo->map);

      ralloc_free(name);
   }

   clif_dump(clif, submit);

 free_clif:
   clif_dump_destroy(clif);
}

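/* Drains the queue: waits until the per-queue-type "last job" syncobjs have
 * all signaled (the hard-coded 4 below appears to correspond to the number
 * of v3dv queue types: CL, TFU, CSD and CPU) and, when no job has been
 * submitted yet in this batch, also waits on the submit's wait semaphores.
 */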
static VkResult
queue_wait_idle(struct v3dv_queue *queue,
                struct v3dv_submit_sync_info *sync_info)
{
   int ret = drmSyncobjWait(queue->device->pdevice->render_fd,
                            queue->last_job_syncs.syncs, 4,
                            INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
                            NULL);
   if (ret)
      return vk_errorf(queue, VK_ERROR_DEVICE_LOST, "syncobj wait failed: %m");

   bool first = true;
   for (int i = 0; i < 4; i++) {
      if (!queue->last_job_syncs.first[i])
         first = false;
   }

   /* If we're not the first job, that means we're waiting on some
    * per-queue-type syncobj which transitively waited on the semaphores
    * so we can skip the semaphore wait.
    */
   if (first) {
      VkResult result = vk_sync_wait_many(&queue->device->vk,
                                          sync_info->wait_count,
                                          sync_info->waits,
                                          VK_SYNC_WAIT_COMPLETE,
                                          UINT64_MAX);
      if (result != VK_SUCCESS)
         return result;
   }

   for (int i = 0; i < 4; i++)
      queue->last_job_syncs.first[i] = false;

   return VK_SUCCESS;
}

static void
multisync_free(struct v3dv_device *device,
               struct drm_v3d_multi_sync *ms)
{
   vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->out_syncs);
   vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->in_syncs);
}

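/* Builds the array of DRM syncobjs the kernel must wait on before running
 * this job: the submit's wait semaphores (only for the first job of each
 * queue type in the batch), any extra waits passed in by the caller, and
 * the last-job syncobjs of the queues this job serializes against.
 */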
static struct drm_v3d_sem *
set_in_syncs(struct v3dv_queue *queue,
             struct v3dv_job *job,
             enum v3dv_queue_type queue_sync,
             uint32_t *count,
             struct vk_sync_wait *waits,
             unsigned wait_count,
             struct v3dv_submit_sync_info *sync_info)
{
   struct v3dv_device *device = queue->device;
   uint32_t n_syncs = 0;

   /* If this is the first job submitted to a given GPU queue in this cmd buf
    * batch, it has to wait on wait semaphores (if any) before running.
    */
   if (queue->last_job_syncs.first[queue_sync])
      n_syncs = sync_info->wait_count;

   /* If the serialize flag is set the job needs to be serialized in the
    * corresponding queues. Notice that we may implement transfer operations
    * as either CL or TFU jobs.
    *
    * FIXME: maybe we could track more precisely if the source of a transfer
    * barrier is a CL and/or a TFU job.
    */
   bool sync_csd  = job->serialize & V3DV_BARRIER_COMPUTE_BIT;
   bool sync_tfu  = job->serialize & V3DV_BARRIER_TRANSFER_BIT;
   bool sync_cl   = job->serialize & (V3DV_BARRIER_GRAPHICS_BIT |
                                      V3DV_BARRIER_TRANSFER_BIT);
   bool sync_cpu  = job->serialize & V3DV_BARRIER_CPU_BIT;

   *count = n_syncs;
   if (sync_cl)
      (*count)++;
   if (sync_tfu)
      (*count)++;
   if (sync_csd)
      (*count)++;
   if (sync_cpu)
      (*count)++;

   *count += wait_count;

   if (!*count)
      return NULL;

   struct drm_v3d_sem *syncs =
      vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   if (!syncs)
      return NULL;

   for (int i = 0; i < n_syncs; i++) {
      syncs[i].handle =
         vk_sync_as_drm_syncobj(sync_info->waits[i].sync)->syncobj;
   }

   for (int i = 0; i < wait_count; i++) {
      syncs[n_syncs++].handle =
         vk_sync_as_drm_syncobj(waits[i].sync)->syncobj;
   }

   if (sync_cl)
      syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CL];

   if (sync_csd)
      syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CSD];

   if (sync_tfu)
      syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_TFU];

   if (sync_cpu)
      syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CPU];

   assert(n_syncs == *count);
   return syncs;
}

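/* Builds the array of DRM syncobjs the kernel will signal when the job
 * completes: the submit's signal semaphores (when requested) plus the
 * per-queue-type "last job" syncobj used to track submission order.
 */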
static struct drm_v3d_sem *
set_out_syncs(struct v3dv_queue *queue,
              struct v3dv_job *job,
              enum v3dv_queue_type queue_sync,
              uint32_t *count,
              struct v3dv_submit_sync_info *sync_info,
              bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   uint32_t n_vk_syncs = signal_syncs ? sync_info->signal_count : 0;

   /* We always signal the syncobj from `queue->last_job_syncs` related to
    * this v3dv_queue_type to track the last job submitted to this queue.
    */
   (*count) = n_vk_syncs + 1;

   struct drm_v3d_sem *syncs =
      vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   if (!syncs)
      return NULL;

   if (n_vk_syncs) {
      for (unsigned i = 0; i < n_vk_syncs; i++) {
         syncs[i].handle =
            vk_sync_as_drm_syncobj(sync_info->signals[i].sync)->syncobj;
      }
   }

   syncs[n_vk_syncs].handle = queue->last_job_syncs.syncs[queue_sync];

   return syncs;
}

static void
set_ext(struct drm_v3d_extension *ext,
        struct drm_v3d_extension *next,
        uint32_t id,
        uintptr_t flags)
{
   ext->next = (uintptr_t)(void *)next;
   ext->id = id;
   ext->flags = flags;
}

/* This function sets the extension for multiple in/out syncobjs. When it is
 * successful, it sets the extension id to DRM_V3D_EXT_ID_MULTI_SYNC.
 * Otherwise, the extension id is left as 0, which callers treat as an
 * out-of-memory error.
 */
static void
set_multisync(struct drm_v3d_multi_sync *ms,
              struct v3dv_submit_sync_info *sync_info,
              struct vk_sync_wait *waits,
              unsigned wait_count,
              struct drm_v3d_extension *next,
              struct v3dv_device *device,
              struct v3dv_job *job,
              enum v3dv_queue_type in_queue_sync,
              enum v3dv_queue_type out_queue_sync,
              enum v3d_queue wait_stage,
              bool signal_syncs)
{
   struct v3dv_queue *queue = &device->queue;
   uint32_t out_sync_count = 0, in_sync_count = 0;
   struct drm_v3d_sem *out_syncs = NULL, *in_syncs = NULL;

   in_syncs = set_in_syncs(queue, job, in_queue_sync,
                           &in_sync_count, waits, wait_count, sync_info);
   if (!in_syncs && in_sync_count)
      goto fail;

   out_syncs = set_out_syncs(queue, job, out_queue_sync,
                             &out_sync_count, sync_info, signal_syncs);

   assert(out_sync_count > 0);

   if (!out_syncs)
      goto fail;

   set_ext(&ms->base, next, DRM_V3D_EXT_ID_MULTI_SYNC, 0);
   ms->wait_stage = wait_stage;
   ms->out_sync_count = out_sync_count;
   ms->out_syncs = (uintptr_t)(void *)out_syncs;
   ms->in_sync_count = in_sync_count;
   ms->in_syncs = (uintptr_t)(void *)in_syncs;

   return;

fail:
   if (in_syncs)
      vk_free(&device->vk.alloc, in_syncs);
   assert(!out_syncs);

   return;
}

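/* Resets timestamp or performance queries. When the kernel exposes a CPU
 * queue the reset is submitted as a CPU job so it stays ordered with the
 * rest of the submission; otherwise we wait for the relevant work to finish
 * and reset the pool from user space.
 */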
static VkResult
handle_reset_query_cpu_job(struct v3dv_queue *queue,
                           struct v3dv_job *job,
                           struct v3dv_submit_sync_info *sync_info,
                           bool signal_syncs)
{
   MESA_TRACE_FUNC();
   struct v3dv_device *device = queue->device;
   struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset;
   assert(info->pool);

   assert(info->pool->query_type != VK_QUERY_TYPE_OCCLUSION);

   if (device->pdevice->caps.cpu_queue) {
      assert(info->first + info->count <= info->pool->query_count);

      struct drm_v3d_submit_cpu submit = {0};
      struct drm_v3d_multi_sync ms = {0};

      uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
      uintptr_t *kperfmon_ids = NULL;

      if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
         submit.bo_handle_count = 1;
         submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle;

         struct drm_v3d_reset_timestamp_query reset = {0};

         set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_TIMESTAMP_QUERY, 0);

         reset.count = info->count;
         reset.offset = info->pool->queries[info->first].timestamp.offset;

         for (uint32_t i = 0; i < info->count; i++) {
            struct v3dv_query *query = &info->pool->queries[info->first + i];
            syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
         }

         reset.syncs = (uintptr_t)(void *)syncs;

         set_multisync(&ms, sync_info, NULL, 0, (void *)&reset, device, job,
                       V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
         if (!ms.base.id) {
            free(syncs);
            return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
         }
      } else {
         assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
         struct drm_v3d_reset_performance_query reset = {0};

         set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_PERFORMANCE_QUERY, 0);

         struct vk_sync_wait waits[info->count];
         unsigned wait_count = 0;
         for (int i = 0; i < info->count; i++) {
            struct v3dv_query *query = &info->pool->queries[info->first + i];
            /* Only wait for a query if we've used it, otherwise we will be
             * waiting forever for the fence to become signaled.
             */
            if (query->maybe_available) {
               waits[wait_count] = (struct vk_sync_wait){
                  .sync = query->perf.last_job_sync
               };
               wait_count++;
            }
         }

         reset.count = info->count;
         reset.nperfmons = info->pool->perfmon.nperfmons;

         kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count);

         for (uint32_t i = 0; i < info->count; i++) {
            struct v3dv_query *query = &info->pool->queries[info->first + i];

            syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
            kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids;
         }

         reset.syncs = (uintptr_t)(void *)syncs;
         reset.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids;

         set_multisync(&ms, sync_info, waits, wait_count, (void *)&reset, device, job,
                       V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
         if (!ms.base.id) {
            free(syncs);
            free(kperfmon_ids);
            return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
         }
      }

      submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
      submit.extensions = (uintptr_t)(void *)&ms;

      /* From the Vulkan spec for vkCmdResetQueryPool:
       *
       *    "This command defines an execution dependency between other query commands
       *     that reference the same query.
       *     ...
       *     The second synchronization scope includes all commands which reference the
       *     queries in queryPool indicated by firstQuery and queryCount that occur later
       *     in submission order."
       *
       * This means we should ensure that any timestamps after a reset don't execute
       * before the reset. However, for timestamp queries in particular we don't have to
       * do anything special because timestamp queries have to wait for all previously
       * submitted work to complete before executing (which we accomplish by using
       * V3DV_BARRIER_ALL on them) and that includes reset jobs submitted to the CPU queue.
       */
      int ret = v3d_ioctl(device->pdevice->render_fd,
                          DRM_IOCTL_V3D_SUBMIT_CPU, &submit);

      free(syncs);
      free(kperfmon_ids);
      multisync_free(device, &ms);

      queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;

      if (ret)
         return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");

      return VK_SUCCESS;
   }

   /* We are about to reset query counters in user-space so we need to make
    * sure that the GPU is not using them.
    */
   if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
      VkResult result = queue_wait_idle(queue, sync_info);
      if (result != VK_SUCCESS)
         return result;

      v3dv_bo_wait(job->device, info->pool->timestamp.bo, OS_TIMEOUT_INFINITE);
   }

   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      struct vk_sync_wait waits[info->count];
      unsigned wait_count = 0;
      for (int i = 0; i < info->count; i++) {
         struct v3dv_query *query = &info->pool->queries[info->first + i];
         /* Only wait for a query if we've used it, otherwise we will be
          * waiting forever for the fence to become signaled.
          */
         if (query->maybe_available) {
            waits[wait_count] = (struct vk_sync_wait){
               .sync = query->perf.last_job_sync
            };
            wait_count++;
         }
      }

      VkResult result = vk_sync_wait_many(&job->device->vk, wait_count, waits,
                                          VK_SYNC_WAIT_COMPLETE, UINT64_MAX);

      if (result != VK_SUCCESS)
         return result;
   }

   v3dv_reset_query_pool_cpu(job->device, info->pool, info->first, info->count);

   return VK_SUCCESS;
}

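/* Exports the last-job syncobjs of the CL and CSD queues as sync files and
 * accumulates them into *fd. handle_end_query_cpu_job imports the result
 * into each performance query's last_job_sync.
 */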
static VkResult
export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job, int *fd)
{
   int err;
   static const enum v3dv_queue_type queues_to_sync[] = {
      V3DV_QUEUE_CL,
      V3DV_QUEUE_CSD,
   };

   for (uint32_t i = 0; i < ARRAY_SIZE(queues_to_sync); i++) {
      enum v3dv_queue_type queue_type = queues_to_sync[i];
      int tmp_fd = -1;

      err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
                                     queue->last_job_syncs.syncs[queue_type],
                                     &tmp_fd);

      if (err) {
         close(*fd);
         return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
                          "sync file export failed: %m");
      }

      err = sync_accumulate("v3dv", fd, tmp_fd);

      if (err) {
         close(tmp_fd);
         close(*fd);
         return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
                          "failed to accumulate sync files: %m");
      }
   }

   return VK_SUCCESS;
}

static VkResult
handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx)
{
   MESA_TRACE_FUNC();
   VkResult result = VK_SUCCESS;

   mtx_lock(&job->device->query_mutex);

   struct v3dv_end_query_info *info = &job->cpu.query_end;
   struct v3dv_queue *queue = &job->device->queue;

   int err = 0;
   int fd = -1;

   assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);

   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      result = export_perfmon_last_job_sync(queue, job, &fd);

      if (result != VK_SUCCESS)
         goto fail;

      assert(fd >= 0);
   }

   for (uint32_t i = 0; i < info->count; i++) {
      assert(info->query + i < info->pool->query_count);
      struct v3dv_query *query = &info->pool->queries[info->query + i];

      if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
         uint32_t syncobj = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
         err = drmSyncobjImportSyncFile(job->device->pdevice->render_fd,
                                        syncobj, fd);

         if (err) {
            result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                               "sync file import failed: %m");
            goto fail;
         }
      }

      query->maybe_available = true;
   }

fail:
   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
      close(fd);

   cnd_broadcast(&job->device->query_ended);
   mtx_unlock(&job->device->query_mutex);

   return result;
}

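/* Copies timestamp or performance query results into a destination buffer.
 * With a kernel CPU queue the copy is submitted as a CPU job; otherwise the
 * destination BO is mapped and the results are written from user space.
 */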
static VkResult
handle_copy_query_results_cpu_job(struct v3dv_queue *queue,
                                  struct v3dv_job *job,
                                  struct v3dv_submit_sync_info *sync_info,
                                  bool signal_syncs)
{
   MESA_TRACE_FUNC();
   struct v3dv_device *device = queue->device;
   struct v3dv_copy_query_results_cpu_job_info *info =
      &job->cpu.query_copy_results;

   assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
          info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP);

   assert(info->dst && info->dst->mem && info->dst->mem->bo);
   struct v3dv_bo *bo = info->dst->mem->bo;

   if (device->pdevice->caps.cpu_queue) {
      struct drm_v3d_submit_cpu submit = {0};
      struct drm_v3d_multi_sync ms = {0};

      uint32_t *offsets = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
      uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
      uint32_t *bo_handles = NULL;
      uintptr_t *kperfmon_ids = NULL;

      if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
         submit.bo_handle_count = 2;

         bo_handles = (uint32_t *)
            malloc(sizeof(uint32_t) * submit.bo_handle_count);

         bo_handles[0] = bo->handle;
         bo_handles[1] = info->pool->timestamp.bo->handle;
         submit.bo_handles = (uintptr_t)(void *)bo_handles;

         struct drm_v3d_copy_timestamp_query copy = {0};

         set_ext(&copy.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_TIMESTAMP_QUERY, 0);

         copy.do_64bit = info->flags & VK_QUERY_RESULT_64_BIT;
         copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT;
         copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
         copy.offset = info->offset + info->dst->mem_offset;
         copy.stride = info->stride;
         copy.count = info->count;

         for (uint32_t i = 0; i < info->count; i++) {
            assert(info->first < info->pool->query_count);
            assert(info->first + info->count <= info->pool->query_count);
            struct v3dv_query *query = &info->pool->queries[info->first + i];

            offsets[i] = query->timestamp.offset;
            syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
         }

         copy.offsets = (uintptr_t)(void *)offsets;
         copy.syncs = (uintptr_t)(void *)syncs;

         set_multisync(&ms, sync_info, NULL, 0, (void *)&copy, device, job,
                       V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
         if (!ms.base.id) {
            free(bo_handles);
            free(offsets);
            free(syncs);
            return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
         }
      } else {
         assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);

         submit.bo_handle_count = 1;
         submit.bo_handles = (uintptr_t)(void *)&bo->handle;

         struct drm_v3d_copy_performance_query copy = {0};

         set_ext(&copy.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_PERFORMANCE_QUERY, 0);

         /* If the queryPool was created with VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR,
          * results for each query are written as an array of the type indicated
          * by VkPerformanceCounterKHR::storage for the counter being queried.
          * For v3dv, VkPerformanceCounterKHR::storage is
          * VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR.
          */
         copy.do_64bit = true;
         copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT;
         copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
         copy.offset = info->offset + info->dst->mem_offset;
         copy.stride = info->stride;
         copy.count = info->count;
         copy.nperfmons = info->pool->perfmon.nperfmons;
         copy.ncounters = info->pool->perfmon.ncounters;

         kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count);

         struct vk_sync_wait waits[info->count];
         unsigned wait_count = 0;

         for (uint32_t i = 0; i < info->count; i++) {
            assert(info->first < info->pool->query_count);
            assert(info->first + info->count <= info->pool->query_count);
            struct v3dv_query *query = &info->pool->queries[info->first + i];

            syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
            kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids;

            if (info->flags & VK_QUERY_RESULT_WAIT_BIT) {
               waits[wait_count] = (struct vk_sync_wait){
                  .sync = query->perf.last_job_sync
               };
               wait_count++;
            }
         }

         copy.syncs = (uintptr_t)(void *)syncs;
         copy.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids;

         set_multisync(&ms, sync_info, waits, wait_count, (void *)&copy, device, job,
                       V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
         if (!ms.base.id) {
            free(kperfmon_ids);
            free(bo_handles);
            free(offsets);
            free(syncs);
            return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
         }
      }

      submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
      submit.extensions = (uintptr_t)(void *)&ms;

      int ret = v3d_ioctl(device->pdevice->render_fd,
                          DRM_IOCTL_V3D_SUBMIT_CPU, &submit);

      free(kperfmon_ids);
      free(bo_handles);
      free(offsets);
      free(syncs);
      multisync_free(device, &ms);

      queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;

      if (ret)
         return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");

      return VK_SUCCESS;
   }

   /* Map the entire dst buffer for the CPU copy if needed */
   assert(!bo->map || bo->map_size == bo->size);
   if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);

   uint8_t *offset = ((uint8_t *) bo->map) +
                     info->offset + info->dst->mem_offset;
   v3dv_get_query_pool_results_cpu(job->device,
                                   info->pool,
                                   info->first,
                                   info->count,
                                   offset,
                                   info->stride,
                                   info->flags);

   return VK_SUCCESS;
}

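/* Writes timestamp query results. Without a kernel CPU queue we wait for the
 * queue to go idle and write the timestamp from user space; with one, the
 * job is serialized against all previously submitted work and handed to the
 * kernel as a CPU job so the timestamp is written at the right point in the
 * stream.
 */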
static VkResult
handle_timestamp_query_cpu_job(struct v3dv_queue *queue,
                               struct v3dv_job *job,
                               struct v3dv_submit_sync_info *sync_info,
                               bool signal_syncs)
{
   MESA_TRACE_FUNC();
   struct v3dv_device *device = queue->device;

   assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY);
   struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp;

   if (!device->pdevice->caps.cpu_queue) {
      /* Wait for completion of all work queued before the timestamp query */
      VkResult result = queue_wait_idle(queue, sync_info);
      if (result != VK_SUCCESS)
         return result;

      mtx_lock(&job->device->query_mutex);

      /* Compute timestamp */
      struct timespec t;
      clock_gettime(CLOCK_MONOTONIC, &t);

      for (uint32_t i = 0; i < info->count; i++) {
         assert(info->query + i < info->pool->query_count);
         struct v3dv_query *query = &info->pool->queries[info->query + i];
         query->maybe_available = true;

         /* Value */
         uint8_t *value_addr =
            ((uint8_t *) info->pool->timestamp.bo->map) + query->timestamp.offset;
         *((uint64_t*)value_addr) = (i == 0) ? t.tv_sec * 1000000000ull + t.tv_nsec : 0ull;

         /* Availability */
         result = vk_sync_signal(&job->device->vk, query->timestamp.sync, 0);
      }

      cnd_broadcast(&job->device->query_ended);
      mtx_unlock(&job->device->query_mutex);

      return result;
   }

   struct drm_v3d_submit_cpu submit = {0};

   submit.bo_handle_count = 1;
   submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle;

   struct drm_v3d_timestamp_query timestamp = {0};

   set_ext(&timestamp.base, NULL, DRM_V3D_EXT_ID_CPU_TIMESTAMP_QUERY, 0);

   timestamp.count = info->count;

   uint32_t *offsets =
      (uint32_t *) malloc(sizeof(uint32_t) * info->count);
   uint32_t *syncs =
      (uint32_t *) malloc(sizeof(uint32_t) * info->count);

   for (uint32_t i = 0; i < info->count; i++) {
      assert(info->query + i < info->pool->query_count);
      struct v3dv_query *query = &info->pool->queries[info->query + i];
      query->maybe_available = true;

      offsets[i] = query->timestamp.offset;
      syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
   }

   timestamp.offsets = (uintptr_t)(void *)offsets;
   timestamp.syncs = (uintptr_t)(void *)syncs;

   struct drm_v3d_multi_sync ms = {0};

   /* The CPU job should be serialized so it only executes after all
    * previously submitted work has completed.
    */
   job->serialize = V3DV_BARRIER_ALL;

   set_multisync(&ms, sync_info, NULL, 0, (void *)&timestamp, device, job,
                 V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
   if (!ms.base.id) {
      free(offsets);
      free(syncs);
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
   submit.extensions = (uintptr_t)(void *)&ms;

   int ret = v3d_ioctl(device->pdevice->render_fd,
                       DRM_IOCTL_V3D_SUBMIT_CPU, &submit);

   free(offsets);
   free(syncs);
   multisync_free(device, &ms);

   queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;

   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");

   return VK_SUCCESS;
}

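/* Handles an indirect compute dispatch. Without a kernel CPU queue we read
 * the dispatch parameters from the indirect buffer on the CPU and patch the
 * CSD job if they changed; with one, we submit a CPU job carrying the
 * DRM_V3D_EXT_ID_CPU_INDIRECT_CSD extension so the kernel reads the
 * parameters and launches the compute job itself.
 */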
static VkResult
handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
                            struct v3dv_job *job,
                            struct v3dv_submit_sync_info *sync_info,
                            bool signal_syncs)
{
   MESA_TRACE_FUNC();
   struct v3dv_device *device = queue->device;

   assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
   struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect;
   assert(info->csd_job);

   assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
   struct v3dv_bo *bo = info->buffer->mem->bo;

   if (!device->pdevice->caps.cpu_queue) {
      /* Make sure the GPU is no longer using the indirect buffer */
      v3dv_bo_wait(queue->device, bo, OS_TIMEOUT_INFINITE);

      /* Map the indirect buffer and read the dispatch parameters */
      if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
         return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
      assert(bo->map);

      const uint32_t offset = info->buffer->mem_offset + info->offset;
      const uint32_t *group_counts = (uint32_t *) (bo->map + offset);
      if (group_counts[0] == 0 || group_counts[1] == 0 || group_counts[2] == 0)
         return VK_SUCCESS;

      if (memcmp(group_counts, info->csd_job->csd.wg_count,
                 sizeof(info->csd_job->csd.wg_count)) != 0) {
         v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts);
      }

      return VK_SUCCESS;
   }

   struct v3dv_job *csd_job = info->csd_job;

   struct drm_v3d_submit_cpu submit = {0};

   submit.bo_handle_count = 1;
   submit.bo_handles = (uintptr_t)(void *)&bo->handle;

   csd_job->csd.submit.bo_handle_count = csd_job->bo_count;
   uint32_t *bo_handles = (uint32_t *) malloc(sizeof(uint32_t) * csd_job->bo_count);
   uint32_t bo_idx = 0;
   set_foreach (csd_job->bos, entry) {
      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
      bo_handles[bo_idx++] = bo->handle;
   }
   csd_job->csd.submit.bo_handles = (uintptr_t)(void *)bo_handles;

   struct drm_v3d_indirect_csd indirect = {0};

   set_ext(&indirect.base, NULL, DRM_V3D_EXT_ID_CPU_INDIRECT_CSD, 0);

   indirect.submit = csd_job->csd.submit;
   indirect.offset = info->buffer->mem_offset + info->offset;
   indirect.wg_size = info->wg_size;

   for (int i = 0; i < 3; i++) {
      if (info->wg_uniform_offsets[i]) {
         assert(info->wg_uniform_offsets[i] >= (uint32_t *) csd_job->indirect.base);
         indirect.wg_uniform_offsets[i] = info->wg_uniform_offsets[i] - (uint32_t *) csd_job->indirect.base;
      } else {
         indirect.wg_uniform_offsets[i] = 0xffffffff; /* No rewrite */
      }
   }

   indirect.indirect = csd_job->indirect.bo->handle;

   struct drm_v3d_multi_sync ms = {0};

   /* We need to configure the semaphores of this job with the indirect
    * CSD job, as the CPU job must obey the CSD job's synchronization
    * demands, such as barriers.
    */
   set_multisync(&ms, sync_info, NULL, 0, (void *)&indirect, device, csd_job,
                 V3DV_QUEUE_CPU, V3DV_QUEUE_CSD, V3D_CPU, signal_syncs);
   if (!ms.base.id)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
   submit.extensions = (uintptr_t)(void *)&ms;

   int ret = v3d_ioctl(device->pdevice->render_fd,
                       DRM_IOCTL_V3D_SUBMIT_CPU, &submit);

   free(bo_handles);
   multisync_free(device, &ms);

   queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
   queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;

   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");

   return VK_SUCCESS;
}

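/* Submits a binning/render (CL) job. The bulk of the work here is deciding
 * whether the kernel must wait at the binning or at the render stage and
 * building the BO and multisync lists for DRM_IOCTL_V3D_SUBMIT_CL.
 */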
static VkResult
handle_cl_job(struct v3dv_queue *queue,
              struct v3dv_job *job,
              uint32_t counter_pass_idx,
              struct v3dv_submit_sync_info *sync_info,
              bool signal_syncs)
{
   MESA_TRACE_FUNC();
   struct v3dv_device *device = queue->device;

   struct drm_v3d_submit_cl submit = { 0 };

   /* Sanity check: we should only flag a bcl sync on a job that needs to be
    * serialized.
    */
   assert(job->serialize || !job->needs_bcl_sync);

   /* We expect to have just one RCL per job, which should fit in just one BO.
    * Our BCL, however, could chain multiple BOs together.
    */
   assert(list_length(&job->rcl.bo_list) == 1);
   assert(list_length(&job->bcl.bo_list) >= 1);
   struct v3dv_bo *bcl_first_bo =
      list_first_entry(&job->bcl.bo_list, struct v3dv_bo, list_link);
   submit.bcl_start = bcl_first_bo->offset;
   submit.bcl_end = job->suspending ? job->suspended_bcl_end :
                                      job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
   submit.rcl_start = job->rcl.bo->offset;
   submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);

   submit.qma = job->tile_alloc->offset;
   submit.qms = job->tile_alloc->size;
   submit.qts = job->tile_state->offset;

   submit.flags = 0;
   if (job->tmu_dirty_rcl)
      submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;

   /* If the job uses VK_KHR_buffer_device_address we need to ensure all
    * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
    * are included.
    */
   if (job->uses_buffer_device_address) {
      util_dynarray_foreach(&queue->device->device_address_bo_list,
                            struct v3dv_bo *, bo) {
         v3dv_job_add_bo(job, *bo);
      }
   }

   submit.bo_handle_count = job->bo_count;
   uint32_t *bo_handles =
      (uint32_t *) malloc(sizeof(uint32_t) * submit.bo_handle_count);
   uint32_t bo_idx = 0;
   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
      bo_handles[bo_idx++] = bo->handle;
   }
   assert(bo_idx == submit.bo_handle_count);
   submit.bo_handles = (uintptr_t)(void *)bo_handles;

   submit.perfmon_id = job->perf ?
      job->perf->kperfmon_ids[counter_pass_idx] : 0;
   const bool needs_perf_sync = queue->last_perfmon_id != submit.perfmon_id;
   queue->last_perfmon_id = submit.perfmon_id;

   /* We need a binning sync if we are the first CL job waiting on a semaphore
    * with a wait stage that involves the geometry pipeline, or if the job
    * comes after a pipeline barrier that involves geometry stages
    * (needs_bcl_sync), or when performance queries are in use.
    *
    * We need a render sync if the job doesn't need a binning sync but has
    * still been flagged for serialization. It should be noted that RCL jobs
    * don't start until the previous RCL job has finished, so we don't really
    * need to add a fence for those; however, we might need to wait on a CSD
    * or TFU job, which are not automatically serialized with CL jobs.
    */
   bool needs_bcl_sync = job->needs_bcl_sync || needs_perf_sync;
   if (queue->last_job_syncs.first[V3DV_QUEUE_CL]) {
      for (int i = 0; !needs_bcl_sync && i < sync_info->wait_count; i++) {
         needs_bcl_sync = sync_info->waits[i].stage_mask &
             (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
              VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
              VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
              VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
              VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
              VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
              VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
              VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
              VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
              VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
              VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
              VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT);
      }
   }

   bool needs_rcl_sync = job->serialize && !needs_bcl_sync;

   /* Replace single semaphore settings whenever our kernel-driver supports
    * the multiple semaphores extension.
    */
   struct drm_v3d_multi_sync ms = { 0 };
   enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN;
   set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
                 V3DV_QUEUE_CL, V3DV_QUEUE_CL, wait_stage, signal_syncs);
   if (!ms.base.id) {
      free(bo_handles);
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
   submit.extensions = (uintptr_t)(void *)&ms;

   /* We are using multisync so disable legacy single-sync interface */
   submit.in_sync_rcl = 0;
   submit.in_sync_bcl = 0;
   submit.out_sync = 0;

   v3dv_clif_dump(device, job, &submit);
   int ret = v3d_ioctl(device->pdevice->render_fd,
                       DRM_IOCTL_V3D_SUBMIT_CL, &submit);

   static bool warned = false;
   if (ret && !warned) {
      mesa_loge("Draw call returned %s. Expect corruption.\n",
                strerror(errno));
      warned = true;
   }

   free(bo_handles);
   multisync_free(device, &ms);

   queue->last_job_syncs.first[V3DV_QUEUE_CL] = false;

   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CL failed: %m");

   return VK_SUCCESS;
}

static VkResult
handle_tfu_job(struct v3dv_queue *queue,
               struct v3dv_job *job,
               struct v3dv_submit_sync_info *sync_info,
               bool signal_syncs)
{
   MESA_TRACE_FUNC();
   assert(!V3D_DBG(DISABLE_TFU));

   struct v3dv_device *device = queue->device;

   /* Replace single semaphore settings whenever our kernel-driver supports
    * the multiple semaphores extension.
    */
   struct drm_v3d_multi_sync ms = { 0 };
   set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
                 V3DV_QUEUE_TFU, V3DV_QUEUE_TFU, V3D_TFU, signal_syncs);
   if (!ms.base.id)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   job->tfu.flags |= DRM_V3D_SUBMIT_EXTENSION;
   job->tfu.extensions = (uintptr_t)(void *)&ms;

   /* We are using multisync so disable legacy single-sync interface */
   job->tfu.in_sync = 0;
   job->tfu.out_sync = 0;

   int ret = v3d_ioctl(device->pdevice->render_fd,
                       DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);

   multisync_free(device, &ms);
   queue->last_job_syncs.first[V3DV_QUEUE_TFU] = false;

   if (ret != 0)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_TFU failed: %m");

   return VK_SUCCESS;
}

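/* Submits a compute (CSD) job. Like handle_cl_job, this gathers the job's
 * BOs (including buffer-device-address BOs when used), attaches the
 * multisync extension and calls DRM_IOCTL_V3D_SUBMIT_CSD.
 */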
static VkResult
handle_csd_job(struct v3dv_queue *queue,
               struct v3dv_job *job,
               uint32_t counter_pass_idx,
               struct v3dv_submit_sync_info *sync_info,
               bool signal_syncs)
{
   MESA_TRACE_FUNC();
   struct v3dv_device *device = queue->device;

   struct drm_v3d_submit_csd *submit = &job->csd.submit;

   /* If the job uses VK_KHR_buffer_device_address we need to ensure all
    * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
    * are included.
    */
   if (job->uses_buffer_device_address) {
      util_dynarray_foreach(&queue->device->device_address_bo_list,
                            struct v3dv_bo *, bo) {
         v3dv_job_add_bo(job, *bo);
      }
   }

   submit->bo_handle_count = job->bo_count;
   uint32_t *bo_handles =
      (uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2));
   uint32_t bo_idx = 0;
   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
      bo_handles[bo_idx++] = bo->handle;
   }
   assert(bo_idx == submit->bo_handle_count);
   submit->bo_handles = (uintptr_t)(void *)bo_handles;

   /* Replace single semaphore settings whenever our kernel-driver supports
    * the multiple semaphores extension.
    */
   struct drm_v3d_multi_sync ms = { 0 };
   set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
                 V3DV_QUEUE_CSD, V3DV_QUEUE_CSD, V3D_CSD, signal_syncs);
   if (!ms.base.id)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   submit->flags |= DRM_V3D_SUBMIT_EXTENSION;
   submit->extensions = (uintptr_t)(void *)&ms;

   /* We are using multisync so disable legacy single-sync interface */
   submit->in_sync = 0;
   submit->out_sync = 0;

   submit->perfmon_id = job->perf ?
      job->perf->kperfmon_ids[counter_pass_idx] : 0;
   queue->last_perfmon_id = submit->perfmon_id;

   int ret = v3d_ioctl(device->pdevice->render_fd,
                       DRM_IOCTL_V3D_SUBMIT_CSD, submit);

   static bool warned = false;
   if (ret && !warned) {
      mesa_loge("Compute dispatch returned %s. Expect corruption.\n",
                strerror(errno));
      warned = true;
   }

   free(bo_handles);

   multisync_free(device, &ms);
   queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;

   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CSD failed: %m");

   return VK_SUCCESS;
}

static VkResult
queue_handle_job(struct v3dv_queue *queue,
                 struct v3dv_job *job,
                 uint32_t counter_pass_idx,
                 struct v3dv_submit_sync_info *sync_info,
                 bool signal_syncs)
{
   switch (job->type) {
   case V3DV_JOB_TYPE_GPU_CL:
      return handle_cl_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_GPU_TFU:
      return handle_tfu_job(queue, job, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_GPU_CSD:
      return handle_csd_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
      return handle_reset_query_cpu_job(queue, job, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_CPU_END_QUERY:
      return handle_end_query_cpu_job(job, counter_pass_idx);
   case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
      return handle_copy_query_results_cpu_job(queue, job, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
      return handle_csd_indirect_cpu_job(queue, job, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY:
      return handle_timestamp_query_cpu_job(queue, job, sync_info, signal_syncs);
   default:
      unreachable("Unhandled job type");
   }
}

static VkResult
queue_create_noop_job(struct v3dv_queue *queue)
{
   struct v3dv_device *device = queue->device;
   queue->noop_job = vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_job), 8,
                               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!queue->noop_job)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   v3dv_job_init(queue->noop_job, V3DV_JOB_TYPE_GPU_CL, device, NULL, -1);

   v3d_X((&device->devinfo), job_emit_noop)(queue->noop_job);

   /* We use no-op jobs to signal semaphores/fences. These jobs need to be
    * serialized across all hw queues to comply with Vulkan's signal operation
    * order requirements, which basically require that signal operations occur
    * in submission order.
    */
   queue->noop_job->serialize = V3DV_BARRIER_ALL;

   return VK_SUCCESS;
}

static VkResult
queue_submit_noop_job(struct v3dv_queue *queue,
                      uint32_t counter_pass_idx,
                      struct v3dv_submit_sync_info *sync_info,
                      bool signal_syncs)
{
   if (!queue->noop_job) {
      VkResult result = queue_create_noop_job(queue);
      if (result != VK_SUCCESS)
         return result;
   }

   assert(queue->noop_job);
   return queue_handle_job(queue, queue->noop_job, counter_pass_idx,
                           sync_info, signal_syncs);
}

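/* Main queue submit entry point. Walks every job in the submitted command
 * buffers, stitching chains of suspending/resuming CL jobs into a single
 * submission, consumes any pending barrier at the end of each command buffer
 * with a no-op job, and finally submits a serialized no-op job so the signal
 * semaphores only fire once work on all queues has completed.
 */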
VkResult
v3dv_queue_driver_submit(struct vk_queue *vk_queue,
                         struct vk_queue_submit *submit)
{
   MESA_TRACE_FUNC();
   struct v3dv_queue *queue = container_of(vk_queue, struct v3dv_queue, vk);
   VkResult result;

   struct v3dv_submit_sync_info sync_info = {
      .wait_count = submit->wait_count,
      .waits = submit->waits,
      .signal_count = submit->signal_count,
      .signals = submit->signals,
   };

   for (int i = 0; i < V3DV_QUEUE_COUNT; i++)
      queue->last_job_syncs.first[i] = true;

   struct v3dv_job *first_suspend_job = NULL;
   struct v3dv_job *current_suspend_job = NULL;
   for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
      struct v3dv_cmd_buffer *cmd_buffer =
         container_of(submit->command_buffers[i], struct v3dv_cmd_buffer, vk);
      list_for_each_entry_safe(struct v3dv_job, job,
                               &cmd_buffer->jobs, list_link) {
         if (job->suspending) {
            job = v3d_X((&job->device->devinfo),
                         cmd_buffer_prepare_suspend_job_for_submit)(job);
            if (!job)
               return VK_ERROR_OUT_OF_DEVICE_MEMORY;
         }

         if (job->suspending && !job->resuming) {
            assert(!first_suspend_job);
            assert(!current_suspend_job);
            first_suspend_job = job;
         }

         if (job->resuming) {
            assert(first_suspend_job);
            assert(current_suspend_job);
            v3d_X((&job->device->devinfo), job_patch_resume_address)(first_suspend_job,
                                                                     current_suspend_job,
                                                                     job);
            current_suspend_job = NULL;
         }

         if (job->suspending) {
            current_suspend_job = job;
         } else {
            assert(!current_suspend_job);
            struct v3dv_job *submit_job = first_suspend_job ?
                                          first_suspend_job : job;
            result =
               queue_handle_job(queue, submit_job, submit->perf_pass_index,
                                &sync_info, false);

            if (result != VK_SUCCESS)
               return result;

            first_suspend_job = NULL;
         }
      }

      /* If the command buffer ends with a barrier we need to consume it now.
       *
       * FIXME: this will drain all hw queues. Instead, we could use the pending
       * barrier state to limit the queues we serialize against.
       */
      if (cmd_buffer->state.barrier.dst_mask) {
         result = queue_submit_noop_job(queue, submit->perf_pass_index,
                                        &sync_info, false);
         if (result != VK_SUCCESS)
            return result;
      }
   }

   assert(!first_suspend_job);
   assert(!current_suspend_job);

   /* Handle signaling now */
   if (submit->signal_count > 0) {
      /* Finish by submitting a no-op job that synchronizes across all queues.
       * This will ensure that the signal semaphores don't get triggered until
       * all work on any queue completes. See Vulkan's signal operation order
       * requirements.
       */
      return queue_submit_noop_job(queue, submit->perf_pass_index,
                                   &sync_info, true);
   }

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_QueueBindSparse(VkQueue _queue,
                     uint32_t bindInfoCount,
                     const VkBindSparseInfo *pBindInfo,
                     VkFence fence)
{
   V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
   return vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT);
}