1 /*
2  * Copyright © 2019 Raspberry Pi Ltd
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "v3dv_private.h"
25 #include "drm-uapi/v3d_drm.h"
26 
27 #include "broadcom/clif/clif_dump.h"
28 #include "util/libsync.h"
29 #include "util/os_time.h"
30 #include "vk_drm_syncobj.h"
31 
32 #include <errno.h>
33 #include <time.h>
34 
35 static void
36 v3dv_clif_dump(struct v3dv_device *device,
37                struct v3dv_job *job,
38                struct drm_v3d_submit_cl *submit)
39 {
40    if (!(V3D_DBG(CL) ||
41          V3D_DBG(CL_NO_BIN) ||
42          V3D_DBG(CLIF)))
43       return;
44 
45    struct clif_dump *clif = clif_dump_init(&device->devinfo,
46                                            stderr,
47                                            V3D_DBG(CL) ||
48                                            V3D_DBG(CL_NO_BIN),
49                                            V3D_DBG(CL_NO_BIN));
50 
51    set_foreach(job->bos, entry) {
52       struct v3dv_bo *bo = (void *)entry->key;
53       char *name = ralloc_asprintf(NULL, "%s_0x%x",
54                                    bo->name, bo->offset);
55 
56       bool ok = v3dv_bo_map(device, bo, bo->size);
57       if (!ok) {
58          fprintf(stderr, "failed to map BO for clif_dump.\n");
59          ralloc_free(name);
60          goto free_clif;
61       }
62       clif_dump_add_bo(clif, name, bo->offset, bo->size, bo->map);
63 
64       ralloc_free(name);
65    }
66 
67    clif_dump(clif, submit);
68 
69  free_clif:
70    clif_dump_destroy(clif);
71 }
72 
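/* Waits for completion of all work previously submitted from this queue.
 * With multisync this waits on every per-queue-type syncobj (plus the wait
 * semaphores if no job has been submitted yet in this batch); without it,
 * it waits on the single V3DV_QUEUE_ANY syncobj.
 */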
73 static VkResult
74 queue_wait_idle(struct v3dv_queue *queue,
75                 struct v3dv_submit_sync_info *sync_info)
76 {
77    if (queue->device->pdevice->caps.multisync) {
78       int ret = drmSyncobjWait(queue->device->pdevice->render_fd,
79                                queue->last_job_syncs.syncs, 4,
80                                INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
81                                NULL);
82       if (ret) {
83          return vk_errorf(queue, VK_ERROR_DEVICE_LOST,
84                           "syncobj wait failed: %m");
85       }
86 
87       bool first = true;
88       for (int i = 0; i < 4; i++) {
89          if (!queue->last_job_syncs.first[i])
90             first = false;
91       }
92 
93       /* If we're not the first job, that means we're waiting on some
94        * per-queue-type syncobj which transitively waited on the semaphores
95        * so we can skip the semaphore wait.
96        */
97       if (first) {
98          VkResult result = vk_sync_wait_many(&queue->device->vk,
99                                              sync_info->wait_count,
100                                              sync_info->waits,
101                                              VK_SYNC_WAIT_COMPLETE,
102                                              UINT64_MAX);
103          if (result != VK_SUCCESS)
104             return result;
105       }
106    } else {
107       /* Without multisync, all the semaphores are baked into the one syncobj
108        * at the start of each submit so we only need to wait on the one.
109        */
110       int ret = drmSyncobjWait(queue->device->pdevice->render_fd,
111                                &queue->last_job_syncs.syncs[V3DV_QUEUE_ANY], 1,
112                                INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
113                                NULL);
114       if (ret) {
115          return vk_errorf(queue, VK_ERROR_DEVICE_LOST,
116                           "syncobj wait failed: %m");
117       }
118    }
119 
120    for (int i = 0; i < 4; i++)
121       queue->last_job_syncs.first[i] = false;
122 
123    return VK_SUCCESS;
124 }
125 
126 static void
127 multisync_free(struct v3dv_device *device,
128                struct drm_v3d_multi_sync *ms)
129 {
130    vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->out_syncs);
131    vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->in_syncs);
132 }
133 
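/* Builds the array of input syncobjs a kernel job must wait on: the submit's
 * wait semaphores (only for the first job of this queue type in the batch),
 * any extra waits provided by the caller, and the last-job syncobjs of the
 * queues this job serializes against according to job->serialize.
 */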
134 static struct drm_v3d_sem *
135 set_in_syncs(struct v3dv_queue *queue,
136              struct v3dv_job *job,
137              enum v3dv_queue_type queue_sync,
138              uint32_t *count,
139              struct vk_sync_wait *waits,
140              unsigned wait_count,
141              struct v3dv_submit_sync_info *sync_info)
142 {
143    struct v3dv_device *device = queue->device;
144    uint32_t n_syncs = 0;
145 
146    /* If this is the first job submitted to a given GPU queue in this cmd buf
147     * batch, it has to wait on wait semaphores (if any) before running.
148     */
149    if (queue->last_job_syncs.first[queue_sync])
150       n_syncs = sync_info->wait_count;
151 
152    /* If the serialize flag is set, the job needs to be serialized against
153     * the corresponding queues. Notice that we may implement transfer
154     * operations as either CL or TFU jobs.
155     *
156     * FIXME: maybe we could track more precisely if the source of a transfer
157     * barrier is a CL and/or a TFU job.
158     */
159    bool sync_csd  = job->serialize & V3DV_BARRIER_COMPUTE_BIT;
160    bool sync_tfu  = job->serialize & V3DV_BARRIER_TRANSFER_BIT;
161    bool sync_cl   = job->serialize & (V3DV_BARRIER_GRAPHICS_BIT |
162                                       V3DV_BARRIER_TRANSFER_BIT);
163    bool sync_cpu  = job->serialize & V3DV_BARRIER_CPU_BIT;
164 
165    *count = n_syncs;
166    if (sync_cl)
167       (*count)++;
168    if (sync_tfu)
169       (*count)++;
170    if (sync_csd)
171       (*count)++;
172    if (sync_cpu)
173       (*count)++;
174 
175    *count += wait_count;
176 
177    if (!*count)
178       return NULL;
179 
180    struct drm_v3d_sem *syncs =
181       vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
182                 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
183 
184    if (!syncs)
185       return NULL;
186 
187    for (int i = 0; i < n_syncs; i++) {
188       syncs[i].handle =
189          vk_sync_as_drm_syncobj(sync_info->waits[i].sync)->syncobj;
190    }
191 
192    for (int i = 0; i < wait_count; i++) {
193       syncs[n_syncs++].handle =
194          vk_sync_as_drm_syncobj(waits[i].sync)->syncobj;
195    }
196 
197    if (sync_cl)
198       syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CL];
199 
200    if (sync_csd)
201       syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CSD];
202 
203    if (sync_tfu)
204       syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_TFU];
205 
206    if (sync_cpu)
207       syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CPU];
208 
209    assert(n_syncs == *count);
210    return syncs;
211 }
212 
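/* Builds the array of syncobjs the kernel signals when the job completes:
 * the submit's signal semaphores (when signal_syncs is set) plus the
 * per-queue-type last-job syncobj, which is always signaled.
 */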
213 static struct drm_v3d_sem *
214 set_out_syncs(struct v3dv_queue *queue,
215               struct v3dv_job *job,
216               enum v3dv_queue_type queue_sync,
217               uint32_t *count,
218               struct v3dv_submit_sync_info *sync_info,
219               bool signal_syncs)
220 {
221    struct v3dv_device *device = queue->device;
222 
223    uint32_t n_vk_syncs = signal_syncs ? sync_info->signal_count : 0;
224 
225    /* We always signal the syncobj from `queue->last_job_syncs` related to
226     * this v3dv_queue_type to track the last job submitted to this queue.
227     */
228    (*count) = n_vk_syncs + 1;
229 
230    struct drm_v3d_sem *syncs =
231       vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
232                 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
233 
234    if (!syncs)
235       return NULL;
236 
237    if (n_vk_syncs) {
238       for (unsigned i = 0; i < n_vk_syncs; i++) {
239          syncs[i].handle =
240             vk_sync_as_drm_syncobj(sync_info->signals[i].sync)->syncobj;
241       }
242    }
243 
244    syncs[n_vk_syncs].handle = queue->last_job_syncs.syncs[queue_sync];
245 
246    return syncs;
247 }
248 
249 static void
250 set_ext(struct drm_v3d_extension *ext,
251 	struct drm_v3d_extension *next,
252 	uint32_t id,
253 	uintptr_t flags)
254 {
255    ext->next = (uintptr_t)(void *)next;
256    ext->id = id;
257    ext->flags = flags;
258 }
259 
260 /* This function sets the extension for multiple in/out syncobjs. When it is
261  * successful, it sets the extension id to DRM_V3D_EXT_ID_MULTI_SYNC.
262  * Otherwise, the extension id is 0, which means an out-of-memory error.
263  */
264 static void
265 set_multisync(struct drm_v3d_multi_sync *ms,
266               struct v3dv_submit_sync_info *sync_info,
267               struct vk_sync_wait *waits,
268               unsigned wait_count,
269               struct drm_v3d_extension *next,
270               struct v3dv_device *device,
271               struct v3dv_job *job,
272               enum v3dv_queue_type in_queue_sync,
273               enum v3dv_queue_type out_queue_sync,
274               enum v3d_queue wait_stage,
275               bool signal_syncs)
276 {
277    struct v3dv_queue *queue = &device->queue;
278    uint32_t out_sync_count = 0, in_sync_count = 0;
279    struct drm_v3d_sem *out_syncs = NULL, *in_syncs = NULL;
280 
281    in_syncs = set_in_syncs(queue, job, in_queue_sync,
282                            &in_sync_count, waits, wait_count, sync_info);
283    if (!in_syncs && in_sync_count)
284       goto fail;
285 
286    out_syncs = set_out_syncs(queue, job, out_queue_sync,
287                              &out_sync_count, sync_info, signal_syncs);
288 
289    assert(out_sync_count > 0);
290 
291    if (!out_syncs)
292       goto fail;
293 
294    set_ext(&ms->base, next, DRM_V3D_EXT_ID_MULTI_SYNC, 0);
295    ms->wait_stage = wait_stage;
296    ms->out_sync_count = out_sync_count;
297    ms->out_syncs = (uintptr_t)(void *)out_syncs;
298    ms->in_sync_count = in_sync_count;
299    ms->in_syncs = (uintptr_t)(void *)in_syncs;
300 
301    return;
302 
303 fail:
304    if (in_syncs)
305       vk_free(&device->vk.alloc, in_syncs);
306    assert(!out_syncs);
307 
308    return;
309 }
310 
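/* Implements vkCmdResetQueryPool for timestamp and performance query pools.
 * With a kernel CPU queue the reset is submitted as a CPU job; otherwise we
 * wait for the relevant work to finish and reset the pool in user space.
 */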
311 static VkResult
312 handle_reset_query_cpu_job(struct v3dv_queue *queue,
313                            struct v3dv_job *job,
314                            struct v3dv_submit_sync_info *sync_info,
315                            bool signal_syncs)
316 {
317    struct v3dv_device *device = queue->device;
318    struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset;
319    assert(info->pool);
320 
321    assert(info->pool->query_type != VK_QUERY_TYPE_OCCLUSION);
322 
323    if (device->pdevice->caps.cpu_queue) {
324       assert(info->first + info->count <= info->pool->query_count);
325 
326       struct drm_v3d_submit_cpu submit = {0};
327       struct drm_v3d_multi_sync ms = {0};
328 
329       uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
330       uintptr_t *kperfmon_ids = NULL;
331 
332       if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
333          submit.bo_handle_count = 1;
334          submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle;
335 
336          struct drm_v3d_reset_timestamp_query reset = {0};
337 
338          set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_TIMESTAMP_QUERY, 0);
339 
340          reset.count = info->count;
341          reset.offset = info->pool->queries[info->first].timestamp.offset;
342 
343          for (uint32_t i = 0; i < info->count; i++) {
344             struct v3dv_query *query = &info->pool->queries[info->first + i];
345             syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
346          }
347 
348          reset.syncs = (uintptr_t)(void *)syncs;
349 
350          set_multisync(&ms, sync_info, NULL, 0, (void *)&reset, device, job,
351                        V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
352          if (!ms.base.id)
353             return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
354       } else {
355          assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
356          struct drm_v3d_reset_performance_query reset = {0};
357 
358          set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_PERFORMANCE_QUERY, 0);
359 
360          struct vk_sync_wait waits[info->count];
361          unsigned wait_count = 0;
362          for (int i = 0; i < info->count; i++) {
363             struct v3dv_query *query = &info->pool->queries[info->first + i];
364             /* Only wait for a query if we've used it, otherwise we will be
365              * waiting forever for the fence to become signaled.
366              */
367             if (query->maybe_available) {
368                waits[wait_count] = (struct vk_sync_wait){
369                   .sync = query->perf.last_job_sync
370                };
371                wait_count++;
372             }
373          }
374 
375          reset.count = info->count;
376          reset.nperfmons = info->pool->perfmon.nperfmons;
377 
378          kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count);
379 
380          for (uint32_t i = 0; i < info->count; i++) {
381             struct v3dv_query *query = &info->pool->queries[info->first + i];
382 
383             syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
384             kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids;
385          }
386 
387          reset.syncs = (uintptr_t)(void *)syncs;
388          reset.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids;
389 
390          set_multisync(&ms, sync_info, waits, wait_count, (void *)&reset, device, job,
391                        V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
392          if (!ms.base.id)
393             return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
394       }
395 
396       submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
397       submit.extensions = (uintptr_t)(void *)&ms;
398 
399       /* From the Vulkan spec for vkCmdResetQueryPool:
400        *
401        *    "This command defines an execution dependency between other query commands
402        *     that reference the same query.
403        *     ...
404        *     The second synchronization scope includes all commands which reference the
405        *     queries in queryPool indicated by firstQuery and queryCount that occur later
406        *     in submission order."
407        *
408        * This means we should ensure that any timestamps recorded after a reset don't
409        * execute before the reset. However, for timestamp queries in particular we don't
410        * have to do anything special, because they wait for all previously submitted work
411        * to complete before executing (which we accomplish by using V3DV_BARRIER_ALL on
412        * them), and that includes reset jobs submitted to the CPU queue.
413        */
414       int ret = v3dv_ioctl(device->pdevice->render_fd,
415                            DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
416 
417       free(syncs);
418       free(kperfmon_ids);
419       multisync_free(device, &ms);
420 
421       queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
422 
423       if (ret)
424          return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
425 
426       return VK_SUCCESS;
427    }
428 
429    /* We are about to reset query counters in user-space so we need to make
430     * sure that the GPU is not using them.
431     */
432    if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
433       VkResult result = queue_wait_idle(queue, sync_info);
434       if (result != VK_SUCCESS)
435          return result;
436 
437       v3dv_bo_wait(job->device, info->pool->timestamp.bo, OS_TIMEOUT_INFINITE);
438    }
439 
440    if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
441       struct vk_sync_wait waits[info->count];
442       unsigned wait_count = 0;
443       for (int i = 0; i < info->count; i++) {
444          struct v3dv_query *query = &info->pool->queries[info->first + i];
445          /* Only wait for a query if we've used it, otherwise we will be
446           * waiting forever for the fence to become signaled.
447           */
448          if (query->maybe_available) {
449             waits[wait_count] = (struct vk_sync_wait){
450                .sync = query->perf.last_job_sync
451             };
452             wait_count++;
453          }
454       }
455 
456       VkResult result = vk_sync_wait_many(&job->device->vk, wait_count, waits,
457                                           VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
458 
459       if (result != VK_SUCCESS)
460          return result;
461    }
462 
463    v3dv_reset_query_pool_cpu(job->device, info->pool, info->first, info->count);
464 
465    return VK_SUCCESS;
466 }
467 
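/* Exports the last-job syncobj state as a sync file fd: with multisync the
 * CL and CSD syncobjs are accumulated into a single fd, otherwise the
 * V3DV_QUEUE_ANY syncobj is exported directly.
 */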
468 static VkResult
469 export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job, int *fd)
470 {
471    int err;
472    if (job->device->pdevice->caps.multisync) {
473       static const enum v3dv_queue_type queues_to_sync[] = {
474          V3DV_QUEUE_CL,
475          V3DV_QUEUE_CSD,
476       };
477 
478       for (uint32_t i = 0; i < ARRAY_SIZE(queues_to_sync); i++) {
479          enum v3dv_queue_type queue_type = queues_to_sync[i];
480          int tmp_fd = -1;
481 
482          err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
483                                         queue->last_job_syncs.syncs[queue_type],
484                                         &tmp_fd);
485 
486          if (err) {
487             close(*fd);
488             return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
489                              "sync file export failed: %m");
490          }
491 
492          err = sync_accumulate("v3dv", fd, tmp_fd);
493 
494          if (err) {
495             close(tmp_fd);
496             close(*fd);
497             return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
498                              "failed to accumulate sync files: %m");
499          }
500       }
501    } else {
502       err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
503                                      queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
504                                      fd);
505 
506       if (err) {
507          return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
508                           "sync file export failed: %m");
509       }
510    }
511    return VK_SUCCESS;
512 }
513 
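/* Marks the affected queries as (maybe) available and, for performance
 * queries, imports the exported last-job sync file into each query's
 * syncobj so result collection knows what to wait on.
 */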
514 static VkResult
515 handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx)
516 {
517    VkResult result = VK_SUCCESS;
518 
519    mtx_lock(&job->device->query_mutex);
520 
521    struct v3dv_end_query_info *info = &job->cpu.query_end;
522    struct v3dv_queue *queue = &job->device->queue;
523 
524    int err = 0;
525    int fd = -1;
526 
527    assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
528 
529    if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
530       result = export_perfmon_last_job_sync(queue, job, &fd);
531 
532       if (result != VK_SUCCESS)
533          goto fail;
534 
535       assert(fd >= 0);
536    }
537 
538    for (uint32_t i = 0; i < info->count; i++) {
539       assert(info->query + i < info->pool->query_count);
540       struct v3dv_query *query = &info->pool->queries[info->query + i];
541 
542       if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
543          uint32_t syncobj = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
544          err = drmSyncobjImportSyncFile(job->device->pdevice->render_fd,
545                                         syncobj, fd);
546 
547          if (err) {
548             result = vk_errorf(queue, VK_ERROR_UNKNOWN,
549                                "sync file import failed: %m");
550             goto fail;
551          }
552       }
553 
554       query->maybe_available = true;
555    }
556 
557 fail:
558    if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
559       close(fd);
560 
561    cnd_broadcast(&job->device->query_ended);
562    mtx_unlock(&job->device->query_mutex);
563 
564    return result;
565 }
566 
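/* Implements vkCmdCopyQueryPoolResults for timestamp and performance query
 * pools: submitted as a CPU job when the kernel exposes a CPU queue,
 * otherwise the results are gathered on the CPU into the mapped destination
 * buffer.
 */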
567 static VkResult
568 handle_copy_query_results_cpu_job(struct v3dv_queue *queue,
569                                   struct v3dv_job *job,
570                                   struct v3dv_submit_sync_info *sync_info,
571                                   bool signal_syncs)
572 {
573    struct v3dv_device *device = queue->device;
574    struct v3dv_copy_query_results_cpu_job_info *info =
575       &job->cpu.query_copy_results;
576 
577    assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
578           info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
579 
580    assert(info->dst && info->dst->mem && info->dst->mem->bo);
581    struct v3dv_bo *bo = info->dst->mem->bo;
582 
583    if (device->pdevice->caps.cpu_queue) {
584       struct drm_v3d_submit_cpu submit = {0};
585       struct drm_v3d_multi_sync ms = {0};
586 
587       uint32_t *offsets = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
588       uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
589       uint32_t *bo_handles = NULL;
590       uintptr_t *kperfmon_ids = NULL;
591 
592       if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
593          submit.bo_handle_count = 2;
594 
595          bo_handles = (uint32_t *)
596             malloc(sizeof(uint32_t) * submit.bo_handle_count);
597 
598          bo_handles[0] = bo->handle;
599          bo_handles[1] = info->pool->timestamp.bo->handle;
600          submit.bo_handles = (uintptr_t)(void *)bo_handles;
601 
602          struct drm_v3d_copy_timestamp_query copy = {0};
603 
604          set_ext(&copy.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_TIMESTAMP_QUERY, 0);
605 
606          copy.do_64bit = info->flags & VK_QUERY_RESULT_64_BIT;
607          copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT;
608          copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
609          copy.offset = info->offset + info->dst->mem_offset;
610          copy.stride = info->stride;
611          copy.count = info->count;
612 
613          for (uint32_t i = 0; i < info->count; i++) {
614             assert(info->first < info->pool->query_count);
615             assert(info->first + info->count <= info->pool->query_count);
616             struct v3dv_query *query = &info->pool->queries[info->first + i];
617 
618             offsets[i] = query->timestamp.offset;
619             syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
620          }
621 
622          copy.offsets = (uintptr_t)(void *)offsets;
623          copy.syncs = (uintptr_t)(void *)syncs;
624 
625          set_multisync(&ms, sync_info, NULL, 0, (void *)&copy, device, job,
626                        V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
627          if (!ms.base.id)
628             return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
629       } else {
630          assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
631 
632          submit.bo_handle_count = 1;
633          submit.bo_handles = (uintptr_t)(void *)&bo->handle;
634 
635          struct drm_v3d_copy_performance_query copy = {0};
636 
637          set_ext(&copy.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_PERFORMANCE_QUERY, 0);
638 
639 	 /* If the queryPool was created with VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR,
640 	  * results for each query are written as an array of the type indicated
641 	  * by VkPerformanceCounterKHR::storage for the counter being queried.
642 	  * For v3dv, VkPerformanceCounterKHR::storage is
643 	  * VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR.
644 	  */
645          copy.do_64bit = true;
646          copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT;
647          copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
648          copy.offset = info->offset + info->dst->mem_offset;
649          copy.stride = info->stride;
650          copy.count = info->count;
651          copy.nperfmons = info->pool->perfmon.nperfmons;
652          copy.ncounters = info->pool->perfmon.ncounters;
653 
654          kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count);
655 
656          struct vk_sync_wait waits[info->count];
657          unsigned wait_count = 0;
658 
659          for (uint32_t i = 0; i < info->count; i++) {
660             assert(info->first < info->pool->query_count);
661             assert(info->first + info->count <= info->pool->query_count);
662             struct v3dv_query *query = &info->pool->queries[info->first + i];
663 
664             syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
665             kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids;
666 
667             if (info->flags & VK_QUERY_RESULT_WAIT_BIT) {
668                 waits[wait_count] = (struct vk_sync_wait){
669                    .sync = query->perf.last_job_sync
670                 };
671                 wait_count++;
672             }
673          }
674 
675          copy.syncs = (uintptr_t)(void *)syncs;
676          copy.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids;
677 
678          set_multisync(&ms, sync_info, waits, wait_count, (void *)&copy, device, job,
679                        V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
680          if (!ms.base.id)
681             return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
682       }
683 
684       submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
685       submit.extensions = (uintptr_t)(void *)&ms;
686 
687       int ret = v3dv_ioctl(device->pdevice->render_fd,
688                            DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
689 
690       free(kperfmon_ids);
691       free(bo_handles);
692       free(offsets);
693       free(syncs);
694       multisync_free(device, &ms);
695 
696       queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
697 
698       if (ret)
699          return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
700 
701       return VK_SUCCESS;
702    }
703 
704    /* Map the entire dst buffer for the CPU copy if needed */
705    assert(!bo->map || bo->map_size == bo->size);
706    if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
707       return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
708 
709    uint8_t *offset = ((uint8_t *) bo->map) +
710                      info->offset + info->dst->mem_offset;
711    v3dv_get_query_pool_results_cpu(job->device,
712                                    info->pool,
713                                    info->first,
714                                    info->count,
715                                    offset,
716                                    info->stride,
717                                    info->flags);
718 
719    return VK_SUCCESS;
720 }
721 
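/* Implements timestamp query writes: without a kernel CPU queue we wait for
 * the queue to go idle and record CLOCK_MONOTONIC from user space; with it
 * we submit a fully serialized CPU job that writes the timestamps and
 * signals the query syncobjs.
 */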
722 static VkResult
723 handle_timestamp_query_cpu_job(struct v3dv_queue *queue,
724                                struct v3dv_job *job,
725                                struct v3dv_submit_sync_info *sync_info,
726                                bool signal_syncs)
727 {
728    struct v3dv_device *device = queue->device;
729 
730    assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY);
731    struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp;
732 
733    if (!device->pdevice->caps.cpu_queue) {
734       /* Wait for completion of all work queued before the timestamp query */
735       VkResult result = queue_wait_idle(queue, sync_info);
736       if (result != VK_SUCCESS)
737          return result;
738 
739       mtx_lock(&job->device->query_mutex);
740 
741       /* Compute timestamp */
742       struct timespec t;
743       clock_gettime(CLOCK_MONOTONIC, &t);
744 
745       for (uint32_t i = 0; i < info->count; i++) {
746          assert(info->query + i < info->pool->query_count);
747 	 struct v3dv_query *query = &info->pool->queries[info->query + i];
748          query->maybe_available = true;
749 
750          /* Value */
751          uint8_t *value_addr =
752             ((uint8_t *) info->pool->timestamp.bo->map) + query->timestamp.offset;
753          *((uint64_t*)value_addr) = (i == 0) ? t.tv_sec * 1000000000ull + t.tv_nsec : 0ull;
754 
755          /* Availability */
756          result = vk_sync_signal(&job->device->vk, query->timestamp.sync, 0);
757       }
758 
759       cnd_broadcast(&job->device->query_ended);
760       mtx_unlock(&job->device->query_mutex);
761 
762       return result;
763    }
764 
765    struct drm_v3d_submit_cpu submit = {0};
766 
767    submit.bo_handle_count = 1;
768    submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle;
769 
770    struct drm_v3d_timestamp_query timestamp = {0};
771 
772    set_ext(&timestamp.base, NULL, DRM_V3D_EXT_ID_CPU_TIMESTAMP_QUERY, 0);
773 
774    timestamp.count = info->count;
775 
776    uint32_t *offsets =
777       (uint32_t *) malloc(sizeof(uint32_t) * info->count);
778    uint32_t *syncs =
779       (uint32_t *) malloc(sizeof(uint32_t) * info->count);
780 
781    for (uint32_t i = 0; i < info->count; i++) {
782       assert(info->query + i < info->pool->query_count);
783       struct v3dv_query *query = &info->pool->queries[info->query + i];
784       query->maybe_available = true;
785 
786       offsets[i] = query->timestamp.offset;
787       syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
788    }
789 
790    timestamp.offsets = (uintptr_t)(void *)offsets;
791    timestamp.syncs = (uintptr_t)(void *)syncs;
792 
793    struct drm_v3d_multi_sync ms = {0};
794 
795    /* The CPU job should be serialized so it only executes after all previously
796     * submitted work has completed
797     */
798    job->serialize = V3DV_BARRIER_ALL;
799 
800    set_multisync(&ms, sync_info, NULL, 0, (void *)&timestamp, device, job,
801 	         V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
802    if (!ms.base.id)
803       return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
804 
805    submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
806    submit.extensions = (uintptr_t)(void *)&ms;
807 
808    int ret = v3dv_ioctl(device->pdevice->render_fd,
809 			DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
810 
811    free(offsets);
812    free(syncs);
813    multisync_free(device, &ms);
814 
815    queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
816 
817    if (ret)
818       return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
819 
820    return VK_SUCCESS;
821 }
822 
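/* Handles indirect compute dispatches: without a kernel CPU queue we read
 * the workgroup counts from the indirect buffer and rewrite the CSD job in
 * user space; with it we submit a CPU job carrying the CSD submit so the
 * kernel patches the dispatch parameters and launches the compute job.
 */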
823 static VkResult
824 handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
825                             struct v3dv_job *job,
826                             struct v3dv_submit_sync_info *sync_info,
827                             bool signal_syncs)
828 {
829    struct v3dv_device *device = queue->device;
830 
831    assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
832    struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect;
833    assert(info->csd_job);
834 
835    assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
836    struct v3dv_bo *bo = info->buffer->mem->bo;
837 
838    if (!device->pdevice->caps.cpu_queue) {
839       /* Make sure the GPU is no longer using the indirect buffer */
840       v3dv_bo_wait(queue->device, bo, OS_TIMEOUT_INFINITE);
841 
842       /* Map the indirect buffer and read the dispatch parameters */
843       if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
844          return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
845       assert(bo->map);
846 
847       const uint32_t offset = info->buffer->mem_offset + info->offset;
848       const uint32_t *group_counts = (uint32_t *) (bo->map + offset);
849       if (group_counts[0] == 0 || group_counts[1] == 0 || group_counts[2] == 0)
850          return VK_SUCCESS;
851 
852       if (memcmp(group_counts, info->csd_job->csd.wg_count,
853 		 sizeof(info->csd_job->csd.wg_count)) != 0) {
854          v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts);
855       }
856 
857       return VK_SUCCESS;
858    }
859 
860    struct v3dv_job *csd_job = info->csd_job;
861 
862    struct drm_v3d_submit_cpu submit = {0};
863 
864    submit.bo_handle_count = 1;
865    submit.bo_handles = (uintptr_t)(void *)&bo->handle;
866 
867    csd_job->csd.submit.bo_handle_count = csd_job->bo_count;
868    uint32_t *bo_handles = (uint32_t *) malloc(sizeof(uint32_t) * csd_job->bo_count);
869    uint32_t bo_idx = 0;
870    set_foreach (csd_job->bos, entry) {
871       struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
872       bo_handles[bo_idx++] = bo->handle;
873    }
874    csd_job->csd.submit.bo_handles = (uintptr_t)(void *)bo_handles;
875 
876    struct drm_v3d_indirect_csd indirect = {0};
877 
878    set_ext(&indirect.base, NULL, DRM_V3D_EXT_ID_CPU_INDIRECT_CSD, 0);
879 
880    indirect.submit = csd_job->csd.submit;
881    indirect.offset = info->buffer->mem_offset + info->offset;
882    indirect.wg_size = info->wg_size;
883 
884    for (int i = 0; i < 3; i++) {
885       if (info->wg_uniform_offsets[i]) {
886          assert(info->wg_uniform_offsets[i] >= (uint32_t *) csd_job->indirect.base);
887          indirect.wg_uniform_offsets[i] = info->wg_uniform_offsets[i] - (uint32_t *) csd_job->indirect.base;
888       } else {
889          indirect.wg_uniform_offsets[i] = 0xffffffff; /* No rewrite */
890       }
891    }
892 
893    indirect.indirect = csd_job->indirect.bo->handle;
894 
895    struct drm_v3d_multi_sync ms = {0};
896 
897    /* We need to configure the semaphores of this job with the indirect
898     * CSD job, as the CPU job must honor the CSD job's synchronization
899     * requirements, such as barriers.
900     */
901    set_multisync(&ms, sync_info, NULL, 0, (void *)&indirect, device, csd_job,
902 	         V3DV_QUEUE_CPU, V3DV_QUEUE_CSD, V3D_CPU, signal_syncs);
903    if (!ms.base.id)
904       return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
905 
906    submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
907    submit.extensions = (uintptr_t)(void *)&ms;
908 
909    int ret = v3dv_ioctl(device->pdevice->render_fd,
910 			DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
911 
912    free(bo_handles);
913    multisync_free(device, &ms);
914 
915    queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
916    queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;
917 
918    if (ret)
919       return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
920 
921    return VK_SUCCESS;
922 }
923 
924 /**
925  * This handles semaphore waits for the single sync path by accumulating
926  * wait semaphores into the QUEUE_ANY syncobj. Notice this is only required
927  * to ensure we accumulate any *external* semaphores (since for anything else
928  * we are already accumulating out syncs with each submission to the kernel).
929  */
930 static VkResult
931 process_singlesync_waits(struct v3dv_queue *queue,
932                          uint32_t count, struct vk_sync_wait *waits)
933 {
934    struct v3dv_device *device = queue->device;
935    assert(!device->pdevice->caps.multisync);
936 
937    if (count == 0)
938       return VK_SUCCESS;
939 
940    VkResult result = VK_SUCCESS;
941 
942    int err = 0;
943    int fd = -1;
944    err = drmSyncobjExportSyncFile(device->pdevice->render_fd,
945                                   queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
946                                   &fd);
947    if (err) {
948       result = vk_errorf(queue, VK_ERROR_UNKNOWN,
949                          "sync file export failed: %m");
950       goto fail;
951    }
952 
953    for (uint32_t i = 0; i < count; i++) {
954       uint32_t syncobj = vk_sync_as_drm_syncobj(waits[i].sync)->syncobj;
955       int wait_fd = -1;
956 
957       err = drmSyncobjExportSyncFile(device->pdevice->render_fd,
958                                      syncobj, &wait_fd);
959       if (err) {
960          result = vk_errorf(queue, VK_ERROR_UNKNOWN,
961                             "sync file export failed: %m");
962          goto fail;
963       }
964 
965       err = sync_accumulate("v3dv", &fd, wait_fd);
966       close(wait_fd);
967       if (err) {
968          result = vk_errorf(queue, VK_ERROR_UNKNOWN,
969                             "sync file merge failed: %m");
970          goto fail;
971       }
972    }
973 
974    err = drmSyncobjImportSyncFile(device->pdevice->render_fd,
975                                   queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
976                                   fd);
977    if (err) {
978       result = vk_errorf(queue, VK_ERROR_UNKNOWN,
979                          "sync file import failed: %m");
980    }
981 
982 fail:
983    close(fd);
984    return result;
985 }
986 
987 /**
988  * This handles signaling for the single-sync path by importing the QUEUE_ANY
989  * syncobj into all syncs to be signaled.
990  */
991 static VkResult
992 process_singlesync_signals(struct v3dv_queue *queue,
993                            uint32_t count, struct vk_sync_signal *signals)
994 {
995    struct v3dv_device *device = queue->device;
996    assert(!device->pdevice->caps.multisync && count > 0);
997 
998    if (device->pdevice->caps.multisync)
999       return VK_SUCCESS;
1000 
1001    int fd = -1;
1002    drmSyncobjExportSyncFile(device->pdevice->render_fd,
1003                             queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
1004                             &fd);
1005    if (fd == -1) {
1006       return vk_errorf(queue, VK_ERROR_UNKNOWN,
1007                        "sync file export failed: %m");
1008    }
1009 
1010    VkResult result = VK_SUCCESS;
1011    for (uint32_t i = 0; i < count; i++) {
1012       uint32_t syncobj = vk_sync_as_drm_syncobj(signals[i].sync)->syncobj;
1013       int err = drmSyncobjImportSyncFile(device->pdevice->render_fd,
1014                                          syncobj, fd);
1015       if (err) {
1016          result = vk_errorf(queue, VK_ERROR_UNKNOWN,
1017                             "sync file import failed: %m");
1018          break;
1019       }
1020    }
1021 
1022    assert(fd >= 0);
1023    close(fd);
1024 
1025    return result;
1026 }
1027 
1028 /* This must be called after every submission in the single-sync path to
1029  * accumulate the out_sync into the QUEUE_ANY sync so we can serialize
1030  * jobs by waiting on the QUEUE_ANY sync.
1031  */
1032 static int
1033 update_any_queue_sync(struct v3dv_queue *queue, uint32_t out_sync)
1034 {
1035    struct v3dv_device *device = queue->device;
1036    assert(!device->pdevice->caps.multisync);
1037 
1038    int render_fd = device->pdevice->render_fd;
1039    int fd_any = -1, fd_out_sync = -1;
1040    int err;
1041    err  = drmSyncobjExportSyncFile(render_fd,
1042                                    queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
1043                                    &fd_any);
1044    if (err)
1045       goto fail;
1046 
1047    err = drmSyncobjExportSyncFile(render_fd, out_sync, &fd_out_sync);
1048    if (err)
1049       goto fail;
1050 
1051    err = sync_accumulate("v3dv", &fd_any, fd_out_sync);
1052    if (err)
1053       goto fail;
1054 
1055    err = drmSyncobjImportSyncFile(render_fd,
1056                                   queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
1057                                   fd_any);
1058 
1059 fail:
1060    close(fd_any);
1061    close(fd_out_sync);
1062    return err;
1063 }
1064 
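/* Submits a binning/render (CL) job. Decides whether serialization requires
 * a binning (BCL) or render (RCL) sync and wires the wait/signal syncobjs
 * through either the multisync extension or the legacy in/out sync fields.
 */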
1065 static VkResult
1066 handle_cl_job(struct v3dv_queue *queue,
1067               struct v3dv_job *job,
1068               uint32_t counter_pass_idx,
1069               struct v3dv_submit_sync_info *sync_info,
1070               bool signal_syncs)
1071 {
1072    struct v3dv_device *device = queue->device;
1073 
1074    struct drm_v3d_submit_cl submit = { 0 };
1075 
1076    /* Sanity check: we should only flag a bcl sync on a job that needs to be
1077     * serialized.
1078     */
1079    assert(job->serialize || !job->needs_bcl_sync);
1080 
1081    /* We expect to have just one RCL per job, which should fit in just one BO.
1082     * Our BCL, however, could chain multiple BOs together.
1083     */
1084    assert(list_length(&job->rcl.bo_list) == 1);
1085    assert(list_length(&job->bcl.bo_list) >= 1);
1086    struct v3dv_bo *bcl_first_bo =
1087       list_first_entry(&job->bcl.bo_list, struct v3dv_bo, list_link);
1088    submit.bcl_start = bcl_first_bo->offset;
1089    submit.bcl_end = job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
1090    submit.rcl_start = job->rcl.bo->offset;
1091    submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);
1092 
1093    submit.qma = job->tile_alloc->offset;
1094    submit.qms = job->tile_alloc->size;
1095    submit.qts = job->tile_state->offset;
1096 
1097    submit.flags = 0;
1098    if (job->tmu_dirty_rcl)
1099       submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;
1100 
1101    /* If the job uses VK_KHR_buffer_device_address we need to ensure all
1102     * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
1103     * are included.
1104     */
1105    if (job->uses_buffer_device_address) {
1106       util_dynarray_foreach(&queue->device->device_address_bo_list,
1107                             struct v3dv_bo *, bo) {
1108          v3dv_job_add_bo(job, *bo);
1109       }
1110    }
1111 
1112    submit.bo_handle_count = job->bo_count;
1113    uint32_t *bo_handles =
1114       (uint32_t *) malloc(sizeof(uint32_t) * submit.bo_handle_count);
1115    uint32_t bo_idx = 0;
1116    set_foreach(job->bos, entry) {
1117       struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
1118       bo_handles[bo_idx++] = bo->handle;
1119    }
1120    assert(bo_idx == submit.bo_handle_count);
1121    submit.bo_handles = (uintptr_t)(void *)bo_handles;
1122 
1123    submit.perfmon_id = job->perf ?
1124       job->perf->kperfmon_ids[counter_pass_idx] : 0;
1125    const bool needs_perf_sync = queue->last_perfmon_id != submit.perfmon_id;
1126    queue->last_perfmon_id = submit.perfmon_id;
1127 
1128    /* We need a binning sync if we are the first CL job waiting on a semaphore
1129     * with a wait stage that involves the geometry pipeline, or if the job
1130     * comes after a pipeline barrier that involves geometry stages
1131     * (needs_bcl_sync) or when performance queries are in use.
1132     *
1133     * We need a render sync if the job doesn't need a binning sync but has
1134     * still been flagged for serialization. It should be noted that RCL jobs
1135     * don't start until the previous RCL job has finished, so we don't really
1136     * need to add a fence for those; however, we might need to wait on a CSD or
1137     * TFU job, which are not automatically serialized with CL jobs.
1138     */
1139    bool needs_bcl_sync = job->needs_bcl_sync || needs_perf_sync;
1140    if (queue->last_job_syncs.first[V3DV_QUEUE_CL]) {
1141       for (int i = 0; !needs_bcl_sync && i < sync_info->wait_count; i++) {
1142          needs_bcl_sync = sync_info->waits[i].stage_mask &
1143              (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
1144               VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
1145               VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
1146               VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
1147               VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
1148               VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
1149               VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
1150               VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
1151               VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
1152               VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
1153               VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
1154               VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT);
1155       }
1156    }
1157 
1158    bool needs_rcl_sync = job->serialize && !needs_bcl_sync;
1159 
1160    /* Replace single-semaphore settings whenever our kernel driver supports
1161     * the multisync (multiple semaphores) extension.
1162     */
1163    struct drm_v3d_multi_sync ms = { 0 };
1164    if (device->pdevice->caps.multisync) {
1165       enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN;
1166       set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
1167                     V3DV_QUEUE_CL, V3DV_QUEUE_CL, wait_stage, signal_syncs);
1168       if (!ms.base.id)
1169          return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1170 
1171       submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
1172       submit.extensions = (uintptr_t)(void *)&ms;
1173       /* Disable legacy sync interface when multisync extension is used */
1174       submit.in_sync_rcl = 0;
1175       submit.in_sync_bcl = 0;
1176       submit.out_sync = 0;
1177    } else {
1178       uint32_t last_job_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_ANY];
1179       submit.in_sync_bcl = needs_bcl_sync ? last_job_sync : 0;
1180       submit.in_sync_rcl = needs_rcl_sync ? last_job_sync : 0;
1181       submit.out_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_CL];
1182    }
1183 
1184    v3dv_clif_dump(device, job, &submit);
1185    int ret = v3dv_ioctl(device->pdevice->render_fd,
1186                         DRM_IOCTL_V3D_SUBMIT_CL, &submit);
1187 
1188    static bool warned = false;
1189    if (ret && !warned) {
1190       fprintf(stderr, "Draw call returned %s. Expect corruption.\n",
1191               strerror(errno));
1192       warned = true;
1193    }
1194 
1195    if (!device->pdevice->caps.multisync && ret == 0)
1196       ret = update_any_queue_sync(queue, submit.out_sync);
1197 
1198    free(bo_handles);
1199    multisync_free(device, &ms);
1200 
1201    queue->last_job_syncs.first[V3DV_QUEUE_CL] = false;
1202 
1203    if (ret)
1204       return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CL failed: %m");
1205 
1206    return VK_SUCCESS;
1207 }
1208 
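/* Submits a TFU job, wiring its waits and signals through the multisync
 * extension when available or the legacy in/out sync fields otherwise.
 */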
1209 static VkResult
1210 handle_tfu_job(struct v3dv_queue *queue,
1211                struct v3dv_job *job,
1212                struct v3dv_submit_sync_info *sync_info,
1213                bool signal_syncs)
1214 {
1215    assert(!V3D_DBG(DISABLE_TFU));
1216 
1217    struct v3dv_device *device = queue->device;
1218 
1219    const bool needs_sync = sync_info->wait_count || job->serialize;
1220 
1221    /* Replace single-semaphore settings whenever our kernel driver supports
1222     * the multisync (multiple semaphores) extension.
1223     */
1224    struct drm_v3d_multi_sync ms = { 0 };
1225    if (device->pdevice->caps.multisync) {
1226       set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
1227                     V3DV_QUEUE_TFU, V3DV_QUEUE_TFU, V3D_TFU, signal_syncs);
1228       if (!ms.base.id)
1229          return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1230 
1231       job->tfu.flags |= DRM_V3D_SUBMIT_EXTENSION;
1232       job->tfu.extensions = (uintptr_t)(void *)&ms;
1233       /* Disable legacy sync interface when multisync extension is used */
1234       job->tfu.in_sync = 0;
1235       job->tfu.out_sync = 0;
1236    } else {
1237       uint32_t last_job_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_ANY];
1238       job->tfu.in_sync = needs_sync ? last_job_sync : 0;
1239       job->tfu.out_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_TFU];
1240    }
1241    int ret = v3dv_ioctl(device->pdevice->render_fd,
1242                         DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);
1243 
1244    if (!device->pdevice->caps.multisync && ret == 0)
1245       ret = update_any_queue_sync(queue, job->tfu.out_sync);
1246 
1247    multisync_free(device, &ms);
1248    queue->last_job_syncs.first[V3DV_QUEUE_TFU] = false;
1249 
1250    if (ret != 0)
1251       return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_TFU failed: %m");
1252 
1253    return VK_SUCCESS;
1254 }
1255 
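/* Submits a compute (CSD) job, including any buffer-device-address BOs, and
 * wires its waits and signals through the multisync extension when available
 * or the legacy in/out sync fields otherwise.
 */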
1256 static VkResult
1257 handle_csd_job(struct v3dv_queue *queue,
1258                struct v3dv_job *job,
1259                uint32_t counter_pass_idx,
1260                struct v3dv_submit_sync_info *sync_info,
1261                bool signal_syncs)
1262 {
1263    struct v3dv_device *device = queue->device;
1264 
1265    struct drm_v3d_submit_csd *submit = &job->csd.submit;
1266 
1267    /* If the job uses VK_KHR_buffer_device_address we need to ensure all
1268     * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
1269     * are included.
1270     */
1271    if (job->uses_buffer_device_address) {
1272       util_dynarray_foreach(&queue->device->device_address_bo_list,
1273                             struct v3dv_bo *, bo) {
1274          v3dv_job_add_bo(job, *bo);
1275       }
1276    }
1277 
1278    submit->bo_handle_count = job->bo_count;
1279    uint32_t *bo_handles =
1280       (uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2));
1281    uint32_t bo_idx = 0;
1282    set_foreach(job->bos, entry) {
1283       struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
1284       bo_handles[bo_idx++] = bo->handle;
1285    }
1286    assert(bo_idx == submit->bo_handle_count);
1287    submit->bo_handles = (uintptr_t)(void *)bo_handles;
1288 
1289    const bool needs_sync = sync_info->wait_count || job->serialize;
1290 
1291    /* Replace single-semaphore settings whenever our kernel driver supports
1292     * the multisync (multiple semaphores) extension.
1293     */
1294    struct drm_v3d_multi_sync ms = { 0 };
1295    if (device->pdevice->caps.multisync) {
1296       set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
1297                     V3DV_QUEUE_CSD, V3DV_QUEUE_CSD, V3D_CSD, signal_syncs);
1298       if (!ms.base.id)
1299          return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1300 
1301       submit->flags |= DRM_V3D_SUBMIT_EXTENSION;
1302       submit->extensions = (uintptr_t)(void *)&ms;
1303       /* Disable legacy sync interface when multisync extension is used */
1304       submit->in_sync = 0;
1305       submit->out_sync = 0;
1306    } else {
1307       uint32_t last_job_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_ANY];
1308       submit->in_sync = needs_sync ? last_job_sync : 0;
1309       submit->out_sync = queue->last_job_syncs.syncs[V3DV_QUEUE_CSD];
1310    }
1311    submit->perfmon_id = job->perf ?
1312       job->perf->kperfmon_ids[counter_pass_idx] : 0;
1313    queue->last_perfmon_id = submit->perfmon_id;
1314    int ret = v3dv_ioctl(device->pdevice->render_fd,
1315                         DRM_IOCTL_V3D_SUBMIT_CSD, submit);
1316 
1317    static bool warned = false;
1318    if (ret && !warned) {
1319       fprintf(stderr, "Compute dispatch returned %s. Expect corruption.\n",
1320               strerror(errno));
1321       warned = true;
1322    }
1323 
1324    if (!device->pdevice->caps.multisync && ret == 0)
1325       ret = update_any_queue_sync(queue, submit->out_sync);
1326 
1327    free(bo_handles);
1328 
1329    multisync_free(device, &ms);
1330    queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;
1331 
1332    if (ret)
1333       return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CSD failed: %m");
1334 
1335    return VK_SUCCESS;
1336 }
1337 
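/* Dispatches a job to the submission handler that matches its type. */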
1338 static VkResult
1339 queue_handle_job(struct v3dv_queue *queue,
1340                  struct v3dv_job *job,
1341                  uint32_t counter_pass_idx,
1342                  struct v3dv_submit_sync_info *sync_info,
1343                  bool signal_syncs)
1344 {
1345    switch (job->type) {
1346    case V3DV_JOB_TYPE_GPU_CL:
1347       return handle_cl_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
1348    case V3DV_JOB_TYPE_GPU_TFU:
1349       return handle_tfu_job(queue, job, sync_info, signal_syncs);
1350    case V3DV_JOB_TYPE_GPU_CSD:
1351       return handle_csd_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
1352    case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
1353       return handle_reset_query_cpu_job(queue, job, sync_info, signal_syncs);
1354    case V3DV_JOB_TYPE_CPU_END_QUERY:
1355       return handle_end_query_cpu_job(job, counter_pass_idx);
1356    case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
1357       return handle_copy_query_results_cpu_job(queue, job, sync_info, signal_syncs);
1358    case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
1359       return handle_csd_indirect_cpu_job(queue, job, sync_info, signal_syncs);
1360    case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY:
1361       return handle_timestamp_query_cpu_job(queue, job, sync_info, signal_syncs);
1362    default:
1363       unreachable("Unhandled job type");
1364    }
1365 }
1366 
1367 static VkResult
1368 queue_create_noop_job(struct v3dv_queue *queue)
1369 {
1370    struct v3dv_device *device = queue->device;
1371    queue->noop_job = vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_job), 8,
1372                                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1373    if (!queue->noop_job)
1374       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1375    v3dv_job_init(queue->noop_job, V3DV_JOB_TYPE_GPU_CL, device, NULL, -1);
1376 
1377    v3dv_X(device, job_emit_noop)(queue->noop_job);
1378 
1379    /* We use no-op jobs to signal semaphores/fences. These jobs need to be
1380     * serialized across all hw queues to comply with Vulkan's signal operation
1381     * order requirements, which basically require that signal operations occur
1382     * in submission order.
1383     */
1384    queue->noop_job->serialize = V3DV_BARRIER_ALL;
1385 
1386    return VK_SUCCESS;
1387 }
1388 
1389 static VkResult
1390 queue_submit_noop_job(struct v3dv_queue *queue,
1391                       uint32_t counter_pass_idx,
1392                       struct v3dv_submit_sync_info *sync_info,
1393                       bool signal_syncs)
1394 {
1395    if (!queue->noop_job) {
1396       VkResult result = queue_create_noop_job(queue);
1397       if (result != VK_SUCCESS)
1398          return result;
1399    }
1400 
1401    assert(queue->noop_job);
1402    return queue_handle_job(queue, queue->noop_job, counter_pass_idx,
1403                            sync_info, signal_syncs);
1404 }
1405 
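/* Driver submit hook for the common queue code: processes all jobs from the
 * submitted command buffers in order, injecting no-op jobs to consume
 * trailing barriers, and handles wait/signal semaphores through either the
 * multisync or the single-sync path.
 */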
1406 VkResult
1407 v3dv_queue_driver_submit(struct vk_queue *vk_queue,
1408                          struct vk_queue_submit *submit)
1409 {
1410    struct v3dv_queue *queue = container_of(vk_queue, struct v3dv_queue, vk);
1411    VkResult result;
1412 
1413    struct v3dv_submit_sync_info sync_info = {
1414       .wait_count = submit->wait_count,
1415       .waits = submit->waits,
1416       .signal_count = submit->signal_count,
1417       .signals = submit->signals,
1418    };
1419 
1420    for (int i = 0; i < V3DV_QUEUE_COUNT; i++)
1421       queue->last_job_syncs.first[i] = true;
1422 
1423    /* If we do not have multisync we need to ensure we accumulate any wait
1424     * semaphores into our QUEUE_ANY syncobj so we can handle waiting on
1425     * external semaphores.
1426     */
1427    if (!queue->device->pdevice->caps.multisync) {
1428       result =
1429          process_singlesync_waits(queue, sync_info.wait_count, sync_info.waits);
1430       if (result != VK_SUCCESS)
1431          return result;
1432    }
1433 
1434    for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
1435       struct v3dv_cmd_buffer *cmd_buffer =
1436          container_of(submit->command_buffers[i], struct v3dv_cmd_buffer, vk);
1437       list_for_each_entry_safe(struct v3dv_job, job,
1438                                &cmd_buffer->jobs, list_link) {
1439 
1440          result = queue_handle_job(queue, job, submit->perf_pass_index,
1441                                    &sync_info, false);
1442          if (result != VK_SUCCESS)
1443             return result;
1444       }
1445 
1446       /* If the command buffer ends with a barrier we need to consume it now.
1447        *
1448        * FIXME: this will drain all hw queues. Instead, we could use the pending
1449        * barrier state to limit the queues we serialize against.
1450        */
1451       if (cmd_buffer->state.barrier.dst_mask) {
1452          result = queue_submit_noop_job(queue, submit->perf_pass_index,
1453                                         &sync_info, false);
1454          if (result != VK_SUCCESS)
1455             return result;
1456       }
1457    }
1458 
1459    /* Handle signaling now */
1460    if (submit->signal_count > 0) {
1461       if (queue->device->pdevice->caps.multisync) {
1462          /* Finish by submitting a no-op job that synchronizes across all queues.
1463           * This will ensure that the signal semaphores don't get triggered until
1464           * all work on any queue completes. See Vulkan's signal operation order
1465           * requirements.
1466           */
1467          return queue_submit_noop_job(queue, submit->perf_pass_index,
1468                                       &sync_info, true);
1469       } else {
1470          return process_singlesync_signals(queue, sync_info.signal_count,
1471                                            sync_info.signals);
1472       }
1473    }
1474 
1475    return VK_SUCCESS;
1476 }
1477 
1478 VKAPI_ATTR VkResult VKAPI_CALL
1479 v3dv_QueueBindSparse(VkQueue _queue,
1480                      uint32_t bindInfoCount,
1481                      const VkBindSparseInfo *pBindInfo,
1482                      VkFence fence)
1483 {
1484    V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
1485    return vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT);
1486 }
1487