/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "hk_queue.h"

#include "agx_bg_eot.h"
#include "agx_bo.h"
#include "agx_device.h"
#include "agx_pack.h"
#include "decode.h"
#include "hk_cmd_buffer.h"
#include "hk_device.h"
#include "hk_physical_device.h"

#include <xf86drm.h>
#include "asahi/lib/unstable_asahi_drm.h"
#include "util/list.h"
#include "vulkan/vulkan_core.h"

#include "vk_drm_syncobj.h"
#include "vk_sync.h"

/*
 * We need to specially handle submits with no control streams. The kernel
 * can't accept empty submits, but we can end up here in Vulkan for
 * synchronization purposes only. Rather than submit a no-op job (slow),
 * we simply tie the fences together.
 */
static VkResult
queue_submit_empty(struct hk_device *dev, struct hk_queue *queue,
                   struct vk_queue_submit *submit)
{
   int fd = dev->dev.fd;

   /* Transfer the waits into the queue timeline. */
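   /* drmSyncobjTransfer(fd, dst_handle, dst_point, src_handle, src_point,
    * flags) copies the fence from a point on the source syncobj to a point on
    * the destination, so each wait below becomes a fresh point on the queue
    * timeline.
    */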
   for (unsigned i = 0; i < submit->wait_count; ++i) {
      struct vk_sync_wait *wait = &submit->waits[i];

      assert(vk_sync_type_is_drm_syncobj(wait->sync->type));
      const struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(wait->sync);

      drmSyncobjTransfer(fd, queue->drm.syncobj, ++queue->drm.timeline_value,
                         syncobj->syncobj, wait->wait_value, 0);
   }

   /* Transfer the queue timeline into each out fence. They will all be
    * signalled once the queue timeline reaches its current value, i.e. when
    * everything previously queued has finished.
    */
   for (unsigned i = 0; i < submit->signal_count; ++i) {
      struct vk_sync_signal *signal = &submit->signals[i];

      assert(vk_sync_type_is_drm_syncobj(signal->sync->type));
      const struct vk_drm_syncobj *syncobj =
         vk_sync_as_drm_syncobj(signal->sync);

      drmSyncobjTransfer(fd, syncobj->syncobj, signal->signal_value,
                         queue->drm.syncobj, queue->drm.timeline_value, 0);
   }

   return VK_SUCCESS;
}

static void
asahi_fill_cdm_command(struct hk_device *dev, struct hk_cs *cs,
                       struct drm_asahi_cmd_compute *cmd,
                       struct drm_asahi_cmd_compute_user_timestamps *timestamps)
{
   size_t len = cs->stream_linked ? 65536 /* XXX */ : (cs->current - cs->start);

   *cmd = (struct drm_asahi_cmd_compute){
      .encoder_ptr = cs->addr,
      .encoder_end = cs->addr + len,

      .sampler_array = dev->samplers.table.bo->va->addr,
      .sampler_count = dev->samplers.table.alloc,
      .sampler_max = dev->samplers.table.alloc + 1,

      .usc_base = dev->dev.shader_base,

      .encoder_id = agx_get_global_id(&dev->dev),
      .cmd_id = agx_get_global_id(&dev->dev),
      .unk_mask = 0xffffffff,
   };

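   /* User timestamps are passed as a chained extension struct linked by
    * pointer, so the caller-provided storage must outlive this call and stay
    * valid until the submit ioctl actually happens.
    */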
   if (cs->timestamp.end.handle) {
      assert(agx_supports_timestamps(&dev->dev));

      *timestamps = (struct drm_asahi_cmd_compute_user_timestamps){
         .type = ASAHI_COMPUTE_EXT_TIMESTAMPS,
         .end_handle = cs->timestamp.end.handle,
         .end_offset = cs->timestamp.end.offset_B,
      };

      cmd->extensions = (uint64_t)(uintptr_t)timestamps;
   }

   if (cs->scratch.cs.main || cs->scratch.cs.preamble) {
      cmd->helper_arg = dev->scratch.cs.buf->va->addr;
      cmd->helper_cfg = cs->scratch.cs.preamble ? (1 << 16) : 0;
      cmd->helper_program = agx_helper_program(&dev->bg_eot);
   }
}

static void
asahi_fill_vdm_command(struct hk_device *dev, struct hk_cs *cs,
                       struct drm_asahi_cmd_render *c,
                       struct drm_asahi_cmd_render_user_timestamps *timestamps)
{
   unsigned cmd_ta_id = agx_get_global_id(&dev->dev);
   unsigned cmd_3d_id = agx_get_global_id(&dev->dev);
   unsigned encoder_id = agx_get_global_id(&dev->dev);

   memset(c, 0, sizeof(*c));

   c->encoder_ptr = cs->addr;
   c->encoder_id = encoder_id;
   c->cmd_3d_id = cmd_3d_id;
   c->cmd_ta_id = cmd_ta_id;
   c->ppp_ctrl = 0x202;

   c->fragment_usc_base = dev->dev.shader_base;
   c->vertex_usc_base = c->fragment_usc_base;

   c->fb_width = cs->cr.width;
   c->fb_height = cs->cr.height;

   c->isp_bgobjdepth = cs->cr.isp_bgobjdepth;
   c->isp_bgobjvals = cs->cr.isp_bgobjvals;

   static_assert(sizeof(c->zls_ctrl) == sizeof(cs->cr.zls_control));
   memcpy(&c->zls_ctrl, &cs->cr.zls_control, sizeof(cs->cr.zls_control));

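   /* Packed as (width - 1) in the low 15 bits and (height - 1) in the next
    * 15 bits.
    */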
   c->depth_dimensions =
      (cs->cr.zls_width - 1) | ((cs->cr.zls_height - 1) << 15);

   c->depth_buffer_load = cs->cr.depth.buffer;
   c->depth_buffer_store = cs->cr.depth.buffer;
   c->depth_buffer_partial = cs->cr.depth.buffer;

   c->depth_buffer_load_stride = cs->cr.depth.stride;
   c->depth_buffer_store_stride = cs->cr.depth.stride;
   c->depth_buffer_partial_stride = cs->cr.depth.stride;

   c->depth_meta_buffer_load = cs->cr.depth.meta;
   c->depth_meta_buffer_store = cs->cr.depth.meta;
   c->depth_meta_buffer_partial = cs->cr.depth.meta;

   c->depth_meta_buffer_load_stride = cs->cr.depth.meta_stride;
   c->depth_meta_buffer_store_stride = cs->cr.depth.meta_stride;
   c->depth_meta_buffer_partial_stride = cs->cr.depth.meta_stride;

   c->stencil_buffer_load = cs->cr.stencil.buffer;
   c->stencil_buffer_store = cs->cr.stencil.buffer;
   c->stencil_buffer_partial = cs->cr.stencil.buffer;

   c->stencil_buffer_load_stride = cs->cr.stencil.stride;
   c->stencil_buffer_store_stride = cs->cr.stencil.stride;
   c->stencil_buffer_partial_stride = cs->cr.stencil.stride;

   c->stencil_meta_buffer_load = cs->cr.stencil.meta;
   c->stencil_meta_buffer_store = cs->cr.stencil.meta;
   c->stencil_meta_buffer_partial = cs->cr.stencil.meta;

   c->stencil_meta_buffer_load_stride = cs->cr.stencil.meta_stride;
   c->stencil_meta_buffer_store_stride = cs->cr.stencil.meta_stride;
   c->stencil_meta_buffer_partial_stride = cs->cr.stencil.meta_stride;

   c->iogpu_unk_214 = cs->cr.iogpu_unk_214;

   if (cs->cr.dbias_is_int == U_TRISTATE_YES) {
      c->iogpu_unk_214 |= 0x40000;
   }

   if (dev->dev.debug & AGX_DBG_NOCLUSTER) {
      c->flags |= ASAHI_RENDER_NO_VERTEX_CLUSTERING;
   } else {
      /* XXX: We don't know what this does exactly, and the name is
       * surely wrong. But it fixes dEQP-VK.memory.pipeline_barrier.* tests on
       * G14C when clustering is enabled...
       */
      c->flags |= ASAHI_RENDER_NO_CLEAR_PIPELINE_TEXTURES;
   }

#if 0
   /* XXX is this for just MSAA+Z+S or MSAA+(Z|S)? */
   if (tib->nr_samples > 1 && framebuffer->zsbuf)
      c->flags |= ASAHI_RENDER_MSAA_ZS;
#endif

   c->utile_width = cs->tib.tile_size.width;
   c->utile_height = cs->tib.tile_size.height;

   /* Can be 0 for attachmentless rendering with no draws */
   c->samples = MAX2(cs->tib.nr_samples, 1);
   c->layers = cs->cr.layers;

   /* Drawing at the maximum framebuffer size would OOM and fail submission,
    * but vkd3d-proton does exactly that to emulate no-attachment rendering.
    * Clamp to something reasonable and hope it is good enough in practice.
    * This only affects a case that would otherwise be guaranteed broken.
    *
    * XXX: Hack for vkd3d-proton.
    */
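   /* For scale: at 4 bytes per pixel, 16384 x 16384 x 2048 layers is
    * 2^14 * 2^14 * 2^11 * 2^2 bytes = 2^41 bytes = 2 TiB.
    */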
   if (c->layers == 2048 && c->fb_width == 16384 && c->fb_height == 16384) {
      mesa_log(MESA_LOG_WARN, MESA_LOG_TAG, "Clamping massive framebuffer");
      c->layers = 32;
   }

   c->ppp_multisamplectl = cs->ppp_multisamplectl;
   c->sample_size = cs->tib.sample_size_B;
   c->tib_blocks = ALIGN_POT(agx_tilebuffer_total_size(&cs->tib), 2048) / 2048;

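   /* tan(60 deg) = sqrt(3) ~= 1.732051. fui() reinterprets the float's bit
    * pattern as a uint32_t, which is how the packed command expects it.
    */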
   float tan_60 = 1.732051f;
   c->merge_upper_x = fui(tan_60 / cs->cr.width);
   c->merge_upper_y = fui(tan_60 / cs->cr.height);

   c->load_pipeline = cs->cr.bg.main.usc | 4;
   c->store_pipeline = cs->cr.eot.main.usc | 4;
   c->partial_reload_pipeline = cs->cr.bg.partial.usc | 4;
   c->partial_store_pipeline = cs->cr.eot.partial.usc | 4;

   memcpy(&c->load_pipeline_bind, &cs->cr.bg.main.counts,
          sizeof(struct agx_counts_packed));

   memcpy(&c->store_pipeline_bind, &cs->cr.eot.main.counts,
          sizeof(struct agx_counts_packed));

   memcpy(&c->partial_reload_pipeline_bind, &cs->cr.bg.partial.counts,
          sizeof(struct agx_counts_packed));

   memcpy(&c->partial_store_pipeline_bind, &cs->cr.eot.partial.counts,
          sizeof(struct agx_counts_packed));

   c->scissor_array = cs->uploaded_scissor;
   c->depth_bias_array = cs->uploaded_zbias;

   c->vertex_sampler_array = dev->samplers.table.bo->va->addr;
   c->vertex_sampler_count = dev->samplers.table.alloc;
   c->vertex_sampler_max = dev->samplers.table.alloc + 1;

   c->fragment_sampler_array = c->vertex_sampler_array;
   c->fragment_sampler_count = c->vertex_sampler_count;
   c->fragment_sampler_max = c->vertex_sampler_max;

   c->visibility_result_buffer = dev->occlusion_queries.bo->va->addr;

   if (cs->cr.process_empty_tiles)
      c->flags |= ASAHI_RENDER_PROCESS_EMPTY_TILES;

   if (cs->scratch.vs.main || cs->scratch.vs.preamble) {
      c->flags |= ASAHI_RENDER_VERTEX_SPILLS;
      c->vertex_helper_arg = dev->scratch.vs.buf->va->addr;
      c->vertex_helper_cfg = cs->scratch.vs.preamble ? (1 << 16) : 0;
      c->vertex_helper_program = agx_helper_program(&dev->bg_eot);
   }

   if (cs->scratch.fs.main || cs->scratch.fs.preamble) {
      c->fragment_helper_arg = dev->scratch.fs.buf->va->addr;
      c->fragment_helper_cfg = cs->scratch.fs.preamble ? (1 << 16) : 0;
      c->fragment_helper_program = agx_helper_program(&dev->bg_eot);
   }

   if (cs->timestamp.end.handle) {
      assert(agx_supports_timestamps(&dev->dev));

      c->extensions = (uint64_t)(uintptr_t)timestamps;

      *timestamps = (struct drm_asahi_cmd_render_user_timestamps){
         .type = ASAHI_RENDER_EXT_TIMESTAMPS,
         .frg_end_handle = cs->timestamp.end.handle,
         .frg_end_offset = cs->timestamp.end.offset_B,
      };
   }
}

static void
asahi_fill_sync(struct drm_asahi_sync *sync, struct vk_sync *vk_sync,
                uint64_t value)
{
   if (unlikely(!vk_sync_type_is_drm_syncobj(vk_sync->type))) {
      unreachable("Unsupported sync type");
      return;
   }

   const struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(vk_sync);
   *sync = (struct drm_asahi_sync){.handle = syncobj->syncobj};

   if (vk_sync->flags & VK_SYNC_IS_TIMELINE) {
      sync->sync_type = DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ;
      sync->timeline_value = value;
   } else {
      sync->sync_type = DRM_ASAHI_SYNC_SYNCOBJ;
   }
}

union drm_asahi_cmd {
   struct drm_asahi_cmd_compute compute;
   struct drm_asahi_cmd_render render;
};

union drm_asahi_user_timestamps {
   struct drm_asahi_cmd_compute_user_timestamps compute;
   struct drm_asahi_cmd_render_user_timestamps render;
};
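
/* These unions are sized for the larger of the compute and render variants,
 * letting queue_submit() below allocate one homogeneous array per submit and
 * fill in the appropriate member per command.
 */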

/* XXX: Batching multiple commands per submission is causing rare (7ppm) flakes
 * on the CTS once lossless compression is enabled. This needs to be
 * investigated before we can reenable this mechanism. We are likely missing a
 * cache flush or barrier somewhere.
 */
static inline unsigned
max_commands_per_submit(struct hk_device *dev)
{
   return HK_PERF(dev, BATCH) ? 64 : 1;
}

static VkResult
queue_submit_single(struct hk_device *dev, struct drm_asahi_submit *submit)
{
   /* Currently we don't use the result buffer or implicit sync */
   struct agx_submit_virt virt = {
      .vbo_res_id = 0,
      .extres_count = 0,
   };

   if (dev->dev.is_virtio) {
      u_rwlock_rdlock(&dev->external_bos.lock);
      virt.extres_count = util_dynarray_num_elements(
         &dev->external_bos.list, struct asahi_ccmd_submit_res);
      virt.extres = util_dynarray_begin(&dev->external_bos.list);
   }
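
   /* Hold the read lock across the submit ioctl so another thread cannot grow
    * or reallocate the external BO list while it is being walked.
    */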
   int ret = dev->dev.ops.submit(&dev->dev, submit, &virt);

   if (dev->dev.is_virtio)
      u_rwlock_rdunlock(&dev->external_bos.lock);

   /* XXX: don't trap */
   if (ret) {
      fprintf(stderr, "DRM_IOCTL_ASAHI_SUBMIT failed: %m\n");
      assert(0);
   }

   return VK_SUCCESS;
}

/*
 * The kernel/firmware jointly impose a limit on commands per submit ioctl, but
 * we can build up arbitrarily large command buffers. We handle this here by
 * looping the ioctl, submitting slices of the command buffers that are within
 * bounds.
 */
static VkResult
queue_submit_looped(struct hk_device *dev, struct drm_asahi_submit *submit)
{
   struct drm_asahi_command *cmds = (void *)(uintptr_t)submit->commands;
   unsigned commands_remaining = submit->command_count;
   unsigned submitted[DRM_ASAHI_SUBQUEUE_COUNT] = {0};

   while (commands_remaining) {
      bool first = commands_remaining == submit->command_count;
      bool last = commands_remaining <= max_commands_per_submit(dev);

      unsigned count = MIN2(commands_remaining, max_commands_per_submit(dev));
      commands_remaining -= count;

      assert(!last || commands_remaining == 0);
      assert(count > 0);

      /* We need to fix up the barriers since barriers are ioctl-relative */
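      /* For example, if 40 render commands went out in earlier slices, a
       * barrier on the 45th overall render command becomes a barrier on the
       * 5th render command of this ioctl.
       */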
      for (unsigned i = 0; i < count; ++i) {
         for (unsigned q = 0; q < DRM_ASAHI_SUBQUEUE_COUNT; ++q) {
            if (cmds[i].barriers[q] != DRM_ASAHI_BARRIER_NONE) {
               assert(cmds[i].barriers[q] >= submitted[q]);
               cmds[i].barriers[q] -= submitted[q];
            }
         }
      }

      /* We can't signal the out-syncobjs until all prior work finishes. Since
       * only the last ioctl will signal, make sure it waits on prior ioctls.
       *
       * TODO: there might be a more performant way to do this.
       */
      if (last && !first) {
         for (unsigned q = 0; q < DRM_ASAHI_SUBQUEUE_COUNT; ++q) {
            if (cmds[0].barriers[q] == DRM_ASAHI_BARRIER_NONE)
               cmds[0].barriers[q] = 0;
         }
      }

      struct drm_asahi_submit submit_ioctl = {
         .flags = submit->flags,
         .queue_id = submit->queue_id,
         .result_handle = submit->result_handle,
         .commands = (uint64_t)(uintptr_t)(cmds),
         .command_count = count,
         .in_syncs = first ? submit->in_syncs : 0,
         .in_sync_count = first ? submit->in_sync_count : 0,
         .out_syncs = last ? submit->out_syncs : 0,
         .out_sync_count = last ? submit->out_sync_count : 0,
      };

      VkResult result = queue_submit_single(dev, &submit_ioctl);
      if (result != VK_SUCCESS)
         return result;

      for (unsigned i = 0; i < count; ++i) {
         if (cmds[i].cmd_type == DRM_ASAHI_CMD_COMPUTE)
            submitted[DRM_ASAHI_SUBQUEUE_COMPUTE]++;
         else if (cmds[i].cmd_type == DRM_ASAHI_CMD_RENDER)
            submitted[DRM_ASAHI_SUBQUEUE_RENDER]++;
         else
            unreachable("unknown subqueue");
      }

      cmds += count;
   }

   return VK_SUCCESS;
}

static VkResult
queue_submit(struct hk_device *dev, struct hk_queue *queue,
             struct vk_queue_submit *submit)
{
   unsigned command_count = 0;

   /* Gather the number of individual commands to submit up front */
   for (unsigned i = 0; i < submit->command_buffer_count; ++i) {
      struct hk_cmd_buffer *cmdbuf =
         (struct hk_cmd_buffer *)submit->command_buffers[i];

      command_count += list_length(&cmdbuf->control_streams);
   }

   perf_debug(dev, "Submitting %u control streams (%u command buffers)",
              command_count, submit->command_buffer_count);

   if (command_count == 0)
      return queue_submit_empty(dev, queue, submit);

   unsigned wait_count = 0;
   struct drm_asahi_sync *waits =
      alloca(submit->wait_count * sizeof(struct drm_asahi_sync));

   struct drm_asahi_sync *signals =
      alloca((submit->signal_count + 1) * sizeof(struct drm_asahi_sync));

   for (unsigned i = 0; i < submit->wait_count; ++i) {
      /* The kernel rejects the submission if we try to wait on the same
       * timeline semaphore at multiple points.
       *
       * TODO: Can we relax the UAPI?
       *
       * XXX: This is quadratic time.
       */
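      /* E.g. waits {(sem, 3), (sem, 5)} collapse to the single wait (sem, 5);
       * timeline points signal in order, so the later point implies the
       * earlier one.
       */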
      bool skip = false;
      if (submit->waits[i].sync->flags & VK_SYNC_IS_TIMELINE) {
         uint64_t v1 = submit->waits[i].wait_value;
         for (unsigned j = 0; j < submit->wait_count; ++j) {
            uint64_t v2 = submit->waits[j].wait_value;
            if (i != j && submit->waits[i].sync == submit->waits[j].sync &&
                (v1 < v2 || (v1 == v2 && i < j))) {
               skip = true;
               break;
            }
         }

         if (skip)
            continue;
      }

      asahi_fill_sync(&waits[wait_count++], submit->waits[i].sync,
                      submit->waits[i].wait_value);
   }

   for (unsigned i = 0; i < submit->signal_count; ++i) {
      asahi_fill_sync(&signals[i], submit->signals[i].sync,
                      submit->signals[i].signal_value);
   }

   /* Signal progress on the queue itself */
   signals[submit->signal_count] = (struct drm_asahi_sync){
      .sync_type = DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ,
      .handle = queue->drm.syncobj,
      .timeline_value = ++queue->drm.timeline_value,
   };

   /* Now set up the command structs */
   struct drm_asahi_command *cmds = alloca(sizeof(*cmds) * command_count);
   union drm_asahi_cmd *cmds_inner =
      alloca(sizeof(*cmds_inner) * command_count);
   union drm_asahi_user_timestamps *ts_inner =
      alloca(sizeof(*ts_inner) * command_count);

   unsigned cmd_it = 0;
   unsigned nr_vdm = 0, nr_cdm = 0;

   for (unsigned i = 0; i < submit->command_buffer_count; ++i) {
      struct hk_cmd_buffer *cmdbuf =
         (struct hk_cmd_buffer *)submit->command_buffers[i];

      list_for_each_entry(struct hk_cs, cs, &cmdbuf->control_streams, node) {
         assert(cmd_it < command_count);

         struct drm_asahi_command cmd = {
            .cmd_buffer = (uint64_t)(uintptr_t)&cmds_inner[cmd_it],
            .result_offset = 0 /* TODO */,
            .result_size = 0 /* TODO */,
            /* Barrier on previous command */
            .barriers = {nr_vdm, nr_cdm},
         };
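
         /* With barriers = {nr_vdm, nr_cdm}, this command waits for every
          * VDM and CDM command recorded before it in this submit, so the
          * control streams execute in submission order.
          */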

         if (cs->type == HK_CS_CDM) {
            perf_debug(
               dev,
               "%u: Submitting CDM with %u API calls, %u dispatches, %u flushes",
               i, cs->stats.calls, cs->stats.cmds, cs->stats.flushes);

            assert(cs->stats.cmds > 0 || cs->stats.flushes > 0 ||
                   cs->timestamp.end.handle);

            cmd.cmd_type = DRM_ASAHI_CMD_COMPUTE;
            cmd.cmd_buffer_size = sizeof(struct drm_asahi_cmd_compute);
            nr_cdm++;

            asahi_fill_cdm_command(dev, cs, &cmds_inner[cmd_it].compute,
                                   &ts_inner[cmd_it].compute);

            /* Workaround for shipping 6.11.8 kernels; remove when we bump the
             * UAPI.
             */
            if (!agx_supports_timestamps(&dev->dev))
               cmd.cmd_buffer_size -= 8;
         } else {
            assert(cs->type == HK_CS_VDM);
            perf_debug(dev, "%u: Submitting VDM with %u API draws, %u draws", i,
                       cs->stats.calls, cs->stats.cmds);
            assert(cs->stats.cmds > 0 || cs->cr.process_empty_tiles ||
                   cs->timestamp.end.handle);

            cmd.cmd_type = DRM_ASAHI_CMD_RENDER;
            cmd.cmd_buffer_size = sizeof(struct drm_asahi_cmd_render);
            nr_vdm++;

            asahi_fill_vdm_command(dev, cs, &cmds_inner[cmd_it].render,
                                   &ts_inner[cmd_it].render);
         }

         cmds[cmd_it++] = cmd;
      }
   }

   assert(cmd_it == command_count);

   if (dev->dev.debug & AGX_DBG_TRACE) {
      for (unsigned i = 0; i < command_count; ++i) {
         if (cmds[i].cmd_type == DRM_ASAHI_CMD_COMPUTE) {
            agxdecode_drm_cmd_compute(dev->dev.agxdecode, &dev->dev.params,
                                      &cmds_inner[i].compute, true);
         } else {
            assert(cmds[i].cmd_type == DRM_ASAHI_CMD_RENDER);
            agxdecode_drm_cmd_render(dev->dev.agxdecode, &dev->dev.params,
                                     &cmds_inner[i].render, true);
         }
      }

      agxdecode_image_heap(dev->dev.agxdecode, dev->images.bo->va->addr,
                           dev->images.alloc);

      agxdecode_next_frame();
   }

   struct drm_asahi_submit submit_ioctl = {
      .flags = 0,
      .queue_id = queue->drm.id,
      .result_handle = 0 /* TODO */,
      .in_sync_count = wait_count,
      .out_sync_count = submit->signal_count + 1,
      .command_count = command_count,
      .in_syncs = (uint64_t)(uintptr_t)(waits),
      .out_syncs = (uint64_t)(uintptr_t)(signals),
      .commands = (uint64_t)(uintptr_t)(cmds),
   };

   if (command_count <= max_commands_per_submit(dev))
      return queue_submit_single(dev, &submit_ioctl);
   else
      return queue_submit_looped(dev, &submit_ioctl);
}

static VkResult
hk_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit)
{
   struct hk_queue *queue = container_of(vk_queue, struct hk_queue, vk);
   struct hk_device *dev = hk_queue_device(queue);

   if (vk_queue_is_lost(&queue->vk))
      return VK_ERROR_DEVICE_LOST;

   VkResult result = queue_submit(dev, queue, submit);
   if (result != VK_SUCCESS)
      result = vk_queue_set_lost(&queue->vk, "Submit failed");

   if (dev->dev.debug & AGX_DBG_SYNC) {
      /* Wait for completion */
      int err = drmSyncobjTimelineWait(
         dev->dev.fd, &queue->drm.syncobj, &queue->drm.timeline_value, 1,
         INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, NULL);

      if (err) {
         result = vk_queue_set_lost(&queue->vk, "Wait failed");
      } else {
         VkResult res = dev->vk.check_status(&dev->vk);
         if (result == VK_SUCCESS)
            result = res;
      }
   }

   return result;
}

static uint32_t
translate_priority(VkQueueGlobalPriorityKHR prio)
{
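   /* The kernel scheduler treats lower values as higher priority, hence the
    * inverted mapping below.
    */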
   /* clang-format off */
   switch (prio) {
   case VK_QUEUE_GLOBAL_PRIORITY_REALTIME_KHR: return 0;
   case VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR:     return 1;
   case VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR:   return 2;
   case VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR:      return 3;
   default: unreachable("Invalid VkQueueGlobalPriorityKHR");
   }
   /* clang-format on */
}

VkResult
hk_queue_init(struct hk_device *dev, struct hk_queue *queue,
              const VkDeviceQueueCreateInfo *pCreateInfo,
              uint32_t index_in_family)
{
   struct hk_physical_device *pdev = hk_device_physical(dev);
   VkResult result;

   assert(pCreateInfo->queueFamilyIndex < pdev->queue_family_count);

   const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info =
      vk_find_struct_const(pCreateInfo->pNext,
                           DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
   const VkQueueGlobalPriorityKHR priority =
      priority_info ? priority_info->globalPriority
                    : VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR;

   result = vk_queue_init(&queue->vk, &dev->vk, pCreateInfo, index_in_family);
   if (result != VK_SUCCESS)
      return result;

   queue->vk.driver_submit = hk_queue_submit;

   queue->drm.id = agx_create_command_queue(&dev->dev,
                                            DRM_ASAHI_QUEUE_CAP_RENDER |
                                               DRM_ASAHI_QUEUE_CAP_BLIT |
                                               DRM_ASAHI_QUEUE_CAP_COMPUTE,
                                            translate_priority(priority));

   if (drmSyncobjCreate(dev->dev.fd, 0, &queue->drm.syncobj)) {
      mesa_loge("drmSyncobjCreate() failed %d\n", errno);
      agx_destroy_command_queue(&dev->dev, queue->drm.id);
      vk_queue_finish(&queue->vk);

      return vk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY,
                       "DRM_IOCTL_SYNCOBJ_CREATE failed: %m");
   }

   uint64_t initial_value = 1;
   if (drmSyncobjTimelineSignal(dev->dev.fd, &queue->drm.syncobj,
                                &initial_value, 1)) {
      hk_queue_finish(dev, queue);
      return vk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY,
                       "DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL failed: %m");
   }

   return VK_SUCCESS;
}

void
hk_queue_finish(struct hk_device *dev, struct hk_queue *queue)
{
   drmSyncobjDestroy(dev->dev.fd, queue->drm.syncobj);
   agx_destroy_command_queue(&dev->dev, queue->drm.id);
   vk_queue_finish(&queue->vk);
}