/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "hk_queue.h"

#include "agx_bg_eot.h"
#include "agx_bo.h"
#include "agx_device.h"
#include "agx_pack.h"
#include "decode.h"
#include "hk_cmd_buffer.h"
#include "hk_device.h"
#include "hk_physical_device.h"

#include <xf86drm.h>
#include "asahi/lib/unstable_asahi_drm.h"
#include "util/list.h"
#include "vulkan/vulkan_core.h"

#include "vk_drm_syncobj.h"
#include "vk_sync.h"

/*
 * We need to specially handle submits with no control streams. The kernel
 * can't accept empty submits, but we can end up here in Vulkan for
 * synchronization purposes only. Rather than submit a no-op job (slow),
 * we simply tie the fences together.
 */
static VkResult
queue_submit_empty(struct hk_device *dev, struct hk_queue *queue,
                   struct vk_queue_submit *submit)
{
   int fd = dev->dev.fd;

   /* Transfer the waits into the queue timeline. */
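   /* Each transfer materializes a wait as a new point on the queue's timeline
    * syncobj; that point signals once the corresponding wait fence signals.
    */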
   for (unsigned i = 0; i < submit->wait_count; ++i) {
      struct vk_sync_wait *wait = &submit->waits[i];

      assert(vk_sync_type_is_drm_syncobj(wait->sync->type));
      const struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(wait->sync);

      drmSyncobjTransfer(fd, queue->drm.syncobj, ++queue->drm.timeline_value,
                         syncobj->syncobj, wait->wait_value, 0);
   }

   /* Transfer the queue timeline into each out fence. They will all be
    * signalled when we reach this point.
    */
   for (unsigned i = 0; i < submit->signal_count; ++i) {
      struct vk_sync_signal *signal = &submit->signals[i];

      assert(vk_sync_type_is_drm_syncobj(signal->sync->type));
      const struct vk_drm_syncobj *syncobj =
         vk_sync_as_drm_syncobj(signal->sync);

      drmSyncobjTransfer(fd, syncobj->syncobj, signal->signal_value,
                         queue->drm.syncobj, queue->drm.timeline_value, 0);
   }

   return VK_SUCCESS;
}

static void
asahi_fill_cdm_command(struct hk_device *dev, struct hk_cs *cs,
                       struct drm_asahi_cmd_compute *cmd,
                       struct drm_asahi_cmd_compute_user_timestamps *timestamps)
{
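   /* For linked (chained) control streams the total length isn't known here,
    * so a fixed upper bound is used for encoder_end (hence the XXX).
    */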
   size_t len = cs->stream_linked ? 65536 /* XXX */ : (cs->current - cs->start);

   *cmd = (struct drm_asahi_cmd_compute){
      .encoder_ptr = cs->addr,
      .encoder_end = cs->addr + len,

      .sampler_array = dev->samplers.table.bo->va->addr,
      .sampler_count = dev->samplers.table.alloc,
      .sampler_max = dev->samplers.table.alloc + 1,

      .usc_base = dev->dev.shader_base,

      .encoder_id = agx_get_global_id(&dev->dev),
      .cmd_id = agx_get_global_id(&dev->dev),
      .unk_mask = 0xffffffff,
   };

   if (cs->timestamp.end.handle) {
      assert(agx_supports_timestamps(&dev->dev));

      *timestamps = (struct drm_asahi_cmd_compute_user_timestamps){
         .type = ASAHI_COMPUTE_EXT_TIMESTAMPS,
         .end_handle = cs->timestamp.end.handle,
         .end_offset = cs->timestamp.end.offset_B,
      };

      cmd->extensions = (uint64_t)(uintptr_t)timestamps;
   }

   if (cs->scratch.cs.main || cs->scratch.cs.preamble) {
      cmd->helper_arg = dev->scratch.cs.buf->va->addr;
      cmd->helper_cfg = cs->scratch.cs.preamble ? (1 << 16) : 0;
      cmd->helper_program = agx_helper_program(&dev->bg_eot);
   }
}

static void
asahi_fill_vdm_command(struct hk_device *dev, struct hk_cs *cs,
                       struct drm_asahi_cmd_render *c,
                       struct drm_asahi_cmd_render_user_timestamps *timestamps)
{
   unsigned cmd_ta_id = agx_get_global_id(&dev->dev);
   unsigned cmd_3d_id = agx_get_global_id(&dev->dev);
   unsigned encoder_id = agx_get_global_id(&dev->dev);

   memset(c, 0, sizeof(*c));

   c->encoder_ptr = cs->addr;
   c->encoder_id = encoder_id;
   c->cmd_3d_id = cmd_3d_id;
   c->cmd_ta_id = cmd_ta_id;
   c->ppp_ctrl = 0x202;

   c->fragment_usc_base = dev->dev.shader_base;
   c->vertex_usc_base = c->fragment_usc_base;

   c->fb_width = cs->cr.width;
   c->fb_height = cs->cr.height;

   c->isp_bgobjdepth = cs->cr.isp_bgobjdepth;
   c->isp_bgobjvals = cs->cr.isp_bgobjvals;

   static_assert(sizeof(c->zls_ctrl) == sizeof(cs->cr.zls_control));
   memcpy(&c->zls_ctrl, &cs->cr.zls_control, sizeof(cs->cr.zls_control));

   c->depth_dimensions =
      (cs->cr.zls_width - 1) | ((cs->cr.zls_height - 1) << 15);
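
   /* The same depth/stencil buffers are used for the load, store, and
    * partial-render paths (partial renders happen when a pass has to be
    * flushed mid-frame and later resumed).
    */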

   c->depth_buffer_load = cs->cr.depth.buffer;
   c->depth_buffer_store = cs->cr.depth.buffer;
   c->depth_buffer_partial = cs->cr.depth.buffer;

   c->depth_buffer_load_stride = cs->cr.depth.stride;
   c->depth_buffer_store_stride = cs->cr.depth.stride;
   c->depth_buffer_partial_stride = cs->cr.depth.stride;

   c->depth_meta_buffer_load = cs->cr.depth.meta;
   c->depth_meta_buffer_store = cs->cr.depth.meta;
   c->depth_meta_buffer_partial = cs->cr.depth.meta;

   c->depth_meta_buffer_load_stride = cs->cr.depth.stride;
   c->depth_meta_buffer_store_stride = cs->cr.depth.meta_stride;
   c->depth_meta_buffer_partial_stride = cs->cr.depth.meta_stride;

   c->stencil_buffer_load = cs->cr.stencil.buffer;
   c->stencil_buffer_store = cs->cr.stencil.buffer;
   c->stencil_buffer_partial = cs->cr.stencil.buffer;

   c->stencil_buffer_load_stride = cs->cr.stencil.stride;
   c->stencil_buffer_store_stride = cs->cr.stencil.stride;
   c->stencil_buffer_partial_stride = cs->cr.stencil.stride;

   c->stencil_meta_buffer_load = cs->cr.stencil.meta;
   c->stencil_meta_buffer_store = cs->cr.stencil.meta;
   c->stencil_meta_buffer_partial = cs->cr.stencil.meta;

   c->stencil_meta_buffer_load_stride = cs->cr.stencil.stride;
   c->stencil_meta_buffer_store_stride = cs->cr.stencil.meta_stride;
   c->stencil_meta_buffer_partial_stride = cs->cr.stencil.meta_stride;

   c->iogpu_unk_214 = cs->cr.iogpu_unk_214;

   if (cs->cr.dbias_is_int == U_TRISTATE_YES) {
      c->iogpu_unk_214 |= 0x40000;
   }

   if (dev->dev.debug & AGX_DBG_NOCLUSTER) {
      c->flags |= ASAHI_RENDER_NO_VERTEX_CLUSTERING;
   } else {
      /* XXX: We don't know what this does exactly, and the name is
       * surely wrong. But it fixes dEQP-VK.memory.pipeline_barrier.* tests on
       * G14C when clustering is enabled...
       */
      c->flags |= ASAHI_RENDER_NO_CLEAR_PIPELINE_TEXTURES;
   }

#if 0
   /* XXX is this for just MSAA+Z+S or MSAA+(Z|S)? */
   if (tib->nr_samples > 1 && framebuffer->zsbuf)
      c->flags |= ASAHI_RENDER_MSAA_ZS;
#endif

   c->utile_width = cs->tib.tile_size.width;
   c->utile_height = cs->tib.tile_size.height;

   /* Can be 0 for attachmentless rendering with no draws */
   c->samples = MAX2(cs->tib.nr_samples, 1);
   c->layers = cs->cr.layers;
   /* Drawing at the maximum size will OOM and fail submission. But
    * vkd3d-proton does this to emulate no-attachment rendering. Clamp to
    * something reasonable and hope it is good enough in practice. This only
    * affects a case that would otherwise be guaranteed broken.
    *
    * XXX: Hack for vkd3d-proton.
    */
   if (c->layers == 2048 && c->fb_width == 16384 && c->fb_height == 16384) {
      mesa_log(MESA_LOG_WARN, MESA_LOG_TAG, "Clamping massive framebuffer");
      c->layers = 32;
   }

   c->ppp_multisamplectl = cs->ppp_multisamplectl;
   c->sample_size = cs->tib.sample_size_B;
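   /* Tilebuffer allocation expressed in 2048-byte blocks. */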
   c->tib_blocks = ALIGN_POT(agx_tilebuffer_total_size(&cs->tib), 2048) / 2048;

   float tan_60 = 1.732051f;
   c->merge_upper_x = fui(tan_60 / cs->cr.width);
   c->merge_upper_y = fui(tan_60 / cs->cr.height);

   c->load_pipeline = cs->cr.bg.main.usc | 4;
   c->store_pipeline = cs->cr.eot.main.usc | 4;
   c->partial_reload_pipeline = cs->cr.bg.partial.usc | 4;
   c->partial_store_pipeline = cs->cr.eot.partial.usc | 4;

   memcpy(&c->load_pipeline_bind, &cs->cr.bg.main.counts,
          sizeof(struct agx_counts_packed));

   memcpy(&c->store_pipeline_bind, &cs->cr.eot.main.counts,
          sizeof(struct agx_counts_packed));

   memcpy(&c->partial_reload_pipeline_bind, &cs->cr.bg.partial.counts,
          sizeof(struct agx_counts_packed));

   memcpy(&c->partial_store_pipeline_bind, &cs->cr.eot.partial.counts,
          sizeof(struct agx_counts_packed));

   c->scissor_array = cs->uploaded_scissor;
   c->depth_bias_array = cs->uploaded_zbias;

   c->vertex_sampler_array = dev->samplers.table.bo->va->addr;
   c->vertex_sampler_count = dev->samplers.table.alloc;
   c->vertex_sampler_max = dev->samplers.table.alloc + 1;

   c->fragment_sampler_array = c->vertex_sampler_array;
   c->fragment_sampler_count = c->vertex_sampler_count;
   c->fragment_sampler_max = c->vertex_sampler_max;

   c->visibility_result_buffer = dev->occlusion_queries.bo->va->addr;

   if (cs->cr.process_empty_tiles)
      c->flags |= ASAHI_RENDER_PROCESS_EMPTY_TILES;

   if (cs->scratch.vs.main || cs->scratch.vs.preamble) {
      c->flags |= ASAHI_RENDER_VERTEX_SPILLS;
      c->vertex_helper_arg = dev->scratch.vs.buf->va->addr;
      c->vertex_helper_cfg = cs->scratch.vs.preamble ? (1 << 16) : 0;
      c->vertex_helper_program = agx_helper_program(&dev->bg_eot);
   }

   if (cs->scratch.fs.main || cs->scratch.fs.preamble) {
      c->fragment_helper_arg = dev->scratch.fs.buf->va->addr;
      c->fragment_helper_cfg = cs->scratch.fs.preamble ? (1 << 16) : 0;
      c->fragment_helper_program = agx_helper_program(&dev->bg_eot);
   }

   if (cs->timestamp.end.handle) {
      assert(agx_supports_timestamps(&dev->dev));

      c->extensions = (uint64_t)(uintptr_t)timestamps;

      *timestamps = (struct drm_asahi_cmd_render_user_timestamps){
         .type = ASAHI_RENDER_EXT_TIMESTAMPS,
         .frg_end_handle = cs->timestamp.end.handle,
         .frg_end_offset = cs->timestamp.end.offset_B,
      };
   }
}

static void
asahi_fill_sync(struct drm_asahi_sync *sync, struct vk_sync *vk_sync,
                uint64_t value)
{
   if (unlikely(!vk_sync_type_is_drm_syncobj(vk_sync->type))) {
      unreachable("Unsupported sync type");
      return;
   }

   const struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(vk_sync);
   *sync = (struct drm_asahi_sync){.handle = syncobj->syncobj};

   if (vk_sync->flags & VK_SYNC_IS_TIMELINE) {
      sync->sync_type = DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ;
      sync->timeline_value = value;
   } else {
      sync->sync_type = DRM_ASAHI_SYNC_SYNCOBJ;
   }
}

union drm_asahi_cmd {
   struct drm_asahi_cmd_compute compute;
   struct drm_asahi_cmd_render render;
};

union drm_asahi_user_timestamps {
   struct drm_asahi_cmd_compute_user_timestamps compute;
   struct drm_asahi_cmd_render_user_timestamps render;
};

/* XXX: Batching multiple commands per submission is causing rare (7ppm) flakes
 * on the CTS once lossless compression is enabled. This needs to be
 * investigated before we can reenable this mechanism. We are likely missing a
 * cache flush or barrier somewhere.
 */
static inline unsigned
max_commands_per_submit(struct hk_device *dev)
{
   return HK_PERF(dev, BATCH) ? 64 : 1;
}

static VkResult
queue_submit_single(struct hk_device *dev, struct drm_asahi_submit *submit)
{
   /* Currently we don't use the result buffer or implicit sync */
   struct agx_submit_virt virt = {
      .vbo_res_id = 0,
      .extres_count = 0,
   };

   if (dev->dev.is_virtio) {
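      /* Attach the current list of externally shared BOs to the submission;
       * hold the read lock across the ioctl so the list cannot change
       * underneath us.
       */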
      u_rwlock_rdlock(&dev->external_bos.lock);
      virt.extres_count = util_dynarray_num_elements(
         &dev->external_bos.list, struct asahi_ccmd_submit_res);
      virt.extres = util_dynarray_begin(&dev->external_bos.list);
   }

   int ret = dev->dev.ops.submit(&dev->dev, submit, &virt);

   if (dev->dev.is_virtio)
      u_rwlock_rdunlock(&dev->external_bos.lock);

   /* XXX: don't trap */
   if (ret) {
      fprintf(stderr, "DRM_IOCTL_ASAHI_SUBMIT failed: %m\n");
      assert(0);
   }

   return VK_SUCCESS;
}

/*
 * The kernel/firmware jointly impose a limit on commands per submit ioctl, but
 * we can build up arbitrarily large command buffers. We handle this here by
 * looping the ioctl, submitting slices of the command buffers that are within
 * bounds.
 */
static VkResult
queue_submit_looped(struct hk_device *dev, struct drm_asahi_submit *submit)
{
   struct drm_asahi_command *cmds = (void *)(uintptr_t)submit->commands;
   unsigned commands_remaining = submit->command_count;
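   /* Track how many commands have already been handed to each subqueue so
    * that the ioctl-relative barriers can be rebased for every slice.
    */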
   unsigned submitted[DRM_ASAHI_SUBQUEUE_COUNT] = {0};

   while (commands_remaining) {
      bool first = commands_remaining == submit->command_count;
      bool last = commands_remaining <= max_commands_per_submit(dev);

      unsigned count = MIN2(commands_remaining, max_commands_per_submit(dev));
      commands_remaining -= count;

      assert(!last || commands_remaining == 0);
      assert(count > 0);

      /* We need to fix up the barriers since barriers are ioctl-relative */
      for (unsigned i = 0; i < count; ++i) {
         for (unsigned q = 0; q < DRM_ASAHI_SUBQUEUE_COUNT; ++q) {
            if (cmds[i].barriers[q] != DRM_ASAHI_BARRIER_NONE) {
               assert(cmds[i].barriers[q] >= submitted[q]);
               cmds[i].barriers[q] -= submitted[q];
            }
         }
      }

      /* We can't signal the out-syncobjs until all prior work finishes. Since
       * only the last ioctl will signal, make sure it waits on prior ioctls.
       *
       * TODO: there might be a more performant way to do this.
       */
      if (last && !first) {
         for (unsigned q = 0; q < DRM_ASAHI_SUBQUEUE_COUNT; ++q) {
            if (cmds[0].barriers[q] == DRM_ASAHI_BARRIER_NONE)
               cmds[0].barriers[q] = 0;
         }
      }

      struct drm_asahi_submit submit_ioctl = {
         .flags = submit->flags,
         .queue_id = submit->queue_id,
         .result_handle = submit->result_handle,
         .commands = (uint64_t)(uintptr_t)(cmds),
         .command_count = count,
         .in_syncs = first ? submit->in_syncs : 0,
         .in_sync_count = first ? submit->in_sync_count : 0,
         .out_syncs = last ? submit->out_syncs : 0,
         .out_sync_count = last ? submit->out_sync_count : 0,
      };

      VkResult result = queue_submit_single(dev, &submit_ioctl);
      if (result != VK_SUCCESS)
         return result;

      for (unsigned i = 0; i < count; ++i) {
         if (cmds[i].cmd_type == DRM_ASAHI_CMD_COMPUTE)
            submitted[DRM_ASAHI_SUBQUEUE_COMPUTE]++;
         else if (cmds[i].cmd_type == DRM_ASAHI_CMD_RENDER)
            submitted[DRM_ASAHI_SUBQUEUE_RENDER]++;
         else
            unreachable("unknown subqueue");
      }

      cmds += count;
   }

   return VK_SUCCESS;
}

static VkResult
queue_submit(struct hk_device *dev, struct hk_queue *queue,
             struct vk_queue_submit *submit)
{
   unsigned command_count = 0;

   /* Gather the number of individual commands to submit up front */
   for (unsigned i = 0; i < submit->command_buffer_count; ++i) {
      struct hk_cmd_buffer *cmdbuf =
         (struct hk_cmd_buffer *)submit->command_buffers[i];

      command_count += list_length(&cmdbuf->control_streams);
   }

   perf_debug(dev, "Submitting %u control streams (%u command buffers)",
              command_count, submit->command_buffer_count);

   if (command_count == 0)
      return queue_submit_empty(dev, queue, submit);

   unsigned wait_count = 0;
   struct drm_asahi_sync *waits =
      alloca(submit->wait_count * sizeof(struct drm_asahi_sync));

   struct drm_asahi_sync *signals =
      alloca((submit->signal_count + 1) * sizeof(struct drm_asahi_sync));

   for (unsigned i = 0; i < submit->wait_count; ++i) {
      /* The kernel rejects the submission if we try to wait on the same
       * timeline semaphore at multiple points.
       *
       * TODO: Can we relax the UAPI?
       *
       * XXX: This is quadratic time.
       */
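      /* Keep only the highest wait value per timeline syncobj (the last
       * occurrence wins on ties) and drop the rest.
       */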
      bool skip = false;
      if (submit->waits[i].sync->flags & VK_SYNC_IS_TIMELINE) {
         uint32_t v1 = submit->waits[i].wait_value;
         for (unsigned j = 0; j < submit->wait_count; ++j) {
            uint32_t v2 = submit->waits[j].wait_value;
            if (i != j && submit->waits[i].sync == submit->waits[j].sync &&
                (v1 < v2 || (v1 == v2 && i < j))) {
               skip = true;
               break;
            }
         }

         if (skip)
            continue;
      }

      asahi_fill_sync(&waits[wait_count++], submit->waits[i].sync,
                      submit->waits[i].wait_value);
   }

   for (unsigned i = 0; i < submit->signal_count; ++i) {
      asahi_fill_sync(&signals[i], submit->signals[i].sync,
                      submit->signals[i].signal_value);
   }

   /* Signal progress on the queue itself */
   signals[submit->signal_count] = (struct drm_asahi_sync){
      .sync_type = DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ,
      .handle = queue->drm.syncobj,
      .timeline_value = ++queue->drm.timeline_value,
   };

   /* Now set up the command structs */
   struct drm_asahi_command *cmds = alloca(sizeof(*cmds) * command_count);
   union drm_asahi_cmd *cmds_inner =
      alloca(sizeof(*cmds_inner) * command_count);
   union drm_asahi_user_timestamps *ts_inner =
      alloca(sizeof(*ts_inner) * command_count);

   unsigned cmd_it = 0;
   unsigned nr_vdm = 0, nr_cdm = 0;

   for (unsigned i = 0; i < submit->command_buffer_count; ++i) {
      struct hk_cmd_buffer *cmdbuf =
         (struct hk_cmd_buffer *)submit->command_buffers[i];

      list_for_each_entry(struct hk_cs, cs, &cmdbuf->control_streams, node) {
         assert(cmd_it < command_count);

         struct drm_asahi_command cmd = {
            .cmd_buffer = (uint64_t)(uintptr_t)&cmds_inner[cmd_it],
            .result_offset = 0 /* TODO */,
            .result_size = 0 /* TODO */,
            /* Barrier on previous command */
            .barriers = {nr_vdm, nr_cdm},
         };

         if (cs->type == HK_CS_CDM) {
            perf_debug(
               dev,
               "%u: Submitting CDM with %u API calls, %u dispatches, %u flushes",
               i, cs->stats.calls, cs->stats.cmds, cs->stats.flushes);

            assert(cs->stats.cmds > 0 || cs->stats.flushes > 0 ||
                   cs->timestamp.end.handle);

            cmd.cmd_type = DRM_ASAHI_CMD_COMPUTE;
            cmd.cmd_buffer_size = sizeof(struct drm_asahi_cmd_compute);
            nr_cdm++;

            asahi_fill_cdm_command(dev, cs, &cmds_inner[cmd_it].compute,
                                   &ts_inner[cmd_it].compute);

            /* Workaround for shipping 6.11.8 kernels; remove when we bump the
             * UAPI.
             */
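            /* (Presumably the 8 bytes are the trailing extensions pointer,
             * which the pre-timestamp UAPI struct lacks.)
             */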
            if (!agx_supports_timestamps(&dev->dev))
               cmd.cmd_buffer_size -= 8;
         } else {
            assert(cs->type == HK_CS_VDM);
            perf_debug(dev, "%u: Submitting VDM with %u API draws, %u draws", i,
                       cs->stats.calls, cs->stats.cmds);
            assert(cs->stats.cmds > 0 || cs->cr.process_empty_tiles ||
                   cs->timestamp.end.handle);

            cmd.cmd_type = DRM_ASAHI_CMD_RENDER;
            cmd.cmd_buffer_size = sizeof(struct drm_asahi_cmd_render);
            nr_vdm++;

            asahi_fill_vdm_command(dev, cs, &cmds_inner[cmd_it].render,
                                   &ts_inner[cmd_it].render);
         }

         cmds[cmd_it++] = cmd;
      }
   }

   assert(cmd_it == command_count);

   if (dev->dev.debug & AGX_DBG_TRACE) {
      for (unsigned i = 0; i < command_count; ++i) {
         if (cmds[i].cmd_type == DRM_ASAHI_CMD_COMPUTE) {
            agxdecode_drm_cmd_compute(dev->dev.agxdecode, &dev->dev.params,
                                      &cmds_inner[i].compute, true);
         } else {
            assert(cmds[i].cmd_type == DRM_ASAHI_CMD_RENDER);
            agxdecode_drm_cmd_render(dev->dev.agxdecode, &dev->dev.params,
                                     &cmds_inner[i].render, true);
         }
      }

      agxdecode_image_heap(dev->dev.agxdecode, dev->images.bo->va->addr,
                           dev->images.alloc);

      agxdecode_next_frame();
   }

   struct drm_asahi_submit submit_ioctl = {
      .flags = 0,
      .queue_id = queue->drm.id,
      .result_handle = 0 /* TODO */,
      .in_sync_count = wait_count,
      .out_sync_count = submit->signal_count + 1,
      .command_count = command_count,
      .in_syncs = (uint64_t)(uintptr_t)(waits),
      .out_syncs = (uint64_t)(uintptr_t)(signals),
      .commands = (uint64_t)(uintptr_t)(cmds),
   };

   if (command_count <= max_commands_per_submit(dev))
      return queue_submit_single(dev, &submit_ioctl);
   else
      return queue_submit_looped(dev, &submit_ioctl);
}

static VkResult
hk_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit)
{
   struct hk_queue *queue = container_of(vk_queue, struct hk_queue, vk);
   struct hk_device *dev = hk_queue_device(queue);

   if (vk_queue_is_lost(&queue->vk))
      return VK_ERROR_DEVICE_LOST;

   VkResult result = queue_submit(dev, queue, submit);
   if (result != VK_SUCCESS)
      result = vk_queue_set_lost(&queue->vk, "Submit failed");

   if (dev->dev.debug & AGX_DBG_SYNC) {
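      /* With AGX_DBG_SYNC every submit is synchronous, which makes it easier
       * to attribute faults and hangs to the offending submission.
       */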
      /* Wait for completion */
      int err = drmSyncobjTimelineWait(
         dev->dev.fd, &queue->drm.syncobj, &queue->drm.timeline_value, 1,
         INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, NULL);

      if (err) {
         result = vk_queue_set_lost(&queue->vk, "Wait failed");
      } else {
         VkResult res = dev->vk.check_status(&dev->vk);
         if (result == VK_SUCCESS)
            result = res;
      }
   }

   return result;
}

static uint32_t
translate_priority(VkQueueGlobalPriorityKHR prio)
{
   /* clang-format off */
   switch (prio) {
   case VK_QUEUE_GLOBAL_PRIORITY_REALTIME_KHR: return 0;
   case VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR:     return 1;
   case VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR:   return 2;
   case VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR:      return 3;
   default: unreachable("Invalid VkQueueGlobalPriorityKHR");
   }
   /* clang-format on */
}

VkResult
hk_queue_init(struct hk_device *dev, struct hk_queue *queue,
              const VkDeviceQueueCreateInfo *pCreateInfo,
              uint32_t index_in_family)
{
   struct hk_physical_device *pdev = hk_device_physical(dev);
   VkResult result;

   assert(pCreateInfo->queueFamilyIndex < pdev->queue_family_count);

   const VkDeviceQueueGlobalPriorityCreateInfoKHR *priority_info =
      vk_find_struct_const(pCreateInfo->pNext,
                           DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
   const VkQueueGlobalPriorityKHR priority =
      priority_info ? priority_info->globalPriority
                    : VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR;

   result = vk_queue_init(&queue->vk, &dev->vk, pCreateInfo, index_in_family);
   if (result != VK_SUCCESS)
      return result;

   queue->vk.driver_submit = hk_queue_submit;

   queue->drm.id = agx_create_command_queue(&dev->dev,
                                            DRM_ASAHI_QUEUE_CAP_RENDER |
                                               DRM_ASAHI_QUEUE_CAP_BLIT |
                                               DRM_ASAHI_QUEUE_CAP_COMPUTE,
                                            translate_priority(priority));

   if (drmSyncobjCreate(dev->dev.fd, 0, &queue->drm.syncobj)) {
      mesa_loge("drmSyncobjCreate() failed %d\n", errno);
      agx_destroy_command_queue(&dev->dev, queue->drm.id);
      vk_queue_finish(&queue->vk);

      return vk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY,
                       "DRM_IOCTL_SYNCOBJ_CREATE failed: %m");
   }

   uint64_t initial_value = 1;
   if (drmSyncobjTimelineSignal(dev->dev.fd, &queue->drm.syncobj,
                                &initial_value, 1)) {
      hk_queue_finish(dev, queue);
      return vk_errorf(dev, VK_ERROR_OUT_OF_HOST_MEMORY,
                       "DRM_IOCTL_TIMELINE_SYNCOBJ_SIGNAL failed: %m");
   }

   return VK_SUCCESS;
}

void
hk_queue_finish(struct hk_device *dev, struct hk_queue *queue)
{
   drmSyncobjDestroy(dev->dev.fd, queue->drm.syncobj);
   agx_destroy_command_queue(&dev->dev, queue->drm.id);
   vk_queue_finish(&queue->vk);
}