/*
 * Copyright 2022 Alyssa Rosenzweig
 * Copyright 2019-2020 Collabora, Ltd.
 * SPDX-License-Identifier: MIT
 */

#include <xf86drm.h>
#include "asahi/lib/agx_device_virtio.h"
#include "asahi/lib/decode.h"
#include "util/bitset.h"
#include "util/u_dynarray.h"
#include "util/u_range.h"
#include "agx_state.h"
#include "vdrm.h"

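/* Each context owns a fixed pool of AGX_MAX_BATCHES batch slots. A slot is
 * "active" while it is being recorded and "submitted" once it has been handed
 * to the kernel but not yet observed complete; both states are tracked with
 * per-context bitsets so the iteration macros below can walk them cheaply.
 */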
#define foreach_active(ctx, idx)                                               \
   BITSET_FOREACH_SET(idx, ctx->batches.active, AGX_MAX_BATCHES)

#define foreach_submitted(ctx, idx)                                            \
   BITSET_FOREACH_SET(idx, ctx->batches.submitted, AGX_MAX_BATCHES)

#define batch_debug(batch, fmt, ...)                                           \
   do {                                                                        \
      if (unlikely(agx_device(batch->ctx->base.screen)->debug &                \
                   AGX_DBG_BATCH))                                             \
         agx_msg("[Queue %u Batch %u] " fmt "\n", batch->ctx->queue_id,        \
                 agx_batch_idx(batch), ##__VA_ARGS__);                         \
   } while (0)

bool
agx_batch_is_active(struct agx_batch *batch)
{
   return BITSET_TEST(batch->ctx->batches.active, agx_batch_idx(batch));
}

bool
agx_batch_is_submitted(struct agx_batch *batch)
{
   return BITSET_TEST(batch->ctx->batches.submitted, agx_batch_idx(batch));
}

static void
agx_batch_mark_active(struct agx_batch *batch)
{
   unsigned batch_idx = agx_batch_idx(batch);

   batch_debug(batch, "ACTIVE");

   assert(!BITSET_TEST(batch->ctx->batches.submitted, batch_idx));
   assert(!BITSET_TEST(batch->ctx->batches.active, batch_idx));
   BITSET_SET(batch->ctx->batches.active, batch_idx);
}

static void
agx_batch_mark_submitted(struct agx_batch *batch)
{
   unsigned batch_idx = agx_batch_idx(batch);

   batch_debug(batch, "SUBMIT");

   assert(BITSET_TEST(batch->ctx->batches.active, batch_idx));
   assert(!BITSET_TEST(batch->ctx->batches.submitted, batch_idx));
   BITSET_CLEAR(batch->ctx->batches.active, batch_idx);
   BITSET_SET(batch->ctx->batches.submitted, batch_idx);
}

static void
agx_batch_mark_complete(struct agx_batch *batch)
{
   unsigned batch_idx = agx_batch_idx(batch);

   batch_debug(batch, "COMPLETE");

   assert(!BITSET_TEST(batch->ctx->batches.active, batch_idx));
   assert(BITSET_TEST(batch->ctx->batches.submitted, batch_idx));
   BITSET_CLEAR(batch->ctx->batches.submitted, batch_idx);
}

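/* Allocate a fresh command stream encoder backed by a 512 KiB BO. Callers
 * write control stream words through .current until .end is reached.
 */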
struct agx_encoder
agx_encoder_allocate(struct agx_batch *batch, struct agx_device *dev)
{
   struct agx_bo *bo = agx_bo_create(dev, 0x80000, 0, 0, "Encoder");
   uint8_t *map = agx_bo_map(bo);
   return (struct agx_encoder){.bo = bo, .current = map, .end = map + bo->size};
}

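/* (Re)initialize a batch slot for recording against the given framebuffer
 * key. Persistent per-slot state (the BO list backing store, the syncobj) is
 * allocated lazily on first use and merely reset afterwards.
 */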
static void
agx_batch_init(struct agx_context *ctx,
               const struct pipe_framebuffer_state *key,
               struct agx_batch *batch)
{
   struct agx_device *dev = agx_device(ctx->base.screen);
   struct agx_screen *screen = agx_screen(ctx->base.screen);

   batch->ctx = ctx;
   util_copy_framebuffer_state(&batch->key, key);
   batch->seqnum = ++ctx->batches.seqnum;

   agx_bo_reference(screen->rodata);
   agx_pool_init(&batch->pool, dev, "Batch pool", 0, true);
   agx_pool_init(&batch->pipeline_pool, dev, "Batch low VA pool", AGX_BO_LOW_VA,
                 true);

   /* These allocations happen only once and are zeroed (not freed) during
    * batch cleanup. The memory is owned by the context.
    */
   if (!batch->bo_list.set) {
      batch->bo_list.set = rzalloc_array(ctx, BITSET_WORD, 128);
      batch->bo_list.bit_count = 128 * sizeof(BITSET_WORD) * 8;
   } else {
      memset(batch->bo_list.set, 0, batch->bo_list.bit_count / 8);
   }

   if (agx_batch_is_compute(batch)) {
      batch->cdm = agx_encoder_allocate(batch, dev);
      memset(&batch->vdm, 0, sizeof(batch->vdm));
   } else {
      batch->vdm = agx_encoder_allocate(batch, dev);
      memset(&batch->cdm, 0, sizeof(batch->cdm));
   }

   util_dynarray_init(&batch->scissor, ctx);
   util_dynarray_init(&batch->depth_bias, ctx);
   util_dynarray_init(&batch->timestamps, ctx);

   batch->clear = 0;
   batch->draw = 0;
   batch->load = 0;
   batch->resolve = 0;
   batch->feedback = 0;
   memset(batch->uploaded_clear_color, 0, sizeof(batch->uploaded_clear_color));
   batch->clear_depth = 0;
   batch->clear_stencil = 0;
   batch->varyings = 0;
   batch->geometry_state = 0;
   batch->initialized = false;
   batch->draws = 0;
   batch->incoherent_writes = false;
   agx_bo_unreference(dev, batch->sampler_heap.bo);
   batch->sampler_heap.bo = NULL;
   batch->sampler_heap.count = 0;
   batch->vs_scratch = false;
   batch->fs_scratch = false;
   batch->cs_scratch = false;
   batch->vs_preamble_scratch = 0;
   batch->fs_preamble_scratch = 0;
   batch->cs_preamble_scratch = 0;

   /* May get read before write, need to initialize to 0 to avoid GPU-side UAF
    * conditions.
    */
   batch->uniforms.tables[AGX_SYSVAL_TABLE_PARAMS] = 0;

   /* We need to emit prim state at the start. Max collides with all. */
   batch->reduced_prim = MESA_PRIM_COUNT;

   if (!batch->syncobj) {
      int ret = drmSyncobjCreate(dev->fd, 0, &batch->syncobj);
      assert(!ret && batch->syncobj);
   }

   batch->result_off =
      (2 * sizeof(union agx_batch_result)) * agx_batch_idx(batch);
   batch->result =
      (void *)(((uint8_t *)agx_bo_map(ctx->result_buf)) + batch->result_off);
   memset(batch->result, 0, sizeof(union agx_batch_result) * 2);

   agx_batch_mark_active(batch);
}

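/* Human-readable names for the status/fault codes and fault unit IDs reported
 * back by the kernel in drm_asahi_result_info, used by the result logging
 * below.
 */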
const char *status_str[] = {
   [DRM_ASAHI_STATUS_PENDING] = "(pending)",
   [DRM_ASAHI_STATUS_COMPLETE] = "Complete",
   [DRM_ASAHI_STATUS_UNKNOWN_ERROR] = "UNKNOWN ERROR",
   [DRM_ASAHI_STATUS_TIMEOUT] = "TIMEOUT",
   [DRM_ASAHI_STATUS_FAULT] = "FAULT",
   [DRM_ASAHI_STATUS_KILLED] = "KILLED",
   [DRM_ASAHI_STATUS_NO_DEVICE] = "NO DEVICE",
};

const char *fault_type_str[] = {
   [DRM_ASAHI_FAULT_NONE] = "(none)",
   [DRM_ASAHI_FAULT_UNKNOWN] = "Unknown",
   [DRM_ASAHI_FAULT_UNMAPPED] = "Unmapped",
   [DRM_ASAHI_FAULT_AF_FAULT] = "AF Fault",
   [DRM_ASAHI_FAULT_WRITE_ONLY] = "Write Only",
   [DRM_ASAHI_FAULT_READ_ONLY] = "Read Only",
   [DRM_ASAHI_FAULT_NO_ACCESS] = "No Access",
};

const char *low_unit_str[16] = {
   "DCMP", "UL1C", "CMP", "GSL1", "IAP", "VCE", "TE", "RAS",
   "VDM", "PPP", "IPF", "IPF_CPF", "VF", "VF_CPF", "ZLS", "UNK",
};

const char *mid_unit_str[16] = {
   "UNK",     "dPM",      "dCDM_KS0", "dCDM_KS1", "dCDM_KS2", "dIPP",
   "dIPP_CS", "dVDM_CSD", "dVDM_SSD", "dVDM_ILF", "dVDM_ILD", "dRDE0",
   "dRDE1",   "FC",       "GSL2",     "UNK",
};

const char *high_unit_str[16] = {
   "gPM_SP",         "gVDM_CSD_SP",    "gVDM_SSD_SP",    "gVDM_ILF_SP",
   "gVDM_TFP_SP",    "gVDM_MMB_SP",    "gCDM_CS_KS0_SP", "gCDM_CS_KS1_SP",
   "gCDM_CS_KS2_SP", "gCDM_KS0_SP",    "gCDM_KS1_SP",    "gCDM_KS2_SP",
   "gIPP_SP",        "gIPP_CS_SP",     "gRDE0_SP",       "gRDE1_SP",
};

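/* Log one command's result block. Timing statistics are printed when
 * AGX_DBG_STATS is set; anything other than a clean completion is always
 * reported and decoded, including the faulting unit if the GPU faulted.
 */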
static void
agx_print_result(struct agx_device *dev, struct agx_context *ctx,
                 struct drm_asahi_result_info *info, unsigned batch_idx,
                 bool is_compute)
{
   if (unlikely(info->status != DRM_ASAHI_STATUS_COMPLETE)) {
      ctx->any_faults = true;
   }

   if (likely(info->status == DRM_ASAHI_STATUS_COMPLETE &&
              !((dev)->debug & AGX_DBG_STATS)))
      return;

   if (is_compute) {
      struct drm_asahi_result_compute *r = (void *)info;
      float time =
         (r->ts_end - r->ts_start) / (float)dev->params.timer_frequency_hz;

      mesa_logw(
         "[Batch %d] Compute %s: %.06f\n", batch_idx,
         info->status < ARRAY_SIZE(status_str) ? status_str[info->status] : "?",
         time);
   } else {
      struct drm_asahi_result_render *r = (void *)info;
      float time_vtx = (r->vertex_ts_end - r->vertex_ts_start) /
                       (float)dev->params.timer_frequency_hz;
      float time_frag = (r->fragment_ts_end - r->fragment_ts_start) /
                        (float)dev->params.timer_frequency_hz;
      mesa_logw(
         "[Batch %d] Render %s: TVB %9ld/%9ld bytes (%d ovf) %c%c%c | vtx %.06f frag %.06f\n",
         batch_idx,
         info->status < ARRAY_SIZE(status_str) ? status_str[info->status] : "?",
         (long)r->tvb_usage_bytes, (long)r->tvb_size_bytes,
         (int)r->num_tvb_overflows,
         r->flags & DRM_ASAHI_RESULT_RENDER_TVB_GROW_OVF ? 'G' : ' ',
         r->flags & DRM_ASAHI_RESULT_RENDER_TVB_GROW_MIN ? 'M' : ' ',
         r->flags & DRM_ASAHI_RESULT_RENDER_TVB_OVERFLOWED ? 'O' : ' ',
         time_vtx, time_frag);
   }

   if (info->fault_type != DRM_ASAHI_FAULT_NONE) {
      const char *unit_name;
      int unit_index;

      switch (info->unit) {
      case 0x00 ... 0x9f:
         unit_name = low_unit_str[info->unit & 0xf];
         unit_index = info->unit >> 4;
         break;
      case 0xa0 ... 0xaf:
         unit_name = mid_unit_str[info->unit & 0xf];
         unit_index = 0;
         break;
      case 0xb0 ... 0xb7:
         unit_name = "GL2CC_META";
         unit_index = info->unit & 0x7;
         break;
      case 0xb8:
         unit_name = "GL2CC_MB";
         unit_index = 0;
         break;
      case 0xe0 ... 0xff:
         unit_name = high_unit_str[info->unit & 0xf];
         unit_index = (info->unit >> 4) & 1;
         break;
      default:
         unit_name = "UNK";
         unit_index = 0;
         break;
      }

      mesa_logw(
         "[Batch %d] Fault: %s : Addr 0x%llx %c Unit %02x (%s/%d) SB 0x%02x L%d Extra 0x%x\n",
         batch_idx,
         info->fault_type < ARRAY_SIZE(fault_type_str)
            ? fault_type_str[info->fault_type]
            : "?",
         (long long)info->address, info->is_read ? 'r' : 'W', info->unit,
         unit_name, unit_index, info->sideband, info->level, info->extra);

      agx_debug_fault(dev, info->address);
   }

   assert(info->status == DRM_ASAHI_STATUS_COMPLETE ||
          info->status == DRM_ASAHI_STATUS_KILLED);
}

static void
agx_batch_print_stats(struct agx_device *dev, struct agx_batch *batch)
{
   unsigned batch_idx = agx_batch_idx(batch);

   if (!batch->result)
      return;

   if (batch->cdm.bo) {
      agx_print_result(dev, batch->ctx, &batch->result[0].compute.info,
                       batch_idx, true);
   }

   if (batch->vdm.bo) {
      agx_print_result(dev, batch->ctx, &batch->result[1].render.info,
                       batch_idx, false);
   }
}

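/* Tear down a batch once its work is done: report timestamps to any attached
 * queries, drop writer tracking and BO references, release the per-batch
 * pools, and return the slot to the free pool. With reset=true the batch was
 * empty and never reached the kernel, so it must not be the writer of
 * anything.
 */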
static void
agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset)
{
   struct agx_device *dev = agx_device(ctx->base.screen);
   struct agx_screen *screen = agx_screen(ctx->base.screen);

   assert(batch->ctx == ctx);
   assert(agx_batch_is_submitted(batch));

   assert(ctx->batch != batch);

   uint64_t begin_ts = ~0, end_ts = 0;
   if (batch->result) {
      if (batch->cdm.bo) {
         begin_ts = MIN2(begin_ts, batch->result[0].compute.ts_start);
         end_ts = MAX2(end_ts, batch->result[0].compute.ts_end);
      }

      if (batch->vdm.bo) {
         begin_ts = MIN2(begin_ts, batch->result[1].render.vertex_ts_start);
         end_ts = MAX2(end_ts, batch->result[1].render.fragment_ts_end);
      }
   }

   agx_finish_batch_queries(batch, begin_ts, end_ts);

   if (reset) {
      int handle;
      AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) {
         /* We should write no buffers if this is an empty batch */
         assert(agx_writer_get(ctx, handle) != batch);

         agx_bo_unreference(dev, agx_lookup_bo(dev, handle));
      }
   } else {
      int handle;
      AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) {
         struct agx_bo *bo = agx_lookup_bo(dev, handle);

         /* There is no more writer on this context for anything we wrote */
         struct agx_batch *writer = agx_writer_get(ctx, handle);

         if (writer == batch)
            agx_writer_remove(ctx, handle);

         p_atomic_cmpxchg(&bo->writer,
                          agx_bo_writer(ctx->queue_id, batch->syncobj), 0);

         agx_bo_unreference(dev, agx_lookup_bo(dev, handle));
      }
   }

   agx_bo_unreference(dev, screen->rodata);
   agx_bo_unreference(dev, batch->vdm.bo);
   agx_bo_unreference(dev, batch->cdm.bo);
   agx_pool_cleanup(&batch->pool);
   agx_pool_cleanup(&batch->pipeline_pool);

   util_dynarray_fini(&batch->scissor);
   util_dynarray_fini(&batch->depth_bias);
   util_dynarray_fini(&batch->timestamps);

   if (!(dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC))) {
      agx_batch_print_stats(dev, batch);
   }

   util_unreference_framebuffer_state(&batch->key);
   agx_batch_mark_complete(batch);
}

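/* Reclaim at most one already-finished submitted batch without blocking: a
 * zero-timeout syncobj wait tells us which, if any, has signalled. Returns
 * the freed batch index, or -1 if nothing could be cleaned up.
 */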
int
agx_cleanup_batches(struct agx_context *ctx)
{
   struct agx_device *dev = agx_device(ctx->base.screen);

   unsigned i;
   unsigned count = 0;
   struct agx_batch *batches[AGX_MAX_BATCHES];
   uint32_t syncobjs[AGX_MAX_BATCHES];
   uint32_t first = 0;

   foreach_submitted(ctx, i) {
      batches[count] = &ctx->batches.slots[i];
      syncobjs[count++] = ctx->batches.slots[i].syncobj;
   }

   if (!count)
      return -1;

   int ret = drmSyncobjWait(dev->fd, syncobjs, count, 0, 0, &first);
   assert(!ret || ret == -ETIME);
   if (ret)
      return -1;

   assert(first < AGX_MAX_BATCHES);
   agx_batch_cleanup(ctx, batches[first], false);
   return agx_batch_idx(batches[first]);
}

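/* Find a batch to record into for the given framebuffer state: reuse an
 * active batch with a matching key if possible, otherwise take a free slot,
 * otherwise clean up or evict the least-recently-used batch (preferring
 * batches that are already submitted).
 */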
static struct agx_batch *
agx_get_batch_for_framebuffer(struct agx_context *ctx,
                              const struct pipe_framebuffer_state *state)
{
   /* Check whether we already have a matching batch */
   unsigned i;
   foreach_active(ctx, i) {
      struct agx_batch *candidate = &ctx->batches.slots[i];

      if (util_framebuffer_state_equal(&candidate->key, state)) {
         /* We found a match, increase the seqnum for the LRU
          * eviction logic.
          */
         candidate->seqnum = ++ctx->batches.seqnum;
         return candidate;
      }
   }

   /* Look for a free batch */
   for (i = 0; i < AGX_MAX_BATCHES; ++i) {
      if (!BITSET_TEST(ctx->batches.active, i) &&
          !BITSET_TEST(ctx->batches.submitted, i)) {
         struct agx_batch *batch = &ctx->batches.slots[i];
         agx_batch_init(ctx, state, batch);
         return batch;
      }
   }

   /* Try to clean up one batch */
   int freed = agx_cleanup_batches(ctx);
   if (freed >= 0) {
      struct agx_batch *batch = &ctx->batches.slots[freed];
      agx_batch_init(ctx, state, batch);
      return batch;
   }

   /* Else, evict something */
   struct agx_batch *batch = NULL;
   bool submitted = false;
   for (i = 0; i < AGX_MAX_BATCHES; ++i) {
      struct agx_batch *candidate = &ctx->batches.slots[i];
      bool cand_submitted = BITSET_TEST(ctx->batches.submitted, i);

      /* Prefer submitted batches first */
      if (!cand_submitted && submitted)
         continue;

      if (!batch || batch->seqnum > candidate->seqnum) {
         batch = candidate;
         submitted = cand_submitted;
      }
   }
   assert(batch);

   agx_sync_batch_for_reason(ctx, batch, "Too many batches");

   /* Batch is now free */
   agx_batch_init(ctx, state, batch);
   return batch;
}

struct agx_batch *
agx_get_batch(struct agx_context *ctx)
{
   if (!ctx->batch || agx_batch_is_compute(ctx->batch)) {
      ctx->batch = agx_get_batch_for_framebuffer(ctx, &ctx->framebuffer);
      agx_dirty_all(ctx);
   }

   assert(util_framebuffer_state_equal(&ctx->framebuffer, &ctx->batch->key));
   return ctx->batch;
}

struct agx_batch *
agx_get_compute_batch(struct agx_context *ctx)
{
   agx_dirty_all(ctx);

   struct pipe_framebuffer_state key = {.width = AGX_COMPUTE_BATCH_WIDTH};
   ctx->batch = agx_get_batch_for_framebuffer(ctx, &key);
   return ctx->batch;
}

void
agx_flush_all(struct agx_context *ctx, const char *reason)
{
   unsigned idx;
   foreach_active(ctx, idx) {
      if (reason)
         perf_debug_ctx(ctx, "Flushing due to: %s\n", reason);

      agx_flush_batch(ctx, &ctx->batches.slots[idx]);
   }
}

void
agx_flush_batch_for_reason(struct agx_context *ctx, struct agx_batch *batch,
                           const char *reason)
{
   if (reason)
      perf_debug_ctx(ctx, "Flushing due to: %s\n", reason);

   if (agx_batch_is_active(batch))
      agx_flush_batch(ctx, batch);
}

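/* Flush (and optionally wait on) every batch other than `except` that reads
 * the given resource. Used to resolve hazards between batches.
 */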
static void
agx_flush_readers_except(struct agx_context *ctx, struct agx_resource *rsrc,
                         struct agx_batch *except, const char *reason,
                         bool sync)
{
   unsigned idx;

   /* Flush everything to the hardware first */
   foreach_active(ctx, idx) {
      struct agx_batch *batch = &ctx->batches.slots[idx];

      if (batch == except)
         continue;

      if (agx_batch_uses_bo(batch, rsrc->bo)) {
         perf_debug_ctx(ctx, "Flush reader due to: %s\n", reason);
         agx_flush_batch(ctx, batch);
      }
   }

   /* Then wait on everything if necessary */
   if (sync) {
      foreach_submitted(ctx, idx) {
         struct agx_batch *batch = &ctx->batches.slots[idx];

         if (batch == except)
            continue;

         if (agx_batch_uses_bo(batch, rsrc->bo)) {
            perf_debug_ctx(ctx, "Sync reader due to: %s\n", reason);
            agx_sync_batch(ctx, batch);
         }
      }
   }
}

static void
agx_flush_writer_except(struct agx_context *ctx, struct agx_resource *rsrc,
                        struct agx_batch *except, const char *reason, bool sync)
{
   struct agx_batch *writer = agx_writer_get(ctx, rsrc->bo->handle);

   if (writer && writer != except &&
       (agx_batch_is_active(writer) || agx_batch_is_submitted(writer))) {
      if (agx_batch_is_active(writer) || sync) {
         perf_debug_ctx(ctx, "%s writer due to: %s\n", sync ? "Sync" : "Flush",
                        reason);
      }
      if (agx_batch_is_active(writer))
         agx_flush_batch(ctx, writer);
      /* Check for submitted state, because if the batch was a no-op it'll
       * already be cleaned up */
      if (sync && agx_batch_is_submitted(writer))
         agx_sync_batch(ctx, writer);
   }
}

bool
agx_any_batch_uses_resource(struct agx_context *ctx, struct agx_resource *rsrc)
{
   unsigned idx;
   foreach_active(ctx, idx) {
      struct agx_batch *batch = &ctx->batches.slots[idx];

      if (agx_batch_uses_bo(batch, rsrc->bo))
         return true;
   }

   foreach_submitted(ctx, idx) {
      struct agx_batch *batch = &ctx->batches.slots[idx];

      if (agx_batch_uses_bo(batch, rsrc->bo))
         return true;
   }

   return false;
}

void
agx_flush_readers(struct agx_context *ctx, struct agx_resource *rsrc,
                  const char *reason)
{
   agx_flush_readers_except(ctx, rsrc, NULL, reason, false);
}

void
agx_sync_readers(struct agx_context *ctx, struct agx_resource *rsrc,
                 const char *reason)
{
   agx_flush_readers_except(ctx, rsrc, NULL, reason, true);
}

void
agx_flush_writer(struct agx_context *ctx, struct agx_resource *rsrc,
                 const char *reason)
{
   agx_flush_writer_except(ctx, rsrc, NULL, reason, false);
}

void
agx_sync_writer(struct agx_context *ctx, struct agx_resource *rsrc,
                const char *reason)
{
   agx_flush_writer_except(ctx, rsrc, NULL, reason, true);
}

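/* Track that a batch reads a resource: add its BO(s) to the batch's BO list
 * and flush any other batch that writes it, so the read is ordered after the
 * write.
 */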
void
agx_batch_reads(struct agx_batch *batch, struct agx_resource *rsrc)
{
   agx_batch_add_bo(batch, rsrc->bo);

   if (rsrc->separate_stencil)
      agx_batch_add_bo(batch, rsrc->separate_stencil->bo);

   /* Don't hazard track fake resources internally created for meta */
   if (!rsrc->base.screen)
      return;

   /* Hazard: read-after-write */
   agx_flush_writer_except(batch->ctx, rsrc, batch, "Read from another batch",
                           false);
}

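/* Track that a batch writes a resource: flush other readers and any previous
 * writer, then record this batch as the sole writer in the context's writer
 * map so later accesses know what to wait on.
 */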
static void
agx_batch_writes_internal(struct agx_batch *batch, struct agx_resource *rsrc,
                          unsigned level)
{
   struct agx_context *ctx = batch->ctx;
   struct agx_batch *writer = agx_writer_get(ctx, rsrc->bo->handle);

   assert(batch->initialized);

   agx_flush_readers_except(ctx, rsrc, batch, "Write from other batch", false);

   BITSET_SET(rsrc->data_valid, level);

   /* Nothing to do if we're already writing */
   if (writer == batch)
      return;

   /* Hazard: write-after-write, write-after-read */
   if (writer)
      agx_flush_writer(ctx, rsrc, "Multiple writers");

   /* Write is strictly stronger than a read */
   agx_batch_reads(batch, rsrc);

   writer = agx_writer_get(ctx, rsrc->bo->handle);
   assert(!writer || agx_batch_is_submitted(writer));

   /* We are now the new writer. Disregard the previous writer -- anything that
    * needs to wait for the writer going forward needs to wait for us.
    */
   agx_writer_remove(ctx, rsrc->bo->handle);
   agx_writer_add(ctx, agx_batch_idx(batch), rsrc->bo->handle);
   assert(agx_batch_is_active(batch));
}

void
agx_batch_writes(struct agx_batch *batch, struct agx_resource *rsrc,
                 unsigned level)
{
   agx_batch_writes_internal(batch, rsrc, level);

   if (rsrc->base.target == PIPE_BUFFER) {
      /* Assume BOs written by the GPU are fully valid */
      rsrc->valid_buffer_range.start = 0;
      rsrc->valid_buffer_range.end = ~0;
   }
}

void
agx_batch_writes_range(struct agx_batch *batch, struct agx_resource *rsrc,
                       unsigned offset, unsigned size)
{
   assert(rsrc->base.target == PIPE_BUFFER);
   agx_batch_writes_internal(batch, rsrc, 0);
   util_range_add(&rsrc->base, &rsrc->valid_buffer_range, offset,
                  offset + size);
}

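/* If gallium handed us an explicit in-fence fd, import it into the context's
 * reusable syncobj and return that syncobj handle so the next submit waits on
 * it; otherwise return 0 (no extra wait).
 */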
static int
agx_get_in_sync(struct agx_context *ctx)
{
   struct agx_device *dev = agx_device(ctx->base.screen);

   if (ctx->in_sync_fd >= 0) {
      int ret =
         drmSyncobjImportSyncFile(dev->fd, ctx->in_sync_obj, ctx->in_sync_fd);
      assert(!ret);

      close(ctx->in_sync_fd);
      ctx->in_sync_fd = -1;

      return ctx->in_sync_obj;
   } else {
      return 0;
   }
}

static void
agx_add_sync(struct drm_asahi_sync *syncs, unsigned *count, uint32_t handle)
{
   if (!handle)
      return;

   syncs[(*count)++] = (struct drm_asahi_sync){
      .sync_type = DRM_ASAHI_SYNC_SYNCOBJ,
      .handle = handle,
   };
}

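/* Submit a batch to the kernel. This gathers the in-fences the work must wait
 * on (explicit gallium fences, shared BOs, cross-context writers, and the
 * screen-wide flush timeline), builds the compute and/or render commands,
 * hands them to the DRM submit path, and then publishes this batch's syncobj
 * as the writer fence for every BO it writes.
 */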
void
agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch,
                 struct drm_asahi_cmd_compute *compute,
                 struct drm_asahi_cmd_render *render)
{
   struct agx_device *dev = agx_device(ctx->base.screen);
   struct agx_screen *screen = agx_screen(ctx->base.screen);

   bool feedback = dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC | AGX_DBG_STATS);

#ifndef NDEBUG
   /* Debug builds always get feedback (for fault checks) */
   feedback = true;
#endif

   /* Timer queries use the feedback timestamping */
   feedback |= (batch->timestamps.size > 0);

   if (!feedback)
      batch->result = NULL;

   /* We allocate the worst-case sync array size since this won't be excessive
    * for most workloads
    */
   unsigned max_syncs = batch->bo_list.bit_count + 2;
   unsigned in_sync_count = 0;
   unsigned shared_bo_count = 0;
   struct drm_asahi_sync *in_syncs =
      malloc(max_syncs * sizeof(struct drm_asahi_sync));
   struct agx_bo **shared_bos = malloc(max_syncs * sizeof(struct agx_bo *));

   uint64_t wait_seqid = p_atomic_read(&screen->flush_wait_seqid);

   struct agx_submit_virt virt = {
      .vbo_res_id = ctx->result_buf->vbo_res_id,
   };

   /* Elide syncing against our own queue */
   if (wait_seqid && wait_seqid == ctx->flush_my_seqid) {
      batch_debug(batch,
                  "Wait sync point %" PRIu64 " is ours, waiting on %" PRIu64
                  " instead",
                  wait_seqid, ctx->flush_other_seqid);
      wait_seqid = ctx->flush_other_seqid;
   }

   uint64_t seqid = p_atomic_inc_return(&screen->flush_cur_seqid);
   assert(seqid > wait_seqid);

   batch_debug(batch, "Sync point is %" PRIu64, seqid);

   /* Subtle concurrency note: Since we assign seqids atomically and do
    * not lock submission across contexts, it is possible for two threads
    * to submit timeline syncobj updates out of order. As far as I can
    * tell, this case is handled in the kernel conservatively: it triggers
    * a fence context bump and effectively "splits" the timeline at the
    * larger point, causing future lookups for earlier points to return a
    * later point, waiting more. The signaling code still makes sure all
    * prior fences have to be signaled before considering a given point
    * signaled, regardless of order. That's good enough for us.
    *
    * (Note: this case breaks drm_syncobj_query_ioctl and for this reason
    * triggers a DRM_DEBUG message on submission, but we don't use that
    * so we don't care.)
    *
    * This case can be tested by setting seqid = 1 unconditionally here,
    * causing every single syncobj update to reuse the same timeline point.
    * Everything still works (but over-synchronizes because this effectively
    * serializes all submissions once any context flushes once).
    */
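   /* Two out-fences: the batch's own syncobj, used for intra-context
    * dependency tracking and gallium fences, and the screen-wide flush
    * timeline at this batch's sequence number, used for cross-context
    * ordering.
    */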
   struct drm_asahi_sync out_syncs[2] = {
      {
         .sync_type = DRM_ASAHI_SYNC_SYNCOBJ,
         .handle = batch->syncobj,
      },
      {
         .sync_type = DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ,
         .handle = screen->flush_syncobj,
         .timeline_value = seqid,
      },
   };

   /* This lock protects against a subtle race scenario:
    * - Context 1 submits and registers itself as writer for a BO
    * - Context 2 runs the below loop, and finds the writer syncobj
    * - Context 1 is destroyed,
    *   - flushing all batches, unregistering itself as a writer, and
    *   - destroying syncobjs for all batches
    * - Context 2 submits, with a now invalid syncobj ID
    *
    * Since batch syncobjs are only destroyed on context destruction, we can
    * protect against this scenario with a screen-wide rwlock to ensure that
    * the syncobj destroy code cannot run concurrently with any other
    * submission. If a submit runs before the wrlock is taken, the syncobjs
    * must still exist (even if the batch was flushed and no longer a writer).
    * If it runs after the wrlock is released, then by definition the
    * just-destroyed syncobjs cannot be writers for any BO at that point.
    *
    * A screen-wide (not device-wide) rwlock is sufficient because by
    * definition resources can only be implicitly shared within a screen. Any
    * shared resources across screens must have been imported and will go
    * through the AGX_BO_SHARED path instead, which has no race (but is
    * slower).
    */
   u_rwlock_rdlock(&screen->destroy_lock);

   int handle;
   AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) {
      struct agx_bo *bo = agx_lookup_bo(dev, handle);

      if (bo->flags & AGX_BO_SHARED) {
         batch_debug(batch, "Waits on shared BO @ 0x%" PRIx64, bo->va->addr);

         /* Get a sync file fd from the buffer */
         int in_sync_fd = agx_export_sync_file(dev, bo);
         assert(in_sync_fd >= 0);

         /* Create a new syncobj */
         uint32_t sync_handle;
         int ret = drmSyncobjCreate(dev->fd, 0, &sync_handle);
         assert(ret >= 0);

         /* Import the sync file into it */
         ret = drmSyncobjImportSyncFile(dev->fd, sync_handle, in_sync_fd);
         assert(ret >= 0);
         assert(sync_handle);
         close(in_sync_fd);

         /* Add it to our wait list */
         agx_add_sync(in_syncs, &in_sync_count, sync_handle);

         /* And keep track of the BO for cloning the out_sync */
         shared_bos[shared_bo_count++] = bo;
         if (dev->is_virtio)
            virt.extres_count++;
      } else {
         /* Deal with BOs which are not externally shared, but which have been
          * written from another context within the same screen. We also need
          * to wait on these using their syncobj.
          */
         uint64_t writer = p_atomic_read_relaxed(&bo->writer);
         uint32_t queue_id = agx_bo_writer_queue(writer);
         if (writer && queue_id != ctx->queue_id) {
            batch_debug(
               batch, "Waits on inter-context BO @ 0x%" PRIx64 " from queue %u",
               bo->va->addr, queue_id);

            agx_add_sync(in_syncs, &in_sync_count,
                         agx_bo_writer_syncobj(writer));
            shared_bos[shared_bo_count++] = NULL;
         }
      }
   }

   if (dev->is_virtio && virt.extres_count) {
      struct agx_bo **p = shared_bos;
      virt.extres =
         malloc(virt.extres_count * sizeof(struct asahi_ccmd_submit_res));

      for (unsigned i = 0; i < virt.extres_count; i++) {
         while (!*p)
            p++; // Skip inter-context slots which are not recorded here
         virt.extres[i].res_id = (*p)->vbo_res_id;
         virt.extres[i].flags = ASAHI_EXTRES_READ | ASAHI_EXTRES_WRITE;
         p++;
      }
   }

   if (dev->debug & AGX_DBG_SCRATCH) {
      if (compute)
         agx_scratch_debug_pre(&ctx->scratch_cs);
      if (render) {
         agx_scratch_debug_pre(&ctx->scratch_vs);
         agx_scratch_debug_pre(&ctx->scratch_fs);
      }
   }

   /* Add an explicit fence from gallium, if any */
   agx_add_sync(in_syncs, &in_sync_count, agx_get_in_sync(ctx));

   /* Add an implicit cross-context flush sync point, if any */
   if (wait_seqid) {
      batch_debug(batch, "Waits on inter-context sync point %" PRIu64,
                  wait_seqid);
      in_syncs[in_sync_count++] = (struct drm_asahi_sync){
         .sync_type = DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ,
         .handle = screen->flush_syncobj,
         .timeline_value = wait_seqid,
      };
   }

   /* Submit! */
   struct drm_asahi_command commands[2];
   unsigned command_count = 0;

   if (compute) {
      commands[command_count++] = (struct drm_asahi_command){
         .cmd_type = DRM_ASAHI_CMD_COMPUTE,
         .flags = 0,
         .cmd_buffer = (uint64_t)(uintptr_t)compute,

         /* Workaround for shipping 6.11.8 kernels; remove when we bump the
          * UAPI.
          */
         .cmd_buffer_size = sizeof(struct drm_asahi_cmd_compute) - 8,
         .result_offset = feedback ? batch->result_off : 0,
         .result_size = feedback ? sizeof(union agx_batch_result) : 0,
         /* Barrier on previous submission */
         .barriers = {0, 0},
      };
   }

   if (render) {
      commands[command_count++] = (struct drm_asahi_command){
         .cmd_type = DRM_ASAHI_CMD_RENDER,
         .flags = 0,
         .cmd_buffer = (uint64_t)(uintptr_t)render,
         .cmd_buffer_size = sizeof(struct drm_asahi_cmd_render),
         .result_offset =
            feedback ? (batch->result_off + sizeof(union agx_batch_result)) : 0,
         .result_size = feedback ? sizeof(union agx_batch_result) : 0,
         /* Barrier on previous submission */
         .barriers = {compute ? DRM_ASAHI_BARRIER_NONE : 0, compute ? 1 : 0},
      };
   }

   struct drm_asahi_submit submit = {
      .flags = 0,
      .queue_id = ctx->queue_id,
      .result_handle = feedback ? ctx->result_buf->handle : 0,
      .in_sync_count = in_sync_count,
      .out_sync_count = 2,
      .command_count = command_count,
      .in_syncs = (uint64_t)(uintptr_t)(in_syncs),
      .out_syncs = (uint64_t)(uintptr_t)(out_syncs),
      .commands = (uint64_t)(uintptr_t)(&commands[0]),
   };

   int ret = dev->ops.submit(dev, &submit, &virt);

   u_rwlock_rdunlock(&screen->destroy_lock);

   if (ret) {
      if (compute) {
         fprintf(stderr, "DRM_IOCTL_ASAHI_SUBMIT compute failed: %m\n");
      }

      if (render) {
         struct drm_asahi_cmd_render *c = render;
         fprintf(
            stderr,
            "DRM_IOCTL_ASAHI_SUBMIT render failed: %m (%dx%d tile %dx%d layers %d samples %d)\n",
            c->fb_width, c->fb_height, c->utile_width, c->utile_height,
            c->layers, c->samples);
      }

      assert(0);
   }

   if (ret == ENODEV)
      abort();

   /* Now stash our batch fence into any shared BOs. */
   if (shared_bo_count) {
      /* Convert our handle to a sync file */
      int out_sync_fd = -1;
      int ret = drmSyncobjExportSyncFile(dev->fd, batch->syncobj, &out_sync_fd);
      assert(ret >= 0);
      assert(out_sync_fd >= 0);

      for (unsigned i = 0; i < shared_bo_count; i++) {
         if (!shared_bos[i])
            continue;

         batch_debug(batch, "Signals shared BO @ 0x%" PRIx64,
                     shared_bos[i]->va->addr);

         /* Free the in_sync handle we just acquired */
         ret = drmSyncobjDestroy(dev->fd, in_syncs[i].handle);
         assert(ret >= 0);
         /* And then import the out_sync sync file into it */
         ret = agx_import_sync_file(dev, shared_bos[i], out_sync_fd);
         assert(ret >= 0);
      }

      close(out_sync_fd);
   }

   /* Record the syncobj on each BO we write, so it can be added post-facto as
    * a fence if the BO is exported later...
    */
   AGX_BATCH_FOREACH_BO_HANDLE(batch, handle) {
      struct agx_bo *bo = agx_lookup_bo(dev, handle);
      struct agx_batch *writer = agx_writer_get(ctx, handle);

      if (!writer)
         continue;

      /* Skip BOs that are written by submitted batches, they're not ours */
      if (agx_batch_is_submitted(writer))
         continue;

      /* But any BOs written by active batches are ours */
      assert(writer == batch && "exclusive writer");
      p_atomic_set(&bo->writer, agx_bo_writer(ctx->queue_id, batch->syncobj));
      batch_debug(batch, "Writes to BO @ 0x%" PRIx64, bo->va->addr);
   }

   free(in_syncs);
   free(shared_bos);

   if (dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC | AGX_DBG_SCRATCH)) {
      if (dev->debug & AGX_DBG_TRACE) {
         if (compute) {
            agxdecode_drm_cmd_compute(dev->agxdecode, &dev->params, compute,
                                      true);
         }

         if (render) {
            agxdecode_drm_cmd_render(dev->agxdecode, &dev->params, render,
                                     true);
         }

         agxdecode_next_frame();
      }

      /* Wait so we can get errors reported back */
      int ret = drmSyncobjWait(dev->fd, &batch->syncobj, 1, INT64_MAX, 0, NULL);
      assert(!ret);

      agx_batch_print_stats(dev, batch);

      if (dev->debug & AGX_DBG_SCRATCH) {
         if (compute) {
            fprintf(stderr, "CS scratch:\n");
            agx_scratch_debug_post(&ctx->scratch_cs);
         }
         if (render) {
            fprintf(stderr, "VS scratch:\n");
            agx_scratch_debug_post(&ctx->scratch_vs);
            fprintf(stderr, "FS scratch:\n");
            agx_scratch_debug_post(&ctx->scratch_fs);
         }
      }
   }

   agx_batch_mark_submitted(batch);

   if (virt.extres)
      free(virt.extres);

   /* Record the last syncobj for fence creation */
   ctx->syncobj = batch->syncobj;

   /* Update the last seqid in the context (must only happen if the submit
    * succeeded, otherwise the timeline point would not be valid).
    */
   ctx->flush_last_seqid = seqid;

   if (ctx->batch == batch)
      ctx->batch = NULL;

   /* Try to clean up up to two batches, to keep memory usage down */
   if (agx_cleanup_batches(ctx) >= 0)
      agx_cleanup_batches(ctx);
}

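/* Flush a batch if needed and then block until its GPU work has completed.
 * Empty batches may already have been cleaned up by the flush, in which case
 * there is nothing to wait on.
 */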
void
agx_sync_batch(struct agx_context *ctx, struct agx_batch *batch)
{
   struct agx_device *dev = agx_device(ctx->base.screen);

   if (agx_batch_is_active(batch))
      agx_flush_batch(ctx, batch);

   /* Empty batch case, already cleaned up */
   if (!agx_batch_is_submitted(batch))
      return;

   assert(batch->syncobj);
   int ret = drmSyncobjWait(dev->fd, &batch->syncobj, 1, INT64_MAX, 0, NULL);
   assert(!ret);
   agx_batch_cleanup(ctx, batch, false);
}

void
agx_sync_batch_for_reason(struct agx_context *ctx, struct agx_batch *batch,
                          const char *reason)
{
   if (reason)
      perf_debug_ctx(ctx, "Syncing due to: %s\n", reason);

   agx_sync_batch(ctx, batch);
}

void
agx_sync_all(struct agx_context *ctx, const char *reason)
{
   if (reason)
      perf_debug_ctx(ctx, "Syncing all due to: %s\n", reason);

   unsigned idx;
   foreach_active(ctx, idx) {
      agx_flush_batch(ctx, &ctx->batches.slots[idx]);
   }

   foreach_submitted(ctx, idx) {
      agx_sync_batch(ctx, &ctx->batches.slots[idx]);
   }
}

void
agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch)
{
   batch_debug(batch, "RESET");

   assert(!batch->initialized);

   /* Reset an empty batch. Like submit, but does nothing. */
   agx_batch_mark_submitted(batch);

   if (ctx->batch == batch)
      ctx->batch = NULL;

   /* Elide printing stats */
   batch->result = NULL;

   agx_batch_cleanup(ctx, batch, true);
}

/*
 * Timestamp queries record the time after all current work is finished, which
 * we handle as the time after all current batches finish (since we're a tiler
 * and would rather not split the batch). So add a query to all active batches.
 */
void
agx_add_timestamp_end_query(struct agx_context *ctx, struct agx_query *q)
{
   unsigned idx;
   foreach_active(ctx, idx) {
      agx_batch_add_timestamp_query(&ctx->batches.slots[idx], q);
   }
}

/*
 * To implement a memory barrier conservatively, flush any batch that contains
 * an incoherent memory write (requiring a memory barrier to synchronize). This
 * could be further optimized.
 */
void
agx_memory_barrier(struct pipe_context *pctx, unsigned flags)
{
   struct agx_context *ctx = agx_context(pctx);

   unsigned i;
   foreach_active(ctx, i) {
      struct agx_batch *batch = &ctx->batches.slots[i];

      if (batch->incoherent_writes)
         agx_flush_batch_for_reason(ctx, batch, "Memory barrier");
   }
}