1 /*
2 * Copyright (C) 2023 Collabora Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include "decode.h"
25
26 #include "drm-uapi/panthor_drm.h"
27
28 #include "genxml/cs_builder.h"
29 #include "panfrost/lib/genxml/cs_builder.h"
30
31 #include "gen_macros.h"
32 #include "pan_cmdstream.h"
33 #include "pan_context.h"
34 #include "pan_csf.h"
35 #include "pan_fb_preload.h"
36 #include "pan_job.h"
37
38 #if PAN_ARCH < 10
39 #error "CSF helpers are only used for gen >= 10"
40 #endif
41
42 static struct cs_buffer
csf_alloc_cs_buffer(void * cookie)43 csf_alloc_cs_buffer(void *cookie)
44 {
45 assert(cookie && "Self-contained queues can't be extended.");
46
47 struct panfrost_batch *batch = cookie;
48 unsigned capacity = 4096;
49
50 struct panfrost_ptr ptr =
51 pan_pool_alloc_aligned(&batch->csf.cs_chunk_pool.base, capacity * 8, 64);
52
53 return (struct cs_buffer){
54 .cpu = ptr.cpu,
55 .gpu = ptr.gpu,
56 .capacity = capacity,
57 };
58 }
59
60 /*
61 * Register is reserved to pass the batch tiler OOM context
62 */
63 #define TILER_OOM_CTX_REG 76
64
65 static enum cs_reg_perm
csf_reg_perm_cb(struct cs_builder * b,unsigned reg)66 csf_reg_perm_cb(struct cs_builder *b, unsigned reg)
67 {
68 if (reg == TILER_OOM_CTX_REG)
69 return CS_REG_RD;
70 return CS_REG_RW;
71 }
72
73 static void
csf_update_tiler_oom_ctx(struct cs_builder * b,uint64_t addr)74 csf_update_tiler_oom_ctx(struct cs_builder *b, uint64_t addr)
75 {
76 reg_perm_cb_t orig_cb;
77
78 if (likely(!b->conf.reg_perm)) {
79 cs_move64_to(b, cs_reg64(b, TILER_OOM_CTX_REG), addr);
80 return;
81 }
82
83 orig_cb = b->conf.reg_perm;
84 b->conf.reg_perm = NULL;
85 cs_move64_to(b, cs_reg64(b, TILER_OOM_CTX_REG), addr);
86 b->conf.reg_perm = orig_cb;
87 }
88
89 #define FIELD_OFFSET(_name) offsetof(struct pan_csf_tiler_oom_ctx, _name)
90
91 #define FBD_OFFSET(_pass) \
92 (FIELD_OFFSET(fbds) + \
93 (PAN_INCREMENTAL_RENDERING_##_pass##_PASS * sizeof(struct panfrost_ptr)) + \
94 offsetof(struct panfrost_ptr, gpu))
95
/*
 * Build the exception-handler command stream executed when the tiler runs
 * out of heap memory.
 *
 * The handler performs one incremental-rendering step: it flushes what has
 * been binned so far with a fragment job (using the FIRST or MIDDLE
 * framebuffer descriptor from the tiler-OOM context), bumps the trigger
 * counter, recycles completed heap chunks via cs_finish_fragment(), resets
 * the tiler descriptor's polygon list / completed fields, and flushes
 * texture caches so later preloads see the new content.
 *
 * Returns 0 on success, -1 if the backing BOs could not be allocated.
 */
static int
csf_oom_handler_init(struct panfrost_context *ctx)
{
   struct panfrost_bo *cs_bo = NULL, *reg_save_bo = NULL;
   struct panfrost_device *dev = pan_device(ctx->base.screen);

   /* One BO holds the handler instructions, the other is scratch space
    * used to dump registers on exception entry (see dump_addr_offset). */
   cs_bo =
      panfrost_bo_create(dev, 4096, 0, "Temporary CS buffer");
   reg_save_bo =
      panfrost_bo_create(dev, 4096, 0, "reg save bo");

   if (!cs_bo || !reg_save_bo)
      goto fail;

   struct cs_buffer queue = {
      .cpu = cs_bo->ptr.cpu,
      .gpu = cs_bo->ptr.gpu,
      .capacity = panfrost_bo_size(cs_bo) / sizeof(uint64_t),
   };
   struct cs_builder b;
   const struct cs_builder_conf conf = {
      .nr_registers = 96,
      .nr_kernel_registers = 4,
      .reg_perm = (dev->debug & PAN_DBG_CS) ? csf_reg_perm_cb : NULL,
   };
   cs_builder_init(&b, &conf, queue);

   struct cs_exception_handler_ctx handler_ctx = {
      .ctx_reg = cs_reg64(&b, TILER_OOM_CTX_REG),
      .dump_addr_offset = offsetof(struct pan_csf_tiler_oom_ctx, dump_addr),
      .ls_sb_slot = 0,
   };
   struct cs_exception_handler handler;

   cs_exception_handler_def(&b, &handler, handler_ctx) {
      struct cs_index tiler_oom_ctx = cs_reg64(&b, TILER_OOM_CTX_REG);
      struct cs_index counter = cs_reg32(&b, 47);
      /* NOTE(review): zero (r48-49) and flush_id (r48) share registers on
       * purpose; zero is dead by the time cs_flush_caches() writes
       * flush_id. */
      struct cs_index zero = cs_reg64(&b, 48);
      struct cs_index flush_id = cs_reg32(&b, 48);
      struct cs_index tiler_ctx = cs_reg64(&b, 50);
      struct cs_index completed_top = cs_reg64(&b, 52);
      struct cs_index completed_bottom = cs_reg64(&b, 54);
      struct cs_index completed_chunks = cs_reg_tuple(&b, 52, 4);

      /* Use different framebuffer descriptor depending on whether incremental
       * rendering has already been triggered */
      cs_load32_to(&b, counter, tiler_oom_ctx, FIELD_OFFSET(counter));
      cs_wait_slot(&b, 0, false);
      cs_if(&b, MALI_CS_CONDITION_GREATER, counter) {
         cs_load64_to(&b, cs_reg64(&b, 40), tiler_oom_ctx, FBD_OFFSET(MIDDLE));
      }
      cs_else(&b) {
         cs_load64_to(&b, cs_reg64(&b, 40), tiler_oom_ctx, FBD_OFFSET(FIRST));
      }

      /* r42/r43 carry the render bounding box; r44-46 are zeroed, matching
       * the register setup done in csf_emit_fragment_job(). */
      cs_load32_to(&b, cs_reg32(&b, 42), tiler_oom_ctx, FIELD_OFFSET(bbox_min));
      cs_load32_to(&b, cs_reg32(&b, 43), tiler_oom_ctx, FIELD_OFFSET(bbox_max));
      cs_move64_to(&b, cs_reg64(&b, 44), 0);
      cs_move32_to(&b, cs_reg32(&b, 46), 0);
      cs_wait_slot(&b, 0, false);

      /* Run the fragment job and wait */
      cs_set_scoreboard_entry(&b, 3, 0);
      cs_run_fragment(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
      cs_wait_slot(&b, 3, false);

      /* Increment counter */
      cs_add32(&b, counter, counter, 1);
      cs_store32(&b, counter, tiler_oom_ctx, FIELD_OFFSET(counter));

      /* Load completed chunks */
      cs_load64_to(&b, tiler_ctx, tiler_oom_ctx, FIELD_OFFSET(tiler_desc));
      cs_wait_slot(&b, 0, false);
      cs_load_to(&b, completed_chunks, tiler_ctx, BITFIELD_MASK(4), 10 * 4);
      cs_wait_slot(&b, 0, false);

      cs_finish_fragment(&b, false, completed_top, completed_bottom, cs_now());

      /* Zero out polygon list, completed_top and completed_bottom */
      cs_move64_to(&b, zero, 0);
      cs_store64(&b, zero, tiler_ctx, 0);
      cs_store64(&b, zero, tiler_ctx, 10 * 4);
      cs_store64(&b, zero, tiler_ctx, 12 * 4);

      /* We need to flush the texture caches so future preloads see the new
       * content. */
      cs_flush_caches(&b, MALI_CS_FLUSH_MODE_NONE, MALI_CS_FLUSH_MODE_NONE,
                      true, flush_id, cs_defer(0, 0));

      cs_wait_slot(&b, 0, false);

      cs_set_scoreboard_entry(&b, 2, 0);
   }

   assert(cs_is_valid(&b));
   cs_finish(&b);

   /* Publish the handler: instruction BO, encoded length in bytes, and the
    * register-save scratch BO. */
   ctx->csf.tiler_oom_handler.cs_bo = cs_bo;
   ctx->csf.tiler_oom_handler.length = handler.length * sizeof(uint64_t);
   ctx->csf.tiler_oom_handler.save_bo = reg_save_bo;

   return 0;

fail:
   if (cs_bo)
      panfrost_bo_unreference(cs_bo);

   if (reg_save_bo)
      panfrost_bo_unreference(reg_save_bo);

   return -1;
}
207
208 #undef FBD_OFFSET
209 #undef FIELD_OFFSET
210
211 void
GENX(csf_cleanup_batch)212 GENX(csf_cleanup_batch)(struct panfrost_batch *batch)
213 {
214 free(batch->csf.cs.builder);
215 free(batch->csf.cs.ls_tracker);
216
217 panfrost_pool_cleanup(&batch->csf.cs_chunk_pool);
218 }
219
220 static inline struct panfrost_ptr
alloc_fbd(struct panfrost_batch * batch)221 alloc_fbd(struct panfrost_batch *batch)
222 {
223 return pan_pool_alloc_desc_aggregate(
224 &batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION),
225 PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
226 }
227
228 int
GENX(csf_init_batch)229 GENX(csf_init_batch)(struct panfrost_batch *batch)
230 {
231 struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
232
233 /* Initialize the CS chunk pool. */
234 if (panfrost_pool_init(&batch->csf.cs_chunk_pool, NULL, dev, 0, 32768,
235 "CS chunk pool", false, true))
236 return -1;
237
238 if (dev->debug & PAN_DBG_CS) {
239 /* Load/store tracker if extra checks are enabled. */
240 batch->csf.cs.ls_tracker =
241 calloc(1, sizeof(struct cs_load_store_tracker));
242 batch->csf.cs.ls_tracker->sb_slot = 0;
243 }
244
245 /* Allocate and bind the command queue */
246 struct cs_buffer queue = csf_alloc_cs_buffer(batch);
247 if (!queue.gpu)
248 return -1;
249
250 const struct cs_builder_conf conf = {
251 .nr_registers = 96,
252 .nr_kernel_registers = 4,
253 .alloc_buffer = csf_alloc_cs_buffer,
254 .cookie = batch,
255 .ls_tracker = batch->csf.cs.ls_tracker,
256 .reg_perm = (dev->debug & PAN_DBG_CS) ? csf_reg_perm_cb : NULL,
257 };
258
259 /* Setup the queue builder */
260 batch->csf.cs.builder = malloc(sizeof(struct cs_builder));
261 cs_builder_init(batch->csf.cs.builder, &conf, queue);
262 cs_req_res(batch->csf.cs.builder,
263 CS_COMPUTE_RES | CS_TILER_RES | CS_IDVS_RES | CS_FRAG_RES);
264
265 /* Set up entries */
266 struct cs_builder *b = batch->csf.cs.builder;
267 cs_set_scoreboard_entry(b, 2, 0);
268
269 batch->framebuffer = alloc_fbd(batch);
270 if (!batch->framebuffer.gpu)
271 return -1;
272
273 batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
274 if (!batch->tls.cpu)
275 return -1;
276
277 return 0;
278 }
279
280 static void
csf_prepare_qsubmit(struct panfrost_context * ctx,struct drm_panthor_queue_submit * submit,uint8_t queue,uint64_t cs_start,uint32_t cs_size,struct drm_panthor_sync_op * syncs,uint32_t sync_count)281 csf_prepare_qsubmit(struct panfrost_context *ctx,
282 struct drm_panthor_queue_submit *submit, uint8_t queue,
283 uint64_t cs_start, uint32_t cs_size,
284 struct drm_panthor_sync_op *syncs, uint32_t sync_count)
285 {
286 struct panfrost_device *dev = pan_device(ctx->base.screen);
287
288 *submit = (struct drm_panthor_queue_submit){
289 .queue_index = queue,
290 .stream_addr = cs_start,
291 .stream_size = cs_size,
292 .latest_flush = panthor_kmod_get_flush_id(dev->kmod.dev),
293 .syncs = DRM_PANTHOR_OBJ_ARRAY(sync_count, syncs),
294 };
295 }
296
297 static void
csf_prepare_gsubmit(struct panfrost_context * ctx,struct drm_panthor_group_submit * gsubmit,struct drm_panthor_queue_submit * qsubmits,uint32_t qsubmit_count)298 csf_prepare_gsubmit(struct panfrost_context *ctx,
299 struct drm_panthor_group_submit *gsubmit,
300 struct drm_panthor_queue_submit *qsubmits,
301 uint32_t qsubmit_count)
302 {
303 *gsubmit = (struct drm_panthor_group_submit){
304 .group_handle = ctx->csf.group_handle,
305 .queue_submits = DRM_PANTHOR_OBJ_ARRAY(qsubmit_count, qsubmits),
306 };
307 }
308
309 static int
csf_submit_gsubmit(struct panfrost_context * ctx,struct drm_panthor_group_submit * gsubmit)310 csf_submit_gsubmit(struct panfrost_context *ctx,
311 struct drm_panthor_group_submit *gsubmit)
312 {
313 struct panfrost_device *dev = pan_device(ctx->base.screen);
314 int ret = 0;
315
316 if (!ctx->is_noop) {
317 ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_SUBMIT,
318 gsubmit);
319 }
320
321 if (ret)
322 return errno;
323
324 return 0;
325 }
326
/*
 * Emit the epilogue of a batch command stream: a barrier on all scoreboard
 * slots, an optional CS error-state capture (PAN_DBG_SYNC), and a
 * synchronous cache clean, then seal the stream.
 *
 * Returns 0 on success, -1 on allocation failure or an invalid stream.
 */
static int
csf_emit_batch_end(struct panfrost_batch *batch)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
   struct cs_builder *b = batch->csf.cs.builder;

   /* Barrier to let everything finish */
   cs_wait_slots(b, BITFIELD_MASK(8), false);

   if (dev->debug & PAN_DBG_SYNC) {
      /* Get the CS state */
      batch->csf.cs.state = pan_pool_alloc_aligned(&batch->pool.base, 8, 8);
      if (!batch->csf.cs.state.cpu)
         return -1;

      /* Seed with ~0 so csf_submit_wait_and_dump() can distinguish "error
       * status was stored" from "CS never got this far" (it treats any
       * non-zero value as a failure). */
      memset(batch->csf.cs.state.cpu, ~0, 8);
      cs_move64_to(b, cs_reg64(b, 90), batch->csf.cs.state.gpu);
      cs_store_state(b, cs_reg64(b, 90), 0, MALI_CS_STATE_ERROR_STATUS,
                     cs_now());
   }

   /* Flush caches now that we're done (synchronous) */
   struct cs_index flush_id = cs_reg32(b, 74);
   cs_move32_to(b, flush_id, 0);
   cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN, true,
                   flush_id, cs_defer(0, 0));
   cs_wait_slot(b, 0, false);

   /* Finish the command stream */
   if (!cs_is_valid(batch->csf.cs.builder))
      return -1;

   cs_finish(batch->csf.cs.builder);
   return 0;
}
362
/*
 * Collect the sync operations this batch must wait on before executing:
 * the sync points of externally-visible BOs it accesses, a single
 * coalesced wait on the VM timeline, and the context's imported in-fence
 * if one is pending.  Appends drm_panthor_sync_op entries to syncops.
 *
 * Returns 0 on success, or the error from a kmod/libdrm helper.
 */
static int
csf_submit_collect_wait_ops(struct panfrost_batch *batch,
                            struct util_dynarray *syncops,
                            uint32_t vm_sync_handle)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_device *dev = pan_device(ctx->base.screen);
   uint64_t vm_sync_wait_point = 0, bo_sync_point;
   uint32_t bo_sync_handle;
   int ret;

   /* We don't wait on BOs attached to the various batch pools, because those
    * are private to the batch, and are guaranteed to be idle at allocation
    * time. We need to iterate over other BOs accessed by the batch though,
    * to add the corresponding wait operations.
    */
   util_dynarray_foreach(&batch->bos, pan_bo_access, ptr) {
      /* The index into the access array is the BO handle. */
      unsigned i = ptr - util_dynarray_element(&batch->bos, pan_bo_access, 0);
      pan_bo_access flags = *ptr;

      if (!flags)
         continue;

      /* Update the BO access flags so that panfrost_bo_wait() knows
       * about all pending accesses.
       * We only keep the READ/WRITE info since this is all the BO
       * wait logic cares about.
       * We also preserve existing flags as this batch might not
       * be the first one to access the BO.
       */
      struct panfrost_bo *bo = pan_lookup_bo(dev, i);

      /* A read-only access only needs to wait for the last writer. */
      ret = panthor_kmod_bo_get_sync_point(bo->kmod_bo, &bo_sync_handle,
                                           &bo_sync_point,
                                           !(flags & PAN_BO_ACCESS_WRITE));
      if (ret)
         return ret;

      /* Waits on the VM timeline are coalesced into one wait on the
       * maximum point, added after the loop. */
      if (bo_sync_handle == vm_sync_handle) {
         vm_sync_wait_point = MAX2(vm_sync_wait_point, bo_sync_point);
         continue;
      }

      assert(bo_sync_point == 0 || !bo->kmod_bo->exclusive_vm);

      /* A zero sync point denotes a binary syncobj rather than a
       * timeline point. */
      struct drm_panthor_sync_op waitop = {
         .flags =
            DRM_PANTHOR_SYNC_OP_WAIT |
            (bo_sync_point ? DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ
                           : DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ),
         .handle = bo_sync_handle,
         .timeline_value = bo_sync_point,
      };

      util_dynarray_append(syncops, struct drm_panthor_sync_op, waitop);
   }

   if (vm_sync_wait_point > 0) {
      struct drm_panthor_sync_op waitop = {
         .flags = DRM_PANTHOR_SYNC_OP_WAIT |
                  DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ,
         .handle = vm_sync_handle,
         .timeline_value = vm_sync_wait_point,
      };

      util_dynarray_append(syncops, struct drm_panthor_sync_op, waitop);
   }

   /* Import the context's pending in-fence FD into its syncobj, wait on
    * it, then consume the FD. */
   if (ctx->in_sync_fd >= 0) {
      ret = drmSyncobjImportSyncFile(panfrost_device_fd(dev), ctx->in_sync_obj,
                                     ctx->in_sync_fd);
      if (ret)
         return ret;

      struct drm_panthor_sync_op waitop = {
         .flags =
            DRM_PANTHOR_SYNC_OP_WAIT | DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ,
         .handle = ctx->in_sync_obj,
      };

      util_dynarray_append(syncops, struct drm_panthor_sync_op, waitop);

      close(ctx->in_sync_fd);
      ctx->in_sync_fd = -1;
   }

   return 0;
}
451
/*
 * After a successful submit, attach the freshly-signaled VM timeline point
 * to every BO touched by the batch (pool BOs and externally-visible BOs)
 * so future submissions and CPU waits can synchronize on it, then mirror
 * the point into the context syncobj.
 *
 * Returns 0 on success, or the error from a kmod/libdrm helper.
 */
static int
csf_attach_sync_points(struct panfrost_batch *batch, uint32_t vm_sync_handle,
                       uint64_t vm_sync_signal_point)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_device *dev = pan_device(ctx->base.screen);
   int ret;

   /* There should be no invisible allocation on CSF. */
   assert(batch->invisible_pool.bos.size == 0);

   /* Attach sync points to batch-private BOs first. We assume BOs can
    * be written by the GPU to keep things simple.
    */
   util_dynarray_foreach(&batch->pool.bos, struct panfrost_bo *, bo) {
      (*bo)->gpu_access |= PAN_BO_ACCESS_RW;
      ret = panthor_kmod_bo_attach_sync_point((*bo)->kmod_bo, vm_sync_handle,
                                              vm_sync_signal_point, true);
      if (ret)
         return ret;
   }

   /* Same for the BOs backing the command-stream chunks. */
   util_dynarray_foreach(&batch->csf.cs_chunk_pool.bos, struct panfrost_bo *,
                         bo) {
      (*bo)->gpu_access |= PAN_BO_ACCESS_RW;
      ret = panthor_kmod_bo_attach_sync_point((*bo)->kmod_bo, vm_sync_handle,
                                              vm_sync_signal_point, true);
      if (ret)
         return ret;
   }

   /* Attach the VM sync point to all resources accessed by the batch. */
   util_dynarray_foreach(&batch->bos, pan_bo_access, ptr) {
      /* The index into the access array is the BO handle. */
      unsigned i = ptr - util_dynarray_element(&batch->bos, pan_bo_access, 0);
      pan_bo_access flags = *ptr;

      if (!flags)
         continue;

      struct panfrost_bo *bo = pan_lookup_bo(dev, i);

      bo->gpu_access |= flags & (PAN_BO_ACCESS_RW);
      ret = panthor_kmod_bo_attach_sync_point(bo->kmod_bo, vm_sync_handle,
                                              vm_sync_signal_point,
                                              flags & PAN_BO_ACCESS_WRITE);
      if (ret)
         return ret;
   }

   /* And finally transfer the VM sync point to the context syncobj. */
   return drmSyncobjTransfer(panfrost_device_fd(dev), ctx->syncobj, 0,
                             vm_sync_handle, vm_sync_signal_point, 0);
}
505
506 static void
csf_check_ctx_state_and_reinit(struct panfrost_context * ctx)507 csf_check_ctx_state_and_reinit(struct panfrost_context *ctx)
508 {
509 struct panfrost_device *dev = pan_device(ctx->base.screen);
510 struct drm_panthor_group_get_state state = {
511 .group_handle = ctx->csf.group_handle,
512 };
513 int ret;
514
515 ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_GET_STATE,
516 &state);
517 if (ret) {
518 mesa_loge("DRM_IOCTL_PANTHOR_GROUP_GET_STATE failed (err=%d)", errno);
519 return;
520 }
521
522 /* Context is still usable. This was a transient error. */
523 if (state.state == 0)
524 return;
525
526 /* If the VM is unusable, we can't do much, as this is shared between all
527 * contexts, and restoring the VM state is non-trivial.
528 */
529 if (pan_kmod_vm_query_state(dev->kmod.vm) != PAN_KMOD_VM_USABLE) {
530 mesa_loge("VM became unusable, we can't reset the context");
531 assert(!"VM became unusable, we can't reset the context");
532 }
533
534 panfrost_context_reinit(ctx);
535 }
536
/*
 * Debug-only post-submit processing.  Under PAN_DBG_TRACE/PAN_DBG_SYNC,
 * wait for the submission to complete, decode the command streams with
 * pandecode, and abort if the error state captured by csf_emit_batch_end()
 * is non-zero.  No-op without those debug flags.
 */
static void
csf_submit_wait_and_dump(struct panfrost_batch *batch,
                         const struct drm_panthor_group_submit *gsubmit,
                         uint32_t vm_sync_handle, uint64_t vm_sync_signal_point)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_device *dev = pan_device(ctx->base.screen);
   bool wait = (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) && !ctx->is_noop;
   bool dump = (dev->debug & PAN_DBG_TRACE);
   bool crash = false;

   if (!wait && !dump)
      return;

   /* Wait so we can get errors reported back */
   if (wait) {
      int ret =
         drmSyncobjTimelineWait(panfrost_device_fd(dev), &vm_sync_handle,
                                &vm_sync_signal_point, 1, INT64_MAX, 0, NULL);
      assert(ret >= 0);

      /* Report how often the tiler-OOM handler triggered incremental
       * rendering for this batch, if it did at all. */
      struct pan_csf_tiler_oom_ctx *tiler_oom_ctx =
         batch->csf.tiler_oom_ctx.cpu;
      if (tiler_oom_ctx != NULL && tiler_oom_ctx->counter > 0) {
         perf_debug(ctx, "Incremental rendering was triggered %i time(s)",
                    tiler_oom_ctx->counter);
      }
   }

   /* Jobs won't be complete if blackhole rendering, that's ok */
   if (!ctx->is_noop && (dev->debug & PAN_DBG_SYNC) &&
       *((uint64_t *)batch->csf.cs.state.cpu) != 0) {
      crash = true;
      dump = true;
   }

   if (dump) {
      const struct drm_panthor_queue_submit *qsubmits =
         (void *)(uintptr_t)gsubmit->queue_submits.array;

      for (unsigned i = 0; i < gsubmit->queue_submits.count; i++) {
         uint32_t regs[256] = {0};
         pandecode_interpret_cs(dev->decode_ctx, qsubmits[i].stream_addr,
                                qsubmits[i].stream_size,
                                panfrost_device_gpu_id(dev), regs);
      }

      if (dev->debug & PAN_DBG_DUMP)
         pandecode_dump_mappings(dev->decode_ctx);
   }

   if (crash) {
      mesa_loge("Incomplete job or timeout\n");
      abort();
   }
}
593
/*
 * Close and submit the batch command stream.
 *
 * Collects the wait operations, reserves the next point on the shared VM
 * timeline, submits the scheduling group, then either attaches the new
 * sync point to the batch's BOs (success) or checks whether the context
 * needs a reset (failure).  Returns 0 on success, an error code otherwise.
 */
int
GENX(csf_submit_batch)(struct panfrost_batch *batch)
{
   int ret;

   /* Close the batch before submitting. */
   ret = csf_emit_batch_end(batch);
   if (ret)
      return ret;

   uint64_t cs_start = cs_root_chunk_gpu_addr(batch->csf.cs.builder);
   uint32_t cs_size = cs_root_chunk_size(batch->csf.cs.builder);
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_device *dev = pan_device(ctx->base.screen);
   uint32_t vm_sync_handle = panthor_kmod_vm_sync_handle(dev->kmod.vm);
   struct util_dynarray syncops;

   util_dynarray_init(&syncops, NULL);

   ret = csf_submit_collect_wait_ops(batch, &syncops, vm_sync_handle);
   if (ret)
      goto out_free_syncops;

   /* Lock the VM timeline and claim the next point; the unlock below
    * publishes the new point on success, or restores the old one if the
    * submission failed. */
   uint64_t vm_sync_cur_point = panthor_kmod_vm_sync_lock(dev->kmod.vm);
   uint64_t vm_sync_signal_point = vm_sync_cur_point + 1;

   struct drm_panthor_sync_op signalop = {
      .flags = DRM_PANTHOR_SYNC_OP_SIGNAL |
               DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ,
      .handle = vm_sync_handle,
      .timeline_value = vm_sync_signal_point,
   };

   util_dynarray_append(&syncops, struct drm_panthor_sync_op, signalop);

   struct drm_panthor_queue_submit qsubmit;
   struct drm_panthor_group_submit gsubmit;

   /* Single queue (index 0) carrying the whole batch. */
   csf_prepare_qsubmit(
      ctx, &qsubmit, 0, cs_start, cs_size, util_dynarray_begin(&syncops),
      util_dynarray_num_elements(&syncops, struct drm_panthor_sync_op));
   csf_prepare_gsubmit(ctx, &gsubmit, &qsubmit, 1);
   ret = csf_submit_gsubmit(ctx, &gsubmit);
   panthor_kmod_vm_sync_unlock(dev->kmod.vm,
                               ret ? vm_sync_cur_point : vm_sync_signal_point);

   if (!ret) {
      /* Optional debug wait/decode, then propagate the sync point to all
       * BOs touched by the batch. */
      csf_submit_wait_and_dump(batch, &gsubmit, vm_sync_handle,
                               vm_sync_signal_point);
      ret = csf_attach_sync_points(batch, vm_sync_handle, vm_sync_signal_point);
   } else {
      csf_check_ctx_state_and_reinit(batch->ctx);
   }

out_free_syncops:
   util_dynarray_fini(&syncops);
   return ret;
}
652
653 static uint64_t
csf_get_tiler_desc(struct panfrost_batch * batch)654 csf_get_tiler_desc(struct panfrost_batch *batch)
655 {
656 if (batch->tiler_ctx.valhall.desc)
657 return batch->tiler_ctx.valhall.desc;
658
659 struct panfrost_ptr t =
660 pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT);
661
662 batch->csf.pending_tiler_desc = t.cpu;
663 batch->tiler_ctx.valhall.desc = t.gpu;
664 return batch->tiler_ctx.valhall.desc;
665 }
666
/*
 * Fill the tiler context descriptor allocated by csf_get_tiler_desc(), if
 * one is pending.  Called once the framebuffer dimensions, tile size and
 * sample count are final; clears the pending pointer when done.
 */
static void
csf_emit_tiler_desc(struct panfrost_batch *batch, const struct pan_fb_info *fb)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_device *dev = pan_device(ctx->base.screen);

   if (!batch->csf.pending_tiler_desc)
      return;

   pan_pack(batch->csf.pending_tiler_desc, TILER_CONTEXT, tiler) {
      unsigned max_levels = dev->tiler_features.max_levels;
      assert(max_levels >= 2);

      /* TODO: Select hierarchy mask more effectively */
      tiler.hierarchy_mask = (max_levels >= 8) ? 0xFF : 0x28;

      /* For large framebuffers, disable the smallest bin size to
       * avoid pathological tiler memory usage. Required to avoid OOM
       * on dEQP-GLES31.functional.fbo.no_attachments.maximums.all on
       * Mali-G57.
       */
      if (MAX2(batch->key.width, batch->key.height) >= 4096)
         tiler.hierarchy_mask &= ~1;

      /* For effective tile size larger than 16x16, disable first level */
      if (fb->tile_size > 16 * 16)
         tiler.hierarchy_mask &= ~1;

      tiler.fb_width = batch->key.width;
      tiler.fb_height = batch->key.height;
      tiler.heap = batch->ctx->csf.heap.desc_bo->ptr.gpu;
      tiler.sample_pattern =
         pan_sample_pattern(util_framebuffer_get_num_samples(&batch->key));
      /* Provoking-vertex convention must match what the batch's draws
       * used (resolved when the first draw was recorded). */
      tiler.first_provoking_vertex =
         batch->first_provoking_vertex == U_TRISTATE_YES;
      tiler.geometry_buffer = ctx->csf.tmp_geom_bo->ptr.gpu;
      tiler.geometry_buffer_size = ctx->csf.tmp_geom_bo->kmod_bo->size;
   }

   batch->csf.pending_tiler_desc = NULL;
}
708
/*
 * Public hook called before the tiler is used: fills the pending tiler
 * context descriptor (if any) now that the framebuffer info is known.
 */
void
GENX(csf_prepare_tiler)(struct panfrost_batch *batch, struct pan_fb_info *fb)
{
   csf_emit_tiler_desc(batch, fb);
}
714
715 void
GENX(csf_preload_fb)716 GENX(csf_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb)
717 {
718 struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
719
720 GENX(pan_preload_fb)
721 (&dev->fb_preload_cache, &batch->pool.base, fb, batch->tls.gpu, NULL);
722 }
723
724 #define GET_FBD(_ctx, _pass) \
725 (_ctx)->fbds[PAN_INCREMENTAL_RENDERING_##_pass##_PASS]
726 #define EMIT_FBD(_ctx, _pass, _fb, _tls, _tiler_ctx) \
727 GET_FBD(_ctx, _pass).gpu |= \
728 GENX(pan_emit_fbd)(_fb, 0, _tls, _tiler_ctx, GET_FBD(_ctx, _pass).cpu)
729
/*
 * Emit the batch's framebuffer descriptors: the default FBD, plus — when
 * the batch recorded draws — the three incremental-rendering variants
 * stored in the tiler-OOM context:
 *   FIRST  - initial flush: keep results (no discard),
 *   MIDDLE - later flushes: additionally preload prior content, no clears,
 *   LAST   - final pass: preload, but honor the user's discard settings.
 */
void
GENX(csf_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb,
                    struct pan_tls_info *tls)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);

   /* Default framebuffer descriptor */

   batch->framebuffer.gpu |=
      GENX(pan_emit_fbd)(fb, 0, tls, &batch->tiler_ctx, batch->framebuffer.cpu);

   /* Without draws the tiler can't run out of memory, so the incremental
    * rendering FBDs are not needed. */
   if (batch->draw_count == 0)
      return;

   struct pan_csf_tiler_oom_ctx *tiler_oom_ctx = batch->csf.tiler_oom_ctx.cpu;
   struct pan_fb_info alt_fb;
   bool changed = false;

   /* First incremental rendering pass: don't discard result */

   memcpy(&alt_fb, fb, sizeof(alt_fb));
   for (unsigned i = 0; i < fb->rt_count; i++)
      alt_fb.rts[i].discard = false;
   alt_fb.zs.discard.z = false;
   alt_fb.zs.discard.s = false;

   EMIT_FBD(tiler_oom_ctx, FIRST, &alt_fb, tls, &batch->tiler_ctx);

   /* Subsequent incremental rendering passes: preload old content and don't
    * discard result */

   for (unsigned i = 0; i < fb->rt_count; i++) {
      if (fb->rts[i].view && !fb->rts[i].preload) {
         alt_fb.rts[i].preload = true;
         changed = true;
      }

      if (alt_fb.rts[i].clear) {
         alt_fb.rts[i].clear = false;
         changed = true;
      }
   }
   /* NOTE(review): a combined ZS view enables preload of both planes at
    * once; assumed intentional for interleaved Z/S formats — confirm
    * against pan_fb_info semantics. */
   if (fb->zs.view.zs && !fb->zs.preload.z && !fb->zs.preload.s) {
      alt_fb.zs.preload.z = true;
      alt_fb.zs.preload.s = true;
      changed = true;
   } else if (fb->zs.view.s && !fb->zs.preload.s) {
      alt_fb.zs.preload.s = true;
      changed = true;
   }

   if (alt_fb.zs.clear.z || alt_fb.zs.clear.s) {
      alt_fb.zs.clear.z = false;
      alt_fb.zs.clear.s = false;
      changed = true;
   }

   if (changed) {
      /* The preload descriptors were emitted for the original fb config;
       * reset the DCD pointer so fresh ones are emitted for alt_fb. */
      alt_fb.bifrost.pre_post.dcds.gpu = 0;
      GENX(pan_preload_fb)
      (&dev->fb_preload_cache, &batch->pool.base, &alt_fb, batch->tls.gpu, NULL);
   }

   EMIT_FBD(tiler_oom_ctx, MIDDLE, &alt_fb, tls, &batch->tiler_ctx);

   /* Last incremental rendering pass: preload previous content and deal with
    * results as specified by user */

   for (unsigned i = 0; i < fb->rt_count; i++)
      alt_fb.rts[i].discard = fb->rts[i].discard;
   alt_fb.zs.discard.z = fb->zs.discard.z;
   alt_fb.zs.discard.s = fb->zs.discard.s;

   EMIT_FBD(tiler_oom_ctx, LAST, &alt_fb, tls, &batch->tiler_ctx);
}
805
/*
 * Emit the fragment (render) part of the batch: finish tiling if draws
 * were recorded, program the fragment-job registers, run the fragment job,
 * and recycle tiler heap chunks afterwards.
 */
void
GENX(csf_emit_fragment_job)(struct panfrost_batch *batch,
                            const struct pan_fb_info *pfb)
{
   struct cs_builder *b = batch->csf.cs.builder;
   struct pan_csf_tiler_oom_ctx *oom_ctx = batch->csf.tiler_oom_ctx.cpu;

   if (batch->draw_count > 0) {
      /* Finish tiling and wait for IDVS and tiling */
      cs_finish_tiling(b, false);
      cs_wait_slot(b, 2, false);
      cs_vt_end(b, cs_now());
   }

   /* Set up the fragment job: FBD address in r40, bounding-box min/max
    * (packed y16|x16, max is inclusive) in r42/r43, r44-46 zeroed. */
   cs_move64_to(b, cs_reg64(b, 40), batch->framebuffer.gpu);
   cs_move32_to(b, cs_reg32(b, 42), (batch->miny << 16) | batch->minx);
   cs_move32_to(b, cs_reg32(b, 43),
                ((batch->maxy - 1) << 16) | (batch->maxx - 1));
   cs_move64_to(b, cs_reg64(b, 44), 0);
   cs_move32_to(b, cs_reg32(b, 46), 0);

   /* Use different framebuffer descriptor if incremental rendering was
    * triggered while tiling */
   if (batch->draw_count > 0) {
      struct cs_index counter = cs_reg32(b, 78);
      /* Load at offset 0 of the OOM context — assumed to be the trigger
       * counter, matching FIELD_OFFSET(counter) in the OOM handler; TODO
       * confirm against struct pan_csf_tiler_oom_ctx. */
      cs_load32_to(b, counter, cs_reg64(b, TILER_OOM_CTX_REG), 0);
      cs_wait_slot(b, 0, false);
      cs_if(b, MALI_CS_CONDITION_GREATER, counter) {
         cs_move64_to(b, cs_reg64(b, 40), GET_FBD(oom_ctx, LAST).gpu);
      }
   }

   /* Run the fragment job and wait */
   cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
   cs_wait_slot(b, 2, false);

   /* Gather freed heap chunks and add them to the heap context free list
    * so they can be re-used next time the tiler heap runs out of chunks.
    * That's what cs_finish_fragment() is all about. The list of freed
    * chunks is in the tiler context descriptor
    * (completed_{top,bottom fields}). */
   if (batch->draw_count > 0) {
      assert(batch->tiler_ctx.valhall.desc);
      cs_move64_to(b, cs_reg64(b, 90), batch->tiler_ctx.valhall.desc);
      cs_load_to(b, cs_reg_tuple(b, 86, 4), cs_reg64(b, 90), BITFIELD_MASK(4),
                 40);
      cs_wait_slot(b, 0, false);
      cs_finish_fragment(b, true, cs_reg64(b, 86), cs_reg64(b, 88), cs_now());
   }
}
857
858 static void
csf_emit_shader_regs(struct panfrost_batch * batch,enum pipe_shader_type stage,uint64_t shader)859 csf_emit_shader_regs(struct panfrost_batch *batch, enum pipe_shader_type stage,
860 uint64_t shader)
861 {
862 uint64_t resources = panfrost_emit_resources(batch, stage);
863
864 assert(stage == PIPE_SHADER_VERTEX || stage == PIPE_SHADER_FRAGMENT ||
865 stage == PIPE_SHADER_COMPUTE);
866
867 unsigned offset = (stage == PIPE_SHADER_FRAGMENT) ? 4 : 0;
868 unsigned fau_count = DIV_ROUND_UP(batch->nr_push_uniforms[stage], 2);
869
870 struct cs_builder *b = batch->csf.cs.builder;
871 cs_move64_to(b, cs_reg64(b, 0 + offset), resources);
872 cs_move64_to(b, cs_reg64(b, 8 + offset),
873 batch->push_uniforms[stage] | ((uint64_t)fau_count << 56));
874 cs_move64_to(b, cs_reg64(b, 16 + offset), shader);
875 }
876
/*
 * Record a compute dispatch on the batch command stream.  Handles both
 * direct and indirect grids; no-ops on an empty compute program.
 */
void
GENX(csf_launch_grid)(struct panfrost_batch *batch,
                      const struct pipe_grid_info *info)
{
   /* Empty compute programs are invalid and don't make sense */
   if (batch->rsd[PIPE_SHADER_COMPUTE] == 0)
      return;

   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_device *dev = pan_device(ctx->base.screen);
   struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE];
   struct cs_builder *b = batch->csf.cs.builder;

   /* Resource table, FAU pointer and shader address (r0/r8/r16). */
   csf_emit_shader_regs(batch, PIPE_SHADER_COMPUTE,
                        batch->rsd[PIPE_SHADER_COMPUTE]);

   /* Thread-local storage descriptor. */
   cs_move64_to(b, cs_reg64(b, 24), batch->tls.gpu);

   /* Global attribute offset */
   cs_move32_to(b, cs_reg32(b, 32), 0);

   /* Compute workgroup size */
   struct mali_compute_size_workgroup_packed wg_size;
   pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
      cfg.workgroup_size_x = info->block[0];
      cfg.workgroup_size_y = info->block[1];
      cfg.workgroup_size_z = info->block[2];

      /* Workgroups may be merged if the shader does not use barriers
       * or shared memory. This condition is checked against the
       * static shared_size at compile-time. We need to check the
       * variable shared size at launch_grid time, because the
       * compiler doesn't know about that.
       */
      cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups &&
                                     (info->variable_shared_mem == 0);
   }

   cs_move32_to(b, cs_reg32(b, 33), wg_size.opaque[0]);

   /* Offset */
   for (unsigned i = 0; i < 3; ++i)
      cs_move32_to(b, cs_reg32(b, 34 + i), 0);

   unsigned threads_per_wg = info->block[0] * info->block[1] * info->block[2];
   unsigned max_thread_cnt = panfrost_compute_max_thread_count(
      &dev->kmod.props, cs->info.work_reg_count);

   if (info->indirect) {
      /* Load size in workgroups per dimension from memory */
      struct cs_index address = cs_reg64(b, 64);
      cs_move64_to(
         b, address,
         pan_resource(info->indirect)->image.data.base + info->indirect_offset);

      struct cs_index grid_xyz = cs_reg_tuple(b, 37, 3);
      cs_load_to(b, grid_xyz, address, BITFIELD_MASK(3), 0);

      /* Wait for the load */
      cs_wait_slot(b, 0, false);

      /* Copy to FAU */
      for (unsigned i = 0; i < 3; ++i) {
         /* num_wg_sysval[i] is the address of the FAU slot mirroring the
          * grid size for the shader; 0 means the sysval is unused. */
         if (batch->num_wg_sysval[i]) {
            cs_move64_to(b, address, batch->num_wg_sysval[i]);
            cs_store(b, cs_extract32(b, grid_xyz, i), address, BITFIELD_MASK(1),
                     0);
         }
      }

      /* Wait for the stores */
      cs_wait_slot(b, 0, false);

      cs_run_compute_indirect(b, DIV_ROUND_UP(max_thread_cnt, threads_per_wg),
                              false, cs_shader_res_sel(0, 0, 0, 0));
   } else {
      /* Set size in workgroups per dimension immediately */
      for (unsigned i = 0; i < 3; ++i)
         cs_move32_to(b, cs_reg32(b, 37 + i), info->grid[i]);

      /* Pick the task_axis and task_increment to maximize thread utilization. */
      unsigned task_axis = MALI_TASK_AXIS_X;
      unsigned threads_per_task = threads_per_wg;
      unsigned task_increment = 0;

      for (unsigned i = 0; i < 3; i++) {
         if (threads_per_task * info->grid[i] >= max_thread_cnt) {
            /* We reached our thread limit, stop at the current axis and
             * calculate the increment so it doesn't exceed the per-core
             * thread capacity.
             */
            task_increment = max_thread_cnt / threads_per_task;
            break;
         } else if (task_axis == MALI_TASK_AXIS_Z) {
            /* We reached the Z axis, and there's still room to stuff more
             * threads. Pick the current axis grid size as our increment
             * as there's no point using something bigger.
             */
            task_increment = info->grid[i];
            break;
         }

         threads_per_task *= info->grid[i];
         task_axis++;
      }

      assert(task_axis <= MALI_TASK_AXIS_Z);
      assert(task_increment > 0);
      cs_run_compute(b, task_increment, task_axis, false,
                     cs_shader_res_sel(0, 0, 0, 0));
   }
}
989
/* Emit a compute dispatch that runs the transform-feedback variant of the
 * vertex shader: a (count x instance_count x 1) grid of 1x1x1 workgroups,
 * i.e. one thread per vertex per instance.
 */
void
GENX(csf_launch_xfb)(struct panfrost_batch *batch,
                     const struct pipe_draw_info *info, unsigned count)
{
   struct cs_builder *b = batch->csf.cs.builder;

   /* r24-25: TLS area */
   cs_move64_to(b, cs_reg64(b, 24), batch->tls.gpu);

   /* TODO: Indexing. Also, attribute_offset is a legacy feature.. */
   cs_move32_to(b, cs_reg32(b, 32), batch->ctx->offset_start);

   /* Compute workgroup size */
   struct mali_compute_size_workgroup_packed wg_size;
   pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
      cfg.workgroup_size_x = 1;
      cfg.workgroup_size_y = 1;
      cfg.workgroup_size_z = 1;

      /* Transform feedback shaders do not use barriers or
       * shared memory, so we may merge workgroups.
       */
      cfg.allow_merging_workgroups = true;
   }
   cs_move32_to(b, cs_reg32(b, 33), wg_size.opaque[0]);

   /* Offset (r34-36: workgroup offset in each dimension) */
   for (unsigned i = 0; i < 3; ++i)
      cs_move32_to(b, cs_reg32(b, 34 + i), 0);

   /* r37-39: grid size in workgroups per dimension */
   cs_move32_to(b, cs_reg32(b, 37), count);
   cs_move32_to(b, cs_reg32(b, 38), info->instance_count);
   cs_move32_to(b, cs_reg32(b, 39), 1);

   csf_emit_shader_regs(batch, PIPE_SHADER_VERTEX,
                        batch->rsd[PIPE_SHADER_VERTEX]);
   /* force a barrier to avoid read/write sync issues with buffers */
   cs_wait_slot(b, 2, false);

   /* XXX: Choose correctly */
   cs_run_compute(b, 1, MALI_TASK_AXIS_Z, false, cs_shader_res_sel(0, 0, 0, 0));
}
1031
/* Allocate and fill the per-batch tiler-OOM context consumed by the
 * tiler out-of-memory exception handler (incremental rendering), then
 * point the command stream at it via csf_update_tiler_oom_ctx().
 */
static void
emit_tiler_oom_context(struct cs_builder *b, struct panfrost_batch *batch)
{
   struct pan_csf_tiler_oom_ctx *ctx;

   batch->csf.tiler_oom_ctx =
      pan_pool_alloc_aligned(&batch->pool.base, sizeof(*ctx), 8);
   ctx = batch->csf.tiler_oom_ctx.cpu;

   ctx->tiler_desc = csf_get_tiler_desc(batch);
   /* Number of times the OOM handler has fired for this batch. */
   ctx->counter = 0;
   /* Bounding box packed as (y << 16) | x; max coordinates are inclusive. */
   ctx->bbox_min = (batch->miny << 16) | batch->minx;
   ctx->bbox_max = ((batch->maxy - 1) << 16) | (batch->maxx - 1);
   /* Scratch area the handler uses to save register state. */
   ctx->dump_addr = batch->ctx->csf.tiler_oom_handler.save_bo->ptr.gpu;

   /* One framebuffer descriptor per incremental-rendering pass. */
   for (unsigned i = 0; i < PAN_INCREMENTAL_RENDERING_PASS_COUNT; ++i)
      ctx->fbds[i] = alloc_fbd(batch);

   csf_update_tiler_oom_ctx(b, batch->csf.tiler_oom_ctx.gpu);
}
1052
/* Emit all draw state registers (shaders, TLS, tiler/scissor/ZS descriptors,
 * primitive and DCD flags) for an IDVS draw on the batch command stream.
 *
 * Returns the packed PRIMITIVE_FLAGS override word (draw mode, index type,
 * secondary-shader enable) to be passed to cs_run_idvs() by the caller.
 */
static uint32_t
csf_emit_draw_state(struct panfrost_batch *batch,
                    const struct pipe_draw_info *info, unsigned drawid_offset)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

   bool idvs = vs->info.vs.idvs;
   bool fs_required = panfrost_fs_required(
      fs, ctx->blend, &ctx->pipe_framebuffer, ctx->depth_stencil);
   bool secondary_shader = vs->info.vs.secondary_enable && fs_required;

   assert(idvs && "IDVS required for CSF");

   struct cs_builder *b = batch->csf.cs.builder;

   /* First draw of the batch: set up incremental-rendering state and
    * open the vertex/tiler section of the command stream.
    */
   if (batch->draw_count == 0) {
      emit_tiler_oom_context(b, batch);
      cs_vt_start(batch->csf.cs.builder, cs_now());
   }

   csf_emit_shader_regs(batch, PIPE_SHADER_VERTEX,
                        panfrost_get_position_shader(batch, info));

   if (fs_required) {
      csf_emit_shader_regs(batch, PIPE_SHADER_FRAGMENT,
                           batch->rsd[PIPE_SHADER_FRAGMENT]);
   } else {
      /* No fragment shader: clear its shader, resource and FAU registers. */
      cs_move64_to(b, cs_reg64(b, 4), 0);
      cs_move64_to(b, cs_reg64(b, 12), 0);
      cs_move64_to(b, cs_reg64(b, 20), 0);
   }

   if (secondary_shader) {
      cs_move64_to(b, cs_reg64(b, 18), panfrost_get_varying_shader(batch));
   }

   /* r24-25 / r30-31: TLS areas */
   cs_move64_to(b, cs_reg64(b, 24), batch->tls.gpu);
   cs_move64_to(b, cs_reg64(b, 30), batch->tls.gpu);
   cs_move32_to(b, cs_reg32(b, 32), 0);
   cs_move32_to(b, cs_reg32(b, 37), 0);
   cs_move32_to(b, cs_reg32(b, 38), 0);

   /* r40-41: tiler descriptor */
   cs_move64_to(b, cs_reg64(b, 40), csf_get_tiler_desc(batch));

   /* The scissor descriptor fits in a single 64-bit register. */
   STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR));
   STATIC_ASSERT(sizeof(uint64_t) == pan_size(SCISSOR));
   uint64_t *sbd = (uint64_t *)&batch->scissor[0];
   cs_move64_to(b, cs_reg64(b, 42), *sbd);

   /* r44/r45: depth range as raw float bits */
   cs_move32_to(b, cs_reg32(b, 44), fui(batch->minimum_z));
   cs_move32_to(b, cs_reg32(b, 45), fui(batch->maximum_z));

   if (ctx->occlusion_query && ctx->active_queries) {
      struct panfrost_resource *rsrc = pan_resource(ctx->occlusion_query->rsrc);
      cs_move64_to(b, cs_reg64(b, 46), rsrc->image.data.base);
      /* NOTE(review): writes through ctx->batch rather than the batch
       * parameter — presumably identical at this point; confirm. */
      panfrost_batch_write_rsrc(ctx->batch, rsrc, PIPE_SHADER_FRAGMENT);
   }

   /* r48: vertex attribute stride, r50-51: blend descriptor + RT count,
    * r52-53: depth/stencil descriptor */
   cs_move32_to(b, cs_reg32(b, 48), panfrost_vertex_attribute_stride(vs, fs));
   cs_move64_to(b, cs_reg64(b, 50),
                batch->blend | MAX2(batch->key.nr_cbufs, 1));
   cs_move64_to(b, cs_reg64(b, 52), batch->depth_stencil);

   if (info->index_size)
      cs_move64_to(b, cs_reg64(b, 54), batch->indices);

   struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;

   struct mali_primitive_flags_packed primitive_flags;
   pan_pack(&primitive_flags, PRIMITIVE_FLAGS, cfg) {
      if (panfrost_writes_point_size(ctx))
         cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;

      cfg.allow_rotating_primitives = allow_rotating_primitives(fs, info);

      cfg.low_depth_cull = rast->depth_clip_near;
      cfg.high_depth_cull = rast->depth_clip_far;

      /* Non-fixed restart indices should have been lowered */
      assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info));
      cfg.primitive_restart = info->primitive_restart;

      cfg.position_fifo_format = panfrost_writes_point_size(ctx)
                                    ? MALI_FIFO_FORMAT_EXTENDED
                                    : MALI_FIFO_FORMAT_BASIC;
   }

   cs_move32_to(b, cs_reg32(b, 56), primitive_flags.opaque[0]);

   struct mali_dcd_flags_0_packed dcd_flags0;
   struct mali_dcd_flags_1_packed dcd_flags1;

   pan_pack(&dcd_flags0, DCD_FLAGS_0, cfg) {
      enum mesa_prim reduced_mode = u_reduced_prim(info->mode);
      bool polygon = reduced_mode == MESA_PRIM_TRIANGLES;
      bool lines = reduced_mode == MESA_PRIM_LINES;

      /*
       * From the Gallium documentation,
       * pipe_rasterizer_state::cull_face "indicates which faces of
       * polygons to cull". Points and lines are not considered
       * polygons and should be drawn even if all faces are culled.
       * The hardware does not take primitive type into account when
       * culling, so we need to do that check ourselves.
       */
      cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT);
      cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK);
      cfg.front_face_ccw = rast->front_ccw;

      cfg.multisample_enable = rast->multisample;

      /* Use per-sample shading if required by API Also use it when a
       * blend shader is used with multisampling, as this is handled
       * by a single ST_TILE in the blend shader with the current
       * sample ID, requiring per-sample shading.
       */
      cfg.evaluate_per_sample =
         (rast->multisample &&
          ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader));

      cfg.single_sampled_lines = !rast->multisample;

      /* Smooth lines are rendered with multisampling. */
      if (lines && rast->line_smooth) {
         cfg.multisample_enable = true;
         cfg.single_sampled_lines = false;
      }

      bool has_oq = ctx->occlusion_query && ctx->active_queries;
      if (has_oq) {
         if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
            cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
         else
            cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;
      }

      if (fs_required) {
         struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
            fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
            ctx->blend->base.alpha_to_coverage,
            ctx->depth_stencil->zs_always_passes);

         cfg.pixel_kill_operation = earlyzs.kill;
         cfg.zs_update_operation = earlyzs.update;

         cfg.allow_forward_pixel_to_kill =
            pan_allow_forward_pixel_to_kill(ctx, fs);
         cfg.allow_forward_pixel_to_be_killed = !fs->info.writes_global;

         cfg.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
         cfg.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);

         /* Also use per-sample shading if required by the shader
          */
         cfg.evaluate_per_sample |= fs->info.fs.sample_shading;

         /* Unlike Bifrost, alpha-to-coverage must be included in
          * this identically-named flag. Confusing, isn't it?
          */
         cfg.shader_modifies_coverage = fs->info.fs.writes_coverage ||
                                        fs->info.fs.can_discard ||
                                        ctx->blend->base.alpha_to_coverage;

         cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
      } else {
         /* These operations need to be FORCE to benefit from the
          * depth-only pass optimizations.
          */
         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
         cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;

         /* No shader and no blend => no shader or blend
          * reasons to disable FPK. The only FPK-related state
          * not covered is alpha-to-coverage which we don't set
          * without blend.
          */
         cfg.allow_forward_pixel_to_kill = true;

         /* No shader => no shader side effects */
         cfg.allow_forward_pixel_to_be_killed = true;

         /* Alpha isn't written so these are vacuous */
         cfg.overdraw_alpha0 = true;
         cfg.overdraw_alpha1 = true;
      }
   }

   pan_pack(&dcd_flags1, DCD_FLAGS_1, cfg) {
      cfg.sample_mask = rast->multisample ? ctx->sample_mask : 0xFFFF;

      if (fs_required) {
         /* See JM Valhall equivalent code */
         cfg.render_target_mask =
            (fs->info.outputs_written >> FRAG_RESULT_DATA0) & ctx->fb_rt_mask;
      }
   }

   cs_move32_to(b, cs_reg32(b, 57), dcd_flags0.opaque[0]);
   cs_move32_to(b, cs_reg32(b, 58), dcd_flags1.opaque[0]);

   /* r60-61: primitive size (point size / line width) descriptor */
   struct mali_primitive_size_packed primsize;
   panfrost_emit_primitive_size(ctx, info->mode == MESA_PRIM_POINTS, 0,
                                &primsize);
   struct mali_primitive_size_packed *primsize_ptr = &primsize;
   cs_move64_to(b, cs_reg64(b, 60), *((uint64_t*)primsize_ptr));

   struct mali_primitive_flags_packed flags_override;
   /* Pack with nodefaults so only explicitly set override fields affect the
    * previously set register values */
   pan_pack_nodefaults(&flags_override, PRIMITIVE_FLAGS, cfg) {
      cfg.draw_mode = pan_draw_mode(info->mode);
      cfg.index_type = panfrost_translate_index_size(info->index_size);
      cfg.secondary_shader = secondary_shader;
   };

   return flags_override.opaque[0];
}
1271
1272 static struct cs_index
csf_emit_draw_id_register(struct panfrost_batch * batch,unsigned offset)1273 csf_emit_draw_id_register(struct panfrost_batch *batch, unsigned offset)
1274 {
1275 struct cs_builder *b = batch->csf.cs.builder;
1276 struct panfrost_context *ctx = batch->ctx;
1277 struct panfrost_uncompiled_shader *vs = ctx->uncompiled[PIPE_SHADER_VERTEX];
1278
1279 if (!BITSET_TEST(vs->nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID))
1280 return cs_undef();
1281
1282 struct cs_index drawid = cs_reg32(b, 67);
1283 cs_move32_to(b, drawid, offset);
1284
1285 return drawid;
1286 }
1287
/* Emit an Index-Driven Vertex Shading (IDVS) draw for a direct
 * (non-indirect) draw call.
 */
void
GENX(csf_launch_draw)(struct panfrost_batch *batch,
                      const struct pipe_draw_info *info, unsigned drawid_offset,
                      const struct pipe_draw_start_count_bias *draw,
                      unsigned vertex_count)
{
   struct cs_builder *b = batch->csf.cs.builder;

   uint32_t flags_override = csf_emit_draw_state(batch, info, drawid_offset);
   struct cs_index drawid = csf_emit_draw_id_register(batch, drawid_offset);

   /* r33: vertex count, r34: instance count, r35: index offset */
   cs_move32_to(b, cs_reg32(b, 33), draw->count);
   cs_move32_to(b, cs_reg32(b, 34), info->instance_count);
   cs_move32_to(b, cs_reg32(b, 35), 0);

   /* Base vertex offset on Valhall is used for both indexed and
    * non-indexed draws, in a simple way for either. Handle both cases.
    */
   if (info->index_size) {
      /* r36: base vertex (index bias), r39: index buffer size in bytes */
      cs_move32_to(b, cs_reg32(b, 36), draw->index_bias);
      cs_move32_to(b, cs_reg32(b, 39), info->index_size * draw->count);
   } else {
      cs_move32_to(b, cs_reg32(b, 36), draw->start);
      cs_move32_to(b, cs_reg32(b, 39), 0);
   }

   cs_run_idvs(b, flags_override, false, true, cs_shader_res_sel(0, 0, 1, 0),
               cs_shader_res_sel(2, 2, 2, 0), drawid);
}
1317
/* Emit IDVS draws for an indirect multi-draw: a GPU-side loop walks the
 * draw_count records in the indirect buffer, loading each draw's
 * parameters from memory before issuing cs_run_idvs().
 */
void
GENX(csf_launch_draw_indirect)(struct panfrost_batch *batch,
                               const struct pipe_draw_info *info,
                               unsigned drawid_offset,
                               const struct pipe_draw_indirect_info *indirect)
{
   struct cs_builder *b = batch->csf.cs.builder;

   uint32_t flags_override = csf_emit_draw_state(batch, info, drawid_offset);
   struct cs_index drawid = csf_emit_draw_id_register(batch, drawid_offset);

   /* r64-65: current indirect record address, r66: draws remaining */
   struct cs_index address = cs_reg64(b, 64);
   struct cs_index counter = cs_reg32(b, 66);
   cs_move64_to(
      b, address,
      pan_resource(indirect->buffer)->image.data.base + indirect->offset);
   cs_move32_to(b, counter, indirect->draw_count);

   cs_while(b, MALI_CS_CONDITION_GREATER, counter) {
      if (info->index_size) {
         /* loads vertex count, instance count, index offset, vertex offset */
         cs_load_to(b, cs_reg_tuple(b, 33, 4), address, BITFIELD_MASK(4), 0);
         cs_move32_to(b, cs_reg32(b, 39), info->index.resource->width0);
      } else {
         /* vertex count, instance count */
         cs_load_to(b, cs_reg_tuple(b, 33, 2), address, BITFIELD_MASK(2), 0);
         cs_move32_to(b, cs_reg32(b, 35), 0);
         cs_load_to(b, cs_reg_tuple(b, 36, 1), address, BITFIELD_MASK(1),
                    2 * sizeof(uint32_t)); // instance offset
         cs_move32_to(b, cs_reg32(b, 37), 0);
         cs_move32_to(b, cs_reg32(b, 39), 0);
      }

      /* Wait for the parameter loads above before launching the draw. */
      cs_wait_slot(b, 0, false);
      cs_run_idvs(b, flags_override, false, true, cs_shader_res_sel(0, 0, 1, 0),
                  cs_shader_res_sel(2, 2, 2, 0), drawid);

      /* Step to the next record; adding -1 decrements the counter. */
      cs_add64(b, address, address, indirect->stride);
      cs_add32(b, counter, counter, (unsigned int)-1);
      if (drawid.type != CS_INDEX_UNDEF)
         cs_add32(b, drawid, drawid, 1);
   }
}
1361
1362 #define POSITION_FIFO_SIZE (64 * 1024)
1363
1364 static enum drm_panthor_group_priority
get_panthor_group_priority(struct panfrost_context * ctx)1365 get_panthor_group_priority(struct panfrost_context *ctx)
1366 {
1367 if (ctx->flags & PIPE_CONTEXT_REALTIME_PRIORITY)
1368 return PANTHOR_GROUP_PRIORITY_REALTIME;
1369 else if (ctx->flags & PIPE_CONTEXT_HIGH_PRIORITY)
1370 return PANTHOR_GROUP_PRIORITY_HIGH;
1371 else if (ctx->flags & PIPE_CONTEXT_LOW_PRIORITY)
1372 return PANTHOR_GROUP_PRIORITY_LOW;
1373
1374 return PANTHOR_GROUP_PRIORITY_MEDIUM;
1375 }
1376
/* One-time CSF context setup: creates the panthor scheduling group and
 * tiler heap, allocates the heap descriptor and temporary geometry BOs,
 * then builds and submits a short init command stream that binds the
 * tiler heap and installs the tiler-OOM exception handler. Blocks until
 * the init stream has executed.
 *
 * Returns 0 on success, -1 on failure (all partially-created kernel
 * objects are torn down via the goto-cleanup chain).
 */
int
GENX(csf_init_context)(struct panfrost_context *ctx)
{
   struct panfrost_device *dev = pan_device(ctx->base.screen);
   struct drm_panthor_queue_create qc[] = {{
      .priority = 1,
      .ringbuf_size = 64 * 1024,
   }};

   struct drm_panthor_group_create gc = {
      .compute_core_mask = dev->kmod.props.shader_present,
      .fragment_core_mask = dev->kmod.props.shader_present,
      .tiler_core_mask = 1,
      .max_compute_cores = util_bitcount64(dev->kmod.props.shader_present),
      .max_fragment_cores = util_bitcount64(dev->kmod.props.shader_present),
      .max_tiler_cores = 1,
      .priority = get_panthor_group_priority(ctx),
      .queues = DRM_PANTHOR_OBJ_ARRAY(ARRAY_SIZE(qc), qc),
      .vm_id = pan_kmod_vm_handle(dev->kmod.vm),
   };

   int ret =
      drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_CREATE, &gc);

   if (ret)
      goto err_group_create;

   ctx->csf.group_handle = gc.group_handle;

   /* Pre-filled destroy arg, used by the error-cleanup path below. */
   struct drm_panthor_group_destroy gd = {
      .group_handle = ctx->csf.group_handle,
   };

   /* Get tiler heap */
   struct drm_panthor_tiler_heap_create thc = {
      .vm_id = pan_kmod_vm_handle(dev->kmod.vm),
      .chunk_size = pan_screen(ctx->base.screen)->csf_tiler_heap.chunk_size,
      .initial_chunk_count =
         pan_screen(ctx->base.screen)->csf_tiler_heap.initial_chunks,
      .max_chunks = pan_screen(ctx->base.screen)->csf_tiler_heap.max_chunks,
      .target_in_flight = 65535,
   };
   ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE,
                  &thc);

   if (ret)
      goto err_tiler_heap;

   ctx->csf.heap.handle = thc.handle;

   /* Pre-filled destroy arg, used by the error-cleanup path below. */
   struct drm_panthor_tiler_heap_destroy thd = {
      .handle = ctx->csf.heap.handle,
   };

   ctx->csf.heap.desc_bo =
      panfrost_bo_create(dev, pan_size(TILER_HEAP), 0, "Tiler Heap");

   if (ctx->csf.heap.desc_bo == NULL)
      goto err_tiler_heap_desc_bo;

   /* Fill the CPU-visible tiler heap descriptor from the kernel's answer. */
   pan_cast_and_pack(ctx->csf.heap.desc_bo->ptr.cpu, TILER_HEAP, heap) {
      heap.size = pan_screen(ctx->base.screen)->csf_tiler_heap.chunk_size;
      heap.base = thc.first_heap_chunk_gpu_va;
      heap.bottom = heap.base + 64;
      heap.top = heap.base + heap.size;
   }

   ctx->csf.tmp_geom_bo = panfrost_bo_create(
      dev, POSITION_FIFO_SIZE, PAN_BO_INVISIBLE, "Temporary Geometry buffer");

   if (ctx->csf.tmp_geom_bo == NULL)
      goto err_tiler_heap_tmp_geom_bo;

   /* Setup the tiler heap */
   struct panfrost_bo *cs_bo =
      panfrost_bo_create(dev, 4096, 0, "Temporary CS buffer");

   if (cs_bo == NULL)
      goto err_tiler_heap_cs_bo;

   if (csf_oom_handler_init(ctx))
      goto err_g_submit;

   /* Build the one-shot init command stream in cs_bo. */
   struct cs_buffer init_buffer = {
      .cpu = cs_bo->ptr.cpu,
      .gpu = cs_bo->ptr.gpu,
      .capacity = panfrost_bo_size(cs_bo) / sizeof(uint64_t),
   };
   const struct cs_builder_conf bconf = {
      .nr_registers = 96,
      .nr_kernel_registers = 4,
   };
   struct cs_builder b;
   cs_builder_init(&b, &bconf, init_buffer);
   /* Bind the kernel-provided tiler heap context. */
   struct cs_index heap = cs_reg64(&b, 72);
   cs_move64_to(&b, heap, thc.tiler_heap_ctx_gpu_va);
   cs_heap_set(&b, heap);

   /* Install the tiler-OOM exception handler. */
   struct cs_index addr_reg = cs_reg64(&b, 86);
   struct cs_index length_reg = cs_reg32(&b, 88);
   cs_move64_to(&b, addr_reg, ctx->csf.tiler_oom_handler.cs_bo->ptr.gpu);
   cs_move32_to(&b, length_reg, ctx->csf.tiler_oom_handler.length);
   cs_set_exception_handler(&b, MALI_CS_EXCEPTION_TYPE_TILER_OOM,
                            addr_reg, length_reg);

   struct drm_panthor_queue_submit qsubmit;
   struct drm_panthor_group_submit gsubmit;
   struct drm_panthor_sync_op sync = {
      .flags =
         DRM_PANTHOR_SYNC_OP_SIGNAL | DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ,
      .handle = ctx->syncobj,
   };

   assert(cs_is_valid(&b));
   cs_finish(&b);

   uint64_t cs_start = cs_root_chunk_gpu_addr(&b);
   uint32_t cs_size = cs_root_chunk_size(&b);

   csf_prepare_qsubmit(ctx, &qsubmit, 0, cs_start, cs_size, &sync, 1);
   csf_prepare_gsubmit(ctx, &gsubmit, &qsubmit, 1);
   ret = csf_submit_gsubmit(ctx, &gsubmit);

   if (dev->debug & PAN_DBG_TRACE) {
      uint32_t regs[256] = {0};
      pandecode_interpret_cs(dev->decode_ctx, qsubmit.stream_addr,
                             qsubmit.stream_size, panfrost_device_gpu_id(dev),
                             regs);
   }

   if (ret)
      goto err_g_submit;

   /* Wait before freeing the buffer. */
   ret = drmSyncobjWait(panfrost_device_fd(dev), &ctx->syncobj, 1, INT64_MAX, 0,
                        NULL);
   assert(!ret);

   panfrost_bo_unreference(cs_bo);

   ctx->csf.is_init = true;
   return 0;
err_g_submit:
   panfrost_bo_unreference(cs_bo);
err_tiler_heap_cs_bo:
   panfrost_bo_unreference(ctx->csf.tmp_geom_bo);
err_tiler_heap_tmp_geom_bo:
   panfrost_bo_unreference(ctx->csf.heap.desc_bo);
err_tiler_heap_desc_bo:
   drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY,
            &thd);
err_tiler_heap:
   drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_DESTROY, &gd);
err_group_create:
   return -1;
}
1533
1534 void
GENX(csf_cleanup_context)1535 GENX(csf_cleanup_context)(struct panfrost_context *ctx)
1536 {
1537 if (!ctx->csf.is_init)
1538 return;
1539
1540 struct panfrost_device *dev = pan_device(ctx->base.screen);
1541 struct drm_panthor_tiler_heap_destroy thd = {
1542 .handle = ctx->csf.heap.handle,
1543 };
1544 int ret;
1545
1546 /* Make sure all jobs are done before destroying the heap. */
1547 ret = drmSyncobjWait(panfrost_device_fd(dev), &ctx->syncobj, 1, INT64_MAX, 0,
1548 NULL);
1549 assert(!ret);
1550
1551 ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY,
1552 &thd);
1553 assert(!ret);
1554
1555 struct drm_panthor_group_destroy gd = {
1556 .group_handle = ctx->csf.group_handle,
1557 };
1558
1559 ret =
1560 drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_DESTROY, &gd);
1561 assert(!ret);
1562
1563 panfrost_bo_unreference(ctx->csf.tmp_geom_bo);
1564 panfrost_bo_unreference(ctx->csf.heap.desc_bo);
1565 panfrost_bo_unreference(ctx->csf.tiler_oom_handler.cs_bo);
1566 panfrost_bo_unreference(ctx->csf.tiler_oom_handler.save_bo);
1567 ctx->csf.is_init = false;
1568 }
1569
1570 void
GENX(csf_emit_write_timestamp)1571 GENX(csf_emit_write_timestamp)(struct panfrost_batch *batch,
1572 struct panfrost_resource *dst, unsigned offset)
1573 {
1574 struct cs_builder *b = batch->csf.cs.builder;
1575
1576 struct cs_index address = cs_reg64(b, 40);
1577 cs_move64_to(b, address,
1578 dst->image.data.base + dst->image.data.offset + offset);
1579 cs_store_state(b, address, 0, MALI_CS_STATE_TIMESTAMP, cs_now());
1580
1581 panfrost_batch_write_rsrc(batch, dst, PIPE_SHADER_VERTEX);
1582 }
1583