1 /*
2  * Copyright (C) 2023 Collabora Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "decode.h"
25 
26 #include "drm-uapi/panthor_drm.h"
27 
28 #include "genxml/cs_builder.h"
29 #include "panfrost/lib/genxml/cs_builder.h"
30 
31 #include "gen_macros.h"
32 #include "pan_cmdstream.h"
33 #include "pan_context.h"
34 #include "pan_csf.h"
35 #include "pan_fb_preload.h"
36 #include "pan_job.h"
37 
38 #if PAN_ARCH < 10
39 #error "CSF helpers are only used for gen >= 10"
40 #endif
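/* This file emits CSF (Command Stream Frontend) command streams with the
 * cs_builder helpers and hands them to the kernel through the Panthor uAPI
 * (DRM_IOCTL_PANTHOR_*), which is why it is restricted to gen >= 10 above.
 */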
41 
42 static struct cs_buffer
43 csf_alloc_cs_buffer(void *cookie)
44 {
45    assert(cookie && "Self-contained queues can't be extended.");
46 
47    struct panfrost_batch *batch = cookie;
48    unsigned capacity = 4096;
49 
50    struct panfrost_ptr ptr =
51       pan_pool_alloc_aligned(&batch->csf.cs_chunk_pool.base, capacity * 8, 64);
52 
53    return (struct cs_buffer){
54       .cpu = ptr.cpu,
55       .gpu = ptr.gpu,
56       .capacity = capacity,
57    };
58 }
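/* Chunk allocator hooked into cs_builder: whenever the current CS chunk is
 * full, a new one of 4096 64-bit entries (32 kB) is carved out of the
 * batch's cs_chunk_pool. The cookie is the owning batch, set up in
 * GENX(csf_init_batch).
 */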
59 
60 /*
61  * This register is reserved to pass the batch's tiler OOM context address.
62  */
63 #define TILER_OOM_CTX_REG 76
64 
65 static enum cs_reg_perm
66 csf_reg_perm_cb(struct cs_builder *b, unsigned reg)
67 {
68    if (reg == TILER_OOM_CTX_REG)
69       return CS_REG_RD;
70    return CS_REG_RW;
71 }
72 
73 static void
74 csf_update_tiler_oom_ctx(struct cs_builder *b, uint64_t addr)
75 {
76    reg_perm_cb_t orig_cb;
77 
78    if (likely(!b->conf.reg_perm)) {
79       cs_move64_to(b, cs_reg64(b, TILER_OOM_CTX_REG), addr);
80       return;
81    }
82 
83    orig_cb = b->conf.reg_perm;
84    b->conf.reg_perm = NULL;
85    cs_move64_to(b, cs_reg64(b, TILER_OOM_CTX_REG), addr);
86    b->conf.reg_perm = orig_cb;
87 }
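/* With PAN_DBG_CS set, the builder validates register accesses through
 * csf_reg_perm_cb(), which marks TILER_OOM_CTX_REG read-only so normal
 * command stream emission can't clobber it. csf_update_tiler_oom_ctx()
 * temporarily drops the callback so the driver itself can still load a new
 * tiler OOM context address into that register.
 */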
88 
89 #define FIELD_OFFSET(_name) offsetof(struct pan_csf_tiler_oom_ctx, _name)
90 
91 #define FBD_OFFSET(_pass)                                                      \
92    (FIELD_OFFSET(fbds) +                                                       \
93     (PAN_INCREMENTAL_RENDERING_##_pass##_PASS * sizeof(struct panfrost_ptr)) + \
94     offsetof(struct panfrost_ptr, gpu))
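/* FBD_OFFSET(MIDDLE), for instance, expands to the byte offset of
 * fbds[PAN_INCREMENTAL_RENDERING_MIDDLE_PASS].gpu inside
 * struct pan_csf_tiler_oom_ctx, i.e. the GPU address of the framebuffer
 * descriptor the OOM handler loads for that incremental-rendering pass.
 */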
95 
96 static int
97 csf_oom_handler_init(struct panfrost_context *ctx)
98 {
99    struct panfrost_bo *cs_bo = NULL, *reg_save_bo = NULL;
100    struct panfrost_device *dev = pan_device(ctx->base.screen);
101 
102    cs_bo =
103       panfrost_bo_create(dev, 4096, 0, "Temporary CS buffer");
104    reg_save_bo =
105       panfrost_bo_create(dev, 4096, 0, "reg save bo");
106 
107    if (!cs_bo || !reg_save_bo)
108       goto fail;
109 
110    struct cs_buffer queue = {
111       .cpu = cs_bo->ptr.cpu,
112       .gpu = cs_bo->ptr.gpu,
113       .capacity = panfrost_bo_size(cs_bo) / sizeof(uint64_t),
114    };
115    struct cs_builder b;
116    const struct cs_builder_conf conf = {
117       .nr_registers = 96,
118       .nr_kernel_registers = 4,
119       .reg_perm = (dev->debug & PAN_DBG_CS) ? csf_reg_perm_cb : NULL,
120    };
121    cs_builder_init(&b, &conf, queue);
122 
123    struct cs_exception_handler_ctx handler_ctx = {
124       .ctx_reg = cs_reg64(&b, TILER_OOM_CTX_REG),
125       .dump_addr_offset = offsetof(struct pan_csf_tiler_oom_ctx, dump_addr),
126       .ls_sb_slot = 0,
127    };
128    struct cs_exception_handler handler;
129 
130    cs_exception_handler_def(&b, &handler, handler_ctx) {
131       struct cs_index tiler_oom_ctx = cs_reg64(&b, TILER_OOM_CTX_REG);
132       struct cs_index counter = cs_reg32(&b, 47);
133       struct cs_index zero = cs_reg64(&b, 48);
134       struct cs_index flush_id = cs_reg32(&b, 48);
135       struct cs_index tiler_ctx = cs_reg64(&b, 50);
136       struct cs_index completed_top = cs_reg64(&b, 52);
137       struct cs_index completed_bottom = cs_reg64(&b, 54);
138       struct cs_index completed_chunks = cs_reg_tuple(&b, 52, 4);
139 
140       /* Use a different framebuffer descriptor depending on whether incremental
141        * rendering has already been triggered. */
142       cs_load32_to(&b, counter, tiler_oom_ctx, FIELD_OFFSET(counter));
143       cs_wait_slot(&b, 0, false);
144       cs_if(&b, MALI_CS_CONDITION_GREATER, counter) {
145          cs_load64_to(&b, cs_reg64(&b, 40), tiler_oom_ctx, FBD_OFFSET(MIDDLE));
146       }
147       cs_else(&b) {
148          cs_load64_to(&b, cs_reg64(&b, 40), tiler_oom_ctx, FBD_OFFSET(FIRST));
149       }
150 
151       cs_load32_to(&b, cs_reg32(&b, 42), tiler_oom_ctx, FIELD_OFFSET(bbox_min));
152       cs_load32_to(&b, cs_reg32(&b, 43), tiler_oom_ctx, FIELD_OFFSET(bbox_max));
153       cs_move64_to(&b, cs_reg64(&b, 44), 0);
154       cs_move32_to(&b, cs_reg32(&b, 46), 0);
155       cs_wait_slot(&b, 0, false);
156 
157       /* Run the fragment job and wait */
158       cs_set_scoreboard_entry(&b, 3, 0);
159       cs_run_fragment(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
160       cs_wait_slot(&b, 3, false);
161 
162       /* Increment counter */
163       cs_add32(&b, counter, counter, 1);
164       cs_store32(&b, counter, tiler_oom_ctx, FIELD_OFFSET(counter));
165 
166       /* Load completed chunks */
167       cs_load64_to(&b, tiler_ctx, tiler_oom_ctx, FIELD_OFFSET(tiler_desc));
168       cs_wait_slot(&b, 0, false);
169       cs_load_to(&b, completed_chunks, tiler_ctx, BITFIELD_MASK(4), 10 * 4);
170       cs_wait_slot(&b, 0, false);
171 
172       cs_finish_fragment(&b, false, completed_top, completed_bottom, cs_now());
173 
174       /* Zero out polygon list, completed_top and completed_bottom */
175       cs_move64_to(&b, zero, 0);
176       cs_store64(&b, zero, tiler_ctx, 0);
177       cs_store64(&b, zero, tiler_ctx, 10 * 4);
178       cs_store64(&b, zero, tiler_ctx, 12 * 4);
179 
180       /* We need to flush the texture caches so future preloads see the new
181        * content. */
182       cs_flush_caches(&b, MALI_CS_FLUSH_MODE_NONE, MALI_CS_FLUSH_MODE_NONE,
183                       true, flush_id, cs_defer(0, 0));
184 
185       cs_wait_slot(&b, 0, false);
186 
187       cs_set_scoreboard_entry(&b, 2, 0);
188    }
189 
190    assert(cs_is_valid(&b));
191    cs_finish(&b);
192    ctx->csf.tiler_oom_handler.cs_bo = cs_bo;
193    ctx->csf.tiler_oom_handler.length = handler.length * sizeof(uint64_t);
194    ctx->csf.tiler_oom_handler.save_bo = reg_save_bo;
195 
196    return 0;
197 
198 fail:
199    if (cs_bo)
200       panfrost_bo_unreference(cs_bo);
201 
202    if (reg_save_bo)
203       panfrost_bo_unreference(reg_save_bo);
204 
205    return -1;
206 }
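/* The handler built above runs on a tiler-OOM exception: it selects the
 * FIRST or MIDDLE framebuffer descriptor depending on how many incremental
 * passes already ran, issues a fragment job to flush the partial render,
 * bumps the counter, returns the completed heap chunks through
 * cs_finish_fragment(), zeroes the polygon list and completed_top/bottom
 * fields, and flushes the texture caches so later preloads see the result.
 */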
207 
208 #undef FBD_OFFSET
209 #undef FIELD_OFFSET
210 
211 void
212 GENX(csf_cleanup_batch)(struct panfrost_batch *batch)
213 {
214    free(batch->csf.cs.builder);
215    free(batch->csf.cs.ls_tracker);
216 
217    panfrost_pool_cleanup(&batch->csf.cs_chunk_pool);
218 }
219 
220 static inline struct panfrost_ptr
221 alloc_fbd(struct panfrost_batch *batch)
222 {
223    return pan_pool_alloc_desc_aggregate(
224       &batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION),
225       PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
226 }
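/* A single allocation covering the framebuffer descriptor, a ZS/CRC
 * extension and one render-target descriptor per color buffer (at least
 * one). Both the regular batch FBD and the incremental-rendering FBDs come
 * from here.
 */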
227 
228 int
229 GENX(csf_init_batch)(struct panfrost_batch *batch)
230 {
231    struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
232 
233    /* Initialize the CS chunk pool. */
234    if (panfrost_pool_init(&batch->csf.cs_chunk_pool, NULL, dev, 0, 32768,
235                           "CS chunk pool", false, true))
236       return -1;
237 
238    if (dev->debug & PAN_DBG_CS) {
239       /* Load/store tracker if extra checks are enabled. */
240       batch->csf.cs.ls_tracker =
241          calloc(1, sizeof(struct cs_load_store_tracker));
242       batch->csf.cs.ls_tracker->sb_slot = 0;
243    }
244 
245    /* Allocate and bind the command queue */
246    struct cs_buffer queue = csf_alloc_cs_buffer(batch);
247    if (!queue.gpu)
248       return -1;
249 
250    const struct cs_builder_conf conf = {
251       .nr_registers = 96,
252       .nr_kernel_registers = 4,
253       .alloc_buffer = csf_alloc_cs_buffer,
254       .cookie = batch,
255       .ls_tracker = batch->csf.cs.ls_tracker,
256       .reg_perm = (dev->debug & PAN_DBG_CS) ? csf_reg_perm_cb : NULL,
257    };
258 
259    /* Setup the queue builder */
260    batch->csf.cs.builder = malloc(sizeof(struct cs_builder));
261    cs_builder_init(batch->csf.cs.builder, &conf, queue);
262    cs_req_res(batch->csf.cs.builder,
263               CS_COMPUTE_RES | CS_TILER_RES | CS_IDVS_RES | CS_FRAG_RES);
264 
265    /* Set up entries */
266    struct cs_builder *b = batch->csf.cs.builder;
267    cs_set_scoreboard_entry(b, 2, 0);
268 
269    batch->framebuffer = alloc_fbd(batch);
270    if (!batch->framebuffer.gpu)
271       return -1;
272 
273    batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
274    if (!batch->tls.cpu)
275       return -1;
276 
277    return 0;
278 }
279 
280 static void
281 csf_prepare_qsubmit(struct panfrost_context *ctx,
282                     struct drm_panthor_queue_submit *submit, uint8_t queue,
283                     uint64_t cs_start, uint32_t cs_size,
284                     struct drm_panthor_sync_op *syncs, uint32_t sync_count)
285 {
286    struct panfrost_device *dev = pan_device(ctx->base.screen);
287 
288    *submit = (struct drm_panthor_queue_submit){
289       .queue_index = queue,
290       .stream_addr = cs_start,
291       .stream_size = cs_size,
292       .latest_flush = panthor_kmod_get_flush_id(dev->kmod.dev),
293       .syncs = DRM_PANTHOR_OBJ_ARRAY(sync_count, syncs),
294    };
295 }
296 
297 static void
298 csf_prepare_gsubmit(struct panfrost_context *ctx,
299                     struct drm_panthor_group_submit *gsubmit,
300                     struct drm_panthor_queue_submit *qsubmits,
301                     uint32_t qsubmit_count)
302 {
303    *gsubmit = (struct drm_panthor_group_submit){
304       .group_handle = ctx->csf.group_handle,
305       .queue_submits = DRM_PANTHOR_OBJ_ARRAY(qsubmit_count, qsubmits),
306    };
307 }
308 
309 static int
310 csf_submit_gsubmit(struct panfrost_context *ctx,
311                    struct drm_panthor_group_submit *gsubmit)
312 {
313    struct panfrost_device *dev = pan_device(ctx->base.screen);
314    int ret = 0;
315 
316    if (!ctx->is_noop) {
317       ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_SUBMIT,
318                      gsubmit);
319    }
320 
321    if (ret)
322       return errno;
323 
324    return 0;
325 }
326 
327 static int
328 csf_emit_batch_end(struct panfrost_batch *batch)
329 {
330    struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
331    struct cs_builder *b = batch->csf.cs.builder;
332 
333    /* Barrier to let everything finish */
334    cs_wait_slots(b, BITFIELD_MASK(8), false);
335 
336    if (dev->debug & PAN_DBG_SYNC) {
337       /* Get the CS state */
338       batch->csf.cs.state = pan_pool_alloc_aligned(&batch->pool.base, 8, 8);
339       if (!batch->csf.cs.state.cpu)
340          return -1;
341 
342       memset(batch->csf.cs.state.cpu, ~0, 8);
343       cs_move64_to(b, cs_reg64(b, 90), batch->csf.cs.state.gpu);
344       cs_store_state(b, cs_reg64(b, 90), 0, MALI_CS_STATE_ERROR_STATUS,
345                      cs_now());
346    }
347 
348    /* Flush caches now that we're done (synchronous) */
349    struct cs_index flush_id = cs_reg32(b, 74);
350    cs_move32_to(b, flush_id, 0);
351    cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN, true,
352                    flush_id, cs_defer(0, 0));
353    cs_wait_slot(b, 0, false);
354 
355    /* Finish the command stream */
356    if (!cs_is_valid(batch->csf.cs.builder))
357       return -1;
358 
359    cs_finish(batch->csf.cs.builder);
360    return 0;
361 }
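/* Every batch therefore ends with a wait on scoreboard slots 0-7, an
 * optional error-status dump (PAN_DBG_SYNC) that csf_submit_wait_and_dump()
 * inspects after the fact, and a synchronous cache clean before the stream
 * is finalized.
 */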
362 
363 static int
364 csf_submit_collect_wait_ops(struct panfrost_batch *batch,
365                             struct util_dynarray *syncops,
366                             uint32_t vm_sync_handle)
367 {
368    struct panfrost_context *ctx = batch->ctx;
369    struct panfrost_device *dev = pan_device(ctx->base.screen);
370    uint64_t vm_sync_wait_point = 0, bo_sync_point;
371    uint32_t bo_sync_handle;
372    int ret;
373 
374    /* We don't wait on BOs attached to the various batch pools, because those
375     * are private to the batch, and are guaranteed to be idle at allocation
376     * time. We need to iterate over other BOs accessed by the batch though,
377     * to add the corresponding wait operations.
378     */
379    util_dynarray_foreach(&batch->bos, pan_bo_access, ptr) {
380       unsigned i = ptr - util_dynarray_element(&batch->bos, pan_bo_access, 0);
381       pan_bo_access flags = *ptr;
382 
383       if (!flags)
384          continue;
385 
386       /* Update the BO access flags so that panfrost_bo_wait() knows
387        * about all pending accesses.
388        * We only keep the READ/WRITE info since this is all the BO
389        * wait logic cares about.
390        * We also preserve existing flags as this batch might not
391        * be the first one to access the BO.
392        */
393       struct panfrost_bo *bo = pan_lookup_bo(dev, i);
394 
395       ret = panthor_kmod_bo_get_sync_point(bo->kmod_bo, &bo_sync_handle,
396                                            &bo_sync_point,
397                                            !(flags & PAN_BO_ACCESS_WRITE));
398       if (ret)
399          return ret;
400 
401       if (bo_sync_handle == vm_sync_handle) {
402          vm_sync_wait_point = MAX2(vm_sync_wait_point, bo_sync_point);
403          continue;
404       }
405 
406       assert(bo_sync_point == 0 || !bo->kmod_bo->exclusive_vm);
407 
408       struct drm_panthor_sync_op waitop = {
409          .flags =
410             DRM_PANTHOR_SYNC_OP_WAIT |
411             (bo_sync_point ? DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ
412                            : DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ),
413          .handle = bo_sync_handle,
414          .timeline_value = bo_sync_point,
415       };
416 
417       util_dynarray_append(syncops, struct drm_panthor_sync_op, waitop);
418    }
419 
420    if (vm_sync_wait_point > 0) {
421       struct drm_panthor_sync_op waitop = {
422          .flags = DRM_PANTHOR_SYNC_OP_WAIT |
423                   DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ,
424          .handle = vm_sync_handle,
425          .timeline_value = vm_sync_wait_point,
426       };
427 
428       util_dynarray_append(syncops, struct drm_panthor_sync_op, waitop);
429    }
430 
431    if (ctx->in_sync_fd >= 0) {
432       ret = drmSyncobjImportSyncFile(panfrost_device_fd(dev), ctx->in_sync_obj,
433                                      ctx->in_sync_fd);
434       if (ret)
435          return ret;
436 
437       struct drm_panthor_sync_op waitop = {
438          .flags =
439             DRM_PANTHOR_SYNC_OP_WAIT | DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ,
440          .handle = ctx->in_sync_obj,
441       };
442 
443       util_dynarray_append(syncops, struct drm_panthor_sync_op, waitop);
444 
445       close(ctx->in_sync_fd);
446       ctx->in_sync_fd = -1;
447    }
448 
449    return 0;
450 }
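/* Wait ops are coalesced where possible: BOs tracked by the VM-wide timeline
 * contribute to a single wait at the highest sync point, BOs carrying their
 * own syncobj get an individual wait op, and an in-fence, if present, is
 * imported into ctx->in_sync_obj and appended as one more wait.
 */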
451 
452 static int
453 csf_attach_sync_points(struct panfrost_batch *batch, uint32_t vm_sync_handle,
454                        uint64_t vm_sync_signal_point)
455 {
456    struct panfrost_context *ctx = batch->ctx;
457    struct panfrost_device *dev = pan_device(ctx->base.screen);
458    int ret;
459 
460    /* There should be no invisible allocation on CSF. */
461    assert(batch->invisible_pool.bos.size == 0);
462 
463    /* Attach sync points to batch-private BOs first. We assume BOs can
464     * be written by the GPU to keep things simple.
465     */
466    util_dynarray_foreach(&batch->pool.bos, struct panfrost_bo *, bo) {
467       (*bo)->gpu_access |= PAN_BO_ACCESS_RW;
468       ret = panthor_kmod_bo_attach_sync_point((*bo)->kmod_bo, vm_sync_handle,
469                                               vm_sync_signal_point, true);
470       if (ret)
471          return ret;
472    }
473 
474    util_dynarray_foreach(&batch->csf.cs_chunk_pool.bos, struct panfrost_bo *,
475                          bo) {
476       (*bo)->gpu_access |= PAN_BO_ACCESS_RW;
477       ret = panthor_kmod_bo_attach_sync_point((*bo)->kmod_bo, vm_sync_handle,
478                                               vm_sync_signal_point, true);
479       if (ret)
480          return ret;
481    }
482 
483    /* Attach the VM sync point to all resources accessed by the batch. */
484    util_dynarray_foreach(&batch->bos, pan_bo_access, ptr) {
485       unsigned i = ptr - util_dynarray_element(&batch->bos, pan_bo_access, 0);
486       pan_bo_access flags = *ptr;
487 
488       if (!flags)
489          continue;
490 
491       struct panfrost_bo *bo = pan_lookup_bo(dev, i);
492 
493       bo->gpu_access |= flags & (PAN_BO_ACCESS_RW);
494       ret = panthor_kmod_bo_attach_sync_point(bo->kmod_bo, vm_sync_handle,
495                                               vm_sync_signal_point,
496                                               flags & PAN_BO_ACCESS_WRITE);
497       if (ret)
498          return ret;
499    }
500 
501    /* And finally transfer the VM sync point to the context syncobj. */
502    return drmSyncobjTransfer(panfrost_device_fd(dev), ctx->syncobj, 0,
503                              vm_sync_handle, vm_sync_signal_point, 0);
504 }
505 
506 static void
507 csf_check_ctx_state_and_reinit(struct panfrost_context *ctx)
508 {
509    struct panfrost_device *dev = pan_device(ctx->base.screen);
510    struct drm_panthor_group_get_state state = {
511       .group_handle = ctx->csf.group_handle,
512    };
513    int ret;
514 
515    ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_GET_STATE,
516                   &state);
517    if (ret) {
518       mesa_loge("DRM_IOCTL_PANTHOR_GROUP_GET_STATE failed (err=%d)", errno);
519       return;
520    }
521 
522    /* Context is still usable. This was a transient error. */
523    if (state.state == 0)
524       return;
525 
526    /* If the VM is unusable, we can't do much, as this is shared between all
527     * contexts, and restoring the VM state is non-trivial.
528     */
529    if (pan_kmod_vm_query_state(dev->kmod.vm) != PAN_KMOD_VM_USABLE) {
530       mesa_loge("VM became unusable, we can't reset the context");
531       assert(!"VM became unusable, we can't reset the context");
532    }
533 
534    panfrost_context_reinit(ctx);
535 }
536 
537 static void
538 csf_submit_wait_and_dump(struct panfrost_batch *batch,
539                          const struct drm_panthor_group_submit *gsubmit,
540                          uint32_t vm_sync_handle, uint64_t vm_sync_signal_point)
541 {
542    struct panfrost_context *ctx = batch->ctx;
543    struct panfrost_device *dev = pan_device(ctx->base.screen);
544    bool wait = (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) && !ctx->is_noop;
545    bool dump = (dev->debug & PAN_DBG_TRACE);
546    bool crash = false;
547 
548    if (!wait && !dump)
549       return;
550 
551    /* Wait so we can get errors reported back */
552    if (wait) {
553       int ret =
554          drmSyncobjTimelineWait(panfrost_device_fd(dev), &vm_sync_handle,
555                                 &vm_sync_signal_point, 1, INT64_MAX, 0, NULL);
556       assert(ret >= 0);
557 
558       struct pan_csf_tiler_oom_ctx *tiler_oom_ctx =
559          batch->csf.tiler_oom_ctx.cpu;
560       if (tiler_oom_ctx != NULL && tiler_oom_ctx->counter > 0) {
561          perf_debug(ctx, "Incremental rendering was triggered %i time(s)",
562                     tiler_oom_ctx->counter);
563       }
564    }
565 
566    /* Jobs won't be complete if we're blackhole rendering; that's OK. */
567    if (!ctx->is_noop && (dev->debug & PAN_DBG_SYNC) &&
568        *((uint64_t *)batch->csf.cs.state.cpu) != 0) {
569       crash = true;
570       dump = true;
571    }
572 
573    if (dump) {
574       const struct drm_panthor_queue_submit *qsubmits =
575          (void *)(uintptr_t)gsubmit->queue_submits.array;
576 
577       for (unsigned i = 0; i < gsubmit->queue_submits.count; i++) {
578          uint32_t regs[256] = {0};
579          pandecode_interpret_cs(dev->decode_ctx, qsubmits[i].stream_addr,
580                                 qsubmits[i].stream_size,
581                                 panfrost_device_gpu_id(dev), regs);
582       }
583 
584       if (dev->debug & PAN_DBG_DUMP)
585          pandecode_dump_mappings(dev->decode_ctx);
586    }
587 
588    if (crash) {
589       mesa_loge("Incomplete job or timeout\n");
590       abort();
591    }
592 }
593 
594 int
595 GENX(csf_submit_batch)(struct panfrost_batch *batch)
596 {
597    int ret;
598 
599    /* Close the batch before submitting. */
600    ret = csf_emit_batch_end(batch);
601    if (ret)
602       return ret;
603 
604    uint64_t cs_start = cs_root_chunk_gpu_addr(batch->csf.cs.builder);
605    uint32_t cs_size = cs_root_chunk_size(batch->csf.cs.builder);
606    struct panfrost_context *ctx = batch->ctx;
607    struct panfrost_device *dev = pan_device(ctx->base.screen);
608    uint32_t vm_sync_handle = panthor_kmod_vm_sync_handle(dev->kmod.vm);
609    struct util_dynarray syncops;
610 
611    util_dynarray_init(&syncops, NULL);
612 
613    ret = csf_submit_collect_wait_ops(batch, &syncops, vm_sync_handle);
614    if (ret)
615       goto out_free_syncops;
616 
617    uint64_t vm_sync_cur_point = panthor_kmod_vm_sync_lock(dev->kmod.vm);
618    uint64_t vm_sync_signal_point = vm_sync_cur_point + 1;
619 
620    struct drm_panthor_sync_op signalop = {
621       .flags = DRM_PANTHOR_SYNC_OP_SIGNAL |
622                DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ,
623       .handle = vm_sync_handle,
624       .timeline_value = vm_sync_signal_point,
625    };
626 
627    util_dynarray_append(&syncops, struct drm_panthor_sync_op, signalop);
628 
629    struct drm_panthor_queue_submit qsubmit;
630    struct drm_panthor_group_submit gsubmit;
631 
632    csf_prepare_qsubmit(
633       ctx, &qsubmit, 0, cs_start, cs_size, util_dynarray_begin(&syncops),
634       util_dynarray_num_elements(&syncops, struct drm_panthor_sync_op));
635    csf_prepare_gsubmit(ctx, &gsubmit, &qsubmit, 1);
636    ret = csf_submit_gsubmit(ctx, &gsubmit);
637    panthor_kmod_vm_sync_unlock(dev->kmod.vm,
638                                ret ? vm_sync_cur_point : vm_sync_signal_point);
639 
640    if (!ret) {
641       csf_submit_wait_and_dump(batch, &gsubmit, vm_sync_handle,
642                                vm_sync_signal_point);
643       ret = csf_attach_sync_points(batch, vm_sync_handle, vm_sync_signal_point);
644    } else {
645       csf_check_ctx_state_and_reinit(batch->ctx);
646    }
647 
648 out_free_syncops:
649    util_dynarray_fini(&syncops);
650    return ret;
651 }
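/* Submission flow in short: close the stream, collect wait ops, reserve the
 * next point on the VM sync timeline, submit the group job, then either
 * attach the signal point to every BO touched by the batch (success) or
 * check the group state and reinitialize the context if it became unusable
 * (failure). On failure the timeline lock is released with the old point so
 * the reserved point is not left pending.
 */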
652 
653 static uint64_t
654 csf_get_tiler_desc(struct panfrost_batch *batch)
655 {
656    if (batch->tiler_ctx.valhall.desc)
657       return batch->tiler_ctx.valhall.desc;
658 
659    struct panfrost_ptr t =
660       pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT);
661 
662    batch->csf.pending_tiler_desc = t.cpu;
663    batch->tiler_ctx.valhall.desc = t.gpu;
664    return batch->tiler_ctx.valhall.desc;
665 }
666 
667 static void
668 csf_emit_tiler_desc(struct panfrost_batch *batch, const struct pan_fb_info *fb)
669 {
670    struct panfrost_context *ctx = batch->ctx;
671    struct panfrost_device *dev = pan_device(ctx->base.screen);
672 
673    if (!batch->csf.pending_tiler_desc)
674       return;
675 
676    pan_pack(batch->csf.pending_tiler_desc, TILER_CONTEXT, tiler) {
677       unsigned max_levels = dev->tiler_features.max_levels;
678       assert(max_levels >= 2);
679 
680       /* TODO: Select hierarchy mask more effectively */
681       tiler.hierarchy_mask = (max_levels >= 8) ? 0xFF : 0x28;
682 
683       /* For large framebuffers, disable the smallest bin size to
684        * avoid pathological tiler memory usage. Required to avoid OOM
685        * on dEQP-GLES31.functional.fbo.no_attachments.maximums.all on
686        * Mali-G57.
687        */
688       if (MAX2(batch->key.width, batch->key.height) >= 4096)
689          tiler.hierarchy_mask &= ~1;
690 
691       /* For effective tile size larger than 16x16, disable first level */
692       if (fb->tile_size > 16 * 16)
693          tiler.hierarchy_mask &= ~1;
694 
695       tiler.fb_width = batch->key.width;
696       tiler.fb_height = batch->key.height;
697       tiler.heap = batch->ctx->csf.heap.desc_bo->ptr.gpu;
698       tiler.sample_pattern =
699          pan_sample_pattern(util_framebuffer_get_num_samples(&batch->key));
700       tiler.first_provoking_vertex =
701          batch->first_provoking_vertex == U_TRISTATE_YES;
702       tiler.geometry_buffer = ctx->csf.tmp_geom_bo->ptr.gpu;
703       tiler.geometry_buffer_size = ctx->csf.tmp_geom_bo->kmod_bo->size;
704    }
705 
706    batch->csf.pending_tiler_desc = NULL;
707 }
708 
709 void
710 GENX(csf_prepare_tiler)(struct panfrost_batch *batch, struct pan_fb_info *fb)
711 {
712    csf_emit_tiler_desc(batch, fb);
713 }
714 
715 void
716 GENX(csf_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb)
717 {
718    struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
719 
720    GENX(pan_preload_fb)
721    (&dev->fb_preload_cache, &batch->pool.base, fb, batch->tls.gpu, NULL);
722 }
723 
724 #define GET_FBD(_ctx, _pass)                                                   \
725    (_ctx)->fbds[PAN_INCREMENTAL_RENDERING_##_pass##_PASS]
726 #define EMIT_FBD(_ctx, _pass, _fb, _tls, _tiler_ctx)                           \
727    GET_FBD(_ctx, _pass).gpu |=                                                 \
728       GENX(pan_emit_fbd)(_fb, 0, _tls, _tiler_ctx, GET_FBD(_ctx, _pass).cpu)
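/* EMIT_FBD() packs the per-pass framebuffer descriptor in place and ORs the
 * value returned by GENX(pan_emit_fbd) into the stored GPU address, the same
 * pattern used for batch->framebuffer.gpu below, so the pointer the OOM
 * handler consumes carries whatever tag bits pan_emit_fbd() returns.
 */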
729 
730 void
731 GENX(csf_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb,
732                     struct pan_tls_info *tls)
733 {
734    struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
735 
736    /* Default framebuffer descriptor */
737 
738    batch->framebuffer.gpu |=
739       GENX(pan_emit_fbd)(fb, 0, tls, &batch->tiler_ctx, batch->framebuffer.cpu);
740 
741    if (batch->draw_count == 0)
742       return;
743 
744    struct pan_csf_tiler_oom_ctx *tiler_oom_ctx = batch->csf.tiler_oom_ctx.cpu;
745    struct pan_fb_info alt_fb;
746    bool changed = false;
747 
748    /* First incremental rendering pass: don't discard result */
749 
750    memcpy(&alt_fb, fb, sizeof(alt_fb));
751    for (unsigned i = 0; i < fb->rt_count; i++)
752       alt_fb.rts[i].discard = false;
753    alt_fb.zs.discard.z = false;
754    alt_fb.zs.discard.s = false;
755 
756    EMIT_FBD(tiler_oom_ctx, FIRST, &alt_fb, tls, &batch->tiler_ctx);
757 
758    /* Subsequent incremental rendering passes: preload old content and don't
759     * discard result */
760 
761    for (unsigned i = 0; i < fb->rt_count; i++) {
762       if (fb->rts[i].view && !fb->rts[i].preload) {
763          alt_fb.rts[i].preload = true;
764          changed = true;
765       }
766 
767       if (alt_fb.rts[i].clear) {
768          alt_fb.rts[i].clear = false;
769          changed = true;
770       }
771    }
772    if (fb->zs.view.zs && !fb->zs.preload.z && !fb->zs.preload.s) {
773       alt_fb.zs.preload.z = true;
774       alt_fb.zs.preload.s = true;
775       changed = true;
776    } else if (fb->zs.view.s && !fb->zs.preload.s) {
777       alt_fb.zs.preload.s = true;
778       changed = true;
779    }
780 
781    if (alt_fb.zs.clear.z || alt_fb.zs.clear.s) {
782       alt_fb.zs.clear.z = false;
783       alt_fb.zs.clear.s = false;
784       changed = true;
785    }
786 
787    if (changed) {
788       alt_fb.bifrost.pre_post.dcds.gpu = 0;
789       GENX(pan_preload_fb)
790       (&dev->fb_preload_cache, &batch->pool.base, &alt_fb, batch->tls.gpu, NULL);
791    }
792 
793    EMIT_FBD(tiler_oom_ctx, MIDDLE, &alt_fb, tls, &batch->tiler_ctx);
794 
795    /* Last incremental rendering pass: preload previous content and deal with
796     * results as specified by user */
797 
798    for (unsigned i = 0; i < fb->rt_count; i++)
799       alt_fb.rts[i].discard = fb->rts[i].discard;
800    alt_fb.zs.discard.z = fb->zs.discard.z;
801    alt_fb.zs.discard.s = fb->zs.discard.s;
802 
803    EMIT_FBD(tiler_oom_ctx, LAST, &alt_fb, tls, &batch->tiler_ctx);
804 }
805 
806 void
807 GENX(csf_emit_fragment_job)(struct panfrost_batch *batch,
808                             const struct pan_fb_info *pfb)
809 {
810    struct cs_builder *b = batch->csf.cs.builder;
811    struct pan_csf_tiler_oom_ctx *oom_ctx = batch->csf.tiler_oom_ctx.cpu;
812 
813    if (batch->draw_count > 0) {
814       /* Finish tiling and wait for IDVS and tiling */
815       cs_finish_tiling(b, false);
816       cs_wait_slot(b, 2, false);
817       cs_vt_end(b, cs_now());
818    }
819 
820    /* Set up the fragment job */
821    cs_move64_to(b, cs_reg64(b, 40), batch->framebuffer.gpu);
822    cs_move32_to(b, cs_reg32(b, 42), (batch->miny << 16) | batch->minx);
823    cs_move32_to(b, cs_reg32(b, 43),
824                 ((batch->maxy - 1) << 16) | (batch->maxx - 1));
825    cs_move64_to(b, cs_reg64(b, 44), 0);
826    cs_move32_to(b, cs_reg32(b, 46), 0);
827 
828    /* Use a different framebuffer descriptor if incremental rendering was
829     * triggered while tiling. */
830    if (batch->draw_count > 0) {
831       struct cs_index counter = cs_reg32(b, 78);
832       cs_load32_to(b, counter, cs_reg64(b, TILER_OOM_CTX_REG), 0);
833       cs_wait_slot(b, 0, false);
834       cs_if(b, MALI_CS_CONDITION_GREATER, counter) {
835          cs_move64_to(b, cs_reg64(b, 40), GET_FBD(oom_ctx, LAST).gpu);
836       }
837    }
838 
839    /* Run the fragment job and wait */
840    cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
841    cs_wait_slot(b, 2, false);
842 
843    /* Gather freed heap chunks and add them to the heap context free list
844     * so they can be re-used next time the tiler heap runs out of chunks.
845     * That's what cs_finish_fragment() is all about. The list of freed
846     * chunks is in the tiler context descriptor
847     * (completed_{top,bottom} fields). */
848    if (batch->draw_count > 0) {
849       assert(batch->tiler_ctx.valhall.desc);
850       cs_move64_to(b, cs_reg64(b, 90), batch->tiler_ctx.valhall.desc);
851       cs_load_to(b, cs_reg_tuple(b, 86, 4), cs_reg64(b, 90), BITFIELD_MASK(4),
852                  40);
853       cs_wait_slot(b, 0, false);
854       cs_finish_fragment(b, true, cs_reg64(b, 86), cs_reg64(b, 88), cs_now());
855    }
856 }
857 
858 static void
859 csf_emit_shader_regs(struct panfrost_batch *batch, enum pipe_shader_type stage,
860                      uint64_t shader)
861 {
862    uint64_t resources = panfrost_emit_resources(batch, stage);
863 
864    assert(stage == PIPE_SHADER_VERTEX || stage == PIPE_SHADER_FRAGMENT ||
865           stage == PIPE_SHADER_COMPUTE);
866 
867    unsigned offset = (stage == PIPE_SHADER_FRAGMENT) ? 4 : 0;
868    unsigned fau_count = DIV_ROUND_UP(batch->nr_push_uniforms[stage], 2);
869 
870    struct cs_builder *b = batch->csf.cs.builder;
871    cs_move64_to(b, cs_reg64(b, 0 + offset), resources);
872    cs_move64_to(b, cs_reg64(b, 8 + offset),
873                 batch->push_uniforms[stage] | ((uint64_t)fau_count << 56));
874    cs_move64_to(b, cs_reg64(b, 16 + offset), shader);
875 }
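/* Shader state register layout used above: the 64-bit registers at 0, 8 and
 * 16 hold the resource table, the FAU buffer (with the 64-bit FAU count in
 * the top byte) and the shader program descriptor for vertex/compute, while
 * fragment uses the same triplet shifted to 4, 12 and 20.
 */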
876 
877 void
878 GENX(csf_launch_grid)(struct panfrost_batch *batch,
879                       const struct pipe_grid_info *info)
880 {
881    /* Empty compute programs are invalid and don't make sense */
882    if (batch->rsd[PIPE_SHADER_COMPUTE] == 0)
883       return;
884 
885    struct panfrost_context *ctx = batch->ctx;
886    struct panfrost_device *dev = pan_device(ctx->base.screen);
887    struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE];
888    struct cs_builder *b = batch->csf.cs.builder;
889 
890    csf_emit_shader_regs(batch, PIPE_SHADER_COMPUTE,
891                         batch->rsd[PIPE_SHADER_COMPUTE]);
892 
893    cs_move64_to(b, cs_reg64(b, 24), batch->tls.gpu);
894 
895    /* Global attribute offset */
896    cs_move32_to(b, cs_reg32(b, 32), 0);
897 
898    /* Compute workgroup size */
899    struct mali_compute_size_workgroup_packed wg_size;
900    pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
901       cfg.workgroup_size_x = info->block[0];
902       cfg.workgroup_size_y = info->block[1];
903       cfg.workgroup_size_z = info->block[2];
904 
905       /* Workgroups may be merged if the shader does not use barriers
906        * or shared memory. This condition is checked against the
907        * static shared_size at compile-time. We need to check the
908        * variable shared size at launch_grid time, because the
909        * compiler doesn't know about that.
910        */
911       cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups &&
912                                      (info->variable_shared_mem == 0);
913    }
914 
915    cs_move32_to(b, cs_reg32(b, 33), wg_size.opaque[0]);
916 
917    /* Offset */
918    for (unsigned i = 0; i < 3; ++i)
919       cs_move32_to(b, cs_reg32(b, 34 + i), 0);
920 
921    unsigned threads_per_wg = info->block[0] * info->block[1] * info->block[2];
922    unsigned max_thread_cnt = panfrost_compute_max_thread_count(
923       &dev->kmod.props, cs->info.work_reg_count);
924 
925    if (info->indirect) {
926       /* Load size in workgroups per dimension from memory */
927       struct cs_index address = cs_reg64(b, 64);
928       cs_move64_to(
929          b, address,
930          pan_resource(info->indirect)->image.data.base + info->indirect_offset);
931 
932       struct cs_index grid_xyz = cs_reg_tuple(b, 37, 3);
933       cs_load_to(b, grid_xyz, address, BITFIELD_MASK(3), 0);
934 
935       /* Wait for the load */
936       cs_wait_slot(b, 0, false);
937 
938       /* Copy to FAU */
939       for (unsigned i = 0; i < 3; ++i) {
940          if (batch->num_wg_sysval[i]) {
941             cs_move64_to(b, address, batch->num_wg_sysval[i]);
942             cs_store(b, cs_extract32(b, grid_xyz, i), address, BITFIELD_MASK(1),
943                      0);
944          }
945       }
946 
947       /* Wait for the stores */
948       cs_wait_slot(b, 0, false);
949 
950       cs_run_compute_indirect(b, DIV_ROUND_UP(max_thread_cnt, threads_per_wg),
951                               false, cs_shader_res_sel(0, 0, 0, 0));
952    } else {
953       /* Set size in workgroups per dimension immediately */
954       for (unsigned i = 0; i < 3; ++i)
955          cs_move32_to(b, cs_reg32(b, 37 + i), info->grid[i]);
956 
957       /* Pick the task_axis and task_increment to maximize thread utilization. */
958       unsigned task_axis = MALI_TASK_AXIS_X;
959       unsigned threads_per_task = threads_per_wg;
960       unsigned task_increment = 0;
961 
962       for (unsigned i = 0; i < 3; i++) {
963          if (threads_per_task * info->grid[i] >= max_thread_cnt) {
964             /* We reached the thread limit; stop at the current axis and
965              * calculate the increment so it doesn't exceed the per-core
966              * thread capacity.
967              */
968             task_increment = max_thread_cnt / threads_per_task;
969             break;
970          } else if (task_axis == MALI_TASK_AXIS_Z) {
971             /* We reached the Z axis, and there's still room to stuff more
972              * threads. Pick the current axis grid size as our increment
973              * as there's no point using something bigger.
974              */
975             task_increment = info->grid[i];
976             break;
977          }
978 
979          threads_per_task *= info->grid[i];
980          task_axis++;
981       }
982 
983       assert(task_axis <= MALI_TASK_AXIS_Z);
984       assert(task_increment > 0);
985       cs_run_compute(b, task_increment, task_axis, false,
986                      cs_shader_res_sel(0, 0, 0, 0));
987    }
988 }
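/* Illustration of the axis/increment selection above, with made-up numbers:
 * assuming max_thread_cnt = 512 and an 8x8x1 workgroup (64 threads), a
 * (16, 4, 1) grid hits the limit already on X (64 * 16 >= 512), so task_axis
 * stays MALI_TASK_AXIS_X and task_increment = 512 / 64 = 8 workgroups per
 * task.
 */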
989 
990 void
991 GENX(csf_launch_xfb)(struct panfrost_batch *batch,
992                      const struct pipe_draw_info *info, unsigned count)
993 {
994    struct cs_builder *b = batch->csf.cs.builder;
995 
996    cs_move64_to(b, cs_reg64(b, 24), batch->tls.gpu);
997 
998    /* TODO: Indexing. Also, attribute_offset is a legacy feature. */
999    cs_move32_to(b, cs_reg32(b, 32), batch->ctx->offset_start);
1000 
1001    /* Compute workgroup size */
1002    struct mali_compute_size_workgroup_packed wg_size;
1003    pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
1004       cfg.workgroup_size_x = 1;
1005       cfg.workgroup_size_y = 1;
1006       cfg.workgroup_size_z = 1;
1007 
1008       /* Transform feedback shaders do not use barriers or
1009        * shared memory, so we may merge workgroups.
1010        */
1011       cfg.allow_merging_workgroups = true;
1012    }
1013    cs_move32_to(b, cs_reg32(b, 33), wg_size.opaque[0]);
1014 
1015    /* Offset */
1016    for (unsigned i = 0; i < 3; ++i)
1017       cs_move32_to(b, cs_reg32(b, 34 + i), 0);
1018 
1019    cs_move32_to(b, cs_reg32(b, 37), count);
1020    cs_move32_to(b, cs_reg32(b, 38), info->instance_count);
1021    cs_move32_to(b, cs_reg32(b, 39), 1);
1022 
1023    csf_emit_shader_regs(batch, PIPE_SHADER_VERTEX,
1024                         batch->rsd[PIPE_SHADER_VERTEX]);
1025    /* force a barrier to avoid read/write sync issues with buffers */
1026    cs_wait_slot(b, 2, false);
1027 
1028    /* XXX: Choose correctly */
1029    cs_run_compute(b, 1, MALI_TASK_AXIS_Z, false, cs_shader_res_sel(0, 0, 0, 0));
1030 }
1031 
1032 static void
1033 emit_tiler_oom_context(struct cs_builder *b, struct panfrost_batch *batch)
1034 {
1035    struct pan_csf_tiler_oom_ctx *ctx;
1036 
1037    batch->csf.tiler_oom_ctx =
1038       pan_pool_alloc_aligned(&batch->pool.base, sizeof(*ctx), 8);
1039    ctx = batch->csf.tiler_oom_ctx.cpu;
1040 
1041    ctx->tiler_desc = csf_get_tiler_desc(batch);
1042    ctx->counter = 0;
1043    ctx->bbox_min = (batch->miny << 16) | batch->minx;
1044    ctx->bbox_max = ((batch->maxy - 1) << 16) | (batch->maxx - 1);
1045    ctx->dump_addr = batch->ctx->csf.tiler_oom_handler.save_bo->ptr.gpu;
1046 
1047    for (unsigned i = 0; i < PAN_INCREMENTAL_RENDERING_PASS_COUNT; ++i)
1048       ctx->fbds[i] = alloc_fbd(batch);
1049 
1050    csf_update_tiler_oom_ctx(b, batch->csf.tiler_oom_ctx.gpu);
1051 }
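/* The tiler OOM context gathered here (tiler descriptor, pass counter,
 * render bounding box, register-dump address and one FBD per
 * incremental-rendering pass) is what the exception handler installed in
 * GENX(csf_init_context) reads through TILER_OOM_CTX_REG.
 */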
1052 
1053 static uint32_t
1054 csf_emit_draw_state(struct panfrost_batch *batch,
1055                     const struct pipe_draw_info *info, unsigned drawid_offset)
1056 {
1057    struct panfrost_context *ctx = batch->ctx;
1058    struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
1059    struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];
1060 
1061    bool idvs = vs->info.vs.idvs;
1062    bool fs_required = panfrost_fs_required(
1063       fs, ctx->blend, &ctx->pipe_framebuffer, ctx->depth_stencil);
1064    bool secondary_shader = vs->info.vs.secondary_enable && fs_required;
1065 
1066    assert(idvs && "IDVS required for CSF");
1067 
1068    struct cs_builder *b = batch->csf.cs.builder;
1069 
1070    if (batch->draw_count == 0) {
1071       emit_tiler_oom_context(b, batch);
1072       cs_vt_start(batch->csf.cs.builder, cs_now());
1073    }
1074 
1075    csf_emit_shader_regs(batch, PIPE_SHADER_VERTEX,
1076                         panfrost_get_position_shader(batch, info));
1077 
1078    if (fs_required) {
1079       csf_emit_shader_regs(batch, PIPE_SHADER_FRAGMENT,
1080                            batch->rsd[PIPE_SHADER_FRAGMENT]);
1081    } else {
1082       cs_move64_to(b, cs_reg64(b, 4), 0);
1083       cs_move64_to(b, cs_reg64(b, 12), 0);
1084       cs_move64_to(b, cs_reg64(b, 20), 0);
1085    }
1086 
1087    if (secondary_shader) {
1088       cs_move64_to(b, cs_reg64(b, 18), panfrost_get_varying_shader(batch));
1089    }
1090 
1091    cs_move64_to(b, cs_reg64(b, 24), batch->tls.gpu);
1092    cs_move64_to(b, cs_reg64(b, 30), batch->tls.gpu);
1093    cs_move32_to(b, cs_reg32(b, 32), 0);
1094    cs_move32_to(b, cs_reg32(b, 37), 0);
1095    cs_move32_to(b, cs_reg32(b, 38), 0);
1096 
1097    cs_move64_to(b, cs_reg64(b, 40), csf_get_tiler_desc(batch));
1098 
1099    STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR));
1100    STATIC_ASSERT(sizeof(uint64_t) == pan_size(SCISSOR));
1101    uint64_t *sbd = (uint64_t *)&batch->scissor[0];
1102    cs_move64_to(b, cs_reg64(b, 42), *sbd);
1103 
1104    cs_move32_to(b, cs_reg32(b, 44), fui(batch->minimum_z));
1105    cs_move32_to(b, cs_reg32(b, 45), fui(batch->maximum_z));
1106 
1107    if (ctx->occlusion_query && ctx->active_queries) {
1108       struct panfrost_resource *rsrc = pan_resource(ctx->occlusion_query->rsrc);
1109       cs_move64_to(b, cs_reg64(b, 46), rsrc->image.data.base);
1110       panfrost_batch_write_rsrc(ctx->batch, rsrc, PIPE_SHADER_FRAGMENT);
1111    }
1112 
1113    cs_move32_to(b, cs_reg32(b, 48), panfrost_vertex_attribute_stride(vs, fs));
1114    cs_move64_to(b, cs_reg64(b, 50),
1115                 batch->blend | MAX2(batch->key.nr_cbufs, 1));
1116    cs_move64_to(b, cs_reg64(b, 52), batch->depth_stencil);
1117 
1118    if (info->index_size)
1119       cs_move64_to(b, cs_reg64(b, 54), batch->indices);
1120 
1121    struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
1122 
1123    struct mali_primitive_flags_packed primitive_flags;
1124    pan_pack(&primitive_flags, PRIMITIVE_FLAGS, cfg) {
1125       if (panfrost_writes_point_size(ctx))
1126          cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;
1127 
1128       cfg.allow_rotating_primitives = allow_rotating_primitives(fs, info);
1129 
1130       cfg.low_depth_cull = rast->depth_clip_near;
1131       cfg.high_depth_cull = rast->depth_clip_far;
1132 
1133       /* Non-fixed restart indices should have been lowered */
1134       assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info));
1135       cfg.primitive_restart = info->primitive_restart;
1136 
1137       cfg.position_fifo_format = panfrost_writes_point_size(ctx)
1138                                     ? MALI_FIFO_FORMAT_EXTENDED
1139                                     : MALI_FIFO_FORMAT_BASIC;
1140    }
1141 
1142    cs_move32_to(b, cs_reg32(b, 56), primitive_flags.opaque[0]);
1143 
1144    struct mali_dcd_flags_0_packed dcd_flags0;
1145    struct mali_dcd_flags_1_packed dcd_flags1;
1146 
1147    pan_pack(&dcd_flags0, DCD_FLAGS_0, cfg) {
1148       enum mesa_prim reduced_mode = u_reduced_prim(info->mode);
1149       bool polygon = reduced_mode == MESA_PRIM_TRIANGLES;
1150       bool lines = reduced_mode == MESA_PRIM_LINES;
1151 
1152       /*
1153        * From the Gallium documentation,
1154        * pipe_rasterizer_state::cull_face "indicates which faces of
1155        * polygons to cull". Points and lines are not considered
1156        * polygons and should be drawn even if all faces are culled.
1157        * The hardware does not take primitive type into account when
1158        * culling, so we need to do that check ourselves.
1159        */
1160       cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT);
1161       cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK);
1162       cfg.front_face_ccw = rast->front_ccw;
1163 
1164       cfg.multisample_enable = rast->multisample;
1165 
1166       /* Use per-sample shading if required by the API. Also use it when a
1167        * blend shader is used with multisampling, as this is handled
1168        * by a single ST_TILE in the blend shader with the current
1169        * sample ID, requiring per-sample shading.
1170        */
1171       cfg.evaluate_per_sample =
1172          (rast->multisample &&
1173           ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader));
1174 
1175       cfg.single_sampled_lines = !rast->multisample;
1176 
1177       if (lines && rast->line_smooth) {
1178          cfg.multisample_enable = true;
1179          cfg.single_sampled_lines = false;
1180       }
1181 
1182       bool has_oq = ctx->occlusion_query && ctx->active_queries;
1183       if (has_oq) {
1184          if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
1185             cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
1186          else
1187             cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;
1188       }
1189 
1190       if (fs_required) {
1191          struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
1192             fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
1193             ctx->blend->base.alpha_to_coverage,
1194             ctx->depth_stencil->zs_always_passes);
1195 
1196          cfg.pixel_kill_operation = earlyzs.kill;
1197          cfg.zs_update_operation = earlyzs.update;
1198 
1199          cfg.allow_forward_pixel_to_kill =
1200             pan_allow_forward_pixel_to_kill(ctx, fs);
1201          cfg.allow_forward_pixel_to_be_killed = !fs->info.writes_global;
1202 
1203          cfg.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
1204          cfg.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);
1205 
1206          /* Also use per-sample shading if required by the shader
1207           */
1208          cfg.evaluate_per_sample |= fs->info.fs.sample_shading;
1209 
1210          /* Unlike Bifrost, alpha-to-coverage must be included in
1211           * this identically-named flag. Confusing, isn't it?
1212           */
1213          cfg.shader_modifies_coverage = fs->info.fs.writes_coverage ||
1214                                         fs->info.fs.can_discard ||
1215                                         ctx->blend->base.alpha_to_coverage;
1216 
1217          cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
1218       } else {
1219          /* These operations need to be FORCE to benefit from the
1220           * depth-only pass optimizations.
1221           */
1222          cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
1223          cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;
1224 
1225          /* No shader and no blend => no shader or blend
1226           * reasons to disable FPK. The only FPK-related state
1227           * not covered is alpha-to-coverage which we don't set
1228           * without blend.
1229           */
1230          cfg.allow_forward_pixel_to_kill = true;
1231 
1232          /* No shader => no shader side effects */
1233          cfg.allow_forward_pixel_to_be_killed = true;
1234 
1235          /* Alpha isn't written so these are vacuous */
1236          cfg.overdraw_alpha0 = true;
1237          cfg.overdraw_alpha1 = true;
1238       }
1239    }
1240 
1241    pan_pack(&dcd_flags1, DCD_FLAGS_1, cfg) {
1242       cfg.sample_mask = rast->multisample ? ctx->sample_mask : 0xFFFF;
1243 
1244       if (fs_required) {
1245          /* See JM Valhall equivalent code */
1246          cfg.render_target_mask =
1247             (fs->info.outputs_written >> FRAG_RESULT_DATA0) & ctx->fb_rt_mask;
1248       }
1249    }
1250 
1251    cs_move32_to(b, cs_reg32(b, 57), dcd_flags0.opaque[0]);
1252    cs_move32_to(b, cs_reg32(b, 58), dcd_flags1.opaque[0]);
1253 
1254    struct mali_primitive_size_packed primsize;
1255    panfrost_emit_primitive_size(ctx, info->mode == MESA_PRIM_POINTS, 0,
1256                                 &primsize);
1257    struct mali_primitive_size_packed *primsize_ptr = &primsize;
1258    cs_move64_to(b, cs_reg64(b, 60), *((uint64_t*)primsize_ptr));
1259 
1260    struct mali_primitive_flags_packed flags_override;
1261    /* Pack with nodefaults so only explicitly set override fields affect the
1262     * previously set register values */
1263    pan_pack_nodefaults(&flags_override, PRIMITIVE_FLAGS, cfg) {
1264       cfg.draw_mode = pan_draw_mode(info->mode);
1265       cfg.index_type = panfrost_translate_index_size(info->index_size);
1266       cfg.secondary_shader = secondary_shader;
1267    };
1268 
1269    return flags_override.opaque[0];
1270 }
1271 
1272 static struct cs_index
1273 csf_emit_draw_id_register(struct panfrost_batch *batch, unsigned offset)
1274 {
1275    struct cs_builder *b = batch->csf.cs.builder;
1276    struct panfrost_context *ctx = batch->ctx;
1277    struct panfrost_uncompiled_shader *vs = ctx->uncompiled[PIPE_SHADER_VERTEX];
1278 
1279    if (!BITSET_TEST(vs->nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID))
1280       return cs_undef();
1281 
1282    struct cs_index drawid = cs_reg32(b, 67);
1283    cs_move32_to(b, drawid, offset);
1284 
1285    return drawid;
1286 }
1287 
1288 void
1289 GENX(csf_launch_draw)(struct panfrost_batch *batch,
1290                       const struct pipe_draw_info *info, unsigned drawid_offset,
1291                       const struct pipe_draw_start_count_bias *draw,
1292                       unsigned vertex_count)
1293 {
1294    struct cs_builder *b = batch->csf.cs.builder;
1295 
1296    uint32_t flags_override = csf_emit_draw_state(batch, info, drawid_offset);
1297    struct cs_index drawid = csf_emit_draw_id_register(batch, drawid_offset);
1298 
1299    cs_move32_to(b, cs_reg32(b, 33), draw->count);
1300    cs_move32_to(b, cs_reg32(b, 34), info->instance_count);
1301    cs_move32_to(b, cs_reg32(b, 35), 0);
1302 
1303    /* Base vertex offset on Valhall is used for both indexed and
1304     * non-indexed draws, in a simple way for either. Handle both cases.
1305     */
1306    if (info->index_size) {
1307       cs_move32_to(b, cs_reg32(b, 36), draw->index_bias);
1308       cs_move32_to(b, cs_reg32(b, 39), info->index_size * draw->count);
1309    } else {
1310       cs_move32_to(b, cs_reg32(b, 36), draw->start);
1311       cs_move32_to(b, cs_reg32(b, 39), 0);
1312    }
1313 
1314    cs_run_idvs(b, flags_override, false, true, cs_shader_res_sel(0, 0, 1, 0),
1315                cs_shader_res_sel(2, 2, 2, 0), drawid);
1316 }
1317 
1318 void
1319 GENX(csf_launch_draw_indirect)(struct panfrost_batch *batch,
1320                                const struct pipe_draw_info *info,
1321                                unsigned drawid_offset,
1322                                const struct pipe_draw_indirect_info *indirect)
1323 {
1324    struct cs_builder *b = batch->csf.cs.builder;
1325 
1326    uint32_t flags_override = csf_emit_draw_state(batch, info, drawid_offset);
1327    struct cs_index drawid = csf_emit_draw_id_register(batch, drawid_offset);
1328 
1329    struct cs_index address = cs_reg64(b, 64);
1330    struct cs_index counter = cs_reg32(b, 66);
1331    cs_move64_to(
1332       b, address,
1333       pan_resource(indirect->buffer)->image.data.base + indirect->offset);
1334    cs_move32_to(b, counter, indirect->draw_count);
1335 
1336    cs_while(b, MALI_CS_CONDITION_GREATER, counter) {
1337       if (info->index_size) {
1338          /* loads vertex count, instance count, index offset, vertex offset */
1339          cs_load_to(b, cs_reg_tuple(b, 33, 4), address, BITFIELD_MASK(4), 0);
1340          cs_move32_to(b, cs_reg32(b, 39), info->index.resource->width0);
1341       } else {
1342          /* vertex count, instance count */
1343          cs_load_to(b, cs_reg_tuple(b, 33, 2), address, BITFIELD_MASK(2), 0);
1344          cs_move32_to(b, cs_reg32(b, 35), 0);
1345          cs_load_to(b, cs_reg_tuple(b, 36, 1), address, BITFIELD_MASK(1),
1346                     2 * sizeof(uint32_t)); // instance offset
1347          cs_move32_to(b, cs_reg32(b, 37), 0);
1348          cs_move32_to(b, cs_reg32(b, 39), 0);
1349       }
1350 
1351       cs_wait_slot(b, 0, false);
1352       cs_run_idvs(b, flags_override, false, true, cs_shader_res_sel(0, 0, 1, 0),
1353                   cs_shader_res_sel(2, 2, 2, 0), drawid);
1354 
1355       cs_add64(b, address, address, indirect->stride);
1356       cs_add32(b, counter, counter, (unsigned int)-1);
1357       if (drawid.type != CS_INDEX_UNDEF)
1358          cs_add32(b, drawid, drawid, 1);
1359    }
1360 }
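/* Multi-draw indirect is emitted as a CS loop: each iteration reloads the
 * draw parameters from the indirect buffer, runs an IDVS job, advances the
 * buffer pointer by the stride and decrements the remaining-draw counter
 * (the add32 with (unsigned int)-1 above).
 */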
1361 
1362 #define POSITION_FIFO_SIZE (64 * 1024)
1363 
1364 static enum drm_panthor_group_priority
1365 get_panthor_group_priority(struct panfrost_context *ctx)
1366 {
1367    if (ctx->flags & PIPE_CONTEXT_REALTIME_PRIORITY)
1368       return PANTHOR_GROUP_PRIORITY_REALTIME;
1369    else if (ctx->flags & PIPE_CONTEXT_HIGH_PRIORITY)
1370       return PANTHOR_GROUP_PRIORITY_HIGH;
1371    else if (ctx->flags & PIPE_CONTEXT_LOW_PRIORITY)
1372       return PANTHOR_GROUP_PRIORITY_LOW;
1373 
1374    return PANTHOR_GROUP_PRIORITY_MEDIUM;
1375 }
1376 
1377 int
1378 GENX(csf_init_context)(struct panfrost_context *ctx)
1379 {
1380    struct panfrost_device *dev = pan_device(ctx->base.screen);
1381    struct drm_panthor_queue_create qc[] = {{
1382       .priority = 1,
1383       .ringbuf_size = 64 * 1024,
1384    }};
1385 
1386    struct drm_panthor_group_create gc = {
1387       .compute_core_mask = dev->kmod.props.shader_present,
1388       .fragment_core_mask = dev->kmod.props.shader_present,
1389       .tiler_core_mask = 1,
1390       .max_compute_cores = util_bitcount64(dev->kmod.props.shader_present),
1391       .max_fragment_cores = util_bitcount64(dev->kmod.props.shader_present),
1392       .max_tiler_cores = 1,
1393       .priority = get_panthor_group_priority(ctx),
1394       .queues = DRM_PANTHOR_OBJ_ARRAY(ARRAY_SIZE(qc), qc),
1395       .vm_id = pan_kmod_vm_handle(dev->kmod.vm),
1396    };
1397 
1398    int ret =
1399       drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_CREATE, &gc);
1400 
1401    if (ret)
1402       goto err_group_create;
1403 
1404    ctx->csf.group_handle = gc.group_handle;
1405 
1406    struct drm_panthor_group_destroy gd = {
1407       .group_handle = ctx->csf.group_handle,
1408    };
1409 
1410    /* Get tiler heap */
1411    struct drm_panthor_tiler_heap_create thc = {
1412       .vm_id = pan_kmod_vm_handle(dev->kmod.vm),
1413       .chunk_size = pan_screen(ctx->base.screen)->csf_tiler_heap.chunk_size,
1414       .initial_chunk_count =
1415          pan_screen(ctx->base.screen)->csf_tiler_heap.initial_chunks,
1416       .max_chunks = pan_screen(ctx->base.screen)->csf_tiler_heap.max_chunks,
1417       .target_in_flight = 65535,
1418    };
1419    ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE,
1420                   &thc);
1421 
1422    if (ret)
1423       goto err_tiler_heap;
1424 
1425    ctx->csf.heap.handle = thc.handle;
1426 
1427    struct drm_panthor_tiler_heap_destroy thd = {
1428       .handle = ctx->csf.heap.handle,
1429    };
1430 
1431    ctx->csf.heap.desc_bo =
1432       panfrost_bo_create(dev, pan_size(TILER_HEAP), 0, "Tiler Heap");
1433 
1434    if (ctx->csf.heap.desc_bo == NULL)
1435       goto err_tiler_heap_desc_bo;
1436 
1437    pan_cast_and_pack(ctx->csf.heap.desc_bo->ptr.cpu, TILER_HEAP, heap) {
1438       heap.size = pan_screen(ctx->base.screen)->csf_tiler_heap.chunk_size;
1439       heap.base = thc.first_heap_chunk_gpu_va;
1440       heap.bottom = heap.base + 64;
1441       heap.top = heap.base + heap.size;
1442    }
1443 
1444    ctx->csf.tmp_geom_bo = panfrost_bo_create(
1445       dev, POSITION_FIFO_SIZE, PAN_BO_INVISIBLE, "Temporary Geometry buffer");
1446 
1447    if (ctx->csf.tmp_geom_bo == NULL)
1448       goto err_tiler_heap_tmp_geom_bo;
1449 
1450    /* Setup the tiler heap */
1451    struct panfrost_bo *cs_bo =
1452       panfrost_bo_create(dev, 4096, 0, "Temporary CS buffer");
1453 
1454    if (cs_bo == NULL)
1455       goto err_tiler_heap_cs_bo;
1456 
1457    if (csf_oom_handler_init(ctx))
1458       goto err_g_submit;
1459 
1460    struct cs_buffer init_buffer = {
1461       .cpu = cs_bo->ptr.cpu,
1462       .gpu = cs_bo->ptr.gpu,
1463       .capacity = panfrost_bo_size(cs_bo) / sizeof(uint64_t),
1464    };
1465    const struct cs_builder_conf bconf = {
1466       .nr_registers = 96,
1467       .nr_kernel_registers = 4,
1468    };
1469    struct cs_builder b;
1470    cs_builder_init(&b, &bconf, init_buffer);
1471    struct cs_index heap = cs_reg64(&b, 72);
1472    cs_move64_to(&b, heap, thc.tiler_heap_ctx_gpu_va);
1473    cs_heap_set(&b, heap);
1474 
1475    struct cs_index addr_reg = cs_reg64(&b, 86);
1476    struct cs_index length_reg = cs_reg32(&b, 88);
1477    cs_move64_to(&b, addr_reg, ctx->csf.tiler_oom_handler.cs_bo->ptr.gpu);
1478    cs_move32_to(&b, length_reg, ctx->csf.tiler_oom_handler.length);
1479    cs_set_exception_handler(&b, MALI_CS_EXCEPTION_TYPE_TILER_OOM,
1480                             addr_reg, length_reg);
1481 
1482    struct drm_panthor_queue_submit qsubmit;
1483    struct drm_panthor_group_submit gsubmit;
1484    struct drm_panthor_sync_op sync = {
1485       .flags =
1486          DRM_PANTHOR_SYNC_OP_SIGNAL | DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ,
1487       .handle = ctx->syncobj,
1488    };
1489 
1490    assert(cs_is_valid(&b));
1491    cs_finish(&b);
1492 
1493    uint64_t cs_start = cs_root_chunk_gpu_addr(&b);
1494    uint32_t cs_size = cs_root_chunk_size(&b);
1495 
1496    csf_prepare_qsubmit(ctx, &qsubmit, 0, cs_start, cs_size, &sync, 1);
1497    csf_prepare_gsubmit(ctx, &gsubmit, &qsubmit, 1);
1498    ret = csf_submit_gsubmit(ctx, &gsubmit);
1499 
1500    if (dev->debug & PAN_DBG_TRACE) {
1501       uint32_t regs[256] = {0};
1502       pandecode_interpret_cs(dev->decode_ctx, qsubmit.stream_addr,
1503                              qsubmit.stream_size, panfrost_device_gpu_id(dev),
1504                              regs);
1505    }
1506 
1507    if (ret)
1508       goto err_g_submit;
1509 
1510    /* Wait before freeing the buffer. */
1511    ret = drmSyncobjWait(panfrost_device_fd(dev), &ctx->syncobj, 1, INT64_MAX, 0,
1512                         NULL);
1513    assert(!ret);
1514 
1515    panfrost_bo_unreference(cs_bo);
1516 
1517    ctx->csf.is_init = true;
1518    return 0;
1519 err_g_submit:
1520    panfrost_bo_unreference(cs_bo);
1521 err_tiler_heap_cs_bo:
1522    panfrost_bo_unreference(ctx->csf.tmp_geom_bo);
1523 err_tiler_heap_tmp_geom_bo:
1524    panfrost_bo_unreference(ctx->csf.heap.desc_bo);
1525 err_tiler_heap_desc_bo:
1526    drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY,
1527             &thd);
1528 err_tiler_heap:
1529    drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_DESTROY, &gd);
1530 err_group_create:
1531    return -1;
1532 }
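/* Context bring-up summary: create the scheduling group and the tiler heap,
 * fill the TILER_HEAP descriptor, then build a small one-shot command stream
 * that binds the heap context and registers the tiler-OOM exception handler.
 * That stream is submitted and waited on through ctx->syncobj before its
 * temporary buffer is released.
 */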
1533 
1534 void
1535 GENX(csf_cleanup_context)(struct panfrost_context *ctx)
1536 {
1537    if (!ctx->csf.is_init)
1538       return;
1539 
1540    struct panfrost_device *dev = pan_device(ctx->base.screen);
1541    struct drm_panthor_tiler_heap_destroy thd = {
1542       .handle = ctx->csf.heap.handle,
1543    };
1544    int ret;
1545 
1546    /* Make sure all jobs are done before destroying the heap. */
1547    ret = drmSyncobjWait(panfrost_device_fd(dev), &ctx->syncobj, 1, INT64_MAX, 0,
1548                         NULL);
1549    assert(!ret);
1550 
1551    ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY,
1552                   &thd);
1553    assert(!ret);
1554 
1555    struct drm_panthor_group_destroy gd = {
1556       .group_handle = ctx->csf.group_handle,
1557    };
1558 
1559    ret =
1560       drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_DESTROY, &gd);
1561    assert(!ret);
1562 
1563    panfrost_bo_unreference(ctx->csf.tmp_geom_bo);
1564    panfrost_bo_unreference(ctx->csf.heap.desc_bo);
1565    panfrost_bo_unreference(ctx->csf.tiler_oom_handler.cs_bo);
1566    panfrost_bo_unreference(ctx->csf.tiler_oom_handler.save_bo);
1567    ctx->csf.is_init = false;
1568 }
1569 
1570 void
1571 GENX(csf_emit_write_timestamp)(struct panfrost_batch *batch,
1572                                struct panfrost_resource *dst, unsigned offset)
1573 {
1574    struct cs_builder *b = batch->csf.cs.builder;
1575 
1576    struct cs_index address = cs_reg64(b, 40);
1577    cs_move64_to(b, address,
1578                 dst->image.data.base + dst->image.data.offset + offset);
1579    cs_store_state(b, address, 0, MALI_CS_STATE_TIMESTAMP, cs_now());
1580 
1581    panfrost_batch_write_rsrc(batch, dst, PIPE_SHADER_VERTEX);
1582 }
1583