/*
 * Copyright © 2024 Collabora Ltd.
 * Copyright © 2024 Arm Ltd.
 *
 * SPDX-License-Identifier: MIT
 */

#include "panvk_cmd_buffer.h"
#include "panvk_device.h"
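
/* Register permissions for the code emitted into the tiler OOM handler: the
 * bounding-box registers (42/43) owned by the fragment subqueue and the
 * subqueue context registers are read-only, everything else stays
 * read-write. */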
static enum cs_reg_perm
tiler_oom_reg_perm_cb(struct cs_builder *b, unsigned reg)
{
   switch (reg) {
   /* The bbox is set up by the fragment subqueue, we should not modify it. */
   case 42:
   case 43:
   /* We should only load from the subqueue context. */
   case PANVK_CS_REG_SUBQUEUE_CTX_START:
   case PANVK_CS_REG_SUBQUEUE_CTX_END:
      return CS_REG_RD;
   }
   return CS_REG_RW;
}
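
/*
 * Build the CS exception handler that runs when the tiler heap is exhausted.
 * It flushes the geometry binned so far by running one fragment job per layer
 * (incremental rendering), hands the completed chunk list of every tiler
 * descriptor back through cs_finish_fragment() and resets it, then flushes
 * caches so later preloads see the partial render. Returns the handler size
 * in bytes and reports the required register dump region size through
 * dump_region_size.
 */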
static size_t
generate_tiler_oom_handler(struct cs_buffer handler_mem, bool has_zs_ext,
                           uint32_t rt_count, bool tracing_enabled,
                           uint32_t *dump_region_size)
{
   assert(rt_count >= 1 && rt_count <= MAX_RTS);
   uint32_t fbd_size = get_fbd_size(has_zs_ext, rt_count);

   struct cs_builder b;
   struct cs_builder_conf conf = {
      .nr_registers = 96,
      .nr_kernel_registers = 4,
      .reg_perm = tiler_oom_reg_perm_cb,
   };
   cs_builder_init(&b, &conf, handler_mem);
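
   /* Both contexts are addressed relative to the subqueue context register:
    * the exception handler dumps registers at the address stored in the
    * tiler OOM context's reg_dump_addr field, and tracing (when enabled)
    * appends to the per-subqueue CS trace buffer. */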
   struct cs_exception_handler handler;
   struct cs_exception_handler_ctx handler_ctx = {
      .ctx_reg = cs_subqueue_ctx_reg(&b),
      .dump_addr_offset = TILER_OOM_CTX_FIELD_OFFSET(reg_dump_addr),
      .ls_sb_slot = SB_ID(LS),
   };
   struct cs_tracing_ctx tracing_ctx = {
      .enabled = tracing_enabled,
      .ctx_reg = cs_subqueue_ctx_reg(&b),
      .tracebuf_addr_offset =
         offsetof(struct panvk_cs_subqueue_context, debug.tracebuf.cs),
      .ls_sb_slot = SB_ID(LS),
   };

   cs_exception_handler_def(&b, &handler, handler_ctx) {
      struct cs_index subqueue_ctx = cs_subqueue_ctx_reg(&b);
      struct cs_index zero = cs_scratch_reg64(&b, 0);
      /* flush_id aliases the low 32 bits of the 64-bit zero register. */
      struct cs_index flush_id = cs_scratch_reg32(&b, 0);
      struct cs_index completed_chunks = cs_scratch_reg_tuple(&b, 2, 4);
      struct cs_index completed_top = cs_scratch_reg64(&b, 2);
      struct cs_index completed_bottom = cs_scratch_reg64(&b, 4);
      struct cs_index counter = cs_scratch_reg32(&b, 6);
      struct cs_index layer_count = cs_scratch_reg32(&b, 7);

      /* The tiler pointer is pre-filled. */
      struct cs_index tiler_ptr = cs_sr_reg64(&b, 38);
      struct cs_index fbd_ptr = cs_sr_reg64(&b, 40);

      /* Use a different framebuffer descriptor depending on whether
       * incremental rendering has already been triggered. */
      cs_load32_to(&b, counter, subqueue_ctx,
                   TILER_OOM_CTX_FIELD_OFFSET(counter));
      cs_wait_slot(&b, SB_ID(LS), false);

      cs_if(&b, MALI_CS_CONDITION_GREATER, counter)
         cs_load64_to(&b, fbd_ptr, subqueue_ctx,
                      TILER_OOM_CTX_FBDPTR_OFFSET(MIDDLE));
      cs_else(&b)
         cs_load64_to(&b, fbd_ptr, subqueue_ctx,
                      TILER_OOM_CTX_FBDPTR_OFFSET(FIRST));

      cs_load32_to(&b, layer_count, subqueue_ctx,
                   TILER_OOM_CTX_FIELD_OFFSET(layer_count));
      cs_wait_slot(&b, SB_ID(LS), false);
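
      /* Flush the geometry binned so far: run one fragment job per layer,
       * stepping fbd_ptr to the next layer's FBD on every iteration. */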
      cs_req_res(&b, CS_FRAG_RES);
      cs_while(&b, MALI_CS_CONDITION_GREATER, layer_count) {
         cs_trace_run_fragment(&b, &tracing_ctx,
                               cs_scratch_reg_tuple(&b, 8, 4), false,
                               MALI_TILE_RENDER_ORDER_Z_ORDER, false);
         cs_add32(&b, layer_count, layer_count, -1);
         cs_add64(&b, fbd_ptr, fbd_ptr, fbd_size);
      }
      cs_req_res(&b, 0);
      /* Wait for all iter scoreboards for simplicity. */
      cs_wait_slots(&b, SB_ALL_ITERS_MASK, false);

      /* Increment the counter so the next OOM invocation knows incremental
       * rendering has already been triggered. */
      cs_add32(&b, counter, counter, 1);
      cs_store32(&b, counter, subqueue_ctx,
                 TILER_OOM_CTX_FIELD_OFFSET(counter));

      /* Reuse layer_count reg for td_count */
      struct cs_index td_count = layer_count;
      cs_load32_to(&b, td_count, subqueue_ctx,
                   TILER_OOM_CTX_FIELD_OFFSET(td_count));
      cs_move64_to(&b, zero, 0);
      cs_wait_slot(&b, SB_ID(LS), false);
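
      /* For every tiler descriptor, hand the completed_top/completed_bottom
       * chunk range to cs_finish_fragment(), then zero the polygon list and
       * the completed-chunk words so binning can resume from a clean state. */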
      cs_while(&b, MALI_CS_CONDITION_GREATER, td_count) {
         /* Load completed chunks */
         cs_load_to(&b, completed_chunks, tiler_ptr, BITFIELD_MASK(4), 10 * 4);
         cs_wait_slot(&b, SB_ID(LS), false);

         cs_finish_fragment(&b, false, completed_top, completed_bottom,
                            cs_now());

         /* Zero out polygon list, completed_top and completed_bottom */
         cs_store64(&b, zero, tiler_ptr, 0);
         cs_store64(&b, zero, tiler_ptr, 10 * 4);
         cs_store64(&b, zero, tiler_ptr, 12 * 4);

         cs_add64(&b, tiler_ptr, tiler_ptr, pan_size(TILER_CONTEXT));
         cs_add32(&b, td_count, td_count, -1);
      }

      /* We need to flush the texture caches so future preloads see the new
       * content. */
      cs_flush_caches(&b, MALI_CS_FLUSH_MODE_NONE, MALI_CS_FLUSH_MODE_NONE,
                      true, flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH)));

      cs_wait_slot(&b, SB_ID(IMM_FLUSH), false);
   }

   assert(cs_is_valid(&b));
   cs_finish(&b);
   *dump_region_size = handler.dump_size;

   return handler.length * sizeof(uint64_t);
}

#define TILER_OOM_HANDLER_MAX_SIZE 512
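
/*
 * Pre-generate one tiler OOM handler per (zs_ext, rt_count) combination at
 * device init. All variants live in a single BO and share a common stride,
 * so a handler can be located at handlers_bo + idx * handler_stride.
 */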
VkResult
panvk_per_arch(init_tiler_oom)(struct panvk_device *device)
{
   struct panvk_instance *instance =
      to_panvk_instance(device->vk.physical->instance);
   bool tracing_enabled = instance->debug_flags & PANVK_DEBUG_TRACE;
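
   /* One slot per (zs_ext, rt_count) pair, each capped at
    * TILER_OOM_HANDLER_MAX_SIZE bytes. */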
   VkResult result = panvk_priv_bo_create(
      device, TILER_OOM_HANDLER_MAX_SIZE * 2 * MAX_RTS, 0,
      VK_SYSTEM_ALLOCATION_SCOPE_DEVICE, &device->tiler_oom.handlers_bo);
   if (result != VK_SUCCESS)
      return result;

   for (uint32_t zs_ext = 0; zs_ext <= 1; zs_ext++) {
      for (uint32_t rt_count = 1; rt_count <= MAX_RTS; rt_count++) {
         uint32_t idx = get_tiler_oom_handler_idx(zs_ext, rt_count);
         /* Check that we have calculated a handler_stride if we need it to
          * offset addresses. */
         assert(idx == 0 || device->tiler_oom.handler_stride != 0);
         size_t offset = idx * device->tiler_oom.handler_stride;

         struct cs_buffer handler_mem = {
            .cpu = device->tiler_oom.handlers_bo->addr.host + offset,
            .gpu = device->tiler_oom.handlers_bo->addr.dev + offset,
            .capacity = TILER_OOM_HANDLER_MAX_SIZE / sizeof(uint64_t),
         };

         uint32_t dump_region_size;
         size_t handler_length = generate_tiler_oom_handler(
            handler_mem, zs_ext, rt_count, tracing_enabled, &dump_region_size);

         /* All handlers must have the same length */
         assert(idx == 0 || handler_length == device->tiler_oom.handler_stride);
         assert(idx == 0 ||
                dump_region_size == device->tiler_oom.dump_region_size);
         device->tiler_oom.handler_stride = handler_length;
         device->tiler_oom.dump_region_size = dump_region_size;
      }
   }

   return result;
}
186