/*
 * Copyright © 2024 Collabora Ltd.
 * Copyright © 2024 Arm Ltd.
 *
 * SPDX-License-Identifier: MIT
 */

#include "panvk_cmd_buffer.h"
#include "panvk_device.h"

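/* Register permission callback for the tiler OOM handler: everything is
 * read-write except the bbox registers owned by the fragment subqueue and
 * the subqueue context range, which must stay read-only. */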
static enum cs_reg_perm
tiler_oom_reg_perm_cb(struct cs_builder *b, unsigned reg)
{
   switch (reg) {
   /* The bbox is set up by the fragment subqueue; we should not modify it. */
   case 42:
   case 43:
   /* We should only load from the subqueue context. */
   case PANVK_CS_REG_SUBQUEUE_CTX_START:
   case PANVK_CS_REG_SUBQUEUE_CTX_END:
      return CS_REG_RD;
   }
   return CS_REG_RW;
}

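/* Emit the tiler OOM (heap exhaustion) exception handler for one framebuffer
 * configuration. A handler is generated per (has_zs_ext, rt_count)
 * combination because the per-layer FBD stride baked into the handler
 * depends on the FBD size. Returns the handler size in bytes and reports the
 * register dump region size through dump_region_size. */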
static size_t
generate_tiler_oom_handler(struct cs_buffer handler_mem, bool has_zs_ext,
                           uint32_t rt_count, bool tracing_enabled,
                           uint32_t *dump_region_size)
{
   assert(rt_count >= 1 && rt_count <= MAX_RTS);
   uint32_t fbd_size = get_fbd_size(has_zs_ext, rt_count);

   struct cs_builder b;
   struct cs_builder_conf conf = {
      .nr_registers = 96,
      .nr_kernel_registers = 4,
      .reg_perm = tiler_oom_reg_perm_cb,
   };
   cs_builder_init(&b, &conf, handler_mem);

   struct cs_exception_handler handler;
   struct cs_exception_handler_ctx handler_ctx = {
      .ctx_reg = cs_subqueue_ctx_reg(&b),
      .dump_addr_offset = TILER_OOM_CTX_FIELD_OFFSET(reg_dump_addr),
      .ls_sb_slot = SB_ID(LS),
   };
   struct cs_tracing_ctx tracing_ctx = {
      .enabled = tracing_enabled,
      .ctx_reg = cs_subqueue_ctx_reg(&b),
      .tracebuf_addr_offset =
         offsetof(struct panvk_cs_subqueue_context, debug.tracebuf.cs),
      .ls_sb_slot = SB_ID(LS),
   };

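   /* Handler body: pick the FIRST or MIDDLE FBD depending on whether an
    * incremental render already happened, run one fragment job per layer to
    * flush what has been tiled so far, recycle the completed heap chunks,
    * then flush caches so later preloads observe the rendered content. */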
   cs_exception_handler_def(&b, &handler, handler_ctx) {
      struct cs_index subqueue_ctx = cs_subqueue_ctx_reg(&b);
      struct cs_index zero = cs_scratch_reg64(&b, 0);
      /* Have flush_id read part of the double zero register */
      struct cs_index flush_id = cs_scratch_reg32(&b, 0);
      struct cs_index completed_chunks = cs_scratch_reg_tuple(&b, 2, 4);
      struct cs_index completed_top = cs_scratch_reg64(&b, 2);
      struct cs_index completed_bottom = cs_scratch_reg64(&b, 4);
      struct cs_index counter = cs_scratch_reg32(&b, 6);
      struct cs_index layer_count = cs_scratch_reg32(&b, 7);

      /* The tiler pointer is pre-filled. */
      struct cs_index tiler_ptr = cs_sr_reg64(&b, 38);
      struct cs_index fbd_ptr = cs_sr_reg64(&b, 40);

      /* Use a different framebuffer descriptor depending on whether
       * incremental rendering has already been triggered. */
      cs_load32_to(&b, counter, subqueue_ctx,
                   TILER_OOM_CTX_FIELD_OFFSET(counter));
      cs_wait_slot(&b, SB_ID(LS), false);

      cs_if(&b, MALI_CS_CONDITION_GREATER, counter)
         cs_load64_to(&b, fbd_ptr, subqueue_ctx,
                      TILER_OOM_CTX_FBDPTR_OFFSET(MIDDLE));
      cs_else(&b)
         cs_load64_to(&b, fbd_ptr, subqueue_ctx,
                      TILER_OOM_CTX_FBDPTR_OFFSET(FIRST));

      cs_load32_to(&b, layer_count, subqueue_ctx,
                   TILER_OOM_CTX_FIELD_OFFSET(layer_count));
      cs_wait_slot(&b, SB_ID(LS), false);

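      /* Run one fragment job per layer; the per-layer FBDs are laid out
       * back-to-back, fbd_size bytes apart. Fragment resources are requested
       * for the duration of the loop and released right after. */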
      cs_req_res(&b, CS_FRAG_RES);
      cs_while(&b, MALI_CS_CONDITION_GREATER, layer_count) {
         cs_trace_run_fragment(&b, &tracing_ctx,
                               cs_scratch_reg_tuple(&b, 8, 4), false,
                               MALI_TILE_RENDER_ORDER_Z_ORDER, false);
         cs_add32(&b, layer_count, layer_count, -1);
         cs_add64(&b, fbd_ptr, fbd_ptr, fbd_size);
      }
      cs_req_res(&b, 0);
      /* Wait for all iter scoreboards for simplicity. */
      cs_wait_slots(&b, SB_ALL_ITERS_MASK, false);

      /* Increment counter */
      cs_add32(&b, counter, counter, 1);
      cs_store32(&b, counter, subqueue_ctx,
                 TILER_OOM_CTX_FIELD_OFFSET(counter));

      /* Reuse layer_count reg for td_count */
      struct cs_index td_count = layer_count;
      cs_load32_to(&b, td_count, subqueue_ctx,
                   TILER_OOM_CTX_FIELD_OFFSET(td_count));
      cs_move64_to(&b, zero, 0);
      cs_wait_slot(&b, SB_ID(LS), false);

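      /* For each tiler descriptor, hand the completed chunk range back to the
       * heap through cs_finish_fragment(), then clear the polygon list
       * pointer and the completed top/bottom fields so the tiler context is
       * ready for further tiling. */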
      cs_while(&b, MALI_CS_CONDITION_GREATER, td_count) {
         /* Load completed chunks */
         cs_load_to(&b, completed_chunks, tiler_ptr, BITFIELD_MASK(4), 10 * 4);
         cs_wait_slot(&b, SB_ID(LS), false);

         cs_finish_fragment(&b, false, completed_top, completed_bottom,
                            cs_now());

         /* Zero out polygon list, completed_top and completed_bottom */
         cs_store64(&b, zero, tiler_ptr, 0);
         cs_store64(&b, zero, tiler_ptr, 10 * 4);
         cs_store64(&b, zero, tiler_ptr, 12 * 4);

         cs_add64(&b, tiler_ptr, tiler_ptr, pan_size(TILER_CONTEXT));
         cs_add32(&b, td_count, td_count, -1);
      }

      /* We need to flush the texture caches so future preloads see the new
       * content. */
      cs_flush_caches(&b, MALI_CS_FLUSH_MODE_NONE, MALI_CS_FLUSH_MODE_NONE,
                      true, flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH)));

      cs_wait_slot(&b, SB_ID(IMM_FLUSH), false);
   }

   assert(cs_is_valid(&b));
   cs_finish(&b);
   *dump_region_size = handler.dump_size;

   return handler.length * sizeof(uint64_t);
}

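/* Upper bound, in bytes, on a single generated handler; the backing BO below
 * is sized for one slot of this size per (zs_ext, rt_count) combination. */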
#define TILER_OOM_HANDLER_MAX_SIZE 512

VkResult
panvk_per_arch(init_tiler_oom)(struct panvk_device *device)
{
   struct panvk_instance *instance =
      to_panvk_instance(device->vk.physical->instance);
   bool tracing_enabled = instance->debug_flags & PANVK_DEBUG_TRACE;
   VkResult result = panvk_priv_bo_create(
      device, TILER_OOM_HANDLER_MAX_SIZE * 2 * MAX_RTS, 0,
      VK_SYSTEM_ALLOCATION_SCOPE_DEVICE, &device->tiler_oom.handlers_bo);
   if (result != VK_SUCCESS)
      return result;

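   /* Generate one handler per framebuffer configuration: with and without a
    * ZS extension, for every render target count. Handlers are packed at
    * handler_stride intervals and are all expected to have the same length
    * and dump region size (see the asserts below). */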
   for (uint32_t zs_ext = 0; zs_ext <= 1; zs_ext++) {
      for (uint32_t rt_count = 1; rt_count <= MAX_RTS; rt_count++) {
         uint32_t idx = get_tiler_oom_handler_idx(zs_ext, rt_count);
         /* Check that we have calculated a handler_stride if we need it to
          * offset addresses. */
         assert(idx == 0 || device->tiler_oom.handler_stride != 0);
         size_t offset = idx * device->tiler_oom.handler_stride;

         struct cs_buffer handler_mem = {
            .cpu = device->tiler_oom.handlers_bo->addr.host + offset,
            .gpu = device->tiler_oom.handlers_bo->addr.dev + offset,
            .capacity = TILER_OOM_HANDLER_MAX_SIZE / sizeof(uint64_t),
         };

         uint32_t dump_region_size;
         size_t handler_length = generate_tiler_oom_handler(
            handler_mem, zs_ext, rt_count, tracing_enabled, &dump_region_size);

         /* All handlers must have the same length */
         assert(idx == 0 || handler_length == device->tiler_oom.handler_stride);
         assert(idx == 0 ||
                dump_region_size == device->tiler_oom.dump_region_size);
         device->tiler_oom.handler_stride = handler_length;
         device->tiler_oom.dump_region_size = dump_region_size;
      }
   }

   return result;
}