/*
 * Copyright © 2024 Collabora Ltd.
 * Copyright © 2024 Arm Ltd.
 *
 * Derived from tu_cmd_buffer.c which is:
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * Copyright © 2015 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "genxml/gen_macros.h"

#include "panvk_buffer.h"
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_meta.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_meta.h"
#include "panvk_physical_device.h"

#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_props.h"

#include <vulkan/vulkan_core.h>

static VkResult
prepare_driver_set(struct panvk_cmd_buffer *cmdbuf)
{
   struct panvk_shader_desc_state *cs_desc_state =
      &cmdbuf->state.compute.cs.desc;

   if (!compute_state_dirty(cmdbuf, CS) &&
       !compute_state_dirty(cmdbuf, DESC_STATE))
      return VK_SUCCESS;

   const struct panvk_descriptor_state *desc_state =
      &cmdbuf->state.compute.desc_state;
   const struct panvk_shader *cs = cmdbuf->state.compute.shader;
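   /* The driver-internal set is one dummy sampler followed by the shader's
    * dynamic buffer descriptors, hence the +1 below. */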
   uint32_t desc_count = cs->desc_info.dyn_bufs.count + 1;
   struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
      cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
   struct panvk_opaque_desc *descs = driver_set.cpu;

   if (!driver_set.gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   /* Dummy sampler always comes first. */
   pan_cast_and_pack(&descs[0], SAMPLER, cfg) {
      cfg.clamp_integer_array_indices = false;
   }

   panvk_per_arch(cmd_fill_dyn_bufs)(desc_state, cs,
                                     (struct mali_buffer_packed *)(&descs[1]));

   cs_desc_state->driver_set.dev_addr = driver_set.gpu;
   cs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
   compute_state_set_dirty(cmdbuf, DESC_STATE);
   return VK_SUCCESS;
}

static void
calculate_task_axis_and_increment(const struct panvk_shader *shader,
                                  struct panvk_physical_device *phys_dev,
                                  unsigned *task_axis, unsigned *task_increment)
{
   /* Pick the task_axis and task_increment to maximize thread
    * utilization. */
   unsigned threads_per_wg =
      shader->local_size.x * shader->local_size.y * shader->local_size.z;
   unsigned max_thread_cnt = panfrost_compute_max_thread_count(
      &phys_dev->kmod.props, shader->info.work_reg_count);
   unsigned threads_per_task = threads_per_wg;
   unsigned local_size[3] = {
      shader->local_size.x,
      shader->local_size.y,
      shader->local_size.z,
   };

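   /* Example with hypothetical numbers: for a 4x4x4 workgroup (64 threads)
    * and max_thread_cnt = 1024, the X axis still fits (64 * 4 = 256 threads
    * per task), but Y hits the limit (256 * 4 = 1024), so we stop at axis Y
    * with task_increment = 1024 / 256 = 4. */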
   for (unsigned i = 0; i < 3; i++) {
      if (threads_per_task * local_size[i] >= max_thread_cnt) {
         /* We reached our thread limit, stop at the current axis and
          * calculate the increment so it doesn't exceed the per-core
          * thread capacity.
          */
         *task_increment = max_thread_cnt / threads_per_task;
         break;
      } else if (*task_axis == MALI_TASK_AXIS_Z) {
         /* We reached the Z axis, and there's still room to stuff more
          * threads. Pick the current axis grid size as our increment
          * as there's no point using something bigger.
          */
         *task_increment = local_size[i];
         break;
      }

      threads_per_task *= local_size[i];
      (*task_axis)++;
   }

   assert(*task_axis <= MALI_TASK_AXIS_Z);
   assert(*task_increment > 0);
}

static unsigned
calculate_workgroups_per_task(const struct panvk_shader *shader,
                              struct panvk_physical_device *phys_dev)
{
   /* Each shader core can run N tasks and a total of M threads at any single
    * time, thus each task should ideally have no more than M/N threads. */
   unsigned max_threads_per_task = phys_dev->kmod.props.max_threads_per_core /
                                   phys_dev->kmod.props.max_tasks_per_core;

   /* To achieve the best utilization, we should aim for as many workgroups
    * per task as we can fit without exceeding the above thread limit. */
   unsigned threads_per_wg =
      shader->local_size.x * shader->local_size.y * shader->local_size.z;
   assert(threads_per_wg > 0 &&
          threads_per_wg <= phys_dev->kmod.props.max_threads_per_wg);
   unsigned wg_per_task = DIV_ROUND_UP(max_threads_per_task, threads_per_wg);
   assert(wg_per_task > 0 && wg_per_task <= max_threads_per_task);
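   /* With hypothetical limits of max_threads_per_core = 2048 and
    * max_tasks_per_core = 8, max_threads_per_task is 256, so an 8x8x1
    * workgroup (64 threads) yields wg_per_task = 4. */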

   return wg_per_task;
}

static void
cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
{
   const struct panvk_shader *shader = cmdbuf->state.compute.shader;
   VkResult result;

   /* If there's no compute shader, we can skip the dispatch. */
   if (!panvk_priv_mem_dev_addr(shader->spd))
      return;

   struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(cmdbuf->vk.base.device->physical);
   struct panvk_descriptor_state *desc_state =
      &cmdbuf->state.compute.desc_state;
   struct panvk_shader_desc_state *cs_desc_state =
      &cmdbuf->state.compute.cs.desc;
   const struct cs_tracing_ctx *tracing_ctx =
      &cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].tracing;

   struct panfrost_ptr tsd = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
   if (!tsd.gpu)
      return;

   struct pan_tls_info tlsinfo = {
      .tls.size = shader->info.tls_size,
      .wls.size = shader->info.wls_size,
   };
   unsigned core_id_range;
   unsigned core_count =
      panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);

   bool indirect = info->indirect.buffer_dev_addr != 0;

   /* Only used for indirect dispatch */
   unsigned wg_per_task = 0;
   if (indirect)
      wg_per_task = calculate_workgroups_per_task(shader, phys_dev);

   if (tlsinfo.wls.size) {
      /* NOTE: If the instance count is lower than the number of workgroups
       * being dispatched, the HW will hold back workgroups until instances
       * can be reused. */
      /* NOTE: There is no benefit from allocating more instances than what
       * can concurrently be used by the HW */
      if (indirect) {
         /* Assume we utilize all shader cores to the max */
         tlsinfo.wls.instances = util_next_power_of_two(
            wg_per_task * phys_dev->kmod.props.max_tasks_per_core * core_count);
      } else {
         /* TODO: Similar to what we are doing for indirect, this should
          * change to calculate the maximum number of workgroups we can
          * execute concurrently. */
         struct pan_compute_dim dim = {
            info->direct.wg_count.x,
            info->direct.wg_count.y,
            info->direct.wg_count.z,
         };

         tlsinfo.wls.instances = pan_wls_instances(&dim);
      }

      /* TODO: Clamp WLS instance to some maximum WLS budget. */
      unsigned wls_total_size = pan_wls_adjust_size(tlsinfo.wls.size) *
                                tlsinfo.wls.instances * core_id_range;

      /* TODO: Reuse WLS allocation for all dispatch commands in the command
       * buffer, similar to what we do for TLS in draw. As WLS size (and
       * instance count) might differ significantly between dispatch commands,
       * rather than track a single maximum size, we might want to consider
       * multiple allocations for different size buckets. */
      tlsinfo.wls.ptr =
         panvk_cmd_alloc_dev_mem(cmdbuf, tls, wls_total_size, 4096).gpu;
      if (!tlsinfo.wls.ptr)
         return;
   }

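   /* TLS state is shared by all jobs in the command buffer: track the largest
    * per-thread TLS size seen so far, and allocate the shared descriptor the
    * first time it is needed. */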
   cmdbuf->state.tls.info.tls.size =
      MAX2(shader->info.tls_size, cmdbuf->state.tls.info.tls.size);

   if (!cmdbuf->state.tls.desc.gpu) {
      cmdbuf->state.tls.desc = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
      if (!cmdbuf->state.tls.desc.gpu)
         return;
   }

   GENX(pan_emit_tls)(&tlsinfo, tsd.cpu);

   if (compute_state_dirty(cmdbuf, DESC_STATE) ||
       compute_state_dirty(cmdbuf, CS)) {
      result = panvk_per_arch(cmd_prepare_push_descs)(
         cmdbuf, desc_state, shader->desc_info.used_set_mask);
      if (result != VK_SUCCESS)
         return;
   }

   panvk_per_arch(cmd_prepare_dispatch_sysvals)(cmdbuf, info);

   result = prepare_driver_set(cmdbuf);
   if (result != VK_SUCCESS)
      return;

   result = panvk_per_arch(cmd_prepare_push_uniforms)(
      cmdbuf, cmdbuf->state.compute.shader);
   if (result != VK_SUCCESS)
      return;

   if (compute_state_dirty(cmdbuf, CS) ||
       compute_state_dirty(cmdbuf, DESC_STATE)) {
      result = panvk_per_arch(cmd_prepare_shader_res_table)(
         cmdbuf, desc_state, shader, cs_desc_state);
      if (result != VK_SUCCESS)
         return;
   }

   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE);

   /* Copy the global TLS pointer to the per-job TSD. */
   if (tlsinfo.tls.size) {
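      /* Both descriptors are LOCAL_STORAGE descriptors: read the 64-bit TLS
       * base pointer at offset 8 from the command-buffer-wide descriptor and
       * write it at the same offset in this dispatch's TSD, with LS waits
       * ordering the load and the store. */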
      cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.tls.desc.gpu);
      cs_load64_to(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
      cs_wait_slot(b, SB_ID(LS), false);
      cs_move64_to(b, cs_scratch_reg64(b, 0), tsd.gpu);
      cs_store64(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
      cs_wait_slot(b, SB_ID(LS), false);
   }

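   /* Update the compute shader registers consumed by the run: resource table,
    * FAU pointer, shader program, TSD, and the workgroup size/offset/count
    * registers filled below. */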
   cs_update_compute_ctx(b) {
      if (compute_state_dirty(cmdbuf, CS) ||
          compute_state_dirty(cmdbuf, DESC_STATE))
         cs_move64_to(b, cs_sr_reg64(b, 0), cs_desc_state->res_table);

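      /* The FAU count (shader->fau.total_count) is packed into the top byte
       * of the push-uniform pointer. */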
      if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) {
         uint64_t fau_ptr = cmdbuf->state.compute.push_uniforms |
                            ((uint64_t)shader->fau.total_count << 56);
         cs_move64_to(b, cs_sr_reg64(b, 8), fau_ptr);
      }

      if (compute_state_dirty(cmdbuf, CS))
         cs_move64_to(b, cs_sr_reg64(b, 16),
                      panvk_priv_mem_dev_addr(shader->spd));

      cs_move64_to(b, cs_sr_reg64(b, 24), tsd.gpu);

      /* Global attribute offset */
      cs_move32_to(b, cs_sr_reg32(b, 32), 0);

      struct mali_compute_size_workgroup_packed wg_size;
      pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
         cfg.workgroup_size_x = shader->local_size.x;
         cfg.workgroup_size_y = shader->local_size.y;
         cfg.workgroup_size_z = shader->local_size.z;
         cfg.allow_merging_workgroups = false;
      }
      cs_move32_to(b, cs_sr_reg32(b, 33), wg_size.opaque[0]);
      cs_move32_to(b, cs_sr_reg32(b, 34),
                   info->wg_base.x * shader->local_size.x);
      cs_move32_to(b, cs_sr_reg32(b, 35),
                   info->wg_base.y * shader->local_size.y);
      cs_move32_to(b, cs_sr_reg32(b, 36),
                   info->wg_base.z * shader->local_size.z);
      if (indirect) {
         /* Load parameters from indirect buffer and update workgroup count
          * registers and sysvals */
         cs_move64_to(b, cs_scratch_reg64(b, 0),
                      info->indirect.buffer_dev_addr);
         cs_load_to(b, cs_sr_reg_tuple(b, 37, 3), cs_scratch_reg64(b, 0),
                    BITFIELD_MASK(3), 0);
         cs_move64_to(b, cs_scratch_reg64(b, 0),
                      cmdbuf->state.compute.push_uniforms);
         cs_wait_slot(b, SB_ID(LS), false);

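         /* The workgroup counts are only known at execution time, so patch
          * the num_work_groups sysvals in the push-uniform buffer with the
          * values just loaded from the indirect buffer. */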
         if (shader_uses_sysval(shader, compute, num_work_groups.x)) {
            cs_store32(b, cs_sr_reg32(b, 37), cs_scratch_reg64(b, 0),
                       shader_remapped_sysval_offset(
                          shader, sysval_offset(compute, num_work_groups.x)));
         }

         if (shader_uses_sysval(shader, compute, num_work_groups.y)) {
            cs_store32(b, cs_sr_reg32(b, 38), cs_scratch_reg64(b, 0),
                       shader_remapped_sysval_offset(
                          shader, sysval_offset(compute, num_work_groups.y)));
         }

         if (shader_uses_sysval(shader, compute, num_work_groups.z)) {
            cs_store32(b, cs_sr_reg32(b, 39), cs_scratch_reg64(b, 0),
                       shader_remapped_sysval_offset(
                          shader, sysval_offset(compute, num_work_groups.z)));
         }

         cs_wait_slot(b, SB_ID(LS), false);
      } else {
         cs_move32_to(b, cs_sr_reg32(b, 37), info->direct.wg_count.x);
         cs_move32_to(b, cs_sr_reg32(b, 38), info->direct.wg_count.y);
         cs_move32_to(b, cs_sr_reg32(b, 39), info->direct.wg_count.z);
      }
   }

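   /* Select the iteration scoreboard slot the run will signal; the deferred
    * syncobj increment emitted below waits on it. */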
   panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE);

   cs_req_res(b, CS_COMPUTE_RES);
   if (indirect) {
      /* Use run_compute with a set task axis instead of run_compute_indirect as
       * run_compute_indirect has been found to cause intermittent hangs. This
       * is safe, as the task increment will be clamped by the job size along
       * the specified axis.
       * The chosen task axis is potentially suboptimal, as choosing good
       * increment/axis parameters requires knowledge of job dimensions, but
       * this is somewhat offset by run_compute being a native instruction. */
      unsigned task_axis = MALI_TASK_AXIS_X;
      cs_trace_run_compute(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
                           wg_per_task, task_axis, false,
                           cs_shader_res_sel(0, 0, 0, 0));
   } else {
      unsigned task_axis = MALI_TASK_AXIS_X;
      unsigned task_increment = 0;
      calculate_task_axis_and_increment(shader, phys_dev, &task_axis,
                                        &task_increment);
      cs_trace_run_compute(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
                           task_increment, task_axis, false,
                           cs_shader_res_sel(0, 0, 0, 0));
   }
   cs_req_res(b, 0);

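   /* Queue a deferred +1 on the compute subqueue syncobj, released once the
    * scoreboard slot of the current iteration signals. */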
   struct cs_index sync_addr = cs_scratch_reg64(b, 0);
   struct cs_index iter_sb = cs_scratch_reg32(b, 2);
   struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
   struct cs_index add_val = cs_scratch_reg64(b, 4);

   cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
              BITFIELD_MASK(3),
              offsetof(struct panvk_cs_subqueue_context, syncobjs));
   cs_wait_slot(b, SB_ID(LS), false);

   cs_add64(b, sync_addr, sync_addr,
            PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64));
   cs_move64_to(b, add_val, 1);

   cs_match(b, iter_sb, cmp_scratch) {
#define CASE(x)                                                                \
   cs_case(b, x) {                                                             \
      cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,       \
                    cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)));          \
      cs_move32_to(b, iter_sb, next_iter_sb(x));                               \
   }

      CASE(0)
      CASE(1)
      CASE(2)
      CASE(3)
      CASE(4)
#undef CASE
   }

   cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
              offsetof(struct panvk_cs_subqueue_context, iter_sb));
   cs_wait_slot(b, SB_ID(LS), false);

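   /* Remember that one more syncobj signal has been queued on the compute
    * subqueue. */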
   ++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point;
   clear_dirty_after_dispatch(cmdbuf);
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
                                uint32_t baseGroupX, uint32_t baseGroupY,
                                uint32_t baseGroupZ, uint32_t groupCountX,
                                uint32_t groupCountY, uint32_t groupCountZ)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   struct panvk_dispatch_info info = {
      .wg_base = {baseGroupX, baseGroupY, baseGroupZ},
      .direct.wg_count = {groupCountX, groupCountY, groupCountZ},
   };
   cmd_dispatch(cmdbuf, &info);
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDispatchIndirect)(VkCommandBuffer commandBuffer,
                                    VkBuffer _buffer, VkDeviceSize offset)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   VK_FROM_HANDLE(panvk_buffer, buffer, _buffer);
   uint64_t buffer_gpu = panvk_buffer_gpu_ptr(buffer, offset);
   struct panvk_dispatch_info info = {
      .indirect.buffer_dev_addr = buffer_gpu,
   };
   cmd_dispatch(cmdbuf, &info);
}