/*
 * Copyright © 2024 Collabora Ltd.
 * Copyright © 2024 Arm Ltd.
 *
 * Derived from tu_cmd_buffer.c which is:
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * Copyright © 2015 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "genxml/gen_macros.h"

#include "panvk_buffer.h"
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_meta.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_meta.h"
#include "panvk_physical_device.h"

#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_props.h"

#include <vulkan/vulkan_core.h>

static VkResult
prepare_driver_set(struct panvk_cmd_buffer *cmdbuf)
{
   struct panvk_shader_desc_state *cs_desc_state =
      &cmdbuf->state.compute.cs.desc;

   if (!compute_state_dirty(cmdbuf, CS) &&
       !compute_state_dirty(cmdbuf, DESC_STATE))
      return VK_SUCCESS;

   const struct panvk_descriptor_state *desc_state =
      &cmdbuf->state.compute.desc_state;
   const struct panvk_shader *cs = cmdbuf->state.compute.shader;
   uint32_t desc_count = cs->desc_info.dyn_bufs.count + 1;
   struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
      cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
   struct panvk_opaque_desc *descs = driver_set.cpu;

   if (!driver_set.gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

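   /* The internal driver set written below holds one dummy sampler
    * descriptor followed by the shader's dynamic buffer descriptors,
    * matching the desc_count = dyn_bufs.count + 1 allocation above. */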
   /* Dummy sampler always comes first. */
   pan_cast_and_pack(&descs[0], SAMPLER, cfg) {
      cfg.clamp_integer_array_indices = false;
   }

   panvk_per_arch(cmd_fill_dyn_bufs)(desc_state, cs,
                                     (struct mali_buffer_packed *)(&descs[1]));

   cs_desc_state->driver_set.dev_addr = driver_set.gpu;
   cs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
   compute_state_set_dirty(cmdbuf, DESC_STATE);
   return VK_SUCCESS;
}

static void
calculate_task_axis_and_increment(const struct panvk_shader *shader,
                                  struct panvk_physical_device *phys_dev,
                                  unsigned *task_axis, unsigned *task_increment)
{
   /* Pick the task_axis and task_increment to maximize thread
    * utilization. */
   unsigned threads_per_wg =
      shader->local_size.x * shader->local_size.y * shader->local_size.z;
   unsigned max_thread_cnt = panfrost_compute_max_thread_count(
      &phys_dev->kmod.props, shader->info.work_reg_count);
   unsigned threads_per_task = threads_per_wg;
   unsigned local_size[3] = {
      shader->local_size.x,
      shader->local_size.y,
      shader->local_size.z,
   };

   for (unsigned i = 0; i < 3; i++) {
      if (threads_per_task * local_size[i] >= max_thread_cnt) {
         /* We reached our thread limit, so stop at the current axis and
          * calculate the increment so it doesn't exceed the per-core
          * thread capacity.
          */
         *task_increment = max_thread_cnt / threads_per_task;
         break;
      } else if (*task_axis == MALI_TASK_AXIS_Z) {
         /* We reached the Z axis, and there's still room to stuff more
          * threads. Pick the current axis grid size as our increment
          * as there's no point using something bigger.
          */
         *task_increment = local_size[i];
         break;
      }

      threads_per_task *= local_size[i];
      (*task_axis)++;
   }

   assert(*task_axis <= MALI_TASK_AXIS_Z);
   assert(*task_increment > 0);
}
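
/* Worked example for calculate_task_axis_and_increment() (illustrative
 * numbers, not from a real device): with local_size = (8, 8, 1) and
 * max_thread_cnt = 512, threads_per_task starts at 64; on the X axis
 * 64 * 8 >= 512, so the loop stops with task_axis = X and
 * task_increment = 512 / 64 = 8. */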

static unsigned
calculate_workgroups_per_task(const struct panvk_shader *shader,
                              struct panvk_physical_device *phys_dev)
{
   /* Each shader core can run N tasks and a total of M threads at any single
    * time, thus each task should ideally have no more than M/N threads. */
   unsigned max_threads_per_task = phys_dev->kmod.props.max_threads_per_core /
                                   phys_dev->kmod.props.max_tasks_per_core;

   /* To achieve the best utilization, we should aim for as many workgroups
    * per task as we can fit without exceeding the above thread limit. */
   unsigned threads_per_wg =
      shader->local_size.x * shader->local_size.y * shader->local_size.z;
   assert(threads_per_wg > 0 &&
          threads_per_wg <= phys_dev->kmod.props.max_threads_per_wg);
   unsigned wg_per_task = DIV_ROUND_UP(max_threads_per_task, threads_per_wg);
   assert(wg_per_task > 0 && wg_per_task <= max_threads_per_task);

   return wg_per_task;
}
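
/* Worked example for calculate_workgroups_per_task() (illustrative numbers):
 * with max_threads_per_core = 2048, max_tasks_per_core = 8 and
 * local_size = (8, 8, 1), threads_per_wg is 64, max_threads_per_task is
 * 2048 / 8 = 256, and wg_per_task = DIV_ROUND_UP(256, 64) = 4. */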

static void
cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
{
   const struct panvk_shader *shader = cmdbuf->state.compute.shader;
   VkResult result;

   /* If there's no compute shader, we can skip the dispatch. */
   if (!panvk_priv_mem_dev_addr(shader->spd))
      return;

   struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(cmdbuf->vk.base.device->physical);
   struct panvk_descriptor_state *desc_state =
      &cmdbuf->state.compute.desc_state;
   struct panvk_shader_desc_state *cs_desc_state =
      &cmdbuf->state.compute.cs.desc;
   const struct cs_tracing_ctx *tracing_ctx =
      &cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].tracing;

   struct panfrost_ptr tsd = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
   if (!tsd.gpu)
      return;

   struct pan_tls_info tlsinfo = {
      .tls.size = shader->info.tls_size,
      .wls.size = shader->info.wls_size,
   };
   unsigned core_id_range;
   unsigned core_count =
      panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);

   bool indirect = info->indirect.buffer_dev_addr != 0;

   /* Only used for indirect dispatch */
   unsigned wg_per_task = 0;
   if (indirect)
      wg_per_task = calculate_workgroups_per_task(shader, phys_dev);

   if (tlsinfo.wls.size) {
      /* NOTE: If the instance count is lower than the number of workgroups
       * being dispatched, the HW will hold back workgroups until instances
       * can be reused. */
      /* NOTE: There is no benefit from allocating more instances than what
       * can concurrently be used by the HW. */
      if (indirect) {
         /* Assume we utilize all shader cores to the max. */
         tlsinfo.wls.instances = util_next_power_of_two(
            wg_per_task * phys_dev->kmod.props.max_tasks_per_core * core_count);
      } else {
         /* TODO: Similar to what we are doing for indirect, this should change
          * to calculate the maximum number of workgroups we can execute
          * concurrently. */
         struct pan_compute_dim dim = {
            info->direct.wg_count.x,
            info->direct.wg_count.y,
            info->direct.wg_count.z,
         };

         tlsinfo.wls.instances = pan_wls_instances(&dim);
      }

      /* TODO: Clamp WLS instance to some maximum WLS budget. */
      unsigned wls_total_size = pan_wls_adjust_size(tlsinfo.wls.size) *
                                tlsinfo.wls.instances * core_id_range;
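
      /* Illustrative sizing example (made-up numbers): with an adjusted
       * wls.size of 4 KiB, 64 instances and a core_id_range of 10, this
       * reserves 4 KiB * 64 * 10 = 2.5 MiB of workgroup local storage for
       * the dispatch. */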

      /* TODO: Reuse WLS allocation for all dispatch commands in the command
       * buffer, similar to what we do for TLS in draw. As WLS size (and
       * instance count) might differ significantly between dispatch commands,
       * rather than track a single maximum size, we might want to consider
       * multiple allocations for different size buckets. */
      tlsinfo.wls.ptr =
         panvk_cmd_alloc_dev_mem(cmdbuf, tls, wls_total_size, 4096).gpu;
      if (!tlsinfo.wls.ptr)
         return;
   }

   cmdbuf->state.tls.info.tls.size =
      MAX2(shader->info.tls_size, cmdbuf->state.tls.info.tls.size);

   if (!cmdbuf->state.tls.desc.gpu) {
      cmdbuf->state.tls.desc = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
      if (!cmdbuf->state.tls.desc.gpu)
         return;
   }

   GENX(pan_emit_tls)(&tlsinfo, tsd.cpu);

   if (compute_state_dirty(cmdbuf, DESC_STATE) ||
       compute_state_dirty(cmdbuf, CS)) {
      result = panvk_per_arch(cmd_prepare_push_descs)(
         cmdbuf, desc_state, shader->desc_info.used_set_mask);
      if (result != VK_SUCCESS)
         return;
   }

   panvk_per_arch(cmd_prepare_dispatch_sysvals)(cmdbuf, info);

   result = prepare_driver_set(cmdbuf);
   if (result != VK_SUCCESS)
      return;

   result = panvk_per_arch(cmd_prepare_push_uniforms)(
      cmdbuf, cmdbuf->state.compute.shader);
   if (result != VK_SUCCESS)
      return;

   if (compute_state_dirty(cmdbuf, CS) ||
       compute_state_dirty(cmdbuf, DESC_STATE)) {
      result = panvk_per_arch(cmd_prepare_shader_res_table)(
         cmdbuf, desc_state, shader, cs_desc_state);
      if (result != VK_SUCCESS)
         return;
   }

   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE);

   /* Copy the global TLS pointer to the per-job TSD. */
   if (tlsinfo.tls.size) {
      cs_move64_to(b, cs_scratch_reg64(b, 0), cmdbuf->state.tls.desc.gpu);
      cs_load64_to(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
      cs_wait_slot(b, SB_ID(LS), false);
      cs_move64_to(b, cs_scratch_reg64(b, 0), tsd.gpu);
      cs_store64(b, cs_scratch_reg64(b, 2), cs_scratch_reg64(b, 0), 8);
      cs_wait_slot(b, SB_ID(LS), false);
   }
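
   /* The load/store pair above copies the 64-bit word at offset 8 of the
    * command buffer's shared LOCAL_STORAGE descriptor (the TLS pointer) into
    * the per-dispatch TSD, with LS scoreboard waits ordering the load against
    * the store and the store against later use of the descriptor. */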

   cs_update_compute_ctx(b) {
      if (compute_state_dirty(cmdbuf, CS) ||
          compute_state_dirty(cmdbuf, DESC_STATE))
         cs_move64_to(b, cs_sr_reg64(b, 0), cs_desc_state->res_table);

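      /* The FAU (push uniform) buffer is passed through a single 64-bit
       * register: the FAU count (shader->fau.total_count) is packed into
       * the top 8 bits of the buffer address. */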
      if (compute_state_dirty(cmdbuf, PUSH_UNIFORMS)) {
         uint64_t fau_ptr = cmdbuf->state.compute.push_uniforms |
                            ((uint64_t)shader->fau.total_count << 56);
         cs_move64_to(b, cs_sr_reg64(b, 8), fau_ptr);
      }

      if (compute_state_dirty(cmdbuf, CS))
         cs_move64_to(b, cs_sr_reg64(b, 16),
                      panvk_priv_mem_dev_addr(shader->spd));

      cs_move64_to(b, cs_sr_reg64(b, 24), tsd.gpu);

      /* Global attribute offset */
      cs_move32_to(b, cs_sr_reg32(b, 32), 0);

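      /* Registers 33-36 describe the workgroup shape and dispatch base: the
       * packed workgroup size goes in r33, and the base workgroup offset from
       * vkCmdDispatchBase() is converted from workgroups to threads (by
       * multiplying with the local size) before being written to r34-r36. */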
      struct mali_compute_size_workgroup_packed wg_size;
      pan_pack(&wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
         cfg.workgroup_size_x = shader->local_size.x;
         cfg.workgroup_size_y = shader->local_size.y;
         cfg.workgroup_size_z = shader->local_size.z;
         cfg.allow_merging_workgroups = false;
      }
      cs_move32_to(b, cs_sr_reg32(b, 33), wg_size.opaque[0]);
      cs_move32_to(b, cs_sr_reg32(b, 34),
                   info->wg_base.x * shader->local_size.x);
      cs_move32_to(b, cs_sr_reg32(b, 35),
                   info->wg_base.y * shader->local_size.y);
      cs_move32_to(b, cs_sr_reg32(b, 36),
                   info->wg_base.z * shader->local_size.z);
      if (indirect) {
         /* Load parameters from the indirect buffer and update the workgroup
          * count registers and sysvals. */
         cs_move64_to(b, cs_scratch_reg64(b, 0),
                      info->indirect.buffer_dev_addr);
         cs_load_to(b, cs_sr_reg_tuple(b, 37, 3), cs_scratch_reg64(b, 0),
                    BITFIELD_MASK(3), 0);
         cs_move64_to(b, cs_scratch_reg64(b, 0),
                      cmdbuf->state.compute.push_uniforms);
         cs_wait_slot(b, SB_ID(LS), false);

         if (shader_uses_sysval(shader, compute, num_work_groups.x)) {
            cs_store32(b, cs_sr_reg32(b, 37), cs_scratch_reg64(b, 0),
                       shader_remapped_sysval_offset(
                          shader, sysval_offset(compute, num_work_groups.x)));
         }

         if (shader_uses_sysval(shader, compute, num_work_groups.y)) {
            cs_store32(b, cs_sr_reg32(b, 38), cs_scratch_reg64(b, 0),
                       shader_remapped_sysval_offset(
                          shader, sysval_offset(compute, num_work_groups.y)));
         }

         if (shader_uses_sysval(shader, compute, num_work_groups.z)) {
            cs_store32(b, cs_sr_reg32(b, 39), cs_scratch_reg64(b, 0),
                       shader_remapped_sysval_offset(
                          shader, sysval_offset(compute, num_work_groups.z)));
         }

         cs_wait_slot(b, SB_ID(LS), false);
      } else {
         cs_move32_to(b, cs_sr_reg32(b, 37), info->direct.wg_count.x);
         cs_move32_to(b, cs_sr_reg32(b, 38), info->direct.wg_count.y);
         cs_move32_to(b, cs_sr_reg32(b, 39), info->direct.wg_count.z);
      }
   }

   panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE);

   cs_req_res(b, CS_COMPUTE_RES);
   if (indirect) {
      /* Use run_compute with a set task axis instead of run_compute_indirect,
       * as run_compute_indirect has been found to cause intermittent hangs.
       * This is safe, as the task increment will be clamped by the job size
       * along the specified axis.
       * The chosen task axis is potentially suboptimal, as choosing good
       * increment/axis parameters requires knowledge of job dimensions, but
       * this is somewhat offset by run_compute being a native instruction. */
      unsigned task_axis = MALI_TASK_AXIS_X;
      cs_trace_run_compute(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
                           wg_per_task, task_axis, false,
                           cs_shader_res_sel(0, 0, 0, 0));
   } else {
      unsigned task_axis = MALI_TASK_AXIS_X;
      unsigned task_increment = 0;
      calculate_task_axis_and_increment(shader, phys_dev, &task_axis,
                                        &task_increment);
      cs_trace_run_compute(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
                           task_increment, task_axis, false,
                           cs_shader_res_sel(0, 0, 0, 0));
   }
   cs_req_res(b, 0);

   struct cs_index sync_addr = cs_scratch_reg64(b, 0);
   struct cs_index iter_sb = cs_scratch_reg32(b, 2);
   struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
   struct cs_index add_val = cs_scratch_reg64(b, 4);

   cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
              BITFIELD_MASK(3),
              offsetof(struct panvk_cs_subqueue_context, syncobjs));
   cs_wait_slot(b, SB_ID(LS), false);

   cs_add64(b, sync_addr, sync_addr,
            PANVK_SUBQUEUE_COMPUTE * sizeof(struct panvk_cs_sync64));
   cs_move64_to(b, add_val, 1);

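   /* Rotate through the iteration scoreboard slots: once the scoreboard slot
    * used by this iteration signals, the deferred add bumps the compute
    * subqueue syncobj by one, and iter_sb is advanced and written back to the
    * subqueue context for the next job. */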
   cs_match(b, iter_sb, cmp_scratch) {
#define CASE(x)                                                                \
   cs_case(b, x) {                                                             \
      cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,       \
                    cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)));          \
      cs_move32_to(b, iter_sb, next_iter_sb(x));                               \
   }

      CASE(0)
      CASE(1)
      CASE(2)
      CASE(3)
      CASE(4)
#undef CASE
   }

   cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
              offsetof(struct panvk_cs_subqueue_context, iter_sb));
   cs_wait_slot(b, SB_ID(LS), false);

   ++cmdbuf->state.cs[PANVK_SUBQUEUE_COMPUTE].relative_sync_point;
   clear_dirty_after_dispatch(cmdbuf);
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDispatchBase)(VkCommandBuffer commandBuffer,
                                uint32_t baseGroupX, uint32_t baseGroupY,
                                uint32_t baseGroupZ, uint32_t groupCountX,
                                uint32_t groupCountY, uint32_t groupCountZ)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   struct panvk_dispatch_info info = {
      .wg_base = {baseGroupX, baseGroupY, baseGroupZ},
      .direct.wg_count = {groupCountX, groupCountY, groupCountZ},
   };
   cmd_dispatch(cmdbuf, &info);
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDispatchIndirect)(VkCommandBuffer commandBuffer,
                                    VkBuffer _buffer, VkDeviceSize offset)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   VK_FROM_HANDLE(panvk_buffer, buffer, _buffer);
   uint64_t buffer_gpu = panvk_buffer_gpu_ptr(buffer, offset);
   struct panvk_dispatch_info info = {
      .indirect.buffer_dev_addr = buffer_gpu,
   };
   cmd_dispatch(cmdbuf, &info);
}