1 /*
2  * Copyright © 2024 Valve Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "radv_dgc.h"
8 #include "meta/radv_meta.h"
9 #include "radv_entrypoints.h"
10 #include "radv_pipeline_rt.h"
11 
12 #include "ac_rgp.h"
13 
14 #include "nir_builder.h"
15 
16 #include "vk_common_entrypoints.h"
17 #include "vk_device_generated_commands.h"
18 #include "vk_shader_module.h"
19 
20 #define PKT3_INDIRECT_BUFFER_BYTES 16
21 #define DGC_VBO_INFO_SIZE (sizeof(struct radv_vbo_info) + 4 /* vbo_offsets */)
22 
23 /* The DGC command buffer layout is quite complex, so here are some explanations:
24  *
25  * Without the DGC preamble, the default layout looks like:
26  *
27  * +---------+----------+---------+-----------------+
28  * | trailer | commands | padding | jump to trailer |
29  * +---------+----------+---------+-----------------+
30  *
31  * The trailer is used to implement IB chaining for the compute queue because IB2 isn't supported. The
32  * trailer is patched at execute time to chain back the DGC command buffer. The trailer is added at
33  * the beginning to make sure its offset is fixed (i.e. the offset couldn't be known if it came after a
34  * preamble). In practice the execution looks like:
35  *
36  * +----------+---------+-----------------+    +---------+    +-----------------------+
37  * | commands | padding | jump to trailer | -> | trailer | -> | postamble (normal CS) |
38  * +----------+---------+-----------------+    +---------+    +-----------------------+
39  *
40  * When DGC uses a preamble (to optimize a large but mostly unused indirect sequence count by removing
41  * a ton of padding), the trailer is still used but the layout looks like:
42  *
43  * +---------+---------+-----------------+     +----------+---------+-----------------+
44  * | trailer | padding | INDIRECT_BUFFER | ->  | commands | padding | jump to trailer |
45  * +---------+---------+-----------------+     +----------+---------+-----------------+
46  *
47  * When DGC uses task shaders, the command buffer is split into two parts (GFX/COMPUTE), and the
48  * default layout looks like:
49  *
50  * +--------------+---------+--------------+---------+
51  * | GFX commands | padding | ACE commands | padding |
52  * +--------------+---------+--------------+---------+
53  *
54  * The execution of this DGC command buffer differs between the GFX and COMPUTE queues:
55  * - on GFX, the driver uses the IB2 packet, which is the easiest solution
56  * - on COMPUTE, IB2 isn't supported and the driver chains the DGC command buffer by patching the
57  *   trailer
58  */
59 
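/* Alignment required for a DGC command buffer allocation: the stricter of the GFX and
 * COMPUTE IB alignments, since the generated buffer may contain both a GFX and an ACE part.
 */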
60 uint32_t
61 radv_dgc_get_buffer_alignment(const struct radv_device *device)
62 {
63    const struct radv_physical_device *pdev = radv_device_physical(device);
64 
65    return MAX2(pdev->info.ip[AMD_IP_GFX].ib_alignment, pdev->info.ip[AMD_IP_COMPUTE].ib_alignment);
66 }
67 
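/* radv_pad_cmdbuf() rounds a command buffer size up to the IB padding granularity
 * (ib_pad_dw_mask), while radv_align_cmdbuf() below rounds an offset up to the IB start
 * alignment (ib_alignment).
 */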
68 static uint32_t
69 radv_pad_cmdbuf(const struct radv_device *device, uint32_t size, enum amd_ip_type ip_type)
70 {
71    const struct radv_physical_device *pdev = radv_device_physical(device);
72    const uint32_t ib_alignment = (pdev->info.ip[ip_type].ib_pad_dw_mask + 1) * 4;
73 
74    return align(size, ib_alignment);
75 }
76 
77 static uint32_t
78 radv_align_cmdbuf(const struct radv_device *device, uint32_t size, enum amd_ip_type ip_type)
79 {
80    const struct radv_physical_device *pdev = radv_device_physical(device);
81    const uint32_t ib_alignment = pdev->info.ip[ip_type].ib_alignment;
82 
83    return align(size, ib_alignment);
84 }
85 
86 static unsigned
87 radv_dgc_preamble_cmdbuf_size(const struct radv_device *device, enum amd_ip_type ip_type)
88 {
89    return radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type);
90 }
91 
92 static unsigned
93 radv_dgc_trailer_cmdbuf_size(const struct radv_device *device, enum amd_ip_type ip_type)
94 {
95    return radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type);
96 }
97 
98 static bool
99 radv_dgc_use_preamble(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
100 {
101    /* Heuristic on when the overhead for the preamble (i.e. double jump) is worth it. Obviously
102     * a bit of a guess as it depends on the actual count, which we don't know. */
103    return pGeneratedCommandsInfo->sequenceCountAddress != 0 && pGeneratedCommandsInfo->maxSequenceCount >= 64;
104 }
105 
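/* Return the shader for the given stage, either from the bound pipeline or from shader
 * objects (ESO). For shader objects, the merged variant (as_ls/as_es) is selected when
 * tessellation control or geometry stages are also present.
 */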
106 struct radv_shader *
107 radv_dgc_get_shader(const VkGeneratedCommandsPipelineInfoEXT *pipeline_info,
108                     const VkGeneratedCommandsShaderInfoEXT *eso_info, gl_shader_stage stage)
109 {
110    if (pipeline_info) {
111       VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline);
112       return radv_get_shader(pipeline->shaders, stage);
113    } else if (eso_info) {
114       VkShaderStageFlags stages = 0;
115 
116       for (uint32_t i = 0; i < eso_info->shaderCount; i++) {
117          VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]);
118          stages |= mesa_to_vk_shader_stage(shader_object->stage);
119       }
120 
121       for (uint32_t i = 0; i < eso_info->shaderCount; i++) {
122          VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]);
123 
124          if (shader_object->stage != stage)
125             continue;
126 
127          if (stage == MESA_SHADER_VERTEX && (stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT)) {
128             return shader_object->as_ls.shader;
129          } else if ((stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL) &&
130                     (stages & VK_SHADER_STAGE_GEOMETRY_BIT)) {
131             return shader_object->as_es.shader;
132          } else {
133             return shader_object->shader;
134          }
135       }
136    }
137 
138    return NULL;
139 }
140 
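/* Accumulate the worst-case per-sequence command buffer size for an indirect dispatch. */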
141 static void
142 radv_get_sequence_size_compute(const struct radv_indirect_command_layout *layout, const void *pNext, uint32_t *cmd_size,
143                                uint32_t *upload_size)
144 {
145    const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
146    const struct radv_physical_device *pdev = radv_device_physical(device);
147 
148    const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
149       vk_find_struct_const(pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
150    const VkGeneratedCommandsShaderInfoEXT *eso_info = vk_find_struct_const(pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
151 
152    struct radv_shader *cs = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_COMPUTE);
153 
154    /* dispatch */
155    *cmd_size += 5 * 4;
156 
157    if (cs) {
158       const struct radv_userdata_info *loc = radv_get_user_sgpr_info(cs, AC_UD_CS_GRID_SIZE);
159       if (loc->sgpr_idx != -1) {
160          if (device->load_grid_size_from_user_sgpr) {
161             /* PKT3_SET_SH_REG for immediate values */
162             *cmd_size += 5 * 4;
163          } else {
164             /* PKT3_SET_SH_REG for pointer */
165             *cmd_size += 4 * 4;
166          }
167       }
168    } else {
169       /* COMPUTE_PGM_{LO,RSRC1,RSRC2} */
170       *cmd_size += 7 * 4;
171 
172       if (pdev->info.gfx_level >= GFX10) {
173          /* COMPUTE_PGM_RSRC3 */
174          *cmd_size += 3 * 4;
175       }
176 
177       /* COMPUTE_{RESOURCE_LIMITS,NUM_THREADS_X} */
178       *cmd_size += 8 * 4;
179 
180       /* Assume the compute shader needs grid size because we can't know the information for
181        * indirect pipelines.
182        */
183       if (device->load_grid_size_from_user_sgpr) {
184          /* PKT3_SET_SH_REG for immediate values */
185          *cmd_size += 5 * 4;
186       } else {
187          /* PKT3_SET_SH_REG for pointer */
188          *cmd_size += 4 * 4;
189       }
190 
191       /* PKT3_SET_SH_REG for indirect descriptor sets pointer */
192       *cmd_size += 3 * 4;
193    }
194 
195    if (device->sqtt.bo) {
196       /* sqtt markers */
197       *cmd_size += 8 * 3 * 4;
198    }
199 }
200 
201 static void
202 radv_get_sequence_size_graphics(const struct radv_indirect_command_layout *layout, const void *pNext,
203                                 uint32_t *cmd_size, uint32_t *ace_cmd_size, uint32_t *upload_size)
204 {
205    const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
206    const struct radv_physical_device *pdev = radv_device_physical(device);
207 
208    const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
209       vk_find_struct_const(pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
210    const VkGeneratedCommandsShaderInfoEXT *eso_info = vk_find_struct_const(pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
211 
212    struct radv_shader *vs = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_VERTEX);
213 
214    if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) {
215       *upload_size += 16 * util_bitcount(vs->info.vs.vb_desc_usage_mask);
216 
217       /* One PKT3_SET_SH_REG for emitting VBO pointer (32-bit) */
218       *cmd_size += 3 * 4;
219    }
220 
221    if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IB)) {
222       /* Index type write (normal reg write) + index buffer base write (64-bits, but special packet
223        * so only 1 word overhead) + index buffer size (again, special packet so only 1 word
224        * overhead)
225        */
226       *cmd_size += (3 + 3 + 2) * 4;
227    }
228 
229    if (layout->vk.draw_count) {
230       if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) {
231          const struct radv_shader *task_shader = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK);
232 
233          if (task_shader) {
234             /* PKT3_DISPATCH_TASKMESH_GFX */
235             *cmd_size += 4 * 4;
236 
237             /* PKT3_DISPATCH_TASKMESH_INDIRECT_MULTI_ACE */
238             *ace_cmd_size += 11 * 4;
239          } else {
240             struct radv_shader *ms = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_MESH);
241 
242             /* PKT3_SET_BASE + PKT3_SET_SH_REG + PKT3_DISPATCH_MESH_INDIRECT_MULTI */
243             *cmd_size += (4 + (ms->info.vs.needs_draw_id ? 3 : 0) + 9) * 4;
244          }
245       } else {
246          /* PKT3_SET_BASE + PKT3_DRAW_{INDEX}_INDIRECT_MULTI */
247          *cmd_size += (4 + 10) * 4;
248       }
249    } else {
250       if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_INDEXED)) {
251          if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IB)) {
252             /* userdata writes + instance count + indexed draw */
253             *cmd_size += (5 + 2 + 5) * 4;
254          } else {
255             /* PKT3_SET_BASE + PKT3_SET_SH_REG + PKT3_DRAW_{INDEX}_INDIRECT_MULTI */
256             *cmd_size += (4 + (vs->info.vs.needs_draw_id ? 10 : 5)) * 4;
257          }
258       } else {
259          if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) {
260             const struct radv_shader *task_shader = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK);
261 
262             if (task_shader) {
263                const struct radv_userdata_info *xyz_loc = radv_get_user_sgpr_info(task_shader, AC_UD_CS_GRID_SIZE);
264                const struct radv_userdata_info *draw_id_loc =
265                   radv_get_user_sgpr_info(task_shader, AC_UD_CS_TASK_DRAW_ID);
266 
267                /* PKT3_DISPATCH_TASKMESH_GFX */
268                *cmd_size += 4 * 4;
269 
270                if (xyz_loc->sgpr_idx != -1)
271                   *ace_cmd_size += 5 * 4;
272                if (draw_id_loc->sgpr_idx != -1)
273                   *ace_cmd_size += 3 * 4;
274 
275                /* PKT3_DISPATCH_TASKMESH_DIRECT_ACE */
276                *ace_cmd_size += 6 * 4;
277             } else {
278                /* userdata writes + instance count + non-indexed draw */
279                *cmd_size += (6 + 2 + (pdev->mesh_fast_launch_2 ? 5 : 3)) * 4;
280             }
281          } else {
282             /* userdata writes + instance count + non-indexed draw */
283             *cmd_size += (5 + 2 + 3) * 4;
284          }
285       }
286    }
287 
288    if (device->sqtt.bo) {
289       /* sqtt markers */
290       *cmd_size += 5 * 3 * 4;
291    }
292 }
293 
294 static void
295 radv_get_sequence_size_rt(const struct radv_indirect_command_layout *layout, const void *pNext, uint32_t *cmd_size,
296                           uint32_t *upload_size)
297 {
298    const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
299 
300    const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
301       vk_find_struct_const(pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
302    VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline);
303    const struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
304    const struct radv_shader *rt_prolog = rt_pipeline->prolog;
305 
306    /* dispatch */
307    *cmd_size += 5 * 4;
308 
309    const struct radv_userdata_info *cs_grid_size_loc = radv_get_user_sgpr_info(rt_prolog, AC_UD_CS_GRID_SIZE);
310    if (cs_grid_size_loc->sgpr_idx != -1) {
311       if (device->load_grid_size_from_user_sgpr) {
312          /* PKT3_LOAD_SH_REG_INDEX */
313          *cmd_size += 5 * 4;
314       } else {
315          /* PKT3_SET_SH_REG for pointer */
316          *cmd_size += 4 * 4;
317       }
318    }
319 
320    const struct radv_userdata_info *cs_sbt_descriptors_loc =
321       radv_get_user_sgpr_info(rt_prolog, AC_UD_CS_SBT_DESCRIPTORS);
322    if (cs_sbt_descriptors_loc->sgpr_idx != -1) {
323       /* PKT3_SET_SH_REG for pointer */
324       *cmd_size += 4 * 4;
325    }
326 
327    const struct radv_userdata_info *cs_ray_launch_size_addr_loc =
328       radv_get_user_sgpr_info(rt_prolog, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR);
329    if (cs_ray_launch_size_addr_loc->sgpr_idx != -1) {
330       /* PKT3_SET_SH_REG for pointer */
331       *cmd_size += 4 * 4;
332    }
333 
334    if (device->sqtt.bo) {
335       /* sqtt markers */
336       *cmd_size += 5 * 3 * 4;
337    }
338 }
339 
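/* Compute the per-sequence sizes: the main (GFX) command stride, the ACE command stride
 * (used with task shaders) and the amount of data that has to be uploaded (e.g. copied push
 * constants or VBO descriptors).
 */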
340 static void
341 radv_get_sequence_size(const struct radv_indirect_command_layout *layout, const void *pNext, uint32_t *cmd_size,
342                        uint32_t *ace_cmd_size, uint32_t *upload_size)
343 {
344    const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
345    const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
346       vk_find_struct_const(pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
347    const VkGeneratedCommandsShaderInfoEXT *eso_info = vk_find_struct_const(pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
348 
349    *cmd_size = 0;
350    *ace_cmd_size = 0;
351    *upload_size = 0;
352 
353    if (layout->vk.dgc_info & (BITFIELD_BIT(MESA_VK_DGC_PC) | BITFIELD_BIT(MESA_VK_DGC_SI))) {
354       VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, layout->vk.layout);
355       bool need_copy = false;
356 
357       if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) {
358          /* Assume the compute shader needs both user SGPRs because we can't know the information
359           * for indirect pipelines.
360           */
361          *cmd_size += 3 * 4;
362          need_copy = true;
363 
364          *cmd_size += (3 * util_bitcount64(layout->push_constant_mask)) * 4;
365       } else {
366          struct radv_shader *shaders[MESA_VULKAN_SHADER_STAGES] = {0};
367          if (pipeline_info) {
368             VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline);
369 
370             if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) {
371                const struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
372                struct radv_shader *rt_prolog = rt_pipeline->prolog;
373 
374                shaders[MESA_SHADER_COMPUTE] = rt_prolog;
375             } else {
376                memcpy(shaders, pipeline->shaders, sizeof(shaders));
377             }
378          } else if (eso_info) {
379             for (unsigned i = 0; i < eso_info->shaderCount; ++i) {
380                VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]);
381                struct radv_shader *shader = shader_object->shader;
382                gl_shader_stage stage = shader->info.stage;
383 
384                shaders[stage] = shader;
385             }
386          }
387 
388          for (unsigned i = 0; i < ARRAY_SIZE(shaders); ++i) {
389             const struct radv_shader *shader = shaders[i];
390 
391             if (!shader)
392                continue;
393 
394             const struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs;
395             if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
396                /* One PKT3_SET_SH_REG for emitting push constants pointer (32-bit) */
397                if (i == MESA_SHADER_TASK) {
398                   *ace_cmd_size += 3 * 4;
399                } else {
400                   *cmd_size += 3 * 4;
401                }
402                need_copy = true;
403             }
404             if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
405                /* One PKT3_SET_SH_REG writing all inline push constants. */
406                const uint32_t inline_pc_size = (3 * util_bitcount64(layout->push_constant_mask)) * 4;
407 
408                if (i == MESA_SHADER_TASK) {
409                   *ace_cmd_size += inline_pc_size;
410                } else {
411                   *cmd_size += inline_pc_size;
412                }
413             }
414          }
415       }
416 
417       if (need_copy) {
418          *upload_size += align(pipeline_layout->push_constant_size, 16);
419       }
420    }
421 
422    if (device->sqtt.bo) {
423       /* THREAD_TRACE_MARKER */
424       *cmd_size += 2 * 4;
425    }
426 
427    if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
428       radv_get_sequence_size_compute(layout, pNext, cmd_size, upload_size);
429    } else if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) {
430       radv_get_sequence_size_rt(layout, pNext, cmd_size, upload_size);
431    } else {
432       radv_get_sequence_size_graphics(layout, pNext, cmd_size, ace_cmd_size, upload_size);
433    }
434 }
435 
436 struct dgc_cmdbuf_layout {
437    bool use_preamble;
438    uint32_t alloc_size;
439 
440    uint32_t main_trailer_offset;
441    uint32_t main_preamble_offset;
442    uint32_t main_offset;
443    uint32_t main_cmd_stride;
444    uint32_t main_preamble_size;
445    uint32_t main_size;
446 
447    uint32_t ace_trailer_offset;
448    uint32_t ace_preamble_offset;
449    uint32_t ace_main_offset;
450    uint32_t ace_cmd_stride;
451    uint32_t ace_preamble_size;
452    uint32_t ace_size;
453 
454    uint32_t upload_offset;
455    uint32_t upload_stride;
456    uint32_t upload_size;
457 };
458 
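/* Compute the offsets of every part of the DGC buffer: the main trailer, the optional main
 * preamble, the main command stream, then (when ACE commands are needed) the ACE
 * trailer/preamble/commands, and finally the upload area. Each command buffer chunk is
 * aligned to the IB alignment of its queue.
 */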
459 static void
460 get_dgc_cmdbuf_layout(const struct radv_device *device, const struct radv_indirect_command_layout *dgc_layout,
461                       const void *pNext, uint32_t sequences_count, bool use_preamble, struct dgc_cmdbuf_layout *layout)
462 {
463    uint32_t offset = 0;
464 
465    memset(layout, 0, sizeof(*layout));
466 
467    radv_get_sequence_size(dgc_layout, pNext, &layout->main_cmd_stride, &layout->ace_cmd_stride, &layout->upload_stride);
468 
469    layout->use_preamble = use_preamble;
470    if (layout->use_preamble) {
471       layout->main_preamble_size = radv_dgc_preamble_cmdbuf_size(device, AMD_IP_GFX);
472       layout->ace_preamble_size = radv_dgc_preamble_cmdbuf_size(device, AMD_IP_COMPUTE);
473    }
474 
475    layout->main_size =
476       radv_pad_cmdbuf(device, (layout->main_cmd_stride * sequences_count) + PKT3_INDIRECT_BUFFER_BYTES, AMD_IP_GFX);
477    layout->ace_size =
478       radv_pad_cmdbuf(device, (layout->ace_cmd_stride * sequences_count) + PKT3_INDIRECT_BUFFER_BYTES, AMD_IP_COMPUTE);
479    layout->upload_size = layout->upload_stride * sequences_count;
480 
481    /* Main */
482    layout->main_trailer_offset = 0;
483 
484    offset += radv_dgc_trailer_cmdbuf_size(device, AMD_IP_GFX);
485    offset = radv_align_cmdbuf(device, offset, AMD_IP_GFX);
486    layout->main_preamble_offset = offset;
487 
488    if (layout->use_preamble)
489       offset += layout->main_preamble_size;
490    offset = radv_align_cmdbuf(device, offset, AMD_IP_GFX);
491 
492    layout->main_offset = offset;
493    offset += layout->main_size;
494 
495    /* ACE */
496    if (layout->ace_cmd_stride) {
497       offset = radv_align_cmdbuf(device, offset, AMD_IP_COMPUTE);
498 
499       layout->ace_trailer_offset = offset;
500 
501       offset += radv_dgc_trailer_cmdbuf_size(device, AMD_IP_COMPUTE);
502       offset = radv_align_cmdbuf(device, offset, AMD_IP_COMPUTE);
503 
504       layout->ace_preamble_offset = offset;
505 
506       if (layout->use_preamble)
507          offset += layout->ace_preamble_size;
508       offset = radv_align_cmdbuf(device, offset, AMD_IP_COMPUTE);
509 
510       layout->ace_main_offset = offset;
511       offset += layout->ace_size;
512    }
513 
514    /* Upload */
515    layout->upload_offset = offset;
516    offset += layout->upload_size;
517 
518    layout->alloc_size = offset;
519 }
520 
521 static uint32_t
522 radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo, enum amd_ip_type ip_type)
523 {
524    VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
525    const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
526    const bool use_preamble = radv_dgc_use_preamble(pGeneratedCommandsInfo);
527    const uint32_t sequences_count = pGeneratedCommandsInfo->maxSequenceCount;
528    struct dgc_cmdbuf_layout cmdbuf_layout;
529 
530    get_dgc_cmdbuf_layout(device, layout, pGeneratedCommandsInfo->pNext, sequences_count, use_preamble, &cmdbuf_layout);
531 
532    if (use_preamble)
533       return ip_type == AMD_IP_GFX ? cmdbuf_layout.main_preamble_size : cmdbuf_layout.ace_preamble_size;
534 
535    return ip_type == AMD_IP_GFX ? cmdbuf_layout.main_size : cmdbuf_layout.ace_size;
536 }
537 
538 static uint32_t
539 radv_get_indirect_cmdbuf_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo, enum amd_ip_type ip_type)
540 {
541    VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
542    const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
543    const bool use_preamble = radv_dgc_use_preamble(pGeneratedCommandsInfo);
544    const uint32_t sequences_count = pGeneratedCommandsInfo->maxSequenceCount;
545    struct dgc_cmdbuf_layout cmdbuf_layout;
546 
547    get_dgc_cmdbuf_layout(device, layout, pGeneratedCommandsInfo->pNext, sequences_count, use_preamble, &cmdbuf_layout);
548 
549    return ip_type == AMD_IP_GFX ? cmdbuf_layout.main_preamble_offset : cmdbuf_layout.ace_preamble_offset;
550 }
551 
552 static uint32_t
553 radv_get_indirect_trailer_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo, enum amd_ip_type ip_type)
554 {
555    VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
556    const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
557    const bool use_preamble = radv_dgc_use_preamble(pGeneratedCommandsInfo);
558    const uint32_t sequences_count = pGeneratedCommandsInfo->maxSequenceCount;
559    struct dgc_cmdbuf_layout cmdbuf_layout;
560 
561    get_dgc_cmdbuf_layout(device, layout, pGeneratedCommandsInfo->pNext, sequences_count, use_preamble, &cmdbuf_layout);
562 
563    const uint32_t offset = ip_type == AMD_IP_GFX ? cmdbuf_layout.main_trailer_offset : cmdbuf_layout.ace_trailer_offset;
564 
565    return offset + radv_dgc_trailer_cmdbuf_size(device, ip_type) - PKT3_INDIRECT_BUFFER_BYTES;
566 }
567 
568 uint32_t
569 radv_get_indirect_main_cmdbuf_size(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
570 {
571    return radv_get_indirect_cmdbuf_size(pGeneratedCommandsInfo, AMD_IP_GFX);
572 }
573 
574 uint32_t
575 radv_get_indirect_main_cmdbuf_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
576 {
577    return radv_get_indirect_cmdbuf_offset(pGeneratedCommandsInfo, AMD_IP_GFX);
578 }
579 
580 uint32_t
581 radv_get_indirect_main_trailer_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
582 {
583    return radv_get_indirect_trailer_offset(pGeneratedCommandsInfo, AMD_IP_GFX);
584 }
585 
586 uint32_t
587 radv_get_indirect_ace_cmdbuf_size(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
588 {
589    return radv_get_indirect_cmdbuf_size(pGeneratedCommandsInfo, AMD_IP_COMPUTE);
590 }
591 
592 uint32_t
593 radv_get_indirect_ace_cmdbuf_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
594 {
595    return radv_get_indirect_cmdbuf_offset(pGeneratedCommandsInfo, AMD_IP_COMPUTE);
596 }
597 
598 uint32_t
599 radv_get_indirect_ace_trailer_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
600 {
601    return radv_get_indirect_trailer_offset(pGeneratedCommandsInfo, AMD_IP_COMPUTE);
602 }
603 
604 struct radv_dgc_params {
605    uint32_t cmd_buf_preamble_offset;
606    uint32_t cmd_buf_main_offset;
607    uint32_t cmd_buf_stride;
608    uint32_t cmd_buf_size;
609    uint32_t ace_cmd_buf_trailer_offset;
610    uint32_t ace_cmd_buf_preamble_offset;
611    uint32_t ace_cmd_buf_main_offset;
612    uint32_t ace_cmd_buf_stride;
613    uint32_t ace_cmd_buf_size;
614    uint32_t upload_main_offset;
615    uint32_t upload_stride;
616    uint32_t upload_addr;
617    uint32_t sequence_count;
618    uint64_t sequence_count_addr;
619    uint64_t stream_addr;
620 
621    uint8_t queue_family;
622    uint8_t use_preamble;
623 
624    /* draw info */
625    uint16_t vtx_base_sgpr;
626    uint32_t max_index_count;
627    uint32_t max_draw_count;
628 
629    /* task/mesh info */
630    uint8_t has_task_shader;
631    uint16_t mesh_ring_entry_sgpr;
632    uint8_t linear_dispatch_en;
633    uint16_t task_ring_entry_sgpr;
634    uint16_t task_xyz_sgpr;
635    uint16_t task_draw_id_sgpr;
636 
637    /* dispatch info */
638    uint16_t grid_base_sgpr;
639    uint32_t wave32;
640 
641    /* RT info */
642    uint16_t cs_sbt_descriptors;
643    uint16_t cs_ray_launch_size_addr;
644 
645    /* VBO info */
646    uint32_t vb_desc_usage_mask;
647    uint16_t vbo_reg;
648    uint8_t dynamic_vs_input;
649    uint8_t use_per_attribute_vb_descs;
650 
651    /* push constants info */
652    uint8_t const_copy;
653    uint16_t push_constant_stages;
654 
655    /* IES info */
656    uint64_t ies_addr;
657    uint32_t ies_stride;
658    uint32_t indirect_desc_sets_va;
659 
660    /* For conditional rendering on ACE. */
661    uint8_t predicating;
662    uint8_t predication_type;
663    uint64_t predication_va;
664 };
665 
666 enum {
667    DGC_USES_DRAWID = 1u << 14,
668    DGC_USES_BASEINSTANCE = 1u << 15,
669    DGC_USES_GRID_SIZE = DGC_USES_BASEINSTANCE, /* Mesh shader only */
670 };
671 
672 struct dgc_cmdbuf {
673    const struct radv_device *dev;
674    const struct radv_indirect_command_layout *layout;
675 
676    nir_builder *b;
677    nir_def *va;
678    nir_variable *offset;
679    nir_variable *upload_offset;
680 
681    nir_def *ies_va;
682 };
683 
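/* Append 'count' dwords to the generated command buffer. Values are stored up to 4 dwords
 * at a time at va + offset, and the offset variable is advanced accordingly.
 */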
684 static void
685 dgc_emit(struct dgc_cmdbuf *cs, unsigned count, nir_def **values)
686 {
687    nir_builder *b = cs->b;
688 
689    for (unsigned i = 0; i < count; i += 4) {
690       nir_def *offset = nir_load_var(b, cs->offset);
691       nir_def *store_val = nir_vec(b, values + i, MIN2(count - i, 4));
692       assert(store_val->bit_size >= 32);
693       nir_build_store_global(b, store_val, nir_iadd(b, cs->va, nir_u2u64(b, offset)), .access = ACCESS_NON_READABLE);
694       nir_store_var(b, cs->offset, nir_iadd_imm(b, offset, store_val->num_components * store_val->bit_size / 8), 0x1);
695    }
696 }
697 
698 static void
699 dgc_upload(struct dgc_cmdbuf *cs, nir_def *data)
700 {
701    nir_builder *b = cs->b;
702 
703    nir_def *upload_offset = nir_load_var(b, cs->upload_offset);
704    nir_build_store_global(b, data, nir_iadd(b, cs->va, nir_u2u64(b, upload_offset)), .access = ACCESS_NON_READABLE);
705    nir_store_var(b, cs->upload_offset, nir_iadd_imm(b, upload_offset, data->num_components * data->bit_size / 8), 0x1);
706 }
707 
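/* Helpers to read struct radv_dgc_params fields from the push constants of the DGC meta
 * shader. 8-bit and 16-bit fields are extracted from their containing dword with ubfe,
 * e.g. load_param32(b, cmd_buf_stride) returns the cmd_buf_stride field as a nir_def.
 */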
708 #define load_param32(b, field)                                                                                         \
709    nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0), .base = offsetof(struct radv_dgc_params, field), .range = 4)
710 
711 #define load_param16(b, field)                                                                                         \
712    nir_ubfe_imm((b),                                                                                                   \
713                 nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0),                                                \
714                                        .base = (offsetof(struct radv_dgc_params, field) & ~3), .range = 4),            \
715                 (offsetof(struct radv_dgc_params, field) & 2) * 8, 16)
716 
717 #define load_param8(b, field)                                                                                          \
718    nir_ubfe_imm((b),                                                                                                   \
719                 nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0),                                                \
720                                        .base = (offsetof(struct radv_dgc_params, field) & ~3), .range = 4),            \
721                 (offsetof(struct radv_dgc_params, field) & 3) * 8, 8)
722 
723 #define load_param64(b, field)                                                                                         \
724    nir_pack_64_2x32((b), nir_load_push_constant((b), 2, 32, nir_imm_int((b), 0),                                       \
725                                                 .base = offsetof(struct radv_dgc_params, field), .range = 8))
726 
727 static nir_def *
728 dgc_load_ies_va(struct dgc_cmdbuf *cs, nir_def *stream_addr)
729 {
730    const struct radv_indirect_command_layout *layout = cs->layout;
731    nir_builder *b = cs->b;
732 
733    nir_def *offset = nir_imm_int(b, layout->vk.ies_src_offset_B);
734    nir_def *ies_index =
735       nir_build_load_global(b, 1, 32, nir_iadd(b, stream_addr, nir_u2u64(b, offset)), .access = ACCESS_NON_WRITEABLE);
736    nir_def *ies_stride = load_param32(b, ies_stride);
737    nir_def *ies_offset = nir_imul(b, ies_index, ies_stride);
738 
739    return nir_iadd(b, load_param64(b, ies_addr), nir_u2u64(b, ies_offset));
740 }
741 
742 static nir_def *
743 dgc_load_shader_metadata(struct dgc_cmdbuf *cs, uint32_t bitsize, uint32_t field_offset)
744 {
745    const struct radv_indirect_command_layout *layout = cs->layout;
746    nir_builder *b = cs->b;
747 
748    if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) {
749       return nir_load_global(b, nir_iadd_imm(b, cs->ies_va, field_offset), 4, 1, bitsize);
750    } else {
751       nir_def *params_buf = radv_meta_load_descriptor(b, 0, 0);
752 
753       return nir_load_ssbo(b, 1, bitsize, params_buf, nir_imm_int(b, field_offset));
754    }
755 
756    return NULL;
757 }
758 
759 #define load_shader_metadata32(cs, field)                                                                              \
760    dgc_load_shader_metadata(cs, 32, offsetof(struct radv_compute_pipeline_metadata, field))
761 #define load_shader_metadata64(cs, field)                                                                              \
762    dgc_load_shader_metadata(cs, 64, offsetof(struct radv_compute_pipeline_metadata, field))
763 
764 static nir_def *
765 dgc_load_vbo_metadata(struct dgc_cmdbuf *cs, uint32_t bitsize, nir_def *idx, uint32_t field_offset)
766 {
767    nir_builder *b = cs->b;
768 
769    nir_def *param_buf = radv_meta_load_descriptor(b, 0, 0);
770 
771    nir_def *offset = nir_imul_imm(b, idx, DGC_VBO_INFO_SIZE);
772 
773    return nir_load_ssbo(b, 1, bitsize, param_buf, nir_iadd_imm(b, offset, field_offset));
774 }
775 
776 #define load_vbo_metadata32(cs, idx, field) dgc_load_vbo_metadata(cs, 32, idx, offsetof(struct radv_vbo_info, field))
777 #define load_vbo_metadata64(cs, idx, field) dgc_load_vbo_metadata(cs, 64, idx, offsetof(struct radv_vbo_info, field))
778 #define load_vbo_offset(cs, idx)            dgc_load_vbo_metadata(cs, 32, idx, sizeof(struct radv_vbo_info))
779 
780 /* DGC cs emit macros */
781 #define dgc_cs_begin(cs)                                                                                               \
782    struct dgc_cmdbuf *__cs = (cs);                                                                                     \
783    nir_def *__dwords[32];                                                                                              \
784    unsigned __num_dw = 0;
785 
786 #define dgc_cs_emit(value)                                                                                             \
787    assert(__num_dw < ARRAY_SIZE(__dwords));                                                                            \
788    __dwords[__num_dw++] = value;
789 
790 #define dgc_cs_emit_imm(value) dgc_cs_emit(nir_imm_int(__cs->b, value));
791 
792 #define dgc_cs_set_sh_reg_seq(reg, num)                                                                                \
793    dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, num, 0));                                                                     \
794    dgc_cs_emit_imm((reg - SI_SH_REG_OFFSET) >> 2);
795 
796 #define dgc_cs_end() dgc_emit(__cs, __num_dw, __dwords);
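
/* Typical usage of the emit macros above (this mirrors dgc_emit_instance_count() below):
 *
 *    dgc_cs_begin(cs);
 *    dgc_cs_emit_imm(PKT3(PKT3_NUM_INSTANCES, 0, 0));
 *    dgc_cs_emit(instance_count);
 *    dgc_cs_end();
 */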
797 
798 static nir_def *
799 nir_pkt3_base(nir_builder *b, unsigned op, nir_def *len, bool predicate)
800 {
801    len = nir_iand_imm(b, len, 0x3fff);
802    return nir_ior_imm(b, nir_ishl_imm(b, len, 16), PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT3_PREDICATE(predicate));
803 }
804 
805 static nir_def *
806 nir_pkt3(nir_builder *b, unsigned op, nir_def *len)
807 {
808    return nir_pkt3_base(b, op, len, false);
809 }
810 
811 /**
812  * SQTT
813  */
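/* The helpers below emit RGP/SQTT markers into the generated command buffer; they are
 * no-ops when SQTT isn't active (i.e. when the device has no SQTT BO).
 */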
814 static void
815 dgc_emit_sqtt_userdata(struct dgc_cmdbuf *cs, nir_def *data)
816 {
817    const struct radv_device *device = cs->dev;
818    const struct radv_physical_device *pdev = radv_device_physical(device);
819    nir_builder *b = cs->b;
820 
821    if (!cs->dev->sqtt.bo)
822       return;
823 
824    dgc_cs_begin(cs);
825    dgc_cs_emit(nir_pkt3_base(b, PKT3_SET_UCONFIG_REG, nir_imm_int(b, 1), pdev->info.gfx_level >= GFX10));
826    dgc_cs_emit_imm((R_030D08_SQ_THREAD_TRACE_USERDATA_2 - CIK_UCONFIG_REG_OFFSET) >> 2);
827    dgc_cs_emit(data);
828    dgc_cs_end();
829 }
830 
831 static void
832 dgc_emit_sqtt_thread_trace_marker(struct dgc_cmdbuf *cs)
833 {
834    if (!cs->dev->sqtt.bo)
835       return;
836 
837    dgc_cs_begin(cs);
838    dgc_cs_emit_imm(PKT3(PKT3_EVENT_WRITE, 0, 0));
839    dgc_cs_emit_imm(EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER | EVENT_INDEX(0)));
840    dgc_cs_end();
841 }
842 
843 static void
844 dgc_emit_sqtt_marker_event(struct dgc_cmdbuf *cs, nir_def *sequence_id, enum rgp_sqtt_marker_event_type event)
845 {
846    struct rgp_sqtt_marker_event marker = {0};
847    nir_builder *b = cs->b;
848 
849    marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
850    marker.api_type = event;
851 
852    dgc_emit_sqtt_userdata(cs, nir_imm_int(b, marker.dword01));
853    dgc_emit_sqtt_userdata(cs, nir_imm_int(b, marker.dword02));
854    dgc_emit_sqtt_userdata(cs, sequence_id);
855 }
856 
857 static void
858 dgc_emit_sqtt_marker_event_with_dims(struct dgc_cmdbuf *cs, nir_def *sequence_id, nir_def *x, nir_def *y, nir_def *z,
859                                      enum rgp_sqtt_marker_event_type event)
860 {
861    struct rgp_sqtt_marker_event_with_dims marker = {0};
862    nir_builder *b = cs->b;
863 
864    marker.event.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
865    marker.event.api_type = event;
866    marker.event.has_thread_dims = 1;
867 
868    dgc_emit_sqtt_userdata(cs, nir_imm_int(b, marker.event.dword01));
869    dgc_emit_sqtt_userdata(cs, nir_imm_int(b, marker.event.dword02));
870    dgc_emit_sqtt_userdata(cs, sequence_id);
871    dgc_emit_sqtt_userdata(cs, x);
872    dgc_emit_sqtt_userdata(cs, y);
873    dgc_emit_sqtt_userdata(cs, z);
874 }
875 
876 static void
877 dgc_emit_sqtt_begin_api_marker(struct dgc_cmdbuf *cs, enum rgp_sqtt_marker_general_api_type api_type)
878 {
879    struct rgp_sqtt_marker_general_api marker = {0};
880    nir_builder *b = cs->b;
881 
882    marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_GENERAL_API;
883    marker.api_type = api_type;
884 
885    dgc_emit_sqtt_userdata(cs, nir_imm_int(b, marker.dword01));
886 }
887 
888 static void
889 dgc_emit_sqtt_end_api_marker(struct dgc_cmdbuf *cs, enum rgp_sqtt_marker_general_api_type api_type)
890 {
891    struct rgp_sqtt_marker_general_api marker = {0};
892    nir_builder *b = cs->b;
893 
894    marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_GENERAL_API;
895    marker.api_type = api_type;
896    marker.is_end = 1;
897 
898    dgc_emit_sqtt_userdata(cs, nir_imm_int(b, marker.dword01));
899 }
900 
901 /**
902  * Command buffer
903  */
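/* Size in bytes actually used in the main or ACE command buffer for the given sequence
 * count: stride * count plus room for the chaining packet, padded to the IB granularity.
 * Without a preamble, the precomputed full size from the params is used instead.
 */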
904 static nir_def *
905 dgc_cmd_buf_size(nir_builder *b, nir_def *sequence_count, bool is_ace, const struct radv_device *device)
906 {
907    nir_def *cmd_buf_size = is_ace ? load_param32(b, ace_cmd_buf_size) : load_param32(b, cmd_buf_size);
908    nir_def *cmd_buf_stride = is_ace ? load_param32(b, ace_cmd_buf_stride) : load_param32(b, cmd_buf_stride);
909    const enum amd_ip_type ip_type = is_ace ? AMD_IP_COMPUTE : AMD_IP_GFX;
910 
911    nir_def *use_preamble = nir_ine_imm(b, load_param8(b, use_preamble), 0);
912    nir_def *size = nir_iadd_imm(b, nir_imul(b, cmd_buf_stride, sequence_count), PKT3_INDIRECT_BUFFER_BYTES);
913    unsigned align_mask = radv_pad_cmdbuf(device, 1, ip_type) - 1;
914 
915    size = nir_iand_imm(b, nir_iadd_imm(b, size, align_mask), ~align_mask);
916 
917    /* Ensure we don't have to deal with a jump to an empty IB in the preamble. */
918    size = nir_imax(b, size, nir_imm_int(b, align_mask + 1));
919 
920    return nir_bcsel(b, use_preamble, size, cmd_buf_size);
921 }
922 
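/* Fill the tail of the generated command buffer (after the last emitted sequence) with NOP
 * packets and, on the compute queue, chain the end of the buffer to the trailer because IB2
 * isn't available there.
 */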
923 static void
924 build_dgc_buffer_tail(nir_builder *b, nir_def *cmd_buf_offset, nir_def *cmd_buf_size, nir_def *cmd_buf_stride,
925                       nir_def *cmd_buf_trailer_offset, nir_def *sequence_count, unsigned trailer_size, bool is_ace,
926                       const struct radv_device *device)
927 {
928    const struct radv_physical_device *pdev = radv_device_physical(device);
929    nir_def *is_compute_queue = nir_ior_imm(b, nir_ieq_imm(b, load_param8(b, queue_family), RADV_QUEUE_COMPUTE), is_ace);
930 
931    nir_def *global_id = get_global_ids(b, 1);
932 
933    nir_push_if(b, nir_ieq_imm(b, global_id, 0));
934    {
935       nir_def *cmd_buf_tail_start = nir_imul(b, cmd_buf_stride, sequence_count);
936 
937       nir_variable *offset = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "offset");
938       nir_store_var(b, offset, cmd_buf_tail_start, 0x1);
939 
940       /* On the compute queue, the DGC command buffer is chained by patching the
941        * trailer, but this isn't needed on graphics because it uses IB2.
942        */
943       cmd_buf_size =
944          nir_bcsel(b, is_compute_queue, nir_iadd_imm(b, cmd_buf_size, -PKT3_INDIRECT_BUFFER_BYTES), cmd_buf_size);
945 
946       nir_def *va = nir_pack_64_2x32_split(b, load_param32(b, upload_addr), nir_imm_int(b, pdev->info.address32_hi));
947       nir_push_loop(b);
948       {
949          nir_def *curr_offset = nir_load_var(b, offset);
950          const unsigned MAX_PACKET_WORDS = 0x3FFC;
951 
952          nir_break_if(b, nir_ieq(b, curr_offset, cmd_buf_size));
953 
954          nir_def *packet, *packet_size;
955 
956          packet_size = nir_isub(b, cmd_buf_size, curr_offset);
957          packet_size = nir_umin(b, packet_size, nir_imm_int(b, MAX_PACKET_WORDS * 4));
958 
959          nir_def *len = nir_ushr_imm(b, packet_size, 2);
960          len = nir_iadd_imm(b, len, -2);
961          packet = nir_pkt3(b, PKT3_NOP, len);
962 
963          nir_build_store_global(b, packet, nir_iadd(b, va, nir_u2u64(b, nir_iadd(b, curr_offset, cmd_buf_offset))),
964                                 .access = ACCESS_NON_READABLE);
965 
966          nir_store_var(b, offset, nir_iadd(b, curr_offset, packet_size), 0x1);
967       }
968       nir_pop_loop(b, NULL);
969 
970       nir_push_if(b, is_compute_queue);
971       {
972          nir_def *chain_packets[] = {
973             nir_imm_int(b, PKT3(PKT3_INDIRECT_BUFFER, 2, 0)),
974             nir_iadd(b, load_param32(b, upload_addr), cmd_buf_trailer_offset),
975             nir_imm_int(b, pdev->info.address32_hi),
976             nir_imm_int(b, trailer_size | S_3F2_CHAIN(1) | S_3F2_VALID(1) | S_3F2_PRE_ENA(false)),
977          };
978 
979          nir_build_store_global(b, nir_vec(b, chain_packets, 4),
980                                 nir_iadd(b, va, nir_u2u64(b, nir_iadd(b, nir_load_var(b, offset), cmd_buf_offset))),
981                                 .access = ACCESS_NON_READABLE);
982       }
983       nir_pop_if(b, NULL);
984    }
985    nir_pop_if(b, NULL);
986 }
987 
988 static void
989 build_dgc_buffer_tail_main(nir_builder *b, nir_def *sequence_count, const struct radv_device *device)
990 {
991    nir_def *cmd_buf_offset = load_param32(b, cmd_buf_main_offset);
992    nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, false, device);
993    nir_def *cmd_buf_stride = load_param32(b, cmd_buf_stride);
994    nir_def *cmd_buf_trailer_offset = nir_imm_int(b, 0);
995    unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_GFX) / 4;
996 
997    build_dgc_buffer_tail(b, cmd_buf_offset, cmd_buf_size, cmd_buf_stride, cmd_buf_trailer_offset, sequence_count,
998                          trailer_size, false, device);
999 }
1000 
1001 static void
1002 build_dgc_buffer_tail_ace(nir_builder *b, nir_def *sequence_count, const struct radv_device *device)
1003 {
1004    nir_def *cmd_buf_offset = load_param32(b, ace_cmd_buf_main_offset);
1005    nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, true, device);
1006    nir_def *cmd_buf_stride = load_param32(b, ace_cmd_buf_stride);
1007    nir_def *cmd_buf_trailer_offset = load_param32(b, ace_cmd_buf_trailer_offset);
1008    unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_COMPUTE) / 4;
1009 
1010    build_dgc_buffer_tail(b, cmd_buf_offset, cmd_buf_size, cmd_buf_stride, cmd_buf_trailer_offset, sequence_count,
1011                          trailer_size, true, device);
1012 }
1013 
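/* Initialize the trailer chunk: a NOP packet covers everything except the last
 * PKT3_INDIRECT_BUFFER_BYTES, which are filled with NOPs here and patched at execute time
 * for IB chaining.
 */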
1014 static void
1015 build_dgc_buffer_trailer(nir_builder *b, nir_def *cmd_buf_offset, unsigned trailer_size,
1016                          const struct radv_device *device)
1017 {
1018    const struct radv_physical_device *pdev = radv_device_physical(device);
1019 
1020    nir_def *global_id = get_global_ids(b, 1);
1021 
1022    nir_push_if(b, nir_ieq_imm(b, global_id, 0));
1023    {
1024       nir_def *va = nir_pack_64_2x32_split(b, load_param32(b, upload_addr), nir_imm_int(b, pdev->info.address32_hi));
1025       va = nir_iadd(b, va, nir_u2u64(b, cmd_buf_offset));
1026 
1027       const uint32_t pad_size = trailer_size - PKT3_INDIRECT_BUFFER_BYTES;
1028       const uint32_t pad_size_dw = pad_size >> 2;
1029 
1030       nir_def *len = nir_imm_int(b, pad_size_dw - 2);
1031       nir_def *packet = nir_pkt3(b, PKT3_NOP, len);
1032 
1033       nir_build_store_global(b, packet, va, .access = ACCESS_NON_READABLE);
1034 
1035       nir_def *nop_packets[] = {
1036          nir_imm_int(b, PKT3_NOP_PAD),
1037          nir_imm_int(b, PKT3_NOP_PAD),
1038          nir_imm_int(b, PKT3_NOP_PAD),
1039          nir_imm_int(b, PKT3_NOP_PAD),
1040       };
1041 
1042       nir_build_store_global(b, nir_vec(b, nop_packets, 4), nir_iadd_imm(b, va, pad_size),
1043                              .access = ACCESS_NON_READABLE);
1044    }
1045    nir_pop_if(b, NULL);
1046 }
1047 
1048 static void
1049 build_dgc_buffer_trailer_main(nir_builder *b, const struct radv_device *device)
1050 {
1051    nir_def *cmd_buf_offset = nir_imm_int(b, 0);
1052    const unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_GFX);
1053 
1054    build_dgc_buffer_trailer(b, cmd_buf_offset, trailer_size, device);
1055 }
1056 
1057 static void
1058 build_dgc_buffer_trailer_ace(nir_builder *b, const struct radv_device *device)
1059 {
1060    nir_def *cmd_buf_offset = load_param32(b, ace_cmd_buf_trailer_offset);
1061    const unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_COMPUTE);
1062 
1063    build_dgc_buffer_trailer(b, cmd_buf_offset, trailer_size, device);
1064 }
1065 
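/* When a preamble is used, write it: NOP padding followed by an INDIRECT_BUFFER packet
 * that chains to the main generated command buffer.
 */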
1066 static void
1067 build_dgc_buffer_preamble(nir_builder *b, nir_def *cmd_buf_preamble_offset, nir_def *cmd_buf_size,
1068                           nir_def *cmd_buf_main_offset, unsigned preamble_size, nir_def *sequence_count,
1069                           const struct radv_device *device)
1070 {
1071    const struct radv_physical_device *pdev = radv_device_physical(device);
1072 
1073    nir_def *global_id = get_global_ids(b, 1);
1074    nir_def *use_preamble = nir_ine_imm(b, load_param8(b, use_preamble), 0);
1075 
1076    nir_push_if(b, nir_iand(b, nir_ieq_imm(b, global_id, 0), use_preamble));
1077    {
1078       nir_def *va = nir_pack_64_2x32_split(b, load_param32(b, upload_addr), nir_imm_int(b, pdev->info.address32_hi));
1079       va = nir_iadd(b, va, nir_u2u64(b, cmd_buf_preamble_offset));
1080 
1081       nir_def *words = nir_ushr_imm(b, cmd_buf_size, 2);
1082 
1083       const uint32_t pad_size = preamble_size - PKT3_INDIRECT_BUFFER_BYTES;
1084       const uint32_t pad_size_dw = pad_size >> 2;
1085 
1086       nir_def *len = nir_imm_int(b, pad_size_dw - 2);
1087       nir_def *packet = nir_pkt3(b, PKT3_NOP, len);
1088 
1089       nir_build_store_global(b, packet, va, .access = ACCESS_NON_READABLE);
1090 
1091       nir_def *chain_packets[] = {
1092          nir_imm_int(b, PKT3(PKT3_INDIRECT_BUFFER, 2, 0)),
1093          nir_iadd(b, cmd_buf_main_offset, load_param32(b, upload_addr)),
1094          nir_imm_int(b, pdev->info.address32_hi),
1095          nir_ior_imm(b, words, S_3F2_CHAIN(1) | S_3F2_VALID(1) | S_3F2_PRE_ENA(false)),
1096       };
1097 
1098       nir_build_store_global(b, nir_vec(b, chain_packets, 4), nir_iadd_imm(b, va, pad_size),
1099                              .access = ACCESS_NON_READABLE);
1100    }
1101    nir_pop_if(b, NULL);
1102 }
1103 
1104 static void
1105 build_dgc_buffer_preamble_main(nir_builder *b, nir_def *sequence_count, const struct radv_device *device)
1106 {
1107    nir_def *cmd_buf_preamble_offset = load_param32(b, cmd_buf_preamble_offset);
1108    nir_def *cmd_buf_main_offset = load_param32(b, cmd_buf_main_offset);
1109    nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, false, device);
1110    unsigned preamble_size = radv_dgc_preamble_cmdbuf_size(device, AMD_IP_GFX);
1111 
1112    build_dgc_buffer_preamble(b, cmd_buf_preamble_offset, cmd_buf_size, cmd_buf_main_offset, preamble_size,
1113                              sequence_count, device);
1114 }
1115 
1116 static void
1117 build_dgc_buffer_preamble_ace(nir_builder *b, nir_def *sequence_count, const struct radv_device *device)
1118 {
1119    nir_def *cmd_buf_preamble_offset = load_param32(b, ace_cmd_buf_preamble_offset);
1120    nir_def *cmd_buf_main_offset = load_param32(b, ace_cmd_buf_main_offset);
1121    nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, true, device);
1122    unsigned preamble_size = radv_dgc_preamble_cmdbuf_size(device, AMD_IP_COMPUTE);
1123 
1124    build_dgc_buffer_preamble(b, cmd_buf_preamble_offset, cmd_buf_size, cmd_buf_main_offset, preamble_size,
1125                              sequence_count, device);
1126 }
1127 
1128 /**
1129  * Draw
1130  */
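/* Emit the base vertex/first instance/draw id user SGPRs for a draw. How many values are
 * written depends on whether the shader needs the draw id and/or the base instance
 * (DGC_USES_* flags packed into vtx_base_sgpr).
 */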
1131 static void
1132 dgc_emit_userdata_vertex(struct dgc_cmdbuf *cs, nir_def *first_vertex, nir_def *first_instance, nir_def *drawid)
1133 {
1134    nir_builder *b = cs->b;
1135 
1136    nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
1137    vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr);
1138 
1139    nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
1140    nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE);
1141 
1142    nir_def *pkt_cnt = nir_imm_int(b, 1);
1143    pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
1144    pkt_cnt = nir_bcsel(b, has_baseinstance, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
1145 
1146    dgc_cs_begin(cs);
1147    dgc_cs_emit(nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt));
1148    dgc_cs_emit(nir_iand_imm(b, vtx_base_sgpr, 0x3FFF));
1149    dgc_cs_emit(first_vertex);
1150    dgc_cs_emit(nir_bcsel(b, nir_ior(b, has_drawid, has_baseinstance), nir_bcsel(b, has_drawid, drawid, first_instance),
1151                          nir_imm_int(b, PKT3_NOP_PAD)));
1152    dgc_cs_emit(nir_bcsel(b, nir_iand(b, has_drawid, has_baseinstance), first_instance, nir_imm_int(b, PKT3_NOP_PAD)));
1153    dgc_cs_end();
1154 }
1155 
1156 static void
1157 dgc_emit_instance_count(struct dgc_cmdbuf *cs, nir_def *instance_count)
1158 {
1159    dgc_cs_begin(cs);
1160    dgc_cs_emit_imm(PKT3(PKT3_NUM_INSTANCES, 0, 0));
1161    dgc_cs_emit(instance_count);
1162    dgc_cs_end();
1163 }
1164 
1165 static void
1166 dgc_emit_draw_index_offset_2(struct dgc_cmdbuf *cs, nir_def *index_offset, nir_def *index_count,
1167                              nir_def *max_index_count)
1168 {
1169    dgc_cs_begin(cs);
1170    dgc_cs_emit_imm(PKT3(PKT3_DRAW_INDEX_OFFSET_2, 3, 0));
1171    dgc_cs_emit(max_index_count);
1172    dgc_cs_emit(index_offset);
1173    dgc_cs_emit(index_count);
1174    dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_DMA);
1175    dgc_cs_end();
1176 }
1177 
1178 static void
1179 dgc_emit_draw_index_auto(struct dgc_cmdbuf *cs, nir_def *vertex_count)
1180 {
1181    dgc_cs_begin(cs);
1182    dgc_cs_emit_imm(PKT3(PKT3_DRAW_INDEX_AUTO, 1, 0));
1183    dgc_cs_emit(vertex_count);
1184    dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_AUTO_INDEX);
1185    dgc_cs_end();
1186 }
1187 
1188 static void
1189 dgc_emit_pkt3_set_base(struct dgc_cmdbuf *cs, nir_def *va)
1190 {
1191    nir_builder *b = cs->b;
1192 
1193    nir_def *va_lo = nir_unpack_64_2x32_split_x(b, va);
1194    nir_def *va_hi = nir_unpack_64_2x32_split_y(b, va);
1195 
1196    dgc_cs_begin(cs);
1197    dgc_cs_emit_imm(PKT3(PKT3_SET_BASE, 2, 0));
1198    dgc_cs_emit_imm(1);
1199    dgc_cs_emit(va_lo);
1200    dgc_cs_emit(va_hi);
1201    dgc_cs_end();
1202 }
1203 
1204 static void
1205 dgc_emit_pkt3_draw_indirect(struct dgc_cmdbuf *cs, bool indexed)
1206 {
1207    const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
1208    nir_builder *b = cs->b;
1209 
1210    nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
1211 
1212    nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
1213    nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE);
1214 
1215    vtx_base_sgpr = nir_iand_imm(b, nir_u2u32(b, vtx_base_sgpr), 0x3FFF);
1216 
1217    /* vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2 */
1218    nir_def *vertex_offset_reg = vtx_base_sgpr;
1219 
1220    /* start_instance_reg = (base_reg + (draw_id_enable ? 8 : 4) - SI_SH_REG_OFFSET) >> 2 */
1221    nir_def *start_instance_offset = nir_bcsel(b, has_drawid, nir_imm_int(b, 2), nir_imm_int(b, 1));
1222    nir_def *start_instance_reg = nir_iadd(b, vtx_base_sgpr, start_instance_offset);
1223 
1224    /* draw_id_reg = (base_reg + 4 - SI_SH_REG_OFFSET) >> 2 */
1225    nir_def *draw_id_reg = nir_iadd(b, vtx_base_sgpr, nir_imm_int(b, 1));
1226 
1227    nir_if *if_drawid = nir_push_if(b, has_drawid);
1228    {
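      /* Use the *_MULTI variant (with DRAW_INDEX_ENABLE) so the CP writes the draw id user
       * SGPR; only a single draw is emitted, so the count address and stride stay zero.
       */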
1229       const unsigned pkt3_op = indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI;
1230 
1231       dgc_cs_begin(cs);
1232       dgc_cs_emit_imm(PKT3(pkt3_op, 8, 0));
1233       dgc_cs_emit_imm(0);
1234       dgc_cs_emit(vertex_offset_reg);
1235       dgc_cs_emit(nir_bcsel(b, has_baseinstance, start_instance_reg, nir_imm_int(b, 0)));
1236       dgc_cs_emit(nir_ior(b, draw_id_reg, nir_imm_int(b, S_2C3_DRAW_INDEX_ENABLE(1))));
1237       dgc_cs_emit_imm(1); /* draw count */
1238       dgc_cs_emit_imm(0); /* count va low */
1239       dgc_cs_emit_imm(0); /* count va high */
1240       dgc_cs_emit_imm(0); /* stride */
1241       dgc_cs_emit_imm(di_src_sel);
1242       dgc_cs_end();
1243    }
1244    nir_push_else(b, if_drawid);
1245    {
1246       const unsigned pkt3_op = indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT;
1247 
1248       dgc_cs_begin(cs);
1249       dgc_cs_emit_imm(PKT3(pkt3_op, 3, 0));
1250       dgc_cs_emit_imm(0);
1251       dgc_cs_emit(vertex_offset_reg);
1252       dgc_cs_emit(nir_bcsel(b, has_baseinstance, start_instance_reg, nir_imm_int(b, 0)));
1253       dgc_cs_emit_imm(di_src_sel);
1254       dgc_cs_end();
1255    }
1256    nir_pop_if(b, if_drawid);
1257 }
1258 
1259 static void
1260 dgc_emit_draw_indirect(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, bool indexed)
1261 {
1262    const struct radv_indirect_command_layout *layout = cs->layout;
1263    nir_builder *b = cs->b;
1264 
1265    nir_def *va = nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B);
1266 
1267    dgc_emit_sqtt_begin_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirect : ApiCmdDrawIndirect);
1268    dgc_emit_sqtt_marker_event(cs, sequence_id, indexed ? EventCmdDrawIndexedIndirect : EventCmdDrawIndirect);
1269 
1270    dgc_emit_pkt3_set_base(cs, va);
1271    dgc_emit_pkt3_draw_indirect(cs, indexed);
1272 
1273    dgc_emit_sqtt_thread_trace_marker(cs);
1274    dgc_emit_sqtt_end_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirect : ApiCmdDrawIndirect);
1275 }
1276 
1277 static void
1278 dgc_emit_draw(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id)
1279 {
1280    const struct radv_indirect_command_layout *layout = cs->layout;
1281    nir_builder *b = cs->b;
1282 
1283    nir_def *draw_data0 = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
1284                                                .access = ACCESS_NON_WRITEABLE);
1285    nir_def *vertex_count = nir_channel(b, draw_data0, 0);
1286    nir_def *instance_count = nir_channel(b, draw_data0, 1);
1287    nir_def *vertex_offset = nir_channel(b, draw_data0, 2);
1288    nir_def *first_instance = nir_channel(b, draw_data0, 3);
1289 
1290    nir_push_if(b, nir_iand(b, nir_ine_imm(b, vertex_count, 0), nir_ine_imm(b, instance_count, 0)));
1291    {
1292       dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDraw);
1293       dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDraw);
1294 
1295       dgc_emit_userdata_vertex(cs, vertex_offset, first_instance, nir_imm_int(b, 0));
1296       dgc_emit_instance_count(cs, instance_count);
1297       dgc_emit_draw_index_auto(cs, vertex_count);
1298 
1299       dgc_emit_sqtt_thread_trace_marker(cs);
1300       dgc_emit_sqtt_end_api_marker(cs, ApiCmdDraw);
1301    }
1302    nir_pop_if(b, 0);
1303 }
1304 
1305 static void
1306 dgc_emit_draw_indexed(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, nir_def *max_index_count)
1307 {
1308    const struct radv_indirect_command_layout *layout = cs->layout;
1309    nir_builder *b = cs->b;
1310 
1311    nir_def *draw_data0 = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
1312                                                .access = ACCESS_NON_WRITEABLE);
1313    nir_def *draw_data1 =
1314       nir_build_load_global(b, 1, 32, nir_iadd_imm(b, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B), 16),
1315                             .access = ACCESS_NON_WRITEABLE);
1316    nir_def *index_count = nir_channel(b, draw_data0, 0);
1317    nir_def *instance_count = nir_channel(b, draw_data0, 1);
1318    nir_def *first_index = nir_channel(b, draw_data0, 2);
1319    nir_def *vertex_offset = nir_channel(b, draw_data0, 3);
1320    nir_def *first_instance = nir_channel(b, draw_data1, 0);
1321 
1322    nir_push_if(b, nir_iand(b, nir_ine_imm(b, index_count, 0), nir_ine_imm(b, instance_count, 0)));
1323    {
1324       dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDrawIndexed);
1325       dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDrawIndexed);
1326 
1327       dgc_emit_userdata_vertex(cs, vertex_offset, first_instance, nir_imm_int(b, 0));
1328       dgc_emit_instance_count(cs, instance_count);
1329       dgc_emit_draw_index_offset_2(cs, first_index, index_count, max_index_count);
1330 
1331       dgc_emit_sqtt_thread_trace_marker(cs);
1332       dgc_emit_sqtt_end_api_marker(cs, ApiCmdDrawIndexed);
1333    }
1334    nir_pop_if(b, 0);
1335 }
1336 
1337 static void
1338 dgc_emit_draw_with_count(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, bool indexed)
1339 {
1340    const struct radv_indirect_command_layout *layout = cs->layout;
1341    nir_builder *b = cs->b;
1342 
1343    nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
1344    nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
1345    nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE);
1346 
1347    nir_def *draw_data = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
1348                                               .access = ACCESS_NON_WRITEABLE);
1349    nir_def *va = nir_pack_64_2x32(b, nir_channels(b, draw_data, 0x3));
1350    nir_def *stride = nir_channel(b, draw_data, 2);
1351    nir_def *draw_count = nir_umin(b, load_param32(b, max_draw_count), nir_channel(b, draw_data, 3));
1352 
1353    dgc_emit_pkt3_set_base(cs, va);
1354 
1355    nir_def *vertex_offset_reg = nir_iand_imm(b, vtx_base_sgpr, 0x3FFF);
1356    nir_def *start_instance_offset = nir_bcsel(b, has_drawid, nir_imm_int(b, 2), nir_imm_int(b, 1));
1357    nir_def *start_instance_reg =
1358       nir_bcsel(b, has_baseinstance, nir_iadd(b, vertex_offset_reg, start_instance_offset), nir_imm_int(b, 0));
1359    nir_def *draw_id_reg = nir_bcsel(
1360       b, has_drawid, nir_ior_imm(b, nir_iadd(b, vertex_offset_reg, nir_imm_int(b, 1)), S_2C3_DRAW_INDEX_ENABLE(1)),
1361       nir_imm_int(b, 0));
1362 
1363    nir_def *di_src_sel = nir_imm_int(b, indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX);
1364 
1365    dgc_emit_sqtt_begin_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirectCount : ApiCmdDrawIndirectCount);
1366    dgc_emit_sqtt_marker_event(cs, sequence_id, indexed ? EventCmdDrawIndexedIndirectCount : EventCmdDrawIndirectCount);
1367 
1368    dgc_cs_begin(cs);
1369    dgc_cs_emit_imm(PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8, false));
1370    dgc_cs_emit_imm(0);
1371    dgc_cs_emit(vertex_offset_reg);
1372    dgc_cs_emit(start_instance_reg);
1373    dgc_cs_emit(draw_id_reg);
1374    dgc_cs_emit(draw_count);
1375    dgc_cs_emit_imm(0);
1376    dgc_cs_emit_imm(0);
1377    dgc_cs_emit(stride);
1378    dgc_cs_emit(di_src_sel);
1379    dgc_cs_end();
1380 
1381    dgc_emit_sqtt_thread_trace_marker(cs);
1382    dgc_emit_sqtt_end_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirectCount : ApiCmdDrawIndirectCount);
1383 }
1384 
1385 /**
1386  * Index buffer
1387  */
1388 static nir_def *
1389 dgc_get_index_type(struct dgc_cmdbuf *cs, nir_def *user_index_type)
1390 {
1391    const struct radv_indirect_command_layout *layout = cs->layout;
1392    nir_builder *b = cs->b;
1393 
1394    if (layout->vk.index_mode_is_dx) {
1395       nir_def *index_type = nir_bcsel(b, nir_ieq_imm(b, user_index_type, 0x2a /* DXGI_FORMAT_R32_UINT */),
1396                                       nir_imm_int(b, V_028A7C_VGT_INDEX_32), nir_imm_int(b, V_028A7C_VGT_INDEX_16));
1397       return nir_bcsel(b, nir_ieq_imm(b, user_index_type, 0x3e /* DXGI_FORMAT_R8_UINT */),
1398                        nir_imm_int(b, V_028A7C_VGT_INDEX_8), index_type);
1399    } else {
1400       nir_def *index_type = nir_bcsel(b, nir_ieq_imm(b, user_index_type, VK_INDEX_TYPE_UINT32),
1401                                       nir_imm_int(b, V_028A7C_VGT_INDEX_32), nir_imm_int(b, V_028A7C_VGT_INDEX_16));
1402       return nir_bcsel(b, nir_ieq_imm(b, user_index_type, VK_INDEX_TYPE_UINT8), nir_imm_int(b, V_028A7C_VGT_INDEX_8),
1403                        index_type);
1404    }
1405 }
1406 
1407 static void
1408 dgc_emit_index_buffer(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_variable *max_index_count_var)
1409 {
1410    const struct radv_indirect_command_layout *layout = cs->layout;
1411    const struct radv_device *device = cs->dev;
1412    const struct radv_physical_device *pdev = radv_device_physical(device);
1413    nir_builder *b = cs->b;
1414 
1415    nir_def *data = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.index_src_offset_B),
1416                                          .access = ACCESS_NON_WRITEABLE);
1417 
1418    nir_def *index_type = dgc_get_index_type(cs, nir_channel(b, data, 3));
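   /* 0x142 is a nibble-packed LUT giving the index size in bytes per VGT index type:
    * 16-bit -> 2, 32-bit -> 4, 8-bit -> 1.
    */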
1419    nir_def *index_size = nir_iand_imm(b, nir_ushr(b, nir_imm_int(b, 0x142), nir_imul_imm(b, index_type, 4)), 0xf);
1420 
1421    nir_def *max_index_count = nir_udiv(b, nir_channel(b, data, 2), index_size);
1422    nir_store_var(b, max_index_count_var, max_index_count, 0x1);
1423 
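   /* Sign-extend the low 16 bits of the high address dword (assuming the index buffer VA is a
    * canonical 48-bit address).
    */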
1424    nir_def *addr_upper = nir_channel(b, data, 1);
1425    addr_upper = nir_ishr_imm(b, nir_ishl_imm(b, addr_upper, 16), 16);
1426 
1427    dgc_cs_begin(cs);
1428 
1429    if (pdev->info.gfx_level >= GFX9) {
1430       unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX;
1431       if (pdev->info.gfx_level < GFX9 || (pdev->info.gfx_level == GFX9 && pdev->info.me_fw_version < 26))
1432          opcode = PKT3_SET_UCONFIG_REG;
1433       dgc_cs_emit_imm(PKT3(opcode, 1, 0));
1434       dgc_cs_emit_imm((R_03090C_VGT_INDEX_TYPE - CIK_UCONFIG_REG_OFFSET) >> 2 | (2u << 28));
1435       dgc_cs_emit(index_type);
1436    } else {
1437       dgc_cs_emit_imm(PKT3(PKT3_INDEX_TYPE, 0, 0));
1438       dgc_cs_emit(index_type);
1439       dgc_cs_emit(nir_imm_int(b, PKT3_NOP_PAD));
1440    }
1441 
1442    dgc_cs_emit_imm(PKT3(PKT3_INDEX_BASE, 1, 0));
1443    dgc_cs_emit(nir_channel(b, data, 0));
1444    dgc_cs_emit(addr_upper);
1445 
1446    dgc_cs_emit_imm(PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
1447    dgc_cs_emit(max_index_count);
1448 
1449    dgc_cs_end();
1450 }
1451 
1452 /**
1453  * Push constants
1454  */
1455 static nir_def *
1456 dgc_get_push_constant_stages(struct dgc_cmdbuf *cs)
1457 {
1458    const struct radv_indirect_command_layout *layout = cs->layout;
1459    nir_builder *b = cs->b;
1460 
1461    if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
1462       nir_def *has_push_constant = nir_ine_imm(b, load_shader_metadata32(cs, push_const_sgpr), 0);
1463       return nir_bcsel(b, has_push_constant, nir_imm_int(b, VK_SHADER_STAGE_COMPUTE_BIT), nir_imm_int(b, 0));
1464    } else {
1465       return load_param16(b, push_constant_stages);
1466    }
1467 }
1468 
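/* The params buffer stores 12 bytes per shader stage: one dword packing the upload SGPR
 * (low 16 bits) and the inline SGPR (high 16 bits), followed by a 64-bit inline push-constant
 * mask. For dispatches, the same information is read from the compute shader metadata instead.
 */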
1469 static nir_def *
1470 dgc_get_upload_sgpr(struct dgc_cmdbuf *cs, nir_def *param_buf, nir_def *param_offset, gl_shader_stage stage)
1471 {
1472    const struct radv_indirect_command_layout *layout = cs->layout;
1473    nir_builder *b = cs->b;
1474    nir_def *res;
1475 
1476    if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
1477       res = load_shader_metadata32(cs, push_const_sgpr);
1478    } else {
1479       res = nir_load_ssbo(b, 1, 32, param_buf, nir_iadd_imm(b, param_offset, stage * 12));
1480    }
1481 
1482    return nir_ubfe_imm(b, res, 0, 16);
1483 }
1484 
1485 static nir_def *
1486 dgc_get_inline_sgpr(struct dgc_cmdbuf *cs, nir_def *param_buf, nir_def *param_offset, gl_shader_stage stage)
1487 {
1488    const struct radv_indirect_command_layout *layout = cs->layout;
1489    nir_builder *b = cs->b;
1490    nir_def *res;
1491 
1492    if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
1493       res = load_shader_metadata32(cs, push_const_sgpr);
1494    } else {
1495       res = nir_load_ssbo(b, 1, 32, param_buf, nir_iadd_imm(b, param_offset, stage * 12));
1496    }
1497 
1498    return nir_ubfe_imm(b, res, 16, 16);
1499 }
1500 
1501 static nir_def *
1502 dgc_get_inline_mask(struct dgc_cmdbuf *cs, nir_def *param_buf, nir_def *param_offset, gl_shader_stage stage)
1503 {
1504    const struct radv_indirect_command_layout *layout = cs->layout;
1505    nir_builder *b = cs->b;
1506 
1507    if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
1508       return load_shader_metadata64(cs, inline_push_const_mask);
1509    } else {
1510       nir_def *reg_info = nir_load_ssbo(b, 2, 32, param_buf, nir_iadd_imm(b, param_offset, stage * 12 + 4));
1511       return nir_pack_64_2x32(b, nir_channels(b, reg_info, 0x3));
1512    }
1513 }
1514 
1515 static nir_def *
1516 dgc_push_constant_needs_copy(struct dgc_cmdbuf *cs)
1517 {
1518    const struct radv_indirect_command_layout *layout = cs->layout;
1519    nir_builder *b = cs->b;
1520 
1521    if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
1522       return nir_ine_imm(b, nir_ubfe_imm(b, load_shader_metadata32(cs, push_const_sgpr), 0, 16), 0);
1523    } else {
1524       return nir_ine_imm(b, load_param8(b, const_copy), 0);
1525    }
1526 }
1527 
1528 struct dgc_pc_params {
1529    nir_def *buf;
1530    nir_def *offset;
1531    nir_def *const_offset;
1532 };
1533 
1534 static struct dgc_pc_params
1535 dgc_get_pc_params(struct dgc_cmdbuf *cs)
1536 {
1537    const struct radv_indirect_command_layout *layout = cs->layout;
1538    struct dgc_pc_params params = {0};
1539    nir_builder *b = cs->b;
1540 
1541    params.buf = radv_meta_load_descriptor(b, 0, 0);
1542 
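   /* Push-constant params follow either the compute shader metadata (dispatch) or the VBO info
    * table (graphics with vertex buffers) inside the same descriptor buffer.
    */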
1543    uint32_t offset = 0;
1544    if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
1545       offset =
1546          (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) ? 0 : sizeof(struct radv_compute_pipeline_metadata);
1547    } else {
1548       if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB))
1549          offset = MAX_VBS * DGC_VBO_INFO_SIZE;
1550    }
1551 
1552    params.offset = nir_imm_int(b, offset);
1553    params.const_offset = nir_iadd_imm(b, params.offset, MESA_VULKAN_SHADER_STAGES * 12);
1554 
1555    return params;
1556 }
1557 
1558 static void
1559 dgc_alloc_push_constant(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id,
1560                         const struct dgc_pc_params *params)
1561 {
1562    const struct radv_indirect_command_layout *layout = cs->layout;
1563    VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, layout->vk.layout);
1564    nir_builder *b = cs->b;
1565 
1566    for (uint32_t i = 0; i < pipeline_layout->push_constant_size / 4; i++) {
1567       nir_def *data;
1568 
1569       if (layout->sequence_index_mask & (1ull << i)) {
1570          data = sequence_id;
1571       } else if ((layout->push_constant_mask & (1ull << i))) {
1572          data = nir_build_load_global(b, 1, 32, nir_iadd_imm(b, stream_addr, layout->push_constant_offsets[i]),
1573                                       .access = ACCESS_NON_WRITEABLE);
1574       } else {
1575          data = nir_load_ssbo(b, 1, 32, params->buf, nir_iadd_imm(b, params->const_offset, i * 4));
1576       }
1577 
1578       dgc_upload(cs, data);
1579    }
1580 }
1581 
1582 static void
1583 dgc_emit_push_constant_for_stage(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id,
1584                                  const struct dgc_pc_params *params, gl_shader_stage stage)
1585 {
1586    const struct radv_indirect_command_layout *layout = cs->layout;
1587    VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, layout->vk.layout);
1588    nir_builder *b = cs->b;
1589 
1590    nir_def *upload_sgpr = dgc_get_upload_sgpr(cs, params->buf, params->offset, stage);
1591    nir_def *inline_sgpr = dgc_get_inline_sgpr(cs, params->buf, params->offset, stage);
1592    nir_def *inline_mask = dgc_get_inline_mask(cs, params->buf, params->offset, stage);
1593 
1594    nir_push_if(b, nir_ine_imm(b, upload_sgpr, 0));
1595    {
1596       dgc_cs_begin(cs);
1597       dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 1, 0));
1598       dgc_cs_emit(upload_sgpr);
1599       dgc_cs_emit(nir_iadd(b, load_param32(b, upload_addr), nir_load_var(b, cs->upload_offset)));
1600       dgc_cs_end();
1601    }
1602    nir_pop_if(b, NULL);
1603 
1604    nir_push_if(b, nir_ine_imm(b, inline_sgpr, 0));
1605    {
1606       nir_variable *pc_idx = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "pc_idx");
1607       nir_store_var(b, pc_idx, nir_imm_int(b, 0), 0x1);
1608 
1609       for (uint32_t i = 0; i < pipeline_layout->push_constant_size / 4; i++) {
1610          nir_push_if(b, nir_ine_imm(b, nir_iand_imm(b, inline_mask, 1ull << i), 0));
1611          {
1612             nir_def *data = NULL;
1613 
1614             if (layout->sequence_index_mask & (1ull << i)) {
1615                data = sequence_id;
1616             } else if (layout->push_constant_mask & (1ull << i)) {
1617                data = nir_build_load_global(b, 1, 32, nir_iadd_imm(b, stream_addr, layout->push_constant_offsets[i]),
1618                                             .access = ACCESS_NON_WRITEABLE);
1619             } else if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) {
1620                /* For indirect pipeline binds, partial push constant updates can't be emitted when
1621                 * the DGC execute is called because there is no bound pipeline and they have to be
1622                 * emitted from the DGC prepare shader.
1623                 */
1624                data = nir_load_ssbo(b, 1, 32, params->buf, nir_iadd_imm(b, params->const_offset, i * 4));
1625             }
1626 
1627             if (data) {
1628                dgc_cs_begin(cs);
1629                dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 1, 0));
1630                dgc_cs_emit(nir_iadd(b, inline_sgpr, nir_load_var(b, pc_idx)));
1631                dgc_cs_emit(data);
1632                dgc_cs_end();
1633             }
1634 
1635             nir_store_var(b, pc_idx, nir_iadd_imm(b, nir_load_var(b, pc_idx), 1), 0x1);
1636          }
1637          nir_pop_if(b, NULL);
1638       }
1639    }
1640    nir_pop_if(b, NULL);
1641 }
1642 
1643 static void
1644 dgc_emit_push_constant(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, VkShaderStageFlags stages)
1645 {
1646    const struct dgc_pc_params params = dgc_get_pc_params(cs);
1647    nir_builder *b = cs->b;
1648 
1649    nir_def *push_constant_stages = dgc_get_push_constant_stages(cs);
1650    radv_foreach_stage(s, stages)
1651    {
1652       nir_push_if(b, nir_test_mask(b, push_constant_stages, mesa_to_vk_shader_stage(s)));
1653       {
1654          dgc_emit_push_constant_for_stage(cs, stream_addr, sequence_id, &params, s);
1655       }
1656       nir_pop_if(b, NULL);
1657    }
1658 
1659    nir_def *const_copy = dgc_push_constant_needs_copy(cs);
1660    nir_push_if(b, const_copy);
1661    {
1662       dgc_alloc_push_constant(cs, stream_addr, sequence_id, &params);
1663    }
1664    nir_pop_if(b, NULL);
1665 }
1666 
1667 /**
1668  * Vertex buffers
1669  */
1670 struct dgc_vbo_info {
1671    nir_def *va;
1672    nir_def *size;
1673    nir_def *stride;
1674 
1675    nir_def *attrib_end;
1676    nir_def *attrib_index_offset;
1677 
1678    nir_def *non_trivial_format;
1679 };
1680 
1681 static nir_def *
1682 dgc_get_rsrc3_vbo_desc(struct dgc_cmdbuf *cs, const struct dgc_vbo_info *vbo_info)
1683 {
1684    const struct radv_device *device = cs->dev;
1685    const struct radv_physical_device *pdev = radv_device_physical(device);
1686    nir_builder *b = cs->b;
1687 
1688    uint32_t rsrc_word3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1689                          S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
1690 
1691    if (pdev->info.gfx_level >= GFX10) {
1692       rsrc_word3 |= S_008F0C_FORMAT_GFX10(V_008F0C_GFX10_FORMAT_32_UINT);
1693    } else {
1694       rsrc_word3 |=
1695          S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
1696    }
1697 
1698    nir_def *uses_dynamic_inputs = nir_ieq_imm(b, load_param8(b, dynamic_vs_input), 1);
1699    nir_def *uses_non_trivial_format = nir_iand(b, uses_dynamic_inputs, nir_ine_imm(b, vbo_info->non_trivial_format, 0));
1700 
1701    return nir_bcsel(b, uses_non_trivial_format, vbo_info->non_trivial_format, nir_imm_int(b, rsrc_word3));
1702 }
1703 
1704 static void
1705 dgc_write_vertex_descriptor(struct dgc_cmdbuf *cs, const struct dgc_vbo_info *vbo_info, nir_variable *desc)
1706 {
1707    const struct radv_device *device = cs->dev;
1708    const struct radv_physical_device *pdev = radv_device_physical(device);
1709    nir_builder *b = cs->b;
1710 
1711    nir_variable *num_records = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "num_records");
1712    nir_store_var(b, num_records, vbo_info->size, 0x1);
1713 
1714    nir_def *use_per_attribute_vb_descs = nir_ieq_imm(b, load_param8(b, use_per_attribute_vb_descs), 1);
1715    nir_push_if(b, use_per_attribute_vb_descs);
1716    {
1717       nir_push_if(b, nir_ult(b, nir_load_var(b, num_records), vbo_info->attrib_end));
1718       {
1719          nir_store_var(b, num_records, nir_imm_int(b, 0), 0x1);
1720       }
1721       nir_push_else(b, NULL);
1722       nir_push_if(b, nir_ieq_imm(b, vbo_info->stride, 0));
1723       {
1724          nir_store_var(b, num_records, nir_imm_int(b, 1), 0x1);
1725       }
1726       nir_push_else(b, NULL);
1727       {
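         /* num_records = (size - attrib_end) / stride + 1 + attrib_index_offset, i.e. how many
          * vertices can be fetched for this attribute.
          */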
1728          nir_def *r = nir_iadd(
1729             b,
1730             nir_iadd_imm(
1731                b, nir_udiv(b, nir_isub(b, nir_load_var(b, num_records), vbo_info->attrib_end), vbo_info->stride), 1),
1732             vbo_info->attrib_index_offset);
1733          nir_store_var(b, num_records, r, 0x1);
1734       }
1735       nir_pop_if(b, NULL);
1736       nir_pop_if(b, NULL);
1737 
1738       nir_def *convert_cond = nir_ine_imm(b, nir_load_var(b, num_records), 0);
1739       if (pdev->info.gfx_level == GFX9)
1740          convert_cond = nir_imm_false(b);
1741       else if (pdev->info.gfx_level != GFX8)
1742          convert_cond = nir_iand(b, convert_cond, nir_ieq_imm(b, vbo_info->stride, 0));
1743 
1744       nir_def *new_records = nir_iadd(
1745          b, nir_imul(b, nir_iadd_imm(b, nir_load_var(b, num_records), -1), vbo_info->stride), vbo_info->attrib_end);
1746       new_records = nir_bcsel(b, convert_cond, new_records, nir_load_var(b, num_records));
1747       nir_store_var(b, num_records, new_records, 0x1);
1748    }
1749    nir_push_else(b, NULL);
1750    {
1751       if (pdev->info.gfx_level != GFX8) {
1752          nir_push_if(b, nir_ine_imm(b, vbo_info->stride, 0));
1753          {
1754             nir_def *r = nir_iadd(b, nir_load_var(b, num_records), nir_iadd_imm(b, vbo_info->stride, -1));
1755             nir_store_var(b, num_records, nir_udiv(b, r, vbo_info->stride), 0x1);
1756          }
1757          nir_pop_if(b, NULL);
1758       }
1759    }
1760    nir_pop_if(b, NULL);
1761 
1762    nir_def *rsrc_word3 = dgc_get_rsrc3_vbo_desc(cs, vbo_info);
1763    if (pdev->info.gfx_level >= GFX10) {
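      /* OOB_SELECT lives in bits [29:28] of dword 3: use RAW bounds checking when the stride is
       * zero, STRUCTURED otherwise.
       */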
1764       nir_def *oob_select = nir_bcsel(b, nir_ieq_imm(b, vbo_info->stride, 0), nir_imm_int(b, V_008F0C_OOB_SELECT_RAW),
1765                                       nir_imm_int(b, V_008F0C_OOB_SELECT_STRUCTURED));
1766       rsrc_word3 = nir_iand_imm(b, rsrc_word3, C_008F0C_OOB_SELECT);
1767       rsrc_word3 = nir_ior(b, rsrc_word3, nir_ishl_imm(b, oob_select, 28));
1768    }
1769 
1770    nir_def *va_hi = nir_iand_imm(b, nir_unpack_64_2x32_split_y(b, vbo_info->va), 0xFFFF);
1771    nir_def *stride = nir_iand_imm(b, vbo_info->stride, 0x3FFF);
1772    nir_def *new_vbo_data[4] = {nir_unpack_64_2x32_split_x(b, vbo_info->va),
1773                                nir_ior(b, nir_ishl_imm(b, stride, 16), va_hi), nir_load_var(b, num_records),
1774                                rsrc_word3};
1775    nir_store_var(b, desc, nir_vec(b, new_vbo_data, 4), 0xf);
1776 
1777    /* On GFX9, it seems bounds checking is disabled if both
1778     * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
1779     * GFX10.3 but it doesn't hurt.
1780     */
1781    nir_def *buf_va =
1782       nir_iand_imm(b, nir_pack_64_2x32(b, nir_trim_vector(b, nir_load_var(b, desc), 2)), (1ull << 48) - 1ull);
1783    nir_push_if(b, nir_ior(b, nir_ieq_imm(b, nir_load_var(b, num_records), 0), nir_ieq_imm(b, buf_va, 0)));
1784    {
1785       nir_def *has_dynamic_vs_input = nir_ieq_imm(b, load_param8(b, dynamic_vs_input), 1);
1786 
1787       new_vbo_data[0] = nir_imm_int(b, 0);
1788       new_vbo_data[1] = nir_bcsel(b, has_dynamic_vs_input, nir_imm_int(b, S_008F04_STRIDE(16)), nir_imm_int(b, 0));
1789       new_vbo_data[2] = nir_imm_int(b, 0);
1790       new_vbo_data[3] = nir_bcsel(b, has_dynamic_vs_input, nir_channel(b, nir_load_var(b, desc), 3), nir_imm_int(b, 0));
1791 
1792       nir_store_var(b, desc, nir_vec(b, new_vbo_data, 4), 0xf);
1793    }
1794    nir_pop_if(b, NULL);
1795 }
1796 
1797 static void
1798 dgc_emit_vertex_buffer(struct dgc_cmdbuf *cs, nir_def *stream_addr)
1799 {
1800    const struct radv_indirect_command_layout *layout = cs->layout;
1801    nir_builder *b = cs->b;
1802 
1803    nir_def *vb_desc_usage_mask = load_param32(b, vb_desc_usage_mask);
1804    nir_def *vbo_cnt = nir_bit_count(b, vb_desc_usage_mask);
1805 
1806    nir_push_if(b, nir_ine_imm(b, vbo_cnt, 0));
1807    {
1808       dgc_cs_begin(cs);
1809       dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 1, 0));
1810       dgc_cs_emit(load_param16(b, vbo_reg));
1811       dgc_cs_emit(nir_iadd(b, load_param32(b, upload_addr), nir_load_var(b, cs->upload_offset)));
1812       dgc_cs_end();
1813    }
1814    nir_pop_if(b, NULL);
1815 
1816    nir_variable *vbo_idx = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "vbo_idx");
1817    nir_store_var(b, vbo_idx, nir_imm_int(b, 0), 0x1);
1818 
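   /* Walk all 32 possible vertex buffer slots and write a descriptor for every bit set in
    * vb_desc_usage_mask.
    */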
1819    nir_push_loop(b);
1820    {
1821       nir_def *cur_idx = nir_load_var(b, vbo_idx);
1822 
1823       nir_break_if(b, nir_uge_imm(b, cur_idx, 32 /* bits in vb_desc_usage_mask */));
1824 
1825       nir_def *l = nir_ishl(b, nir_imm_int(b, 1), cur_idx);
1826       nir_push_if(b, nir_ieq_imm(b, nir_iand(b, l, vb_desc_usage_mask), 0));
1827       {
1828          nir_store_var(b, vbo_idx, nir_iadd_imm(b, cur_idx, 1), 0x1);
1829          nir_jump(b, nir_jump_continue);
1830       }
1831       nir_pop_if(b, NULL);
1832 
1833       nir_variable *va_var = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint64_t_type(), "va_var");
1834       nir_variable *size_var = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "size_var");
1835       nir_variable *stride_var = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "stride_var");
1836 
1837       nir_def *binding = load_vbo_metadata32(cs, cur_idx, binding);
1838 
1839       nir_def *vbo_override = nir_ine_imm(
1840          b, nir_iand(b, nir_imm_int(b, layout->vk.vertex_bindings), nir_ishl(b, nir_imm_int(b, 1), binding)), 0);
1841       nir_push_if(b, vbo_override);
1842       {
1843          nir_def *stream_offset = load_vbo_offset(cs, cur_idx);
1844          nir_def *stream_data = nir_build_load_global(b, 4, 32, nir_iadd(b, stream_addr, nir_u2u64(b, stream_offset)),
1845                                                       .access = ACCESS_NON_WRITEABLE);
1846 
1847          nir_def *va = nir_pack_64_2x32(b, nir_trim_vector(b, stream_data, 2));
1848          nir_def *size = nir_channel(b, stream_data, 2);
1849 
1850          nir_def *stride = nir_channel(b, stream_data, 3);
1851 
1852          nir_store_var(b, va_var, va, 0x1);
1853          nir_store_var(b, size_var, size, 0x1);
1854          nir_store_var(b, stride_var, stride, 0x1);
1855       }
1856       nir_push_else(b, NULL);
1857       {
1858          nir_store_var(b, va_var, load_vbo_metadata64(cs, cur_idx, va), 0x1);
1859          nir_store_var(b, size_var, load_vbo_metadata32(cs, cur_idx, size), 0x1);
1860          nir_store_var(b, stride_var, load_vbo_metadata32(cs, cur_idx, stride), 0x1);
1861       }
1862       nir_pop_if(b, NULL);
1863 
1864       nir_def *attrib_index_offset = load_vbo_metadata32(cs, cur_idx, attrib_index_offset);
1865       nir_def *non_trivial_format = load_vbo_metadata32(cs, cur_idx, non_trivial_format);
1866       nir_def *attrib_offset = load_vbo_metadata32(cs, cur_idx, attrib_offset);
1867       nir_def *attrib_format_size = load_vbo_metadata32(cs, cur_idx, attrib_format_size);
1868       nir_def *attrib_end = nir_iadd(b, attrib_offset, attrib_format_size);
1869 
1870       nir_def *has_dynamic_vs_input = nir_ieq_imm(b, load_param8(b, dynamic_vs_input), 1);
1871       nir_def *va = nir_iadd(b, nir_load_var(b, va_var),
1872                              nir_bcsel(b, has_dynamic_vs_input, nir_u2u64(b, attrib_offset), nir_imm_int64(b, 0)));
1873 
1874       struct dgc_vbo_info vbo_info = {
1875          .va = va,
1876          .size = nir_load_var(b, size_var),
1877          .stride = nir_load_var(b, stride_var),
1878          .attrib_end = attrib_end,
1879          .attrib_index_offset = attrib_index_offset,
1880          .non_trivial_format = non_trivial_format,
1881       };
1882 
1883       nir_variable *vbo_data = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uvec4_type(), "vbo_data");
1884 
1885       dgc_write_vertex_descriptor(cs, &vbo_info, vbo_data);
1886 
1887       dgc_upload(cs, nir_load_var(b, vbo_data));
1888 
1889       nir_store_var(b, vbo_idx, nir_iadd_imm(b, cur_idx, 1), 0x1);
1890    }
1891    nir_pop_loop(b, NULL);
1892 }
1893 
1894 /**
1895  * Compute dispatch
1896  */
1897 static nir_def *
1898 dgc_get_dispatch_initiator(struct dgc_cmdbuf *cs)
1899 {
1900    const struct radv_device *device = cs->dev;
1901    nir_builder *b = cs->b;
1902 
1903    const uint32_t dispatch_initiator = device->dispatch_initiator | S_00B800_FORCE_START_AT_000(1);
1904    nir_def *is_wave32 = nir_ieq_imm(b, load_shader_metadata32(cs, wave32), 1);
1905    return nir_bcsel(b, is_wave32, nir_imm_int(b, dispatch_initiator | S_00B800_CS_W32_EN(1)),
1906                     nir_imm_int(b, dispatch_initiator));
1907 }
1908 
1909 static void
1910 dgc_emit_grid_size_user_sgpr(struct dgc_cmdbuf *cs, nir_def *grid_base_sgpr, nir_def *wg_x, nir_def *wg_y,
1911                              nir_def *wg_z)
1912 {
1913    dgc_cs_begin(cs);
1914    dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 3, 0));
1915    dgc_cs_emit(grid_base_sgpr);
1916    dgc_cs_emit(wg_x);
1917    dgc_cs_emit(wg_y);
1918    dgc_cs_emit(wg_z);
1919    dgc_cs_end();
1920 }
1921 
1922 static void
1923 dgc_emit_grid_size_pointer(struct dgc_cmdbuf *cs, nir_def *grid_base_sgpr, nir_def *size_va)
1924 {
1925    nir_builder *b = cs->b;
1926 
1927    nir_def *va_lo = nir_unpack_64_2x32_split_x(b, size_va);
1928    nir_def *va_hi = nir_unpack_64_2x32_split_y(b, size_va);
1929 
1930    dgc_cs_begin(cs);
1931    dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 2, 0));
1932    dgc_cs_emit(grid_base_sgpr);
1933    dgc_cs_emit(va_lo);
1934    dgc_cs_emit(va_hi);
1935    dgc_cs_end();
1936 }
1937 
1938 static void
1939 dgc_emit_dispatch_direct(struct dgc_cmdbuf *cs, nir_def *wg_x, nir_def *wg_y, nir_def *wg_z,
1940                          nir_def *dispatch_initiator, nir_def *grid_sgpr, nir_def *size_va, nir_def *sequence_id,
1941                          bool is_rt)
1942 {
1943    const struct radv_device *device = cs->dev;
1944    nir_builder *b = cs->b;
1945 
1946    nir_push_if(b, nir_iand(b, nir_ine_imm(b, wg_x, 0), nir_iand(b, nir_ine_imm(b, wg_y, 0), nir_ine_imm(b, wg_z, 0))));
1947    {
1948       nir_push_if(b, nir_ine_imm(b, grid_sgpr, 0));
1949       {
1950          if (device->load_grid_size_from_user_sgpr) {
1951             dgc_emit_grid_size_user_sgpr(cs, grid_sgpr, wg_x, wg_y, wg_z);
1952          } else {
1953             dgc_emit_grid_size_pointer(cs, grid_sgpr, size_va);
1954          }
1955       }
1956       nir_pop_if(b, 0);
1957 
1958       dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDispatch);
1959       dgc_emit_sqtt_marker_event_with_dims(
1960          cs, sequence_id, wg_x, wg_y, wg_z,
1961          is_rt ? EventCmdTraceRaysKHR | ApiRayTracingSeparateCompiled : EventCmdDispatch);
1962 
1963       dgc_cs_begin(cs);
1964       dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1));
1965       dgc_cs_emit(wg_x);
1966       dgc_cs_emit(wg_y);
1967       dgc_cs_emit(wg_z);
1968       dgc_cs_emit(dispatch_initiator);
1969       dgc_cs_end();
1970 
1971       dgc_emit_sqtt_thread_trace_marker(cs);
1972       dgc_emit_sqtt_end_api_marker(cs, ApiCmdDispatch);
1973    }
1974    nir_pop_if(b, 0);
1975 }
1976 
1977 static void
1978 dgc_emit_dispatch(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id)
1979 {
1980    const struct radv_indirect_command_layout *layout = cs->layout;
1981    nir_builder *b = cs->b;
1982 
1983    nir_def *dispatch_data = nir_build_load_global(
1984       b, 3, 32, nir_iadd_imm(b, stream_addr, layout->vk.dispatch_src_offset_B), .access = ACCESS_NON_WRITEABLE);
1985    nir_def *wg_x = nir_channel(b, dispatch_data, 0);
1986    nir_def *wg_y = nir_channel(b, dispatch_data, 1);
1987    nir_def *wg_z = nir_channel(b, dispatch_data, 2);
1988 
1989    nir_def *grid_sgpr = load_shader_metadata32(cs, grid_base_sgpr);
1990    nir_def *dispatch_initiator = dgc_get_dispatch_initiator(cs);
1991    nir_def *size_va = nir_iadd_imm(b, stream_addr, layout->vk.dispatch_src_offset_B);
1992 
1993    dgc_emit_dispatch_direct(cs, wg_x, wg_y, wg_z, dispatch_initiator, grid_sgpr, size_va, sequence_id, false);
1994 }
1995 
1996 /**
1997  * Draw mesh/task
1998  */
1999 static void
2000 dgc_emit_userdata_mesh(struct dgc_cmdbuf *cs, nir_def *x, nir_def *y, nir_def *z, nir_def *drawid)
2001 {
2002    nir_builder *b = cs->b;
2003 
2004    nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
2005    vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr);
2006 
2007    nir_def *has_grid_size = nir_test_mask(b, vtx_base_sgpr, DGC_USES_GRID_SIZE);
2008    nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
2009 
2010    nir_push_if(b, nir_ior(b, has_grid_size, has_drawid));
2011    {
2012       nir_def *pkt_cnt = nir_imm_int(b, 0);
2013       pkt_cnt = nir_bcsel(b, has_grid_size, nir_iadd_imm(b, pkt_cnt, 3), pkt_cnt);
2014       pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
2015 
2016       dgc_cs_begin(cs);
2017       dgc_cs_emit(nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt));
2018       dgc_cs_emit(nir_iand_imm(b, vtx_base_sgpr, 0x3FFF));
2019       /* DrawID needs to be first if no GridSize. */
2020       dgc_cs_emit(nir_bcsel(b, has_grid_size, x, drawid));
2021       dgc_cs_emit(nir_bcsel(b, has_grid_size, y, nir_imm_int(b, PKT3_NOP_PAD)));
2022       dgc_cs_emit(nir_bcsel(b, has_grid_size, z, nir_imm_int(b, PKT3_NOP_PAD)));
2023       dgc_cs_emit(nir_bcsel(b, has_drawid, drawid, nir_imm_int(b, PKT3_NOP_PAD)));
2024       dgc_cs_end();
2025    }
2026    nir_pop_if(b, NULL);
2027 }
2028 
2029 static void
2030 dgc_emit_dispatch_mesh_direct(struct dgc_cmdbuf *cs, nir_def *x, nir_def *y, nir_def *z)
2031 {
2032    dgc_cs_begin(cs);
2033    dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_MESH_DIRECT, 3, 0));
2034    dgc_cs_emit(x);
2035    dgc_cs_emit(y);
2036    dgc_cs_emit(z);
2037    dgc_cs_emit_imm(S_0287F0_SOURCE_SELECT(V_0287F0_DI_SRC_SEL_AUTO_INDEX));
2038    dgc_cs_end();
2039 }
2040 
2041 static void
2042 dgc_emit_dispatch_taskmesh_gfx(struct dgc_cmdbuf *cs, nir_def *sequence_id)
2043 {
2044    const struct radv_device *device = cs->dev;
2045    const struct radv_physical_device *pdev = radv_device_physical(device);
2046    nir_builder *b = cs->b;
2047 
2048    nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
2049    nir_def *has_grid_size = nir_test_mask(b, vtx_base_sgpr, DGC_USES_GRID_SIZE);
2050    nir_def *has_linear_dispatch_en = nir_ieq_imm(b, load_param8(b, linear_dispatch_en), 1);
2051 
2052    nir_def *base_reg = nir_iand_imm(b, vtx_base_sgpr, 0x3FFF);
2053    nir_def *xyz_dim_reg = nir_bcsel(b, has_grid_size, base_reg, nir_imm_int(b, 0));
2054    nir_def *ring_entry_reg = load_param16(b, mesh_ring_entry_sgpr);
2055 
2056    nir_def *xyz_dim_enable = nir_bcsel(b, has_grid_size, nir_imm_int(b, S_4D1_XYZ_DIM_ENABLE(1)), nir_imm_int(b, 0));
2057    nir_def *mode1_enable = nir_imm_int(b, S_4D1_MODE1_ENABLE(!pdev->mesh_fast_launch_2));
2058    nir_def *linear_dispatch_en =
2059       nir_bcsel(b, has_linear_dispatch_en, nir_imm_int(b, S_4D1_LINEAR_DISPATCH_ENABLE(1)), nir_imm_int(b, 0));
2060    nir_def *sqtt_enable = nir_imm_int(b, device->sqtt.bo ? S_4D1_THREAD_TRACE_MARKER_ENABLE(1) : 0);
2061 
2062    dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDrawMeshTasksEXT);
2063    dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDrawMeshTasksEXT);
2064 
2065    dgc_cs_begin(cs);
2066    dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_TASKMESH_GFX, 2, 0) | PKT3_RESET_FILTER_CAM_S(1));
2067    /* S_4D0_RING_ENTRY_REG(ring_entry_reg) | S_4D0_XYZ_DIM_REG(xyz_dim_reg) */
2068    dgc_cs_emit(nir_ior(b, xyz_dim_reg, nir_ishl_imm(b, ring_entry_reg, 16)));
2069    if (pdev->info.gfx_level >= GFX11) {
2070       dgc_cs_emit(nir_ior(b, xyz_dim_enable, nir_ior(b, mode1_enable, nir_ior(b, linear_dispatch_en, sqtt_enable))));
2071    } else {
2072       dgc_cs_emit(sqtt_enable);
2073    }
2074    dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_AUTO_INDEX);
2075    dgc_cs_end();
2076 
2077    dgc_emit_sqtt_thread_trace_marker(cs);
2078    dgc_emit_sqtt_end_api_marker(cs, ApiCmdDrawMeshTasksEXT);
2079 }
2080 
2081 static void
2082 dgc_emit_draw_mesh_tasks_gfx(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id)
2083 {
2084    const struct radv_indirect_command_layout *layout = cs->layout;
2085    const struct radv_device *device = cs->dev;
2086    const struct radv_physical_device *pdev = radv_device_physical(device);
2087    nir_builder *b = cs->b;
2088 
2089    nir_def *draw_data = nir_build_load_global(b, 3, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
2090                                               .access = ACCESS_NON_WRITEABLE);
2091    nir_def *x = nir_channel(b, draw_data, 0);
2092    nir_def *y = nir_channel(b, draw_data, 1);
2093    nir_def *z = nir_channel(b, draw_data, 2);
2094 
2095    nir_push_if(b, nir_iand(b, nir_ine_imm(b, x, 0), nir_iand(b, nir_ine_imm(b, y, 0), nir_ine_imm(b, z, 0))));
2096    {
2097       nir_push_if(b, nir_ieq_imm(b, load_param8(b, has_task_shader), 1));
2098       {
2099          dgc_emit_dispatch_taskmesh_gfx(cs, sequence_id);
2100       }
2101       nir_push_else(b, NULL);
2102       {
2103          dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDrawMeshTasksEXT);
2104          dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDrawMeshTasksEXT);
2105 
2106          dgc_emit_userdata_mesh(cs, x, y, z, sequence_id);
2107          dgc_emit_instance_count(cs, nir_imm_int(b, 1));
2108 
2109          if (pdev->mesh_fast_launch_2) {
2110             dgc_emit_dispatch_mesh_direct(cs, x, y, z);
2111          } else {
2112             nir_def *vertex_count = nir_imul(b, x, nir_imul(b, y, z));
2113             dgc_emit_draw_index_auto(cs, vertex_count);
2114          }
2115 
2116          dgc_emit_sqtt_thread_trace_marker(cs);
2117          dgc_emit_sqtt_end_api_marker(cs, ApiCmdDrawMeshTasksEXT);
2118       }
2119       nir_pop_if(b, NULL);
2120    }
2121    nir_pop_if(b, NULL);
2122 }
2123 
2124 static void
2125 dgc_emit_draw_mesh_tasks_with_count_gfx(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id)
2126 {
2127    const struct radv_indirect_command_layout *layout = cs->layout;
2128    const struct radv_device *device = cs->dev;
2129    const struct radv_physical_device *pdev = radv_device_physical(device);
2130    nir_builder *b = cs->b;
2131 
2132    nir_push_if(b, nir_ieq_imm(b, load_param8(b, has_task_shader), 1));
2133    {
2134       dgc_emit_dispatch_taskmesh_gfx(cs, sequence_id);
2135    }
2136    nir_push_else(b, NULL);
2137    {
2138       nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
2139       nir_def *has_grid_size = nir_test_mask(b, vtx_base_sgpr, DGC_USES_GRID_SIZE);
2140       nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
2141 
2142       nir_def *draw_data = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
2143                                                  .access = ACCESS_NON_WRITEABLE);
2144       nir_def *va = nir_pack_64_2x32(b, nir_channels(b, draw_data, 0x3));
2145       nir_def *stride = nir_channel(b, draw_data, 2);
2146       nir_def *draw_count = nir_umin(b, load_param32(b, max_draw_count), nir_channel(b, draw_data, 3));
2147 
2148       dgc_emit_pkt3_set_base(cs, va);
2149 
2150       nir_def *base_reg = nir_iand_imm(b, vtx_base_sgpr, 0x3FFF);
2151       nir_def *xyz_dim_reg = nir_bcsel(b, has_grid_size, base_reg, nir_imm_int(b, 0));
2152       nir_def *draw_id_offset = nir_bcsel(b, has_grid_size, nir_imm_int(b, 3), nir_imm_int(b, 0));
2153       nir_def *draw_id_reg = nir_bcsel(b, has_drawid, nir_iadd(b, base_reg, draw_id_offset), nir_imm_int(b, 0));
2154 
2155       nir_push_if(b, has_drawid);
2156       {
2157          nir_def *packet[3] = {nir_imm_int(b, PKT3(PKT3_SET_SH_REG, 1, 0)), draw_id_reg, nir_imm_int(b, 0)};
2158          dgc_emit(cs, 3, packet);
2159       }
2160       nir_pop_if(b, NULL);
2161 
2162       nir_def *draw_index_enable =
2163          nir_bcsel(b, has_drawid, nir_imm_int(b, S_4C2_DRAW_INDEX_ENABLE(1)), nir_imm_int(b, 0));
2164       nir_def *xyz_dim_enable = nir_bcsel(b, has_grid_size, nir_imm_int(b, S_4C2_XYZ_DIM_ENABLE(1)), nir_imm_int(b, 0));
2165 
2166       dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDrawMeshTasksIndirectCountEXT);
2167       dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDrawMeshTasksIndirectCountEXT);
2168 
2169       dgc_cs_begin(cs);
2170       dgc_cs_emit(nir_imm_int(b, PKT3(PKT3_DISPATCH_MESH_INDIRECT_MULTI, 7, false) | PKT3_RESET_FILTER_CAM_S(1)));
2171       dgc_cs_emit_imm(0); /* data offset */
2172       /* S_4C1_XYZ_DIM_REG(xyz_dim_reg) | S_4C1_DRAW_INDEX_REG(draw_id_reg) */
2173       dgc_cs_emit(
2174          nir_ior(b, nir_iand_imm(b, xyz_dim_reg, 0xFFFF), nir_ishl_imm(b, nir_iand_imm(b, draw_id_reg, 0xFFFF), 16)));
2175       if (pdev->info.gfx_level >= GFX11) {
2176          dgc_cs_emit(nir_ior_imm(b, nir_ior(b, draw_index_enable, xyz_dim_enable),
2177                                  S_4C2_MODE1_ENABLE(!pdev->mesh_fast_launch_2)));
2178       } else {
2179          dgc_cs_emit(draw_index_enable);
2180       }
2181       dgc_cs_emit(draw_count);
2182       dgc_cs_emit_imm(0);
2183       dgc_cs_emit_imm(0);
2184       dgc_cs_emit(stride);
2185       dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_AUTO_INDEX);
2186       dgc_cs_end();
2187 
2188       dgc_emit_sqtt_thread_trace_marker(cs);
2189       dgc_emit_sqtt_end_api_marker(cs, ApiCmdDrawMeshTasksIndirectCountEXT);
2190    }
2191    nir_pop_if(b, NULL);
2192 }
2193 
2194 static void
2195 dgc_emit_userdata_task(struct dgc_cmdbuf *ace_cs, nir_def *x, nir_def *y, nir_def *z)
2196 {
2197    nir_builder *b = ace_cs->b;
2198 
2199    nir_def *xyz_sgpr = load_param16(b, task_xyz_sgpr);
2200    nir_push_if(b, nir_ine_imm(b, xyz_sgpr, 0));
2201    {
2202       dgc_cs_begin(ace_cs);
2203       dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 3, 0));
2204       dgc_cs_emit(xyz_sgpr);
2205       dgc_cs_emit(x);
2206       dgc_cs_emit(y);
2207       dgc_cs_emit(z);
2208       dgc_cs_end();
2209    }
2210    nir_pop_if(b, NULL);
2211 
2212    nir_def *draw_id_sgpr = load_param16(b, task_draw_id_sgpr);
2213    nir_push_if(b, nir_ine_imm(b, draw_id_sgpr, 0));
2214    {
2215       dgc_cs_begin(ace_cs);
2216       dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 1, 0));
2217       dgc_cs_emit(draw_id_sgpr);
2218       dgc_cs_emit_imm(0);
2219       dgc_cs_end();
2220    }
2221    nir_pop_if(b, NULL);
2222 }
2223 
2224 static nir_def *
2225 dgc_get_dispatch_initiator_task(struct dgc_cmdbuf *ace_cs)
2226 {
2227    const struct radv_device *device = ace_cs->dev;
2228    const uint32_t dispatch_initiator_task = device->dispatch_initiator_task;
2229    nir_builder *b = ace_cs->b;
2230 
2231    nir_def *is_wave32 = nir_ieq_imm(b, load_param8(b, wave32), 1);
2232    return nir_bcsel(b, is_wave32, nir_imm_int(b, dispatch_initiator_task | S_00B800_CS_W32_EN(1)),
2233                     nir_imm_int(b, dispatch_initiator_task));
2234 }
2235 
2236 static void
2237 dgc_emit_dispatch_taskmesh_direct_ace(struct dgc_cmdbuf *ace_cs, nir_def *x, nir_def *y, nir_def *z)
2238 {
2239    nir_def *dispatch_initiator = dgc_get_dispatch_initiator_task(ace_cs);
2240    nir_builder *b = ace_cs->b;
2241 
2242    dgc_cs_begin(ace_cs);
2243    dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_TASKMESH_DIRECT_ACE, 4, 0) | PKT3_SHADER_TYPE_S(1));
2244    dgc_cs_emit(x);
2245    dgc_cs_emit(y);
2246    dgc_cs_emit(z);
2247    dgc_cs_emit(dispatch_initiator);
2248    dgc_cs_emit(load_param16(b, task_ring_entry_sgpr));
2249    dgc_cs_end();
2250 }
2251 
2252 static void
2253 dgc_emit_draw_mesh_tasks_ace(struct dgc_cmdbuf *ace_cs, nir_def *stream_addr)
2254 {
2255    const struct radv_indirect_command_layout *layout = ace_cs->layout;
2256    nir_builder *b = ace_cs->b;
2257 
2258    nir_def *draw_data = nir_build_load_global(b, 3, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
2259                                               .access = ACCESS_NON_WRITEABLE);
2260    nir_def *x = nir_channel(b, draw_data, 0);
2261    nir_def *y = nir_channel(b, draw_data, 1);
2262    nir_def *z = nir_channel(b, draw_data, 2);
2263 
2264    nir_push_if(b, nir_iand(b, nir_ine_imm(b, x, 0), nir_iand(b, nir_ine_imm(b, y, 0), nir_ine_imm(b, z, 0))));
2265    {
2266       dgc_emit_userdata_task(ace_cs, x, y, z);
2267       dgc_emit_dispatch_taskmesh_direct_ace(ace_cs, x, y, z);
2268    }
2269    nir_pop_if(b, NULL);
2270 }
2271 
2272 static void
2273 dgc_emit_draw_mesh_tasks_with_count_ace(struct dgc_cmdbuf *ace_cs, nir_def *stream_addr, nir_def *sequence_id)
2274 {
2275    const struct radv_indirect_command_layout *layout = ace_cs->layout;
2276    nir_builder *b = ace_cs->b;
2277 
2278    nir_def *draw_data = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
2279                                               .access = ACCESS_NON_WRITEABLE);
2280    nir_def *va_lo = nir_channel(b, draw_data, 0);
2281    nir_def *va_hi = nir_channel(b, draw_data, 1);
2282    nir_def *stride = nir_channel(b, draw_data, 2);
2283    nir_def *draw_count = nir_umin(b, load_param32(b, max_draw_count), nir_channel(b, draw_data, 3));
2284 
2285    nir_def *xyz_dim_reg = load_param16(b, task_xyz_sgpr);
2286    nir_def *ring_entry_reg = load_param16(b, task_ring_entry_sgpr);
2287    nir_def *draw_id_reg = load_param16(b, task_draw_id_sgpr);
2288 
2289    nir_def *draw_index_enable =
2290       nir_bcsel(b, nir_ine_imm(b, draw_id_reg, 0), nir_imm_int(b, S_AD3_DRAW_INDEX_ENABLE(1)), nir_imm_int(b, 0));
2291    nir_def *xyz_dim_enable =
2292       nir_bcsel(b, nir_ine_imm(b, xyz_dim_reg, 0), nir_imm_int(b, S_AD3_XYZ_DIM_ENABLE(1)), nir_imm_int(b, 0));
2293 
2294    nir_def *dispatch_initiator = dgc_get_dispatch_initiator_task(ace_cs);
2295 
2296    dgc_cs_begin(ace_cs);
2297    dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_TASKMESH_INDIRECT_MULTI_ACE, 9, 0) | PKT3_SHADER_TYPE_S(1));
2298    dgc_cs_emit(va_lo);
2299    dgc_cs_emit(va_hi);
2300    dgc_cs_emit(ring_entry_reg);
2301    dgc_cs_emit(nir_ior(b, draw_index_enable, nir_ior(b, xyz_dim_enable, nir_ishl_imm(b, draw_id_reg, 16))));
2302    dgc_cs_emit(xyz_dim_reg);
2303    dgc_cs_emit(draw_count);
2304    dgc_cs_emit_imm(0);
2305    dgc_cs_emit_imm(0);
2306    dgc_cs_emit(stride);
2307    dgc_cs_emit(dispatch_initiator);
2308    dgc_cs_end();
2309 }
2310 
2311 /**
2312  * Indirect execution set
2313  */
2314 static void
2315 dgc_emit_indirect_sets(struct dgc_cmdbuf *cs)
2316 {
2317    nir_builder *b = cs->b;
2318 
2319    nir_def *indirect_desc_sets_sgpr = load_shader_metadata32(cs, indirect_desc_sets_sgpr);
2320    nir_push_if(b, nir_ine_imm(b, indirect_desc_sets_sgpr, 0));
2321    {
2322       dgc_cs_begin(cs);
2323       dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 1, 0));
2324       dgc_cs_emit(indirect_desc_sets_sgpr);
2325       dgc_cs_emit(load_param32(b, indirect_desc_sets_va));
2326       dgc_cs_end();
2327    }
2328    nir_pop_if(b, NULL);
2329 }
2330 
2331 static void
2332 dgc_emit_ies(struct dgc_cmdbuf *cs)
2333 {
2334    nir_builder *b = cs->b;
2335 
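   /* An IES entry stores the compute pipeline metadata, followed by a dword count and the
    * pipeline's pre-built packets; copy those packets verbatim into the DGC command buffer.
    */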
2336    nir_def *va = nir_iadd_imm(b, cs->ies_va, sizeof(struct radv_compute_pipeline_metadata));
2337    nir_def *num_dw = nir_build_load_global(b, 1, 32, va, .access = ACCESS_NON_WRITEABLE);
2338    nir_def *cs_va = nir_iadd_imm(b, va, 4);
2339 
2340    nir_variable *offset = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "offset");
2341    nir_store_var(b, offset, nir_imm_int(b, 0), 0x1);
2342 
2343    nir_push_loop(b);
2344    {
2345       nir_def *cur_offset = nir_load_var(b, offset);
2346 
2347       nir_break_if(b, nir_uge(b, cur_offset, num_dw));
2348 
2349       nir_def *data = nir_build_load_global(b, 1, 32, nir_iadd(b, cs_va, nir_u2u64(b, nir_imul_imm(b, cur_offset, 4))),
2350                                             .access = ACCESS_NON_WRITEABLE);
2351 
2352       dgc_cs_begin(cs);
2353       dgc_cs_emit(data);
2354       dgc_cs_end();
2355 
2356       nir_store_var(b, offset, nir_iadd_imm(b, cur_offset, 1), 0x1);
2357    }
2358    nir_pop_loop(b, NULL);
2359 
2360    dgc_emit_indirect_sets(cs);
2361 }
2362 
2363 /**
2364  * Raytracing.
2365  */
2366 static void
2367 dgc_emit_shader_pointer(struct dgc_cmdbuf *cs, nir_def *sh_offset, nir_def *va)
2368 {
2369    nir_builder *b = cs->b;
2370 
2371    nir_def *va_lo = nir_unpack_64_2x32_split_x(b, va);
2372    nir_def *va_hi = nir_unpack_64_2x32_split_y(b, va);
2373 
2374    dgc_cs_begin(cs);
2375    dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 2, 0));
2376    dgc_cs_emit(sh_offset);
2377    dgc_cs_emit(va_lo);
2378    dgc_cs_emit(va_hi);
2379    dgc_cs_end();
2380 }
2381 
2382 static void
2383 dgc_emit_rt(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id)
2384 {
2385    const struct radv_indirect_command_layout *layout = cs->layout;
2386    const struct radv_device *device = cs->dev;
2387    nir_builder *b = cs->b;
2388 
2389    nir_def *indirect_va = nir_iadd_imm(b, stream_addr, layout->vk.dispatch_src_offset_B);
2390 
2391    nir_def *cs_sbt_descriptors = load_param16(b, cs_sbt_descriptors);
2392    nir_push_if(b, nir_ine_imm(b, cs_sbt_descriptors, 0));
2393    {
2394       dgc_emit_shader_pointer(cs, cs_sbt_descriptors, indirect_va);
2395    }
2396    nir_pop_if(b, NULL);
2397 
2398    nir_def *launch_size_va = nir_iadd_imm(b, indirect_va, offsetof(VkTraceRaysIndirectCommand2KHR, width));
2399 
2400    nir_def *cs_ray_launch_size_addr = load_param16(b, cs_ray_launch_size_addr);
2401    nir_push_if(b, nir_ine_imm(b, cs_ray_launch_size_addr, 0));
2402    {
2403       dgc_emit_shader_pointer(cs, cs_ray_launch_size_addr, launch_size_va);
2404    }
2405    nir_pop_if(b, NULL);
2406 
2407    const uint32_t dispatch_initiator = device->dispatch_initiator | S_00B800_USE_THREAD_DIMENSIONS(1);
2408    nir_def *is_wave32 = nir_ieq_imm(b, load_param8(b, wave32), 1);
2409    nir_def *dispatch_initiator_rt = nir_bcsel(b, is_wave32, nir_imm_int(b, dispatch_initiator | S_00B800_CS_W32_EN(1)),
2410                                               nir_imm_int(b, dispatch_initiator));
2411 
2412    nir_def *dispatch_data = nir_build_load_global(b, 3, 32, launch_size_va, .access = ACCESS_NON_WRITEABLE);
2413    nir_def *width = nir_channel(b, dispatch_data, 0);
2414    nir_def *height = nir_channel(b, dispatch_data, 1);
2415    nir_def *depth = nir_channel(b, dispatch_data, 2);
2416 
2417    nir_def *grid_sgpr = load_param16(b, grid_base_sgpr);
2418 
2419    dgc_emit_dispatch_direct(cs, width, height, depth, dispatch_initiator_rt, grid_sgpr, launch_size_va, sequence_id,
2420                             true);
2421 }
2422 
2423 static nir_def *
2424 dgc_is_cond_render_enabled(nir_builder *b)
2425 {
2426    nir_def *res1, *res2;
2427 
2428    nir_push_if(b, nir_ieq_imm(b, load_param8(b, predicating), 1));
2429    {
2430       nir_def *val = nir_load_global(b, load_param64(b, predication_va), 4, 1, 32);
2431       /* By default, all rendering commands are discarded if the 32-bit value is zero. If the
2432        * inverted flag is set, they are discarded if the value is non-zero.
2433        */
2434       res1 = nir_ixor(b, nir_i2b(b, load_param8(b, predication_type)), nir_ine_imm(b, val, 0));
2435    }
2436    nir_push_else(b, 0);
2437    {
2438       res2 = nir_imm_bool(b, false);
2439    }
2440    nir_pop_if(b, 0);
2441 
2442    return nir_if_phi(b, res1, res2);
2443 }
2444 
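/* Fill the remaining space up to cmd_buf_end with a single NOP packet. The PKT3 count field is the
 * remaining size in dwords minus two (one for the packet header, one for the PM4 type-3 count
 * bias).
 */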
2445 static void
2446 dgc_pad_cmdbuf(struct dgc_cmdbuf *cs, nir_def *cmd_buf_end)
2447 {
2448    nir_builder *b = cs->b;
2449 
2450    nir_push_if(b, nir_ine(b, nir_load_var(b, cs->offset), cmd_buf_end));
2451    {
2452       nir_def *cnt = nir_isub(b, cmd_buf_end, nir_load_var(b, cs->offset));
2453       cnt = nir_ushr_imm(b, cnt, 2);
2454       cnt = nir_iadd_imm(b, cnt, -2);
2455       nir_def *pkt = nir_pkt3(b, PKT3_NOP, cnt);
2456 
2457       dgc_cs_begin(cs);
2458       dgc_cs_emit(pkt);
2459       dgc_cs_end();
2460    }
2461    nir_pop_if(b, NULL);
2462 }
2463 
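/* Build the DGC preparation compute shader. Each invocation handles one sequence: it reads the
 * application-provided indirect command stream at stream_addr and writes the corresponding PM4
 * commands into a fixed-stride slot of the preprocess buffer, padding any unused space with NOPs.
 * When a task shader is used, a second (ACE) command stream is generated the same way.
 */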
2464 static nir_shader *
2465 build_dgc_prepare_shader(struct radv_device *dev, struct radv_indirect_command_layout *layout)
2466 {
2467    const struct radv_physical_device *pdev = radv_device_physical(dev);
2468    nir_builder b = radv_meta_init_shader(dev, MESA_SHADER_COMPUTE, "meta_dgc_prepare");
2469    b.shader->info.workgroup_size[0] = 64;
2470 
2471    nir_def *global_id = get_global_ids(&b, 1);
2472 
2473    nir_def *sequence_id = global_id;
2474 
2475    nir_def *cmd_buf_stride = load_param32(&b, cmd_buf_stride);
2476    nir_def *cmd_buf_base_offset = load_param32(&b, cmd_buf_main_offset);
2477 
2478    nir_def *sequence_count = load_param32(&b, sequence_count);
2479    nir_def *sequence_count_addr = load_param64(&b, sequence_count_addr);
2480 
2481    /* The effective number of draws is
2482     * min(sequencesCount, sequencesCountBuffer[sequencesCountOffset]) when
2483     * using sequencesCountBuffer. Otherwise it is sequencesCount. */
2484    nir_variable *count_var = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "sequence_count");
2485    nir_store_var(&b, count_var, sequence_count, 0x1);
2486 
2487    nir_push_if(&b, nir_ine_imm(&b, sequence_count_addr, 0));
2488    {
2489       nir_def *cnt =
2490          nir_build_load_global(&b, 1, 32, load_param64(&b, sequence_count_addr), .access = ACCESS_NON_WRITEABLE);
2491 
2492       /* Must clamp the count against the API count explicitly.
2493        * The workgroup potentially contains more threads than the maxSequencesCount from the API,
2494        * and we have to ensure these threads write NOP packets to pad out the IB. */
2495       cnt = nir_umin(&b, cnt, sequence_count);
2496       nir_store_var(&b, count_var, cnt, 0x1);
2497    }
2498    nir_pop_if(&b, NULL);
2499 
2500    nir_push_if(&b, dgc_is_cond_render_enabled(&b));
2501    {
2502       /* Reset the number of sequences when conditional rendering is enabled in order to skip the
2503        * entire shader and pad the cmdbuf with NOPs.
2504        */
2505       nir_store_var(&b, count_var, nir_imm_int(&b, 0), 0x1);
2506    }
2507    nir_pop_if(&b, NULL);
2508 
2509    sequence_count = nir_load_var(&b, count_var);
2510 
2511    build_dgc_buffer_trailer_main(&b, dev);
2512 
2513    nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count));
2514    {
2515       struct dgc_cmdbuf cmd_buf = {
2516          .b = &b,
2517          .dev = dev,
2518          .va = nir_pack_64_2x32_split(&b, load_param32(&b, upload_addr), nir_imm_int(&b, pdev->info.address32_hi)),
2519          .offset = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "cmd_buf_offset"),
2520          .upload_offset = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "upload_offset"),
2521          .layout = layout,
2522       };
2523       nir_store_var(&b, cmd_buf.offset, nir_iadd(&b, nir_imul(&b, global_id, cmd_buf_stride), cmd_buf_base_offset), 1);
2524       nir_def *cmd_buf_end = nir_iadd(&b, nir_load_var(&b, cmd_buf.offset), cmd_buf_stride);
2525 
2526       nir_def *stream_addr = load_param64(&b, stream_addr);
2527       stream_addr = nir_iadd(&b, stream_addr, nir_u2u64(&b, nir_imul_imm(&b, sequence_id, layout->vk.stride)));
2528 
2529       nir_def *upload_offset_init =
2530          nir_iadd(&b, load_param32(&b, upload_main_offset), nir_imul(&b, load_param32(&b, upload_stride), sequence_id));
2531       nir_store_var(&b, cmd_buf.upload_offset, upload_offset_init, 0x1);
2532 
2533       if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES))
2534          cmd_buf.ies_va = dgc_load_ies_va(&cmd_buf, stream_addr);
2535 
2536       if (layout->push_constant_mask) {
2537          const VkShaderStageFlags stages =
2538             (layout->vk.dgc_info & (BITFIELD_BIT(MESA_VK_DGC_RT) | BITFIELD_BIT(MESA_VK_DGC_DISPATCH)))
2539                ? VK_SHADER_STAGE_COMPUTE_BIT
2540                : (VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_MESH_BIT_EXT);
2541 
2542          dgc_emit_push_constant(&cmd_buf, stream_addr, sequence_id, stages);
2543       }
2544 
2545       if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) {
2546          /* Raytracing */
2547          dgc_emit_rt(&cmd_buf, stream_addr, sequence_id);
2548       } else if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
2549          /* Compute */
2550          if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) {
2551             dgc_emit_ies(&cmd_buf);
2552          }
2553 
2554          dgc_emit_dispatch(&cmd_buf, stream_addr, sequence_id);
2555       } else {
2556          /* Graphics */
2557          if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) {
2558             dgc_emit_vertex_buffer(&cmd_buf, stream_addr);
2559          }
2560 
2561          if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_INDEXED)) {
2562             if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IB)) {
2563                nir_variable *max_index_count_var =
2564                   nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "max_index_count");
2565 
2566                dgc_emit_index_buffer(&cmd_buf, stream_addr, max_index_count_var);
2567 
2568                nir_def *max_index_count = nir_load_var(&b, max_index_count_var);
2569 
2570                if (layout->vk.draw_count) {
2571                   dgc_emit_draw_with_count(&cmd_buf, stream_addr, sequence_id, true);
2572                } else {
2573                   dgc_emit_draw_indexed(&cmd_buf, stream_addr, sequence_id, max_index_count);
2574                }
2575             } else {
2576                if (layout->vk.draw_count) {
2577                   dgc_emit_draw_with_count(&cmd_buf, stream_addr, sequence_id, true);
2578                } else {
2579                   dgc_emit_draw_indirect(&cmd_buf, stream_addr, sequence_id, true);
2580                }
2581             }
2582          } else {
2583             /* Non-indexed draws */
2584             if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) {
2585                if (layout->vk.draw_count) {
2586                   dgc_emit_draw_mesh_tasks_with_count_gfx(&cmd_buf, stream_addr, sequence_id);
2587                } else {
2588                   dgc_emit_draw_mesh_tasks_gfx(&cmd_buf, stream_addr, sequence_id);
2589                }
2590             } else {
2591                if (layout->vk.draw_count) {
2592                   dgc_emit_draw_with_count(&cmd_buf, stream_addr, sequence_id, false);
2593                } else {
2594                   dgc_emit_draw(&cmd_buf, stream_addr, sequence_id);
2595                }
2596             }
2597          }
2598       }
2599 
2600       /* Pad the cmdbuffer if we did not use the whole stride */
2601       dgc_pad_cmdbuf(&cmd_buf, cmd_buf_end);
2602    }
2603    nir_pop_if(&b, NULL);
2604 
2605    build_dgc_buffer_tail_main(&b, sequence_count, dev);
2606    build_dgc_buffer_preamble_main(&b, sequence_count, dev);
2607 
2608    /* Prepare the ACE command stream */
2609    nir_push_if(&b, nir_ieq_imm(&b, load_param8(&b, has_task_shader), 1));
2610    {
2611       nir_def *ace_cmd_buf_stride = load_param32(&b, ace_cmd_buf_stride);
2612       nir_def *ace_cmd_buf_base_offset = load_param32(&b, ace_cmd_buf_main_offset);
2613 
2614       build_dgc_buffer_trailer_ace(&b, dev);
2615 
2616       nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count));
2617       {
2618          struct dgc_cmdbuf cmd_buf = {
2619             .b = &b,
2620             .dev = dev,
2621             .va = nir_pack_64_2x32_split(&b, load_param32(&b, upload_addr), nir_imm_int(&b, pdev->info.address32_hi)),
2622             .offset = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "cmd_buf_offset"),
2623             .upload_offset = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "upload_offset"),
2624             .layout = layout,
2625          };
2626          nir_store_var(&b, cmd_buf.offset,
2627                        nir_iadd(&b, nir_imul(&b, global_id, ace_cmd_buf_stride), ace_cmd_buf_base_offset), 1);
2628          nir_def *cmd_buf_end = nir_iadd(&b, nir_load_var(&b, cmd_buf.offset), ace_cmd_buf_stride);
2629 
2630          nir_def *stream_addr = load_param64(&b, stream_addr);
2631          stream_addr = nir_iadd(&b, stream_addr, nir_u2u64(&b, nir_imul_imm(&b, sequence_id, layout->vk.stride)));
2632 
2633          nir_def *upload_offset_init = nir_iadd(&b, load_param32(&b, upload_main_offset),
2634                                                 nir_imul(&b, load_param32(&b, upload_stride), sequence_id));
2635          nir_store_var(&b, cmd_buf.upload_offset, upload_offset_init, 0x1);
2636 
2637          if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES))
2638             cmd_buf.ies_va = dgc_load_ies_va(&cmd_buf, stream_addr);
2639 
2640          if (layout->push_constant_mask) {
2641             nir_def *push_constant_stages = dgc_get_push_constant_stages(&cmd_buf);
2642 
2643             nir_push_if(&b, nir_test_mask(&b, push_constant_stages, VK_SHADER_STAGE_TASK_BIT_EXT));
2644             {
2645                const struct dgc_pc_params params = dgc_get_pc_params(&cmd_buf);
2646                dgc_emit_push_constant_for_stage(&cmd_buf, stream_addr, sequence_id, &params, MESA_SHADER_TASK);
2647             }
2648             nir_pop_if(&b, NULL);
2649          }
2650 
2651          if (layout->vk.draw_count) {
2652             dgc_emit_draw_mesh_tasks_with_count_ace(&cmd_buf, stream_addr, sequence_id);
2653          } else {
2654             dgc_emit_draw_mesh_tasks_ace(&cmd_buf, stream_addr);
2655          }
2656 
2657          /* Pad the cmdbuffer if we did not use the whole stride */
2658          dgc_pad_cmdbuf(&cmd_buf, cmd_buf_end);
2659       }
2660       nir_pop_if(&b, NULL);
2661 
2662       build_dgc_buffer_tail_ace(&b, sequence_count, dev);
2663       build_dgc_buffer_preamble_ace(&b, sequence_count, dev);
2664    }
2665    nir_pop_if(&b, NULL);
2666 
2667    return b.shader;
2668 }
2669 
2670 static VkResult
2671 radv_create_dgc_pipeline(struct radv_device *device, struct radv_indirect_command_layout *layout)
2672 {
2673    enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_DGC;
2674    VkResult result;
2675 
2676    const VkDescriptorSetLayoutBinding binding = {
2677       .binding = 0,
2678       .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
2679       .descriptorCount = 1,
2680       .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
2681    };
2682 
2683    const VkDescriptorSetLayoutCreateInfo desc_info = {
2684       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
2685       .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT,
2686       .bindingCount = 1,
2687       .pBindings = &binding,
2688    };
2689 
2690    const VkPushConstantRange pc_range = {
2691       .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
2692       .size = sizeof(struct radv_dgc_params),
2693    };
2694 
2695    result = vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, &key,
2696                                         sizeof(key), &layout->pipeline_layout);
2697    if (result != VK_SUCCESS)
2698       return result;
2699 
2700    nir_shader *cs = build_dgc_prepare_shader(device, layout);
2701 
2702    const VkPipelineShaderStageCreateInfo stage_info = {
2703       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
2704       .stage = VK_SHADER_STAGE_COMPUTE_BIT,
2705       .module = vk_shader_module_handle_from_nir(cs),
2706       .pName = "main",
2707       .pSpecializationInfo = NULL,
2708    };
2709 
2710    const VkComputePipelineCreateInfo pipeline_info = {
2711       .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
2712       .stage = stage_info,
2713       .flags = 0,
2714       .layout = layout->pipeline_layout,
2715    };
2716 
2717    /* DGC pipelines don't go through the vk_meta cache because that would require computing a
2718     * separate key, but they are cached on-disk when possible.
2719     */
2720    result = radv_CreateComputePipelines(vk_device_to_handle(&device->vk), device->meta_state.device.pipeline_cache, 1,
2721                                         &pipeline_info, NULL, &layout->pipeline);
2722 
2723    ralloc_free(cs);
2724    return result;
2725 }
2726 
2727 VKAPI_ATTR void VKAPI_CALL
2728 radv_GetGeneratedCommandsMemoryRequirementsEXT(VkDevice _device,
2729                                                const VkGeneratedCommandsMemoryRequirementsInfoEXT *pInfo,
2730                                                VkMemoryRequirements2 *pMemoryRequirements)
2731 {
2732    VK_FROM_HANDLE(radv_device, device, _device);
2733    const struct radv_physical_device *pdev = radv_device_physical(device);
2734    VK_FROM_HANDLE(radv_indirect_command_layout, layout, pInfo->indirectCommandsLayout);
2735    struct dgc_cmdbuf_layout cmdbuf_layout;
2736 
2737    get_dgc_cmdbuf_layout(device, layout, pInfo->pNext, pInfo->maxSequenceCount, true, &cmdbuf_layout);
2738 
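   /* The preprocess buffer is addressed with a 32-bit offset in the prepare shader (the high half
    * comes from address32_hi), so it must be allocated from 32-bit addressable memory types.
    */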
2739    pMemoryRequirements->memoryRequirements.memoryTypeBits = pdev->memory_types_32bit;
2740    pMemoryRequirements->memoryRequirements.alignment = radv_dgc_get_buffer_alignment(device);
2741    pMemoryRequirements->memoryRequirements.size =
2742       align(cmdbuf_layout.alloc_size, pMemoryRequirements->memoryRequirements.alignment);
2743 }
2744 
2745 bool
2746 radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
2747 {
2748    const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
2749       vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
2750    const VkGeneratedCommandsShaderInfoEXT *eso_info =
2751       vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
2752 
2753    /* Enable conditional rendering (if not enabled by user) to skip prepare/execute DGC calls when
2754     * the indirect sequence count might be zero. This can only be enabled on GFX because on ACE it's
2755     * not possible to skip the execute DGC call (ie. no INDIRECT_PACKET). It should also be disabled
2756     * when the graphics pipeline has a task shader for the same reason (otherwise the DGC ACE IB
2757     * would be uninitialized).
2758     */
2759    return cmd_buffer->qf == RADV_QUEUE_GENERAL && !radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK) &&
2760           pGeneratedCommandsInfo->sequenceCountAddress != 0 && !cmd_buffer->state.predicating;
2761 }
2762 
2763 VKAPI_ATTR void VKAPI_CALL
2764 radv_CmdPreprocessGeneratedCommandsEXT(VkCommandBuffer commandBuffer,
2765                                        const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo,
2766                                        VkCommandBuffer stateCommandBuffer)
2767 {
2768    VK_FROM_HANDLE(radv_cmd_buffer, state_cmd_buffer, stateCommandBuffer);
2769    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2770    VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
2771 
2772    assert(layout->vk.usage & VK_INDIRECT_COMMANDS_LAYOUT_USAGE_EXPLICIT_PREPROCESS_BIT_EXT);
2773 
2774    /* VK_EXT_conditional_rendering says that copy commands should not be
2775     * affected by conditional rendering.
2776     */
2777    const bool old_predicating = cmd_buffer->state.predicating;
2778    cmd_buffer->state.predicating = false;
2779 
2780    radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, state_cmd_buffer, old_predicating);
2781 
2782    /* Restore conditional rendering. */
2783    cmd_buffer->state.predicating = old_predicating;
2784 }
2785 
2786 static void
2787 radv_prepare_dgc_compute(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo,
2788                          struct radv_cmd_buffer *state_cmd_buffer, unsigned *upload_size, unsigned *upload_offset,
2789                          void **upload_data, struct radv_dgc_params *params, bool cond_render_enabled)
2790 
2791 {
2792    VK_FROM_HANDLE(radv_indirect_execution_set, ies, pGeneratedCommandsInfo->indirectExecutionSet);
2793    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
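   /* Without an indirect execution set, the compute pipeline metadata of the bound pipeline is
    * uploaded inline so the prepare shader can emit the dispatch state directly.
    */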
2794    const uint32_t alloc_size = ies ? 0 : sizeof(struct radv_compute_pipeline_metadata);
2795 
2796    *upload_size = MAX2(*upload_size + alloc_size, 16);
2797 
2798    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, *upload_size, upload_offset, upload_data)) {
2799       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
2800       return;
2801    }
2802 
2803    if (cond_render_enabled) {
2804       params->predicating = true;
2805       params->predication_va = cmd_buffer->state.predication_va;
2806       params->predication_type = cmd_buffer->state.predication_type;
2807    }
2808 
2809    if (ies) {
2810       struct radv_descriptor_state *descriptors_state =
2811          radv_get_descriptors_state(state_cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
2812 
2813       radv_upload_indirect_descriptor_sets(cmd_buffer, descriptors_state);
2814 
2815       params->ies_stride = ies->stride;
2816       params->indirect_desc_sets_va = descriptors_state->indirect_descriptor_sets_va;
2817    } else {
2818       const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
2819          vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
2820       const VkGeneratedCommandsShaderInfoEXT *eso_info =
2821          vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
2822       const struct radv_shader *cs = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_COMPUTE);
2823       struct radv_compute_pipeline_metadata *metadata = (struct radv_compute_pipeline_metadata *)(*upload_data);
2824 
2825       radv_get_compute_shader_metadata(device, cs, metadata);
2826 
2827       *upload_data = (char *)*upload_data + alloc_size;
2828    }
2829 }
2830 
2831 static void
2832 radv_prepare_dgc_rt(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo,
2833                     unsigned *upload_size, unsigned *upload_offset, void **upload_data, struct radv_dgc_params *params)
2834 {
2835    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, *upload_size, upload_offset, upload_data)) {
2836       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
2837       return;
2838    }
2839 
2840    const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
2841       vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
2842    VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline);
2843    const struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
2844    const struct radv_shader *rt_prolog = rt_pipeline->prolog;
2845 
2846    params->wave32 = rt_prolog->info.wave_size == 32;
2847    params->grid_base_sgpr = radv_get_user_sgpr(rt_prolog, AC_UD_CS_GRID_SIZE);
2848    params->cs_sbt_descriptors = radv_get_user_sgpr(rt_prolog, AC_UD_CS_SBT_DESCRIPTORS);
2849    params->cs_ray_launch_size_addr = radv_get_user_sgpr(rt_prolog, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR);
2850 }
2851 
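/* Return the source offset of the given vertex binding in the indirect command stream, or -1 (as
 * an unsigned value) when the binding isn't captured by the layout.
 */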
2852 static uint32_t
2853 get_dgc_vertex_binding_offset(const struct radv_indirect_command_layout *layout, uint32_t binding)
2854 {
2855    for (uint32_t i = 0; i < layout->vk.n_vb_layouts; i++) {
2856       if (layout->vk.vb_layouts[i].binding == binding)
2857          return layout->vk.vb_layouts[i].src_offset_B;
2858    }
2859 
2860    return -1;
2861 }
2862 
2863 static void
2864 radv_prepare_dgc_graphics(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo,
2865                           struct radv_cmd_buffer *state_cmd_buffer, unsigned *upload_size, unsigned *upload_offset,
2866                           void **upload_data, struct radv_dgc_params *params)
2867 {
2868    VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
2869 
2870    const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
2871       vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
2872    const VkGeneratedCommandsShaderInfoEXT *eso_info =
2873       vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
2874 
2875    const gl_shader_stage first_stage =
2876       (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) ? MESA_SHADER_MESH : MESA_SHADER_VERTEX;
2877    struct radv_shader *first_shader = radv_dgc_get_shader(pipeline_info, eso_info, first_stage);
2878 
2879    unsigned vb_size = (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) ? MAX_VBS * DGC_VBO_INFO_SIZE : 0;
2880 
2881    *upload_size = MAX2(*upload_size + vb_size, 16);
2882 
2883    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, *upload_size, upload_offset, upload_data)) {
2884       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
2885       return;
2886    }
2887 
2888    uint16_t vtx_base_sgpr = radv_get_user_sgpr(first_shader, AC_UD_VS_BASE_VERTEX_START_INSTANCE);
2889    const bool uses_drawid = first_shader->info.vs.needs_draw_id;
2890 
2891    if (uses_drawid)
2892       vtx_base_sgpr |= DGC_USES_DRAWID;
2893 
2894    if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) {
2895       if (first_shader->info.cs.uses_grid_size)
2896          vtx_base_sgpr |= DGC_USES_GRID_SIZE;
2897 
2898       const struct radv_shader *task_shader = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK);
2899       if (task_shader) {
2900          params->has_task_shader = 1;
2901          params->mesh_ring_entry_sgpr = radv_get_user_sgpr(first_shader, AC_UD_TASK_RING_ENTRY);
2902          params->linear_dispatch_en = task_shader->info.cs.linear_taskmesh_dispatch;
2903          params->task_ring_entry_sgpr = radv_get_user_sgpr(task_shader, AC_UD_TASK_RING_ENTRY);
2904          params->wave32 = task_shader->info.wave_size == 32;
2905          params->task_xyz_sgpr = radv_get_user_sgpr(task_shader, AC_UD_CS_GRID_SIZE);
2906          params->task_draw_id_sgpr = radv_get_user_sgpr(task_shader, AC_UD_CS_TASK_DRAW_ID);
2907       }
2908    } else {
2909       const bool uses_baseinstance = first_shader->info.vs.needs_base_instance;
2910 
2911       if (uses_baseinstance)
2912          vtx_base_sgpr |= DGC_USES_BASEINSTANCE;
2913    }
2914 
2915    params->vtx_base_sgpr = vtx_base_sgpr;
2916    params->max_index_count = state_cmd_buffer->state.max_index_count;
2917    params->max_draw_count = pGeneratedCommandsInfo->maxDrawCount;
2918    params->dynamic_vs_input =
2919       (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) && first_shader->info.vs.dynamic_inputs;
2920    params->use_per_attribute_vb_descs =
2921       (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) && first_shader->info.vs.use_per_attribute_vb_descs;
2922 
2923    if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) {
2924       uint8_t *ptr = (uint8_t *)((char *)*upload_data);
2925 
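      /* Upload one (radv_vbo_info, DGC binding offset) pair per vertex binding (DGC_VBO_INFO_SIZE
       * bytes each); the prepare shader uses these to rebuild the vertex buffer descriptors.
       */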
2926       for (uint32_t i = 0; i < MAX_VBS; i++) {
2927          struct radv_vbo_info vbo_info;
2928          radv_get_vbo_info(state_cmd_buffer, i, &vbo_info);
2929 
2930          const uint32_t vbo_offset = get_dgc_vertex_binding_offset(layout, vbo_info.binding);
2931 
2932          memcpy(ptr, &vbo_info, sizeof(vbo_info));
2933          ptr += sizeof(struct radv_vbo_info);
2934 
2935          memcpy(ptr, &vbo_offset, sizeof(uint32_t));
2936          ptr += sizeof(uint32_t);
2937       }
2938       params->vb_desc_usage_mask = first_shader->info.vs.vb_desc_usage_mask;
2939       params->vbo_reg = radv_get_user_sgpr(first_shader, AC_UD_VS_VERTEX_BUFFERS);
2940 
2941       *upload_data = (char *)*upload_data + vb_size;
2942    }
2943 }
2944 
2945 void
2946 radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo,
2947                  struct radv_cmd_buffer *state_cmd_buffer, bool cond_render_enabled)
2948 {
2949    VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
2950    VK_FROM_HANDLE(radv_indirect_execution_set, ies, pGeneratedCommandsInfo->indirectExecutionSet);
2951    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2952    const struct radv_physical_device *pdev = radv_device_physical(device);
2953    struct radv_meta_saved_state saved_state;
2954    unsigned upload_offset, upload_size = 0;
2955    struct radv_buffer token_buffer;
2956    void *upload_data;
2957 
2958    const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
2959       vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
2960    const VkGeneratedCommandsShaderInfoEXT *eso_info =
2961       vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
2962 
2963    const bool use_preamble = radv_dgc_use_preamble(pGeneratedCommandsInfo);
2964    const uint32_t sequences_count = pGeneratedCommandsInfo->maxSequenceCount;
2965 
2966    struct dgc_cmdbuf_layout cmdbuf_layout;
2967    get_dgc_cmdbuf_layout(device, layout, pGeneratedCommandsInfo->pNext, sequences_count, use_preamble, &cmdbuf_layout);
2968 
2969    assert((cmdbuf_layout.main_offset + pGeneratedCommandsInfo->preprocessAddress) %
2970              pdev->info.ip[AMD_IP_GFX].ib_alignment ==
2971           0);
2972    assert((cmdbuf_layout.ace_main_offset + pGeneratedCommandsInfo->preprocessAddress) %
2973              pdev->info.ip[AMD_IP_COMPUTE].ib_alignment ==
2974           0);
2975 
2976    struct radv_dgc_params params = {
2977       .cmd_buf_preamble_offset = cmdbuf_layout.main_preamble_offset,
2978       .cmd_buf_main_offset = cmdbuf_layout.main_offset,
2979       .cmd_buf_stride = cmdbuf_layout.main_cmd_stride,
2980       .cmd_buf_size = cmdbuf_layout.main_size,
2981       .ace_cmd_buf_trailer_offset = cmdbuf_layout.ace_trailer_offset,
2982       .ace_cmd_buf_preamble_offset = cmdbuf_layout.ace_preamble_offset,
2983       .ace_cmd_buf_main_offset = cmdbuf_layout.ace_main_offset,
2984       .ace_cmd_buf_stride = cmdbuf_layout.ace_cmd_stride,
2985       .ace_cmd_buf_size = cmdbuf_layout.ace_size,
2986       .upload_main_offset = cmdbuf_layout.upload_offset,
2987       .upload_addr = (uint32_t)pGeneratedCommandsInfo->preprocessAddress,
2988       .upload_stride = cmdbuf_layout.upload_stride,
2989       .sequence_count = sequences_count,
2990       .use_preamble = use_preamble,
2991       .stream_addr = pGeneratedCommandsInfo->indirectAddress,
2992       .sequence_count_addr = pGeneratedCommandsInfo->sequenceCountAddress,
2993       .ies_addr = ies ? ies->va : 0,
2994       .queue_family = state_cmd_buffer->qf,
2995    };
2996 
2997    VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, layout->vk.layout);
2998 
2999    if (layout->vk.dgc_info & (BITFIELD_BIT(MESA_VK_DGC_PC) | BITFIELD_BIT(MESA_VK_DGC_SI))) {
3000       upload_size = pipeline_layout->push_constant_size + MESA_VULKAN_SHADER_STAGES * 12;
3001    }
3002 
3003    if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
3004       radv_prepare_dgc_compute(cmd_buffer, pGeneratedCommandsInfo, state_cmd_buffer, &upload_size, &upload_offset,
3005                                &upload_data, &params, cond_render_enabled);
3006    } else if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) {
3007       radv_prepare_dgc_rt(cmd_buffer, pGeneratedCommandsInfo, &upload_size, &upload_offset, &upload_data, &params);
3008    } else {
3009       radv_prepare_dgc_graphics(cmd_buffer, pGeneratedCommandsInfo, state_cmd_buffer, &upload_size, &upload_offset,
3010                                 &upload_data, &params);
3011    }
3012 
3013    if (layout->push_constant_mask) {
3014       VkShaderStageFlags pc_stages = 0;
3015       uint32_t *desc = upload_data;
3016       upload_data = (char *)upload_data + MESA_VULKAN_SHADER_STAGES * 12;
3017 
3018       struct radv_shader *shaders[MESA_VULKAN_SHADER_STAGES] = {0};
3019       if (pipeline_info) {
3020          VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline);
3021 
3022          if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) {
3023             const struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
3024             struct radv_shader *rt_prolog = rt_pipeline->prolog;
3025 
3026             shaders[MESA_SHADER_COMPUTE] = rt_prolog;
3027          } else {
3028             memcpy(shaders, pipeline->shaders, sizeof(shaders));
3029          }
3030       } else if (eso_info) {
3031          for (unsigned i = 0; i < eso_info->shaderCount; ++i) {
3032             VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]);
3033             struct radv_shader *shader = shader_object->shader;
3034             gl_shader_stage stage = shader->info.stage;
3035 
3036             shaders[stage] = shader;
3037          }
3038       }
3039 
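      /* Each stage gets 12 bytes of metadata: a dword packing the push constant upload SGPR (low
       * 16 bits) and the inline push constant SGPR (high 16 bits), followed by the 64-bit inline
       * push constant mask.
       */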
3040       for (unsigned i = 0; i < ARRAY_SIZE(shaders); i++) {
3041          const struct radv_shader *shader = shaders[i];
3042 
3043          if (!shader)
3044             continue;
3045 
3046          const struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs;
3047          if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
3048             params.const_copy = 1;
3049          }
3050 
3051          if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0 ||
3052              locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
3053             unsigned upload_sgpr = 0;
3054             unsigned inline_sgpr = 0;
3055 
3056             if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
3057                upload_sgpr = (shader->info.user_data_0 + 4 * locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx -
3058                               SI_SH_REG_OFFSET) >>
3059                              2;
3060             }
3061 
3062             if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
3063                inline_sgpr = (shader->info.user_data_0 + 4 * locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx -
3064                               SI_SH_REG_OFFSET) >>
3065                              2;
3066                desc[i * 3 + 1] = shader->info.inline_push_constant_mask;
3067                desc[i * 3 + 2] = shader->info.inline_push_constant_mask >> 32;
3068             }
3069             desc[i * 3] = upload_sgpr | (inline_sgpr << 16);
3070 
3071             pc_stages |= mesa_to_vk_shader_stage(i);
3072          }
3073       }
3074 
3075       params.push_constant_stages = pc_stages;
3076 
3077       memcpy(upload_data, state_cmd_buffer->push_constants, pipeline_layout->push_constant_size);
3078       upload_data = (char *)upload_data + pipeline_layout->push_constant_size;
3079    }
3080 
3081    radv_buffer_init(&token_buffer, device, cmd_buffer->upload.upload_bo, upload_size, upload_offset);
3082 
3083    radv_meta_save(&saved_state, cmd_buffer,
3084                   RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS);
3085 
3086    radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, layout->pipeline);
3087 
3088    vk_common_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), layout->pipeline_layout,
3089                               VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(params), &params);
3090 
3091    radv_meta_push_descriptor_set(
3092       cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, layout->pipeline_layout, 0, 1,
3093       (VkWriteDescriptorSet[]){{.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
3094                                 .dstBinding = 0,
3095                                 .dstArrayElement = 0,
3096                                 .descriptorCount = 1,
3097                                 .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
3098                                 .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&token_buffer),
3099                                                                          .offset = 0,
3100                                                                          .range = upload_size}}});
3101 
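   /* One invocation per sequence; the prepare shader uses a workgroup size of 64. */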
3102    unsigned block_count = MAX2(1, DIV_ROUND_UP(pGeneratedCommandsInfo->maxSequenceCount, 64));
3103    vk_common_CmdDispatch(radv_cmd_buffer_to_handle(cmd_buffer), block_count, 1, 1);
3104 
3105    radv_buffer_finish(&token_buffer);
3106    radv_meta_restore(&saved_state, cmd_buffer);
3107 }
3108 
3109 static void
3110 radv_destroy_indirect_commands_layout(struct radv_device *device, const VkAllocationCallbacks *pAllocator,
3111                                       struct radv_indirect_command_layout *layout)
3112 {
3113    radv_DestroyPipeline(radv_device_to_handle(device), layout->pipeline, &device->meta_state.alloc);
3114 
3115    vk_indirect_command_layout_destroy(&device->vk, pAllocator, &layout->vk);
3116 }
3117 
3118 VKAPI_ATTR VkResult VKAPI_CALL
3119 radv_CreateIndirectCommandsLayoutEXT(VkDevice _device, const VkIndirectCommandsLayoutCreateInfoEXT *pCreateInfo,
3120                                      const VkAllocationCallbacks *pAllocator,
3121                                      VkIndirectCommandsLayoutEXT *pIndirectCommandsLayout)
3122 {
3123    VK_FROM_HANDLE(radv_device, device, _device);
3124    struct radv_indirect_command_layout *layout;
3125    VkResult result;
3126 
3127    layout = vk_indirect_command_layout_create(&device->vk, pCreateInfo, pAllocator, sizeof(*layout));
3128    if (!layout)
3129       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
3130 
3131    for (uint32_t i = 0; i < layout->vk.n_pc_layouts; i++) {
3132       for (uint32_t j = layout->vk.pc_layouts[i].dst_offset_B / 4, k = 0; k < layout->vk.pc_layouts[i].size_B / 4;
3133            j++, k++) {
3134          layout->push_constant_mask |= 1ull << j;
3135          layout->push_constant_offsets[j] = layout->vk.pc_layouts[i].src_offset_B + k * 4;
3136       }
3137    }
3138 
3139    if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_SI)) {
3140       layout->sequence_index_mask = 1ull << (layout->vk.si_layout.dst_offset_B / 4);
3141       layout->push_constant_mask |= layout->sequence_index_mask;
3142    }
3143 
3144    result = radv_create_dgc_pipeline(device, layout);
3145    if (result != VK_SUCCESS) {
3146       radv_destroy_indirect_commands_layout(device, pAllocator, layout);
3147       return result;
3148    }
3149 
3150    *pIndirectCommandsLayout = radv_indirect_command_layout_to_handle(layout);
3151    return VK_SUCCESS;
3152 }
3153 
3154 VKAPI_ATTR void VKAPI_CALL
3155 radv_DestroyIndirectCommandsLayoutEXT(VkDevice _device, VkIndirectCommandsLayoutEXT indirectCommandsLayout,
3156                                       const VkAllocationCallbacks *pAllocator)
3157 {
3158    VK_FROM_HANDLE(radv_device, device, _device);
3159    VK_FROM_HANDLE(radv_indirect_command_layout, layout, indirectCommandsLayout);
3160 
3161    if (!layout)
3162       return;
3163 
3164    radv_destroy_indirect_commands_layout(device, pAllocator, layout);
3165 }
3166 
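/* Write one IES entry for the given compute shader: the compute pipeline metadata, the number of
 * PM4 dwords, and the PM4 packets that bind the shader (generated with radv_emit_compute_shader).
 */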
3167 static void
3168 radv_update_ies_shader(struct radv_device *device, struct radv_indirect_execution_set *set, uint32_t index,
3169                        struct radv_shader *shader)
3170 {
3171    const struct radv_physical_device *pdev = radv_device_physical(device);
3172    uint8_t *ptr = set->mapped_ptr + set->stride * index;
3173    struct radv_compute_pipeline_metadata md;
3174    struct radeon_cmdbuf *cs;
3175 
3176    assert(shader->info.stage == MESA_SHADER_COMPUTE);
3177    radv_get_compute_shader_metadata(device, shader, &md);
3178 
3179    cs = calloc(1, sizeof(*cs));
3180    if (!cs)
3181       return;
3182 
3183    cs->reserved_dw = cs->max_dw = 32;
3184    cs->buf = malloc(cs->max_dw * 4);
3185    if (!cs->buf) {
3186       free(cs);
3187       return;
3188    }
3189 
3190    radv_emit_compute_shader(pdev, cs, shader);
3191 
3192    memcpy(ptr, &md, sizeof(md));
3193    ptr += sizeof(md);
3194 
3195    memcpy(ptr, &cs->cdw, sizeof(uint32_t));
3196    ptr += sizeof(uint32_t);
3197 
3198    memcpy(ptr, cs->buf, cs->cdw * sizeof(uint32_t));
3199    ptr += cs->cdw * sizeof(uint32_t);
3200 
3201    set->compute_scratch_size_per_wave = MAX2(set->compute_scratch_size_per_wave, shader->config.scratch_bytes_per_wave);
3202    set->compute_scratch_waves = MAX2(set->compute_scratch_waves, radv_get_max_scratch_waves(device, shader));
3203 
3204    free(cs->buf);
3205    free(cs);
3206 }
3207 
3208 static void
3209 radv_update_ies_pipeline(struct radv_device *device, struct radv_indirect_execution_set *set, uint32_t index,
3210                          const struct radv_pipeline *pipeline)
3211 {
3212    assert(pipeline->type == RADV_PIPELINE_COMPUTE);
3213    radv_update_ies_shader(device, set, index, pipeline->shaders[MESA_SHADER_COMPUTE]);
3214 }
3215 
3216 static void
3217 radv_destroy_indirect_execution_set(struct radv_device *device, const VkAllocationCallbacks *pAllocator,
3218                                     struct radv_indirect_execution_set *set)
3219 {
3220    if (set->bo)
3221       radv_bo_destroy(device, &set->base, set->bo);
3222 
3223    vk_object_base_finish(&set->base);
3224    vk_free2(&device->vk.alloc, pAllocator, set);
3225 }
3226 
3227 VKAPI_ATTR VkResult VKAPI_CALL
3228 radv_CreateIndirectExecutionSetEXT(VkDevice _device, const VkIndirectExecutionSetCreateInfoEXT *pCreateInfo,
3229                                    const VkAllocationCallbacks *pAllocator,
3230                                    VkIndirectExecutionSetEXT *pIndirectExecutionSet)
3231 {
3232    VK_FROM_HANDLE(radv_device, device, _device);
3233    const struct radv_physical_device *pdev = radv_device_physical(device);
3234    struct radv_indirect_execution_set *set;
3235    uint32_t num_entries;
3236    uint32_t stride;
3237    VkResult result;
3238 
3239    set = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*set), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
3240    if (!set)
3241       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
3242 
3243    vk_object_base_init(&device->vk, &set->base, VK_OBJECT_TYPE_INDIRECT_EXECUTION_SET_EXT);
3244 
3245    switch (pCreateInfo->type) {
3246    case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT: {
3247       const VkIndirectExecutionSetPipelineInfoEXT *pipeline_info = pCreateInfo->info.pPipelineInfo;
3248       VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->initialPipeline);
3249 
3250       assert(pipeline->type == RADV_PIPELINE_COMPUTE);
3251       num_entries = pipeline_info->maxPipelineCount;
3252       break;
3253    }
3254    case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT: {
3255       const VkIndirectExecutionSetShaderInfoEXT *shaders_info = pCreateInfo->info.pShaderInfo;
3256       VK_FROM_HANDLE(radv_shader_object, shader_object, shaders_info->pInitialShaders[0]);
3257 
3258       assert(shader_object->stage == MESA_SHADER_COMPUTE);
3259       num_entries = shaders_info->maxShaderCount;
3260       break;
3261    }
3262    default:
3263       unreachable("Invalid IES type");
3264    }
3265 
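   /* Per-entry stride: the pipeline metadata, a dword holding the PM4 size, and worst-case space
    * for the register writes emitted by radv_emit_compute_shader.
    */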
3266    stride = sizeof(struct radv_compute_pipeline_metadata);
3267    stride += 4 /* num CS DW */;
3268    stride += (pdev->info.gfx_level >= GFX10 ? 19 : 16) * 4;
3269 
3270    result = radv_bo_create(device, &set->base, num_entries * stride, 8, RADEON_DOMAIN_VRAM,
3271                            RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_READ_ONLY, RADV_BO_PRIORITY_DESCRIPTOR, 0,
3272                            false, &set->bo);
3273    if (result != VK_SUCCESS) {
3274       radv_destroy_indirect_execution_set(device, pAllocator, set);
3275       return vk_error(device, result);
3276    }
3277 
3278    set->mapped_ptr = (uint8_t *)radv_buffer_map(device->ws, set->bo);
3279    if (!set->mapped_ptr) {
3280       radv_destroy_indirect_execution_set(device, pAllocator, set);
3281       return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
3282    }
3283 
3284    set->va = radv_buffer_get_va(set->bo);
3285    set->stride = stride;
3286 
3287    /* The driver is supposed to always populate slot 0 with the initial pipeline/shader. */
3288    switch (pCreateInfo->type) {
3289    case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT: {
3290       const VkIndirectExecutionSetPipelineInfoEXT *pipeline_info = pCreateInfo->info.pPipelineInfo;
3291       VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->initialPipeline);
3292 
3293       radv_update_ies_pipeline(device, set, 0, pipeline);
3294       break;
3295    }
3296    case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT: {
3297       const VkIndirectExecutionSetShaderInfoEXT *shaders_info = pCreateInfo->info.pShaderInfo;
3298       VK_FROM_HANDLE(radv_shader_object, shader_object, shaders_info->pInitialShaders[0]);
3299 
3300       radv_update_ies_shader(device, set, 0, shader_object->shader);
3301       break;
3302    }
3303    default:
3304       unreachable("Invalid IES type");
3305    }
3306 
3307    *pIndirectExecutionSet = radv_indirect_execution_set_to_handle(set);
3308    return VK_SUCCESS;
3309 }
3310 
3311 VKAPI_ATTR void VKAPI_CALL
3312 radv_DestroyIndirectExecutionSetEXT(VkDevice _device, VkIndirectExecutionSetEXT indirectExecutionSet,
3313                                     const VkAllocationCallbacks *pAllocator)
3314 {
3315    VK_FROM_HANDLE(radv_device, device, _device);
3316    VK_FROM_HANDLE(radv_indirect_execution_set, set, indirectExecutionSet);
3317 
3318    if (!set)
3319       return;
3320 
3321    radv_destroy_indirect_execution_set(device, pAllocator, set);
3322 }
3323 
3324 VKAPI_ATTR void VKAPI_CALL
3325 radv_UpdateIndirectExecutionSetPipelineEXT(VkDevice _device, VkIndirectExecutionSetEXT indirectExecutionSet,
3326                                            uint32_t executionSetWriteCount,
3327                                            const VkWriteIndirectExecutionSetPipelineEXT *pExecutionSetWrites)
3328 {
3329    VK_FROM_HANDLE(radv_indirect_execution_set, set, indirectExecutionSet);
3330    VK_FROM_HANDLE(radv_device, device, _device);
3331 
3332    for (uint32_t i = 0; i < executionSetWriteCount; i++) {
3333       const VkWriteIndirectExecutionSetPipelineEXT *writeset = &pExecutionSetWrites[i];
3334       VK_FROM_HANDLE(radv_pipeline, pipeline, writeset->pipeline);
3335 
3336       radv_update_ies_pipeline(device, set, writeset->index, pipeline);
3337    }
3338 }
3339 
3340 VKAPI_ATTR void VKAPI_CALL
3341 radv_UpdateIndirectExecutionSetShaderEXT(VkDevice _device, VkIndirectExecutionSetEXT indirectExecutionSet,
3342                                          uint32_t executionSetWriteCount,
3343                                          const VkWriteIndirectExecutionSetShaderEXT *pExecutionSetWrites)
3344 {
3345    VK_FROM_HANDLE(radv_indirect_execution_set, set, indirectExecutionSet);
3346    VK_FROM_HANDLE(radv_device, device, _device);
3347 
3348    for (uint32_t i = 0; i < executionSetWriteCount; i++) {
3349       const VkWriteIndirectExecutionSetShaderEXT *writeset = &pExecutionSetWrites[i];
3350       VK_FROM_HANDLE(radv_shader_object, shader_object, writeset->shader);
3351 
3352       radv_update_ies_shader(device, set, writeset->index, shader_object->shader);
3353    }
3354 }
3355