/*
 * Copyright © 2024 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "radv_dgc.h"
#include "meta/radv_meta.h"
#include "radv_entrypoints.h"
#include "radv_pipeline_rt.h"

#include "ac_rgp.h"

#include "nir_builder.h"

#include "vk_common_entrypoints.h"
#include "vk_device_generated_commands.h"
#include "vk_shader_module.h"

#define PKT3_INDIRECT_BUFFER_BYTES 16
#define DGC_VBO_INFO_SIZE (sizeof(struct radv_vbo_info) + 4 /* vbo_offsets */)
22
/* The DGC command buffer layout is quite complex, here are some explanations:
 *
 * Without the DGC preamble, the default layout looks like:
 *
 * +---------+----------+---------+-----------------+
 * | trailer | commands | padding | jump to trailer |
 * +---------+----------+---------+-----------------+
 *
 * The trailer is used to implement IB chaining for the compute queue because IB2 isn't supported.
 * The trailer is patched at execute time to chain back the DGC command buffer. The trailer is
 * placed at the beginning to make sure its offset is fixed (i.e. with a preamble it wouldn't be
 * possible to know the offset upfront). In practice the execution looks like:
 *
 * +----------+---------+-----------------+    +---------+    +-----------------------+
 * | commands | padding | jump to trailer | -> | trailer | -> | postamble (normal CS) |
 * +----------+---------+-----------------+    +---------+    +-----------------------+
 *
 * When DGC uses a preamble (to optimize large but mostly empty indirect sequence counts by
 * removing a ton of padding), the trailer is still used but the layout looks like:
 *
 * +---------+---------+-----------------+    +----------+---------+-----------------+
 * | trailer | padding | INDIRECT_BUFFER | -> | commands | padding | jump to trailer |
 * +---------+---------+-----------------+    +----------+---------+-----------------+
 *
 * When DGC uses task shaders, the command buffer is split into two parts (GFX/COMPUTE) and the
 * default layout looks like:
 *
 * +--------------+---------+--------------+---------+
 * | GFX commands | padding | ACE commands | padding |
 * +--------------+---------+--------------+---------+
 *
 * The execution of this DGC command buffer differs between the GFX and COMPUTE queues:
 * - on GFX, the driver uses the IB2 packet, which is the easiest solution
 * - on COMPUTE, IB2 isn't supported and the driver chains the DGC command buffer by patching the
 *   trailer
 */
59
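/* The DGC buffer can be executed on either the GFX or the COMPUTE queue, so it must satisfy the
 * stricter of the two IB alignments.
 */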
60 uint32_t
radv_dgc_get_buffer_alignment(const struct radv_device *device)
62 {
63 const struct radv_physical_device *pdev = radv_device_physical(device);
64
65 return MAX2(pdev->info.ip[AMD_IP_GFX].ib_alignment, pdev->info.ip[AMD_IP_COMPUTE].ib_alignment);
66 }
67
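/* Pad a command buffer size (in bytes) to the IB padding granularity of the given IP type. */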
68 static uint32_t
radv_pad_cmdbuf(const struct radv_device *device, uint32_t size, enum amd_ip_type ip_type)
70 {
71 const struct radv_physical_device *pdev = radv_device_physical(device);
72 const uint32_t ib_alignment = (pdev->info.ip[ip_type].ib_pad_dw_mask + 1) * 4;
73
74 return align(size, ib_alignment);
75 }
76
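/* Align an offset to the IB start alignment of the given IP type. */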
77 static uint32_t
radv_align_cmdbuf(const struct radv_device *device, uint32_t size, enum amd_ip_type ip_type)
79 {
80 const struct radv_physical_device *pdev = radv_device_physical(device);
81 const uint32_t ib_alignment = pdev->info.ip[ip_type].ib_alignment;
82
83 return align(size, ib_alignment);
84 }
85
86 static unsigned
radv_dgc_preamble_cmdbuf_size(const struct radv_device *device, enum amd_ip_type ip_type)
88 {
89 return radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type);
90 }
91
92 static unsigned
radv_dgc_trailer_cmdbuf_size(const struct radv_device *device, enum amd_ip_type ip_type)
94 {
95 return radv_pad_cmdbuf(device, PKT3_INDIRECT_BUFFER_BYTES, ip_type);
96 }
97
98 static bool
radv_dgc_use_preamble(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
100 {
101 /* Heuristic on when the overhead for the preamble (i.e. double jump) is worth it. Obviously
102 * a bit of a guess as it depends on the actual count which we don't know. */
103 return pGeneratedCommandsInfo->sequenceCountAddress != 0 && pGeneratedCommandsInfo->maxSequenceCount >= 64;
104 }
105
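/* Return the shader bound for the given stage, either from a pipeline or from shader objects
 * (ESO). For ESO, the merged-stage variant (LS/ES) is selected based on the other stages present.
 */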
106 struct radv_shader *
radv_dgc_get_shader(const VkGeneratedCommandsPipelineInfoEXT *pipeline_info,
                    const VkGeneratedCommandsShaderInfoEXT *eso_info, gl_shader_stage stage)
109 {
110 if (pipeline_info) {
111 VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline);
112 return radv_get_shader(pipeline->shaders, stage);
113 } else if (eso_info) {
114 VkShaderStageFlags stages = 0;
115
116 for (uint32_t i = 0; i < eso_info->shaderCount; i++) {
117 VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]);
118 stages |= mesa_to_vk_shader_stage(shader_object->stage);
119 }
120
121 for (uint32_t i = 0; i < eso_info->shaderCount; i++) {
122 VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]);
123
124 if (shader_object->stage != stage)
125 continue;
126
127 if (stage == MESA_SHADER_VERTEX && (stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT)) {
128 return shader_object->as_ls.shader;
129 } else if ((stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_TESS_EVAL) &&
130 (stages & VK_SHADER_STAGE_GEOMETRY_BIT)) {
131 return shader_object->as_es.shader;
132 } else {
133 return shader_object->shader;
134 }
135 }
136 }
137
138 return NULL;
139 }
140
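/* Worst-case command/upload size of one DGC compute dispatch sequence. Without a known compute
 * shader (IES), assume it needs the grid size and indirect descriptor sets.
 */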
141 static void
radv_get_sequence_size_compute(const struct radv_indirect_command_layout *layout, const void *pNext, uint32_t *cmd_size,
                               uint32_t *upload_size)
144 {
145 const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
146 const struct radv_physical_device *pdev = radv_device_physical(device);
147
148 const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
149 vk_find_struct_const(pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
150 const VkGeneratedCommandsShaderInfoEXT *eso_info = vk_find_struct_const(pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
151
152 struct radv_shader *cs = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_COMPUTE);
153
154 /* dispatch */
155 *cmd_size += 5 * 4;
156
157 if (cs) {
158 const struct radv_userdata_info *loc = radv_get_user_sgpr_info(cs, AC_UD_CS_GRID_SIZE);
159 if (loc->sgpr_idx != -1) {
160 if (device->load_grid_size_from_user_sgpr) {
161 /* PKT3_SET_SH_REG for immediate values */
162 *cmd_size += 5 * 4;
163 } else {
164 /* PKT3_SET_SH_REG for pointer */
165 *cmd_size += 4 * 4;
166 }
167 }
168 } else {
169 /* COMPUTE_PGM_{LO,RSRC1,RSRC2} */
170 *cmd_size += 7 * 4;
171
172 if (pdev->info.gfx_level >= GFX10) {
173 /* COMPUTE_PGM_RSRC3 */
174 *cmd_size += 3 * 4;
175 }
176
177 /* COMPUTE_{RESOURCE_LIMITS,NUM_THREADS_X} */
178 *cmd_size += 8 * 4;
179
180 /* Assume the compute shader needs grid size because we can't know the information for
181 * indirect pipelines.
182 */
183 if (device->load_grid_size_from_user_sgpr) {
184 /* PKT3_SET_SH_REG for immediate values */
185 *cmd_size += 5 * 4;
186 } else {
187 /* PKT3_SET_SH_REG for pointer */
188 *cmd_size += 4 * 4;
189 }
190
191 /* PKT3_SET_SH_REG for indirect descriptor sets pointer */
192 *cmd_size += 3 * 4;
193 }
194
195 if (device->sqtt.bo) {
196 /* sqtt markers */
197 *cmd_size += 8 * 3 * 4;
198 }
199 }
200
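/* Worst-case command/upload size of one DGC graphics draw sequence, including the ACE side when a
 * task shader is bound.
 */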
201 static void
radv_get_sequence_size_graphics(const struct radv_indirect_command_layout *layout, const void *pNext,
                                uint32_t *cmd_size, uint32_t *ace_cmd_size, uint32_t *upload_size)
204 {
205 const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
206 const struct radv_physical_device *pdev = radv_device_physical(device);
207
208 const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
209 vk_find_struct_const(pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
210 const VkGeneratedCommandsShaderInfoEXT *eso_info = vk_find_struct_const(pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
211
212 struct radv_shader *vs = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_VERTEX);
213
214 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) {
215 *upload_size += 16 * util_bitcount(vs->info.vs.vb_desc_usage_mask);
216
217 /* One PKT3_SET_SH_REG for emitting VBO pointer (32-bit) */
218 *cmd_size += 3 * 4;
219 }
220
221 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IB)) {
222 /* Index type write (normal reg write) + index buffer base write (64-bits, but special packet
223 * so only 1 word overhead) + index buffer size (again, special packet so only 1 word
224 * overhead)
225 */
226 *cmd_size += (3 + 3 + 2) * 4;
227 }
228
229 if (layout->vk.draw_count) {
230 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) {
231 const struct radv_shader *task_shader = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK);
232
233 if (task_shader) {
234 /* PKT3_DISPATCH_TASKMESH_GFX */
235 *cmd_size += 4 * 4;
236
237 /* PKT3_DISPATCH_TASKMESH_INDIRECT_MULTI_ACE */
238 *ace_cmd_size += 11 * 4;
239 } else {
240 struct radv_shader *ms = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_MESH);
241
242 /* PKT3_SET_BASE + PKT3_SET_SH_REG + PKT3_DISPATCH_MESH_INDIRECT_MULTI */
243 *cmd_size += (4 + (ms->info.vs.needs_draw_id ? 3 : 0) + 9) * 4;
244 }
245 } else {
246 /* PKT3_SET_BASE + PKT3_DRAW_{INDEX}_INDIRECT_MULTI */
247 *cmd_size += (4 + 10) * 4;
248 }
249 } else {
250 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_INDEXED)) {
251 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IB)) {
252 /* userdata writes + instance count + indexed draw */
253 *cmd_size += (5 + 2 + 5) * 4;
254 } else {
255 /* PKT3_SET_BASE + PKT3_SET_SH_REG + PKT3_DRAW_{INDEX}_INDIRECT_MULTI */
256 *cmd_size += (4 + (vs->info.vs.needs_draw_id ? 10 : 5)) * 4;
257 }
258 } else {
259 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) {
260 const struct radv_shader *task_shader = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK);
261
262 if (task_shader) {
263 const struct radv_userdata_info *xyz_loc = radv_get_user_sgpr_info(task_shader, AC_UD_CS_GRID_SIZE);
264 const struct radv_userdata_info *draw_id_loc =
265 radv_get_user_sgpr_info(task_shader, AC_UD_CS_TASK_DRAW_ID);
266
267 /* PKT3_DISPATCH_TASKMESH_GFX */
268 *cmd_size += 4 * 4;
269
270 if (xyz_loc->sgpr_idx != -1)
271 *ace_cmd_size += 5 * 4;
272 if (draw_id_loc->sgpr_idx != -1)
273 *ace_cmd_size += 3 * 4;
274
275 /* PKT3_DISPATCH_TASKMESH_DIRECT_ACE */
276 *ace_cmd_size += 6 * 4;
277 } else {
278 /* userdata writes + instance count + non-indexed draw */
279 *cmd_size += (6 + 2 + (pdev->mesh_fast_launch_2 ? 5 : 3)) * 4;
280 }
281 } else {
282 /* userdata writes + instance count + non-indexed draw */
283 *cmd_size += (5 + 2 + 3) * 4;
284 }
285 }
286 }
287
288 if (device->sqtt.bo) {
289 /* sqtt markers */
290 *cmd_size += 5 * 3 * 4;
291 }
292 }
293
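/* Worst-case command size of one DGC ray tracing dispatch sequence, based on the user SGPRs
 * needed by the RT prolog.
 */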
294 static void
radv_get_sequence_size_rt(const struct radv_indirect_command_layout *layout, const void *pNext, uint32_t *cmd_size,
                          uint32_t *upload_size)
297 {
298 const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
299
300 const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
301 vk_find_struct_const(pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
302 VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline);
303 const struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
304 const struct radv_shader *rt_prolog = rt_pipeline->prolog;
305
306 /* dispatch */
307 *cmd_size += 5 * 4;
308
309 const struct radv_userdata_info *cs_grid_size_loc = radv_get_user_sgpr_info(rt_prolog, AC_UD_CS_GRID_SIZE);
310 if (cs_grid_size_loc->sgpr_idx != -1) {
311 if (device->load_grid_size_from_user_sgpr) {
312 /* PKT3_LOAD_SH_REG_INDEX */
313 *cmd_size += 5 * 4;
314 } else {
315 /* PKT3_SET_SH_REG for pointer */
316 *cmd_size += 4 * 4;
317 }
318 }
319
320 const struct radv_userdata_info *cs_sbt_descriptors_loc =
321 radv_get_user_sgpr_info(rt_prolog, AC_UD_CS_SBT_DESCRIPTORS);
322 if (cs_sbt_descriptors_loc->sgpr_idx != -1) {
323 /* PKT3_SET_SH_REG for pointer */
324 *cmd_size += 4 * 4;
325 }
326
327 const struct radv_userdata_info *cs_ray_launch_size_addr_loc =
328 radv_get_user_sgpr_info(rt_prolog, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR);
329 if (cs_ray_launch_size_addr_loc->sgpr_idx != -1) {
330 /* PKT3_SET_SH_REG for pointer */
331 *cmd_size += 4 * 4;
332 }
333
334 if (device->sqtt.bo) {
335 /* sqtt markers */
336 *cmd_size += 5 * 3 * 4;
337 }
338 }
339
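/* Compute the per-sequence command stride (GFX and ACE) and the upload stride for the given
 * layout and the currently bound shaders.
 */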
340 static void
radv_get_sequence_size(const struct radv_indirect_command_layout *layout, const void *pNext, uint32_t *cmd_size,
                       uint32_t *ace_cmd_size, uint32_t *upload_size)
343 {
344 const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
345 const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
346 vk_find_struct_const(pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
347 const VkGeneratedCommandsShaderInfoEXT *eso_info = vk_find_struct_const(pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
348
349 *cmd_size = 0;
350 *ace_cmd_size = 0;
351 *upload_size = 0;
352
353 if (layout->vk.dgc_info & (BITFIELD_BIT(MESA_VK_DGC_PC) | BITFIELD_BIT(MESA_VK_DGC_SI))) {
354 VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, layout->vk.layout);
355 bool need_copy = false;
356
357 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) {
358 /* Assume the compute shader needs both user SGPRs because we can't know the information
359 * for indirect pipelines.
360 */
361 *cmd_size += 3 * 4;
362 need_copy = true;
363
364 *cmd_size += (3 * util_bitcount64(layout->push_constant_mask)) * 4;
365 } else {
366 struct radv_shader *shaders[MESA_VULKAN_SHADER_STAGES] = {0};
367 if (pipeline_info) {
368 VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline);
369
370 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) {
371 const struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
372 struct radv_shader *rt_prolog = rt_pipeline->prolog;
373
374 shaders[MESA_SHADER_COMPUTE] = rt_prolog;
375 } else {
376 memcpy(shaders, pipeline->shaders, sizeof(shaders));
377 }
378 } else if (eso_info) {
379 for (unsigned i = 0; i < eso_info->shaderCount; ++i) {
380 VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]);
381 struct radv_shader *shader = shader_object->shader;
382 gl_shader_stage stage = shader->info.stage;
383
384 shaders[stage] = shader;
385 }
386 }
387
388 for (unsigned i = 0; i < ARRAY_SIZE(shaders); ++i) {
389 const struct radv_shader *shader = shaders[i];
390
391 if (!shader)
392 continue;
393
394 const struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs;
395 if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
396 /* One PKT3_SET_SH_REG for emitting push constants pointer (32-bit) */
397 if (i == MESA_SHADER_TASK) {
398 *ace_cmd_size += 3 * 4;
399 } else {
400 *cmd_size += 3 * 4;
401 }
402 need_copy = true;
403 }
404 if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
405 /* One PKT3_SET_SH_REG writing all inline push constants. */
406 const uint32_t inline_pc_size = (3 * util_bitcount64(layout->push_constant_mask)) * 4;
407
408 if (i == MESA_SHADER_TASK) {
409 *ace_cmd_size += inline_pc_size;
410 } else {
411 *cmd_size += inline_pc_size;
412 }
413 }
414 }
415 }
416
417 if (need_copy) {
418 *upload_size += align(pipeline_layout->push_constant_size, 16);
419 }
420 }
421
422 if (device->sqtt.bo) {
423 /* THREAD_TRACE_MARKER */
424 *cmd_size += 2 * 4;
425 }
426
427 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
428 radv_get_sequence_size_compute(layout, pNext, cmd_size, upload_size);
429 } else if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) {
430 radv_get_sequence_size_rt(layout, pNext, cmd_size, upload_size);
431 } else {
432 radv_get_sequence_size_graphics(layout, pNext, cmd_size, ace_cmd_size, upload_size);
433 }
434 }
435
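/* Offsets/sizes of the different parts (trailer, preamble, main commands, upload data) inside the
 * DGC buffer allocation. All offsets are relative to the start of the allocation.
 */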
436 struct dgc_cmdbuf_layout {
437 bool use_preamble;
438 uint32_t alloc_size;
439
440 uint32_t main_trailer_offset;
441 uint32_t main_preamble_offset;
442 uint32_t main_offset;
443 uint32_t main_cmd_stride;
444 uint32_t main_preamble_size;
445 uint32_t main_size;
446
447 uint32_t ace_trailer_offset;
448 uint32_t ace_preamble_offset;
449 uint32_t ace_main_offset;
450 uint32_t ace_cmd_stride;
451 uint32_t ace_preamble_size;
452 uint32_t ace_size;
453
454 uint32_t upload_offset;
455 uint32_t upload_stride;
456 uint32_t upload_size;
457 };
458
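/* Compute the DGC buffer layout for the given sequence count, following the scheme described at
 * the top of this file.
 */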
459 static void
get_dgc_cmdbuf_layout(const struct radv_device *device, const struct radv_indirect_command_layout *dgc_layout,
                      const void *pNext, uint32_t sequences_count, bool use_preamble, struct dgc_cmdbuf_layout *layout)
462 {
463 uint32_t offset = 0;
464
465 memset(layout, 0, sizeof(*layout));
466
467 radv_get_sequence_size(dgc_layout, pNext, &layout->main_cmd_stride, &layout->ace_cmd_stride, &layout->upload_stride);
468
469 layout->use_preamble = use_preamble;
470 if (layout->use_preamble) {
471 layout->main_preamble_size = radv_dgc_preamble_cmdbuf_size(device, AMD_IP_GFX);
472 layout->ace_preamble_size = radv_dgc_preamble_cmdbuf_size(device, AMD_IP_COMPUTE);
473 }
474
475 layout->main_size =
476 radv_pad_cmdbuf(device, (layout->main_cmd_stride * sequences_count) + PKT3_INDIRECT_BUFFER_BYTES, AMD_IP_GFX);
477 layout->ace_size =
478 radv_pad_cmdbuf(device, (layout->ace_cmd_stride * sequences_count) + PKT3_INDIRECT_BUFFER_BYTES, AMD_IP_COMPUTE);
479 layout->upload_size = layout->upload_stride * sequences_count;
480
481 /* Main */
482 layout->main_trailer_offset = 0;
483
484 offset += radv_dgc_trailer_cmdbuf_size(device, AMD_IP_GFX);
485 offset = radv_align_cmdbuf(device, offset, AMD_IP_GFX);
486 layout->main_preamble_offset = offset;
487
488 if (layout->use_preamble)
489 offset += layout->main_preamble_size;
490 offset = radv_align_cmdbuf(device, offset, AMD_IP_GFX);
491
492 layout->main_offset = offset;
493 offset += layout->main_size;
494
495 /* ACE */
496 if (layout->ace_cmd_stride) {
497 offset = radv_align_cmdbuf(device, offset, AMD_IP_COMPUTE);
498
499 layout->ace_trailer_offset = offset;
500
501 offset += radv_dgc_trailer_cmdbuf_size(device, AMD_IP_COMPUTE);
502 offset = radv_align_cmdbuf(device, offset, AMD_IP_COMPUTE);
503
504 layout->ace_preamble_offset = offset;
505
506 if (layout->use_preamble)
507 offset += layout->ace_preamble_size;
508 offset = radv_align_cmdbuf(device, offset, AMD_IP_COMPUTE);
509
510 layout->ace_main_offset = offset;
511 offset += layout->ace_size;
512 }
513
514 /* Upload */
515 layout->upload_offset = offset;
516 offset += layout->upload_size;
517
518 layout->alloc_size = offset;
519 }
520
521 static uint32_t
radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo, enum amd_ip_type ip_type)
523 {
524 VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
525 const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
526 const bool use_preamble = radv_dgc_use_preamble(pGeneratedCommandsInfo);
527 const uint32_t sequences_count = pGeneratedCommandsInfo->maxSequenceCount;
528 struct dgc_cmdbuf_layout cmdbuf_layout;
529
530 get_dgc_cmdbuf_layout(device, layout, pGeneratedCommandsInfo->pNext, sequences_count, use_preamble, &cmdbuf_layout);
531
532 if (use_preamble)
533 return ip_type == AMD_IP_GFX ? cmdbuf_layout.main_preamble_size : cmdbuf_layout.ace_preamble_size;
534
535 return ip_type == AMD_IP_GFX ? cmdbuf_layout.main_size : cmdbuf_layout.ace_size;
536 }
537
538 static uint32_t
radv_get_indirect_cmdbuf_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo, enum amd_ip_type ip_type)
540 {
541 VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
542 const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
543 const bool use_preamble = radv_dgc_use_preamble(pGeneratedCommandsInfo);
544 const uint32_t sequences_count = pGeneratedCommandsInfo->maxSequenceCount;
545 struct dgc_cmdbuf_layout cmdbuf_layout;
546
547 get_dgc_cmdbuf_layout(device, layout, pGeneratedCommandsInfo->pNext, sequences_count, use_preamble, &cmdbuf_layout);
548
549 return ip_type == AMD_IP_GFX ? cmdbuf_layout.main_preamble_offset : cmdbuf_layout.ace_preamble_offset;
550 }
551
552 static uint32_t
radv_get_indirect_trailer_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo, enum amd_ip_type ip_type)
554 {
555 VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
556 const struct radv_device *device = container_of(layout->vk.base.device, struct radv_device, vk);
557 const bool use_preamble = radv_dgc_use_preamble(pGeneratedCommandsInfo);
558 const uint32_t sequences_count = pGeneratedCommandsInfo->maxSequenceCount;
559 struct dgc_cmdbuf_layout cmdbuf_layout;
560
561 get_dgc_cmdbuf_layout(device, layout, pGeneratedCommandsInfo->pNext, sequences_count, use_preamble, &cmdbuf_layout);
562
563 const uint32_t offset = ip_type == AMD_IP_GFX ? cmdbuf_layout.main_trailer_offset : cmdbuf_layout.ace_trailer_offset;
564
565 return offset + radv_dgc_trailer_cmdbuf_size(device, ip_type) - PKT3_INDIRECT_BUFFER_BYTES;
566 }
567
568 uint32_t
radv_get_indirect_main_cmdbuf_size(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
570 {
571 return radv_get_indirect_cmdbuf_size(pGeneratedCommandsInfo, AMD_IP_GFX);
572 }
573
574 uint32_t
radv_get_indirect_main_cmdbuf_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
576 {
577 return radv_get_indirect_cmdbuf_offset(pGeneratedCommandsInfo, AMD_IP_GFX);
578 }
579
580 uint32_t
radv_get_indirect_main_trailer_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
582 {
583 return radv_get_indirect_trailer_offset(pGeneratedCommandsInfo, AMD_IP_GFX);
584 }
585
586 uint32_t
radv_get_indirect_ace_cmdbuf_size(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
588 {
589 return radv_get_indirect_cmdbuf_size(pGeneratedCommandsInfo, AMD_IP_COMPUTE);
590 }
591
592 uint32_t
radv_get_indirect_ace_cmdbuf_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
594 {
595 return radv_get_indirect_cmdbuf_offset(pGeneratedCommandsInfo, AMD_IP_COMPUTE);
596 }
597
598 uint32_t
radv_get_indirect_ace_trailer_offset(const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
600 {
601 return radv_get_indirect_trailer_offset(pGeneratedCommandsInfo, AMD_IP_COMPUTE);
602 }
603
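/* Push constants of the DGC preparation shader, loaded with the load_param* macros below. */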
604 struct radv_dgc_params {
605 uint32_t cmd_buf_preamble_offset;
606 uint32_t cmd_buf_main_offset;
607 uint32_t cmd_buf_stride;
608 uint32_t cmd_buf_size;
609 uint32_t ace_cmd_buf_trailer_offset;
610 uint32_t ace_cmd_buf_preamble_offset;
611 uint32_t ace_cmd_buf_main_offset;
612 uint32_t ace_cmd_buf_stride;
613 uint32_t ace_cmd_buf_size;
614 uint32_t upload_main_offset;
615 uint32_t upload_stride;
616 uint32_t upload_addr;
617 uint32_t sequence_count;
618 uint64_t sequence_count_addr;
619 uint64_t stream_addr;
620
621 uint8_t queue_family;
622 uint8_t use_preamble;
623
624 /* draw info */
625 uint16_t vtx_base_sgpr;
626 uint32_t max_index_count;
627 uint32_t max_draw_count;
628
629 /* task/mesh info */
630 uint8_t has_task_shader;
631 uint16_t mesh_ring_entry_sgpr;
632 uint8_t linear_dispatch_en;
633 uint16_t task_ring_entry_sgpr;
634 uint16_t task_xyz_sgpr;
635 uint16_t task_draw_id_sgpr;
636
637 /* dispatch info */
638 uint16_t grid_base_sgpr;
639 uint32_t wave32;
640
641 /* RT info */
642 uint16_t cs_sbt_descriptors;
643 uint16_t cs_ray_launch_size_addr;
644
645 /* VBO info */
646 uint32_t vb_desc_usage_mask;
647 uint16_t vbo_reg;
648 uint8_t dynamic_vs_input;
649 uint8_t use_per_attribute_vb_descs;
650
651 /* push constants info */
652 uint8_t const_copy;
653 uint16_t push_constant_stages;
654
655 /* IES info */
656 uint64_t ies_addr;
657 uint32_t ies_stride;
658 uint32_t indirect_desc_sets_va;
659
660 /* For conditional rendering on ACE. */
661 uint8_t predicating;
662 uint8_t predication_type;
663 uint64_t predication_va;
664 };
665
666 enum {
667 DGC_USES_DRAWID = 1u << 14,
668 DGC_USES_BASEINSTANCE = 1u << 15,
669 DGC_USES_GRID_SIZE = DGC_USES_BASEINSTANCE, /* Mesh shader only */
670 };
671
672 struct dgc_cmdbuf {
673 const struct radv_device *dev;
674 const struct radv_indirect_command_layout *layout;
675
676 nir_builder *b;
677 nir_def *va;
678 nir_variable *offset;
679 nir_variable *upload_offset;
680
681 nir_def *ies_va;
682 };
683
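/* Write 'count' dwords to the generated command buffer (in chunks of at most 4) and advance the
 * write offset.
 */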
684 static void
dgc_emit(struct dgc_cmdbuf *cs, unsigned count, nir_def **values)
686 {
687 nir_builder *b = cs->b;
688
689 for (unsigned i = 0; i < count; i += 4) {
690 nir_def *offset = nir_load_var(b, cs->offset);
691 nir_def *store_val = nir_vec(b, values + i, MIN2(count - i, 4));
692 assert(store_val->bit_size >= 32);
693 nir_build_store_global(b, store_val, nir_iadd(b, cs->va, nir_u2u64(b, offset)), .access = ACCESS_NON_READABLE);
694 nir_store_var(b, cs->offset, nir_iadd_imm(b, offset, store_val->num_components * store_val->bit_size / 8), 0x1);
695 }
696 }
697
698 static void
dgc_upload(struct dgc_cmdbuf *cs, nir_def *data)
700 {
701 nir_builder *b = cs->b;
702
703 nir_def *upload_offset = nir_load_var(b, cs->upload_offset);
704 nir_build_store_global(b, data, nir_iadd(b, cs->va, nir_u2u64(b, upload_offset)), .access = ACCESS_NON_READABLE);
705 nir_store_var(b, cs->upload_offset, nir_iadd_imm(b, upload_offset, data->num_components * data->bit_size / 8), 0x1);
706 }
707
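/* Load a field of radv_dgc_params from the push constants; 8-bit and 16-bit fields are extracted
 * from a dword-aligned 32-bit load.
 */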
708 #define load_param32(b, field) \
709 nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0), .base = offsetof(struct radv_dgc_params, field), .range = 4)
710
711 #define load_param16(b, field) \
712 nir_ubfe_imm((b), \
713 nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0), \
714 .base = (offsetof(struct radv_dgc_params, field) & ~3), .range = 4), \
715 (offsetof(struct radv_dgc_params, field) & 2) * 8, 16)
716
717 #define load_param8(b, field) \
718 nir_ubfe_imm((b), \
719 nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0), \
720 .base = (offsetof(struct radv_dgc_params, field) & ~3), .range = 4), \
721 (offsetof(struct radv_dgc_params, field) & 3) * 8, 8)
722
723 #define load_param64(b, field) \
724 nir_pack_64_2x32((b), nir_load_push_constant((b), 2, 32, nir_imm_int((b), 0), \
725 .base = offsetof(struct radv_dgc_params, field), .range = 8))
726
727 static nir_def *
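/* Compute the address of the indirect execution set (IES) entry selected by the current sequence. */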
dgc_load_ies_va(struct dgc_cmdbuf *cs, nir_def *stream_addr)
729 {
730 const struct radv_indirect_command_layout *layout = cs->layout;
731 nir_builder *b = cs->b;
732
733 nir_def *offset = nir_imm_int(b, layout->vk.ies_src_offset_B);
734 nir_def *ies_index =
735 nir_build_load_global(b, 1, 32, nir_iadd(b, stream_addr, nir_u2u64(b, offset)), .access = ACCESS_NON_WRITEABLE);
736 nir_def *ies_stride = load_param32(b, ies_stride);
737 nir_def *ies_offset = nir_imul(b, ies_index, ies_stride);
738
739 return nir_iadd(b, load_param64(b, ies_addr), nir_u2u64(b, ies_offset));
740 }
741
742 static nir_def *
dgc_load_shader_metadata(struct dgc_cmdbuf *cs, uint32_t bitsize, uint32_t field_offset)
744 {
745 const struct radv_indirect_command_layout *layout = cs->layout;
746 nir_builder *b = cs->b;
747
748 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) {
749 return nir_load_global(b, nir_iadd_imm(b, cs->ies_va, field_offset), 4, 1, bitsize);
750 } else {
751 nir_def *params_buf = radv_meta_load_descriptor(b, 0, 0);
752
753 return nir_load_ssbo(b, 1, bitsize, params_buf, nir_imm_int(b, field_offset));
754 }
755
756 return NULL;
757 }
758
759 #define load_shader_metadata32(cs, field) \
760 dgc_load_shader_metadata(cs, 32, offsetof(struct radv_compute_pipeline_metadata, field))
761 #define load_shader_metadata64(cs, field) \
762 dgc_load_shader_metadata(cs, 64, offsetof(struct radv_compute_pipeline_metadata, field))
763
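/* Load a field of radv_vbo_info for the given vertex buffer index from the params buffer. */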
764 static nir_def *
dgc_load_vbo_metadata(struct dgc_cmdbuf *cs, uint32_t bitsize, nir_def *idx, uint32_t field_offset)
766 {
767 nir_builder *b = cs->b;
768
769 nir_def *param_buf = radv_meta_load_descriptor(b, 0, 0);
770
771 nir_def *offset = nir_imul_imm(b, idx, DGC_VBO_INFO_SIZE);
772
773 return nir_load_ssbo(b, 1, bitsize, param_buf, nir_iadd_imm(b, offset, field_offset));
774 }
775
776 #define load_vbo_metadata32(cs, idx, field) dgc_load_vbo_metadata(cs, 32, idx, offsetof(struct radv_vbo_info, field))
777 #define load_vbo_metadata64(cs, idx, field) dgc_load_vbo_metadata(cs, 64, idx, offsetof(struct radv_vbo_info, field))
778 #define load_vbo_offset(cs, idx) dgc_load_vbo_metadata(cs, 32, idx, sizeof(struct radv_vbo_info))
779
780 /* DGC cs emit macros */
781 #define dgc_cs_begin(cs) \
782 struct dgc_cmdbuf *__cs = (cs); \
783 nir_def *__dwords[32]; \
784 unsigned __num_dw = 0;
785
786 #define dgc_cs_emit(value) \
787 assert(__num_dw < ARRAY_SIZE(__dwords)); \
788 __dwords[__num_dw++] = value;
789
790 #define dgc_cs_emit_imm(value) dgc_cs_emit(nir_imm_int(__cs->b, value));
791
792 #define dgc_cs_set_sh_reg_seq(reg, num) \
793 dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, num, 0)); \
794 dgc_cs_emit_imm((reg - SI_SH_REG_OFFSET) >> 2);
795
796 #define dgc_cs_end() dgc_emit(__cs, __num_dw, __dwords);
797
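/* Build a PKT3 header with a packet length that is only known at shader run time. */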
798 static nir_def *
nir_pkt3_base(nir_builder *b, unsigned op, nir_def *len, bool predicate)
800 {
801 len = nir_iand_imm(b, len, 0x3fff);
802 return nir_ior_imm(b, nir_ishl_imm(b, len, 16), PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT3_PREDICATE(predicate));
803 }
804
805 static nir_def *
nir_pkt3(nir_builder *b, unsigned op, nir_def *len)
807 {
808 return nir_pkt3_base(b, op, len, false);
809 }
810
811 /**
812 * SQTT
813 */
814 static void
dgc_emit_sqtt_userdata(struct dgc_cmdbuf *cs, nir_def *data)
816 {
817 const struct radv_device *device = cs->dev;
818 const struct radv_physical_device *pdev = radv_device_physical(device);
819 nir_builder *b = cs->b;
820
821 if (!cs->dev->sqtt.bo)
822 return;
823
824 dgc_cs_begin(cs);
825 dgc_cs_emit(nir_pkt3_base(b, PKT3_SET_UCONFIG_REG, nir_imm_int(b, 1), pdev->info.gfx_level >= GFX10));
826 dgc_cs_emit_imm((R_030D08_SQ_THREAD_TRACE_USERDATA_2 - CIK_UCONFIG_REG_OFFSET) >> 2);
827 dgc_cs_emit(data);
828 dgc_cs_end();
829 }
830
831 static void
dgc_emit_sqtt_thread_trace_marker(struct dgc_cmdbuf *cs)
833 {
834 if (!cs->dev->sqtt.bo)
835 return;
836
837 dgc_cs_begin(cs);
838 dgc_cs_emit_imm(PKT3(PKT3_EVENT_WRITE, 0, 0));
839 dgc_cs_emit_imm(EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER | EVENT_INDEX(0)));
840 dgc_cs_end();
841 }
842
843 static void
dgc_emit_sqtt_marker_event(struct dgc_cmdbuf *cs, nir_def *sequence_id, enum rgp_sqtt_marker_event_type event)
845 {
846 struct rgp_sqtt_marker_event marker = {0};
847 nir_builder *b = cs->b;
848
849 marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
850 marker.api_type = event;
851
852 dgc_emit_sqtt_userdata(cs, nir_imm_int(b, marker.dword01));
853 dgc_emit_sqtt_userdata(cs, nir_imm_int(b, marker.dword02));
854 dgc_emit_sqtt_userdata(cs, sequence_id);
855 }
856
857 static void
dgc_emit_sqtt_marker_event_with_dims(struct dgc_cmdbuf *cs, nir_def *sequence_id, nir_def *x, nir_def *y, nir_def *z,
                                     enum rgp_sqtt_marker_event_type event)
860 {
861 struct rgp_sqtt_marker_event_with_dims marker = {0};
862 nir_builder *b = cs->b;
863
864 marker.event.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
865 marker.event.api_type = event;
866 marker.event.has_thread_dims = 1;
867
868 dgc_emit_sqtt_userdata(cs, nir_imm_int(b, marker.event.dword01));
869 dgc_emit_sqtt_userdata(cs, nir_imm_int(b, marker.event.dword02));
870 dgc_emit_sqtt_userdata(cs, sequence_id);
871 dgc_emit_sqtt_userdata(cs, x);
872 dgc_emit_sqtt_userdata(cs, y);
873 dgc_emit_sqtt_userdata(cs, z);
874 }
875
876 static void
dgc_emit_sqtt_begin_api_marker(struct dgc_cmdbuf *cs, enum rgp_sqtt_marker_general_api_type api_type)
878 {
879 struct rgp_sqtt_marker_general_api marker = {0};
880 nir_builder *b = cs->b;
881
882 marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_GENERAL_API;
883 marker.api_type = api_type;
884
885 dgc_emit_sqtt_userdata(cs, nir_imm_int(b, marker.dword01));
886 }
887
888 static void
dgc_emit_sqtt_end_api_marker(struct dgc_cmdbuf *cs, enum rgp_sqtt_marker_general_api_type api_type)
890 {
891 struct rgp_sqtt_marker_general_api marker = {0};
892 nir_builder *b = cs->b;
893
894 marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_GENERAL_API;
895 marker.api_type = api_type;
896 marker.is_end = 1;
897
898 dgc_emit_sqtt_userdata(cs, nir_imm_int(b, marker.dword01));
899 }
900
901 /**
902 * Command buffer
903 */
904 static nir_def *
dgc_cmd_buf_size(nir_builder *b, nir_def *sequence_count, bool is_ace, const struct radv_device *device)
906 {
907 nir_def *cmd_buf_size = is_ace ? load_param32(b, ace_cmd_buf_size) : load_param32(b, cmd_buf_size);
908 nir_def *cmd_buf_stride = is_ace ? load_param32(b, ace_cmd_buf_stride) : load_param32(b, cmd_buf_stride);
909 const enum amd_ip_type ip_type = is_ace ? AMD_IP_COMPUTE : AMD_IP_GFX;
910
911 nir_def *use_preamble = nir_ine_imm(b, load_param8(b, use_preamble), 0);
912 nir_def *size = nir_iadd_imm(b, nir_imul(b, cmd_buf_stride, sequence_count), PKT3_INDIRECT_BUFFER_BYTES);
913 unsigned align_mask = radv_pad_cmdbuf(device, 1, ip_type) - 1;
914
915 size = nir_iand_imm(b, nir_iadd_imm(b, size, align_mask), ~align_mask);
916
917 /* Ensure we don't have to deal with a jump to an empty IB in the preamble. */
918 size = nir_imax(b, size, nir_imm_int(b, align_mask + 1));
919
920 return nir_bcsel(b, use_preamble, size, cmd_buf_size);
921 }
922
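/* Fill the unused tail of the generated command buffer with NOPs and, on compute queues, chain it
 * to the trailer with an INDIRECT_BUFFER packet.
 */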
923 static void
build_dgc_buffer_tail(nir_builder *b, nir_def *cmd_buf_offset, nir_def *cmd_buf_size, nir_def *cmd_buf_stride,
                      nir_def *cmd_buf_trailer_offset, nir_def *sequence_count, unsigned trailer_size, bool is_ace,
                      const struct radv_device *device)
927 {
928 const struct radv_physical_device *pdev = radv_device_physical(device);
929 nir_def *is_compute_queue = nir_ior_imm(b, nir_ieq_imm(b, load_param8(b, queue_family), RADV_QUEUE_COMPUTE), is_ace);
930
931 nir_def *global_id = get_global_ids(b, 1);
932
933 nir_push_if(b, nir_ieq_imm(b, global_id, 0));
934 {
935 nir_def *cmd_buf_tail_start = nir_imul(b, cmd_buf_stride, sequence_count);
936
937 nir_variable *offset = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "offset");
938 nir_store_var(b, offset, cmd_buf_tail_start, 0x1);
939
940 /* On compute queue, the DGC command buffer is chained by patching the
941 * trailer but this isn't needed on graphics because it's using IB2.
942 */
943 cmd_buf_size =
944 nir_bcsel(b, is_compute_queue, nir_iadd_imm(b, cmd_buf_size, -PKT3_INDIRECT_BUFFER_BYTES), cmd_buf_size);
945
946 nir_def *va = nir_pack_64_2x32_split(b, load_param32(b, upload_addr), nir_imm_int(b, pdev->info.address32_hi));
947 nir_push_loop(b);
948 {
949 nir_def *curr_offset = nir_load_var(b, offset);
950 const unsigned MAX_PACKET_WORDS = 0x3FFC;
951
952 nir_break_if(b, nir_ieq(b, curr_offset, cmd_buf_size));
953
954 nir_def *packet, *packet_size;
955
956 packet_size = nir_isub(b, cmd_buf_size, curr_offset);
957 packet_size = nir_umin(b, packet_size, nir_imm_int(b, MAX_PACKET_WORDS * 4));
958
959 nir_def *len = nir_ushr_imm(b, packet_size, 2);
960 len = nir_iadd_imm(b, len, -2);
961 packet = nir_pkt3(b, PKT3_NOP, len);
962
963 nir_build_store_global(b, packet, nir_iadd(b, va, nir_u2u64(b, nir_iadd(b, curr_offset, cmd_buf_offset))),
964 .access = ACCESS_NON_READABLE);
965
966 nir_store_var(b, offset, nir_iadd(b, curr_offset, packet_size), 0x1);
967 }
968 nir_pop_loop(b, NULL);
969
970 nir_push_if(b, is_compute_queue);
971 {
972 nir_def *chain_packets[] = {
973 nir_imm_int(b, PKT3(PKT3_INDIRECT_BUFFER, 2, 0)),
974 nir_iadd(b, load_param32(b, upload_addr), cmd_buf_trailer_offset),
975 nir_imm_int(b, pdev->info.address32_hi),
976 nir_imm_int(b, trailer_size | S_3F2_CHAIN(1) | S_3F2_VALID(1) | S_3F2_PRE_ENA(false)),
977 };
978
979 nir_build_store_global(b, nir_vec(b, chain_packets, 4),
980 nir_iadd(b, va, nir_u2u64(b, nir_iadd(b, nir_load_var(b, offset), cmd_buf_offset))),
981 .access = ACCESS_NON_READABLE);
982 }
983 nir_pop_if(b, NULL);
984 }
985 nir_pop_if(b, NULL);
986 }
987
988 static void
build_dgc_buffer_tail_main(nir_builder *b, nir_def *sequence_count, const struct radv_device *device)
990 {
991 nir_def *cmd_buf_offset = load_param32(b, cmd_buf_main_offset);
992 nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, false, device);
993 nir_def *cmd_buf_stride = load_param32(b, cmd_buf_stride);
994 nir_def *cmd_buf_trailer_offset = nir_imm_int(b, 0);
995 unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_GFX) / 4;
996
997 build_dgc_buffer_tail(b, cmd_buf_offset, cmd_buf_size, cmd_buf_stride, cmd_buf_trailer_offset, sequence_count,
998 trailer_size, false, device);
999 }
1000
1001 static void
build_dgc_buffer_tail_ace(nir_builder *b, nir_def *sequence_count, const struct radv_device *device)
1003 {
1004 nir_def *cmd_buf_offset = load_param32(b, ace_cmd_buf_main_offset);
1005 nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, true, device);
1006 nir_def *cmd_buf_stride = load_param32(b, ace_cmd_buf_stride);
1007 nir_def *cmd_buf_trailer_offset = load_param32(b, ace_cmd_buf_trailer_offset);
1008 unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_COMPUTE) / 4;
1009
1010 build_dgc_buffer_tail(b, cmd_buf_offset, cmd_buf_size, cmd_buf_stride, cmd_buf_trailer_offset, sequence_count,
1011 trailer_size, true, device);
1012 }
1013
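/* Initialize the trailer with NOPs; the last four dwords are patched at execute time to chain
 * back the DGC command buffer (see the layout description at the top of this file).
 */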
1014 static void
build_dgc_buffer_trailer(nir_builder *b, nir_def *cmd_buf_offset, unsigned trailer_size,
                         const struct radv_device *device)
1017 {
1018 const struct radv_physical_device *pdev = radv_device_physical(device);
1019
1020 nir_def *global_id = get_global_ids(b, 1);
1021
1022 nir_push_if(b, nir_ieq_imm(b, global_id, 0));
1023 {
1024 nir_def *va = nir_pack_64_2x32_split(b, load_param32(b, upload_addr), nir_imm_int(b, pdev->info.address32_hi));
1025 va = nir_iadd(b, va, nir_u2u64(b, cmd_buf_offset));
1026
1027 const uint32_t pad_size = trailer_size - PKT3_INDIRECT_BUFFER_BYTES;
1028 const uint32_t pad_size_dw = pad_size >> 2;
1029
1030 nir_def *len = nir_imm_int(b, pad_size_dw - 2);
1031 nir_def *packet = nir_pkt3(b, PKT3_NOP, len);
1032
1033 nir_build_store_global(b, packet, va, .access = ACCESS_NON_READABLE);
1034
1035 nir_def *nop_packets[] = {
1036 nir_imm_int(b, PKT3_NOP_PAD),
1037 nir_imm_int(b, PKT3_NOP_PAD),
1038 nir_imm_int(b, PKT3_NOP_PAD),
1039 nir_imm_int(b, PKT3_NOP_PAD),
1040 };
1041
1042 nir_build_store_global(b, nir_vec(b, nop_packets, 4), nir_iadd_imm(b, va, pad_size),
1043 .access = ACCESS_NON_READABLE);
1044 }
1045 nir_pop_if(b, NULL);
1046 }
1047
1048 static void
build_dgc_buffer_trailer_main(nir_builder *b, const struct radv_device *device)
1050 {
1051 nir_def *cmd_buf_offset = nir_imm_int(b, 0);
1052 const unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_GFX);
1053
1054 build_dgc_buffer_trailer(b, cmd_buf_offset, trailer_size, device);
1055 }
1056
1057 static void
build_dgc_buffer_trailer_ace(nir_builder *b, const struct radv_device *device)
1059 {
1060 nir_def *cmd_buf_offset = load_param32(b, ace_cmd_buf_trailer_offset);
1061 const unsigned trailer_size = radv_dgc_trailer_cmdbuf_size(device, AMD_IP_COMPUTE);
1062
1063 build_dgc_buffer_trailer(b, cmd_buf_offset, trailer_size, device);
1064 }
1065
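/* Emit the optional preamble: NOP padding followed by an INDIRECT_BUFFER packet that jumps to the
 * main generated command buffer.
 */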
1066 static void
build_dgc_buffer_preamble(nir_builder *b, nir_def *cmd_buf_preamble_offset, nir_def *cmd_buf_size,
                          nir_def *cmd_buf_main_offset, unsigned preamble_size, nir_def *sequence_count,
                          const struct radv_device *device)
1070 {
1071 const struct radv_physical_device *pdev = radv_device_physical(device);
1072
1073 nir_def *global_id = get_global_ids(b, 1);
1074 nir_def *use_preamble = nir_ine_imm(b, load_param8(b, use_preamble), 0);
1075
1076 nir_push_if(b, nir_iand(b, nir_ieq_imm(b, global_id, 0), use_preamble));
1077 {
1078 nir_def *va = nir_pack_64_2x32_split(b, load_param32(b, upload_addr), nir_imm_int(b, pdev->info.address32_hi));
1079 va = nir_iadd(b, va, nir_u2u64(b, cmd_buf_preamble_offset));
1080
1081 nir_def *words = nir_ushr_imm(b, cmd_buf_size, 2);
1082
1083 const uint32_t pad_size = preamble_size - PKT3_INDIRECT_BUFFER_BYTES;
1084 const uint32_t pad_size_dw = pad_size >> 2;
1085
1086 nir_def *len = nir_imm_int(b, pad_size_dw - 2);
1087 nir_def *packet = nir_pkt3(b, PKT3_NOP, len);
1088
1089 nir_build_store_global(b, packet, va, .access = ACCESS_NON_READABLE);
1090
1091 nir_def *chain_packets[] = {
1092 nir_imm_int(b, PKT3(PKT3_INDIRECT_BUFFER, 2, 0)),
1093 nir_iadd(b, cmd_buf_main_offset, load_param32(b, upload_addr)),
1094 nir_imm_int(b, pdev->info.address32_hi),
1095 nir_ior_imm(b, words, S_3F2_CHAIN(1) | S_3F2_VALID(1) | S_3F2_PRE_ENA(false)),
1096 };
1097
1098 nir_build_store_global(b, nir_vec(b, chain_packets, 4), nir_iadd_imm(b, va, pad_size),
1099 .access = ACCESS_NON_READABLE);
1100 }
1101 nir_pop_if(b, NULL);
1102 }
1103
1104 static void
build_dgc_buffer_preamble_main(nir_builder *b, nir_def *sequence_count, const struct radv_device *device)
1106 {
1107 nir_def *cmd_buf_preamble_offset = load_param32(b, cmd_buf_preamble_offset);
1108 nir_def *cmd_buf_main_offset = load_param32(b, cmd_buf_main_offset);
1109 nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, false, device);
1110 unsigned preamble_size = radv_dgc_preamble_cmdbuf_size(device, AMD_IP_GFX);
1111
1112 build_dgc_buffer_preamble(b, cmd_buf_preamble_offset, cmd_buf_size, cmd_buf_main_offset, preamble_size,
1113 sequence_count, device);
1114 }
1115
1116 static void
build_dgc_buffer_preamble_ace(nir_builder *b, nir_def *sequence_count, const struct radv_device *device)
1118 {
1119 nir_def *cmd_buf_preamble_offset = load_param32(b, ace_cmd_buf_preamble_offset);
1120 nir_def *cmd_buf_main_offset = load_param32(b, ace_cmd_buf_main_offset);
1121 nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, true, device);
1122 unsigned preamble_size = radv_dgc_preamble_cmdbuf_size(device, AMD_IP_COMPUTE);
1123
1124 build_dgc_buffer_preamble(b, cmd_buf_preamble_offset, cmd_buf_size, cmd_buf_main_offset, preamble_size,
1125 sequence_count, device);
1126 }
1127
1128 /**
1129 * Draw
1130 */
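/* Emit the SET_SH_REG packet that writes the base vertex, draw id and base instance user SGPRs
 * for a direct draw.
 */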
1131 static void
dgc_emit_userdata_vertex(struct dgc_cmdbuf *cs, nir_def *first_vertex, nir_def *first_instance, nir_def *drawid)
1133 {
1134 nir_builder *b = cs->b;
1135
1136 nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
1137 vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr);
1138
1139 nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
1140 nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE);
1141
1142 nir_def *pkt_cnt = nir_imm_int(b, 1);
1143 pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
1144 pkt_cnt = nir_bcsel(b, has_baseinstance, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
1145
1146 dgc_cs_begin(cs);
1147 dgc_cs_emit(nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt));
1148 dgc_cs_emit(nir_iand_imm(b, vtx_base_sgpr, 0x3FFF));
1149 dgc_cs_emit(first_vertex);
1150 dgc_cs_emit(nir_bcsel(b, nir_ior(b, has_drawid, has_baseinstance), nir_bcsel(b, has_drawid, drawid, first_instance),
1151 nir_imm_int(b, PKT3_NOP_PAD)));
1152 dgc_cs_emit(nir_bcsel(b, nir_iand(b, has_drawid, has_baseinstance), first_instance, nir_imm_int(b, PKT3_NOP_PAD)));
1153 dgc_cs_end();
1154 }
1155
1156 static void
dgc_emit_instance_count(struct dgc_cmdbuf *cs, nir_def *instance_count)
1158 {
1159 dgc_cs_begin(cs);
1160 dgc_cs_emit_imm(PKT3(PKT3_NUM_INSTANCES, 0, 0));
1161 dgc_cs_emit(instance_count);
1162 dgc_cs_end();
1163 }
1164
1165 static void
dgc_emit_draw_index_offset_2(struct dgc_cmdbuf *cs, nir_def *index_offset, nir_def *index_count,
                             nir_def *max_index_count)
1168 {
1169 dgc_cs_begin(cs);
1170 dgc_cs_emit_imm(PKT3(PKT3_DRAW_INDEX_OFFSET_2, 3, 0));
1171 dgc_cs_emit(max_index_count);
1172 dgc_cs_emit(index_offset);
1173 dgc_cs_emit(index_count);
1174 dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_DMA);
1175 dgc_cs_end();
1176 }
1177
1178 static void
dgc_emit_draw_index_auto(struct dgc_cmdbuf *cs, nir_def *vertex_count)
1180 {
1181 dgc_cs_begin(cs);
1182 dgc_cs_emit_imm(PKT3(PKT3_DRAW_INDEX_AUTO, 1, 0));
1183 dgc_cs_emit(vertex_count);
1184 dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_AUTO_INDEX);
1185 dgc_cs_end();
1186 }
1187
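/* Set the base address used by the following DRAW_*_INDIRECT packets (PKT3_SET_BASE, base index 1). */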
1188 static void
dgc_emit_pkt3_set_base(struct dgc_cmdbuf *cs, nir_def *va)
1190 {
1191 nir_builder *b = cs->b;
1192
1193 nir_def *va_lo = nir_unpack_64_2x32_split_x(b, va);
1194 nir_def *va_hi = nir_unpack_64_2x32_split_y(b, va);
1195
1196 dgc_cs_begin(cs);
1197 dgc_cs_emit_imm(PKT3(PKT3_SET_BASE, 2, 0));
1198 dgc_cs_emit_imm(1);
1199 dgc_cs_emit(va_lo);
1200 dgc_cs_emit(va_hi);
1201 dgc_cs_end();
1202 }
1203
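/* Emit DRAW_{INDEX_}INDIRECT(_MULTI); the MULTI variant is used when the shader needs the draw id. */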
1204 static void
dgc_emit_pkt3_draw_indirect(struct dgc_cmdbuf *cs, bool indexed)
1206 {
1207 const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
1208 nir_builder *b = cs->b;
1209
1210 nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
1211
1212 nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
1213 nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE);
1214
1215 vtx_base_sgpr = nir_iand_imm(b, nir_u2u32(b, vtx_base_sgpr), 0x3FFF);
1216
1217 /* vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2 */
1218 nir_def *vertex_offset_reg = vtx_base_sgpr;
1219
1220 /* start_instance_reg = (base_reg + (draw_id_enable ? 8 : 4) - SI_SH_REG_OFFSET) >> 2 */
1221 nir_def *start_instance_offset = nir_bcsel(b, has_drawid, nir_imm_int(b, 2), nir_imm_int(b, 1));
1222 nir_def *start_instance_reg = nir_iadd(b, vtx_base_sgpr, start_instance_offset);
1223
1224 /* draw_id_reg = (base_reg + 4 - SI_SH_REG_OFFSET) >> 2 */
1225 nir_def *draw_id_reg = nir_iadd(b, vtx_base_sgpr, nir_imm_int(b, 1));
1226
1227 nir_if *if_drawid = nir_push_if(b, has_drawid);
1228 {
1229 const unsigned pkt3_op = indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI;
1230
1231 dgc_cs_begin(cs);
1232 dgc_cs_emit_imm(PKT3(pkt3_op, 8, 0));
1233 dgc_cs_emit_imm(0);
1234 dgc_cs_emit(vertex_offset_reg);
1235 dgc_cs_emit(nir_bcsel(b, has_baseinstance, start_instance_reg, nir_imm_int(b, 0)));
1236 dgc_cs_emit(nir_ior(b, draw_id_reg, nir_imm_int(b, S_2C3_DRAW_INDEX_ENABLE(1))));
1237 dgc_cs_emit_imm(1); /* draw count */
1238 dgc_cs_emit_imm(0); /* count va low */
1239 dgc_cs_emit_imm(0); /* count va high */
1240 dgc_cs_emit_imm(0); /* stride */
1241 dgc_cs_emit_imm(di_src_sel);
1242 dgc_cs_end();
1243 }
1244 nir_push_else(b, if_drawid);
1245 {
1246 const unsigned pkt3_op = indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT;
1247
1248 dgc_cs_begin(cs);
1249 dgc_cs_emit_imm(PKT3(pkt3_op, 3, 0));
1250 dgc_cs_emit_imm(0);
1251 dgc_cs_emit(vertex_offset_reg);
1252 dgc_cs_emit(nir_bcsel(b, has_baseinstance, start_instance_reg, nir_imm_int(b, 0)));
1253 dgc_cs_emit_imm(di_src_sel);
1254 dgc_cs_end();
1255 }
1256 nir_pop_if(b, if_drawid);
1257 }
1258
1259 static void
dgc_emit_draw_indirect(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, bool indexed)
1261 {
1262 const struct radv_indirect_command_layout *layout = cs->layout;
1263 nir_builder *b = cs->b;
1264
1265 nir_def *va = nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B);
1266
1267 dgc_emit_sqtt_begin_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirect : ApiCmdDrawIndirect);
1268 dgc_emit_sqtt_marker_event(cs, sequence_id, indexed ? EventCmdDrawIndexedIndirect : EventCmdDrawIndirect);
1269
1270 dgc_emit_pkt3_set_base(cs, va);
1271 dgc_emit_pkt3_draw_indirect(cs, indexed);
1272
1273 dgc_emit_sqtt_thread_trace_marker(cs);
1274 dgc_emit_sqtt_end_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirect : ApiCmdDrawIndirect);
1275 }
1276
1277 static void
dgc_emit_draw(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id)
1279 {
1280 const struct radv_indirect_command_layout *layout = cs->layout;
1281 nir_builder *b = cs->b;
1282
1283 nir_def *draw_data0 = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
1284 .access = ACCESS_NON_WRITEABLE);
1285 nir_def *vertex_count = nir_channel(b, draw_data0, 0);
1286 nir_def *instance_count = nir_channel(b, draw_data0, 1);
1287 nir_def *vertex_offset = nir_channel(b, draw_data0, 2);
1288 nir_def *first_instance = nir_channel(b, draw_data0, 3);
1289
1290 nir_push_if(b, nir_iand(b, nir_ine_imm(b, vertex_count, 0), nir_ine_imm(b, instance_count, 0)));
1291 {
1292 dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDraw);
1293 dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDraw);
1294
1295 dgc_emit_userdata_vertex(cs, vertex_offset, first_instance, nir_imm_int(b, 0));
1296 dgc_emit_instance_count(cs, instance_count);
1297 dgc_emit_draw_index_auto(cs, vertex_count);
1298
1299 dgc_emit_sqtt_thread_trace_marker(cs);
1300 dgc_emit_sqtt_end_api_marker(cs, ApiCmdDraw);
1301 }
1302 nir_pop_if(b, 0);
1303 }
1304
1305 static void
dgc_emit_draw_indexed(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, nir_def *max_index_count)
1307 {
1308 const struct radv_indirect_command_layout *layout = cs->layout;
1309 nir_builder *b = cs->b;
1310
1311 nir_def *draw_data0 = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
1312 .access = ACCESS_NON_WRITEABLE);
1313 nir_def *draw_data1 =
1314 nir_build_load_global(b, 1, 32, nir_iadd_imm(b, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B), 16),
1315 .access = ACCESS_NON_WRITEABLE);
1316 nir_def *index_count = nir_channel(b, draw_data0, 0);
1317 nir_def *instance_count = nir_channel(b, draw_data0, 1);
1318 nir_def *first_index = nir_channel(b, draw_data0, 2);
1319 nir_def *vertex_offset = nir_channel(b, draw_data0, 3);
1320 nir_def *first_instance = nir_channel(b, draw_data1, 0);
1321
1322 nir_push_if(b, nir_iand(b, nir_ine_imm(b, index_count, 0), nir_ine_imm(b, instance_count, 0)));
1323 {
1324 dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDrawIndexed);
1325 dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDrawIndexed);
1326
1327 dgc_emit_userdata_vertex(cs, vertex_offset, first_instance, nir_imm_int(b, 0));
1328 dgc_emit_instance_count(cs, instance_count);
1329 dgc_emit_draw_index_offset_2(cs, first_index, index_count, max_index_count);
1330
1331 dgc_emit_sqtt_thread_trace_marker(cs);
1332 dgc_emit_sqtt_end_api_marker(cs, ApiCmdDrawIndexed);
1333 }
1334 nir_pop_if(b, 0);
1335 }
1336
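/* Emit a draw-with-count token: the indirect buffer address, stride and draw count are read from
 * the stream, and the count is clamped to maxDrawCount.
 */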
1337 static void
dgc_emit_draw_with_count(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, bool indexed)
1339 {
1340 const struct radv_indirect_command_layout *layout = cs->layout;
1341 nir_builder *b = cs->b;
1342
1343 nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
1344 nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
1345 nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE);
1346
1347 nir_def *draw_data = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
1348 .access = ACCESS_NON_WRITEABLE);
1349 nir_def *va = nir_pack_64_2x32(b, nir_channels(b, draw_data, 0x3));
1350 nir_def *stride = nir_channel(b, draw_data, 2);
1351 nir_def *draw_count = nir_umin(b, load_param32(b, max_draw_count), nir_channel(b, draw_data, 3));
1352
1353 dgc_emit_pkt3_set_base(cs, va);
1354
1355 nir_def *vertex_offset_reg = nir_iand_imm(b, vtx_base_sgpr, 0x3FFF);
1356 nir_def *start_instance_offset = nir_bcsel(b, has_drawid, nir_imm_int(b, 2), nir_imm_int(b, 1));
1357 nir_def *start_instance_reg =
1358 nir_bcsel(b, has_baseinstance, nir_iadd(b, vertex_offset_reg, start_instance_offset), nir_imm_int(b, 0));
1359 nir_def *draw_id_reg = nir_bcsel(
1360 b, has_drawid, nir_ior_imm(b, nir_iadd(b, vertex_offset_reg, nir_imm_int(b, 1)), S_2C3_DRAW_INDEX_ENABLE(1)),
1361 nir_imm_int(b, 0));
1362
1363 nir_def *di_src_sel = nir_imm_int(b, indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX);
1364
1365 dgc_emit_sqtt_begin_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirectCount : ApiCmdDrawIndirectCount);
1366 dgc_emit_sqtt_marker_event(cs, sequence_id, indexed ? EventCmdDrawIndexedIndirectCount : EventCmdDrawIndirectCount);
1367
1368 dgc_cs_begin(cs);
1369 dgc_cs_emit_imm(PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8, false));
1370 dgc_cs_emit_imm(0);
1371 dgc_cs_emit(vertex_offset_reg);
1372 dgc_cs_emit(start_instance_reg);
1373 dgc_cs_emit(draw_id_reg);
1374 dgc_cs_emit(draw_count);
1375 dgc_cs_emit_imm(0);
1376 dgc_cs_emit_imm(0);
1377 dgc_cs_emit(stride);
1378 dgc_cs_emit(di_src_sel);
1379 dgc_cs_end();
1380
1381 dgc_emit_sqtt_thread_trace_marker(cs);
1382 dgc_emit_sqtt_end_api_marker(cs, indexed ? ApiCmdDrawIndexedIndirectCount : ApiCmdDrawIndirectCount);
1383 }
1384
1385 /**
1386 * Index buffer
1387 */
1388 static nir_def *
1389 dgc_get_index_type(struct dgc_cmdbuf *cs, nir_def *user_index_type)
1390 {
1391 const struct radv_indirect_command_layout *layout = cs->layout;
1392 nir_builder *b = cs->b;
1393
1394 if (layout->vk.index_mode_is_dx) {
1395 nir_def *index_type = nir_bcsel(b, nir_ieq_imm(b, user_index_type, 0x2a /* DXGI_FORMAT_R32_UINT */),
1396 nir_imm_int(b, V_028A7C_VGT_INDEX_32), nir_imm_int(b, V_028A7C_VGT_INDEX_16));
1397 return nir_bcsel(b, nir_ieq_imm(b, user_index_type, 0x3e /* DXGI_FORMAT_R8_UINT */),
1398 nir_imm_int(b, V_028A7C_VGT_INDEX_8), index_type);
1399 } else {
1400 nir_def *index_type = nir_bcsel(b, nir_ieq_imm(b, user_index_type, VK_INDEX_TYPE_UINT32),
1401 nir_imm_int(b, V_028A7C_VGT_INDEX_32), nir_imm_int(b, V_028A7C_VGT_INDEX_16));
1402 return nir_bcsel(b, nir_ieq_imm(b, user_index_type, VK_INDEX_TYPE_UINT8), nir_imm_int(b, V_028A7C_VGT_INDEX_8),
1403 index_type);
1404 }
1405 }
1406
1407 static void
1408 dgc_emit_index_buffer(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_variable *max_index_count_var)
1409 {
1410 const struct radv_indirect_command_layout *layout = cs->layout;
1411 const struct radv_device *device = cs->dev;
1412 const struct radv_physical_device *pdev = radv_device_physical(device);
1413 nir_builder *b = cs->b;
1414
1415 nir_def *data = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.index_src_offset_B),
1416 .access = ACCESS_NON_WRITEABLE);
1417
1418 nir_def *index_type = dgc_get_index_type(cs, nir_channel(b, data, 3));
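   /* Extract the index size in bytes: 0x142 packs one nibble per VGT index type (2, 4 and 1 bytes
    * for the 16-bit, 32-bit and 8-bit types, respectively).
    */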
1419 nir_def *index_size = nir_iand_imm(b, nir_ushr(b, nir_imm_int(b, 0x142), nir_imul_imm(b, index_type, 4)), 0xf);
1420
1421 nir_def *max_index_count = nir_udiv(b, nir_channel(b, data, 2), index_size);
1422 nir_store_var(b, max_index_count_var, max_index_count, 0x1);
1423
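   /* Sign-extend bits [47:32] of the index buffer VA, which are stored in the low 16 bits of the
    * high dword.
    */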
1424 nir_def *addr_upper = nir_channel(b, data, 1);
1425 addr_upper = nir_ishr_imm(b, nir_ishl_imm(b, addr_upper, 16), 16);
1426
1427 dgc_cs_begin(cs);
1428
1429 if (pdev->info.gfx_level >= GFX9) {
1430 unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX;
1431 if (pdev->info.gfx_level < GFX9 || (pdev->info.gfx_level == GFX9 && pdev->info.me_fw_version < 26))
1432 opcode = PKT3_SET_UCONFIG_REG;
1433 dgc_cs_emit_imm(PKT3(opcode, 1, 0));
1434 dgc_cs_emit_imm((R_03090C_VGT_INDEX_TYPE - CIK_UCONFIG_REG_OFFSET) >> 2 | (2u << 28));
1435 dgc_cs_emit(index_type);
1436 } else {
1437 dgc_cs_emit_imm(PKT3(PKT3_INDEX_TYPE, 0, 0));
1438 dgc_cs_emit(index_type);
1439 dgc_cs_emit(nir_imm_int(b, PKT3_NOP_PAD));
1440 }
1441
1442 dgc_cs_emit_imm(PKT3(PKT3_INDEX_BASE, 1, 0));
1443 dgc_cs_emit(nir_channel(b, data, 0));
1444 dgc_cs_emit(addr_upper);
1445
1446 dgc_cs_emit_imm(PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
1447 dgc_cs_emit(max_index_count);
1448
1449 dgc_cs_end();
1450 }
1451
1452 /**
1453 * Push constants
1454 */
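/* For graphics, the params buffer stores one 12-byte record per shader stage: a dword packing the
 * upload SGPR (low 16 bits) and the inline SGPR (high 16 bits), followed by a 64-bit mask of
 * inlined push constants. Compute dispatches get the same information from the shader metadata.
 */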
1455 static nir_def *
1456 dgc_get_push_constant_stages(struct dgc_cmdbuf *cs)
1457 {
1458 const struct radv_indirect_command_layout *layout = cs->layout;
1459 nir_builder *b = cs->b;
1460
1461 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
1462 nir_def *has_push_constant = nir_ine_imm(b, load_shader_metadata32(cs, push_const_sgpr), 0);
1463 return nir_bcsel(b, has_push_constant, nir_imm_int(b, VK_SHADER_STAGE_COMPUTE_BIT), nir_imm_int(b, 0));
1464 } else {
1465 return load_param16(b, push_constant_stages);
1466 }
1467 }
1468
1469 static nir_def *
1470 dgc_get_upload_sgpr(struct dgc_cmdbuf *cs, nir_def *param_buf, nir_def *param_offset, gl_shader_stage stage)
1471 {
1472 const struct radv_indirect_command_layout *layout = cs->layout;
1473 nir_builder *b = cs->b;
1474 nir_def *res;
1475
1476 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
1477 res = load_shader_metadata32(cs, push_const_sgpr);
1478 } else {
1479 res = nir_load_ssbo(b, 1, 32, param_buf, nir_iadd_imm(b, param_offset, stage * 12));
1480 }
1481
1482 return nir_ubfe_imm(b, res, 0, 16);
1483 }
1484
1485 static nir_def *
1486 dgc_get_inline_sgpr(struct dgc_cmdbuf *cs, nir_def *param_buf, nir_def *param_offset, gl_shader_stage stage)
1487 {
1488 const struct radv_indirect_command_layout *layout = cs->layout;
1489 nir_builder *b = cs->b;
1490 nir_def *res;
1491
1492 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
1493 res = load_shader_metadata32(cs, push_const_sgpr);
1494 } else {
1495 res = nir_load_ssbo(b, 1, 32, param_buf, nir_iadd_imm(b, param_offset, stage * 12));
1496 }
1497
1498 return nir_ubfe_imm(b, res, 16, 16);
1499 }
1500
1501 static nir_def *
1502 dgc_get_inline_mask(struct dgc_cmdbuf *cs, nir_def *param_buf, nir_def *param_offset, gl_shader_stage stage)
1503 {
1504 const struct radv_indirect_command_layout *layout = cs->layout;
1505 nir_builder *b = cs->b;
1506
1507 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
1508 return load_shader_metadata64(cs, inline_push_const_mask);
1509 } else {
1510 nir_def *reg_info = nir_load_ssbo(b, 2, 32, param_buf, nir_iadd_imm(b, param_offset, stage * 12 + 4));
1511 return nir_pack_64_2x32(b, nir_channels(b, reg_info, 0x3));
1512 }
1513 }
1514
1515 static nir_def *
1516 dgc_push_constant_needs_copy(struct dgc_cmdbuf *cs)
1517 {
1518 const struct radv_indirect_command_layout *layout = cs->layout;
1519 nir_builder *b = cs->b;
1520
1521 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
1522 return nir_ine_imm(b, nir_ubfe_imm(b, load_shader_metadata32(cs, push_const_sgpr), 0, 16), 0);
1523 } else {
1524 return nir_ine_imm(b, load_param8(b, const_copy), 0);
1525 }
1526 }
1527
1528 struct dgc_pc_params {
1529 nir_def *buf;
1530 nir_def *offset;
1531 nir_def *const_offset;
1532 };
1533
1534 static struct dgc_pc_params
1535 dgc_get_pc_params(struct dgc_cmdbuf *cs)
1536 {
1537 const struct radv_indirect_command_layout *layout = cs->layout;
1538 struct dgc_pc_params params = {0};
1539 nir_builder *b = cs->b;
1540
1541 params.buf = radv_meta_load_descriptor(b, 0, 0);
1542
1543 uint32_t offset = 0;
1544 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
1545 offset =
1546 (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) ? 0 : sizeof(struct radv_compute_pipeline_metadata);
1547 } else {
1548 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB))
1549 offset = MAX_VBS * DGC_VBO_INFO_SIZE;
1550 }
1551
1552 params.offset = nir_imm_int(b, offset);
1553 params.const_offset = nir_iadd_imm(b, params.offset, MESA_VULKAN_SHADER_STAGES * 12);
1554
1555 return params;
1556 }
1557
1558 static void
1559 dgc_alloc_push_constant(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id,
1560 const struct dgc_pc_params *params)
1561 {
1562 const struct radv_indirect_command_layout *layout = cs->layout;
1563 VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, layout->vk.layout);
1564 nir_builder *b = cs->b;
1565
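   /* Each push-constant dword comes from one of three sources: the sequence index, the indirect
    * command stream, or the default values stored after the per-stage records in the params buffer.
    */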
1566 for (uint32_t i = 0; i < pipeline_layout->push_constant_size / 4; i++) {
1567 nir_def *data;
1568
1569 if (layout->sequence_index_mask & (1ull << i)) {
1570 data = sequence_id;
1571 } else if ((layout->push_constant_mask & (1ull << i))) {
1572 data = nir_build_load_global(b, 1, 32, nir_iadd_imm(b, stream_addr, layout->push_constant_offsets[i]),
1573 .access = ACCESS_NON_WRITEABLE);
1574 } else {
1575 data = nir_load_ssbo(b, 1, 32, params->buf, nir_iadd_imm(b, params->const_offset, i * 4));
1576 }
1577
1578 dgc_upload(cs, data);
1579 }
1580 }
1581
1582 static void
1583 dgc_emit_push_constant_for_stage(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id,
1584 const struct dgc_pc_params *params, gl_shader_stage stage)
1585 {
1586 const struct radv_indirect_command_layout *layout = cs->layout;
1587 VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, layout->vk.layout);
1588 nir_builder *b = cs->b;
1589
1590 nir_def *upload_sgpr = dgc_get_upload_sgpr(cs, params->buf, params->offset, stage);
1591 nir_def *inline_sgpr = dgc_get_inline_sgpr(cs, params->buf, params->offset, stage);
1592 nir_def *inline_mask = dgc_get_inline_mask(cs, params->buf, params->offset, stage);
1593
1594 nir_push_if(b, nir_ine_imm(b, upload_sgpr, 0));
1595 {
1596 dgc_cs_begin(cs);
1597 dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 1, 0));
1598 dgc_cs_emit(upload_sgpr);
1599 dgc_cs_emit(nir_iadd(b, load_param32(b, upload_addr), nir_load_var(b, cs->upload_offset)));
1600 dgc_cs_end();
1601 }
1602 nir_pop_if(b, NULL);
1603
1604 nir_push_if(b, nir_ine_imm(b, inline_sgpr, 0));
1605 {
1606 nir_variable *pc_idx = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "pc_idx");
1607 nir_store_var(b, pc_idx, nir_imm_int(b, 0), 0x1);
1608
1609 for (uint32_t i = 0; i < pipeline_layout->push_constant_size / 4; i++) {
1610 nir_push_if(b, nir_ine_imm(b, nir_iand_imm(b, inline_mask, 1ull << i), 0));
1611 {
1612 nir_def *data = NULL;
1613
1614 if (layout->sequence_index_mask & (1ull << i)) {
1615 data = sequence_id;
1616 } else if (layout->push_constant_mask & (1ull << i)) {
1617 data = nir_build_load_global(b, 1, 32, nir_iadd_imm(b, stream_addr, layout->push_constant_offsets[i]),
1618 .access = ACCESS_NON_WRITEABLE);
1619 } else if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) {
1620                /* For indirect pipeline binds, partial push constant updates can't be emitted when
1621                 * the DGC execute is called because there is no bound pipeline, so they have to be
1622                 * emitted from the DGC prepare shader.
1623                 */
1624 data = nir_load_ssbo(b, 1, 32, params->buf, nir_iadd_imm(b, params->const_offset, i * 4));
1625 }
1626
1627 if (data) {
1628 dgc_cs_begin(cs);
1629 dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 1, 0));
1630 dgc_cs_emit(nir_iadd(b, inline_sgpr, nir_load_var(b, pc_idx)));
1631 dgc_cs_emit(data);
1632 dgc_cs_end();
1633 }
1634
1635 nir_store_var(b, pc_idx, nir_iadd_imm(b, nir_load_var(b, pc_idx), 1), 0x1);
1636 }
1637 nir_pop_if(b, NULL);
1638 }
1639 }
1640 nir_pop_if(b, NULL);
1641 }
1642
1643 static void
1644 dgc_emit_push_constant(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id, VkShaderStageFlags stages)
1645 {
1646 const struct dgc_pc_params params = dgc_get_pc_params(cs);
1647 nir_builder *b = cs->b;
1648
1649 nir_def *push_constant_stages = dgc_get_push_constant_stages(cs);
1650 radv_foreach_stage(s, stages)
1651 {
1652 nir_push_if(b, nir_test_mask(b, push_constant_stages, mesa_to_vk_shader_stage(s)));
1653 {
1654          dgc_emit_push_constant_for_stage(cs, stream_addr, sequence_id, &params, s);
1655 }
1656 nir_pop_if(b, NULL);
1657 }
1658
1659 nir_def *const_copy = dgc_push_constant_needs_copy(cs);
1660 nir_push_if(b, const_copy);
1661 {
1662       dgc_alloc_push_constant(cs, stream_addr, sequence_id, &params);
1663 }
1664 nir_pop_if(b, NULL);
1665 }
1666
1667 /**
1668 * Vertex buffers
1669 */
1670 struct dgc_vbo_info {
1671 nir_def *va;
1672 nir_def *size;
1673 nir_def *stride;
1674
1675 nir_def *attrib_end;
1676 nir_def *attrib_index_offset;
1677
1678 nir_def *non_trivial_format;
1679 };
1680
1681 static nir_def *
1682 dgc_get_rsrc3_vbo_desc(struct dgc_cmdbuf *cs, const struct dgc_vbo_info *vbo_info)
1683 {
1684 const struct radv_device *device = cs->dev;
1685 const struct radv_physical_device *pdev = radv_device_physical(device);
1686 nir_builder *b = cs->b;
1687
1688 uint32_t rsrc_word3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
1689 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
1690
1691 if (pdev->info.gfx_level >= GFX10) {
1692 rsrc_word3 |= S_008F0C_FORMAT_GFX10(V_008F0C_GFX10_FORMAT_32_UINT);
1693 } else {
1694 rsrc_word3 |=
1695 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
1696 }
1697
1698 nir_def *uses_dynamic_inputs = nir_ieq_imm(b, load_param8(b, dynamic_vs_input), 1);
1699 nir_def *uses_non_trivial_format = nir_iand(b, uses_dynamic_inputs, nir_ine_imm(b, vbo_info->non_trivial_format, 0));
1700
1701 return nir_bcsel(b, uses_non_trivial_format, vbo_info->non_trivial_format, nir_imm_int(b, rsrc_word3));
1702 }
1703
1704 static void
1705 dgc_write_vertex_descriptor(struct dgc_cmdbuf *cs, const struct dgc_vbo_info *vbo_info, nir_variable *desc)
1706 {
1707 const struct radv_device *device = cs->dev;
1708 const struct radv_physical_device *pdev = radv_device_physical(device);
1709 nir_builder *b = cs->b;
1710
1711 nir_variable *num_records = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "num_records");
1712 nir_store_var(b, num_records, vbo_info->size, 0x1);
1713
1714 nir_def *use_per_attribute_vb_descs = nir_ieq_imm(b, load_param8(b, use_per_attribute_vb_descs), 1);
1715 nir_push_if(b, use_per_attribute_vb_descs);
1716 {
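      /* Per-attribute descriptors: clamp num_records so that fetching this attribute stays in
       * bounds. If the buffer cannot hold even one attribute, use 0; with a zero stride a single
       * record is enough; otherwise count how many whole strides fit past attrib_end and add the
       * attribute index offset.
       */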
1717 nir_push_if(b, nir_ult(b, nir_load_var(b, num_records), vbo_info->attrib_end));
1718 {
1719 nir_store_var(b, num_records, nir_imm_int(b, 0), 0x1);
1720 }
1721 nir_push_else(b, NULL);
1722 nir_push_if(b, nir_ieq_imm(b, vbo_info->stride, 0));
1723 {
1724 nir_store_var(b, num_records, nir_imm_int(b, 1), 0x1);
1725 }
1726 nir_push_else(b, NULL);
1727 {
1728 nir_def *r = nir_iadd(
1729 b,
1730 nir_iadd_imm(
1731 b, nir_udiv(b, nir_isub(b, nir_load_var(b, num_records), vbo_info->attrib_end), vbo_info->stride), 1),
1732 vbo_info->attrib_index_offset);
1733 nir_store_var(b, num_records, r, 0x1);
1734 }
1735 nir_pop_if(b, NULL);
1736 nir_pop_if(b, NULL);
1737
1738 nir_def *convert_cond = nir_ine_imm(b, nir_load_var(b, num_records), 0);
1739 if (pdev->info.gfx_level == GFX9)
1740 convert_cond = nir_imm_false(b);
1741 else if (pdev->info.gfx_level != GFX8)
1742 convert_cond = nir_iand(b, convert_cond, nir_ieq_imm(b, vbo_info->stride, 0));
1743
1744 nir_def *new_records = nir_iadd(
1745 b, nir_imul(b, nir_iadd_imm(b, nir_load_var(b, num_records), -1), vbo_info->stride), vbo_info->attrib_end);
1746 new_records = nir_bcsel(b, convert_cond, new_records, nir_load_var(b, num_records));
1747 nir_store_var(b, num_records, new_records, 0x1);
1748 }
1749 nir_push_else(b, NULL);
1750 {
1751 if (pdev->info.gfx_level != GFX8) {
1752 nir_push_if(b, nir_ine_imm(b, vbo_info->stride, 0));
1753 {
1754 nir_def *r = nir_iadd(b, nir_load_var(b, num_records), nir_iadd_imm(b, vbo_info->stride, -1));
1755 nir_store_var(b, num_records, nir_udiv(b, r, vbo_info->stride), 0x1);
1756 }
1757 nir_pop_if(b, NULL);
1758 }
1759 }
1760 nir_pop_if(b, NULL);
1761
1762 nir_def *rsrc_word3 = dgc_get_rsrc3_vbo_desc(cs, vbo_info);
1763 if (pdev->info.gfx_level >= GFX10) {
1764 nir_def *oob_select = nir_bcsel(b, nir_ieq_imm(b, vbo_info->stride, 0), nir_imm_int(b, V_008F0C_OOB_SELECT_RAW),
1765 nir_imm_int(b, V_008F0C_OOB_SELECT_STRUCTURED));
1766 rsrc_word3 = nir_iand_imm(b, rsrc_word3, C_008F0C_OOB_SELECT);
1767 rsrc_word3 = nir_ior(b, rsrc_word3, nir_ishl_imm(b, oob_select, 28));
1768 }
1769
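   /* Pack the buffer descriptor: word0 = VA[31:0], word1 = stride[13:0] << 16 | VA[47:32],
    * word2 = num_records and word3 = dst_sel/format (with OOB_SELECT patched on GFX10+).
    */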
1770 nir_def *va_hi = nir_iand_imm(b, nir_unpack_64_2x32_split_y(b, vbo_info->va), 0xFFFF);
1771 nir_def *stride = nir_iand_imm(b, vbo_info->stride, 0x3FFF);
1772 nir_def *new_vbo_data[4] = {nir_unpack_64_2x32_split_x(b, vbo_info->va),
1773 nir_ior(b, nir_ishl_imm(b, stride, 16), va_hi), nir_load_var(b, num_records),
1774 rsrc_word3};
1775 nir_store_var(b, desc, nir_vec(b, new_vbo_data, 4), 0xf);
1776
1777 /* On GFX9, it seems bounds checking is disabled if both
1778 * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
1779 * GFX10.3 but it doesn't hurt.
1780 */
1781 nir_def *buf_va =
1782 nir_iand_imm(b, nir_pack_64_2x32(b, nir_trim_vector(b, nir_load_var(b, desc), 2)), (1ull << 48) - 1ull);
1783 nir_push_if(b, nir_ior(b, nir_ieq_imm(b, nir_load_var(b, num_records), 0), nir_ieq_imm(b, buf_va, 0)));
1784 {
1785 nir_def *has_dynamic_vs_input = nir_ieq_imm(b, load_param8(b, dynamic_vs_input), 1);
1786
1787 new_vbo_data[0] = nir_imm_int(b, 0);
1788 new_vbo_data[1] = nir_bcsel(b, has_dynamic_vs_input, nir_imm_int(b, S_008F04_STRIDE(16)), nir_imm_int(b, 0));
1789 new_vbo_data[2] = nir_imm_int(b, 0);
1790 new_vbo_data[3] = nir_bcsel(b, has_dynamic_vs_input, nir_channel(b, nir_load_var(b, desc), 3), nir_imm_int(b, 0));
1791
1792 nir_store_var(b, desc, nir_vec(b, new_vbo_data, 4), 0xf);
1793 }
1794 nir_pop_if(b, NULL);
1795 }
1796
1797 static void
1798 dgc_emit_vertex_buffer(struct dgc_cmdbuf *cs, nir_def *stream_addr)
1799 {
1800 const struct radv_indirect_command_layout *layout = cs->layout;
1801 nir_builder *b = cs->b;
1802
1803 nir_def *vb_desc_usage_mask = load_param32(b, vb_desc_usage_mask);
1804 nir_def *vbo_cnt = nir_bit_count(b, vb_desc_usage_mask);
1805
1806 nir_push_if(b, nir_ine_imm(b, vbo_cnt, 0));
1807 {
1808 dgc_cs_begin(cs);
1809 dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 1, 0));
1810 dgc_cs_emit(load_param16(b, vbo_reg));
1811 dgc_cs_emit(nir_iadd(b, load_param32(b, upload_addr), nir_load_var(b, cs->upload_offset)));
1812 dgc_cs_end();
1813 }
1814 nir_pop_if(b, NULL);
1815
1816 nir_variable *vbo_idx = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "vbo_idx");
1817 nir_store_var(b, vbo_idx, nir_imm_int(b, 0), 0x1);
1818
1819 nir_push_loop(b);
1820 {
1821 nir_def *cur_idx = nir_load_var(b, vbo_idx);
1822
1823 nir_break_if(b, nir_uge_imm(b, cur_idx, 32 /* bits in vb_desc_usage_mask */));
1824
1825 nir_def *l = nir_ishl(b, nir_imm_int(b, 1), cur_idx);
1826 nir_push_if(b, nir_ieq_imm(b, nir_iand(b, l, vb_desc_usage_mask), 0));
1827 {
1828 nir_store_var(b, vbo_idx, nir_iadd_imm(b, cur_idx, 1), 0x1);
1829 nir_jump(b, nir_jump_continue);
1830 }
1831 nir_pop_if(b, NULL);
1832
1833 nir_variable *va_var = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint64_t_type(), "va_var");
1834 nir_variable *size_var = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "size_var");
1835 nir_variable *stride_var = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "stride_var");
1836
1837 nir_def *binding = load_vbo_metadata32(cs, cur_idx, binding);
1838
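      /* Bindings written by the DGC stream (vertex buffer token) fetch VA/size/stride from the
       * stream, the others reuse the values captured in the VBO metadata.
       */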
1839 nir_def *vbo_override = nir_ine_imm(
1840 b, nir_iand(b, nir_imm_int(b, layout->vk.vertex_bindings), nir_ishl(b, nir_imm_int(b, 1), binding)), 0);
1841 nir_push_if(b, vbo_override);
1842 {
1843 nir_def *stream_offset = load_vbo_offset(cs, cur_idx);
1844 nir_def *stream_data = nir_build_load_global(b, 4, 32, nir_iadd(b, stream_addr, nir_u2u64(b, stream_offset)),
1845 .access = ACCESS_NON_WRITEABLE);
1846
1847 nir_def *va = nir_pack_64_2x32(b, nir_trim_vector(b, stream_data, 2));
1848 nir_def *size = nir_channel(b, stream_data, 2);
1849
1850 nir_def *stride = nir_channel(b, stream_data, 3);
1851
1852 nir_store_var(b, va_var, va, 0x1);
1853 nir_store_var(b, size_var, size, 0x1);
1854 nir_store_var(b, stride_var, stride, 0x1);
1855 }
1856 nir_push_else(b, NULL);
1857 {
1858 nir_store_var(b, va_var, load_vbo_metadata64(cs, cur_idx, va), 0x1);
1859 nir_store_var(b, size_var, load_vbo_metadata32(cs, cur_idx, size), 0x1);
1860 nir_store_var(b, stride_var, load_vbo_metadata32(cs, cur_idx, stride), 0x1);
1861 }
1862 nir_pop_if(b, NULL);
1863
1864 nir_def *attrib_index_offset = load_vbo_metadata32(cs, cur_idx, attrib_index_offset);
1865 nir_def *non_trivial_format = load_vbo_metadata32(cs, cur_idx, non_trivial_format);
1866 nir_def *attrib_offset = load_vbo_metadata32(cs, cur_idx, attrib_offset);
1867 nir_def *attrib_format_size = load_vbo_metadata32(cs, cur_idx, attrib_format_size);
1868 nir_def *attrib_end = nir_iadd(b, attrib_offset, attrib_format_size);
1869
1870 nir_def *has_dynamic_vs_input = nir_ieq_imm(b, load_param8(b, dynamic_vs_input), 1);
1871 nir_def *va = nir_iadd(b, nir_load_var(b, va_var),
1872 nir_bcsel(b, has_dynamic_vs_input, nir_u2u64(b, attrib_offset), nir_imm_int64(b, 0)));
1873
1874 struct dgc_vbo_info vbo_info = {
1875 .va = va,
1876 .size = nir_load_var(b, size_var),
1877 .stride = nir_load_var(b, stride_var),
1878 .attrib_end = attrib_end,
1879 .attrib_index_offset = attrib_index_offset,
1880 .non_trivial_format = non_trivial_format,
1881 };
1882
1883 nir_variable *vbo_data = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uvec4_type(), "vbo_data");
1884
1885 dgc_write_vertex_descriptor(cs, &vbo_info, vbo_data);
1886
1887 dgc_upload(cs, nir_load_var(b, vbo_data));
1888
1889 nir_store_var(b, vbo_idx, nir_iadd_imm(b, cur_idx, 1), 0x1);
1890 }
1891 nir_pop_loop(b, NULL);
1892 }
1893
1894 /**
1895 * Compute dispatch
1896 */
1897 static nir_def *
1898 dgc_get_dispatch_initiator(struct dgc_cmdbuf *cs)
1899 {
1900 const struct radv_device *device = cs->dev;
1901 nir_builder *b = cs->b;
1902
1903 const uint32_t dispatch_initiator = device->dispatch_initiator | S_00B800_FORCE_START_AT_000(1);
1904 nir_def *is_wave32 = nir_ieq_imm(b, load_shader_metadata32(cs, wave32), 1);
1905 return nir_bcsel(b, is_wave32, nir_imm_int(b, dispatch_initiator | S_00B800_CS_W32_EN(1)),
1906 nir_imm_int(b, dispatch_initiator));
1907 }
1908
1909 static void
1910 dgc_emit_grid_size_user_sgpr(struct dgc_cmdbuf *cs, nir_def *grid_base_sgpr, nir_def *wg_x, nir_def *wg_y,
1911 nir_def *wg_z)
1912 {
1913 dgc_cs_begin(cs);
1914 dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 3, 0));
1915 dgc_cs_emit(grid_base_sgpr);
1916 dgc_cs_emit(wg_x);
1917 dgc_cs_emit(wg_y);
1918 dgc_cs_emit(wg_z);
1919 dgc_cs_end();
1920 }
1921
1922 static void
1923 dgc_emit_grid_size_pointer(struct dgc_cmdbuf *cs, nir_def *grid_base_sgpr, nir_def *size_va)
1924 {
1925 nir_builder *b = cs->b;
1926
1927 nir_def *va_lo = nir_unpack_64_2x32_split_x(b, size_va);
1928 nir_def *va_hi = nir_unpack_64_2x32_split_y(b, size_va);
1929
1930 dgc_cs_begin(cs);
1931 dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 2, 0));
1932 dgc_cs_emit(grid_base_sgpr);
1933 dgc_cs_emit(va_lo);
1934 dgc_cs_emit(va_hi);
1935 dgc_cs_end();
1936 }
1937
1938 static void
1939 dgc_emit_dispatch_direct(struct dgc_cmdbuf *cs, nir_def *wg_x, nir_def *wg_y, nir_def *wg_z,
1940 nir_def *dispatch_initiator, nir_def *grid_sgpr, nir_def *size_va, nir_def *sequence_id,
1941 bool is_rt)
1942 {
1943 const struct radv_device *device = cs->dev;
1944 nir_builder *b = cs->b;
1945
1946 nir_push_if(b, nir_iand(b, nir_ine_imm(b, wg_x, 0), nir_iand(b, nir_ine_imm(b, wg_y, 0), nir_ine_imm(b, wg_z, 0))));
1947 {
1948 nir_push_if(b, nir_ine_imm(b, grid_sgpr, 0));
1949 {
1950 if (device->load_grid_size_from_user_sgpr) {
1951 dgc_emit_grid_size_user_sgpr(cs, grid_sgpr, wg_x, wg_y, wg_z);
1952 } else {
1953 dgc_emit_grid_size_pointer(cs, grid_sgpr, size_va);
1954 }
1955 }
1956 nir_pop_if(b, 0);
1957
1958 dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDispatch);
1959 dgc_emit_sqtt_marker_event_with_dims(
1960 cs, sequence_id, wg_x, wg_y, wg_z,
1961 is_rt ? EventCmdTraceRaysKHR | ApiRayTracingSeparateCompiled : EventCmdDispatch);
1962
1963 dgc_cs_begin(cs);
1964 dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1));
1965 dgc_cs_emit(wg_x);
1966 dgc_cs_emit(wg_y);
1967 dgc_cs_emit(wg_z);
1968 dgc_cs_emit(dispatch_initiator);
1969 dgc_cs_end();
1970
1971 dgc_emit_sqtt_thread_trace_marker(cs);
1972 dgc_emit_sqtt_end_api_marker(cs, ApiCmdDispatch);
1973 }
1974 nir_pop_if(b, 0);
1975 }
1976
1977 static void
1978 dgc_emit_dispatch(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id)
1979 {
1980 const struct radv_indirect_command_layout *layout = cs->layout;
1981 nir_builder *b = cs->b;
1982
1983 nir_def *dispatch_data = nir_build_load_global(
1984 b, 3, 32, nir_iadd_imm(b, stream_addr, layout->vk.dispatch_src_offset_B), .access = ACCESS_NON_WRITEABLE);
1985 nir_def *wg_x = nir_channel(b, dispatch_data, 0);
1986 nir_def *wg_y = nir_channel(b, dispatch_data, 1);
1987 nir_def *wg_z = nir_channel(b, dispatch_data, 2);
1988
1989 nir_def *grid_sgpr = load_shader_metadata32(cs, grid_base_sgpr);
1990 nir_def *dispatch_initiator = dgc_get_dispatch_initiator(cs);
1991 nir_def *size_va = nir_iadd_imm(b, stream_addr, layout->vk.dispatch_src_offset_B);
1992
1993 dgc_emit_dispatch_direct(cs, wg_x, wg_y, wg_z, dispatch_initiator, grid_sgpr, size_va, sequence_id, false);
1994 }
1995
1996 /**
1997 * Draw mesh/task
1998 */
1999 static void
2000 dgc_emit_userdata_mesh(struct dgc_cmdbuf *cs, nir_def *x, nir_def *y, nir_def *z, nir_def *drawid)
2001 {
2002 nir_builder *b = cs->b;
2003
2004 nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
2005 vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr);
2006
2007 nir_def *has_grid_size = nir_test_mask(b, vtx_base_sgpr, DGC_USES_GRID_SIZE);
2008 nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
2009
2010 nir_push_if(b, nir_ior(b, has_grid_size, has_drawid));
2011 {
2012 nir_def *pkt_cnt = nir_imm_int(b, 0);
2013 pkt_cnt = nir_bcsel(b, has_grid_size, nir_iadd_imm(b, pkt_cnt, 3), pkt_cnt);
2014 pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
2015
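      /* The SET_SH_REG payload only covers the SGPRs that are present; the remaining slots are
       * filled with one-dword NOPs so the number of emitted dwords stays constant.
       */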
2016 dgc_cs_begin(cs);
2017 dgc_cs_emit(nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt));
2018 dgc_cs_emit(nir_iand_imm(b, vtx_base_sgpr, 0x3FFF));
2019 /* DrawID needs to be first if no GridSize. */
2020 dgc_cs_emit(nir_bcsel(b, has_grid_size, x, drawid));
2021 dgc_cs_emit(nir_bcsel(b, has_grid_size, y, nir_imm_int(b, PKT3_NOP_PAD)));
2022 dgc_cs_emit(nir_bcsel(b, has_grid_size, z, nir_imm_int(b, PKT3_NOP_PAD)));
2023 dgc_cs_emit(nir_bcsel(b, has_drawid, drawid, nir_imm_int(b, PKT3_NOP_PAD)));
2024 dgc_cs_end();
2025 }
2026 nir_pop_if(b, NULL);
2027 }
2028
2029 static void
2030 dgc_emit_dispatch_mesh_direct(struct dgc_cmdbuf *cs, nir_def *x, nir_def *y, nir_def *z)
2031 {
2032 dgc_cs_begin(cs);
2033 dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_MESH_DIRECT, 3, 0));
2034 dgc_cs_emit(x);
2035 dgc_cs_emit(y);
2036 dgc_cs_emit(z);
2037 dgc_cs_emit_imm(S_0287F0_SOURCE_SELECT(V_0287F0_DI_SRC_SEL_AUTO_INDEX));
2038 dgc_cs_end();
2039 }
2040
2041 static void
2042 dgc_emit_dispatch_taskmesh_gfx(struct dgc_cmdbuf *cs, nir_def *sequence_id)
2043 {
2044 const struct radv_device *device = cs->dev;
2045 const struct radv_physical_device *pdev = radv_device_physical(device);
2046 nir_builder *b = cs->b;
2047
2048 nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
2049 nir_def *has_grid_size = nir_test_mask(b, vtx_base_sgpr, DGC_USES_GRID_SIZE);
2050 nir_def *has_linear_dispatch_en = nir_ieq_imm(b, load_param8(b, linear_dispatch_en), 1);
2051
2052 nir_def *base_reg = nir_iand_imm(b, vtx_base_sgpr, 0x3FFF);
2053 nir_def *xyz_dim_reg = nir_bcsel(b, has_grid_size, base_reg, nir_imm_int(b, 0));
2054 nir_def *ring_entry_reg = load_param16(b, mesh_ring_entry_sgpr);
2055
2056 nir_def *xyz_dim_enable = nir_bcsel(b, has_grid_size, nir_imm_int(b, S_4D1_XYZ_DIM_ENABLE(1)), nir_imm_int(b, 0));
2057 nir_def *mode1_enable = nir_imm_int(b, S_4D1_MODE1_ENABLE(!pdev->mesh_fast_launch_2));
2058 nir_def *linear_dispatch_en =
2059 nir_bcsel(b, has_linear_dispatch_en, nir_imm_int(b, S_4D1_LINEAR_DISPATCH_ENABLE(1)), nir_imm_int(b, 0));
2060 nir_def *sqtt_enable = nir_imm_int(b, device->sqtt.bo ? S_4D1_THREAD_TRACE_MARKER_ENABLE(1) : 0);
2061
2062 dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDrawMeshTasksEXT);
2063 dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDrawMeshTasksEXT);
2064
2065 dgc_cs_begin(cs);
2066 dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_TASKMESH_GFX, 2, 0) | PKT3_RESET_FILTER_CAM_S(1));
2067 /* S_4D0_RING_ENTRY_REG(ring_entry_reg) | S_4D0_XYZ_DIM_REG(xyz_dim_reg) */
2068 dgc_cs_emit(nir_ior(b, xyz_dim_reg, nir_ishl_imm(b, ring_entry_reg, 16)));
2069 if (pdev->info.gfx_level >= GFX11) {
2070 dgc_cs_emit(nir_ior(b, xyz_dim_enable, nir_ior(b, mode1_enable, nir_ior(b, linear_dispatch_en, sqtt_enable))));
2071 } else {
2072 dgc_cs_emit(sqtt_enable);
2073 }
2074 dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_AUTO_INDEX);
2075 dgc_cs_end();
2076
2077 dgc_emit_sqtt_thread_trace_marker(cs);
2078 dgc_emit_sqtt_end_api_marker(cs, ApiCmdDrawMeshTasksEXT);
2079 }
2080
2081 static void
2082 dgc_emit_draw_mesh_tasks_gfx(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id)
2083 {
2084 const struct radv_indirect_command_layout *layout = cs->layout;
2085 const struct radv_device *device = cs->dev;
2086 const struct radv_physical_device *pdev = radv_device_physical(device);
2087 nir_builder *b = cs->b;
2088
2089 nir_def *draw_data = nir_build_load_global(b, 3, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
2090 .access = ACCESS_NON_WRITEABLE);
2091 nir_def *x = nir_channel(b, draw_data, 0);
2092 nir_def *y = nir_channel(b, draw_data, 1);
2093 nir_def *z = nir_channel(b, draw_data, 2);
2094
2095 nir_push_if(b, nir_iand(b, nir_ine_imm(b, x, 0), nir_iand(b, nir_ine_imm(b, y, 0), nir_ine_imm(b, z, 0))));
2096 {
2097 nir_push_if(b, nir_ieq_imm(b, load_param8(b, has_task_shader), 1));
2098 {
2099 dgc_emit_dispatch_taskmesh_gfx(cs, sequence_id);
2100 }
2101 nir_push_else(b, NULL);
2102 {
2103 dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDrawMeshTasksEXT);
2104 dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDrawMeshTasksEXT);
2105
2106 dgc_emit_userdata_mesh(cs, x, y, z, sequence_id);
2107 dgc_emit_instance_count(cs, nir_imm_int(b, 1));
2108
2109 if (pdev->mesh_fast_launch_2) {
2110 dgc_emit_dispatch_mesh_direct(cs, x, y, z);
2111 } else {
2112 nir_def *vertex_count = nir_imul(b, x, nir_imul(b, y, z));
2113 dgc_emit_draw_index_auto(cs, vertex_count);
2114 }
2115
2116 dgc_emit_sqtt_thread_trace_marker(cs);
2117 dgc_emit_sqtt_end_api_marker(cs, ApiCmdDrawMeshTasksEXT);
2118 }
2119 nir_pop_if(b, NULL);
2120 }
2121 nir_pop_if(b, NULL);
2122 }
2123
2124 static void
2125 dgc_emit_draw_mesh_tasks_with_count_gfx(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id)
2126 {
2127 const struct radv_indirect_command_layout *layout = cs->layout;
2128 const struct radv_device *device = cs->dev;
2129 const struct radv_physical_device *pdev = radv_device_physical(device);
2130 nir_builder *b = cs->b;
2131
2132 nir_push_if(b, nir_ieq_imm(b, load_param8(b, has_task_shader), 1));
2133 {
2134 dgc_emit_dispatch_taskmesh_gfx(cs, sequence_id);
2135 }
2136 nir_push_else(b, NULL);
2137 {
2138 nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
2139 nir_def *has_grid_size = nir_test_mask(b, vtx_base_sgpr, DGC_USES_GRID_SIZE);
2140 nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
2141
2142 nir_def *draw_data = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
2143 .access = ACCESS_NON_WRITEABLE);
2144 nir_def *va = nir_pack_64_2x32(b, nir_channels(b, draw_data, 0x3));
2145 nir_def *stride = nir_channel(b, draw_data, 2);
2146 nir_def *draw_count = nir_umin(b, load_param32(b, max_draw_count), nir_channel(b, draw_data, 3));
2147
2148 dgc_emit_pkt3_set_base(cs, va);
2149
2150 nir_def *base_reg = nir_iand_imm(b, vtx_base_sgpr, 0x3FFF);
2151 nir_def *xyz_dim_reg = nir_bcsel(b, has_grid_size, base_reg, nir_imm_int(b, 0));
2152 nir_def *draw_id_offset = nir_bcsel(b, has_grid_size, nir_imm_int(b, 3), nir_imm_int(b, 0));
2153 nir_def *draw_id_reg = nir_bcsel(b, has_drawid, nir_iadd(b, base_reg, draw_id_offset), nir_imm_int(b, 0));
2154
2155 nir_push_if(b, has_drawid);
2156 {
2157 nir_def *packet[3] = {nir_imm_int(b, PKT3(PKT3_SET_SH_REG, 1, 0)), draw_id_reg, nir_imm_int(b, 0)};
2158 dgc_emit(cs, 3, packet);
2159 }
2160 nir_pop_if(b, NULL);
2161
2162 nir_def *draw_index_enable =
2163 nir_bcsel(b, has_drawid, nir_imm_int(b, S_4C2_DRAW_INDEX_ENABLE(1)), nir_imm_int(b, 0));
2164 nir_def *xyz_dim_enable = nir_bcsel(b, has_grid_size, nir_imm_int(b, S_4C2_XYZ_DIM_ENABLE(1)), nir_imm_int(b, 0));
2165
2166 dgc_emit_sqtt_begin_api_marker(cs, ApiCmdDrawMeshTasksIndirectCountEXT);
2167 dgc_emit_sqtt_marker_event(cs, sequence_id, EventCmdDrawMeshTasksIndirectCountEXT);
2168
2169 dgc_cs_begin(cs);
2170 dgc_cs_emit(nir_imm_int(b, PKT3(PKT3_DISPATCH_MESH_INDIRECT_MULTI, 7, false) | PKT3_RESET_FILTER_CAM_S(1)));
2171 dgc_cs_emit_imm(0); /* data offset */
2172 /* S_4C1_XYZ_DIM_REG(xyz_dim_reg) | S_4C1_DRAW_INDEX_REG(draw_id_reg) */
2173 dgc_cs_emit(
2174 nir_ior(b, nir_iand_imm(b, xyz_dim_reg, 0xFFFF), nir_ishl_imm(b, nir_iand_imm(b, draw_id_reg, 0xFFFF), 16)));
2175 if (pdev->info.gfx_level >= GFX11) {
2176 dgc_cs_emit(nir_ior_imm(b, nir_ior(b, draw_index_enable, xyz_dim_enable),
2177 S_4C2_MODE1_ENABLE(!pdev->mesh_fast_launch_2)));
2178 } else {
2179 dgc_cs_emit(draw_index_enable);
2180 }
2181 dgc_cs_emit(draw_count);
2182 dgc_cs_emit_imm(0);
2183 dgc_cs_emit_imm(0);
2184 dgc_cs_emit(stride);
2185 dgc_cs_emit_imm(V_0287F0_DI_SRC_SEL_AUTO_INDEX);
2186 dgc_cs_end();
2187
2188 dgc_emit_sqtt_thread_trace_marker(cs);
2189 dgc_emit_sqtt_end_api_marker(cs, ApiCmdDrawMeshTasksIndirectCountEXT);
2190 }
2191 nir_pop_if(b, NULL);
2192 }
2193
2194 static void
2195 dgc_emit_userdata_task(struct dgc_cmdbuf *ace_cs, nir_def *x, nir_def *y, nir_def *z)
2196 {
2197 nir_builder *b = ace_cs->b;
2198
2199 nir_def *xyz_sgpr = load_param16(b, task_xyz_sgpr);
2200 nir_push_if(b, nir_ine_imm(b, xyz_sgpr, 0));
2201 {
2202 dgc_cs_begin(ace_cs);
2203 dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 3, 0));
2204 dgc_cs_emit(xyz_sgpr);
2205 dgc_cs_emit(x);
2206 dgc_cs_emit(y);
2207 dgc_cs_emit(z);
2208 dgc_cs_end();
2209 }
2210 nir_pop_if(b, NULL);
2211
2212 nir_def *draw_id_sgpr = load_param16(b, task_draw_id_sgpr);
2213 nir_push_if(b, nir_ine_imm(b, draw_id_sgpr, 0));
2214 {
2215 dgc_cs_begin(ace_cs);
2216 dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 1, 0));
2217 dgc_cs_emit(draw_id_sgpr);
2218 dgc_cs_emit_imm(0);
2219 dgc_cs_end();
2220 }
2221 nir_pop_if(b, NULL);
2222 }
2223
2224 static nir_def *
2225 dgc_get_dispatch_initiator_task(struct dgc_cmdbuf *ace_cs)
2226 {
2227 const struct radv_device *device = ace_cs->dev;
2228 const uint32_t dispatch_initiator_task = device->dispatch_initiator_task;
2229 nir_builder *b = ace_cs->b;
2230
2231 nir_def *is_wave32 = nir_ieq_imm(b, load_param8(b, wave32), 1);
2232 return nir_bcsel(b, is_wave32, nir_imm_int(b, dispatch_initiator_task | S_00B800_CS_W32_EN(1)),
2233 nir_imm_int(b, dispatch_initiator_task));
2234 }
2235
2236 static void
2237 dgc_emit_dispatch_taskmesh_direct_ace(struct dgc_cmdbuf *ace_cs, nir_def *x, nir_def *y, nir_def *z)
2238 {
2239 nir_def *dispatch_initiator = dgc_get_dispatch_initiator_task(ace_cs);
2240 nir_builder *b = ace_cs->b;
2241
2242 dgc_cs_begin(ace_cs);
2243 dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_TASKMESH_DIRECT_ACE, 4, 0) | PKT3_SHADER_TYPE_S(1));
2244 dgc_cs_emit(x);
2245 dgc_cs_emit(y);
2246 dgc_cs_emit(z);
2247 dgc_cs_emit(dispatch_initiator);
2248 dgc_cs_emit(load_param16(b, task_ring_entry_sgpr));
2249 dgc_cs_end();
2250 }
2251
2252 static void
2253 dgc_emit_draw_mesh_tasks_ace(struct dgc_cmdbuf *ace_cs, nir_def *stream_addr)
2254 {
2255 const struct radv_indirect_command_layout *layout = ace_cs->layout;
2256 nir_builder *b = ace_cs->b;
2257
2258 nir_def *draw_data = nir_build_load_global(b, 3, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
2259 .access = ACCESS_NON_WRITEABLE);
2260 nir_def *x = nir_channel(b, draw_data, 0);
2261 nir_def *y = nir_channel(b, draw_data, 1);
2262 nir_def *z = nir_channel(b, draw_data, 2);
2263
2264 nir_push_if(b, nir_iand(b, nir_ine_imm(b, x, 0), nir_iand(b, nir_ine_imm(b, y, 0), nir_ine_imm(b, z, 0))));
2265 {
2266 dgc_emit_userdata_task(ace_cs, x, y, z);
2267 dgc_emit_dispatch_taskmesh_direct_ace(ace_cs, x, y, z);
2268 }
2269 nir_pop_if(b, NULL);
2270 }
2271
2272 static void
2273 dgc_emit_draw_mesh_tasks_with_count_ace(struct dgc_cmdbuf *ace_cs, nir_def *stream_addr, nir_def *sequence_id)
2274 {
2275 const struct radv_indirect_command_layout *layout = ace_cs->layout;
2276 nir_builder *b = ace_cs->b;
2277
2278 nir_def *draw_data = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, stream_addr, layout->vk.draw_src_offset_B),
2279 .access = ACCESS_NON_WRITEABLE);
2280 nir_def *va_lo = nir_channel(b, draw_data, 0);
2281 nir_def *va_hi = nir_channel(b, draw_data, 1);
2282 nir_def *stride = nir_channel(b, draw_data, 2);
2283 nir_def *draw_count = nir_umin(b, load_param32(b, max_draw_count), nir_channel(b, draw_data, 3));
2284
2285 nir_def *xyz_dim_reg = load_param16(b, task_xyz_sgpr);
2286 nir_def *ring_entry_reg = load_param16(b, task_ring_entry_sgpr);
2287 nir_def *draw_id_reg = load_param16(b, task_draw_id_sgpr);
2288
2289 nir_def *draw_index_enable =
2290 nir_bcsel(b, nir_ine_imm(b, draw_id_reg, 0), nir_imm_int(b, S_AD3_DRAW_INDEX_ENABLE(1)), nir_imm_int(b, 0));
2291 nir_def *xyz_dim_enable =
2292 nir_bcsel(b, nir_ine_imm(b, xyz_dim_reg, 0), nir_imm_int(b, S_AD3_XYZ_DIM_ENABLE(1)), nir_imm_int(b, 0));
2293
2294 nir_def *dispatch_initiator = dgc_get_dispatch_initiator_task(ace_cs);
2295
2296 dgc_cs_begin(ace_cs);
2297 dgc_cs_emit_imm(PKT3(PKT3_DISPATCH_TASKMESH_INDIRECT_MULTI_ACE, 9, 0) | PKT3_SHADER_TYPE_S(1));
2298 dgc_cs_emit(va_lo);
2299 dgc_cs_emit(va_hi);
2300 dgc_cs_emit(ring_entry_reg);
2301 dgc_cs_emit(nir_ior(b, draw_index_enable, nir_ior(b, xyz_dim_enable, nir_ishl_imm(b, draw_id_reg, 16))));
2302 dgc_cs_emit(xyz_dim_reg);
2303 dgc_cs_emit(draw_count);
2304 dgc_cs_emit_imm(0);
2305 dgc_cs_emit_imm(0);
2306 dgc_cs_emit(stride);
2307 dgc_cs_emit(dispatch_initiator);
2308 dgc_cs_end();
2309 }
2310
2311 /**
2312 * Indirect execution set
2313 */
2314 static void
2315 dgc_emit_indirect_sets(struct dgc_cmdbuf *cs)
2316 {
2317 nir_builder *b = cs->b;
2318
2319 nir_def *indirect_desc_sets_sgpr = load_shader_metadata32(cs, indirect_desc_sets_sgpr);
2320 nir_push_if(b, nir_ine_imm(b, indirect_desc_sets_sgpr, 0));
2321 {
2322 dgc_cs_begin(cs);
2323 dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 1, 0));
2324 dgc_cs_emit(indirect_desc_sets_sgpr);
2325 dgc_cs_emit(load_param32(b, indirect_desc_sets_va));
2326 dgc_cs_end();
2327 }
2328 nir_pop_if(b, NULL);
2329 }
2330
2331 static void
2332 dgc_emit_ies(struct dgc_cmdbuf *cs)
2333 {
2334 nir_builder *b = cs->b;
2335
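   /* The IES entry stores the compute pipeline metadata, followed by a dword count and the
    * pre-built register packets; copy those packets verbatim into the DGC command buffer before
    * emitting the indirect descriptor sets pointer.
    */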
2336 nir_def *va = nir_iadd_imm(b, cs->ies_va, sizeof(struct radv_compute_pipeline_metadata));
2337 nir_def *num_dw = nir_build_load_global(b, 1, 32, va, .access = ACCESS_NON_WRITEABLE);
2338 nir_def *cs_va = nir_iadd_imm(b, va, 4);
2339
2340 nir_variable *offset = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "offset");
2341 nir_store_var(b, offset, nir_imm_int(b, 0), 0x1);
2342
2343 nir_push_loop(b);
2344 {
2345 nir_def *cur_offset = nir_load_var(b, offset);
2346
2347 nir_break_if(b, nir_uge(b, cur_offset, num_dw));
2348
2349 nir_def *data = nir_build_load_global(b, 1, 32, nir_iadd(b, cs_va, nir_u2u64(b, nir_imul_imm(b, cur_offset, 4))),
2350 .access = ACCESS_NON_WRITEABLE);
2351
2352 dgc_cs_begin(cs);
2353 dgc_cs_emit(data);
2354 dgc_cs_end();
2355
2356 nir_store_var(b, offset, nir_iadd_imm(b, cur_offset, 1), 0x1);
2357 }
2358 nir_pop_loop(b, NULL);
2359
2360 dgc_emit_indirect_sets(cs);
2361 }
2362
2363 /**
2364 * Raytracing.
2365 */
2366 static void
2367 dgc_emit_shader_pointer(struct dgc_cmdbuf *cs, nir_def *sh_offset, nir_def *va)
2368 {
2369 nir_builder *b = cs->b;
2370
2371 nir_def *va_lo = nir_unpack_64_2x32_split_x(b, va);
2372 nir_def *va_hi = nir_unpack_64_2x32_split_y(b, va);
2373
2374 dgc_cs_begin(cs);
2375 dgc_cs_emit_imm(PKT3(PKT3_SET_SH_REG, 2, 0));
2376 dgc_cs_emit(sh_offset);
2377 dgc_cs_emit(va_lo);
2378 dgc_cs_emit(va_hi);
2379 dgc_cs_end();
2380 }
2381
2382 static void
2383 dgc_emit_rt(struct dgc_cmdbuf *cs, nir_def *stream_addr, nir_def *sequence_id)
2384 {
2385 const struct radv_indirect_command_layout *layout = cs->layout;
2386 const struct radv_device *device = cs->dev;
2387 nir_builder *b = cs->b;
2388
2389 nir_def *indirect_va = nir_iadd_imm(b, stream_addr, layout->vk.dispatch_src_offset_B);
2390
2391 nir_def *cs_sbt_descriptors = load_param16(b, cs_sbt_descriptors);
2392 nir_push_if(b, nir_ine_imm(b, cs_sbt_descriptors, 0));
2393 {
2394 dgc_emit_shader_pointer(cs, cs_sbt_descriptors, indirect_va);
2395 }
2396 nir_pop_if(b, NULL);
2397
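   /* The launch size lives in the trailing width/height/depth fields of
    * VkTraceRaysIndirectCommand2KHR and its address doubles as the dispatch size pointer.
    */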
2398 nir_def *launch_size_va = nir_iadd_imm(b, indirect_va, offsetof(VkTraceRaysIndirectCommand2KHR, width));
2399
2400 nir_def *cs_ray_launch_size_addr = load_param16(b, cs_ray_launch_size_addr);
2401 nir_push_if(b, nir_ine_imm(b, cs_ray_launch_size_addr, 0));
2402 {
2403 dgc_emit_shader_pointer(cs, cs_ray_launch_size_addr, launch_size_va);
2404 }
2405 nir_pop_if(b, NULL);
2406
2407 const uint32_t dispatch_initiator = device->dispatch_initiator | S_00B800_USE_THREAD_DIMENSIONS(1);
2408 nir_def *is_wave32 = nir_ieq_imm(b, load_param8(b, wave32), 1);
2409 nir_def *dispatch_initiator_rt = nir_bcsel(b, is_wave32, nir_imm_int(b, dispatch_initiator | S_00B800_CS_W32_EN(1)),
2410 nir_imm_int(b, dispatch_initiator));
2411
2412 nir_def *dispatch_data = nir_build_load_global(b, 3, 32, launch_size_va, .access = ACCESS_NON_WRITEABLE);
2413 nir_def *width = nir_channel(b, dispatch_data, 0);
2414 nir_def *height = nir_channel(b, dispatch_data, 1);
2415 nir_def *depth = nir_channel(b, dispatch_data, 2);
2416
2417 nir_def *grid_sgpr = load_param16(b, grid_base_sgpr);
2418
2419 dgc_emit_dispatch_direct(cs, width, height, depth, dispatch_initiator_rt, grid_sgpr, launch_size_va, sequence_id,
2420 true);
2421 }
2422
2423 static nir_def *
2424 dgc_is_cond_render_enabled(nir_builder *b)
2425 {
2426 nir_def *res1, *res2;
2427
2428 nir_push_if(b, nir_ieq_imm(b, load_param8(b, predicating), 1));
2429 {
2430 nir_def *val = nir_load_global(b, load_param64(b, predication_va), 4, 1, 32);
2431 /* By default, all rendering commands are discarded if the 32-bit value is zero. If the
2432 * inverted flag is set, they are discarded if the value is non-zero.
2433 */
2434 res1 = nir_ixor(b, nir_i2b(b, load_param8(b, predication_type)), nir_ine_imm(b, val, 0));
2435 }
2436 nir_push_else(b, 0);
2437 {
2438 res2 = nir_imm_bool(b, false);
2439 }
2440 nir_pop_if(b, 0);
2441
2442 return nir_if_phi(b, res1, res2);
2443 }
2444
2445 static void
2446 dgc_pad_cmdbuf(struct dgc_cmdbuf *cs, nir_def *cmd_buf_end)
2447 {
2448 nir_builder *b = cs->b;
2449
2450 nir_push_if(b, nir_ine(b, nir_load_var(b, cs->offset), cmd_buf_end));
2451 {
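      /* Fill the gap with a single NOP packet: with count = remaining dwords - 2, the header plus
       * count + 1 payload dwords covers the remaining space exactly.
       */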
2452 nir_def *cnt = nir_isub(b, cmd_buf_end, nir_load_var(b, cs->offset));
2453 cnt = nir_ushr_imm(b, cnt, 2);
2454 cnt = nir_iadd_imm(b, cnt, -2);
2455 nir_def *pkt = nir_pkt3(b, PKT3_NOP, cnt);
2456
2457 dgc_cs_begin(cs);
2458 dgc_cs_emit(pkt);
2459 dgc_cs_end();
2460 }
2461 nir_pop_if(b, NULL);
2462 }
2463
2464 static nir_shader *
2465 build_dgc_prepare_shader(struct radv_device *dev, struct radv_indirect_command_layout *layout)
2466 {
2467 const struct radv_physical_device *pdev = radv_device_physical(dev);
2468 nir_builder b = radv_meta_init_shader(dev, MESA_SHADER_COMPUTE, "meta_dgc_prepare");
2469 b.shader->info.workgroup_size[0] = 64;
2470
2471 nir_def *global_id = get_global_ids(&b, 1);
2472
2473 nir_def *sequence_id = global_id;
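   /* One invocation per DGC sequence: each thread writes the commands for its sequence into a
    * fixed-stride slice of the command buffer.
    */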
2474
2475 nir_def *cmd_buf_stride = load_param32(&b, cmd_buf_stride);
2476 nir_def *cmd_buf_base_offset = load_param32(&b, cmd_buf_main_offset);
2477
2478 nir_def *sequence_count = load_param32(&b, sequence_count);
2479 nir_def *sequence_count_addr = load_param64(&b, sequence_count_addr);
2480
2481 /* The effective number of draws is
2482 * min(sequencesCount, sequencesCountBuffer[sequencesCountOffset]) when
2483 * using sequencesCountBuffer. Otherwise it is sequencesCount. */
2484 nir_variable *count_var = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "sequence_count");
2485 nir_store_var(&b, count_var, sequence_count, 0x1);
2486
2487 nir_push_if(&b, nir_ine_imm(&b, sequence_count_addr, 0));
2488 {
2489 nir_def *cnt =
2490 nir_build_load_global(&b, 1, 32, load_param64(&b, sequence_count_addr), .access = ACCESS_NON_WRITEABLE);
2491
2492       /* Clamp the count against the API count explicitly: the workgroup potentially contains
2493        * more threads than the API maxSequencesCount, and those extra threads still have to
2494        * write NOP packets to pad out the IB. */
2495 cnt = nir_umin(&b, cnt, sequence_count);
2496 nir_store_var(&b, count_var, cnt, 0x1);
2497 }
2498 nir_pop_if(&b, NULL);
2499
2500 nir_push_if(&b, dgc_is_cond_render_enabled(&b));
2501 {
2502 /* Reset the number of sequences when conditional rendering is enabled in order to skip the
2503 * entire shader and pad the cmdbuf with NOPs.
2504 */
2505 nir_store_var(&b, count_var, nir_imm_int(&b, 0), 0x1);
2506 }
2507 nir_pop_if(&b, NULL);
2508
2509 sequence_count = nir_load_var(&b, count_var);
2510
2511 build_dgc_buffer_trailer_main(&b, dev);
2512
2513 nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count));
2514 {
2515 struct dgc_cmdbuf cmd_buf = {
2516 .b = &b,
2517 .dev = dev,
2518 .va = nir_pack_64_2x32_split(&b, load_param32(&b, upload_addr), nir_imm_int(&b, pdev->info.address32_hi)),
2519 .offset = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "cmd_buf_offset"),
2520 .upload_offset = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "upload_offset"),
2521 .layout = layout,
2522 };
2523 nir_store_var(&b, cmd_buf.offset, nir_iadd(&b, nir_imul(&b, global_id, cmd_buf_stride), cmd_buf_base_offset), 1);
2524 nir_def *cmd_buf_end = nir_iadd(&b, nir_load_var(&b, cmd_buf.offset), cmd_buf_stride);
2525
2526 nir_def *stream_addr = load_param64(&b, stream_addr);
2527 stream_addr = nir_iadd(&b, stream_addr, nir_u2u64(&b, nir_imul_imm(&b, sequence_id, layout->vk.stride)));
2528
2529 nir_def *upload_offset_init =
2530 nir_iadd(&b, load_param32(&b, upload_main_offset), nir_imul(&b, load_param32(&b, upload_stride), sequence_id));
2531 nir_store_var(&b, cmd_buf.upload_offset, upload_offset_init, 0x1);
2532
2533 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES))
2534 cmd_buf.ies_va = dgc_load_ies_va(&cmd_buf, stream_addr);
2535
2536 if (layout->push_constant_mask) {
2537 const VkShaderStageFlags stages =
2538 (layout->vk.dgc_info & (BITFIELD_BIT(MESA_VK_DGC_RT) | BITFIELD_BIT(MESA_VK_DGC_DISPATCH)))
2539 ? VK_SHADER_STAGE_COMPUTE_BIT
2540 : (VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_MESH_BIT_EXT);
2541
2542 dgc_emit_push_constant(&cmd_buf, stream_addr, sequence_id, stages);
2543 }
2544
2545 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) {
2546 /* Raytracing */
2547 dgc_emit_rt(&cmd_buf, stream_addr, sequence_id);
2548 } else if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
2549 /* Compute */
2550 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES)) {
2551 dgc_emit_ies(&cmd_buf);
2552 }
2553
2554 dgc_emit_dispatch(&cmd_buf, stream_addr, sequence_id);
2555 } else {
2556 /* Graphics */
2557 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) {
2558 dgc_emit_vertex_buffer(&cmd_buf, stream_addr);
2559 }
2560
2561 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_INDEXED)) {
2562 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IB)) {
2563 nir_variable *max_index_count_var =
2564 nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "max_index_count");
2565
2566 dgc_emit_index_buffer(&cmd_buf, stream_addr, max_index_count_var);
2567
2568 nir_def *max_index_count = nir_load_var(&b, max_index_count_var);
2569
2570 if (layout->vk.draw_count) {
2571 dgc_emit_draw_with_count(&cmd_buf, stream_addr, sequence_id, true);
2572 } else {
2573 dgc_emit_draw_indexed(&cmd_buf, stream_addr, sequence_id, max_index_count);
2574 }
2575 } else {
2576 if (layout->vk.draw_count) {
2577 dgc_emit_draw_with_count(&cmd_buf, stream_addr, sequence_id, true);
2578 } else {
2579 dgc_emit_draw_indirect(&cmd_buf, stream_addr, sequence_id, true);
2580 }
2581 }
2582 } else {
2583 /* Non-indexed draws */
2584 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) {
2585 if (layout->vk.draw_count) {
2586 dgc_emit_draw_mesh_tasks_with_count_gfx(&cmd_buf, stream_addr, sequence_id);
2587 } else {
2588 dgc_emit_draw_mesh_tasks_gfx(&cmd_buf, stream_addr, sequence_id);
2589 }
2590 } else {
2591 if (layout->vk.draw_count) {
2592 dgc_emit_draw_with_count(&cmd_buf, stream_addr, sequence_id, false);
2593 } else {
2594 dgc_emit_draw(&cmd_buf, stream_addr, sequence_id);
2595 }
2596 }
2597 }
2598 }
2599
2600 /* Pad the cmdbuffer if we did not use the whole stride */
2601 dgc_pad_cmdbuf(&cmd_buf, cmd_buf_end);
2602 }
2603 nir_pop_if(&b, NULL);
2604
2605 build_dgc_buffer_tail_main(&b, sequence_count, dev);
2606 build_dgc_buffer_preamble_main(&b, sequence_count, dev);
2607
2608 /* Prepare the ACE command stream */
2609 nir_push_if(&b, nir_ieq_imm(&b, load_param8(&b, has_task_shader), 1));
2610 {
2611 nir_def *ace_cmd_buf_stride = load_param32(&b, ace_cmd_buf_stride);
2612 nir_def *ace_cmd_buf_base_offset = load_param32(&b, ace_cmd_buf_main_offset);
2613
2614 build_dgc_buffer_trailer_ace(&b, dev);
2615
2616 nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count));
2617 {
2618 struct dgc_cmdbuf cmd_buf = {
2619 .b = &b,
2620 .dev = dev,
2621 .va = nir_pack_64_2x32_split(&b, load_param32(&b, upload_addr), nir_imm_int(&b, pdev->info.address32_hi)),
2622 .offset = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "cmd_buf_offset"),
2623 .upload_offset = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "upload_offset"),
2624 .layout = layout,
2625 };
2626 nir_store_var(&b, cmd_buf.offset,
2627 nir_iadd(&b, nir_imul(&b, global_id, ace_cmd_buf_stride), ace_cmd_buf_base_offset), 1);
2628 nir_def *cmd_buf_end = nir_iadd(&b, nir_load_var(&b, cmd_buf.offset), ace_cmd_buf_stride);
2629
2630 nir_def *stream_addr = load_param64(&b, stream_addr);
2631 stream_addr = nir_iadd(&b, stream_addr, nir_u2u64(&b, nir_imul_imm(&b, sequence_id, layout->vk.stride)));
2632
2633 nir_def *upload_offset_init = nir_iadd(&b, load_param32(&b, upload_main_offset),
2634 nir_imul(&b, load_param32(&b, upload_stride), sequence_id));
2635 nir_store_var(&b, cmd_buf.upload_offset, upload_offset_init, 0x1);
2636
2637 if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_IES))
2638 cmd_buf.ies_va = dgc_load_ies_va(&cmd_buf, stream_addr);
2639
2640 if (layout->push_constant_mask) {
2641 nir_def *push_constant_stages = dgc_get_push_constant_stages(&cmd_buf);
2642
2643 nir_push_if(&b, nir_test_mask(&b, push_constant_stages, VK_SHADER_STAGE_TASK_BIT_EXT));
2644 {
2645 const struct dgc_pc_params params = dgc_get_pc_params(&cmd_buf);
2646             dgc_emit_push_constant_for_stage(&cmd_buf, stream_addr, sequence_id, &params, MESA_SHADER_TASK);
2647 }
2648 nir_pop_if(&b, NULL);
2649 }
2650
2651 if (layout->vk.draw_count) {
2652 dgc_emit_draw_mesh_tasks_with_count_ace(&cmd_buf, stream_addr, sequence_id);
2653 } else {
2654 dgc_emit_draw_mesh_tasks_ace(&cmd_buf, stream_addr);
2655 }
2656
2657 /* Pad the cmdbuffer if we did not use the whole stride */
2658 dgc_pad_cmdbuf(&cmd_buf, cmd_buf_end);
2659 }
2660 nir_pop_if(&b, NULL);
2661
2662 build_dgc_buffer_tail_ace(&b, sequence_count, dev);
2663 build_dgc_buffer_preamble_ace(&b, sequence_count, dev);
2664 }
2665 nir_pop_if(&b, NULL);
2666
2667 return b.shader;
2668 }
2669
2670 static VkResult
2671 radv_create_dgc_pipeline(struct radv_device *device, struct radv_indirect_command_layout *layout)
2672 {
2673 enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_DGC;
2674 VkResult result;
2675
2676 const VkDescriptorSetLayoutBinding binding = {
2677 .binding = 0,
2678 .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
2679 .descriptorCount = 1,
2680 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
2681 };
2682
2683 const VkDescriptorSetLayoutCreateInfo desc_info = {
2684 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
2685 .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT,
2686 .bindingCount = 1,
2687 .pBindings = &binding,
2688 };
2689
2690 const VkPushConstantRange pc_range = {
2691 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
2692 .size = sizeof(struct radv_dgc_params),
2693 };
2694
2695 result = vk_meta_get_pipeline_layout(&device->vk, &device->meta_state.device, &desc_info, &pc_range, &key,
2696 sizeof(key), &layout->pipeline_layout);
2697 if (result != VK_SUCCESS)
2698 return result;
2699
2700 nir_shader *cs = build_dgc_prepare_shader(device, layout);
2701
2702 const VkPipelineShaderStageCreateInfo stage_info = {
2703 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
2704 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
2705 .module = vk_shader_module_handle_from_nir(cs),
2706 .pName = "main",
2707 .pSpecializationInfo = NULL,
2708 };
2709
2710 const VkComputePipelineCreateInfo pipeline_info = {
2711 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
2712 .stage = stage_info,
2713 .flags = 0,
2714 .layout = layout->pipeline_layout,
2715 };
2716
2717    /* DGC pipelines don't go through the vk_meta cache because that would require computing a
2718     * separate key, but they are cached on-disk when possible.
2719     */
2720 result = radv_CreateComputePipelines(vk_device_to_handle(&device->vk), device->meta_state.device.pipeline_cache, 1,
2721 &pipeline_info, NULL, &layout->pipeline);
2722
2723 ralloc_free(cs);
2724 return result;
2725 }
2726
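/* Report the size and alignment requirements of the preprocess buffer for the given layout and
 * maximum sequence count.
 */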
VKAPI_ATTR void VKAPI_CALL
radv_GetGeneratedCommandsMemoryRequirementsEXT(VkDevice _device,
                                               const VkGeneratedCommandsMemoryRequirementsInfoEXT *pInfo,
                                               VkMemoryRequirements2 *pMemoryRequirements)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   VK_FROM_HANDLE(radv_indirect_command_layout, layout, pInfo->indirectCommandsLayout);
   struct dgc_cmdbuf_layout cmdbuf_layout;

   get_dgc_cmdbuf_layout(device, layout, pInfo->pNext, pInfo->maxSequenceCount, true, &cmdbuf_layout);

   pMemoryRequirements->memoryRequirements.memoryTypeBits = pdev->memory_types_32bit;
   pMemoryRequirements->memoryRequirements.alignment = radv_dgc_get_buffer_alignment(device);
   pMemoryRequirements->memoryRequirements.size =
      align(cmdbuf_layout.alloc_size, pMemoryRequirements->memoryRequirements.alignment);
}

bool
radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo)
{
   const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
      vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
   const VkGeneratedCommandsShaderInfoEXT *eso_info =
      vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);

   /* Enable conditional rendering (if not enabled by the user) to skip the prepare/execute DGC
    * calls when the indirect sequence count might be zero. This can only be enabled on GFX because
    * on ACE it's not possible to skip the execute DGC call (ie. no INDIRECT_PACKET). It should also
    * be disabled when the graphics pipeline has a task shader for the same reason (otherwise the
    * DGC ACE IB would be uninitialized).
    */
   return cmd_buffer->qf == RADV_QUEUE_GENERAL && !radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK) &&
          pGeneratedCommandsInfo->sequenceCountAddress != 0 && !cmd_buffer->state.predicating;
}

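/* Explicit preprocessing: generate the commands into the preprocess buffer now. stateCommandBuffer
 * provides the state (bound shaders, descriptors, push constants) that the generated commands will
 * later be executed with.
 */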
VKAPI_ATTR void VKAPI_CALL
radv_CmdPreprocessGeneratedCommandsEXT(VkCommandBuffer commandBuffer,
                                       const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo,
                                       VkCommandBuffer stateCommandBuffer)
{
   VK_FROM_HANDLE(radv_cmd_buffer, state_cmd_buffer, stateCommandBuffer);
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);

   assert(layout->vk.usage & VK_INDIRECT_COMMANDS_LAYOUT_USAGE_EXPLICIT_PREPROCESS_BIT_EXT);

   /* VK_EXT_conditional_rendering says that copy commands should not be
    * affected by conditional rendering.
    */
   const bool old_predicating = cmd_buffer->state.predicating;
   cmd_buffer->state.predicating = false;

   radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, state_cmd_buffer, old_predicating);

   /* Restore conditional rendering. */
   cmd_buffer->state.predicating = old_predicating;
}

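/* Prepare the upload data for DGC dispatches: either the indirect descriptor sets when an indirect
 * execution set (IES) is bound, or the bound compute shader metadata otherwise.
 */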
static void
radv_prepare_dgc_compute(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo,
                         struct radv_cmd_buffer *state_cmd_buffer, unsigned *upload_size, unsigned *upload_offset,
                         void **upload_data, struct radv_dgc_params *params, bool cond_render_enabled)
{
   VK_FROM_HANDLE(radv_indirect_execution_set, ies, pGeneratedCommandsInfo->indirectExecutionSet);
   const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const uint32_t alloc_size = ies ? 0 : sizeof(struct radv_compute_pipeline_metadata);

   *upload_size = MAX2(*upload_size + alloc_size, 16);

   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, *upload_size, upload_offset, upload_data)) {
      vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   if (cond_render_enabled) {
      params->predicating = true;
      params->predication_va = cmd_buffer->state.predication_va;
      params->predication_type = cmd_buffer->state.predication_type;
   }

   if (ies) {
      struct radv_descriptor_state *descriptors_state =
         radv_get_descriptors_state(state_cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);

      radv_upload_indirect_descriptor_sets(cmd_buffer, descriptors_state);

      params->ies_stride = ies->stride;
      params->indirect_desc_sets_va = descriptors_state->indirect_descriptor_sets_va;
   } else {
      const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
         vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
      const VkGeneratedCommandsShaderInfoEXT *eso_info =
         vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);
      const struct radv_shader *cs = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_COMPUTE);
      struct radv_compute_pipeline_metadata *metadata = (struct radv_compute_pipeline_metadata *)(*upload_data);

      radv_get_compute_shader_metadata(device, cs, metadata);

      *upload_data = (char *)*upload_data + alloc_size;
   }
}

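/* Prepare the upload data for DGC ray tracing dispatches: record the RT prolog's user SGPR
 * locations needed to patch the SBT descriptors and the ray launch size.
 */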
static void
radv_prepare_dgc_rt(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo,
                    unsigned *upload_size, unsigned *upload_offset, void **upload_data, struct radv_dgc_params *params)
{
   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, *upload_size, upload_offset, upload_data)) {
      vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
      vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
   VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline);
   const struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
   const struct radv_shader *rt_prolog = rt_pipeline->prolog;

   params->wave32 = rt_prolog->info.wave_size == 32;
   params->grid_base_sgpr = radv_get_user_sgpr(rt_prolog, AC_UD_CS_GRID_SIZE);
   params->cs_sbt_descriptors = radv_get_user_sgpr(rt_prolog, AC_UD_CS_SBT_DESCRIPTORS);
   params->cs_ray_launch_size_addr = radv_get_user_sgpr(rt_prolog, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR);
}

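/* Return the source offset of the vertex buffer token for the given binding within a sequence, or
 * -1 if the binding isn't captured by the layout.
 */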
static uint32_t
get_dgc_vertex_binding_offset(const struct radv_indirect_command_layout *layout, uint32_t binding)
{
   for (uint32_t i = 0; i < layout->vk.n_vb_layouts; i++) {
      if (layout->vk.vb_layouts[i].binding == binding)
         return layout->vk.vb_layouts[i].src_offset_B;
   }

   return -1;
}

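/* Prepare the upload data for DGC draws: the base vertex/instance and draw ID SGPR configuration
 * of the first geometry stage (vertex or mesh), the task shader SGPRs when mesh+task is used, and
 * a copy of the current vertex buffer bindings when the layout updates vertex buffers.
 */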
static void
radv_prepare_dgc_graphics(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo,
                          struct radv_cmd_buffer *state_cmd_buffer, unsigned *upload_size, unsigned *upload_offset,
                          void **upload_data, struct radv_dgc_params *params)
{
   VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);

   const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
      vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
   const VkGeneratedCommandsShaderInfoEXT *eso_info =
      vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);

   const gl_shader_stage first_stage =
      (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) ? MESA_SHADER_MESH : MESA_SHADER_VERTEX;
   struct radv_shader *first_shader = radv_dgc_get_shader(pipeline_info, eso_info, first_stage);

   unsigned vb_size = (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) ? MAX_VBS * DGC_VBO_INFO_SIZE : 0;

   *upload_size = MAX2(*upload_size + vb_size, 16);

   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, *upload_size, upload_offset, upload_data)) {
      vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   uint16_t vtx_base_sgpr = radv_get_user_sgpr(first_shader, AC_UD_VS_BASE_VERTEX_START_INSTANCE);
   const bool uses_drawid = first_shader->info.vs.needs_draw_id;

   if (uses_drawid)
      vtx_base_sgpr |= DGC_USES_DRAWID;

   if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DRAW_MESH)) {
      if (first_shader->info.cs.uses_grid_size)
         vtx_base_sgpr |= DGC_USES_GRID_SIZE;

      const struct radv_shader *task_shader = radv_dgc_get_shader(pipeline_info, eso_info, MESA_SHADER_TASK);
      if (task_shader) {
         params->has_task_shader = 1;
         params->mesh_ring_entry_sgpr = radv_get_user_sgpr(first_shader, AC_UD_TASK_RING_ENTRY);
         params->linear_dispatch_en = task_shader->info.cs.linear_taskmesh_dispatch;
         params->task_ring_entry_sgpr = radv_get_user_sgpr(task_shader, AC_UD_TASK_RING_ENTRY);
         params->wave32 = task_shader->info.wave_size == 32;
         params->task_xyz_sgpr = radv_get_user_sgpr(task_shader, AC_UD_CS_GRID_SIZE);
         params->task_draw_id_sgpr = radv_get_user_sgpr(task_shader, AC_UD_CS_TASK_DRAW_ID);
      }
   } else {
      const bool uses_baseinstance = first_shader->info.vs.needs_base_instance;

      if (uses_baseinstance)
         vtx_base_sgpr |= DGC_USES_BASEINSTANCE;
   }

   params->vtx_base_sgpr = vtx_base_sgpr;
   params->max_index_count = state_cmd_buffer->state.max_index_count;
   params->max_draw_count = pGeneratedCommandsInfo->maxDrawCount;
   params->dynamic_vs_input =
      (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) && first_shader->info.vs.dynamic_inputs;
   params->use_per_attribute_vb_descs =
      (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) && first_shader->info.vs.use_per_attribute_vb_descs;

   if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_VB)) {
      uint8_t *ptr = (uint8_t *)((char *)*upload_data);

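      /* Copy one entry per binding: the current radv_vbo_info followed by the DGC source offset
       * (DGC_VBO_INFO_SIZE bytes each).
       */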
      for (uint32_t i = 0; i < MAX_VBS; i++) {
         struct radv_vbo_info vbo_info;
         radv_get_vbo_info(state_cmd_buffer, i, &vbo_info);

         const uint32_t vbo_offset = get_dgc_vertex_binding_offset(layout, vbo_info.binding);

         memcpy(ptr, &vbo_info, sizeof(vbo_info));
         ptr += sizeof(struct radv_vbo_info);

         memcpy(ptr, &vbo_offset, sizeof(uint32_t));
         ptr += sizeof(uint32_t);
      }
      params->vb_desc_usage_mask = first_shader->info.vs.vb_desc_usage_mask;
      params->vbo_reg = radv_get_user_sgpr(first_shader, AC_UD_VS_VERTEX_BUFFERS);

      *upload_data = (char *)*upload_data + vb_size;
   }
}

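/* Record a dispatch of the DGC prepare pipeline that writes the generated command buffer(s) into
 * the preprocess buffer, using the state captured from state_cmd_buffer.
 */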
void
radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoEXT *pGeneratedCommandsInfo,
                 struct radv_cmd_buffer *state_cmd_buffer, bool cond_render_enabled)
{
   VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
   VK_FROM_HANDLE(radv_indirect_execution_set, ies, pGeneratedCommandsInfo->indirectExecutionSet);
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct radv_meta_saved_state saved_state;
   unsigned upload_offset, upload_size = 0;
   struct radv_buffer token_buffer;
   void *upload_data;

   const VkGeneratedCommandsPipelineInfoEXT *pipeline_info =
      vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_PIPELINE_INFO_EXT);
   const VkGeneratedCommandsShaderInfoEXT *eso_info =
      vk_find_struct_const(pGeneratedCommandsInfo->pNext, GENERATED_COMMANDS_SHADER_INFO_EXT);

   const bool use_preamble = radv_dgc_use_preamble(pGeneratedCommandsInfo);
   const uint32_t sequences_count = pGeneratedCommandsInfo->maxSequenceCount;

   struct dgc_cmdbuf_layout cmdbuf_layout;
   get_dgc_cmdbuf_layout(device, layout, pGeneratedCommandsInfo->pNext, sequences_count, use_preamble, &cmdbuf_layout);

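   /* The main (GFX) and ACE command buffers must start at IB-aligned offsets inside the
    * preprocess buffer.
    */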
   assert((cmdbuf_layout.main_offset + pGeneratedCommandsInfo->preprocessAddress) %
             pdev->info.ip[AMD_IP_GFX].ib_alignment ==
          0);
   assert((cmdbuf_layout.ace_main_offset + pGeneratedCommandsInfo->preprocessAddress) %
             pdev->info.ip[AMD_IP_COMPUTE].ib_alignment ==
          0);

   struct radv_dgc_params params = {
      .cmd_buf_preamble_offset = cmdbuf_layout.main_preamble_offset,
      .cmd_buf_main_offset = cmdbuf_layout.main_offset,
      .cmd_buf_stride = cmdbuf_layout.main_cmd_stride,
      .cmd_buf_size = cmdbuf_layout.main_size,
      .ace_cmd_buf_trailer_offset = cmdbuf_layout.ace_trailer_offset,
      .ace_cmd_buf_preamble_offset = cmdbuf_layout.ace_preamble_offset,
      .ace_cmd_buf_main_offset = cmdbuf_layout.ace_main_offset,
      .ace_cmd_buf_stride = cmdbuf_layout.ace_cmd_stride,
      .ace_cmd_buf_size = cmdbuf_layout.ace_size,
      .upload_main_offset = cmdbuf_layout.upload_offset,
      .upload_addr = (uint32_t)pGeneratedCommandsInfo->preprocessAddress,
      .upload_stride = cmdbuf_layout.upload_stride,
      .sequence_count = sequences_count,
      .use_preamble = use_preamble,
      .stream_addr = pGeneratedCommandsInfo->indirectAddress,
      .sequence_count_addr = pGeneratedCommandsInfo->sequenceCountAddress,
      .ies_addr = ies ? ies->va : 0,
      .queue_family = state_cmd_buffer->qf,
   };

   VK_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, layout->vk.layout);

   if (layout->vk.dgc_info & (BITFIELD_BIT(MESA_VK_DGC_PC) | BITFIELD_BIT(MESA_VK_DGC_SI))) {
      upload_size = pipeline_layout->push_constant_size + MESA_VULKAN_SHADER_STAGES * 12;
   }

   if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_DISPATCH)) {
      radv_prepare_dgc_compute(cmd_buffer, pGeneratedCommandsInfo, state_cmd_buffer, &upload_size, &upload_offset,
                               &upload_data, &params, cond_render_enabled);
   } else if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) {
      radv_prepare_dgc_rt(cmd_buffer, pGeneratedCommandsInfo, &upload_size, &upload_offset, &upload_data, &params);
   } else {
      radv_prepare_dgc_graphics(cmd_buffer, pGeneratedCommandsInfo, state_cmd_buffer, &upload_size, &upload_offset,
                                &upload_data, &params);
   }

   if (layout->push_constant_mask) {
      VkShaderStageFlags pc_stages = 0;
      uint32_t *desc = upload_data;
      upload_data = (char *)upload_data + MESA_VULKAN_SHADER_STAGES * 12;

      struct radv_shader *shaders[MESA_VULKAN_SHADER_STAGES] = {0};
      if (pipeline_info) {
         VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->pipeline);

         if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_RT)) {
            const struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
            struct radv_shader *rt_prolog = rt_pipeline->prolog;

            shaders[MESA_SHADER_COMPUTE] = rt_prolog;
         } else {
            memcpy(shaders, pipeline->shaders, sizeof(shaders));
         }
      } else if (eso_info) {
         for (unsigned i = 0; i < eso_info->shaderCount; ++i) {
            VK_FROM_HANDLE(radv_shader_object, shader_object, eso_info->pShaders[i]);
            struct radv_shader *shader = shader_object->shader;
            gl_shader_stage stage = shader->info.stage;

            shaders[stage] = shader;
         }
      }

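      /* For each stage, write 3 dwords of metadata: the packed upload/inline push constant SGPR
       * offsets, followed by the 64-bit inline push constant mask.
       */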
      for (unsigned i = 0; i < ARRAY_SIZE(shaders); i++) {
         const struct radv_shader *shader = shaders[i];

         if (!shader)
            continue;

         const struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs;
         if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
            params.const_copy = 1;
         }

         if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0 ||
             locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
            unsigned upload_sgpr = 0;
            unsigned inline_sgpr = 0;

            if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
               upload_sgpr = (shader->info.user_data_0 + 4 * locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx -
                              SI_SH_REG_OFFSET) >>
                             2;
            }

            if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
               inline_sgpr = (shader->info.user_data_0 + 4 * locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx -
                              SI_SH_REG_OFFSET) >>
                             2;
               desc[i * 3 + 1] = shader->info.inline_push_constant_mask;
               desc[i * 3 + 2] = shader->info.inline_push_constant_mask >> 32;
            }
            desc[i * 3] = upload_sgpr | (inline_sgpr << 16);

            pc_stages |= mesa_to_vk_shader_stage(i);
         }
      }

      params.push_constant_stages = pc_stages;

      memcpy(upload_data, state_cmd_buffer->push_constants, pipeline_layout->push_constant_size);
      upload_data = (char *)upload_data + pipeline_layout->push_constant_size;
   }

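   /* Bind the upload data as a UBO and dispatch the DGC prepare shader (one workgroup per 64
    * sequences).
    */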
   radv_buffer_init(&token_buffer, device, cmd_buffer->upload.upload_bo, upload_size, upload_offset);

   radv_meta_save(&saved_state, cmd_buffer,
                  RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS);

   radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, layout->pipeline);

   vk_common_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), layout->pipeline_layout,
                              VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(params), &params);

   radv_meta_push_descriptor_set(
      cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, layout->pipeline_layout, 0, 1,
      (VkWriteDescriptorSet[]){{.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
                                .dstBinding = 0,
                                .dstArrayElement = 0,
                                .descriptorCount = 1,
                                .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
                                .pBufferInfo = &(VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&token_buffer),
                                                                         .offset = 0,
                                                                         .range = upload_size}}});

   unsigned block_count = MAX2(1, DIV_ROUND_UP(pGeneratedCommandsInfo->maxSequenceCount, 64));
   vk_common_CmdDispatch(radv_cmd_buffer_to_handle(cmd_buffer), block_count, 1, 1);

   radv_buffer_finish(&token_buffer);
   radv_meta_restore(&saved_state, cmd_buffer);
}

static void
radv_destroy_indirect_commands_layout(struct radv_device *device, const VkAllocationCallbacks *pAllocator,
                                      struct radv_indirect_command_layout *layout)
{
   radv_DestroyPipeline(radv_device_to_handle(device), layout->pipeline, &device->meta_state.alloc);

   vk_indirect_command_layout_destroy(&device->vk, pAllocator, &layout->vk);
}

VKAPI_ATTR VkResult VKAPI_CALL
radv_CreateIndirectCommandsLayoutEXT(VkDevice _device, const VkIndirectCommandsLayoutCreateInfoEXT *pCreateInfo,
                                     const VkAllocationCallbacks *pAllocator,
                                     VkIndirectCommandsLayoutEXT *pIndirectCommandsLayout)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   struct radv_indirect_command_layout *layout;
   VkResult result;

   layout = vk_indirect_command_layout_create(&device->vk, pCreateInfo, pAllocator, sizeof(*layout));
   if (!layout)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

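   /* Build the mask of push constant dwords (and their source offsets in the indirect stream)
    * that this layout overwrites.
    */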
   for (uint32_t i = 0; i < layout->vk.n_pc_layouts; i++) {
      for (uint32_t j = layout->vk.pc_layouts[i].dst_offset_B / 4, k = 0; k < layout->vk.pc_layouts[i].size_B / 4;
           j++, k++) {
         layout->push_constant_mask |= 1ull << j;
         layout->push_constant_offsets[j] = layout->vk.pc_layouts[i].src_offset_B + k * 4;
      }
   }

   if (layout->vk.dgc_info & BITFIELD_BIT(MESA_VK_DGC_SI)) {
      layout->sequence_index_mask = 1ull << (layout->vk.si_layout.dst_offset_B / 4);
      layout->push_constant_mask |= layout->sequence_index_mask;
   }

   result = radv_create_dgc_pipeline(device, layout);
   if (result != VK_SUCCESS) {
      radv_destroy_indirect_commands_layout(device, pAllocator, layout);
      return result;
   }

   *pIndirectCommandsLayout = radv_indirect_command_layout_to_handle(layout);
   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
radv_DestroyIndirectCommandsLayoutEXT(VkDevice _device, VkIndirectCommandsLayoutEXT indirectCommandsLayout,
                                      const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   VK_FROM_HANDLE(radv_indirect_command_layout, layout, indirectCommandsLayout);

   if (!layout)
      return;

   radv_destroy_indirect_commands_layout(device, pAllocator, layout);
}

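/* Write one IES entry for a compute shader: its metadata, the number of command dwords, and the
 * register packets emitted by radv_emit_compute_shader.
 */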
static void
radv_update_ies_shader(struct radv_device *device, struct radv_indirect_execution_set *set, uint32_t index,
                       struct radv_shader *shader)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   uint8_t *ptr = set->mapped_ptr + set->stride * index;
   struct radv_compute_pipeline_metadata md;
   struct radeon_cmdbuf *cs;

   assert(shader->info.stage == MESA_SHADER_COMPUTE);
   radv_get_compute_shader_metadata(device, shader, &md);

   cs = calloc(1, sizeof(*cs));
   if (!cs)
      return;

   cs->reserved_dw = cs->max_dw = 32;
   cs->buf = malloc(cs->max_dw * 4);
   if (!cs->buf) {
      free(cs);
      return;
   }

   radv_emit_compute_shader(pdev, cs, shader);

   memcpy(ptr, &md, sizeof(md));
   ptr += sizeof(md);

   memcpy(ptr, &cs->cdw, sizeof(uint32_t));
   ptr += sizeof(uint32_t);

   memcpy(ptr, cs->buf, cs->cdw * sizeof(uint32_t));
   ptr += cs->cdw * sizeof(uint32_t);

   set->compute_scratch_size_per_wave = MAX2(set->compute_scratch_size_per_wave, shader->config.scratch_bytes_per_wave);
   set->compute_scratch_waves = MAX2(set->compute_scratch_waves, radv_get_max_scratch_waves(device, shader));

   free(cs->buf);
   free(cs);
}

static void
radv_update_ies_pipeline(struct radv_device *device, struct radv_indirect_execution_set *set, uint32_t index,
                         const struct radv_pipeline *pipeline)
{
   assert(pipeline->type == RADV_PIPELINE_COMPUTE);
   radv_update_ies_shader(device, set, index, pipeline->shaders[MESA_SHADER_COMPUTE]);
}

static void
radv_destroy_indirect_execution_set(struct radv_device *device, const VkAllocationCallbacks *pAllocator,
                                    struct radv_indirect_execution_set *set)
{
   if (set->bo)
      radv_bo_destroy(device, &set->base, set->bo);

   vk_object_base_finish(&set->base);
   vk_free2(&device->vk.alloc, pAllocator, set);
}

VKAPI_ATTR VkResult VKAPI_CALL
radv_CreateIndirectExecutionSetEXT(VkDevice _device, const VkIndirectExecutionSetCreateInfoEXT *pCreateInfo,
                                   const VkAllocationCallbacks *pAllocator,
                                   VkIndirectExecutionSetEXT *pIndirectExecutionSet)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct radv_indirect_execution_set *set;
   uint32_t num_entries;
   uint32_t stride;
   VkResult result;

   set = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*set), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!set)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   vk_object_base_init(&device->vk, &set->base, VK_OBJECT_TYPE_INDIRECT_EXECUTION_SET_EXT);

   switch (pCreateInfo->type) {
   case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT: {
      const VkIndirectExecutionSetPipelineInfoEXT *pipeline_info = pCreateInfo->info.pPipelineInfo;
      VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->initialPipeline);

      assert(pipeline->type == RADV_PIPELINE_COMPUTE);
      num_entries = pipeline_info->maxPipelineCount;
      break;
   }
   case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT: {
      const VkIndirectExecutionSetShaderInfoEXT *shaders_info = pCreateInfo->info.pShaderInfo;
      VK_FROM_HANDLE(radv_shader_object, shader_object, shaders_info->pInitialShaders[0]);

      assert(shader_object->stage == MESA_SHADER_COMPUTE);
      num_entries = shaders_info->maxShaderCount;
      break;
   }
   default:
      unreachable("Invalid IES type");
   }

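   /* Each entry stores the compute shader metadata, a dword count and the register writes emitted
    * by radv_emit_compute_shader (19 dwords on GFX10+, 16 dwords on older chips).
    */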
   stride = sizeof(struct radv_compute_pipeline_metadata);
   stride += 4 /* num CS DW */;
   stride += (pdev->info.gfx_level >= GFX10 ? 19 : 16) * 4;

   result = radv_bo_create(device, &set->base, num_entries * stride, 8, RADEON_DOMAIN_VRAM,
                           RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_READ_ONLY, RADV_BO_PRIORITY_DESCRIPTOR, 0,
                           false, &set->bo);
   if (result != VK_SUCCESS) {
      radv_destroy_indirect_execution_set(device, pAllocator, set);
      return vk_error(device, result);
   }

   set->mapped_ptr = (uint8_t *)radv_buffer_map(device->ws, set->bo);
   if (!set->mapped_ptr) {
      radv_destroy_indirect_execution_set(device, pAllocator, set);
      return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
   }

   set->va = radv_buffer_get_va(set->bo);
   set->stride = stride;

   /* The driver is supposed to always populate slot 0 with the initial pipeline/shader. */
   switch (pCreateInfo->type) {
   case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT: {
      const VkIndirectExecutionSetPipelineInfoEXT *pipeline_info = pCreateInfo->info.pPipelineInfo;
      VK_FROM_HANDLE(radv_pipeline, pipeline, pipeline_info->initialPipeline);

      radv_update_ies_pipeline(device, set, 0, pipeline);
      break;
   }
   case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT: {
      const VkIndirectExecutionSetShaderInfoEXT *shaders_info = pCreateInfo->info.pShaderInfo;
      VK_FROM_HANDLE(radv_shader_object, shader_object, shaders_info->pInitialShaders[0]);

      radv_update_ies_shader(device, set, 0, shader_object->shader);
      break;
   }
   default:
      unreachable("Invalid IES type");
   }

   *pIndirectExecutionSet = radv_indirect_execution_set_to_handle(set);
   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
radv_DestroyIndirectExecutionSetEXT(VkDevice _device, VkIndirectExecutionSetEXT indirectExecutionSet,
                                    const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   VK_FROM_HANDLE(radv_indirect_execution_set, set, indirectExecutionSet);

   if (!set)
      return;

   radv_destroy_indirect_execution_set(device, pAllocator, set);
}

VKAPI_ATTR void VKAPI_CALL
radv_UpdateIndirectExecutionSetPipelineEXT(VkDevice _device, VkIndirectExecutionSetEXT indirectExecutionSet,
                                           uint32_t executionSetWriteCount,
                                           const VkWriteIndirectExecutionSetPipelineEXT *pExecutionSetWrites)
{
   VK_FROM_HANDLE(radv_indirect_execution_set, set, indirectExecutionSet);
   VK_FROM_HANDLE(radv_device, device, _device);

   for (uint32_t i = 0; i < executionSetWriteCount; i++) {
      const VkWriteIndirectExecutionSetPipelineEXT *writeset = &pExecutionSetWrites[i];
      VK_FROM_HANDLE(radv_pipeline, pipeline, writeset->pipeline);

      radv_update_ies_pipeline(device, set, writeset->index, pipeline);
   }
}

VKAPI_ATTR void VKAPI_CALL
radv_UpdateIndirectExecutionSetShaderEXT(VkDevice _device, VkIndirectExecutionSetEXT indirectExecutionSet,
                                         uint32_t executionSetWriteCount,
                                         const VkWriteIndirectExecutionSetShaderEXT *pExecutionSetWrites)
{
   VK_FROM_HANDLE(radv_indirect_execution_set, set, indirectExecutionSet);
   VK_FROM_HANDLE(radv_device, device, _device);

   for (uint32_t i = 0; i < executionSetWriteCount; i++) {
      const VkWriteIndirectExecutionSetShaderEXT *writeset = &pExecutionSetWrites[i];
      VK_FROM_HANDLE(radv_shader_object, shader_object, writeset->shader);

      radv_update_ies_shader(device, set, writeset->index, shader_object->shader);
   }
}