1 /*
2  * Copyright © 2021 Google
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "meta/radv_meta.h"
25 #include "radv_private.h"
26 
27 #include "ac_rgp.h"
28 
29 #include "nir_builder.h"
30 
31 #include "vk_common_entrypoints.h"
32 
33 static void
34 radv_get_sequence_size_compute(const struct radv_indirect_command_layout *layout,
35                                const struct radv_compute_pipeline *pipeline, uint32_t *cmd_size)
36 {
37    const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk);
38    struct radv_shader *cs = radv_get_shader(pipeline->base.shaders, MESA_SHADER_COMPUTE);
39 
40    /* dispatch */
41    *cmd_size += 5 * 4;
42 
43    const struct radv_userdata_info *loc = radv_get_user_sgpr(cs, AC_UD_CS_GRID_SIZE);
44    if (loc->sgpr_idx != -1) {
45       if (device->load_grid_size_from_user_sgpr) {
46          /* PKT3_SET_SH_REG for immediate values */
47          *cmd_size += 5 * 4;
48       } else {
49          /* PKT3_SET_SH_REG for pointer */
50          *cmd_size += 4 * 4;
51       }
52    }
53 
54    if (device->sqtt.bo) {
55       /* sqtt markers */
56       *cmd_size += 8 * 3 * 4;
57    }
58 }
59 
60 static void
61 radv_get_sequence_size_graphics(const struct radv_indirect_command_layout *layout,
62                                 const struct radv_graphics_pipeline *pipeline, uint32_t *cmd_size,
63                                 uint32_t *upload_size)
64 {
65    const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk);
66    const struct radv_shader *vs = radv_get_shader(pipeline->base.shaders, MESA_SHADER_VERTEX);
67 
68    if (layout->bind_vbo_mask) {
69       *upload_size += 16 * util_bitcount(vs->info.vs.vb_desc_usage_mask);
70 
71       /* One PKT3_SET_SH_REG for emitting VBO pointer (32-bit) */
72       *cmd_size += 3 * 4;
73    }
74 
75    if (layout->binds_index_buffer) {
76       /* Index type write (normal reg write) + index buffer base write (64-bits, but special packet
77        * so only 1 word overhead) + index buffer size (again, special packet so only 1 word
78        * overhead)
79        */
80       *cmd_size += (3 + 3 + 2) * 4;
81    }
82 
83    if (layout->indexed) {
84       if (layout->binds_index_buffer) {
85          /* userdata writes + instance count + indexed draw */
86          *cmd_size += (5 + 2 + 5) * 4;
87       } else {
88          /* PKT3_SET_BASE + PKT3_DRAW_{INDEX}_INDIRECT_MULTI */
89          *cmd_size += (4 + (pipeline->uses_drawid ? 10 : 5)) * 4;
90       }
91    } else {
92       if (layout->draw_mesh_tasks) {
93          /* userdata writes + instance count + non-indexed draw */
94          *cmd_size += (6 + 2 + (device->physical_device->mesh_fast_launch_2 ? 5 : 3)) * 4;
95       } else {
96          /* userdata writes + instance count + non-indexed draw */
97          *cmd_size += (5 + 2 + 3) * 4;
98       }
99    }
100 
101    if (device->sqtt.bo) {
102       /* sqtt markers */
103       *cmd_size += 5 * 3 * 4;
104    }
105 }
106 
107 static void
108 radv_get_sequence_size(const struct radv_indirect_command_layout *layout, struct radv_pipeline *pipeline,
109                        uint32_t *cmd_size, uint32_t *upload_size)
110 {
111    const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk);
112 
113    *cmd_size = 0;
114    *upload_size = 0;
115 
116    if (layout->push_constant_mask) {
117       bool need_copy = false;
118 
119       for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); ++i) {
120          if (!pipeline->shaders[i])
121             continue;
122 
123          struct radv_userdata_locations *locs = &pipeline->shaders[i]->info.user_sgprs_locs;
124          if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
125             /* One PKT3_SET_SH_REG for emitting push constants pointer (32-bit) */
126             *cmd_size += 3 * 4;
127             need_copy = true;
128          }
129          if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0)
130             /* One PKT3_SET_SH_REG writing all inline push constants. */
131             *cmd_size += (3 * util_bitcount64(layout->push_constant_mask)) * 4;
132       }
133       if (need_copy)
134          *upload_size += align(pipeline->push_constant_size + 16 * pipeline->dynamic_offset_count, 16);
135    }
136 
137    if (device->sqtt.bo) {
138       /* THREAD_TRACE_MARKER */
139       *cmd_size += 2 * 4;
140    }
141 
142    if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
143       struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
144       radv_get_sequence_size_graphics(layout, graphics_pipeline, cmd_size, upload_size);
145    } else {
146       assert(layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE);
147       struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
148       radv_get_sequence_size_compute(layout, compute_pipeline, cmd_size);
149    }
150 }
151 
152 static uint32_t
153 radv_align_cmdbuf_size(const struct radv_device *device, uint32_t size, enum amd_ip_type ip_type)
154 {
155    const uint32_t ib_alignment = device->physical_device->rad_info.ip[ip_type].ib_alignment;
156 
157    return align(size, ib_alignment);
158 }
159 
160 static unsigned
161 radv_dgc_preamble_cmdbuf_size(const struct radv_device *device)
162 {
163    return radv_align_cmdbuf_size(device, 16, AMD_IP_GFX);
164 }
165 
166 static bool
167 radv_dgc_use_preamble(const VkGeneratedCommandsInfoNV *cmd_info)
168 {
169    /* Heuristic for when the overhead of the preamble (i.e. the double jump) is worth it. This is
170     * necessarily a guess, since it depends on the actual count which we don't know here. */
171    return cmd_info->sequencesCountBuffer != VK_NULL_HANDLE && cmd_info->sequencesCount >= 64;
172 }
173 
174 uint32_t
175 radv_get_indirect_cmdbuf_size(const VkGeneratedCommandsInfoNV *cmd_info)
176 {
177    VK_FROM_HANDLE(radv_indirect_command_layout, layout, cmd_info->indirectCommandsLayout);
178    VK_FROM_HANDLE(radv_pipeline, pipeline, cmd_info->pipeline);
179    const struct radv_device *device = container_of(layout->base.device, struct radv_device, vk);
180 
181    if (radv_dgc_use_preamble(cmd_info))
182       return radv_dgc_preamble_cmdbuf_size(device);
183 
184    uint32_t cmd_size, upload_size;
185    radv_get_sequence_size(layout, pipeline, &cmd_size, &upload_size);
186    return radv_align_cmdbuf_size(device, cmd_size * cmd_info->sequencesCount, AMD_IP_GFX);
187 }
188 
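/* Parameters handed to the DGC prepare shader as its push-constant block; individual fields are
 * read back with the load_param8/16/32/64 helpers below via offsetof().
 */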
189 struct radv_dgc_params {
190    uint32_t cmd_buf_stride;
191    uint32_t cmd_buf_size;
192    uint32_t upload_stride;
193    uint32_t upload_addr;
194    uint32_t sequence_count;
195    uint32_t stream_stride;
196    uint64_t stream_addr;
197 
198    /* draw info */
199    uint16_t draw_indexed;
200    uint16_t draw_params_offset;
201    uint16_t binds_index_buffer;
202    uint16_t vtx_base_sgpr;
203    uint32_t max_index_count;
204    uint8_t draw_mesh_tasks;
205 
206    /* dispatch info */
207    uint32_t dispatch_initiator;
208    uint16_t dispatch_params_offset;
209    uint16_t grid_base_sgpr;
210 
211    /* bind index buffer info. Valid if binds_index_buffer == true && draw_indexed */
212    uint16_t index_buffer_offset;
213 
214    uint8_t vbo_cnt;
215 
216    uint8_t const_copy;
217 
218    /* Which VBOs are set in this indirect layout. */
219    uint32_t vbo_bind_mask;
220 
221    uint16_t vbo_reg;
222    uint16_t const_copy_size;
223 
224    uint64_t push_constant_mask;
225 
226    uint32_t ibo_type_32;
227    uint32_t ibo_type_8;
228 
229    uint16_t push_constant_shader_cnt;
230 
231    uint8_t is_dispatch;
232    uint8_t use_preamble;
233 
234    /* For conditional rendering on ACE. */
235    uint8_t predicating;
236    uint8_t predication_type;
237    uint64_t predication_va;
238 };
239 
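/* Flags packed into the top bits of vtx_base_sgpr/grid_base_sgpr: the low 14 bits hold the user
 * SGPR offset (masked with 0x3FFF before use) and bits 14-15 describe whether the bound shader
 * consumes the draw id and base instance (or, for mesh shaders, the grid size).
 */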
240 enum {
241    DGC_USES_DRAWID = 1u << 14,
242    DGC_USES_BASEINSTANCE = 1u << 15,
243    DGC_USES_GRID_SIZE = DGC_USES_BASEINSTANCE, /* Mesh shader only */
244 };
245 
246 enum {
247    DGC_DYNAMIC_STRIDE = 1u << 15,
248 };
249 
250 enum {
251    DGC_DESC_STREAM,
252    DGC_DESC_PREPARE,
253    DGC_DESC_PARAMS,
254    DGC_DESC_COUNT,
255    DGC_NUM_DESCS,
256 };
257 
258 struct dgc_cmdbuf {
259    nir_def *descriptor;
260    nir_variable *offset;
261 
262    enum amd_gfx_level gfx_level;
263    bool sqtt_enabled;
264 };
265 
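/* Append a vector of dwords to the generated command buffer: store it through cs->descriptor at
 * the current cs->offset and advance the offset by the vector size in bytes. All packet emission
 * below funnels through this helper.
 */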
266 static void
267 dgc_emit(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *value)
268 {
269    assert(value->bit_size >= 32);
270    nir_def *offset = nir_load_var(b, cs->offset);
271    nir_store_ssbo(b, value, cs->descriptor, offset, .access = ACCESS_NON_READABLE);
272    nir_store_var(b, cs->offset, nir_iadd_imm(b, offset, value->num_components * value->bit_size / 8), 0x1);
273 }
274 
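/* The load_param* helpers pull individual radv_dgc_params fields out of the push constants.
 * 8/16-bit fields are loaded as the containing 32-bit dword (base rounded down with "& ~3") and
 * extracted with ubfe using the field's byte offset within that dword; 64-bit fields are loaded
 * as two dwords and repacked. For example (illustrative only), a uint16_t at struct offset 14 is
 * read from the dword at offset 12 and extracted from bits [31:16].
 */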
275 #define load_param32(b, field)                                                                                         \
276    nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0), .base = offsetof(struct radv_dgc_params, field), .range = 4)
277 
278 #define load_param16(b, field)                                                                                         \
279    nir_ubfe_imm((b),                                                                                                   \
280                 nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0),                                                \
281                                        .base = (offsetof(struct radv_dgc_params, field) & ~3), .range = 4),            \
282                 (offsetof(struct radv_dgc_params, field) & 2) * 8, 16)
283 
284 #define load_param8(b, field)                                                                                          \
285    nir_ubfe_imm((b),                                                                                                   \
286                 nir_load_push_constant((b), 1, 32, nir_imm_int((b), 0),                                                \
287                                        .base = (offsetof(struct radv_dgc_params, field) & ~3), .range = 4),            \
288                 (offsetof(struct radv_dgc_params, field) & 3) * 8, 8)
289 
290 #define load_param64(b, field)                                                                                         \
291    nir_pack_64_2x32((b), nir_load_push_constant((b), 2, 32, nir_imm_int((b), 0),                                       \
292                                                 .base = offsetof(struct radv_dgc_params, field), .range = 8))
293 
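/* Build a PM4 type-3 packet header whose length is only known at shader run time: the length is
 * masked to the 14-bit count field and shifted into bits [29:16], while the packet type, opcode
 * and predicate bit are compile-time constants OR'ed in as an immediate.
 */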
294 static nir_def *
295 nir_pkt3_base(nir_builder *b, unsigned op, nir_def *len, bool predicate)
296 {
297    len = nir_iand_imm(b, len, 0x3fff);
298    return nir_ior_imm(b, nir_ishl_imm(b, len, 16), PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT3_PREDICATE(predicate));
299 }
300 
301 static nir_def *
302 nir_pkt3(nir_builder *b, unsigned op, nir_def *len)
303 {
304    return nir_pkt3_base(b, op, len, false);
305 }
306 
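/* Single-dword filler used to pad reserved slots in the generated IB: a type-2 NOP on chips that
 * pad IBs with type-2 packets, otherwise a type-3 NOP encoding. The emit helpers below rely on
 * being able to either leave this filler in place or overwrite it.
 */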
307 static nir_def *
308 dgc_get_nop_packet(nir_builder *b, const struct radv_device *device)
309 {
310    if (device->physical_device->rad_info.gfx_ib_pad_with_type2) {
311       return nir_imm_int(b, PKT2_NOP_PAD);
312    } else {
313       return nir_imm_int(b, PKT3_NOP_PAD);
314    }
315 }
316 
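/* Write the vertex-stage user SGPRs (first vertex plus optional draw id / first instance) with a
 * single SET_SH_REG. The emitted block always occupies 5 dwords: the register count and payload
 * order are selected at runtime and unused trailing slots stay single-dword NOPs, so every
 * generated sequence has the fixed size budgeted by radv_get_sequence_size_graphics().
 */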
317 static void
318 dgc_emit_userdata_vertex(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *vtx_base_sgpr, nir_def *first_vertex,
319                          nir_def *first_instance, nir_def *drawid, const struct radv_device *device)
320 {
321    vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr);
322    nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
323    nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE);
324 
325    nir_def *pkt_cnt = nir_imm_int(b, 1);
326    pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
327    pkt_cnt = nir_bcsel(b, has_baseinstance, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
328 
329    nir_def *values[5] = {
330       nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt), nir_iand_imm(b, vtx_base_sgpr, 0x3FFF), first_vertex,
331       dgc_get_nop_packet(b, device),         dgc_get_nop_packet(b, device),
332    };
333 
334    values[3] = nir_bcsel(b, nir_ior(b, has_drawid, has_baseinstance), nir_bcsel(b, has_drawid, drawid, first_instance),
335                          values[4]);
336    values[4] = nir_bcsel(b, nir_iand(b, has_drawid, has_baseinstance), first_instance, values[4]);
337 
338    dgc_emit(b, cs, nir_vec(b, values, 5));
339 }
340 
341 static void
342 dgc_emit_userdata_mesh(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *vtx_base_sgpr, nir_def *x, nir_def *y,
343                        nir_def *z, nir_def *drawid, const struct radv_device *device)
344 {
345    vtx_base_sgpr = nir_u2u32(b, vtx_base_sgpr);
346    nir_def *has_grid_size = nir_test_mask(b, vtx_base_sgpr, DGC_USES_GRID_SIZE);
347    nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
348 
349    nir_push_if(b, nir_ior(b, has_grid_size, has_drawid));
350    {
351       nir_def *pkt_cnt = nir_imm_int(b, 0);
352       pkt_cnt = nir_bcsel(b, has_grid_size, nir_iadd_imm(b, pkt_cnt, 3), pkt_cnt);
353       pkt_cnt = nir_bcsel(b, has_drawid, nir_iadd_imm(b, pkt_cnt, 1), pkt_cnt);
354 
355       nir_def *values[6] = {
356          nir_pkt3(b, PKT3_SET_SH_REG, pkt_cnt), nir_iand_imm(b, vtx_base_sgpr, 0x3FFF), dgc_get_nop_packet(b, device),
357          dgc_get_nop_packet(b, device),         dgc_get_nop_packet(b, device),          dgc_get_nop_packet(b, device),
358       };
359 
360       /* DrawID needs to be first if no GridSize. */
361       values[2] = nir_bcsel(b, has_grid_size, x, drawid);
362       values[3] = nir_bcsel(b, has_grid_size, y, values[3]);
363       values[4] = nir_bcsel(b, has_grid_size, z, values[4]);
364       values[5] = nir_bcsel(b, has_drawid, drawid, values[5]);
365 
366       for (uint32_t i = 0; i < ARRAY_SIZE(values); i++)
367          dgc_emit(b, cs, values[i]);
368    }
369    nir_pop_if(b, NULL);
370 }
371 
372 static void
373 dgc_emit_sqtt_userdata(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *data)
374 {
375    if (!cs->sqtt_enabled)
376       return;
377 
378    nir_def *values[3] = {
379       nir_pkt3_base(b, PKT3_SET_UCONFIG_REG, nir_imm_int(b, 1), cs->gfx_level >= GFX10),
380       nir_imm_int(b, (R_030D08_SQ_THREAD_TRACE_USERDATA_2 - CIK_UCONFIG_REG_OFFSET) >> 2),
381       data,
382    };
383 
384    dgc_emit(b, cs, nir_vec(b, values, 3));
385 }
386 
387 static void
388 dgc_emit_sqtt_thread_trace_marker(nir_builder *b, struct dgc_cmdbuf *cs)
389 {
390    if (!cs->sqtt_enabled)
391       return;
392 
393    nir_def *values[2] = {
394       nir_pkt3(b, PKT3_EVENT_WRITE, nir_imm_int(b, 0)),
395       nir_imm_int(b, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER | EVENT_INDEX(0))),
396    };
397 
398    dgc_emit(b, cs, nir_vec(b, values, 2));
399 }
400 
401 static void
402 dgc_emit_sqtt_marker_event(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *sequence_id,
403                            enum rgp_sqtt_marker_event_type event)
404 {
405    struct rgp_sqtt_marker_event marker = {0};
406 
407    marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
408    marker.api_type = event;
409 
410    dgc_emit_sqtt_userdata(b, cs, nir_imm_int(b, marker.dword01));
411    dgc_emit_sqtt_userdata(b, cs, nir_imm_int(b, marker.dword02));
412    dgc_emit_sqtt_userdata(b, cs, sequence_id);
413 }
414 
415 static void
416 dgc_emit_sqtt_marker_event_with_dims(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *sequence_id, nir_def *x,
417                                      nir_def *y, nir_def *z, enum rgp_sqtt_marker_event_type event)
418 {
419    struct rgp_sqtt_marker_event_with_dims marker = {0};
420 
421    marker.event.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
422    marker.event.api_type = event;
423    marker.event.has_thread_dims = 1;
424 
425    dgc_emit_sqtt_userdata(b, cs, nir_imm_int(b, marker.event.dword01));
426    dgc_emit_sqtt_userdata(b, cs, nir_imm_int(b, marker.event.dword02));
427    dgc_emit_sqtt_userdata(b, cs, sequence_id);
428    dgc_emit_sqtt_userdata(b, cs, x);
429    dgc_emit_sqtt_userdata(b, cs, y);
430    dgc_emit_sqtt_userdata(b, cs, z);
431 }
432 
433 static void
434 dgc_emit_sqtt_begin_api_marker(nir_builder *b, struct dgc_cmdbuf *cs, enum rgp_sqtt_marker_general_api_type api_type)
435 {
436    struct rgp_sqtt_marker_general_api marker = {0};
437 
438    marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_GENERAL_API;
439    marker.api_type = api_type;
440 
441    dgc_emit_sqtt_userdata(b, cs, nir_imm_int(b, marker.dword01));
442 }
443 
444 static void
445 dgc_emit_sqtt_end_api_marker(nir_builder *b, struct dgc_cmdbuf *cs, enum rgp_sqtt_marker_general_api_type api_type)
446 {
447    struct rgp_sqtt_marker_general_api marker = {0};
448 
449    marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_GENERAL_API;
450    marker.api_type = api_type;
451    marker.is_end = 1;
452 
453    dgc_emit_sqtt_userdata(b, cs, nir_imm_int(b, marker.dword01));
454 }
455 
456 static void
457 dgc_emit_instance_count(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *instance_count)
458 {
459    nir_def *values[2] = {nir_imm_int(b, PKT3(PKT3_NUM_INSTANCES, 0, false)), instance_count};
460 
461    dgc_emit(b, cs, nir_vec(b, values, 2));
462 }
463 
464 static void
465 dgc_emit_draw_index_offset_2(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *index_offset, nir_def *index_count,
466                              nir_def *max_index_count)
467 {
468    nir_def *values[5] = {nir_imm_int(b, PKT3(PKT3_DRAW_INDEX_OFFSET_2, 3, false)), max_index_count, index_offset,
469                          index_count, nir_imm_int(b, V_0287F0_DI_SRC_SEL_DMA)};
470 
471    dgc_emit(b, cs, nir_vec(b, values, 5));
472 }
473 
474 static void
475 dgc_emit_draw_index_auto(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *vertex_count)
476 {
477    nir_def *values[3] = {nir_imm_int(b, PKT3(PKT3_DRAW_INDEX_AUTO, 1, false)), vertex_count,
478                          nir_imm_int(b, V_0287F0_DI_SRC_SEL_AUTO_INDEX)};
479 
480    dgc_emit(b, cs, nir_vec(b, values, 3));
481 }
482 
483 static void
484 dgc_emit_dispatch_direct(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *wg_x, nir_def *wg_y, nir_def *wg_z,
485                          nir_def *dispatch_initiator)
486 {
487    nir_def *values[5] = {nir_imm_int(b, PKT3(PKT3_DISPATCH_DIRECT, 3, false) | PKT3_SHADER_TYPE_S(1)), wg_x, wg_y, wg_z,
488                          dispatch_initiator};
489 
490    dgc_emit(b, cs, nir_vec(b, values, 5));
491 }
492 
493 static void
494 dgc_emit_dispatch_mesh_direct(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *x, nir_def *y, nir_def *z)
495 {
496    nir_def *values[5] = {nir_imm_int(b, PKT3(PKT3_DISPATCH_MESH_DIRECT, 3, false)), x, y, z,
497                          nir_imm_int(b, S_0287F0_SOURCE_SELECT(V_0287F0_DI_SRC_SEL_AUTO_INDEX))};
498 
499    dgc_emit(b, cs, nir_vec(b, values, 5));
500 }
501 
502 static void
503 dgc_emit_grid_size_user_sgpr(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *grid_base_sgpr, nir_def *wg_x,
504                              nir_def *wg_y, nir_def *wg_z)
505 {
506    nir_def *values[5] = {
507       nir_imm_int(b, PKT3(PKT3_SET_SH_REG, 3, false)), grid_base_sgpr, wg_x, wg_y, wg_z,
508    };
509 
510    dgc_emit(b, cs, nir_vec(b, values, 5));
511 }
512 
513 static void
514 dgc_emit_grid_size_pointer(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *grid_base_sgpr, nir_def *stream_offset)
515 {
516    nir_def *stream_addr = load_param64(b, stream_addr);
517    nir_def *va = nir_iadd(b, stream_addr, nir_u2u64(b, stream_offset));
518 
519    nir_def *va_lo = nir_unpack_64_2x32_split_x(b, va);
520    nir_def *va_hi = nir_unpack_64_2x32_split_y(b, va);
521 
522    nir_def *values[4] = {nir_imm_int(b, PKT3(PKT3_SET_SH_REG, 2, false)), grid_base_sgpr, va_lo, va_hi};
523 
524    dgc_emit(b, cs, nir_vec(b, values, 4));
525 }
526 
527 static void
528 dgc_emit_pkt3_set_base(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *va)
529 {
530    nir_def *va_lo = nir_unpack_64_2x32_split_x(b, va);
531    nir_def *va_hi = nir_unpack_64_2x32_split_y(b, va);
532 
533    nir_def *values[4] = {nir_imm_int(b, PKT3(PKT3_SET_BASE, 2, false)), nir_imm_int(b, 1), va_lo, va_hi};
534 
535    dgc_emit(b, cs, nir_vec(b, values, 4));
536 }
537 
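/* Two packet shapes, chosen by whether the pipeline consumes gl_DrawID: DRAW_*_INDIRECT_MULTI
 * with a draw count of 1 and a null count address when a draw-id register must be written,
 * otherwise the plain DRAW_*_INDIRECT form. Together with the 4-dword SET_BASE emitted by the
 * caller this matches the (4 + (uses_drawid ? 10 : 5)) * 4 byte estimate in
 * radv_get_sequence_size_graphics().
 */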
538 static void
539 dgc_emit_pkt3_draw_indirect(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *vtx_base_sgpr, bool indexed)
540 {
541    const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
542 
543    vtx_base_sgpr = nir_iand_imm(b, nir_u2u32(b, vtx_base_sgpr), 0x3FFF);
544 
545    nir_def *has_drawid = nir_test_mask(b, vtx_base_sgpr, DGC_USES_DRAWID);
546    nir_def *has_baseinstance = nir_test_mask(b, vtx_base_sgpr, DGC_USES_BASEINSTANCE);
547 
548    /* vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2 */
549    nir_def *vertex_offset_reg = vtx_base_sgpr;
550 
551    /* start_instance_reg = (base_reg + (draw_id_enable ? 8 : 4) - SI_SH_REG_OFFSET) >> 2 */
552    nir_def *start_instance_offset = nir_bcsel(b, has_drawid, nir_imm_int(b, 2), nir_imm_int(b, 1));
553    nir_def *start_instance_reg = nir_iadd(b, vtx_base_sgpr, start_instance_offset);
554 
555    /* draw_id_reg = (base_reg + 4 - SI_SH_REG_OFFSET) >> 2 */
556    nir_def *draw_id_reg = nir_iadd(b, vtx_base_sgpr, nir_imm_int(b, 1));
557 
558    nir_if *if_drawid = nir_push_if(b, has_drawid);
559    {
560       const unsigned pkt3_op = indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI;
561 
562       nir_def *values[8];
563       values[0] = nir_imm_int(b, PKT3(pkt3_op, 8, false));
564       values[1] = nir_imm_int(b, 0);
565       values[2] = vertex_offset_reg;
566       values[3] = nir_bcsel(b, has_baseinstance, start_instance_reg, nir_imm_int(b, 0));
567       values[4] = nir_ior(b, draw_id_reg, nir_imm_int(b, S_2C3_DRAW_INDEX_ENABLE(1)));
568       values[5] = nir_imm_int(b, 1); /* draw count */
569       values[6] = nir_imm_int(b, 0); /* count va low */
570       values[7] = nir_imm_int(b, 0); /* count va high */
571 
572       dgc_emit(b, cs, nir_vec(b, values, 8));
573 
574       values[0] = nir_imm_int(b, 0); /* stride */
575       values[1] = nir_imm_int(b, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
576 
577       dgc_emit(b, cs, nir_vec(b, values, 2));
578    }
579    nir_push_else(b, if_drawid);
580    {
581       const unsigned pkt3_op = indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT;
582 
583       nir_def *values[5];
584       values[0] = nir_imm_int(b, PKT3(pkt3_op, 3, false));
585       values[1] = nir_imm_int(b, 0);
586       values[2] = vertex_offset_reg;
587       values[3] = nir_bcsel(b, has_baseinstance, start_instance_reg, nir_imm_int(b, 0));
588       values[4] = nir_imm_int(b, di_src_sel);
589 
590       dgc_emit(b, cs, nir_vec(b, values, 5));
591    }
592    nir_pop_if(b, if_drawid);
593 }
594 
595 static void
596 dgc_emit_draw_indirect(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_base, nir_def *draw_params_offset,
597                        bool indexed)
598 {
599    nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
600    nir_def *stream_offset = nir_iadd(b, draw_params_offset, stream_base);
601 
602    nir_def *stream_addr = load_param64(b, stream_addr);
603    nir_def *va = nir_iadd(b, stream_addr, nir_u2u64(b, stream_offset));
604 
605    dgc_emit_pkt3_set_base(b, cs, va);
606    dgc_emit_pkt3_draw_indirect(b, cs, vtx_base_sgpr, indexed);
607 }
608 
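/* Size in bytes of the main generated IB: stride * sequence count, rounded up to the GFX IB
 * alignment, and clamped to at least one aligned chunk so the preamble never chains into an empty
 * IB. Without a preamble the precomputed cmd_buf_size is used as-is. Rough example (the alignment
 * is hardware-dependent): 100 sequences of a 60-byte stride with 256-byte alignment rounds 6000
 * up to 6144 bytes.
 */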
609 static nir_def *
610 dgc_cmd_buf_size(nir_builder *b, nir_def *sequence_count, const struct radv_device *device)
611 {
612    nir_def *use_preamble = nir_ine_imm(b, load_param8(b, use_preamble), 0);
613    nir_def *cmd_buf_size = load_param32(b, cmd_buf_size);
614    nir_def *cmd_buf_stride = load_param32(b, cmd_buf_stride);
615    nir_def *size = nir_imul(b, cmd_buf_stride, sequence_count);
616    unsigned align_mask = radv_align_cmdbuf_size(device, 1, AMD_IP_GFX) - 1;
617 
618    size = nir_iand_imm(b, nir_iadd_imm(b, size, align_mask), ~align_mask);
619 
620    /* Ensure we don't have to deal with a jump to an empty IB in the preamble. */
621    size = nir_imax(b, size, nir_imm_int(b, align_mask + 1));
622 
623    return nir_bcsel(b, use_preamble, size, cmd_buf_size);
624 }
625 
626 static nir_def *
627 dgc_main_cmd_buf_offset(nir_builder *b, const struct radv_device *device)
628 {
629    nir_def *use_preamble = nir_ine_imm(b, load_param8(b, use_preamble), 0);
630    nir_def *base_offset = nir_imm_int(b, radv_dgc_preamble_cmdbuf_size(device));
631    return nir_bcsel(b, use_preamble, base_offset, nir_imm_int(b, 0));
632 }
633 
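/* A single thread pads the gap between the end of the last generated sequence and the aligned end
 * of the IB: one-dword type-2 NOPs where the hardware requires them, otherwise as few large
 * type-3 NOP packets as possible, each capped at MAX_PACKET_WORDS dwords.
 */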
634 static void
635 build_dgc_buffer_tail(nir_builder *b, nir_def *sequence_count, const struct radv_device *device)
636 {
637    nir_def *global_id = get_global_ids(b, 1);
638 
639    nir_def *cmd_buf_stride = load_param32(b, cmd_buf_stride);
640    nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, device);
641 
642    nir_push_if(b, nir_ieq_imm(b, global_id, 0));
643    {
644       nir_def *base_offset = dgc_main_cmd_buf_offset(b, device);
645       nir_def *cmd_buf_tail_start = nir_imul(b, cmd_buf_stride, sequence_count);
646 
647       nir_variable *offset = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "offset");
648       nir_store_var(b, offset, cmd_buf_tail_start, 0x1);
649 
650       nir_def *dst_buf = radv_meta_load_descriptor(b, 0, DGC_DESC_PREPARE);
651       nir_push_loop(b);
652       {
653          nir_def *curr_offset = nir_load_var(b, offset);
654          const unsigned MAX_PACKET_WORDS = 0x3FFC;
655 
656          nir_push_if(b, nir_ieq(b, curr_offset, cmd_buf_size));
657          {
658             nir_jump(b, nir_jump_break);
659          }
660          nir_pop_if(b, NULL);
661 
662          nir_def *packet, *packet_size;
663 
664          if (device->physical_device->rad_info.gfx_ib_pad_with_type2) {
665             packet_size = nir_imm_int(b, 4);
666             packet = nir_imm_int(b, PKT2_NOP_PAD);
667          } else {
668             packet_size = nir_isub(b, cmd_buf_size, curr_offset);
669             packet_size = nir_umin(b, packet_size, nir_imm_int(b, MAX_PACKET_WORDS * 4));
670 
671             nir_def *len = nir_ushr_imm(b, packet_size, 2);
672             len = nir_iadd_imm(b, len, -2);
673             packet = nir_pkt3(b, PKT3_NOP, len);
674          }
675 
676          nir_store_ssbo(b, packet, dst_buf, nir_iadd(b, curr_offset, base_offset), .access = ACCESS_NON_READABLE);
677          nir_store_var(b, offset, nir_iadd(b, curr_offset, packet_size), 0x1);
678       }
679       nir_pop_loop(b, NULL);
680    }
681    nir_pop_if(b, NULL);
682 }
683 
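/* Write the optional fixed-size preamble IB: mostly NOP padding followed by a 4-dword
 * PKT3_INDIRECT_BUFFER that chains to the main generated commands (address low/high plus a
 * control dword carrying the size in dwords and the CHAIN/VALID bits).
 */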
684 static void
685 build_dgc_buffer_preamble(nir_builder *b, nir_def *sequence_count, const struct radv_device *device)
686 {
687    nir_def *global_id = get_global_ids(b, 1);
688    nir_def *use_preamble = nir_ine_imm(b, load_param8(b, use_preamble), 0);
689 
690    nir_push_if(b, nir_iand(b, nir_ieq_imm(b, global_id, 0), use_preamble));
691    {
692       unsigned preamble_size = radv_dgc_preamble_cmdbuf_size(device);
693       nir_def *cmd_buf_size = dgc_cmd_buf_size(b, sequence_count, device);
694       nir_def *dst_buf = radv_meta_load_descriptor(b, 0, DGC_DESC_PREPARE);
695 
696       nir_def *words = nir_ushr_imm(b, cmd_buf_size, 2);
697 
698       nir_def *addr = nir_iadd_imm(b, load_param32(b, upload_addr), preamble_size);
699 
700       nir_def *nop_packet = dgc_get_nop_packet(b, device);
701 
702       nir_def *nop_packets[] = {
703          nop_packet,
704          nop_packet,
705          nop_packet,
706          nop_packet,
707       };
708 
709       const unsigned jump_size = 16;
710       unsigned offset;
711 
712       /* Do vectorized store if possible */
713       for (offset = 0; offset + 16 <= preamble_size - jump_size; offset += 16) {
714          nir_store_ssbo(b, nir_vec(b, nop_packets, 4), dst_buf, nir_imm_int(b, offset), .access = ACCESS_NON_READABLE);
715       }
716 
717       for (; offset + 4 <= preamble_size - jump_size; offset += 4) {
718          nir_store_ssbo(b, nop_packet, dst_buf, nir_imm_int(b, offset), .access = ACCESS_NON_READABLE);
719       }
720 
721       nir_def *chain_packets[] = {
722          nir_imm_int(b, PKT3(PKT3_INDIRECT_BUFFER, 2, 0)),
723          addr,
724          nir_imm_int(b, device->physical_device->rad_info.address32_hi),
725          nir_ior_imm(b, words, S_3F2_CHAIN(1) | S_3F2_VALID(1) | S_3F2_PRE_ENA(false)),
726       };
727 
728       nir_store_ssbo(b, nir_vec(b, chain_packets, 4), dst_buf, nir_imm_int(b, preamble_size - jump_size),
729                      .access = ACCESS_NON_READABLE);
730    }
731    nir_pop_if(b, NULL);
732 }
733 
734 /**
735  * Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV.
736  */
737 static void
738 dgc_emit_draw(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_buf, nir_def *stream_base,
739               nir_def *draw_params_offset, nir_def *sequence_id, const struct radv_device *device)
740 {
741    nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
742    nir_def *stream_offset = nir_iadd(b, draw_params_offset, stream_base);
743 
744    nir_def *draw_data0 = nir_load_ssbo(b, 4, 32, stream_buf, stream_offset);
745    nir_def *vertex_count = nir_channel(b, draw_data0, 0);
746    nir_def *instance_count = nir_channel(b, draw_data0, 1);
747    nir_def *vertex_offset = nir_channel(b, draw_data0, 2);
748    nir_def *first_instance = nir_channel(b, draw_data0, 3);
749 
750    nir_push_if(b, nir_iand(b, nir_ine_imm(b, vertex_count, 0), nir_ine_imm(b, instance_count, 0)));
751    {
752       dgc_emit_sqtt_begin_api_marker(b, cs, ApiCmdDraw);
753       dgc_emit_sqtt_marker_event(b, cs, sequence_id, EventCmdDraw);
754 
755       dgc_emit_userdata_vertex(b, cs, vtx_base_sgpr, vertex_offset, first_instance, sequence_id, device);
756       dgc_emit_instance_count(b, cs, instance_count);
757       dgc_emit_draw_index_auto(b, cs, vertex_count);
758 
759       dgc_emit_sqtt_thread_trace_marker(b, cs);
760       dgc_emit_sqtt_end_api_marker(b, cs, ApiCmdDraw);
761    }
762    nir_pop_if(b, 0);
763 }
764 
765 /**
766  * Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV.
767  */
768 static void
769 dgc_emit_draw_indexed(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_buf, nir_def *stream_base,
770                       nir_def *draw_params_offset, nir_def *sequence_id, nir_def *max_index_count,
771                       const struct radv_device *device)
772 {
773    nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
774    nir_def *stream_offset = nir_iadd(b, draw_params_offset, stream_base);
775 
776    nir_def *draw_data0 = nir_load_ssbo(b, 4, 32, stream_buf, stream_offset);
777    nir_def *draw_data1 = nir_load_ssbo(b, 1, 32, stream_buf, nir_iadd_imm(b, stream_offset, 16));
778    nir_def *index_count = nir_channel(b, draw_data0, 0);
779    nir_def *instance_count = nir_channel(b, draw_data0, 1);
780    nir_def *first_index = nir_channel(b, draw_data0, 2);
781    nir_def *vertex_offset = nir_channel(b, draw_data0, 3);
782    nir_def *first_instance = nir_channel(b, draw_data1, 0);
783 
784    nir_push_if(b, nir_iand(b, nir_ine_imm(b, index_count, 0), nir_ine_imm(b, instance_count, 0)));
785    {
786       dgc_emit_sqtt_begin_api_marker(b, cs, ApiCmdDrawIndexed);
787       dgc_emit_sqtt_marker_event(b, cs, sequence_id, EventCmdDrawIndexed);
788 
789       dgc_emit_userdata_vertex(b, cs, vtx_base_sgpr, vertex_offset, first_instance, sequence_id, device);
790       dgc_emit_instance_count(b, cs, instance_count);
791       dgc_emit_draw_index_offset_2(b, cs, first_index, index_count, max_index_count);
792 
793       dgc_emit_sqtt_thread_trace_marker(b, cs);
794       dgc_emit_sqtt_end_api_marker(b, cs, ApiCmdDrawIndexed);
795    }
796    nir_pop_if(b, 0);
797 }
798 
799 /**
800  * Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV.
801  */
802 static void
803 dgc_emit_index_buffer(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_buf, nir_def *stream_base,
804                       nir_def *index_buffer_offset, nir_def *ibo_type_32, nir_def *ibo_type_8,
805                       nir_variable *max_index_count_var, const struct radv_device *device)
806 {
807    nir_def *index_stream_offset = nir_iadd(b, index_buffer_offset, stream_base);
808    nir_def *data = nir_load_ssbo(b, 4, 32, stream_buf, index_stream_offset);
809 
810    nir_def *vk_index_type = nir_channel(b, data, 3);
811    nir_def *index_type = nir_bcsel(b, nir_ieq(b, vk_index_type, ibo_type_32), nir_imm_int(b, V_028A7C_VGT_INDEX_32),
812                                    nir_imm_int(b, V_028A7C_VGT_INDEX_16));
813    index_type = nir_bcsel(b, nir_ieq(b, vk_index_type, ibo_type_8), nir_imm_int(b, V_028A7C_VGT_INDEX_8), index_type);
814 
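   /* 0x142 acts as a nibble-wide lookup table indexed by the VGT index type: shifting by
    * 4 * index_type and masking with 0xf yields the index size in bytes (2, 4 or 1).
    */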
815    nir_def *index_size = nir_iand_imm(b, nir_ushr(b, nir_imm_int(b, 0x142), nir_imul_imm(b, index_type, 4)), 0xf);
816 
817    nir_def *max_index_count = nir_udiv(b, nir_channel(b, data, 2), index_size);
818    nir_store_var(b, max_index_count_var, max_index_count, 0x1);
819 
820    nir_def *cmd_values[3 + 2 + 3];
821 
822    if (device->physical_device->rad_info.gfx_level >= GFX9) {
823       unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX;
824       if (device->physical_device->rad_info.gfx_level < GFX9 ||
825           (device->physical_device->rad_info.gfx_level == GFX9 && device->physical_device->rad_info.me_fw_version < 26))
826          opcode = PKT3_SET_UCONFIG_REG;
827       cmd_values[0] = nir_imm_int(b, PKT3(opcode, 1, 0));
828       cmd_values[1] = nir_imm_int(b, (R_03090C_VGT_INDEX_TYPE - CIK_UCONFIG_REG_OFFSET) >> 2 | (2u << 28));
829       cmd_values[2] = index_type;
830    } else {
831       cmd_values[0] = nir_imm_int(b, PKT3(PKT3_INDEX_TYPE, 0, 0));
832       cmd_values[1] = index_type;
833       cmd_values[2] = dgc_get_nop_packet(b, device);
834    }
835 
836    nir_def *addr_upper = nir_channel(b, data, 1);
837    addr_upper = nir_ishr_imm(b, nir_ishl_imm(b, addr_upper, 16), 16);
838 
839    cmd_values[3] = nir_imm_int(b, PKT3(PKT3_INDEX_BASE, 1, 0));
840    cmd_values[4] = nir_channel(b, data, 0);
841    cmd_values[5] = addr_upper;
842    cmd_values[6] = nir_imm_int(b, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
843    cmd_values[7] = max_index_count;
844 
845    dgc_emit(b, cs, nir_vec(b, cmd_values, 8));
846 }
847 
848 /**
849  * Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV.
850  */
851 static void
852 dgc_emit_push_constant(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_buf, nir_def *stream_base,
853                        nir_def *push_const_mask, nir_variable *upload_offset)
854 {
855    nir_def *vbo_cnt = load_param8(b, vbo_cnt);
856    nir_def *const_copy = nir_ine_imm(b, load_param8(b, const_copy), 0);
857    nir_def *const_copy_size = load_param16(b, const_copy_size);
858    nir_def *const_copy_words = nir_ushr_imm(b, const_copy_size, 2);
859    const_copy_words = nir_bcsel(b, const_copy, const_copy_words, nir_imm_int(b, 0));
860 
861    nir_variable *idx = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "const_copy_idx");
862    nir_store_var(b, idx, nir_imm_int(b, 0), 0x1);
863 
864    nir_def *param_buf = radv_meta_load_descriptor(b, 0, DGC_DESC_PARAMS);
865    nir_def *param_offset = nir_imul_imm(b, vbo_cnt, 24);
866    nir_def *param_offset_offset = nir_iadd_imm(b, param_offset, MESA_VULKAN_SHADER_STAGES * 12);
867    nir_def *param_const_offset =
868       nir_iadd_imm(b, param_offset, MAX_PUSH_CONSTANTS_SIZE + MESA_VULKAN_SHADER_STAGES * 12);
869    nir_push_loop(b);
870    {
871       nir_def *cur_idx = nir_load_var(b, idx);
872       nir_push_if(b, nir_uge(b, cur_idx, const_copy_words));
873       {
874          nir_jump(b, nir_jump_break);
875       }
876       nir_pop_if(b, NULL);
877 
878       nir_variable *data = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "copy_data");
879 
880       nir_def *update = nir_iand(b, push_const_mask, nir_ishl(b, nir_imm_int64(b, 1), cur_idx));
881       update = nir_bcsel(b, nir_ult_imm(b, cur_idx, 64 /* bits in push_const_mask */), update, nir_imm_int64(b, 0));
882 
883       nir_push_if(b, nir_ine_imm(b, update, 0));
884       {
885          nir_def *stream_offset =
886             nir_load_ssbo(b, 1, 32, param_buf, nir_iadd(b, param_offset_offset, nir_ishl_imm(b, cur_idx, 2)));
887          nir_def *new_data = nir_load_ssbo(b, 1, 32, stream_buf, nir_iadd(b, stream_base, stream_offset));
888          nir_store_var(b, data, new_data, 0x1);
889       }
890       nir_push_else(b, NULL);
891       {
892          nir_store_var(b, data,
893                        nir_load_ssbo(b, 1, 32, param_buf, nir_iadd(b, param_const_offset, nir_ishl_imm(b, cur_idx, 2))),
894                        0x1);
895       }
896       nir_pop_if(b, NULL);
897 
898       nir_store_ssbo(b, nir_load_var(b, data), cs->descriptor,
899                      nir_iadd(b, nir_load_var(b, upload_offset), nir_ishl_imm(b, cur_idx, 2)),
900                      .access = ACCESS_NON_READABLE);
901 
902       nir_store_var(b, idx, nir_iadd_imm(b, cur_idx, 1), 0x1);
903    }
904    nir_pop_loop(b, NULL);
905 
906    nir_variable *shader_idx = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "shader_idx");
907    nir_store_var(b, shader_idx, nir_imm_int(b, 0), 0x1);
908    nir_def *shader_cnt = load_param16(b, push_constant_shader_cnt);
909 
910    nir_push_loop(b);
911    {
912       nir_def *cur_shader_idx = nir_load_var(b, shader_idx);
913       nir_push_if(b, nir_uge(b, cur_shader_idx, shader_cnt));
914       {
915          nir_jump(b, nir_jump_break);
916       }
917       nir_pop_if(b, NULL);
918 
919       nir_def *reg_info =
920          nir_load_ssbo(b, 3, 32, param_buf, nir_iadd(b, param_offset, nir_imul_imm(b, cur_shader_idx, 12)));
921       nir_def *upload_sgpr = nir_ubfe_imm(b, nir_channel(b, reg_info, 0), 0, 16);
922       nir_def *inline_sgpr = nir_ubfe_imm(b, nir_channel(b, reg_info, 0), 16, 16);
923       nir_def *inline_mask = nir_pack_64_2x32(b, nir_channels(b, reg_info, 0x6));
924 
925       nir_push_if(b, nir_ine_imm(b, upload_sgpr, 0));
926       {
927          nir_def *pkt[3] = {nir_imm_int(b, PKT3(PKT3_SET_SH_REG, 1, 0)), upload_sgpr,
928                             nir_iadd(b, load_param32(b, upload_addr), nir_load_var(b, upload_offset))};
929 
930          dgc_emit(b, cs, nir_vec(b, pkt, 3));
931       }
932       nir_pop_if(b, NULL);
933 
934       nir_push_if(b, nir_ine_imm(b, inline_sgpr, 0));
935       {
936          nir_store_var(b, idx, nir_imm_int(b, 0), 0x1);
937 
938          nir_variable *pc_idx = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "pc_idx");
939          nir_store_var(b, pc_idx, nir_imm_int(b, 0), 0x1);
940 
941          nir_push_loop(b);
942          {
943             nir_def *cur_idx = nir_load_var(b, idx);
944             nir_push_if(b, nir_uge_imm(b, cur_idx, 64 /* bits in inline_mask */));
945             {
946                nir_jump(b, nir_jump_break);
947             }
948             nir_pop_if(b, NULL);
949 
950             nir_def *l = nir_ishl(b, nir_imm_int64(b, 1), cur_idx);
951             nir_push_if(b, nir_ieq_imm(b, nir_iand(b, l, inline_mask), 0));
952             {
953                nir_store_var(b, idx, nir_iadd_imm(b, cur_idx, 1), 0x1);
954                nir_jump(b, nir_jump_continue);
955             }
956             nir_pop_if(b, NULL);
957 
958             nir_variable *data = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "copy_data");
959 
960             nir_def *update = nir_iand(b, push_const_mask, nir_ishl(b, nir_imm_int64(b, 1), cur_idx));
961             update =
962                nir_bcsel(b, nir_ult_imm(b, cur_idx, 64 /* bits in push_const_mask */), update, nir_imm_int64(b, 0));
963 
964             nir_push_if(b, nir_ine_imm(b, update, 0));
965             {
966                nir_def *stream_offset =
967                   nir_load_ssbo(b, 1, 32, param_buf, nir_iadd(b, param_offset_offset, nir_ishl_imm(b, cur_idx, 2)));
968                nir_def *new_data = nir_load_ssbo(b, 1, 32, stream_buf, nir_iadd(b, stream_base, stream_offset));
969                nir_store_var(b, data, new_data, 0x1);
970 
971                nir_def *pkt[3] = {nir_pkt3(b, PKT3_SET_SH_REG, nir_imm_int(b, 1)),
972                                   nir_iadd(b, inline_sgpr, nir_load_var(b, pc_idx)), nir_load_var(b, data)};
973 
974                dgc_emit(b, cs, nir_vec(b, pkt, 3));
975             }
976             nir_pop_if(b, NULL);
977 
978             nir_store_var(b, idx, nir_iadd_imm(b, cur_idx, 1), 0x1);
979             nir_store_var(b, pc_idx, nir_iadd_imm(b, nir_load_var(b, pc_idx), 1), 0x1);
980          }
981          nir_pop_loop(b, NULL);
982       }
983       nir_pop_if(b, NULL);
984       nir_store_var(b, shader_idx, nir_iadd_imm(b, cur_shader_idx, 1), 0x1);
985    }
986    nir_pop_loop(b, NULL);
987 }
988 
989 /**
990  * For emitting VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV.
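 *
 * Each VBO descriptor is 16 bytes: the rebuilt descriptors are written to the upload buffer and a
 * single SET_SH_REG then points the shader's vertex-buffer descriptor pointer (vbo_reg) at that
 * upload address, matching the 16 bytes of upload plus one 3-dword packet per VBO budgeted in
 * radv_get_sequence_size_graphics().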
991  */
992 static void
993 dgc_emit_vertex_buffer(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_buf, nir_def *stream_base,
994                        nir_def *vbo_bind_mask, nir_variable *upload_offset, const struct radv_device *device)
995 {
996    nir_def *vbo_cnt = load_param8(b, vbo_cnt);
997    nir_variable *vbo_idx = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "vbo_idx");
998    nir_store_var(b, vbo_idx, nir_imm_int(b, 0), 0x1);
999 
1000    nir_push_loop(b);
1001    {
1002       nir_push_if(b, nir_uge(b, nir_load_var(b, vbo_idx), vbo_cnt));
1003       {
1004          nir_jump(b, nir_jump_break);
1005       }
1006       nir_pop_if(b, NULL);
1007 
1008       nir_def *vbo_offset = nir_imul_imm(b, nir_load_var(b, vbo_idx), 16);
1009       nir_variable *vbo_data = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uvec4_type(), "vbo_data");
1010 
1011       nir_def *param_buf = radv_meta_load_descriptor(b, 0, DGC_DESC_PARAMS);
1012       nir_store_var(b, vbo_data, nir_load_ssbo(b, 4, 32, param_buf, vbo_offset), 0xf);
1013 
1014       nir_def *vbo_override =
1015          nir_ine_imm(b, nir_iand(b, vbo_bind_mask, nir_ishl(b, nir_imm_int(b, 1), nir_load_var(b, vbo_idx))), 0);
1016       nir_push_if(b, vbo_override);
1017       {
1018          nir_def *vbo_offset_offset =
1019             nir_iadd(b, nir_imul_imm(b, vbo_cnt, 16), nir_imul_imm(b, nir_load_var(b, vbo_idx), 8));
1020          nir_def *vbo_over_data = nir_load_ssbo(b, 2, 32, param_buf, vbo_offset_offset);
1021          nir_def *stream_offset = nir_iadd(b, stream_base, nir_iand_imm(b, nir_channel(b, vbo_over_data, 0), 0x7FFF));
1022          nir_def *stream_data = nir_load_ssbo(b, 4, 32, stream_buf, stream_offset);
1023 
1024          nir_def *va = nir_pack_64_2x32(b, nir_trim_vector(b, stream_data, 2));
1025          nir_def *size = nir_channel(b, stream_data, 2);
1026          nir_def *stride = nir_channel(b, stream_data, 3);
1027 
1028          nir_def *dyn_stride = nir_test_mask(b, nir_channel(b, vbo_over_data, 0), DGC_DYNAMIC_STRIDE);
1029          nir_def *old_stride = nir_ubfe_imm(b, nir_channel(b, nir_load_var(b, vbo_data), 1), 16, 14);
1030          stride = nir_bcsel(b, dyn_stride, stride, old_stride);
1031 
1032          nir_def *use_per_attribute_vb_descs = nir_test_mask(b, nir_channel(b, vbo_over_data, 0), 1u << 31);
1033          nir_variable *num_records =
1034             nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "num_records");
1035          nir_store_var(b, num_records, size, 0x1);
1036 
1037          nir_push_if(b, use_per_attribute_vb_descs);
1038          {
1039             nir_def *attrib_end = nir_ubfe_imm(b, nir_channel(b, vbo_over_data, 1), 16, 16);
1040             nir_def *attrib_index_offset = nir_ubfe_imm(b, nir_channel(b, vbo_over_data, 1), 0, 16);
1041 
1042             nir_push_if(b, nir_ult(b, nir_load_var(b, num_records), attrib_end));
1043             {
1044                nir_store_var(b, num_records, nir_imm_int(b, 0), 0x1);
1045             }
1046             nir_push_else(b, NULL);
1047             nir_push_if(b, nir_ieq_imm(b, stride, 0));
1048             {
1049                nir_store_var(b, num_records, nir_imm_int(b, 1), 0x1);
1050             }
1051             nir_push_else(b, NULL);
1052             {
1053                nir_def *r = nir_iadd(
1054                   b, nir_iadd_imm(b, nir_udiv(b, nir_isub(b, nir_load_var(b, num_records), attrib_end), stride), 1),
1055                   attrib_index_offset);
1056                nir_store_var(b, num_records, r, 0x1);
1057             }
1058             nir_pop_if(b, NULL);
1059             nir_pop_if(b, NULL);
1060 
1061             nir_def *convert_cond = nir_ine_imm(b, nir_load_var(b, num_records), 0);
1062             if (device->physical_device->rad_info.gfx_level == GFX9)
1063                convert_cond = nir_imm_false(b);
1064             else if (device->physical_device->rad_info.gfx_level != GFX8)
1065                convert_cond = nir_iand(b, convert_cond, nir_ieq_imm(b, stride, 0));
1066 
1067             nir_def *new_records =
1068                nir_iadd(b, nir_imul(b, nir_iadd_imm(b, nir_load_var(b, num_records), -1), stride), attrib_end);
1069             new_records = nir_bcsel(b, convert_cond, new_records, nir_load_var(b, num_records));
1070             nir_store_var(b, num_records, new_records, 0x1);
1071          }
1072          nir_push_else(b, NULL);
1073          {
1074             if (device->physical_device->rad_info.gfx_level != GFX8) {
1075                nir_push_if(b, nir_ine_imm(b, stride, 0));
1076                {
1077                   nir_def *r = nir_iadd(b, nir_load_var(b, num_records), nir_iadd_imm(b, stride, -1));
1078                   nir_store_var(b, num_records, nir_udiv(b, r, stride), 0x1);
1079                }
1080                nir_pop_if(b, NULL);
1081             }
1082          }
1083          nir_pop_if(b, NULL);
1084 
1085          nir_def *rsrc_word3 = nir_channel(b, nir_load_var(b, vbo_data), 3);
1086          if (device->physical_device->rad_info.gfx_level >= GFX10) {
1087             nir_def *oob_select = nir_bcsel(b, nir_ieq_imm(b, stride, 0), nir_imm_int(b, V_008F0C_OOB_SELECT_RAW),
1088                                             nir_imm_int(b, V_008F0C_OOB_SELECT_STRUCTURED));
1089             rsrc_word3 = nir_iand_imm(b, rsrc_word3, C_008F0C_OOB_SELECT);
1090             rsrc_word3 = nir_ior(b, rsrc_word3, nir_ishl_imm(b, oob_select, 28));
1091          }
1092 
1093          nir_def *va_hi = nir_iand_imm(b, nir_unpack_64_2x32_split_y(b, va), 0xFFFF);
1094          stride = nir_iand_imm(b, stride, 0x3FFF);
1095          nir_def *new_vbo_data[4] = {nir_unpack_64_2x32_split_x(b, va), nir_ior(b, nir_ishl_imm(b, stride, 16), va_hi),
1096                                      nir_load_var(b, num_records), rsrc_word3};
1097          nir_store_var(b, vbo_data, nir_vec(b, new_vbo_data, 4), 0xf);
1098       }
1099       nir_pop_if(b, NULL);
1100 
1101       /* On GFX9, it seems bounds checking is disabled if both
1102        * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
1103        * GFX10.3 but it doesn't hurt.
1104        */
1105       nir_def *num_records = nir_channel(b, nir_load_var(b, vbo_data), 2);
1106       nir_def *buf_va =
1107          nir_iand_imm(b, nir_pack_64_2x32(b, nir_trim_vector(b, nir_load_var(b, vbo_data), 2)), (1ull << 48) - 1ull);
1108       nir_push_if(b, nir_ior(b, nir_ieq_imm(b, num_records, 0), nir_ieq_imm(b, buf_va, 0)));
1109       {
1110          nir_def *new_vbo_data[4] = {nir_imm_int(b, 0), nir_imm_int(b, 0), nir_imm_int(b, 0), nir_imm_int(b, 0)};
1111          nir_store_var(b, vbo_data, nir_vec(b, new_vbo_data, 4), 0xf);
1112       }
1113       nir_pop_if(b, NULL);
1114 
1115       nir_def *upload_off = nir_iadd(b, nir_load_var(b, upload_offset), vbo_offset);
1116       nir_store_ssbo(b, nir_load_var(b, vbo_data), cs->descriptor, upload_off, .access = ACCESS_NON_READABLE);
1117       nir_store_var(b, vbo_idx, nir_iadd_imm(b, nir_load_var(b, vbo_idx), 1), 0x1);
1118    }
1119    nir_pop_loop(b, NULL);
1120    nir_def *packet[3] = {nir_imm_int(b, PKT3(PKT3_SET_SH_REG, 1, 0)), load_param16(b, vbo_reg),
1121                          nir_iadd(b, load_param32(b, upload_addr), nir_load_var(b, upload_offset))};
1122 
1123    dgc_emit(b, cs, nir_vec(b, packet, 3));
1124 
1125    nir_store_var(b, upload_offset, nir_iadd(b, nir_load_var(b, upload_offset), nir_imul_imm(b, vbo_cnt, 16)), 0x1);
1126 }
1127 
1128 /**
1129  * For emitting VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NV.
1130  */
1131 static void
1132 dgc_emit_dispatch(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_buf, nir_def *stream_base,
1133                   nir_def *dispatch_params_offset, nir_def *sequence_id, const struct radv_device *device)
1134 {
1135    nir_def *stream_offset = nir_iadd(b, dispatch_params_offset, stream_base);
1136 
1137    nir_def *dispatch_data = nir_load_ssbo(b, 3, 32, stream_buf, stream_offset);
1138    nir_def *wg_x = nir_channel(b, dispatch_data, 0);
1139    nir_def *wg_y = nir_channel(b, dispatch_data, 1);
1140    nir_def *wg_z = nir_channel(b, dispatch_data, 2);
1141 
1142    nir_def *grid_sgpr = load_param16(b, grid_base_sgpr);
1143    nir_push_if(b, nir_ine_imm(b, grid_sgpr, 0));
1144    {
1145       if (device->load_grid_size_from_user_sgpr) {
1146          dgc_emit_grid_size_user_sgpr(b, cs, grid_sgpr, wg_x, wg_y, wg_z);
1147       } else {
1148          dgc_emit_grid_size_pointer(b, cs, grid_sgpr, stream_offset);
1149       }
1150    }
1151    nir_pop_if(b, 0);
1152 
1153    nir_push_if(b, nir_iand(b, nir_ine_imm(b, wg_x, 0), nir_iand(b, nir_ine_imm(b, wg_y, 0), nir_ine_imm(b, wg_z, 0))));
1154    {
1155       dgc_emit_sqtt_begin_api_marker(b, cs, ApiCmdDispatch);
1156       dgc_emit_sqtt_marker_event_with_dims(b, cs, sequence_id, wg_x, wg_y, wg_z, EventCmdDispatch);
1157 
1158       dgc_emit_dispatch_direct(b, cs, wg_x, wg_y, wg_z, load_param32(b, dispatch_initiator));
1159 
1160       dgc_emit_sqtt_thread_trace_marker(b, cs);
1161       dgc_emit_sqtt_end_api_marker(b, cs, ApiCmdDispatch);
1162    }
1163    nir_pop_if(b, 0);
1164 }
1165 
1166 /**
1167  * Emit VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_MESH_TASKS_NV.
1168  */
1169 static void
1170 dgc_emit_draw_mesh_tasks(nir_builder *b, struct dgc_cmdbuf *cs, nir_def *stream_buf, nir_def *stream_base,
1171                          nir_def *draw_params_offset, nir_def *sequence_id, const struct radv_device *device)
1172 {
1173    nir_def *vtx_base_sgpr = load_param16(b, vtx_base_sgpr);
1174    nir_def *stream_offset = nir_iadd(b, draw_params_offset, stream_base);
1175 
1176    nir_def *draw_data = nir_load_ssbo(b, 3, 32, stream_buf, stream_offset);
1177    nir_def *x = nir_channel(b, draw_data, 0);
1178    nir_def *y = nir_channel(b, draw_data, 1);
1179    nir_def *z = nir_channel(b, draw_data, 2);
1180 
1181    nir_push_if(b, nir_iand(b, nir_ine_imm(b, x, 0), nir_iand(b, nir_ine_imm(b, y, 0), nir_ine_imm(b, z, 0))));
1182    {
1183       dgc_emit_sqtt_begin_api_marker(b, cs, ApiCmdDrawMeshTasksEXT);
1184       dgc_emit_sqtt_marker_event(b, cs, sequence_id, EventCmdDrawMeshTasksEXT);
1185 
1186       dgc_emit_userdata_mesh(b, cs, vtx_base_sgpr, x, y, z, sequence_id, device);
1187       dgc_emit_instance_count(b, cs, nir_imm_int(b, 1));
1188 
1189       if (device->physical_device->mesh_fast_launch_2) {
1190          dgc_emit_dispatch_mesh_direct(b, cs, x, y, z);
1191       } else {
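         /* Without mesh_fast_launch_2 the mesh dispatch is expressed as a legacy draw:
          * the emitted DRAW_INDEX_AUTO uses the total number of groups (x*y*z) as the
          * vertex count.
          */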
1192          nir_def *vertex_count = nir_imul(b, x, nir_imul(b, y, z));
1193          dgc_emit_draw_index_auto(b, cs, vertex_count);
1194       }
1195 
1196       dgc_emit_sqtt_thread_trace_marker(b, cs);
1197       dgc_emit_sqtt_end_api_marker(b, cs, ApiCmdDrawMeshTasksEXT);
1198    }
1199    nir_pop_if(b, NULL);
1200 }
1201 
1202 static nir_def *
1203 dgc_is_cond_render_enabled(nir_builder *b)
1204 {
1205    nir_def *res1, *res2;
1206 
1207    nir_push_if(b, nir_ieq_imm(b, load_param8(b, predicating), 1));
1208    {
1209       nir_def *val = nir_load_global(b, load_param64(b, predication_va), 4, 1, 32);
1210       /* By default, all rendering commands are discarded if the 32-bit value is zero. If the
1211        * inverted flag is set, they are discarded if the value is non-zero.
1212        */
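      /* XOR-ing the predication type with the non-zero test handles both the normal and
       * inverted modes in one expression; the result is true when the generated commands
       * must be skipped.
       */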
1213       res1 = nir_ixor(b, nir_i2b(b, load_param8(b, predication_type)), nir_ine_imm(b, val, 0));
1214    }
1215    nir_push_else(b, 0);
1216    {
1217       res2 = nir_imm_bool(b, false);
1218    }
1219    nir_pop_if(b, 0);
1220 
1221    return nir_if_phi(b, res1, res2);
1222 }
1223 
1224 static nir_shader *
1225 build_dgc_prepare_shader(struct radv_device *dev)
1226 {
1227    nir_builder b = radv_meta_init_shader(dev, MESA_SHADER_COMPUTE, "meta_dgc_prepare");
1228    b.shader->info.workgroup_size[0] = 64;
1229 
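   /* One shader invocation prepares the commands for one sequence; the dispatch in
    * radv_prepare_dgc() launches DIV_ROUND_UP(sequencesCount, 64) workgroups of 64.
    */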
1230    nir_def *global_id = get_global_ids(&b, 1);
1231 
1232    nir_def *sequence_id = global_id;
1233 
1234    nir_def *cmd_buf_stride = load_param32(&b, cmd_buf_stride);
1235    nir_def *sequence_count = load_param32(&b, sequence_count);
1236    nir_def *stream_stride = load_param32(&b, stream_stride);
1237 
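   /* radv_prepare_dgc() sets bit 31 of sequence_count when a sequencesCountBuffer is
    * bound; the lower 31 bits hold the API sequence count.
    */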
1238    nir_def *use_count = nir_iand_imm(&b, sequence_count, 1u << 31);
1239    sequence_count = nir_iand_imm(&b, sequence_count, UINT32_MAX >> 1);
1240 
1241    nir_def *cmd_buf_base_offset = dgc_main_cmd_buf_offset(&b, dev);
1242 
1243    /* The effective number of draws is
1244     * min(sequencesCount, sequencesCountBuffer[sequencesCountOffset]) when
1245     * using sequencesCountBuffer. Otherwise it is sequencesCount. */
1246    nir_variable *count_var = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "sequence_count");
1247    nir_store_var(&b, count_var, sequence_count, 0x1);
1248 
1249    nir_push_if(&b, nir_ine_imm(&b, use_count, 0));
1250    {
1251       nir_def *count_buf = radv_meta_load_descriptor(&b, 0, DGC_DESC_COUNT);
1252       nir_def *cnt = nir_load_ssbo(&b, 1, 32, count_buf, nir_imm_int(&b, 0));
1253       /* Must clamp count against the API count explicitly.
1254        * The workgroup potentially contains more threads than maxSequencesCount from API,
1255        * and we have to ensure these threads write NOP packets to pad out the IB. */
1256       cnt = nir_umin(&b, cnt, sequence_count);
1257       nir_store_var(&b, count_var, cnt, 0x1);
1258    }
1259    nir_pop_if(&b, NULL);
1260 
1261    nir_push_if(&b, dgc_is_cond_render_enabled(&b));
1262    {
1263       /* Reset the number of sequences when conditional rendering is enabled, so that no
1264        * commands are generated and the cmdbuf is simply padded with NOPs.
1265        */
1266       nir_store_var(&b, count_var, nir_imm_int(&b, 0), 0x1);
1267    }
1268    nir_pop_if(&b, NULL);
1269 
1270    sequence_count = nir_load_var(&b, count_var);
1271 
1272    nir_push_if(&b, nir_ult(&b, sequence_id, sequence_count));
1273    {
1274       struct dgc_cmdbuf cmd_buf = {
1275          .descriptor = radv_meta_load_descriptor(&b, 0, DGC_DESC_PREPARE),
1276          .offset = nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "cmd_buf_offset"),
1277          .gfx_level = dev->physical_device->rad_info.gfx_level,
1278          .sqtt_enabled = !!dev->sqtt.bo,
1279       };
1280       nir_store_var(&b, cmd_buf.offset, nir_iadd(&b, nir_imul(&b, global_id, cmd_buf_stride), cmd_buf_base_offset), 1);
1281       nir_def *cmd_buf_end = nir_iadd(&b, nir_load_var(&b, cmd_buf.offset), cmd_buf_stride);
1282 
1283       nir_def *stream_buf = radv_meta_load_descriptor(&b, 0, DGC_DESC_STREAM);
1284       nir_def *stream_base = nir_imul(&b, sequence_id, stream_stride);
1285 
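      /* Per-sequence upload data (VBO descriptors, copied push constants, ...) is placed
       * after the command buffer region, strided by upload_stride.
       */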
1286       nir_variable *upload_offset =
1287          nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "upload_offset");
1288       nir_def *upload_offset_init = nir_iadd(&b, nir_iadd(&b, load_param32(&b, cmd_buf_size), cmd_buf_base_offset),
1289                                              nir_imul(&b, load_param32(&b, upload_stride), sequence_id));
1290       nir_store_var(&b, upload_offset, upload_offset_init, 0x1);
1291 
1292       nir_def *vbo_bind_mask = load_param32(&b, vbo_bind_mask);
1293       nir_push_if(&b, nir_ine_imm(&b, vbo_bind_mask, 0));
1294       {
1295          dgc_emit_vertex_buffer(&b, &cmd_buf, stream_buf, stream_base, vbo_bind_mask, upload_offset, dev);
1296       }
1297       nir_pop_if(&b, NULL);
1298 
1299       nir_def *push_const_mask = load_param64(&b, push_constant_mask);
1300       nir_push_if(&b, nir_ine_imm(&b, push_const_mask, 0));
1301       {
1302          dgc_emit_push_constant(&b, &cmd_buf, stream_buf, stream_base, push_const_mask, upload_offset);
1303       }
1304       nir_pop_if(&b, 0);
1305 
1306       nir_push_if(&b, nir_ieq_imm(&b, load_param8(&b, is_dispatch), 0));
1307       {
1308          nir_push_if(&b, nir_ieq_imm(&b, load_param16(&b, draw_indexed), 0));
1309          {
1310             nir_def *draw_mesh_tasks = load_param8(&b, draw_mesh_tasks);
1311             nir_push_if(&b, nir_ieq_imm(&b, draw_mesh_tasks, 0));
1312             {
1313                dgc_emit_draw(&b, &cmd_buf, stream_buf, stream_base, load_param16(&b, draw_params_offset), sequence_id,
1314                              dev);
1315             }
1316             nir_push_else(&b, NULL);
1317             {
1318                dgc_emit_draw_mesh_tasks(&b, &cmd_buf, stream_buf, stream_base, load_param16(&b, draw_params_offset),
1319                                         sequence_id, dev);
1320             }
1321             nir_pop_if(&b, NULL);
1322          }
1323          nir_push_else(&b, NULL);
1324          {
1325             /* Emit direct draws when index buffers are also updated by DGC. Otherwise, emit
1326              * indirect draws to remove the dependency on the cmdbuf state in order to enable
1327              * preprocessing.
1328              */
1329             nir_def *binds_index_buffer = nir_ine_imm(&b, load_param16(&b, binds_index_buffer), 0);
1330             nir_push_if(&b, binds_index_buffer);
1331             {
1332                nir_variable *max_index_count_var =
1333                   nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint_type(), "max_index_count");
1334 
1335                dgc_emit_index_buffer(&b, &cmd_buf, stream_buf, stream_base, load_param16(&b, index_buffer_offset),
1336                                      load_param32(&b, ibo_type_32), load_param32(&b, ibo_type_8), max_index_count_var,
1337                                      dev);
1338 
1339                nir_def *max_index_count = nir_load_var(&b, max_index_count_var);
1340 
1341                dgc_emit_draw_indexed(&b, &cmd_buf, stream_buf, stream_base, load_param16(&b, draw_params_offset),
1342                                      sequence_id, max_index_count, dev);
1343             }
1344             nir_push_else(&b, NULL);
1345             {
1346                dgc_emit_draw_indirect(&b, &cmd_buf, stream_base, load_param16(&b, draw_params_offset), true);
1347             }
1348 
1349             nir_pop_if(&b, NULL);
1350          }
1351          nir_pop_if(&b, NULL);
1352       }
1353       nir_push_else(&b, NULL);
1354       {
1355          dgc_emit_dispatch(&b, &cmd_buf, stream_buf, stream_base, load_param16(&b, dispatch_params_offset), sequence_id,
1356                            dev);
1357       }
1358       nir_pop_if(&b, NULL);
1359 
1360       /* Pad the cmdbuffer if we did not use the whole stride */
1361       nir_push_if(&b, nir_ine(&b, nir_load_var(&b, cmd_buf.offset), cmd_buf_end));
1362       {
1363          if (dev->physical_device->rad_info.gfx_ib_pad_with_type2) {
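            /* Type-2 NOPs are a single dword each, so emit them one at a time until the
             * end of this sequence's command buffer slot is reached.
             */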
1364             nir_push_loop(&b);
1365             {
1366                nir_def *curr_offset = nir_load_var(&b, cmd_buf.offset);
1367 
1368                nir_push_if(&b, nir_ieq(&b, curr_offset, cmd_buf_end));
1369                {
1370                   nir_jump(&b, nir_jump_break);
1371                }
1372                nir_pop_if(&b, NULL);
1373 
1374                nir_def *pkt = nir_imm_int(&b, PKT2_NOP_PAD);
1375 
1376                dgc_emit(&b, &cmd_buf, pkt);
1377             }
1378             nir_pop_loop(&b, NULL);
1379          } else {
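            /* A single PKT3_NOP can cover the whole gap: the count field encodes the
             * number of payload dwords minus one, so subtract 2 from the remaining dword
             * count to also account for the packet header.
             */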
1380             nir_def *cnt = nir_isub(&b, cmd_buf_end, nir_load_var(&b, cmd_buf.offset));
1381             cnt = nir_ushr_imm(&b, cnt, 2);
1382             cnt = nir_iadd_imm(&b, cnt, -2);
1383             nir_def *pkt = nir_pkt3(&b, PKT3_NOP, cnt);
1384 
1385             dgc_emit(&b, &cmd_buf, pkt);
1386          }
1387       }
1388       nir_pop_if(&b, NULL);
1389    }
1390    nir_pop_if(&b, NULL);
1391 
1392    build_dgc_buffer_tail(&b, sequence_count, dev);
1393    build_dgc_buffer_preamble(&b, sequence_count, dev);
1394    return b.shader;
1395 }
1396 
1397 void
1398 radv_device_finish_dgc_prepare_state(struct radv_device *device)
1399 {
1400    radv_DestroyPipeline(radv_device_to_handle(device), device->meta_state.dgc_prepare.pipeline,
1401                         &device->meta_state.alloc);
1402    radv_DestroyPipelineLayout(radv_device_to_handle(device), device->meta_state.dgc_prepare.p_layout,
1403                               &device->meta_state.alloc);
1404    device->vk.dispatch_table.DestroyDescriptorSetLayout(
1405       radv_device_to_handle(device), device->meta_state.dgc_prepare.ds_layout, &device->meta_state.alloc);
1406 }
1407 
1408 VkResult
1409 radv_device_init_dgc_prepare_state(struct radv_device *device)
1410 {
1411    VkResult result;
1412    nir_shader *cs = build_dgc_prepare_shader(device);
1413 
1414    VkDescriptorSetLayoutCreateInfo ds_create_info = {.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
1415                                                      .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
1416                                                      .bindingCount = DGC_NUM_DESCS,
1417                                                      .pBindings = (VkDescriptorSetLayoutBinding[]){
1418                                                         {.binding = DGC_DESC_STREAM,
1419                                                          .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1420                                                          .descriptorCount = 1,
1421                                                          .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
1422                                                          .pImmutableSamplers = NULL},
1423                                                         {.binding = DGC_DESC_PREPARE,
1424                                                          .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1425                                                          .descriptorCount = 1,
1426                                                          .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
1427                                                          .pImmutableSamplers = NULL},
1428                                                         {.binding = DGC_DESC_PARAMS,
1429                                                          .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
1430                                                          .descriptorCount = 1,
1431                                                          .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
1432                                                          .pImmutableSamplers = NULL},
1433                                                         {.binding = DGC_DESC_COUNT,
1434                                                          .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
1435                                                          .descriptorCount = 1,
1436                                                          .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
1437                                                          .pImmutableSamplers = NULL},
1438                                                      }};
1439 
1440    result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device), &ds_create_info, &device->meta_state.alloc,
1441                                            &device->meta_state.dgc_prepare.ds_layout);
1442    if (result != VK_SUCCESS)
1443       goto cleanup;
1444 
1445    const VkPipelineLayoutCreateInfo leaf_pl_create_info = {
1446       .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
1447       .setLayoutCount = 1,
1448       .pSetLayouts = &device->meta_state.dgc_prepare.ds_layout,
1449       .pushConstantRangeCount = 1,
1450       .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(struct radv_dgc_params)},
1451    };
1452 
1453    result = radv_CreatePipelineLayout(radv_device_to_handle(device), &leaf_pl_create_info, &device->meta_state.alloc,
1454                                       &device->meta_state.dgc_prepare.p_layout);
1455    if (result != VK_SUCCESS)
1456       goto cleanup;
1457 
1458    VkPipelineShaderStageCreateInfo shader_stage = {
1459       .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
1460       .stage = VK_SHADER_STAGE_COMPUTE_BIT,
1461       .module = vk_shader_module_handle_from_nir(cs),
1462       .pName = "main",
1463       .pSpecializationInfo = NULL,
1464    };
1465 
1466    VkComputePipelineCreateInfo pipeline_info = {
1467       .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
1468       .stage = shader_stage,
1469       .flags = 0,
1470       .layout = device->meta_state.dgc_prepare.p_layout,
1471    };
1472 
1473    result = radv_compute_pipeline_create(radv_device_to_handle(device), device->meta_state.cache, &pipeline_info,
1474                                          &device->meta_state.alloc, &device->meta_state.dgc_prepare.pipeline);
1475    if (result != VK_SUCCESS)
1476       goto cleanup;
1477 
1478 cleanup:
1479    ralloc_free(cs);
1480    return result;
1481 }
1482 
1483 VKAPI_ATTR VkResult VKAPI_CALL
1484 radv_CreateIndirectCommandsLayoutNV(VkDevice _device, const VkIndirectCommandsLayoutCreateInfoNV *pCreateInfo,
1485                                     const VkAllocationCallbacks *pAllocator,
1486                                     VkIndirectCommandsLayoutNV *pIndirectCommandsLayout)
1487 {
1488    RADV_FROM_HANDLE(radv_device, device, _device);
1489    struct radv_indirect_command_layout *layout;
1490 
1491    size_t size = sizeof(*layout) + pCreateInfo->tokenCount * sizeof(VkIndirectCommandsLayoutTokenNV);
1492 
1493    layout = vk_zalloc2(&device->vk.alloc, pAllocator, size, alignof(struct radv_indirect_command_layout),
1494                        VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1495    if (!layout)
1496       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1497 
1498    vk_object_base_init(&device->vk, &layout->base, VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NV);
1499 
1500    layout->flags = pCreateInfo->flags;
1501    layout->pipeline_bind_point = pCreateInfo->pipelineBindPoint;
1502    layout->input_stride = pCreateInfo->pStreamStrides[0];
1503    layout->token_count = pCreateInfo->tokenCount;
1504    typed_memcpy(layout->tokens, pCreateInfo->pTokens, pCreateInfo->tokenCount);
1505 
1506    layout->ibo_type_32 = VK_INDEX_TYPE_UINT32;
1507    layout->ibo_type_8 = VK_INDEX_TYPE_UINT8_KHR;
1508 
1509    for (unsigned i = 0; i < pCreateInfo->tokenCount; ++i) {
1510       switch (pCreateInfo->pTokens[i].tokenType) {
1511       case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV:
1512          layout->draw_params_offset = pCreateInfo->pTokens[i].offset;
1513          break;
1514       case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV:
1515          layout->indexed = true;
1516          layout->draw_params_offset = pCreateInfo->pTokens[i].offset;
1517          break;
1518       case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NV:
1519          layout->dispatch_params_offset = pCreateInfo->pTokens[i].offset;
1520          break;
1521       case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV:
1522          layout->binds_index_buffer = true;
1523          layout->index_buffer_offset = pCreateInfo->pTokens[i].offset;
1524          /* 16-bit is implied if we find no match. */
1525          for (unsigned j = 0; j < pCreateInfo->pTokens[i].indexTypeCount; j++) {
1526             if (pCreateInfo->pTokens[i].pIndexTypes[j] == VK_INDEX_TYPE_UINT32)
1527                layout->ibo_type_32 = pCreateInfo->pTokens[i].pIndexTypeValues[j];
1528             else if (pCreateInfo->pTokens[i].pIndexTypes[j] == VK_INDEX_TYPE_UINT8_KHR)
1529                layout->ibo_type_8 = pCreateInfo->pTokens[i].pIndexTypeValues[j];
1530          }
1531          break;
1532       case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV:
1533          layout->bind_vbo_mask |= 1u << pCreateInfo->pTokens[i].vertexBindingUnit;
1534          layout->vbo_offsets[pCreateInfo->pTokens[i].vertexBindingUnit] = pCreateInfo->pTokens[i].offset;
1535          if (pCreateInfo->pTokens[i].vertexDynamicStride)
1536             layout->vbo_offsets[pCreateInfo->pTokens[i].vertexBindingUnit] |= DGC_DYNAMIC_STRIDE;
1537          break;
1538       case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV:
1539          for (unsigned j = pCreateInfo->pTokens[i].pushconstantOffset / 4, k = 0;
1540               k < pCreateInfo->pTokens[i].pushconstantSize / 4; ++j, ++k) {
1541             layout->push_constant_mask |= 1ull << j;
1542             layout->push_constant_offsets[j] = pCreateInfo->pTokens[i].offset + k * 4;
1543          }
1544          break;
1545       case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_MESH_TASKS_NV:
1546          layout->draw_mesh_tasks = true;
1547          layout->draw_params_offset = pCreateInfo->pTokens[i].offset;
1548          break;
1549       default:
1550          unreachable("Unhandled token type");
1551       }
1552    }
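   /* An index buffer token without a corresponding indexed draw token has no effect,
    * so ignore it.
    */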
1553    if (!layout->indexed)
1554       layout->binds_index_buffer = false;
1555 
1556    *pIndirectCommandsLayout = radv_indirect_command_layout_to_handle(layout);
1557    return VK_SUCCESS;
1558 }
1559 
1560 VKAPI_ATTR void VKAPI_CALL
1561 radv_DestroyIndirectCommandsLayoutNV(VkDevice _device, VkIndirectCommandsLayoutNV indirectCommandsLayout,
1562                                      const VkAllocationCallbacks *pAllocator)
1563 {
1564    RADV_FROM_HANDLE(radv_device, device, _device);
1565    VK_FROM_HANDLE(radv_indirect_command_layout, layout, indirectCommandsLayout);
1566 
1567    if (!layout)
1568       return;
1569 
1570    vk_object_base_finish(&layout->base);
1571    vk_free2(&device->vk.alloc, pAllocator, layout);
1572 }
1573 
1574 VKAPI_ATTR void VKAPI_CALL
1575 radv_GetGeneratedCommandsMemoryRequirementsNV(VkDevice _device,
1576                                               const VkGeneratedCommandsMemoryRequirementsInfoNV *pInfo,
1577                                               VkMemoryRequirements2 *pMemoryRequirements)
1578 {
1579    RADV_FROM_HANDLE(radv_device, device, _device);
1580    VK_FROM_HANDLE(radv_indirect_command_layout, layout, pInfo->indirectCommandsLayout);
1581    VK_FROM_HANDLE(radv_pipeline, pipeline, pInfo->pipeline);
1582 
1583    uint32_t cmd_stride, upload_stride;
1584    radv_get_sequence_size(layout, pipeline, &cmd_stride, &upload_stride);
1585 
1586    VkDeviceSize cmd_buf_size = radv_align_cmdbuf_size(device, cmd_stride * pInfo->maxSequencesCount, AMD_IP_GFX) +
1587                                radv_dgc_preamble_cmdbuf_size(device);
1588    VkDeviceSize upload_buf_size = upload_stride * pInfo->maxSequencesCount;
1589 
1590    pMemoryRequirements->memoryRequirements.memoryTypeBits = device->physical_device->memory_types_32bit;
1591    pMemoryRequirements->memoryRequirements.alignment =
1592       MAX2(device->physical_device->rad_info.ip[AMD_IP_GFX].ib_alignment,
1593            device->physical_device->rad_info.ip[AMD_IP_COMPUTE].ib_alignment);
1594    pMemoryRequirements->memoryRequirements.size =
1595       align(cmd_buf_size + upload_buf_size, pMemoryRequirements->memoryRequirements.alignment);
1596 }
1597 
1598 bool
1599 radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
1600 {
1601    VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer);
1602 
1603    /* Enable conditional rendering (if not enabled by user) to skip prepare/execute DGC calls when
1604     * the indirect sequence count might be zero. This can only be enabled on GFX because on ACE it's
1605     * not possible to skip the execute DGC call (i.e. no INDIRECT_PACKET).
1606     */
1607    return cmd_buffer->qf == RADV_QUEUE_GENERAL && seq_count_buffer && !cmd_buffer->state.predicating;
1608 }
1609 
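/* Returns whether any shader reads push constants through the AC_UD_PUSH_CONSTANTS
 * pointer, in which case DGC has to copy them into the upload buffer instead of
 * inlining them.
 */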
1610 static bool
1611 radv_dgc_need_push_constants_copy(const struct radv_pipeline *pipeline)
1612 {
1613    for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); ++i) {
1614       const struct radv_shader *shader = pipeline->shaders[i];
1615 
1616       if (!shader)
1617          continue;
1618 
1619       const struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs;
1620       if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0)
1621          return true;
1622    }
1623 
1624    return false;
1625 }
1626 
1627 bool
1628 radv_dgc_can_preprocess(const struct radv_indirect_command_layout *layout, struct radv_pipeline *pipeline)
1629 {
1630    if (!(layout->flags & VK_INDIRECT_COMMANDS_LAYOUT_USAGE_EXPLICIT_PREPROCESS_BIT_NV))
1631       return false;
1632 
1633    /* From the Vulkan spec (1.3.269, chapter 32):
1634     * "The bound descriptor sets and push constants that will be used with indirect command generation for the compute
1635     * pipelines must already be specified at the time of preprocessing commands with vkCmdPreprocessGeneratedCommandsNV.
1636     * They must not change until the execution of indirect commands is submitted with vkCmdExecuteGeneratedCommandsNV."
1637     *
1638     * So we can always preprocess compute layouts.
1639     */
1640    if (layout->pipeline_bind_point != VK_PIPELINE_BIND_POINT_COMPUTE) {
1641       /* VBO binding (in particular partial VBO binding) uses some draw state which we don't generate at preprocess time
1642        * yet. */
1643       if (layout->bind_vbo_mask)
1644          return false;
1645 
1646       /* Do not preprocess when all push constants can't be inlined because they need to be copied
1647        * to the upload BO.
1648        */
1649       if (layout->push_constant_mask && radv_dgc_need_push_constants_copy(pipeline))
1650          return false;
1651    }
1652 
1653    return true;
1654 }
1655 
1656 VKAPI_ATTR void VKAPI_CALL
1657 radv_CmdPreprocessGeneratedCommandsNV(VkCommandBuffer commandBuffer,
1658                                       const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
1659 {
1660    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1661    VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
1662    VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
1663 
1664    if (!radv_dgc_can_preprocess(layout, pipeline))
1665       return;
1666 
1667    /* VK_EXT_conditional_rendering says that copy commands should not be
1668     * affected by conditional rendering.
1669     */
1670    const bool old_predicating = cmd_buffer->state.predicating;
1671    cmd_buffer->state.predicating = false;
1672 
1673    radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, false);
1674 
1675    /* Restore conditional rendering. */
1676    cmd_buffer->state.predicating = old_predicating;
1677 }
1678 
1679 /* This must be called directly before the draw because it depends on the currently bound graphics state. */
1680 static void
1681 radv_prepare_dgc_graphics(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo,
1682                           unsigned *upload_size, unsigned *upload_offset, void **upload_data,
1683                           struct radv_dgc_params *params)
1684 {
1685    VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
1686    VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
1687    struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
1688    struct radv_shader *vs = radv_get_shader(graphics_pipeline->base.shaders, MESA_SHADER_VERTEX);
1689    unsigned vb_size = layout->bind_vbo_mask ? util_bitcount(vs->info.vs.vb_desc_usage_mask) * 24 : 0;
1690 
1691    *upload_size = MAX2(*upload_size + vb_size, 16);
1692 
1693    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, *upload_size, upload_offset, upload_data)) {
1694       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
1695       return;
1696    }
1697 
1698    uint16_t vtx_base_sgpr = 0;
1699 
1700    if (cmd_buffer->state.graphics_pipeline->vtx_base_sgpr)
1701       vtx_base_sgpr = (cmd_buffer->state.graphics_pipeline->vtx_base_sgpr - SI_SH_REG_OFFSET) >> 2;
1702 
1703    if (cmd_buffer->state.graphics_pipeline->uses_drawid)
1704       vtx_base_sgpr |= DGC_USES_DRAWID;
1705 
1706    if (layout->draw_mesh_tasks) {
1707       struct radv_shader *mesh_shader = radv_get_shader(graphics_pipeline->base.shaders, MESA_SHADER_MESH);
1708       if (mesh_shader->info.cs.uses_grid_size)
1709          vtx_base_sgpr |= DGC_USES_GRID_SIZE;
1710    } else {
1711       if (cmd_buffer->state.graphics_pipeline->uses_baseinstance)
1712          vtx_base_sgpr |= DGC_USES_BASEINSTANCE;
1713    }
1714 
1715    params->draw_indexed = layout->indexed;
1716    params->draw_params_offset = layout->draw_params_offset;
1717    params->binds_index_buffer = layout->binds_index_buffer;
1718    params->vtx_base_sgpr = vtx_base_sgpr;
1719    params->max_index_count = cmd_buffer->state.max_index_count;
1720    params->index_buffer_offset = layout->index_buffer_offset;
1721    params->ibo_type_32 = layout->ibo_type_32;
1722    params->ibo_type_8 = layout->ibo_type_8;
1723    params->draw_mesh_tasks = layout->draw_mesh_tasks;
1724 
1725    if (layout->bind_vbo_mask) {
1726       uint32_t mask = vs->info.vs.vb_desc_usage_mask;
1727       unsigned vb_desc_alloc_size = util_bitcount(mask) * 16;
1728 
1729       radv_write_vertex_descriptors(cmd_buffer, graphics_pipeline, true, *upload_data);
1730 
1731       uint32_t *vbo_info = (uint32_t *)((char *)*upload_data + vb_desc_alloc_size);
1732 
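      /* Per-attribute VBO metadata consumed by the prepare shader, two dwords each:
       * dword 0 packs the per-attribute-descriptor flag (bit 31) with the layout's VBO
       * offset, dword 1 packs the attribute index offset with attrib_end in the high
       * 16 bits.
       */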
1733       unsigned idx = 0;
1734       while (mask) {
1735          unsigned i = u_bit_scan(&mask);
1736          unsigned binding = vs->info.vs.use_per_attribute_vb_descs ? graphics_pipeline->attrib_bindings[i] : i;
1737          uint32_t attrib_end = graphics_pipeline->attrib_ends[i];
1738 
1739          params->vbo_bind_mask |= ((layout->bind_vbo_mask >> binding) & 1u) << idx;
1740          vbo_info[2 * idx] = ((vs->info.vs.use_per_attribute_vb_descs ? 1u : 0u) << 31) | layout->vbo_offsets[binding];
1741          vbo_info[2 * idx + 1] = graphics_pipeline->attrib_index_offset[i] | (attrib_end << 16);
1742          ++idx;
1743       }
1744       params->vbo_cnt = idx;
1745       params->vbo_reg =
1746          ((radv_get_user_sgpr(vs, AC_UD_VS_VERTEX_BUFFERS)->sgpr_idx * 4 + vs->info.user_data_0) - SI_SH_REG_OFFSET) >>
1747          2;
1748       *upload_data = (char *)*upload_data + vb_size;
1749    }
1750 }
1751 
1752 static void
1753 radv_prepare_dgc_compute(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo,
1754                          unsigned *upload_size, unsigned *upload_offset, void **upload_data,
1755                          struct radv_dgc_params *params, bool cond_render_enabled)
1756 {
1757    VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
1758    VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
1759    struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
1760    struct radv_shader *cs = radv_get_shader(compute_pipeline->base.shaders, MESA_SHADER_COMPUTE);
1761 
1762    *upload_size = MAX2(*upload_size, 16);
1763 
1764    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, *upload_size, upload_offset, upload_data)) {
1765       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
1766       return;
1767    }
1768 
1769    uint32_t dispatch_initiator = cmd_buffer->device->dispatch_initiator;
1770    dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
1771    if (cs->info.wave_size == 32) {
1772       assert(cmd_buffer->device->physical_device->rad_info.gfx_level >= GFX10);
1773       dispatch_initiator |= S_00B800_CS_W32_EN(1);
1774    }
1775 
1776    params->dispatch_params_offset = layout->dispatch_params_offset;
1777    params->dispatch_initiator = dispatch_initiator;
1778    params->is_dispatch = 1;
1779 
1780    if (cond_render_enabled) {
1781       params->predicating = true;
1782       params->predication_va = cmd_buffer->state.predication_va;
1783       params->predication_type = cmd_buffer->state.predication_type;
1784    }
1785 
1786    const struct radv_userdata_info *loc = radv_get_user_sgpr(cs, AC_UD_CS_GRID_SIZE);
1787    if (loc->sgpr_idx != -1) {
1788       params->grid_base_sgpr = (cs->info.user_data_0 + 4 * loc->sgpr_idx - SI_SH_REG_OFFSET) >> 2;
1789    }
1790 }
1791 
1792 void
1793 radv_prepare_dgc(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo,
1794                  bool cond_render_enabled)
1795 {
1796    VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
1797    VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
1798    VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
1799    VK_FROM_HANDLE(radv_buffer, stream_buffer, pGeneratedCommandsInfo->pStreams[0].buffer);
1800    struct radv_meta_saved_state saved_state;
1801    unsigned upload_offset, upload_size;
1802    struct radv_buffer token_buffer;
1803    void *upload_data;
1804 
1805    uint32_t cmd_stride, upload_stride;
1806    radv_get_sequence_size(layout, pipeline, &cmd_stride, &upload_stride);
1807 
1808    unsigned cmd_buf_size =
1809       radv_align_cmdbuf_size(cmd_buffer->device, cmd_stride * pGeneratedCommandsInfo->sequencesCount, AMD_IP_GFX);
1810 
1811    uint64_t upload_addr =
1812       radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset + pGeneratedCommandsInfo->preprocessOffset;
1813 
1814    uint64_t stream_addr =
1815       radv_buffer_get_va(stream_buffer->bo) + stream_buffer->offset + pGeneratedCommandsInfo->pStreams[0].offset;
1816 
1817    struct radv_dgc_params params = {
1818       .cmd_buf_stride = cmd_stride,
1819       .cmd_buf_size = cmd_buf_size,
1820       .upload_addr = (uint32_t)upload_addr,
1821       .upload_stride = upload_stride,
1822       .sequence_count = pGeneratedCommandsInfo->sequencesCount,
1823       .stream_stride = layout->input_stride,
1824       .use_preamble = radv_dgc_use_preamble(pGeneratedCommandsInfo),
1825       .stream_addr = stream_addr,
1826    };
1827 
1828    upload_size = pipeline->push_constant_size + 16 * pipeline->dynamic_offset_count +
1829                  sizeof(layout->push_constant_offsets) + ARRAY_SIZE(pipeline->shaders) * 12;
1830    if (!layout->push_constant_mask)
1831       upload_size = 0;
1832 
1833    if (layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
1834       radv_prepare_dgc_graphics(cmd_buffer, pGeneratedCommandsInfo, &upload_size, &upload_offset, &upload_data,
1835                                 &params);
1836    } else {
1837       assert(layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE);
1838       radv_prepare_dgc_compute(cmd_buffer, pGeneratedCommandsInfo, &upload_size, &upload_offset, &upload_data, &params,
1839                                cond_render_enabled);
1840    }
1841 
1842    if (layout->push_constant_mask) {
1843       uint32_t *desc = upload_data;
1844       upload_data = (char *)upload_data + ARRAY_SIZE(pipeline->shaders) * 12;
1845 
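      /* Three dwords are reserved per shader: dword 0 packs the upload SGPR (low 16 bits)
       * and inline SGPR (high 16 bits), dwords 1-2 hold the 64-bit inline push constant
       * mask.
       */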
1846       unsigned idx = 0;
1847       for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); ++i) {
1848          if (!pipeline->shaders[i])
1849             continue;
1850 
1851          const struct radv_shader *shader = pipeline->shaders[i];
1852          const struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs;
1853          if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0)
1854             params.const_copy = 1;
1855 
1856          if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0 ||
1857              locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
1858             unsigned upload_sgpr = 0;
1859             unsigned inline_sgpr = 0;
1860 
1861             if (locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx >= 0) {
1862                upload_sgpr = (shader->info.user_data_0 + 4 * locs->shader_data[AC_UD_PUSH_CONSTANTS].sgpr_idx -
1863                               SI_SH_REG_OFFSET) >>
1864                              2;
1865             }
1866 
1867             if (locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx >= 0) {
1868                inline_sgpr = (shader->info.user_data_0 + 4 * locs->shader_data[AC_UD_INLINE_PUSH_CONSTANTS].sgpr_idx -
1869                               SI_SH_REG_OFFSET) >>
1870                              2;
1871                desc[idx * 3 + 1] = pipeline->shaders[i]->info.inline_push_constant_mask;
1872                desc[idx * 3 + 2] = pipeline->shaders[i]->info.inline_push_constant_mask >> 32;
1873             }
1874             desc[idx * 3] = upload_sgpr | (inline_sgpr << 16);
1875             ++idx;
1876          }
1877       }
1878 
1879       params.push_constant_shader_cnt = idx;
1880 
1881       params.const_copy_size = pipeline->push_constant_size + 16 * pipeline->dynamic_offset_count;
1882       params.push_constant_mask = layout->push_constant_mask;
1883 
1884       memcpy(upload_data, layout->push_constant_offsets, sizeof(layout->push_constant_offsets));
1885       upload_data = (char *)upload_data + sizeof(layout->push_constant_offsets);
1886 
1887       memcpy(upload_data, cmd_buffer->push_constants, pipeline->push_constant_size);
1888       upload_data = (char *)upload_data + pipeline->push_constant_size;
1889 
1890       struct radv_descriptor_state *descriptors_state =
1891          radv_get_descriptors_state(cmd_buffer, pGeneratedCommandsInfo->pipelineBindPoint);
1892       memcpy(upload_data, descriptors_state->dynamic_buffers, 16 * pipeline->dynamic_offset_count);
1893       upload_data = (char *)upload_data + 16 * pipeline->dynamic_offset_count;
1894    }
1895 
1896    radv_buffer_init(&token_buffer, cmd_buffer->device, cmd_buffer->upload.upload_bo, upload_size, upload_offset);
1897 
1898    VkWriteDescriptorSet ds_writes[5];
1899    VkDescriptorBufferInfo buf_info[ARRAY_SIZE(ds_writes)];
1900    int ds_cnt = 0;
1901    buf_info[ds_cnt] =
1902       (VkDescriptorBufferInfo){.buffer = radv_buffer_to_handle(&token_buffer), .offset = 0, .range = upload_size};
1903    ds_writes[ds_cnt] = (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1904                                               .dstBinding = DGC_DESC_PARAMS,
1905                                               .dstArrayElement = 0,
1906                                               .descriptorCount = 1,
1907                                               .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1908                                               .pBufferInfo = &buf_info[ds_cnt]};
1909    ++ds_cnt;
1910 
1911    buf_info[ds_cnt] = (VkDescriptorBufferInfo){.buffer = pGeneratedCommandsInfo->preprocessBuffer,
1912                                                .offset = pGeneratedCommandsInfo->preprocessOffset,
1913                                                .range = pGeneratedCommandsInfo->preprocessSize};
1914    ds_writes[ds_cnt] = (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1915                                               .dstBinding = DGC_DESC_PREPARE,
1916                                               .dstArrayElement = 0,
1917                                               .descriptorCount = 1,
1918                                               .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1919                                               .pBufferInfo = &buf_info[ds_cnt]};
1920    ++ds_cnt;
1921 
1922    if (pGeneratedCommandsInfo->streamCount > 0) {
1923       buf_info[ds_cnt] = (VkDescriptorBufferInfo){.buffer = pGeneratedCommandsInfo->pStreams[0].buffer,
1924                                                   .offset = pGeneratedCommandsInfo->pStreams[0].offset,
1925                                                   .range = VK_WHOLE_SIZE};
1926       ds_writes[ds_cnt] = (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1927                                                  .dstBinding = DGC_DESC_STREAM,
1928                                                  .dstArrayElement = 0,
1929                                                  .descriptorCount = 1,
1930                                                  .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1931                                                  .pBufferInfo = &buf_info[ds_cnt]};
1932       ++ds_cnt;
1933    }
1934 
1935    if (pGeneratedCommandsInfo->sequencesCountBuffer != VK_NULL_HANDLE) {
1936       buf_info[ds_cnt] = (VkDescriptorBufferInfo){.buffer = pGeneratedCommandsInfo->sequencesCountBuffer,
1937                                                   .offset = pGeneratedCommandsInfo->sequencesCountOffset,
1938                                                   .range = VK_WHOLE_SIZE};
1939       ds_writes[ds_cnt] = (VkWriteDescriptorSet){.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1940                                                  .dstBinding = DGC_DESC_COUNT,
1941                                                  .dstArrayElement = 0,
1942                                                  .descriptorCount = 1,
1943                                                  .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1944                                                  .pBufferInfo = &buf_info[ds_cnt]};
1945       ++ds_cnt;
1946       params.sequence_count |= 1u << 31;
1947    }
1948 
1949    radv_meta_save(&saved_state, cmd_buffer,
1950                   RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_DESCRIPTORS | RADV_META_SAVE_CONSTANTS);
1951 
1952    radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE,
1953                         cmd_buffer->device->meta_state.dgc_prepare.pipeline);
1954 
1955    vk_common_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
1956                               cmd_buffer->device->meta_state.dgc_prepare.p_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0,
1957                               sizeof(params), &params);
1958 
1959    radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
1960                                  cmd_buffer->device->meta_state.dgc_prepare.p_layout, 0, ds_cnt, ds_writes);
1961 
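   /* One thread per sequence; the prepare shader's workgroup size is 64 (see
    * build_dgc_prepare_shader).
    */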
1962    unsigned block_count = MAX2(1, DIV_ROUND_UP(pGeneratedCommandsInfo->sequencesCount, 64));
1963    vk_common_CmdDispatch(radv_cmd_buffer_to_handle(cmd_buffer), block_count, 1, 1);
1964 
1965    radv_buffer_finish(&token_buffer);
1966    radv_meta_restore(&saved_state, cmd_buffer);
1967 }
1968 
1969 /* VK_NV_device_generated_commands_compute */
1970 VKAPI_ATTR void VKAPI_CALL
1971 radv_GetPipelineIndirectMemoryRequirementsNV(VkDevice device, const VkComputePipelineCreateInfo *pCreateInfo,
1972                                              VkMemoryRequirements2 *pMemoryRequirements)
1973 {
1974    unreachable("radv: unimplemented vkGetPipelineIndirectMemoryRequirementsNV");
1975 }
1976 
1977 VKAPI_ATTR VkDeviceAddress VKAPI_CALL
1978 radv_GetPipelineIndirectDeviceAddressNV(VkDevice device, const VkPipelineIndirectDeviceAddressInfoNV *pInfo)
1979 {
1980    unreachable("radv: unimplemented vkGetPipelineIndirectDeviceAddressNV");
1981 }
1982