/*
 * Copyright © 2024 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_cmd_buffer.h"
#include "nvk_device.h"
#include "nvk_entrypoints.h"
#include "nvk_indirect_execution_set.h"
#include "nvk_mme.h"
#include "nvk_physical_device.h"
#include "nvk_shader.h"

#include "nir_builder.h"
#include "vk_pipeline.h"

#include "nv_push.h"
#include "nv_push_cl9097.h"
#include "nv_push_cl906f.h"
#include "nv_push_cla0c0.h"
#include "nv_push_clb1c0.h"
#include "nv_push_clc6c0.h"

struct nvk_indirect_commands_layout {
   struct vk_object_base base;

   VkShaderStageFlags stages;

   /* Stages set by the first TOKEN_TYPE_EXECUTION_SET */
   VkShaderStageFlags set_stages;

   uint32_t cmd_seq_stride_B;
   uint32_t qmd_size_per_seq_B;

   struct nvk_shader *init;
   struct nvk_shader *process;
};

VK_DEFINE_NONDISP_HANDLE_CASTS(nvk_indirect_commands_layout, base,
                               VkIndirectCommandsLayoutEXT,
                               VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_EXT);

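/* Push constant input for the init and process shaders built below.  The
 * field order and offsets have to match both the std140 interface declared
 * in load_process_cmd_in() and the struct packed on the CPU in
 * nvk_cmd_process_cmds().
 */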
struct process_cmd_push {
    uint64_t in_addr;
    uint64_t out_addr;
    uint64_t qmd_pool_addr;
    uint64_t count_addr;
    uint32_t max_seq_count;
    uint32_t ies_stride;
    uint64_t ies_addr;
    uint64_t root_addr;
};

struct process_cmd_in {
    nir_def *in_addr;
    nir_def *out_addr;
    nir_def *qmd_pool_addr;
    nir_def *count_addr;
    nir_def *max_seq_count;
    nir_def *ies_stride;
    nir_def *ies_addr;
    nir_def *root_addr;
};

static nir_def *
load_struct_var(nir_builder *b, nir_variable *var, uint32_t field)
{
   nir_deref_instr *deref =
      nir_build_deref_struct(b, nir_build_deref_var(b, var), field);
   return nir_load_deref(b, deref);
}

static struct process_cmd_in
load_process_cmd_in(nir_builder *b)
{
   struct glsl_struct_field push_fields[] = {
      { .type = glsl_uint64_t_type(), .name = "in_addr",       .offset = 0x00 },
      { .type = glsl_uint64_t_type(), .name = "out_addr",      .offset = 0x08 },
      { .type = glsl_uint64_t_type(), .name = "qmd_pool_addr", .offset = 0x10 },
      { .type = glsl_uint64_t_type(), .name = "count_addr",    .offset = 0x18 },
      { .type = glsl_uint_type(),     .name = "max_seq_count", .offset = 0x20 },
      { .type = glsl_uint_type(),     .name = "ies_stride",    .offset = 0x24 },
      { .type = glsl_uint64_t_type(), .name = "ies_addr",      .offset = 0x28 },
      { .type = glsl_uint64_t_type(), .name = "root_addr",     .offset = 0x30 },
   };
   const struct glsl_type *push_iface_type =
      glsl_interface_type(push_fields, ARRAY_SIZE(push_fields),
                          GLSL_INTERFACE_PACKING_STD140,
                          false /* row_major */, "push");
   nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const,
                                            push_iface_type, "push");

   return (struct process_cmd_in) {
      .in_addr       = load_struct_var(b, push, 0),
      .out_addr      = load_struct_var(b, push, 1),
      .qmd_pool_addr = load_struct_var(b, push, 2),
      .count_addr    = load_struct_var(b, push, 3),
      .max_seq_count = load_struct_var(b, push, 4),
      .ies_stride    = load_struct_var(b, push, 5),
      .ies_addr      = load_struct_var(b, push, 6),
      .root_addr     = load_struct_var(b, push, 7),
   };
}

static nir_def *
build_exec_set_addr(nir_builder *b, struct process_cmd_in *in, nir_def *idx)
{
   nir_def *offset = nir_imul_2x32_64(b, in->ies_stride, idx);
   return nir_iadd(b, in->ies_addr, offset);
}

static nir_def *
load_global_dw(nir_builder *b, nir_def *addr, uint32_t offset_dw)
{
   return nir_load_global(b, nir_iadd_imm(b, addr, offset_dw * 4), 4, 1, 32);
}

static void
store_global_dw(nir_builder *b, nir_def *addr, uint32_t offset_dw,
                nir_def *data)
{
   assert(data->bit_size == 32 && data->num_components == 1);
   nir_store_global(b, nir_iadd_imm(b, addr, offset_dw * 4), 4, data, 0x1);
}

static void
nir_iadd_to_var(nir_builder *b, nir_variable *x_var, nir_def *y)
{
   nir_def *x = nir_load_var(b, x_var);
   x = nir_iadd(b, x, y);
   nir_store_var(b, x_var, x, 0x1);
}

static void
nir_iadd_to_var_imm(nir_builder *b, nir_variable *x_var, uint64_t y)
{
   nir_def *x = nir_load_var(b, x_var);
   x = nir_iadd_imm(b, x, y);
   nir_store_var(b, x_var, x, 0x1);
}

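/* Tiny helper for emitting a pushbuf from NIR code.  addr is the current
 * write pointer and dw_count the number of dwords emitted at runtime, while
 * max_dw_count is the compile-time upper bound used for NOP padding and for
 * sizing the per-sequence stride.
 */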
struct nvk_nir_push {
   nir_variable *addr;
   nir_variable *dw_count;
   unsigned max_dw_count;
};

static void
nvk_nir_push_start(nir_builder *b, struct nvk_nir_push *p, nir_def *addr)
{
   p->addr = nir_local_variable_create(b->impl, glsl_uint64_t_type(),
                                       "nvk_nir_push::addr");
   nir_store_var(b, p->addr, addr, 0x1);
   p->dw_count = nir_local_variable_create(b->impl, glsl_uint_type(),
                                           "nvk_nir_push::dw_count");
   nir_store_var(b, p->dw_count, nir_imm_int(b, 0), 0x1);
   p->max_dw_count = 0;
}

static inline void
nvk_nir_push_dw(nir_builder *b, struct nvk_nir_push *p, nir_def *dw)
{
   store_global_dw(b, nir_load_var(b, p->addr), 0, dw);
   nir_iadd_to_var_imm(b, p->addr, 4);
   nir_iadd_to_var_imm(b, p->dw_count, 1);
   p->max_dw_count++;
}

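/* Copy a run of dwords whose count is only known on the GPU.  The caller
 * also provides a static max_dw_count which is accumulated into
 * p->max_dw_count so NOP padding and stride computation still work.
 */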
static inline void
nvk_nir_push_copy_dws(nir_builder *b, struct nvk_nir_push *p,
                      nir_def *src_dw_addr_in, nir_def *dw_count,
                      uint32_t max_dw_count)
{
   nir_variable *i = nir_local_variable_create(b->impl, glsl_uint_type(), "i");
   nir_store_var(b, i, nir_imm_int(b, 0), 0x1);

   nir_variable *src_dw_addr =
      nir_local_variable_create(b->impl, glsl_uint64_t_type(), "src_dw_addr");
   nir_store_var(b, src_dw_addr, src_dw_addr_in, 0x1);

   nir_push_loop(b);
   {
      nir_push_if(b, nir_uge(b, nir_load_var(b, i), dw_count));
      {
         nir_jump(b, nir_jump_break);
      }
      nir_pop_if(b, NULL);

      nir_def *dw = load_global_dw(b, nir_load_var(b, src_dw_addr), 0);
      store_global_dw(b, nir_load_var(b, p->addr), 0, dw);

      nir_iadd_to_var_imm(b, i, 1);
      nir_iadd_to_var_imm(b, p->addr, 4);
      nir_iadd_to_var_imm(b, src_dw_addr, 4);
   }
   nir_pop_loop(b, NULL);

   nir_iadd_to_var(b, p->dw_count, dw_count);
   p->max_dw_count += max_dw_count;
}

static inline void
nvk_nir_build_p_1inc(nir_builder *b, struct nvk_nir_push *p,
                     int subc, int mthd, unsigned dw_count)
{
   uint32_t hdr = NVC0_FIFO_PKHDR_1I(subc, mthd, dw_count);
   nvk_nir_push_dw(b, p, nir_imm_int(b, hdr));
}
#define nvk_nir_P_1INC(b, p, class, mthd, size_dw) \
   nvk_nir_build_p_1inc((b), (p), SUBC_##class, class##_##mthd, (size_dw))

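/* Pad the push with NO_OPERATION until dw_count reaches max_dw_count so
 * that every generated sequence ends up with the same stride.
 */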
static void
nvk_nir_build_pad_NOP(nir_builder *b, struct nvk_nir_push *p, uint32_t nop)
{
   nir_push_loop(b);
   {
      nir_push_if(b, nir_uge_imm(b, nir_load_var(b, p->dw_count),
                                    p->max_dw_count));
      {
         nir_jump(b, nir_jump_break);
      }
      nir_pop_if(b, NULL);

      store_global_dw(b, nir_load_var(b, p->addr), 0, nir_imm_int(b, nop));
      nir_iadd_to_var_imm(b, p->addr, 4);
      nir_iadd_to_var_imm(b, p->dw_count, 1);
   }
   nir_pop_loop(b, NULL);
}
#define nvk_nir_pad_NOP(b, p, class) \
   nvk_nir_build_pad_NOP((b), (p), \
      NVC0_FIFO_PKHDR_IL(SUBC_##class, class##_NO_OPERATION, 0))

#define QMD_ALIGN 0x100
#define QMD_ALLOC_SIZE QMD_ALIGN
#define QMD_ROOT_SIZE (sizeof(struct nvk_ies_cs_qmd) + \
                       sizeof(struct nvk_root_descriptor_table))

static_assert(sizeof(struct nvk_ies_cs_qmd) % QMD_ALIGN == 0,
              "QMD size is not properly aligned");
static_assert(sizeof(struct nvk_root_descriptor_table) % QMD_ALIGN == 0,
              "Root descriptor table size is not aligned");

static void
copy_repl_global_dw(nir_builder *b, nir_def *dst_addr, nir_def *src_addr,
                    nir_def **repl_dw, uint32_t dw_count)
{
   for (uint32_t i = 0; i < dw_count; i++) {
      nir_def *dw;
      if (repl_dw[i] == NULL)
         dw = load_global_dw(b, src_addr, i);
      else
         dw = repl_dw[i];
      store_global_dw(b, dst_addr, i, dw);
   }
}

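/* Emits one compute command sequence.  Push constant and sequence index
 * tokens are folded into a patched copy of the root descriptor table and,
 * for each DISPATCH token, a QMD + root table pair is allocated out of the
 * QMD pool and kicked off with SEND_PCAS.
 */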
static void
build_process_cs_cmd_seq(nir_builder *b, struct nvk_nir_push *p,
                         nir_def *in_addr, nir_def *seq_idx,
                         struct process_cmd_in *in,
                         struct nvk_physical_device *pdev,
                         const VkIndirectCommandsLayoutCreateInfoEXT *info,
                         uint32_t *qmd_size_per_seq_B_out)
{
   /* If we don't have any indirect execution set, the currently bound shader
    * will be passed in there.
    */
   nir_def *shader_qmd_addr = in->ies_addr;

   nir_def *root_repl[sizeof(struct nvk_root_descriptor_table) / 4] = {};

#define root_dw(member) ( \
   assert(nvk_root_descriptor_offset(member) % 4 == 0), \
   nvk_root_descriptor_offset(member) / 4)

   root_repl[root_dw(cs.base_group[0])] = nir_imm_int(b, 0);
   root_repl[root_dw(cs.base_group[1])] = nir_imm_int(b, 0);
   root_repl[root_dw(cs.base_group[2])] = nir_imm_int(b, 0);

   *qmd_size_per_seq_B_out = 0;
   for (uint32_t t = 0; t < info->tokenCount; t++) {
      const VkIndirectCommandsLayoutTokenEXT *token = &info->pTokens[t];

      nir_def *token_addr = nir_iadd_imm(b, in_addr, token->offset);
      switch (token->type) {
      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT: {
         assert(token->data.pExecutionSet->shaderStages ==
                VK_SHADER_STAGE_COMPUTE_BIT);
         assert(t == 0);

         nir_def *idx = load_global_dw(b, token_addr, 0);
         shader_qmd_addr = build_exec_set_addr(b, in, idx);
         break;
      }

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_EXT: {
         const VkPushConstantRange *pc_range =
            &token->data.pPushConstant->updateRange;

         assert(pc_range->offset % 4 == 0);
         assert(pc_range->size % 4 == 0);

         const uint32_t start_dw = root_dw(push) + (pc_range->offset / 4);
         for (uint32_t i = 0; i < pc_range->size / 4; i++)
            root_repl[start_dw + i] = load_global_dw(b, token_addr, i);
         break;
      }

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_SEQUENCE_INDEX_EXT: {
         const VkPushConstantRange *pc_range =
            &token->data.pPushConstant->updateRange;

         assert(pc_range->offset % 4 == 0);
         assert(pc_range->size == 4);

         const uint32_t dw = root_dw(push) + (pc_range->offset / 4);
         root_repl[dw] = seq_idx;
         break;
      }

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_EXT: {
         nir_def *disp_size_x = load_global_dw(b, token_addr, 0);
         nir_def *disp_size_y = load_global_dw(b, token_addr, 1);
         nir_def *disp_size_z = load_global_dw(b, token_addr, 2);

         *qmd_size_per_seq_B_out += QMD_ROOT_SIZE;

         nir_push_if(b, nir_ior(b, nir_ior(b, nir_ine_imm(b, disp_size_x, 0),
                                              nir_ine_imm(b, disp_size_y, 0)),
                                   nir_ine_imm(b, disp_size_z, 0)));
         {
            /* The first dword of the QMD pool (qmd_pool_addr) is an
             * allocator in units of 256 bytes.
             */
            nir_def *qmd_idx =
               nir_global_atomic(b, 32, in->qmd_pool_addr,
                                 nir_imm_int(b, QMD_ROOT_SIZE / QMD_ALIGN),
                                 .atomic_op = nir_atomic_op_iadd);
            nir_def *qmd_offset =
               nir_imul_imm(b, nir_u2u64(b, qmd_idx), QMD_ALIGN);
            nir_def *qmd_addr = nir_iadd(b, in->qmd_pool_addr, qmd_offset);
            nir_def *root_addr =
               nir_iadd_imm(b, qmd_addr, sizeof(struct nvk_ies_cs_qmd));

            /* Upload and patch the root descriptor table */
            root_repl[root_dw(cs.group_count[0])] = disp_size_x;
            root_repl[root_dw(cs.group_count[1])] = disp_size_y;
            root_repl[root_dw(cs.group_count[2])] = disp_size_z;
            copy_repl_global_dw(b, root_addr, in->root_addr,
                                root_repl, ARRAY_SIZE(root_repl));

            /* Upload and patch the QMD */
            const struct nak_qmd_dispatch_size_layout qmd_layout =
               nak_get_qmd_dispatch_size_layout(&pdev->info);
            assert(qmd_layout.x_start % 32 == 0);
            assert(qmd_layout.y_start == qmd_layout.x_start + 32);
            assert(qmd_layout.z_start == qmd_layout.x_start + 64);

            nir_def *qmd_repl[sizeof(struct nvk_ies_cs_qmd) / 4] = {};
            qmd_repl[qmd_layout.x_start / 32] = disp_size_x;
            qmd_repl[qmd_layout.y_start / 32] = disp_size_y;
            qmd_repl[qmd_layout.z_start / 32] = disp_size_z;

            /* TODO: Get these from NAK? */
            const uint32_t cb0_lo_start = 1024, cb0_hi_start = 1056;
            qmd_repl[cb0_lo_start / 32] = nir_unpack_64_2x32_split_x(b, root_addr);
            qmd_repl[cb0_hi_start / 32] =
               nir_ior(b, load_global_dw(b, shader_qmd_addr, cb0_hi_start / 32),
                          nir_unpack_64_2x32_split_y(b, root_addr));

            copy_repl_global_dw(b, qmd_addr, shader_qmd_addr,
                                qmd_repl, ARRAY_SIZE(qmd_repl));

            /* Now emit commands */
            nir_def *invoc = nir_imul_2x32_64(b, disp_size_x, disp_size_y);
            invoc = nir_imul(b, invoc, nir_u2u64(b, disp_size_z));
            nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS), 2);
            nvk_nir_push_dw(b, p, nir_unpack_64_2x32_split_y(b, invoc));
            nvk_nir_push_dw(b, p, nir_unpack_64_2x32_split_x(b, invoc));

            nvk_nir_P_1INC(b, p, NVA0C0, SEND_PCAS_A, 1);
            nvk_nir_push_dw(b, p, nir_u2u32(b, nir_ushr_imm(b, qmd_addr, 8)));

            if (pdev->info.cls_compute >= AMPERE_COMPUTE_A) {
               uint32_t signal;
               V_NVC6C0_SEND_SIGNALING_PCAS2_B(signal,
                  PCAS_ACTION_INVALIDATE_COPY_SCHEDULE);
               nvk_nir_P_1INC(b, p, NVC6C0, SEND_SIGNALING_PCAS2_B, 1);
               nvk_nir_push_dw(b, p, nir_imm_int(b, signal));
            } else {
               uint32_t signal;
               V_NVA0C0_SEND_SIGNALING_PCAS_B(signal, {
                  .invalidate = INVALIDATE_TRUE,
                  .schedule = SCHEDULE_TRUE
               });
               nvk_nir_P_1INC(b, p, NVA0C0, SEND_SIGNALING_PCAS_B, 1);
               nvk_nir_push_dw(b, p, nir_imm_int(b, signal));
            }
         }
         nir_pop_if(b, NULL);
         break;
      }

      default:
         unreachable("Unsupported indirect token type");
      }
   }
}

/*
 * Graphics
 */

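/* Binds a pipeline or set of shader objects from the indirect execution set
 * by copying the pre-baked push data stored in the selected entry; the
 * first dword(s) of an entry encode how many dwords of data follow.
 */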
static void
build_gfx_set_exec(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
                   struct process_cmd_in *in,
                   struct nvk_physical_device *pdev,
                   const VkIndirectCommandsExecutionSetTokenEXT *token)
{
   switch (token->type) {
   case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT: {
      nir_def *idx = load_global_dw(b, token_addr, 0);
      nir_def *push_addr = build_exec_set_addr(b, in, idx);
      nir_def *dw_count = load_global_dw(b, push_addr, 0);
      const uint16_t max_dw_count =
         nvk_ies_gfx_pipeline_max_dw_count(pdev, token->shaderStages);
      nvk_nir_push_copy_dws(b, p, nir_iadd_imm(b, push_addr, 4),
                            dw_count, max_dw_count);
      break;
   }

   case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT: {
      int32_t i = 0;
      gl_shader_stage type_stage[6] = {};
      nir_def *type_shader_idx[6] = {};
      gl_shader_stage last_vtgm = MESA_SHADER_VERTEX;
      u_foreach_bit(s, token->shaderStages) {
         gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);

         if (stage != MESA_SHADER_FRAGMENT)
            last_vtgm = stage;

         uint32_t type = mesa_to_nv9097_shader_type(stage);
         type_stage[type] = stage;
         type_shader_idx[type] = load_global_dw(b, token_addr, i++);
      }

      for (uint32_t type = 0; type < 6; type++) {
         nir_def *shader_idx = type_shader_idx[type];
         if (shader_idx == NULL)
            continue;

         bool is_last_vtgm = type_stage[type] == last_vtgm;

         nir_def *push_addr = build_exec_set_addr(b, in, shader_idx);
         nir_def *hdr = load_global_dw(b, push_addr, 0);
         nir_def *dw_count =
            nir_extract_u16(b, hdr, nir_imm_int(b, is_last_vtgm));
         const uint16_t max_dw_count =
            nvk_ies_gfx_shader_max_dw_count(pdev, token->shaderStages,
                                            is_last_vtgm);
         nvk_nir_push_copy_dws(b, p, nir_iadd_imm(b, push_addr, 4),
                               dw_count, max_dw_count);
      }
      break;
   }

   default:
      unreachable("Unknown indirect execution set type");
   }
}

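/* Writes the updated push constant range straight into the root descriptor
 * table: the first dword selects the offset within the table via
 * LOAD_CONSTANT_BUFFER_OFFSET and the remaining dwords are the data.  This
 * relies on the bound graphics state having already selected the root
 * descriptor table as the current constant buffer.
 */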
static void
build_push_gfx_const(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
                     const VkIndirectCommandsPushConstantTokenEXT *token)
{
   const VkPushConstantRange *pc_range = &token->updateRange;

   // TODO: Compute
   assert(!(pc_range->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT));

   assert(pc_range->offset % 4 == 0);
   assert(pc_range->size % 4 == 0);
   const uint32_t dw_count = pc_range->size / 4;

   nvk_nir_P_1INC(b, p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET, 1 + dw_count);
   nvk_nir_push_dw(b, p, nir_imm_int(b,
      nvk_root_descriptor_offset(push) + pc_range->offset));
   for (uint32_t i = 0; i < dw_count; i++)
      nvk_nir_push_dw(b, p, load_global_dw(b, token_addr, i));
}

static void
build_push_gfx_seq_idx(nir_builder *b, struct nvk_nir_push *p,
                       nir_def *token_addr, nir_def *seq_idx,
                       const VkIndirectCommandsPushConstantTokenEXT *token)
{
   const VkPushConstantRange *pc_range = &token->updateRange;

   // TODO: Compute
   assert(!(pc_range->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT));

   assert(pc_range->offset % 4 == 0);
   assert(pc_range->size == 4);
   nvk_nir_P_1INC(b, p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET, 2);
   nvk_nir_push_dw(b, p, nir_imm_int(b,
      nvk_root_descriptor_offset(push) + pc_range->offset));
   nvk_nir_push_dw(b, p, seq_idx);
}

static void
build_set_ib(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
             UNUSED const VkIndirectCommandsIndexBufferTokenEXT *token)
{
   nir_def *addr_lo = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi = load_global_dw(b, token_addr, 1);
   nir_def *size_B  = load_global_dw(b, token_addr, 2);
   nir_def *idx_fmt = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_IB), 4);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, size_B);
   nvk_nir_push_dw(b, p, idx_fmt);
}

static nir_def *
nvk_nir_vb_stride(nir_builder *b, nir_def *vb_idx, nir_def *stride)
{
   return nir_pack_32_2x16_split(b, nir_u2u16(b, stride),
                                    nir_u2u16(b, vb_idx));
}

static void
build_set_vb(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
             const VkIndirectCommandsVertexBufferTokenEXT *token)
{
   nir_def *vb_idx = nir_imm_int(b, token->vertexBindingUnit);
   nir_def *addr_lo  = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi  = load_global_dw(b, token_addr, 1);
   nir_def *size_B   = load_global_dw(b, token_addr, 2);
   nir_def *stride_B = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_VB), 4);
   nvk_nir_push_dw(b, p, vb_idx);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, size_B);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_SET_VB_STRIDE), 1);
   nvk_nir_push_dw(b, p, nvk_nir_vb_stride(b, vb_idx, stride_B));
}

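/* The draw tokens reuse the same NVK_MME_DRAW* macros as the direct draw
 * entrypoints, with draw_index hard-coded to zero.
 */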
static void
build_draw(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr)
{
   nir_def *vertex_count   = load_global_dw(b, token_addr, 0);
   nir_def *instance_count = load_global_dw(b, token_addr, 1);
   nir_def *first_vertex   = load_global_dw(b, token_addr, 2);
   nir_def *first_instance = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW), 5);
   nvk_nir_push_dw(b, p, nir_imm_int(b, 0)); /* draw_index */
   nvk_nir_push_dw(b, p, vertex_count);
   nvk_nir_push_dw(b, p, instance_count);
   nvk_nir_push_dw(b, p, first_vertex);
   nvk_nir_push_dw(b, p, first_instance);
}

static void
build_draw_indexed(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr)
{
   nir_def *index_count    = load_global_dw(b, token_addr, 0);
   nir_def *instance_count = load_global_dw(b, token_addr, 1);
   nir_def *first_index    = load_global_dw(b, token_addr, 2);
   nir_def *vertex_offset  = load_global_dw(b, token_addr, 3);
   nir_def *first_instance = load_global_dw(b, token_addr, 4);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED), 6);
   nvk_nir_push_dw(b, p, nir_imm_int(b, 0)); /* draw_index */
   nvk_nir_push_dw(b, p, index_count);
   nvk_nir_push_dw(b, p, instance_count);
   nvk_nir_push_dw(b, p, first_index);
   nvk_nir_push_dw(b, p, vertex_offset);
   nvk_nir_push_dw(b, p, first_instance);
}

static void
build_draw_count(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr)
{
   nir_def *addr_lo = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi = load_global_dw(b, token_addr, 1);
   nir_def *stride  = load_global_dw(b, token_addr, 2);
   nir_def *count   = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT), 4);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, count);
   nvk_nir_push_dw(b, p, stride);
}

static void
build_draw_indexed_count(nir_builder *b, struct nvk_nir_push *p,
                         nir_def *token_addr)
{
   nir_def *addr_lo = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi = load_global_dw(b, token_addr, 1);
   nir_def *stride  = load_global_dw(b, token_addr, 2);
   nir_def *count   = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT), 4);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, count);
   nvk_nir_push_dw(b, p, stride);
}

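/* Emits one graphics command sequence.  Unlike the compute path there is no
 * QMD to build; every token maps directly onto existing 3D methods and MME
 * macros.
 */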
static void
build_process_gfx_cmd_seq(nir_builder *b, struct nvk_nir_push *p,
                          nir_def *in_addr, nir_def *seq_idx,
                          struct process_cmd_in *in,
                          struct nvk_physical_device *pdev,
                          const VkIndirectCommandsLayoutCreateInfoEXT *info)
{
   for (uint32_t t = 0; t < info->tokenCount; t++) {
      const VkIndirectCommandsLayoutTokenEXT *token = &info->pTokens[t];

      nir_def *token_addr = nir_iadd_imm(b, in_addr, token->offset);
      switch (token->type) {
      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT:
         assert(t == 0);
         build_gfx_set_exec(b, p, token_addr, in, pdev,
                            token->data.pExecutionSet);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_EXT:
         build_push_gfx_const(b, p, token_addr, token->data.pPushConstant);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_SEQUENCE_INDEX_EXT:
         build_push_gfx_seq_idx(b, p, token_addr, seq_idx,
                                token->data.pPushConstant);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_EXT:
         build_set_ib(b, p, token_addr, token->data.pIndexBuffer);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_EXT:
         build_set_vb(b, p, token_addr, token->data.pVertexBuffer);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_EXT:
         build_draw_indexed(b, p, token_addr);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_EXT:
         build_draw(b, p, token_addr);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_COUNT_EXT:
         build_draw_indexed_count(b, p, token_addr);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_COUNT_EXT:
         build_draw_count(b, p, token_addr);
         break;

      default:
         unreachable("Unsupported indirect token type");
      }
   }
}

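/* Builds a trivial shader whose only job is to reset the QMD pool allocator
 * before the process shader runs.  Graphics layouts and compute layouts
 * without a DISPATCH token don't need an init shader at all.
 */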
static VkResult
build_init_shader(struct nvk_device *dev,
                  const VkIndirectCommandsLayoutCreateInfoEXT *info,
                  uint32_t qmd_size_per_seq_B,
                  const VkAllocationCallbacks *pAllocator,
                  struct nvk_shader **shader_out)
{
   /* There's nothing to initialize for graphics */
   if (info->shaderStages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
      assert(!(info->shaderStages & ~NVK_SHADER_STAGE_GRAPHICS_BITS));
      *shader_out = NULL;
      return VK_SUCCESS;
   }

   if (qmd_size_per_seq_B == 0) {
      *shader_out = NULL;
      return VK_SUCCESS;
   }

   nir_builder build =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
                                     "nvk-init-indirect-commands");
   build.shader->info.workgroup_size[0] = 32;
   nir_builder *b = &build;

   struct process_cmd_in in = load_process_cmd_in(b);

   if (qmd_size_per_seq_B > 0) {
      /* Initialize the QMD allocator to 1 * QMD_ALIGN so that the QMDs we
       * allocate don't stomp the allocator.
       */
      assert(info->shaderStages == VK_SHADER_STAGE_COMPUTE_BIT);
      store_global_dw(b, in.qmd_pool_addr, 0, nir_imm_int(b, 1));
   }

   return nvk_compile_nir_shader(dev, build.shader, pAllocator, shader_out);
}

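/* Builds the 32-wide shader that expands the application's indirect command
 * stream into a real pushbuf.  Each invocation handles one sequence and
 * writes exactly cmd_seq_stride_B bytes (NOP-padded) into the preprocess
 * buffer.
 */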
static VkResult
build_process_shader(struct nvk_device *dev,
                     const VkIndirectCommandsLayoutCreateInfoEXT *info,
                     const VkAllocationCallbacks *pAllocator,
                     struct nvk_shader **shader_out,
                     uint32_t *cmd_seq_stride_B_out,
                     uint32_t *qmd_size_per_seq_B_out)
{
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   nir_builder build =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
                                     "nvk-process-indirect-commands");
   build.shader->info.workgroup_size[0] = 32;
   nir_builder *b = &build;

   struct process_cmd_in in = load_process_cmd_in(b);

   nir_def *seq_idx = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);

   /* We always execute a 32-wide shader and nothing guarantees that
    * max_seq_count is a multiple of 32, so we need to bail if our index is
    * above the maximum.  If we're inside the maximum but at or above the
    * count, we still need to emit a bunch of NOPs.
    */
   nir_push_if(b, nir_uge(b, seq_idx, in.max_seq_count));
   {
      nir_jump(b, nir_jump_halt);
   }
   nir_pop_if(b, NULL);

   nir_def *ind_count;
   nir_push_if(b, nir_ine_imm(b, in.count_addr, 0));
   {
      ind_count = load_global_dw(b, in.count_addr, 0);
      ind_count = nir_umin(b, ind_count, in.max_seq_count);
   }
   nir_pop_if(b, NULL);
   nir_def *count = nir_if_phi(b, ind_count, in.max_seq_count);

   nir_def *in_seq_addr = nir_iadd(b, in.in_addr,
      nir_imul_imm(b, nir_u2u64(b, seq_idx), info->indirectStride));
   /* We'll replace this later once we know what it is */
   nir_def *out_stride = nir_imm_int(b, 0xc0ffee0);
   nir_def *out_seq_addr = nir_iadd(b, in.out_addr,
      nir_imul_2x32_64(b, seq_idx, out_stride));

   struct nvk_nir_push push = {};
   nvk_nir_push_start(b, &push, out_seq_addr);

   nir_push_if(b, nir_ult(b, seq_idx, count));
   {
      if (info->shaderStages & VK_SHADER_STAGE_COMPUTE_BIT) {
         assert(info->shaderStages == VK_SHADER_STAGE_COMPUTE_BIT);
         build_process_cs_cmd_seq(b, &push, in_seq_addr, seq_idx,
                                  &in, pdev, info, qmd_size_per_seq_B_out);
      } else if (info->shaderStages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
         assert(!(info->shaderStages & ~NVK_SHADER_STAGE_GRAPHICS_BITS));
         *qmd_size_per_seq_B_out = 0;
         build_process_gfx_cmd_seq(b, &push, in_seq_addr, seq_idx,
                                   &in, pdev, info);
      } else {
         unreachable("Unknown shader stage");
      }
   }
   nir_pop_if(b, NULL);

   /* Always pad the command buffer.  In the case where seq_idx >= count, the
    * entire sequence will be NO_OPERATION.
    */
   if (info->shaderStages & VK_SHADER_STAGE_COMPUTE_BIT) {
      nvk_nir_pad_NOP(b, &push, NVA0C0);
   } else if (info->shaderStages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
      nvk_nir_pad_NOP(b, &push, NV9097);
   } else {
      unreachable("Unknown shader stage");
   }

   /* Replace the out stride with the actual size of a command stream */
   nir_load_const_instr *out_stride_const =
      nir_instr_as_load_const(out_stride->parent_instr);
   out_stride_const->value[0].u32 = push.max_dw_count * 4;

   /* We also output this stride to go in the layout struct */
   *cmd_seq_stride_B_out = push.max_dw_count * 4;

   return nvk_compile_nir_shader(dev, build.shader, pAllocator, shader_out);
}

static void
nvk_indirect_commands_layout_destroy(struct nvk_device *dev,
                                     struct nvk_indirect_commands_layout *layout,
                                     const VkAllocationCallbacks *alloc)
{
   if (layout->init != NULL)
      vk_shader_destroy(&dev->vk, &layout->init->vk, alloc);
   if (layout->process != NULL)
      vk_shader_destroy(&dev->vk, &layout->process->vk, alloc);
   vk_object_free(&dev->vk, alloc, layout);
}

VKAPI_ATTR VkResult VKAPI_CALL
nvk_CreateIndirectCommandsLayoutEXT(
    VkDevice _device,
    const VkIndirectCommandsLayoutCreateInfoEXT *pCreateInfo,
    const VkAllocationCallbacks *pAllocator,
    VkIndirectCommandsLayoutEXT *pIndirectCommandsLayout)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);
   VkResult result;

   struct nvk_indirect_commands_layout *layout =
      vk_object_zalloc(&dev->vk, pAllocator, sizeof(*layout),
                       VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_EXT);
   if (layout == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   layout->stages = pCreateInfo->shaderStages;

   /* From the Vulkan 1.3.XXX spec:
    *
    *    VUID-VkIndirectCommandsLayoutCreateInfoEXT-pTokens-11093
    *
    *    "The number of tokens in the pTokens array with type equal to
    *    VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT must be less than
    *    or equal to 1"
    *
    * and
    *
    *    VUID-VkIndirectCommandsLayoutCreateInfoEXT-pTokens-11139
    *
    *    "If the pTokens array contains a
    *    VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT token, then this
    *    token must be the first token in the array"
    */
   if (pCreateInfo->tokenCount > 0 &&
       pCreateInfo->pTokens[0].type ==
         VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT) {
      const VkIndirectCommandsExecutionSetTokenEXT *token =
         pCreateInfo->pTokens[0].data.pExecutionSet;

      /* Pipelines should never mismatch here. */
      if (token->type == VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT)
         assert(token->shaderStages == pCreateInfo->shaderStages);

      layout->set_stages = token->shaderStages;
   }

   result = build_process_shader(dev, pCreateInfo, pAllocator,
                                 &layout->process, &layout->cmd_seq_stride_B,
                                 &layout->qmd_size_per_seq_B);
   if (result != VK_SUCCESS) {
      nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
      return result;
   }

   if (layout->cmd_seq_stride_B > (NV_PUSH_MAX_COUNT * 4)) {
      nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
      return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                       "Too many tokens in IndirectCommandsLayout");
   }

   result = build_init_shader(dev, pCreateInfo, layout->qmd_size_per_seq_B,
                              pAllocator, &layout->init);
   if (result != VK_SUCCESS) {
      nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
      return result;
   }

   *pIndirectCommandsLayout = nvk_indirect_commands_layout_to_handle(layout);

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
nvk_DestroyIndirectCommandsLayoutEXT(
    VkDevice _device,
    VkIndirectCommandsLayoutEXT indirectCommandsLayout,
    const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  indirectCommandsLayout);

   if (layout == NULL)
      return;

   nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
}

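/* The preprocess buffer holds one fixed-stride command stream per sequence,
 * followed (for compute layouts with a DISPATCH token) by a QMD_ALIGN-aligned
 * pool containing the allocator dword plus the per-sequence QMDs and root
 * descriptor tables.
 */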
VKAPI_ATTR void VKAPI_CALL
nvk_GetGeneratedCommandsMemoryRequirementsEXT(
    VkDevice _device,
    const VkGeneratedCommandsMemoryRequirementsInfoEXT *pInfo,
    VkMemoryRequirements2 *pMemoryRequirements)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  pInfo->indirectCommandsLayout);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   uint64_t size = layout->cmd_seq_stride_B * (uint64_t)pInfo->maxSequenceCount;
   if (layout->qmd_size_per_seq_B > 0) {
      size = align64(size, QMD_ALIGN);
      size += QMD_ALLOC_SIZE;
      size += layout->qmd_size_per_seq_B * pInfo->maxSequenceCount;
   }

   pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
      .size = size,
      .alignment = QMD_ALIGN,
      .memoryTypeBits = BITFIELD_MASK(pdev->mem_type_count),
   };
}

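/* Runs the layout's init and process shaders to fill the preprocess buffer.
 * When the layout has no execution set token, ies_addr instead carries the
 * QMD of the currently bound compute shader (it's unused for graphics).
 */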
static void
nvk_cmd_process_cmds(struct nvk_cmd_buffer *cmd,
                     const VkGeneratedCommandsInfoEXT *info,
                     const struct nvk_cmd_state *state)
{
   VK_FROM_HANDLE(nvk_indirect_execution_set, ies, info->indirectExecutionSet);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  info->indirectCommandsLayout);

   struct process_cmd_push push = {
      .in_addr       = info->indirectAddress,
      .out_addr      = info->preprocessAddress,
      .count_addr    = info->sequenceCountAddress,
      .max_seq_count = info->maxSequenceCount,
   };

   uint64_t qmd_addr = 0;
   if (layout->stages & VK_SHADER_STAGE_COMPUTE_BIT) {
      uint32_t global_size[3] = { 0, 0, 0 };
      VkResult result = nvk_cmd_flush_cs_qmd(cmd, global_size, &qmd_addr,
                                             &push.root_addr);
      if (unlikely(result != VK_SUCCESS)) {
         vk_command_buffer_set_error(&cmd->vk, result);
         return;
      }
   }

   if (layout->set_stages == 0) {
      push.ies_addr = qmd_addr;
   } else {
      assert(layout->set_stages == layout->stages);
      push.ies_addr   = ies->mem->va->addr;
      push.ies_stride = ies->stride_B;
   }

   if (layout->qmd_size_per_seq_B > 0) {
      assert(info->preprocessAddress % QMD_ALIGN == 0);
      uint64_t qmd_offset =
         layout->cmd_seq_stride_B * (uint64_t)info->maxSequenceCount;
      qmd_offset = align64(qmd_offset, QMD_ALIGN);
      push.qmd_pool_addr = info->preprocessAddress + qmd_offset;
   }

   if (layout->init != NULL) {
      nvk_cmd_dispatch_shader(cmd, layout->init, &push, sizeof(push), 1, 1, 1);

      struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
      P_IMMD(p, NVA0C0, WAIT_FOR_IDLE, 0);
   }

   nvk_cmd_dispatch_shader(cmd, layout->process, &push, sizeof(push),
                           DIV_ROUND_UP(info->maxSequenceCount, 32), 1, 1);
}

static void
nvk_cmd_flush_process_state(struct nvk_cmd_buffer *cmd,
                            const VkGeneratedCommandsInfoEXT *info)
{
   struct nvk_descriptor_state *desc =
      nvk_get_descriptor_state_for_stages(cmd, info->shaderStages);
   nvk_cmd_buffer_flush_push_descriptors(cmd, desc);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdPreprocessGeneratedCommandsEXT(VkCommandBuffer commandBuffer,
                                      const VkGeneratedCommandsInfoEXT *info,
                                      VkCommandBuffer stateCommandBuffer)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_cmd_buffer, state_cmd, stateCommandBuffer);

   nvk_cmd_flush_process_state(state_cmd, info);
   nvk_cmd_process_cmds(cmd, info, &state_cmd->state);
}

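/* Executes the generated commands: (re)process them and invalidate the
 * shader and SKED caches if they weren't preprocessed, flush the CPU-side
 * state the generated pushbuf depends on, then execute the preprocess
 * buffer as a series of indirect pushes no larger than NV_PUSH_MAX_COUNT.
 */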
VKAPI_ATTR void VKAPI_CALL
nvk_CmdExecuteGeneratedCommandsEXT(VkCommandBuffer commandBuffer,
                                   VkBool32 isPreprocessed,
                                   const VkGeneratedCommandsInfoEXT *info)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  info->indirectCommandsLayout);

   if (!isPreprocessed) {
      nvk_cmd_flush_process_state(cmd, info);
      nvk_cmd_process_cmds(cmd, info, &cmd->state);

      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
      P_IMMD(p, NVA0C0, INVALIDATE_SHADER_CACHES, {
         .data = DATA_TRUE,
         .constant = CONSTANT_TRUE,
         .flush_data = FLUSH_DATA_TRUE,
      });
      P_IMMD(p, NVB1C0, INVALIDATE_SKED_CACHES, 0);
      __push_immd(p, SUBC_NV9097, NV906F_SET_REFERENCE, 0);
   }

   if (layout->stages & VK_SHADER_STAGE_COMPUTE_BIT) {
      assert(info->shaderStages == VK_SHADER_STAGE_COMPUTE_BIT);
      nvk_cmd_buffer_flush_push_descriptors(cmd, &cmd->state.cs.descriptors);
   } else if (layout->stages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
      assert(!(layout->stages & ~NVK_SHADER_STAGE_GRAPHICS_BITS));

      nvk_cmd_buffer_flush_push_descriptors(cmd, &cmd->state.gfx.descriptors);
      nvk_cmd_flush_gfx_dynamic_state(cmd);

      if (layout->set_stages == 0) {
         /* In this case, we're using the CPU-bound shaders */
         nvk_cmd_flush_gfx_shaders(cmd);
         nvk_cmd_flush_gfx_cbufs(cmd);
      } else {
         /* From the Vulkan 1.3.XXX spec:
          *
          *    "If indirectCommandsLayout was created with a token sequence
          *    that contained the
          *    VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT token and
          *    indirectExecutionSet was created using
          *    VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT, every
          *    executed VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT
          *    token must bind all the shader stages set in the
          *    VkIndirectCommandsExecutionSetTokenEXT::shaderStages used to
          *    create indirectCommandsLayout"
          *
          * So we unbind anything not explicitly bound by the layout and trust
          * the layout to bind the rest.
          */
         assert(layout->set_stages == layout->stages);

         uint8_t set_types = 0;
         u_foreach_bit(s, layout->set_stages) {
            gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);
            uint32_t type = mesa_to_nv9097_shader_type(stage);
            set_types |= BITFIELD_BIT(type);
         }

         uint8_t unset_types = BITFIELD_MASK(6) & ~set_types;

         struct nv_push *p = nvk_cmd_buffer_push(cmd, 12);
         u_foreach_bit(type, unset_types) {
            P_IMMD(p, NV9097, SET_PIPELINE_SHADER(type), {
               .enable  = ENABLE_FALSE,
               .type    = type,
            });
         }
      }
   }

   ASSERTED const uint64_t size =
      layout->cmd_seq_stride_B * (uint64_t)info->maxSequenceCount;
   assert(size <= info->preprocessSize);

   uint64_t addr = info->preprocessAddress;
   uint64_t seq_count = info->maxSequenceCount;

   /* Break it into pieces that are a multiple of cmd_seq_stride_B so that, if
    * the kernel inserts a sync point between two of our pushes, it doesn't
    * break a single command.
    */
   const uint32_t max_seq_per_push =
      (NV_PUSH_MAX_COUNT * 4) / layout->cmd_seq_stride_B;

   while (seq_count > 0) {
      uint32_t push_seq = MIN2(seq_count, max_seq_per_push);
      uint32_t push_size_B = push_seq * layout->cmd_seq_stride_B;
      nvk_cmd_buffer_push_indirect(cmd, addr, push_size_B);
      addr += push_size_B;
      seq_count -= push_seq;
   }

   if (layout->set_stages != 0) {
      if (layout->stages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
         cmd->state.gfx.shaders_dirty |= NVK_SHADER_STAGE_GRAPHICS_BITS;
      }
   }
}