/*
 * Copyright © 2024 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_cmd_buffer.h"
#include "nvk_device.h"
#include "nvk_entrypoints.h"
#include "nvk_indirect_execution_set.h"
#include "nvk_mme.h"
#include "nvk_physical_device.h"
#include "nvk_shader.h"

#include "nir_builder.h"
#include "vk_pipeline.h"

#include "nv_push.h"
#include "nv_push_cl9097.h"
#include "nv_push_cl906f.h"
#include "nv_push_cla0c0.h"
#include "nv_push_clb1c0.h"
#include "nv_push_clc6c0.h"

struct nvk_indirect_commands_layout {
   struct vk_object_base base;

   VkShaderStageFlags stages;

   /* Stages set by the first TOKEN_TYPE_EXECUTION_SET */
   VkShaderStageFlags set_stages;

   uint32_t cmd_seq_stride_B;
   uint32_t qmd_size_per_seq_B;

   struct nvk_shader *init;
   struct nvk_shader *process;
};

VK_DEFINE_NONDISP_HANDLE_CASTS(nvk_indirect_commands_layout, base,
                               VkIndirectCommandsLayoutEXT,
                               VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_EXT);

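/* Push-constant payload handed to the generated init/process shaders.  The
 * CPU-side struct below and the nir_def bundle in process_cmd_in must stay
 * in sync with the std140 interface built in load_process_cmd_in().
 */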
struct process_cmd_push {
    uint64_t in_addr;
    uint64_t out_addr;
    uint64_t qmd_pool_addr;
    uint64_t count_addr;
    uint32_t max_seq_count;
    uint32_t ies_stride;
    uint64_t ies_addr;
    uint64_t root_addr;
};

struct process_cmd_in {
    nir_def *in_addr;
    nir_def *out_addr;
    nir_def *qmd_pool_addr;
    nir_def *count_addr;
    nir_def *max_seq_count;
    nir_def *ies_stride;
    nir_def *ies_addr;
    nir_def *root_addr;
};

static nir_def *
load_struct_var(nir_builder *b, nir_variable *var, uint32_t field)
{
   nir_deref_instr *deref =
      nir_build_deref_struct(b, nir_build_deref_var(b, var), field);
   return nir_load_deref(b, deref);
}

static struct process_cmd_in
load_process_cmd_in(nir_builder *b)
{
   struct glsl_struct_field push_fields[] = {
      { .type = glsl_uint64_t_type(), .name = "in_addr",       .offset = 0x00 },
      { .type = glsl_uint64_t_type(), .name = "out_addr",      .offset = 0x08 },
      { .type = glsl_uint64_t_type(), .name = "qmd_pool_addr", .offset = 0x10 },
      { .type = glsl_uint64_t_type(), .name = "count_addr",    .offset = 0x18 },
      { .type = glsl_uint_type(),     .name = "max_seq_count", .offset = 0x20 },
      { .type = glsl_uint_type(),     .name = "ies_stride",    .offset = 0x24 },
      { .type = glsl_uint64_t_type(), .name = "ies_addr",      .offset = 0x28 },
      { .type = glsl_uint64_t_type(), .name = "root_addr",     .offset = 0x30 },
   };
   const struct glsl_type *push_iface_type =
      glsl_interface_type(push_fields, ARRAY_SIZE(push_fields),
                          GLSL_INTERFACE_PACKING_STD140,
                          false /* row_major */, "push");
   nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const,
                                            push_iface_type, "push");

   return (struct process_cmd_in) {
      .in_addr       = load_struct_var(b, push, 0),
      .out_addr      = load_struct_var(b, push, 1),
      .qmd_pool_addr = load_struct_var(b, push, 2),
      .count_addr    = load_struct_var(b, push, 3),
      .max_seq_count = load_struct_var(b, push, 4),
      .ies_stride    = load_struct_var(b, push, 5),
      .ies_addr      = load_struct_var(b, push, 6),
      .root_addr     = load_struct_var(b, push, 7),
   };
}

static nir_def *
build_exec_set_addr(nir_builder *b, struct process_cmd_in *in, nir_def *idx)
{
   nir_def *offset = nir_imul_2x32_64(b, in->ies_stride, idx);
   return nir_iadd(b, in->ies_addr, offset);
}

static nir_def *
load_global_dw(nir_builder *b, nir_def *addr, uint32_t offset_dw)
{
   return nir_load_global(b, nir_iadd_imm(b, addr, offset_dw * 4), 4, 1, 32);
}

static void
store_global_dw(nir_builder *b, nir_def *addr, uint32_t offset_dw,
                nir_def *data)
{
   assert(data->bit_size == 32 && data->num_components == 1);
   nir_store_global(b, nir_iadd_imm(b, addr, offset_dw * 4), 4, data, 0x1);
}

static void
nir_iadd_to_var(nir_builder *b, nir_variable *x_var, nir_def *y)
{
   nir_def *x = nir_load_var(b, x_var);
   x = nir_iadd(b, x, y);
   nir_store_var(b, x_var, x, 0x1);
}

static void
nir_iadd_to_var_imm(nir_builder *b, nir_variable *x_var, uint64_t y)
{
   nir_def *x = nir_load_var(b, x_var);
   x = nir_iadd_imm(b, x, y);
   nir_store_var(b, x_var, x, 0x1);
}

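/* Helper for emitting a pushbuf from inside the NIR shader.  addr and
 * dw_count track the write cursor at shader run time while max_dw_count
 * accumulates, at build time, the largest number of dwords a sequence can
 * emit; the difference is later filled with NO_OPERATION so every sequence
 * ends up with the same stride.
 */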
struct nvk_nir_push {
   nir_variable *addr;
   nir_variable *dw_count;
   unsigned max_dw_count;
};

static void
nvk_nir_push_start(nir_builder *b, struct nvk_nir_push *p, nir_def *addr)
{
   p->addr = nir_local_variable_create(b->impl, glsl_uint64_t_type(),
                                       "nvk_nir_push::addr");
   nir_store_var(b, p->addr, addr, 0x1);
   p->dw_count = nir_local_variable_create(b->impl, glsl_uint_type(),
                                           "nvk_nir_push::dw_count");
   nir_store_var(b, p->dw_count, nir_imm_int(b, 0), 0x1);
   p->max_dw_count = 0;
}

static inline void
nvk_nir_push_dw(nir_builder *b, struct nvk_nir_push *p, nir_def *dw)
{
   store_global_dw(b, nir_load_var(b, p->addr), 0, dw);
   nir_iadd_to_var_imm(b, p->addr, 4);
   nir_iadd_to_var_imm(b, p->dw_count, 1);
   p->max_dw_count++;
}

static inline void
nvk_nir_push_copy_dws(nir_builder *b, struct nvk_nir_push *p,
                      nir_def *src_dw_addr_in, nir_def *dw_count,
                      uint32_t max_dw_count)
{
   nir_variable *i = nir_local_variable_create(b->impl, glsl_uint_type(), "i");
   nir_store_var(b, i, nir_imm_int(b, 0), 0x1);

   nir_variable *src_dw_addr =
      nir_local_variable_create(b->impl, glsl_uint64_t_type(), "src_dw_addr");
   nir_store_var(b, src_dw_addr, src_dw_addr_in, 0x1);

   nir_push_loop(b);
   {
      nir_push_if(b, nir_uge(b, nir_load_var(b, i), dw_count));
      {
         nir_jump(b, nir_jump_break);
      }
      nir_pop_if(b, NULL);

      nir_def *dw = load_global_dw(b, nir_load_var(b, src_dw_addr), 0);
      store_global_dw(b, nir_load_var(b, p->addr), 0, dw);

      nir_iadd_to_var_imm(b, i, 1);
      nir_iadd_to_var_imm(b, p->addr, 4);
      nir_iadd_to_var_imm(b, src_dw_addr, 4);
   }
   nir_pop_loop(b, NULL);

   nir_iadd_to_var(b, p->dw_count, dw_count);
   p->max_dw_count += max_dw_count;
}

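/* Emits a 1-increment method header, mirroring what the CPU-side P_1INC()
 * helpers in nv_push.h do, except that the header is stored to the generated
 * pushbuf from inside the shader.
 */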
static inline void
nvk_nir_build_p_1inc(nir_builder *b, struct nvk_nir_push *p,
                     int subc, int mthd, unsigned dw_count)
{
   uint32_t hdr = NVC0_FIFO_PKHDR_1I(subc, mthd, dw_count);
   nvk_nir_push_dw(b, p, nir_imm_int(b, hdr));
}
#define nvk_nir_P_1INC(b, p, class, mthd, size_dw) \
   nvk_nir_build_p_1inc((b), (p), SUBC_##class, class##_##mthd, (size_dw))

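/* Pads the generated push out to p->max_dw_count with NO_OPERATION methods
 * so that every sequence has the same, statically known size.
 */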
static void
nvk_nir_build_pad_NOP(nir_builder *b, struct nvk_nir_push *p, uint32_t nop)
{
   nir_push_loop(b);
   {
      nir_push_if(b, nir_uge_imm(b, nir_load_var(b, p->dw_count),
                                    p->max_dw_count));
      {
         nir_jump(b, nir_jump_break);
      }
      nir_pop_if(b, NULL);

      store_global_dw(b, nir_load_var(b, p->addr), 0, nir_imm_int(b, nop));
      nir_iadd_to_var_imm(b, p->addr, 4);
      nir_iadd_to_var_imm(b, p->dw_count, 1);
   }
   nir_pop_loop(b, NULL);
}
#define nvk_nir_pad_NOP(b, p, class) \
   nvk_nir_build_pad_NOP((b), (p), \
      NVC0_FIFO_PKHDR_IL(SUBC_##class, class##_NO_OPERATION, 0))

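/* Every compute dispatch gets a fresh QMD followed by a patched copy of the
 * root descriptor table, carved out of a pool in the preprocess buffer in
 * QMD_ALIGN-sized units.
 */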
#define QMD_ALIGN 0x100
#define QMD_ALLOC_SIZE QMD_ALIGN
#define QMD_ROOT_SIZE (sizeof(struct nvk_ies_cs_qmd) + \
                       sizeof(struct nvk_root_descriptor_table))

static_assert(sizeof(struct nvk_ies_cs_qmd) % QMD_ALIGN == 0,
              "QMD size is not properly aligned");
static_assert(sizeof(struct nvk_root_descriptor_table) % QMD_ALIGN == 0,
              "Root descriptor table size is not aligned");
static_assert(NVK_DGC_ALIGN >= QMD_ALIGN,
              "QMD alignment requirement is a lower bound of DGC alignment");

static void
copy_repl_global_dw(nir_builder *b, nir_def *dst_addr, nir_def *src_addr,
                    nir_def **repl_dw, uint32_t dw_count)
{
   for (uint32_t i = 0; i < dw_count; i++) {
      nir_def *dw;
      if (repl_dw[i] == NULL)
         dw = load_global_dw(b, src_addr, i);
      else
         dw = repl_dw[i];
      store_global_dw(b, dst_addr, i, dw);
   }
}

static void
build_process_cs_cmd_seq(nir_builder *b, struct nvk_nir_push *p,
                         nir_def *in_addr, nir_def *seq_idx,
                         struct process_cmd_in *in,
                         struct nvk_physical_device *pdev,
                         const VkIndirectCommandsLayoutCreateInfoEXT *info,
                         uint32_t *qmd_size_per_seq_B_out)
{
   /* If we don't have an indirect execution set, the QMD of the currently
    * bound compute shader is passed in via ies_addr.
    */
   nir_def *shader_qmd_addr = in->ies_addr;

   nir_def *root_repl[sizeof(struct nvk_root_descriptor_table) / 4] = {};

#define root_dw(member) ( \
   assert(nvk_root_descriptor_offset(member) % 4 == 0), \
   nvk_root_descriptor_offset(member) / 4)

   root_repl[root_dw(cs.base_group[0])] = nir_imm_int(b, 0);
   root_repl[root_dw(cs.base_group[1])] = nir_imm_int(b, 0);
   root_repl[root_dw(cs.base_group[2])] = nir_imm_int(b, 0);

   *qmd_size_per_seq_B_out = 0;
   for (uint32_t t = 0; t < info->tokenCount; t++) {
      const VkIndirectCommandsLayoutTokenEXT *token = &info->pTokens[t];

      nir_def *token_addr = nir_iadd_imm(b, in_addr, token->offset);
      switch (token->type) {
      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT: {
         assert(token->data.pExecutionSet->shaderStages ==
                VK_SHADER_STAGE_COMPUTE_BIT);
         assert(t == 0);

         nir_def *idx = load_global_dw(b, token_addr, 0);
         shader_qmd_addr = build_exec_set_addr(b, in, idx);
         break;
      }

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_EXT: {
         const VkPushConstantRange *pc_range =
            &token->data.pPushConstant->updateRange;

         assert(pc_range->offset % 4 == 0);
         assert(pc_range->size % 4 == 0);

         const uint32_t start_dw = root_dw(push) + (pc_range->offset / 4);
         for (uint32_t i = 0; i < pc_range->size / 4; i++)
            root_repl[start_dw + i] = load_global_dw(b, token_addr, i);
         break;
      }

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_SEQUENCE_INDEX_EXT: {
         const VkPushConstantRange *pc_range =
            &token->data.pPushConstant->updateRange;

         assert(pc_range->offset % 4 == 0);
         assert(pc_range->size == 4);

         const uint32_t dw = root_dw(push) + (pc_range->offset / 4);
         root_repl[dw] = seq_idx;
         break;
      }

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_EXT: {
         nir_def *disp_size_x = load_global_dw(b, token_addr, 0);
         nir_def *disp_size_y = load_global_dw(b, token_addr, 1);
         nir_def *disp_size_z = load_global_dw(b, token_addr, 2);

         *qmd_size_per_seq_B_out += QMD_ROOT_SIZE;

         nir_push_if(b, nir_ior(b, nir_ior(b, nir_ine_imm(b, disp_size_x, 0),
                                              nir_ine_imm(b, disp_size_y, 0)),
                                   nir_ine_imm(b, disp_size_z, 0)));
         {
            /* The first dword of the QMD pool is an allocator in units of
             * QMD_ALIGN (256) bytes.
             */
            nir_def *qmd_idx =
               nir_global_atomic(b, 32, in->qmd_pool_addr,
                                 nir_imm_int(b, QMD_ROOT_SIZE / QMD_ALIGN),
                                 .atomic_op = nir_atomic_op_iadd);
            nir_def *qmd_offset =
               nir_imul_imm(b, nir_u2u64(b, qmd_idx), QMD_ALIGN);
            nir_def *qmd_addr = nir_iadd(b, in->qmd_pool_addr, qmd_offset);
            nir_def *root_addr =
               nir_iadd_imm(b, qmd_addr, sizeof(struct nvk_ies_cs_qmd));

            /* Upload and patch the root descriptor table */
            root_repl[root_dw(cs.group_count[0])] = disp_size_x;
            root_repl[root_dw(cs.group_count[1])] = disp_size_y;
            root_repl[root_dw(cs.group_count[2])] = disp_size_z;
            copy_repl_global_dw(b, root_addr, in->root_addr,
                                root_repl, ARRAY_SIZE(root_repl));

            /* Upload and patch the QMD */
            const struct nak_qmd_dispatch_size_layout qmd_layout =
               nak_get_qmd_dispatch_size_layout(&pdev->info);
            assert(qmd_layout.x_start % 32 == 0);
            assert(qmd_layout.x_end == qmd_layout.x_start + 32);
            assert(qmd_layout.y_start == qmd_layout.x_start + 32);

            nir_def *qmd_repl[sizeof(struct nvk_ies_cs_qmd) / 4] = {};
            qmd_repl[qmd_layout.x_start / 32] = disp_size_x;

            if (qmd_layout.z_start == qmd_layout.y_start + 32) {
               qmd_repl[qmd_layout.y_start / 32] = disp_size_y;
               qmd_repl[qmd_layout.z_start / 32] = disp_size_z;
            } else {
               assert(qmd_layout.y_end == qmd_layout.y_start + 16);
               assert(qmd_layout.z_start == qmd_layout.x_start + 48);
               assert(qmd_layout.z_end == qmd_layout.z_start + 16);
               qmd_repl[qmd_layout.y_start / 32] =
                  nir_pack_32_2x16_split(b, nir_u2u16(b, disp_size_y),
                                            nir_u2u16(b, disp_size_z));
            }

            struct nak_qmd_cbuf_desc_layout cb0_layout =
               nak_get_qmd_cbuf_desc_layout(&pdev->info, 0);
            assert(cb0_layout.addr_lo_start % 32 == 0);
            assert(cb0_layout.addr_hi_start == cb0_layout.addr_lo_start + 32);
            const uint32_t cb0_addr_lo_dw = cb0_layout.addr_lo_start / 32;
            const uint32_t cb0_addr_hi_dw = cb0_layout.addr_hi_start / 32;
            qmd_repl[cb0_addr_lo_dw] = nir_unpack_64_2x32_split_x(b, root_addr);
            qmd_repl[cb0_addr_hi_dw] =
               nir_ior(b, load_global_dw(b, shader_qmd_addr, cb0_addr_hi_dw),
                          nir_unpack_64_2x32_split_y(b, root_addr));

            copy_repl_global_dw(b, qmd_addr, shader_qmd_addr,
                                qmd_repl, ARRAY_SIZE(qmd_repl));

            /* Now emit commands */
            nir_def *invoc = nir_imul_2x32_64(b, disp_size_x, disp_size_y);
            invoc = nir_imul(b, invoc, nir_u2u64(b, disp_size_z));
            nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS), 2);
            nvk_nir_push_dw(b, p, nir_unpack_64_2x32_split_y(b, invoc));
            nvk_nir_push_dw(b, p, nir_unpack_64_2x32_split_x(b, invoc));

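            /* Hand the QMD to the compute engine: SEND_PCAS_A takes the QMD
             * address shifted right by 8, i.e. in 256 B units, and the
             * SIGNALING_PCAS method below schedules it.
             */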
            nvk_nir_P_1INC(b, p, NVA0C0, SEND_PCAS_A, 1);
            nvk_nir_push_dw(b, p, nir_u2u32(b, nir_ushr_imm(b, qmd_addr, 8)));

            if (pdev->info.cls_compute >= AMPERE_COMPUTE_A) {
               uint32_t signal;
               V_NVC6C0_SEND_SIGNALING_PCAS2_B(signal,
                  PCAS_ACTION_INVALIDATE_COPY_SCHEDULE);
               nvk_nir_P_1INC(b, p, NVC6C0, SEND_SIGNALING_PCAS2_B, 1);
               nvk_nir_push_dw(b, p, nir_imm_int(b, signal));
            } else {
               uint32_t signal;
               V_NVA0C0_SEND_SIGNALING_PCAS_B(signal, {
                  .invalidate = INVALIDATE_TRUE,
                  .schedule = SCHEDULE_TRUE
               });
               nvk_nir_P_1INC(b, p, NVA0C0, SEND_SIGNALING_PCAS_B, 1);
               nvk_nir_push_dw(b, p, nir_imm_int(b, signal));
            }
         }
         nir_pop_if(b, NULL);
         break;
      }

      default:
         unreachable("Unsupported indirect token type");
      }
   }
}

/*
 * Graphics
 */

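/* Binds shaders from an indirect execution set.  Each IES entry is a
 * pre-baked push: for pipelines the first dword is the dword count of the
 * push that follows; for shader objects the first dword packs two 16-bit
 * counts and the one used depends on whether this shader is the last
 * pre-rasterization (non-fragment) stage being bound.
 */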
static void
build_gfx_set_exec(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
                   struct process_cmd_in *in,
                   struct nvk_physical_device *pdev,
                   const VkIndirectCommandsExecutionSetTokenEXT *token)
{
   switch (token->type) {
   case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT: {
      nir_def *idx = load_global_dw(b, token_addr, 0);
      nir_def *push_addr = build_exec_set_addr(b, in, idx);
      nir_def *dw_count = load_global_dw(b, push_addr, 0);
      const uint16_t max_dw_count =
         nvk_ies_gfx_pipeline_max_dw_count(pdev, token->shaderStages);
      nvk_nir_push_copy_dws(b, p, nir_iadd_imm(b, push_addr, 4),
                            dw_count, max_dw_count);
      break;
   }

   case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT: {
      int32_t i = 0;
      gl_shader_stage type_stage[6] = {};
      nir_def *type_shader_idx[6] = {};
      gl_shader_stage last_vtgm = MESA_SHADER_VERTEX;
      u_foreach_bit(s, token->shaderStages) {
         gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);

         if (stage != MESA_SHADER_FRAGMENT)
            last_vtgm = stage;

         uint32_t type = mesa_to_nv9097_shader_type(stage);
         type_stage[type] = stage;
         type_shader_idx[type] = load_global_dw(b, token_addr, i++);
      }

      for (uint32_t type = 0; type < 6; type++) {
         nir_def *shader_idx = type_shader_idx[type];
         if (shader_idx == NULL)
            continue;

         bool is_last_vtgm = type_stage[type] == last_vtgm;

         nir_def *push_addr = build_exec_set_addr(b, in, shader_idx);
         nir_def *hdr = load_global_dw(b, push_addr, 0);
         nir_def *dw_count =
            nir_extract_u16(b, hdr, nir_imm_int(b, is_last_vtgm));
         const uint16_t max_dw_count =
            nvk_ies_gfx_shader_max_dw_count(pdev, token->shaderStages,
                                            is_last_vtgm);
         nvk_nir_push_copy_dws(b, p, nir_iadd_imm(b, push_addr, 4),
                               dw_count, max_dw_count);
      }
      break;
   }

   default:
      unreachable("Unknown indirect execution set type");
   }
}

static void
build_push_gfx_const(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
                     const VkIndirectCommandsPushConstantTokenEXT *token)
{
   const VkPushConstantRange *pc_range = &token->updateRange;

   // TODO: Compute
   assert(!(pc_range->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT));

   assert(pc_range->offset % 4 == 0);
   assert(pc_range->size % 4 == 0);
   const uint32_t dw_count = pc_range->size / 4;

   nvk_nir_P_1INC(b, p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET, 1 + dw_count);
   nvk_nir_push_dw(b, p, nir_imm_int(b,
      nvk_root_descriptor_offset(push) + pc_range->offset));
   for (uint32_t i = 0; i < dw_count; i++)
      nvk_nir_push_dw(b, p, load_global_dw(b, token_addr, i));
}

static void
build_push_gfx_seq_idx(nir_builder *b, struct nvk_nir_push *p,
                       nir_def *token_addr, nir_def *seq_idx,
                       const VkIndirectCommandsPushConstantTokenEXT *token)
{
   const VkPushConstantRange *pc_range = &token->updateRange;

   // TODO: Compute
   assert(!(pc_range->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT));

   assert(pc_range->offset % 4 == 0);
   assert(pc_range->size == 4);
   nvk_nir_P_1INC(b, p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET, 2);
   nvk_nir_push_dw(b, p, nir_imm_int(b,
      nvk_root_descriptor_offset(push) + pc_range->offset));
   nvk_nir_push_dw(b, p, seq_idx);
}

static void
build_set_ib(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
             UNUSED const VkIndirectCommandsIndexBufferTokenEXT *token)
{
   nir_def *addr_lo = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi = load_global_dw(b, token_addr, 1);
   nir_def *size_B  = load_global_dw(b, token_addr, 2);
   nir_def *idx_fmt = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_IB), 4);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, size_B);
   nvk_nir_push_dw(b, p, idx_fmt);
}

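/* The NVK_MME_SET_VB_STRIDE macro takes the stride in the low 16 bits and
 * the vertex buffer binding index in the high 16 bits.
 */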
static nir_def *
nvk_nir_vb_stride(nir_builder *b, nir_def *vb_idx, nir_def *stride)
{
   return nir_pack_32_2x16_split(b, nir_u2u16(b, stride),
                                    nir_u2u16(b, vb_idx));
}

static void
build_set_vb(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
             const VkIndirectCommandsVertexBufferTokenEXT *token)
{
   nir_def *vb_idx = nir_imm_int(b, token->vertexBindingUnit);
   nir_def *addr_lo  = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi  = load_global_dw(b, token_addr, 1);
   nir_def *size_B   = load_global_dw(b, token_addr, 2);
   nir_def *stride_B = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_VB), 4);
   nvk_nir_push_dw(b, p, vb_idx);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, size_B);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_SET_VB_STRIDE), 1);
   nvk_nir_push_dw(b, p, nvk_nir_vb_stride(b, vb_idx, stride_B));
}

static void
build_draw(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr)
{
   nir_def *vertex_count   = load_global_dw(b, token_addr, 0);
   nir_def *instance_count = load_global_dw(b, token_addr, 1);
   nir_def *first_vertex   = load_global_dw(b, token_addr, 2);
   nir_def *first_instance = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW), 5);
   nvk_nir_push_dw(b, p, nir_imm_int(b, 0)); /* draw_index */
   nvk_nir_push_dw(b, p, vertex_count);
   nvk_nir_push_dw(b, p, instance_count);
   nvk_nir_push_dw(b, p, first_vertex);
   nvk_nir_push_dw(b, p, first_instance);
}

static void
build_draw_indexed(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr)
{
   nir_def *index_count    = load_global_dw(b, token_addr, 0);
   nir_def *instance_count = load_global_dw(b, token_addr, 1);
   nir_def *first_index    = load_global_dw(b, token_addr, 2);
   nir_def *vertex_offset  = load_global_dw(b, token_addr, 3);
   nir_def *first_instance = load_global_dw(b, token_addr, 4);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED), 6);
   nvk_nir_push_dw(b, p, nir_imm_int(b, 0)); /* draw_index */
   nvk_nir_push_dw(b, p, index_count);
   nvk_nir_push_dw(b, p, instance_count);
   nvk_nir_push_dw(b, p, first_index);
   nvk_nir_push_dw(b, p, vertex_offset);
   nvk_nir_push_dw(b, p, first_instance);
}

static void
build_draw_count(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr)
{
   nir_def *addr_lo = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi = load_global_dw(b, token_addr, 1);
   nir_def *stride  = load_global_dw(b, token_addr, 2);
   nir_def *count   = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT), 4);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, count);
   nvk_nir_push_dw(b, p, stride);
}

static void
build_draw_indexed_count(nir_builder *b, struct nvk_nir_push *p,
                         nir_def *token_addr)
{
   nir_def *addr_lo = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi = load_global_dw(b, token_addr, 1);
   nir_def *stride  = load_global_dw(b, token_addr, 2);
   nir_def *count   = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT), 4);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, count);
   nvk_nir_push_dw(b, p, stride);
}

static void
build_process_gfx_cmd_seq(nir_builder *b, struct nvk_nir_push *p,
                          nir_def *in_addr, nir_def *seq_idx,
                          struct process_cmd_in *in,
                          struct nvk_physical_device *pdev,
                          const VkIndirectCommandsLayoutCreateInfoEXT *info)
{
   for (uint32_t t = 0; t < info->tokenCount; t++) {
      const VkIndirectCommandsLayoutTokenEXT *token = &info->pTokens[t];

      nir_def *token_addr = nir_iadd_imm(b, in_addr, token->offset);
      switch (token->type) {
      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT:
         assert(t == 0);
         build_gfx_set_exec(b, p, token_addr, in, pdev,
                            token->data.pExecutionSet);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_EXT:
         build_push_gfx_const(b, p, token_addr, token->data.pPushConstant);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_SEQUENCE_INDEX_EXT:
         build_push_gfx_seq_idx(b, p, token_addr, seq_idx,
                                token->data.pPushConstant);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_EXT:
         build_set_ib(b, p, token_addr, token->data.pIndexBuffer);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_EXT:
         build_set_vb(b, p, token_addr, token->data.pVertexBuffer);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_EXT:
         build_draw_indexed(b, p, token_addr);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_EXT:
         build_draw(b, p, token_addr);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_COUNT_EXT:
         build_draw_indexed_count(b, p, token_addr);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_COUNT_EXT:
         build_draw_count(b, p, token_addr);
         break;

      default:
         unreachable("Unsupported indirect token type");
      }
   }
}

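/* Builds the one-off initialization shader.  Its only job is to set up the
 * QMD pool allocator before the process shader runs; graphics-only layouts
 * need no initialization at all.
 */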
static VkResult
build_init_shader(struct nvk_device *dev,
                  const VkIndirectCommandsLayoutCreateInfoEXT *info,
                  uint32_t qmd_size_per_seq_B,
                  const VkAllocationCallbacks *pAllocator,
                  struct nvk_shader **shader_out)
{
   /* There's nothing to initialize for graphics */
   if (info->shaderStages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
      assert(!(info->shaderStages & ~NVK_SHADER_STAGE_GRAPHICS_BITS));
      *shader_out = NULL;
      return VK_SUCCESS;
   }

   if (qmd_size_per_seq_B == 0) {
      *shader_out = NULL;
      return VK_SUCCESS;
   }

   nir_builder build =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
                                     "nvk-init-indirect-commands");
   build.shader->info.workgroup_size[0] = 32;
   nir_builder *b = &build;

   struct process_cmd_in in = load_process_cmd_in(b);

   if (qmd_size_per_seq_B > 0) {
      /* Initialize the QMD allocator to 1 * QMD_ALIGN so that the QMDs we
       * allocate don't stomp the allocator.
       */
      assert(info->shaderStages == VK_SHADER_STAGE_COMPUTE_BIT);
      store_global_dw(b, in.qmd_pool_addr, 0, nir_imm_int(b, 1));
   }

   return nvk_compile_nir_shader(dev, build.shader, pAllocator, shader_out);
}

static VkResult
build_process_shader(struct nvk_device *dev,
                     const VkIndirectCommandsLayoutCreateInfoEXT *info,
                     const VkAllocationCallbacks *pAllocator,
                     struct nvk_shader **shader_out,
                     uint32_t *cmd_seq_stride_B_out,
                     uint32_t *qmd_size_per_seq_B_out)
{
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   nir_builder build =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
                                     "nvk-process-indirect-commands");
   build.shader->info.workgroup_size[0] = 32;
   nir_builder *b = &build;

   struct process_cmd_in in = load_process_cmd_in(b);

   nir_def *seq_idx = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);

   /* We always execute a 32-wide shader and nothing guarantees that
    * max_seq_count is a multiple of 32, so we need to bail if our index is
    * at or above the maximum.  If we're inside the maximum but at or above
    * the count, we still need to emit a sequence full of NOPs.
    */
   nir_push_if(b, nir_uge(b, seq_idx, in.max_seq_count));
   {
      nir_jump(b, nir_jump_halt);
   }
   nir_pop_if(b, NULL);

   nir_def *ind_count;
   nir_push_if(b, nir_ine_imm(b, in.count_addr, 0));
   {
      ind_count = load_global_dw(b, in.count_addr, 0);
      ind_count = nir_umin(b, ind_count, in.max_seq_count);
   }
   nir_pop_if(b, NULL);
   nir_def *count = nir_if_phi(b, ind_count, in.max_seq_count);

   nir_def *in_seq_addr = nir_iadd(b, in.in_addr,
      nir_imul_imm(b, nir_u2u64(b, seq_idx), info->indirectStride));
   /* We'll replace this later once we know what it is */
   nir_def *out_stride = nir_imm_int(b, 0xc0ffee0);
   nir_def *out_seq_addr = nir_iadd(b, in.out_addr,
      nir_imul_2x32_64(b, seq_idx, out_stride));

   struct nvk_nir_push push = {};
   nvk_nir_push_start(b, &push, out_seq_addr);

   nir_push_if(b, nir_ult(b, seq_idx, count));
   {
      if (info->shaderStages & VK_SHADER_STAGE_COMPUTE_BIT) {
         assert(info->shaderStages == VK_SHADER_STAGE_COMPUTE_BIT);
         build_process_cs_cmd_seq(b, &push, in_seq_addr, seq_idx,
                                  &in, pdev, info, qmd_size_per_seq_B_out);
      } else if (info->shaderStages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
         assert(!(info->shaderStages & ~NVK_SHADER_STAGE_GRAPHICS_BITS));
         *qmd_size_per_seq_B_out = 0;
         build_process_gfx_cmd_seq(b, &push, in_seq_addr, seq_idx,
                                   &in, pdev, info);
      } else {
         unreachable("Unknown shader stage");
      }
   }
   nir_pop_if(b, NULL);

   /* Always pad the command buffer.  In the case where seq_idx >= count, the
    * entire sequence will be NO_OPERATION.
    */
   if (info->shaderStages & VK_SHADER_STAGE_COMPUTE_BIT) {
      nvk_nir_pad_NOP(b, &push, NVA0C0);
   } else if (info->shaderStages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
      nvk_nir_pad_NOP(b, &push, NV9097);
   } else {
      unreachable("Unknown shader stage");
   }

   /* Replace the out stride with the actual size of a command stream */
   nir_load_const_instr *out_stride_const =
      nir_instr_as_load_const(out_stride->parent_instr);
   out_stride_const->value[0].u32 = push.max_dw_count * 4;

   /* We also output this stride to go in the layout struct */
   *cmd_seq_stride_B_out = push.max_dw_count * 4;

   return nvk_compile_nir_shader(dev, build.shader, pAllocator, shader_out);
}

static void
nvk_indirect_commands_layout_destroy(struct nvk_device *dev,
                                     struct nvk_indirect_commands_layout *layout,
                                     const VkAllocationCallbacks *alloc)
{
   if (layout->init != NULL)
      vk_shader_destroy(&dev->vk, &layout->init->vk, alloc);
   if (layout->process != NULL)
      vk_shader_destroy(&dev->vk, &layout->process->vk, alloc);
   vk_object_free(&dev->vk, alloc, layout);
}

VKAPI_ATTR VkResult VKAPI_CALL
nvk_CreateIndirectCommandsLayoutEXT(
    VkDevice _device,
    const VkIndirectCommandsLayoutCreateInfoEXT *pCreateInfo,
    const VkAllocationCallbacks *pAllocator,
    VkIndirectCommandsLayoutEXT *pIndirectCommandsLayout)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);
   VkResult result;

   struct nvk_indirect_commands_layout *layout =
      vk_object_zalloc(&dev->vk, pAllocator, sizeof(*layout),
                       VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_EXT);
   if (layout == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   layout->stages = pCreateInfo->shaderStages;

   /* From the Vulkan 1.3.XXX spec:
    *
    *    VUID-VkIndirectCommandsLayoutCreateInfoEXT-pTokens-11093
    *
    *    "The number of tokens in the pTokens array with type equal to
    *    VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT must be less than
    *    or equal to 1"
    *
    * and
    *
    *    VUID-VkIndirectCommandsLayoutCreateInfoEXT-pTokens-11139
    *
    *    "If the pTokens array contains a
    *    VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT token, then this
    *    token must be the first token in the array"
    */
   if (pCreateInfo->tokenCount > 0 &&
       pCreateInfo->pTokens[0].type ==
         VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT) {
      const VkIndirectCommandsExecutionSetTokenEXT *token =
         pCreateInfo->pTokens[0].data.pExecutionSet;

      /* Pipelines should never mismatch here. */
      if (token->type == VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT)
         assert(token->shaderStages == pCreateInfo->shaderStages);

      layout->set_stages = token->shaderStages;
   }

   result = build_process_shader(dev, pCreateInfo, pAllocator,
                                 &layout->process, &layout->cmd_seq_stride_B,
                                 &layout->qmd_size_per_seq_B);
   if (result != VK_SUCCESS) {
      nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
      return result;
   }

   if (layout->cmd_seq_stride_B > (NV_PUSH_MAX_COUNT * 4)) {
      nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
      return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                       "Too many tokens in IndirectCommandsLayout");
   }

   result = build_init_shader(dev, pCreateInfo, layout->qmd_size_per_seq_B,
                              pAllocator, &layout->init);
   if (result != VK_SUCCESS) {
      nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
      return result;
   }

   *pIndirectCommandsLayout = nvk_indirect_commands_layout_to_handle(layout);

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
nvk_DestroyIndirectCommandsLayoutEXT(
    VkDevice _device,
    VkIndirectCommandsLayoutEXT indirectCommandsLayout,
    const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  indirectCommandsLayout);

   if (layout == NULL)
      return;

   nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
}

VKAPI_ATTR void VKAPI_CALL
nvk_GetGeneratedCommandsMemoryRequirementsEXT(
    VkDevice _device,
    const VkGeneratedCommandsMemoryRequirementsInfoEXT *pInfo,
    VkMemoryRequirements2 *pMemoryRequirements)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  pInfo->indirectCommandsLayout);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

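   /* The preprocess buffer holds the generated pushbuf (one fixed-size
    * sequence per maxSequenceCount) followed, when compute dispatches are
    * involved, by the QMD pool: a QMD_ALLOC_SIZE block for the allocator
    * dword plus per-sequence QMD and root descriptor table space.
    */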
   uint64_t size = layout->cmd_seq_stride_B * (uint64_t)pInfo->maxSequenceCount;
   if (layout->qmd_size_per_seq_B > 0) {
      size = align64(size, QMD_ALIGN);
      size += QMD_ALLOC_SIZE;
      size += layout->qmd_size_per_seq_B * pInfo->maxSequenceCount;
   }

   pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
      .size = size,
      .alignment = QMD_ALIGN,
      .memoryTypeBits = BITFIELD_MASK(pdev->mem_type_count),
   };
}

static void
nvk_cmd_process_cmds(struct nvk_cmd_buffer *cmd,
                     const VkGeneratedCommandsInfoEXT *info,
                     const struct nvk_cmd_state *state)
{
   VK_FROM_HANDLE(nvk_indirect_execution_set, ies, info->indirectExecutionSet);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  info->indirectCommandsLayout);

   struct process_cmd_push push = {
      .in_addr       = info->indirectAddress,
      .out_addr      = info->preprocessAddress,
      .count_addr    = info->sequenceCountAddress,
      .max_seq_count = info->maxSequenceCount,
   };

   uint64_t qmd_addr = 0;
   if (layout->stages & VK_SHADER_STAGE_COMPUTE_BIT) {
      uint32_t global_size[3] = { 0, 0, 0 };
      VkResult result = nvk_cmd_flush_cs_qmd(cmd, state, global_size,
                                             &qmd_addr, &push.root_addr);
      if (unlikely(result != VK_SUCCESS)) {
         vk_command_buffer_set_error(&cmd->vk, result);
         return;
      }
   }

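   /* Without an execution set token, the process shader copies and patches
    * the QMD of the currently bound compute shader, so pass its address via
    * ies_addr instead of an IES.
    */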
   if (layout->set_stages == 0) {
      push.ies_addr = qmd_addr;
   } else {
      assert(layout->set_stages == layout->stages);
      push.ies_addr   = ies->mem->va->addr;
      push.ies_stride = ies->stride_B;
   }

   if (layout->qmd_size_per_seq_B > 0) {
      assert(info->preprocessAddress % QMD_ALIGN == 0);
      uint64_t qmd_offset =
         layout->cmd_seq_stride_B * (uint64_t)info->maxSequenceCount;
      qmd_offset = align64(qmd_offset, QMD_ALIGN);
      push.qmd_pool_addr = info->preprocessAddress + qmd_offset;
   }

   if (layout->init != NULL) {
      nvk_cmd_dispatch_shader(cmd, layout->init, &push, sizeof(push), 1, 1, 1);

      struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
      P_IMMD(p, NVA0C0, WAIT_FOR_IDLE, 0);
   }

   nvk_cmd_dispatch_shader(cmd, layout->process, &push, sizeof(push),
                           DIV_ROUND_UP(info->maxSequenceCount, 32), 1, 1);
}

static void
nvk_cmd_flush_process_state(struct nvk_cmd_buffer *cmd,
                            const VkGeneratedCommandsInfoEXT *info)
{
   struct nvk_descriptor_state *desc =
      nvk_get_descriptor_state_for_stages(cmd, info->shaderStages);
   nvk_cmd_buffer_flush_push_descriptors(cmd, desc);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdPreprocessGeneratedCommandsEXT(VkCommandBuffer commandBuffer,
                                      const VkGeneratedCommandsInfoEXT *info,
                                      VkCommandBuffer stateCommandBuffer)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_cmd_buffer, state_cmd, stateCommandBuffer);

   nvk_cmd_flush_process_state(state_cmd, info);
   nvk_cmd_process_cmds(cmd, info, &state_cmd->state);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdExecuteGeneratedCommandsEXT(VkCommandBuffer commandBuffer,
                                   VkBool32 isPreprocessed,
                                   const VkGeneratedCommandsInfoEXT *info)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  info->indirectCommandsLayout);
   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
   const struct nvk_physical_device *pdev = nvk_device_physical(dev);

   if (!isPreprocessed) {
      nvk_cmd_flush_process_state(cmd, info);
      nvk_cmd_process_cmds(cmd, info, &cmd->state);

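      /* The generated commands were just written by the process shader, so
       * invalidate the shader/constant caches (plus SKED caches where
       * supported) before the front end consumes them.
       */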
      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
      P_IMMD(p, NVA0C0, INVALIDATE_SHADER_CACHES, {
         .data = DATA_TRUE,
         .constant = CONSTANT_TRUE,
         .flush_data = FLUSH_DATA_TRUE,
      });
      if (pdev->info.cls_eng3d >= MAXWELL_COMPUTE_B)
         P_IMMD(p, NVB1C0, INVALIDATE_SKED_CACHES, 0);
      __push_immd(p, SUBC_NV9097, NV906F_SET_REFERENCE, 0);
   }

   if (layout->stages & VK_SHADER_STAGE_COMPUTE_BIT) {
      assert(info->shaderStages == VK_SHADER_STAGE_COMPUTE_BIT);
      nvk_cmd_buffer_flush_push_descriptors(cmd, &cmd->state.cs.descriptors);
   } else if (layout->stages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
      assert(!(layout->stages & ~NVK_SHADER_STAGE_GRAPHICS_BITS));

      nvk_cmd_buffer_flush_push_descriptors(cmd, &cmd->state.gfx.descriptors);
      nvk_cmd_flush_gfx_dynamic_state(cmd);

      if (layout->set_stages == 0) {
         /* In this case, we're using the CPU-bound shaders */
         nvk_cmd_flush_gfx_shaders(cmd);
         nvk_cmd_flush_gfx_cbufs(cmd);
      } else {
         /* From the Vulkan 1.3.XXX spec:
          *
          *    "If indirectCommandsLayout was created with a token sequence
          *    that contained the
          *    VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT token and
          *    indirectExecutionSet was created using
          *    VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT, every
          *    executed VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT
          *    token must bind all the shader stages set in the
          *    VkIndirectCommandsExecutionSetTokenEXT::shaderStages used to
          *    create indirectCommandsLayout"
          *
          * So we unbind anything not explicitly bound by the layout and trust
          * the layout to bind the rest.
          */
         assert(layout->set_stages == layout->stages);

         uint8_t set_types = 0;
         u_foreach_bit(s, layout->set_stages) {
            gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);
            uint32_t type = mesa_to_nv9097_shader_type(stage);
            set_types |= BITFIELD_BIT(type);
         }

         uint8_t unset_types = BITFIELD_MASK(6) & ~set_types;

         struct nv_push *p = nvk_cmd_buffer_push(cmd, 12);
         u_foreach_bit(type, unset_types) {
            P_IMMD(p, NV9097, SET_PIPELINE_SHADER(type), {
               .enable  = ENABLE_FALSE,
               .type    = type,
            });
         }
      }
   }

   ASSERTED const uint64_t size =
      layout->cmd_seq_stride_B * (uint64_t)info->maxSequenceCount;
   assert(size <= info->preprocessSize);

   uint64_t addr = info->preprocessAddress;
   uint64_t seq_count = info->maxSequenceCount;

   /* Break it into pieces that are a multiple of cmd_seq_stride_B so that,
    * if the kernel inserts a sync point between two of our pushes, it
    * doesn't break a single command.
    */
   const uint32_t max_seq_per_push =
      (NV_PUSH_MAX_COUNT * 4) / layout->cmd_seq_stride_B;

   while (seq_count > 0) {
      uint32_t push_seq = MIN2(seq_count, max_seq_per_push);
      uint32_t push_size_B = push_seq * layout->cmd_seq_stride_B;
      nvk_cmd_buffer_push_indirect(cmd, addr, push_size_B);
      addr += push_size_B;
      seq_count -= push_seq;
   }

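   /* The generated commands may have rebound shaders behind our back, so
    * force the graphics shader state to be re-emitted on the next draw.
    */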
   if (layout->set_stages != 0) {
      if (layout->stages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
         cmd->state.gfx.shaders_dirty |= NVK_SHADER_STAGE_GRAPHICS_BITS;
      }
   }
}