/*
 * Copyright © 2024 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_cmd_buffer.h"
#include "nvk_device.h"
#include "nvk_entrypoints.h"
#include "nvk_indirect_execution_set.h"
#include "nvk_mme.h"
#include "nvk_physical_device.h"
#include "nvk_shader.h"

#include "nir_builder.h"
#include "vk_pipeline.h"

#include "nv_push.h"
#include "nv_push_cl9097.h"
#include "nv_push_cl906f.h"
#include "nv_push_cla0c0.h"
#include "nv_push_clb1c0.h"
#include "nv_push_clc6c0.h"

struct nvk_indirect_commands_layout {
   struct vk_object_base base;

   VkShaderStageFlags stages;

   /* Stages set by the first TOKEN_TYPE_EXECUTION_SET */
   VkShaderStageFlags set_stages;

   uint32_t cmd_seq_stride_B;
   uint32_t qmd_size_per_seq_B;

   struct nvk_shader *init;
   struct nvk_shader *process;
};

VK_DEFINE_NONDISP_HANDLE_CASTS(nvk_indirect_commands_layout, base,
                               VkIndirectCommandsLayoutEXT,
                               VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_EXT);

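/* Push constants consumed by the init and process meta-shaders below.
 * process_cmd_push is the CPU-side view and process_cmd_in holds the
 * corresponding nir_defs; the field offsets have to line up with the std140
 * interface built in load_process_cmd_in().
 */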
struct process_cmd_push {
   uint64_t in_addr;
   uint64_t out_addr;
   uint64_t qmd_pool_addr;
   uint64_t count_addr;
   uint32_t max_seq_count;
   uint32_t ies_stride;
   uint64_t ies_addr;
   uint64_t root_addr;
};

struct process_cmd_in {
   nir_def *in_addr;
   nir_def *out_addr;
   nir_def *qmd_pool_addr;
   nir_def *count_addr;
   nir_def *max_seq_count;
   nir_def *ies_stride;
   nir_def *ies_addr;
   nir_def *root_addr;
};

static nir_def *
load_struct_var(nir_builder *b, nir_variable *var, uint32_t field)
{
   nir_deref_instr *deref =
      nir_build_deref_struct(b, nir_build_deref_var(b, var), field);
   return nir_load_deref(b, deref);
}

static struct process_cmd_in
load_process_cmd_in(nir_builder *b)
{
   struct glsl_struct_field push_fields[] = {
      { .type = glsl_uint64_t_type(), .name = "in_addr", .offset = 0x00 },
      { .type = glsl_uint64_t_type(), .name = "out_addr", .offset = 0x08 },
      { .type = glsl_uint64_t_type(), .name = "qmd_pool_addr", .offset = 0x10 },
      { .type = glsl_uint64_t_type(), .name = "count_addr", .offset = 0x18 },
      { .type = glsl_uint_type(), .name = "max_seq_count", .offset = 0x20 },
      { .type = glsl_uint_type(), .name = "ies_stride", .offset = 0x24 },
      { .type = glsl_uint64_t_type(), .name = "ies_addr", .offset = 0x28 },
      { .type = glsl_uint64_t_type(), .name = "root_addr", .offset = 0x30 },
   };
   const struct glsl_type *push_iface_type =
      glsl_interface_type(push_fields, ARRAY_SIZE(push_fields),
                          GLSL_INTERFACE_PACKING_STD140,
                          false /* row_major */, "push");
   nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const,
                                            push_iface_type, "push");

   return (struct process_cmd_in) {
      .in_addr = load_struct_var(b, push, 0),
      .out_addr = load_struct_var(b, push, 1),
      .qmd_pool_addr = load_struct_var(b, push, 2),
      .count_addr = load_struct_var(b, push, 3),
      .max_seq_count = load_struct_var(b, push, 4),
      .ies_stride = load_struct_var(b, push, 5),
      .ies_addr = load_struct_var(b, push, 6),
      .root_addr = load_struct_var(b, push, 7),
   };
}

static nir_def *
build_exec_set_addr(nir_builder *b, struct process_cmd_in *in, nir_def *idx)
{
   nir_def *offset = nir_imul_2x32_64(b, in->ies_stride, idx);
   return nir_iadd(b, in->ies_addr, offset);
}

static nir_def *
load_global_dw(nir_builder *b, nir_def *addr, uint32_t offset_dw)
{
   return nir_load_global(b, nir_iadd_imm(b, addr, offset_dw * 4), 4, 1, 32);
}

static void
store_global_dw(nir_builder *b, nir_def *addr, uint32_t offset_dw,
                nir_def *data)
{
   assert(data->bit_size == 32 && data->num_components == 1);
   nir_store_global(b, nir_iadd_imm(b, addr, offset_dw * 4), 4, data, 0x1);
}

static void
nir_iadd_to_var(nir_builder *b, nir_variable *x_var, nir_def *y)
{
   nir_def *x = nir_load_var(b, x_var);
   x = nir_iadd(b, x, y);
   nir_store_var(b, x_var, x, 0x1);
}

static void
nir_iadd_to_var_imm(nir_builder *b, nir_variable *x_var, uint64_t y)
{
   nir_def *x = nir_load_var(b, x_var);
   x = nir_iadd_imm(b, x, y);
   nir_store_var(b, x_var, x, 0x1);
}

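/* Small helper for emitting a pushbuf from NIR: addr is the current write
 * cursor, dw_count is the number of dwords emitted so far at runtime, and
 * max_dw_count is the compile-time worst case used to pad every sequence to
 * a fixed stride.
 */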
struct nvk_nir_push {
   nir_variable *addr;
   nir_variable *dw_count;
   unsigned max_dw_count;
};

static void
nvk_nir_push_start(nir_builder *b, struct nvk_nir_push *p, nir_def *addr)
{
   p->addr = nir_local_variable_create(b->impl, glsl_uint64_t_type(),
                                       "nvk_nir_push::addr");
   nir_store_var(b, p->addr, addr, 0x1);
   p->dw_count = nir_local_variable_create(b->impl, glsl_uint_type(),
                                           "nvk_nir_push::dw_count");
   nir_store_var(b, p->dw_count, nir_imm_int(b, 0), 0x1);
   p->max_dw_count = 0;
}

static inline void
nvk_nir_push_dw(nir_builder *b, struct nvk_nir_push *p, nir_def *dw)
{
   store_global_dw(b, nir_load_var(b, p->addr), 0, dw);
   nir_iadd_to_var_imm(b, p->addr, 4);
   nir_iadd_to_var_imm(b, p->dw_count, 1);
   p->max_dw_count++;
}

static inline void
nvk_nir_push_copy_dws(nir_builder *b, struct nvk_nir_push *p,
                      nir_def *src_dw_addr_in, nir_def *dw_count,
                      uint32_t max_dw_count)
{
   nir_variable *i = nir_local_variable_create(b->impl, glsl_uint_type(), "i");
   nir_store_var(b, i, nir_imm_int(b, 0), 0x1);

   nir_variable *src_dw_addr =
      nir_local_variable_create(b->impl, glsl_uint64_t_type(), "src_dw_addr");
   nir_store_var(b, src_dw_addr, src_dw_addr_in, 0x1);

   nir_push_loop(b);
   {
      nir_push_if(b, nir_uge(b, nir_load_var(b, i), dw_count));
      {
         nir_jump(b, nir_jump_break);
      }
      nir_pop_if(b, NULL);

      nir_def *dw = load_global_dw(b, nir_load_var(b, src_dw_addr), 0);
      store_global_dw(b, nir_load_var(b, p->addr), 0, dw);

      nir_iadd_to_var_imm(b, i, 1);
      nir_iadd_to_var_imm(b, p->addr, 4);
      nir_iadd_to_var_imm(b, src_dw_addr, 4);
   }
   nir_pop_loop(b, NULL);

   nir_iadd_to_var(b, p->dw_count, dw_count);
   p->max_dw_count += max_dw_count;
}

static inline void
nvk_nir_build_p_1inc(nir_builder *b, struct nvk_nir_push *p,
                     int subc, int mthd, unsigned dw_count)
{
   uint32_t hdr = NVC0_FIFO_PKHDR_1I(subc, mthd, dw_count);
   nvk_nir_push_dw(b, p, nir_imm_int(b, hdr));
}
#define nvk_nir_P_1INC(b, p, class, mthd, size_dw) \
   nvk_nir_build_p_1inc((b), (p), SUBC_##class, class##_##mthd, (size_dw))

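/* Pad the remainder of the sequence with NO_OPERATION so that every sequence
 * ends up exactly max_dw_count dwords long.
 */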
static void
nvk_nir_build_pad_NOP(nir_builder *b, struct nvk_nir_push *p, uint32_t nop)
{
   nir_push_loop(b);
   {
      nir_push_if(b, nir_uge_imm(b, nir_load_var(b, p->dw_count),
                                 p->max_dw_count));
      {
         nir_jump(b, nir_jump_break);
      }
      nir_pop_if(b, NULL);

      store_global_dw(b, nir_load_var(b, p->addr), 0, nir_imm_int(b, nop));
      nir_iadd_to_var_imm(b, p->addr, 4);
      nir_iadd_to_var_imm(b, p->dw_count, 1);
   }
   nir_pop_loop(b, NULL);
}
#define nvk_nir_pad_NOP(b, p, class) \
   nvk_nir_build_pad_NOP((b), (p), \
      NVC0_FIFO_PKHDR_IL(SUBC_##class, class##_NO_OPERATION, 0))

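/* Layout of the QMD pool that lives after the generated pushbufs in the
 * preprocess buffer: the first dword is a bump allocator counted in units of
 * QMD_ALIGN, and each dispatch allocates a QMD followed by a patched root
 * descriptor table.
 */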
#define QMD_ALIGN 0x100
#define QMD_ALLOC_SIZE QMD_ALIGN
#define QMD_ROOT_SIZE (sizeof(struct nvk_ies_cs_qmd) + \
                       sizeof(struct nvk_root_descriptor_table))

static_assert(sizeof(struct nvk_ies_cs_qmd) % QMD_ALIGN == 0,
              "QMD size is not properly aligned");
static_assert(sizeof(struct nvk_root_descriptor_table) % QMD_ALIGN == 0,
              "Root descriptor table size is not aligned");

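/* Copy dw_count dwords from src_addr to dst_addr, substituting any dword for
 * which repl_dw[] provides a replacement value.
 */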
static void
copy_repl_global_dw(nir_builder *b, nir_def *dst_addr, nir_def *src_addr,
                    nir_def **repl_dw, uint32_t dw_count)
{
   for (uint32_t i = 0; i < dw_count; i++) {
      nir_def *dw;
      if (repl_dw[i] == NULL)
         dw = load_global_dw(b, src_addr, i);
      else
         dw = repl_dw[i];
      store_global_dw(b, dst_addr, i, dw);
   }
}

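/* Emit the compute commands for a single sequence: allocate a QMD plus root
 * descriptor table from the QMD pool, patch in the dispatch size and any
 * push constants, and push SEND_PCAS to launch it.
 */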
static void
build_process_cs_cmd_seq(nir_builder *b, struct nvk_nir_push *p,
                         nir_def *in_addr, nir_def *seq_idx,
                         struct process_cmd_in *in,
                         struct nvk_physical_device *pdev,
                         const VkIndirectCommandsLayoutCreateInfoEXT *info,
                         uint32_t *qmd_size_per_seq_B_out)
{
   /* If we don't have any indirect execution set, the QMD address of the
    * currently bound shader is passed in here instead.
    */
   nir_def *shader_qmd_addr = in->ies_addr;

   nir_def *root_repl[sizeof(struct nvk_root_descriptor_table) / 4] = {};

#define root_dw(member) ( \
   assert(nvk_root_descriptor_offset(member) % 4 == 0), \
   nvk_root_descriptor_offset(member) / 4)

   root_repl[root_dw(cs.base_group[0])] = nir_imm_int(b, 0);
   root_repl[root_dw(cs.base_group[1])] = nir_imm_int(b, 0);
   root_repl[root_dw(cs.base_group[2])] = nir_imm_int(b, 0);

   *qmd_size_per_seq_B_out = 0;
   for (uint32_t t = 0; t < info->tokenCount; t++) {
      const VkIndirectCommandsLayoutTokenEXT *token = &info->pTokens[t];

      nir_def *token_addr = nir_iadd_imm(b, in_addr, token->offset);
      switch (token->type) {
      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT: {
         assert(token->data.pExecutionSet->shaderStages ==
                VK_SHADER_STAGE_COMPUTE_BIT);
         assert(t == 0);

         nir_def *idx = load_global_dw(b, token_addr, 0);
         shader_qmd_addr = build_exec_set_addr(b, in, idx);
         break;
      }

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_EXT: {
         const VkPushConstantRange *pc_range =
            &token->data.pPushConstant->updateRange;

         assert(pc_range->offset % 4 == 0);
         assert(pc_range->size % 4 == 0);

         const uint32_t start_dw = root_dw(push) + (pc_range->offset / 4);
         for (uint32_t i = 0; i < pc_range->size / 4; i++)
            root_repl[start_dw + i] = load_global_dw(b, token_addr, i);
         break;
      }

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_SEQUENCE_INDEX_EXT: {
         const VkPushConstantRange *pc_range =
            &token->data.pPushConstant->updateRange;

         assert(pc_range->offset % 4 == 0);
         assert(pc_range->size == 4);

         const uint32_t dw = root_dw(push) + (pc_range->offset / 4);
         root_repl[dw] = seq_idx;
         break;
      }

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_EXT: {
         nir_def *disp_size_x = load_global_dw(b, token_addr, 0);
         nir_def *disp_size_y = load_global_dw(b, token_addr, 1);
         nir_def *disp_size_z = load_global_dw(b, token_addr, 2);

         *qmd_size_per_seq_B_out += QMD_ROOT_SIZE;

         nir_push_if(b, nir_ior(b, nir_ior(b, nir_ine_imm(b, disp_size_x, 0),
                                              nir_ine_imm(b, disp_size_y, 0)),
                                nir_ine_imm(b, disp_size_z, 0)));
         {
            /* The first dword in qmd_addr is an allocator in units of 256
             * bytes.
             */
            nir_def *qmd_idx =
               nir_global_atomic(b, 32, in->qmd_pool_addr,
                                 nir_imm_int(b, QMD_ROOT_SIZE / QMD_ALIGN),
                                 .atomic_op = nir_atomic_op_iadd);
            nir_def *qmd_offset =
               nir_imul_imm(b, nir_u2u64(b, qmd_idx), QMD_ALIGN);
            nir_def *qmd_addr = nir_iadd(b, in->qmd_pool_addr, qmd_offset);
            nir_def *root_addr =
               nir_iadd_imm(b, qmd_addr, sizeof(struct nvk_ies_cs_qmd));

            /* Upload and patch the root descriptor table */
            root_repl[root_dw(cs.group_count[0])] = disp_size_x;
            root_repl[root_dw(cs.group_count[1])] = disp_size_y;
            root_repl[root_dw(cs.group_count[2])] = disp_size_z;
            copy_repl_global_dw(b, root_addr, in->root_addr,
                                root_repl, ARRAY_SIZE(root_repl));

            /* Upload and patch the QMD */
            const struct nak_qmd_dispatch_size_layout qmd_layout =
               nak_get_qmd_dispatch_size_layout(&pdev->info);
            assert(qmd_layout.x_start % 32 == 0);
            assert(qmd_layout.y_start == qmd_layout.x_start + 32);
            assert(qmd_layout.z_start == qmd_layout.x_start + 64);

            nir_def *qmd_repl[sizeof(struct nvk_ies_cs_qmd) / 4] = {};
            qmd_repl[qmd_layout.x_start / 32] = disp_size_x;
            qmd_repl[qmd_layout.y_start / 32] = disp_size_y;
            qmd_repl[qmd_layout.z_start / 32] = disp_size_z;

            /* TODO: Get these from NAK? */
            const uint32_t cb0_lo_start = 1024, cb0_hi_start = 1056;
            qmd_repl[cb0_lo_start / 32] = nir_unpack_64_2x32_split_x(b, root_addr);
            qmd_repl[cb0_hi_start / 32] =
               nir_ior(b, load_global_dw(b, shader_qmd_addr, cb0_hi_start / 32),
                       nir_unpack_64_2x32_split_y(b, root_addr));

            copy_repl_global_dw(b, qmd_addr, shader_qmd_addr,
                                qmd_repl, ARRAY_SIZE(qmd_repl));

            /* Now emit commands */
            nir_def *invoc = nir_imul_2x32_64(b, disp_size_x, disp_size_y);
            invoc = nir_imul(b, invoc, nir_u2u64(b, disp_size_z));
            nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS), 2);
            nvk_nir_push_dw(b, p, nir_unpack_64_2x32_split_y(b, invoc));
            nvk_nir_push_dw(b, p, nir_unpack_64_2x32_split_x(b, invoc));

            nvk_nir_P_1INC(b, p, NVA0C0, SEND_PCAS_A, 1);
            nvk_nir_push_dw(b, p, nir_u2u32(b, nir_ushr_imm(b, qmd_addr, 8)));

            if (pdev->info.cls_compute >= AMPERE_COMPUTE_A) {
               uint32_t signal;
               V_NVC6C0_SEND_SIGNALING_PCAS2_B(signal,
                                               PCAS_ACTION_INVALIDATE_COPY_SCHEDULE);
               nvk_nir_P_1INC(b, p, NVC6C0, SEND_SIGNALING_PCAS2_B, 1);
               nvk_nir_push_dw(b, p, nir_imm_int(b, signal));
            } else {
               uint32_t signal;
               V_NVA0C0_SEND_SIGNALING_PCAS_B(signal, {
                  .invalidate = INVALIDATE_TRUE,
                  .schedule = SCHEDULE_TRUE
               });
               nvk_nir_P_1INC(b, p, NVA0C0, SEND_SIGNALING_PCAS_B, 1);
               nvk_nir_push_dw(b, p, nir_imm_int(b, signal));
            }
         }
         nir_pop_if(b, NULL);
         break;
      }

      default:
         unreachable("Unsupported indirect token type");
      }
   }
}

/*
 * Graphics
 */

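/* Bind the shaders selected by an execution-set token by copying the
 * pre-baked push data stored in the indirect execution set, either one blob
 * per pipeline or one per shader object.
 */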
static void
build_gfx_set_exec(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
                   struct process_cmd_in *in,
                   struct nvk_physical_device *pdev,
                   const VkIndirectCommandsExecutionSetTokenEXT *token)
{
   switch (token->type) {
   case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT: {
      nir_def *idx = load_global_dw(b, token_addr, 0);
      nir_def *push_addr = build_exec_set_addr(b, in, idx);
      nir_def *dw_count = load_global_dw(b, push_addr, 0);
      const uint16_t max_dw_count =
         nvk_ies_gfx_pipeline_max_dw_count(pdev, token->shaderStages);
      nvk_nir_push_copy_dws(b, p, nir_iadd_imm(b, push_addr, 4),
                            dw_count, max_dw_count);
      break;
   }

   case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT: {
      int32_t i = 0;
      gl_shader_stage type_stage[6] = {};
      nir_def *type_shader_idx[6] = {};
      gl_shader_stage last_vtgm = MESA_SHADER_VERTEX;
      u_foreach_bit(s, token->shaderStages) {
         gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);

         if (stage != MESA_SHADER_FRAGMENT)
            last_vtgm = stage;

         uint32_t type = mesa_to_nv9097_shader_type(stage);
         type_stage[type] = stage;
         type_shader_idx[type] = load_global_dw(b, token_addr, i++);
      }

      for (uint32_t type = 0; type < 6; type++) {
         nir_def *shader_idx = type_shader_idx[type];
         if (shader_idx == NULL)
            continue;

         bool is_last_vtgm = type_stage[type] == last_vtgm;

         nir_def *push_addr = build_exec_set_addr(b, in, shader_idx);
         nir_def *hdr = load_global_dw(b, push_addr, 0);
         nir_def *dw_count =
            nir_extract_u16(b, hdr, nir_imm_int(b, is_last_vtgm));
         const uint16_t max_dw_count =
            nvk_ies_gfx_shader_max_dw_count(pdev, token->shaderStages,
                                            is_last_vtgm);
         nvk_nir_push_copy_dws(b, p, nir_iadd_imm(b, push_addr, 4),
                               dw_count, max_dw_count);
      }
      break;
   }

   default:
      unreachable("Unknown indirect execution set type");
   }
}

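/* Update a push constant range by writing the data dwords at the root
 * descriptor push offset with LOAD_CONSTANT_BUFFER_OFFSET; the 1INC header
 * sends every dword after the offset to the following LOAD_CONSTANT_BUFFER
 * method.
 */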
static void
build_push_gfx_const(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
                     const VkIndirectCommandsPushConstantTokenEXT *token)
{
   const VkPushConstantRange *pc_range = &token->updateRange;

   // TODO: Compute
   assert(!(pc_range->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT));

   assert(pc_range->offset % 4 == 0);
   assert(pc_range->size % 4 == 0);
   const uint32_t dw_count = pc_range->size / 4;

   nvk_nir_P_1INC(b, p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET, 1 + dw_count);
   nvk_nir_push_dw(b, p, nir_imm_int(b,
      nvk_root_descriptor_offset(push) + pc_range->offset));
   for (uint32_t i = 0; i < dw_count; i++)
      nvk_nir_push_dw(b, p, load_global_dw(b, token_addr, i));
}

static void
build_push_gfx_seq_idx(nir_builder *b, struct nvk_nir_push *p,
                       nir_def *token_addr, nir_def *seq_idx,
                       const VkIndirectCommandsPushConstantTokenEXT *token)
{
   const VkPushConstantRange *pc_range = &token->updateRange;

   // TODO: Compute
   assert(!(pc_range->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT));

   assert(pc_range->offset % 4 == 0);
   assert(pc_range->size == 4);
   nvk_nir_P_1INC(b, p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET, 2);
   nvk_nir_push_dw(b, p, nir_imm_int(b,
      nvk_root_descriptor_offset(push) + pc_range->offset));
   nvk_nir_push_dw(b, p, seq_idx);
}

static void
build_set_ib(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
             UNUSED const VkIndirectCommandsIndexBufferTokenEXT *token)
{
   nir_def *addr_lo = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi = load_global_dw(b, token_addr, 1);
   nir_def *size_B = load_global_dw(b, token_addr, 2);
   nir_def *idx_fmt = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_IB), 4);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, size_B);
   nvk_nir_push_dw(b, p, idx_fmt);
}

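/* NVK_MME_SET_VB_STRIDE takes the stride in the low 16 bits and the vertex
 * buffer index in the high 16 bits.
 */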
static nir_def *
nvk_nir_vb_stride(nir_builder *b, nir_def *vb_idx, nir_def *stride)
{
   return nir_pack_32_2x16_split(b, nir_u2u16(b, stride),
                                 nir_u2u16(b, vb_idx));
}

static void
build_set_vb(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
             const VkIndirectCommandsVertexBufferTokenEXT *token)
{
   nir_def *vb_idx = nir_imm_int(b, token->vertexBindingUnit);
   nir_def *addr_lo = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi = load_global_dw(b, token_addr, 1);
   nir_def *size_B = load_global_dw(b, token_addr, 2);
   nir_def *stride_B = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_VB), 4);
   nvk_nir_push_dw(b, p, vb_idx);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, size_B);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_SET_VB_STRIDE), 1);
   nvk_nir_push_dw(b, p, nvk_nir_vb_stride(b, vb_idx, stride_B));
}

static void
build_draw(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr)
{
   nir_def *vertex_count = load_global_dw(b, token_addr, 0);
   nir_def *instance_count = load_global_dw(b, token_addr, 1);
   nir_def *first_vertex = load_global_dw(b, token_addr, 2);
   nir_def *first_instance = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW), 5);
   nvk_nir_push_dw(b, p, nir_imm_int(b, 0)); /* draw_index */
   nvk_nir_push_dw(b, p, vertex_count);
   nvk_nir_push_dw(b, p, instance_count);
   nvk_nir_push_dw(b, p, first_vertex);
   nvk_nir_push_dw(b, p, first_instance);
}

static void
build_draw_indexed(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr)
{
   nir_def *index_count = load_global_dw(b, token_addr, 0);
   nir_def *instance_count = load_global_dw(b, token_addr, 1);
   nir_def *first_index = load_global_dw(b, token_addr, 2);
   nir_def *vertex_offset = load_global_dw(b, token_addr, 3);
   nir_def *first_instance = load_global_dw(b, token_addr, 4);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED), 6);
   nvk_nir_push_dw(b, p, nir_imm_int(b, 0)); /* draw_index */
   nvk_nir_push_dw(b, p, index_count);
   nvk_nir_push_dw(b, p, instance_count);
   nvk_nir_push_dw(b, p, first_index);
   nvk_nir_push_dw(b, p, vertex_offset);
   nvk_nir_push_dw(b, p, first_instance);
}

static void
build_draw_count(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr)
{
   nir_def *addr_lo = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi = load_global_dw(b, token_addr, 1);
   nir_def *stride = load_global_dw(b, token_addr, 2);
   nir_def *count = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT), 4);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, count);
   nvk_nir_push_dw(b, p, stride);
}

static void
build_draw_indexed_count(nir_builder *b, struct nvk_nir_push *p,
                         nir_def *token_addr)
{
   nir_def *addr_lo = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi = load_global_dw(b, token_addr, 1);
   nir_def *stride = load_global_dw(b, token_addr, 2);
   nir_def *count = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT), 4);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, count);
   nvk_nir_push_dw(b, p, stride);
}

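/* Emit the graphics commands for a single sequence by walking the token
 * array in order.
 */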
static void
build_process_gfx_cmd_seq(nir_builder *b, struct nvk_nir_push *p,
                          nir_def *in_addr, nir_def *seq_idx,
                          struct process_cmd_in *in,
                          struct nvk_physical_device *pdev,
                          const VkIndirectCommandsLayoutCreateInfoEXT *info)
{
   for (uint32_t t = 0; t < info->tokenCount; t++) {
      const VkIndirectCommandsLayoutTokenEXT *token = &info->pTokens[t];

      nir_def *token_addr = nir_iadd_imm(b, in_addr, token->offset);
      switch (token->type) {
      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT:
         assert(t == 0);
         build_gfx_set_exec(b, p, token_addr, in, pdev,
                            token->data.pExecutionSet);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_EXT:
         build_push_gfx_const(b, p, token_addr, token->data.pPushConstant);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_SEQUENCE_INDEX_EXT:
         build_push_gfx_seq_idx(b, p, token_addr, seq_idx,
                                token->data.pPushConstant);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_EXT:
         build_set_ib(b, p, token_addr, token->data.pIndexBuffer);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_EXT:
         build_set_vb(b, p, token_addr, token->data.pVertexBuffer);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_EXT:
         build_draw_indexed(b, p, token_addr);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_EXT:
         build_draw(b, p, token_addr);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_COUNT_EXT:
         build_draw_indexed_count(b, p, token_addr);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_COUNT_EXT:
         build_draw_count(b, p, token_addr);
         break;

      default:
         unreachable("Unsupported indirect token type");
      }
   }
}

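/* Build a trivial single-invocation shader that resets the QMD pool
 * allocator before any sequences are processed. Graphics layouts and
 * compute layouts without a dispatch token don't need one.
 */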
static VkResult
build_init_shader(struct nvk_device *dev,
                  const VkIndirectCommandsLayoutCreateInfoEXT *info,
                  uint32_t qmd_size_per_seq_B,
                  const VkAllocationCallbacks *pAllocator,
                  struct nvk_shader **shader_out)
{
   /* There's nothing to initialize for graphics */
   if (info->shaderStages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
      assert(!(info->shaderStages & ~NVK_SHADER_STAGE_GRAPHICS_BITS));
      *shader_out = NULL;
      return VK_SUCCESS;
   }

   if (qmd_size_per_seq_B == 0) {
      *shader_out = NULL;
      return VK_SUCCESS;
   }

   nir_builder build =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
                                     "nvk-init-indirect-commands");
   build.shader->info.workgroup_size[0] = 32;
   nir_builder *b = &build;

   struct process_cmd_in in = load_process_cmd_in(b);

   if (qmd_size_per_seq_B > 0) {
      /* Initialize the QMD allocator to 1 * QMD_ALIGN so that the QMDs we
       * allocate don't stomp the allocator.
       */
      assert(info->shaderStages == VK_SHADER_STAGE_COMPUTE_BIT);
      store_global_dw(b, in.qmd_pool_addr, 0, nir_imm_int(b, 1));
   }

   return nvk_compile_nir_shader(dev, build.shader, pAllocator, shader_out);
}

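/* Build the 32-wide shader that expands one sequence per invocation into a
 * fixed-stride chunk of pushbuf, padding unused space (and entire sequences
 * past the count) with NO_OPERATION.
 */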
static VkResult
build_process_shader(struct nvk_device *dev,
                     const VkIndirectCommandsLayoutCreateInfoEXT *info,
                     const VkAllocationCallbacks *pAllocator,
                     struct nvk_shader **shader_out,
                     uint32_t *cmd_seq_stride_B_out,
                     uint32_t *qmd_size_per_seq_B_out)
{
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   nir_builder build =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
                                     "nvk-process-indirect-commands");
   build.shader->info.workgroup_size[0] = 32;
   nir_builder *b = &build;

   struct process_cmd_in in = load_process_cmd_in(b);

   nir_def *seq_idx = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);

   /* We always execute a 32-wide shader and nothing guarantees that
    * max_seq_count is a multiple of 32 so we need to bail if our index is
    * above the maximum. If we're inside the maximum but less than the count,
    * we still need to emit a bunch of NOPs.
    */
   nir_push_if(b, nir_uge(b, seq_idx, in.max_seq_count));
   {
      nir_jump(b, nir_jump_halt);
   }
   nir_pop_if(b, NULL);

   nir_def *ind_count;
   nir_push_if(b, nir_ine_imm(b, in.count_addr, 0));
   {
      ind_count = load_global_dw(b, in.count_addr, 0);
      ind_count = nir_umin(b, ind_count, in.max_seq_count);
   }
   nir_pop_if(b, NULL);
   nir_def *count = nir_if_phi(b, ind_count, in.max_seq_count);

   nir_def *in_seq_addr = nir_iadd(b, in.in_addr,
      nir_imul_imm(b, nir_u2u64(b, seq_idx), info->indirectStride));
   /* We'll replace this later once we know what it is */
   nir_def *out_stride = nir_imm_int(b, 0xc0ffee0);
   nir_def *out_seq_addr = nir_iadd(b, in.out_addr,
      nir_imul_2x32_64(b, seq_idx, out_stride));

   struct nvk_nir_push push = {};
   nvk_nir_push_start(b, &push, out_seq_addr);

   nir_push_if(b, nir_ult(b, seq_idx, count));
   {
      if (info->shaderStages & VK_SHADER_STAGE_COMPUTE_BIT) {
         assert(info->shaderStages == VK_SHADER_STAGE_COMPUTE_BIT);
         build_process_cs_cmd_seq(b, &push, in_seq_addr, seq_idx,
                                  &in, pdev, info, qmd_size_per_seq_B_out);
      } else if (info->shaderStages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
         assert(!(info->shaderStages & ~NVK_SHADER_STAGE_GRAPHICS_BITS));
         *qmd_size_per_seq_B_out = 0;
         build_process_gfx_cmd_seq(b, &push, in_seq_addr, seq_idx,
                                   &in, pdev, info);
      } else {
         unreachable("Unknown shader stage");
      }
   }
   nir_pop_if(b, NULL);

   /* Always pad the command buffer. In the case where seq_idx >= count, the
    * entire sequence will be NO_OPERATION.
    */
   if (info->shaderStages & VK_SHADER_STAGE_COMPUTE_BIT) {
      nvk_nir_pad_NOP(b, &push, NVA0C0);
   } else if (info->shaderStages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
      nvk_nir_pad_NOP(b, &push, NV9097);
   } else {
      unreachable("Unknown shader stage");
   }

   /* Replace the out stride with the actual size of a command stream */
   nir_load_const_instr *out_stride_const =
      nir_instr_as_load_const(out_stride->parent_instr);
   out_stride_const->value[0].u32 = push.max_dw_count * 4;

   /* We also output this stride to go in the layout struct */
   *cmd_seq_stride_B_out = push.max_dw_count * 4;

   return nvk_compile_nir_shader(dev, build.shader, pAllocator, shader_out);
}

static void
nvk_indirect_commands_layout_destroy(struct nvk_device *dev,
                                     struct nvk_indirect_commands_layout *layout,
                                     const VkAllocationCallbacks *alloc)
{
   if (layout->init != NULL)
      vk_shader_destroy(&dev->vk, &layout->init->vk, alloc);
   if (layout->process != NULL)
      vk_shader_destroy(&dev->vk, &layout->process->vk, alloc);
   vk_object_free(&dev->vk, alloc, layout);
}

VKAPI_ATTR VkResult VKAPI_CALL
nvk_CreateIndirectCommandsLayoutEXT(
   VkDevice _device,
   const VkIndirectCommandsLayoutCreateInfoEXT *pCreateInfo,
   const VkAllocationCallbacks *pAllocator,
   VkIndirectCommandsLayoutEXT *pIndirectCommandsLayout)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);
   VkResult result;

   struct nvk_indirect_commands_layout *layout =
      vk_object_zalloc(&dev->vk, pAllocator, sizeof(*layout),
                       VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_EXT);
   if (layout == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   layout->stages = pCreateInfo->shaderStages;

   /* From the Vulkan 1.3.XXX spec:
    *
    *    VUID-VkIndirectCommandsLayoutCreateInfoEXT-pTokens-11093
    *
    *    "The number of tokens in the pTokens array with type equal to
    *    VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT must be less than
    *    or equal to 1"
    *
    * and
    *
    *    VUID-VkIndirectCommandsLayoutCreateInfoEXT-pTokens-11139
    *
    *    "If the pTokens array contains a
    *    VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT token, then this
    *    token must be the first token in the array"
    */
   if (pCreateInfo->tokenCount > 0 &&
       pCreateInfo->pTokens[0].type ==
       VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT) {
      const VkIndirectCommandsExecutionSetTokenEXT *token =
         pCreateInfo->pTokens[0].data.pExecutionSet;

      /* Pipelines should never mismatch here. */
      if (token->type == VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT)
         assert(token->shaderStages == pCreateInfo->shaderStages);

      layout->set_stages = token->shaderStages;
   }

   result = build_process_shader(dev, pCreateInfo, pAllocator,
                                 &layout->process, &layout->cmd_seq_stride_B,
                                 &layout->qmd_size_per_seq_B);
   if (result != VK_SUCCESS) {
      nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
      return result;
   }

   if (layout->cmd_seq_stride_B > (NV_PUSH_MAX_COUNT * 4)) {
      nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
      return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                       "Too many tokens in IndirectCommandsLayout");
   }

   result = build_init_shader(dev, pCreateInfo, layout->qmd_size_per_seq_B,
                              pAllocator, &layout->init);
   if (result != VK_SUCCESS) {
      nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
      return result;
   }

   *pIndirectCommandsLayout = nvk_indirect_commands_layout_to_handle(layout);

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
nvk_DestroyIndirectCommandsLayoutEXT(
   VkDevice _device,
   VkIndirectCommandsLayoutEXT indirectCommandsLayout,
   const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  indirectCommandsLayout);

   if (layout == NULL)
      return;

   nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
}

VKAPI_ATTR void VKAPI_CALL
nvk_GetGeneratedCommandsMemoryRequirementsEXT(
   VkDevice _device,
   const VkGeneratedCommandsMemoryRequirementsInfoEXT *pInfo,
   VkMemoryRequirements2 *pMemoryRequirements)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  pInfo->indirectCommandsLayout);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   uint64_t size = layout->cmd_seq_stride_B * (uint64_t)pInfo->maxSequenceCount;
   if (layout->qmd_size_per_seq_B > 0) {
      size = align64(size, QMD_ALIGN);
      size += QMD_ALLOC_SIZE;
      size += layout->qmd_size_per_seq_B * pInfo->maxSequenceCount;
   }

   pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
      .size = size,
      .alignment = QMD_ALIGN,
      .memoryTypeBits = BITFIELD_MASK(pdev->mem_type_count),
   };
}

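/* Fill out process_cmd_push and dispatch the layout's init and process
 * shaders to generate the pushbuf (and, for compute, the QMDs) in the
 * preprocess buffer.
 */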
static void
nvk_cmd_process_cmds(struct nvk_cmd_buffer *cmd,
                     const VkGeneratedCommandsInfoEXT *info,
                     const struct nvk_cmd_state *state)
{
   VK_FROM_HANDLE(nvk_indirect_execution_set, ies, info->indirectExecutionSet);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  info->indirectCommandsLayout);

   struct process_cmd_push push = {
      .in_addr = info->indirectAddress,
      .out_addr = info->preprocessAddress,
      .count_addr = info->sequenceCountAddress,
      .max_seq_count = info->maxSequenceCount,
   };

   uint64_t qmd_addr = 0;
   if (layout->stages & VK_SHADER_STAGE_COMPUTE_BIT) {
      uint32_t global_size[3] = { 0, 0, 0 };
      VkResult result = nvk_cmd_flush_cs_qmd(cmd, global_size, &qmd_addr,
                                             &push.root_addr);
      if (unlikely(result != VK_SUCCESS)) {
         vk_command_buffer_set_error(&cmd->vk, result);
         return;
      }
   }

   if (layout->set_stages == 0) {
      push.ies_addr = qmd_addr;
   } else {
      assert(layout->set_stages == layout->stages);
      push.ies_addr = ies->mem->va->addr;
      push.ies_stride = ies->stride_B;
   }

   if (layout->qmd_size_per_seq_B > 0) {
      assert(info->preprocessAddress % QMD_ALIGN == 0);
      uint64_t qmd_offset =
         layout->cmd_seq_stride_B * (uint64_t)info->maxSequenceCount;
      qmd_offset = align64(qmd_offset, QMD_ALIGN);
      push.qmd_pool_addr = info->preprocessAddress + qmd_offset;
   }

   if (layout->init != NULL) {
      nvk_cmd_dispatch_shader(cmd, layout->init, &push, sizeof(push), 1, 1, 1);

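      /* Make sure the init dispatch (which seeds the QMD allocator) has
       * finished before the process shader starts allocating from it.
       */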
      struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
      P_IMMD(p, NVA0C0, WAIT_FOR_IDLE, 0);
   }

   nvk_cmd_dispatch_shader(cmd, layout->process, &push, sizeof(push),
                           DIV_ROUND_UP(info->maxSequenceCount, 32), 1, 1);
}

static void
nvk_cmd_flush_process_state(struct nvk_cmd_buffer *cmd,
                            const VkGeneratedCommandsInfoEXT *info)
{
   struct nvk_descriptor_state *desc =
      nvk_get_descriptor_state_for_stages(cmd, info->shaderStages);
   nvk_cmd_buffer_flush_push_descriptors(cmd, desc);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdPreprocessGeneratedCommandsEXT(VkCommandBuffer commandBuffer,
                                      const VkGeneratedCommandsInfoEXT *info,
                                      VkCommandBuffer stateCommandBuffer)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_cmd_buffer, state_cmd, stateCommandBuffer);

   nvk_cmd_flush_process_state(state_cmd, info);
   nvk_cmd_process_cmds(cmd, info, &state_cmd->state);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdExecuteGeneratedCommandsEXT(VkCommandBuffer commandBuffer,
                                   VkBool32 isPreprocessed,
                                   const VkGeneratedCommandsInfoEXT *info)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  info->indirectCommandsLayout);

   if (!isPreprocessed) {
      nvk_cmd_flush_process_state(cmd, info);
      nvk_cmd_process_cmds(cmd, info, &cmd->state);

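      /* Flush and invalidate caches so the commands and QMDs just written by
       * the process shader are visible before they're executed.
       */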
      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
      P_IMMD(p, NVA0C0, INVALIDATE_SHADER_CACHES, {
         .data = DATA_TRUE,
         .constant = CONSTANT_TRUE,
         .flush_data = FLUSH_DATA_TRUE,
      });
      P_IMMD(p, NVB1C0, INVALIDATE_SKED_CACHES, 0);
      __push_immd(p, SUBC_NV9097, NV906F_SET_REFERENCE, 0);
   }

   if (layout->stages & VK_SHADER_STAGE_COMPUTE_BIT) {
      assert(info->shaderStages == VK_SHADER_STAGE_COMPUTE_BIT);
      nvk_cmd_buffer_flush_push_descriptors(cmd, &cmd->state.cs.descriptors);
   } else if (layout->stages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
      assert(!(layout->stages & ~NVK_SHADER_STAGE_GRAPHICS_BITS));

      nvk_cmd_buffer_flush_push_descriptors(cmd, &cmd->state.gfx.descriptors);
      nvk_cmd_flush_gfx_dynamic_state(cmd);

      if (layout->set_stages == 0) {
         /* In this case, we're using the CPU-bound shaders */
         nvk_cmd_flush_gfx_shaders(cmd);
         nvk_cmd_flush_gfx_cbufs(cmd);
      } else {
         /* From the Vulkan 1.3.XXX spec:
          *
          *    "If indirectCommandsLayout was created with a token sequence
          *    that contained the
          *    VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT token and
          *    indirectExecutionSet was created using
          *    VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT, every
          *    executed VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT
          *    token must bind all the shader stages set in the
          *    VkIndirectCommandsExecutionSetTokenEXT::shaderStages used to
          *    create indirectCommandsLayout"
          *
          * So we unbind anything not explicitly bound by the layout and trust
          * the layout to bind the rest.
          */
         assert(layout->set_stages == layout->stages);

         uint8_t set_types = 0;
         u_foreach_bit(s, layout->set_stages) {
            gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);
            uint32_t type = mesa_to_nv9097_shader_type(stage);
            set_types |= BITFIELD_BIT(type);
         }

         uint8_t unset_types = BITFIELD_MASK(6) & ~set_types;

         struct nv_push *p = nvk_cmd_buffer_push(cmd, 12);
         u_foreach_bit(type, unset_types) {
            P_IMMD(p, NV9097, SET_PIPELINE_SHADER(type), {
               .enable = ENABLE_FALSE,
               .type = type,
            });
         }
      }
   }

   ASSERTED const uint64_t size =
      layout->cmd_seq_stride_B * (uint64_t)info->maxSequenceCount;
   assert(size <= info->preprocessSize);

   uint64_t addr = info->preprocessAddress;
   uint64_t seq_count = info->maxSequenceCount;

   /* Break it into pieces that are a multiple of cmd_seq_stride_B so that, if
    * the kernel inserts a sync point between two of our pushes, it doesn't
    * break a single command.
    */
   const uint32_t max_seq_per_push =
      (NV_PUSH_MAX_COUNT * 4) / layout->cmd_seq_stride_B;

   while (seq_count > 0) {
      uint32_t push_seq = MIN2(seq_count, max_seq_per_push);
      uint32_t push_size_B = push_seq * layout->cmd_seq_stride_B;
      nvk_cmd_buffer_push_indirect(cmd, addr, push_size_B);
      addr += push_size_B;
      seq_count -= push_seq;
   }

   if (layout->set_stages != 0) {
      if (layout->stages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
         cmd->state.gfx.shaders_dirty |= NVK_SHADER_STAGE_GRAPHICS_BITS;
      }
   }
}