/*
 * Copyright © 2024 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_cmd_buffer.h"
#include "nvk_device.h"
#include "nvk_entrypoints.h"
#include "nvk_indirect_execution_set.h"
#include "nvk_mme.h"
#include "nvk_physical_device.h"
#include "nvk_shader.h"

#include "nir_builder.h"
#include "vk_pipeline.h"

#include "nv_push.h"
#include "nv_push_cl9097.h"
#include "nv_push_cl906f.h"
#include "nv_push_cla0c0.h"
#include "nv_push_clb1c0.h"
#include "nv_push_clc6c0.h"

struct nvk_indirect_commands_layout {
   struct vk_object_base base;

   VkShaderStageFlags stages;

   /* Stages set by the first TOKEN_TYPE_EXECUTION_SET */
   VkShaderStageFlags set_stages;

   uint32_t cmd_seq_stride_B;
   uint32_t qmd_size_per_seq_B;

   struct nvk_shader *init;
   struct nvk_shader *process;
};

VK_DEFINE_NONDISP_HANDLE_CASTS(nvk_indirect_commands_layout, base,
                               VkIndirectCommandsLayoutEXT,
                               VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_EXT);

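/* Push constant interface of the init/process shaders below.
 *
 * process_cmd_push is the CPU-side layout filled out in
 * nvk_cmd_process_cmds() and process_cmd_in holds the corresponding values
 * loaded on the NIR side by load_process_cmd_in().  The field order and
 * offsets of the two must match.
 */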
struct process_cmd_push {
   uint64_t in_addr;
   uint64_t out_addr;
   uint64_t qmd_pool_addr;
   uint64_t count_addr;
   uint32_t max_seq_count;
   uint32_t ies_stride;
   uint64_t ies_addr;
   uint64_t root_addr;
};

struct process_cmd_in {
   nir_def *in_addr;
   nir_def *out_addr;
   nir_def *qmd_pool_addr;
   nir_def *count_addr;
   nir_def *max_seq_count;
   nir_def *ies_stride;
   nir_def *ies_addr;
   nir_def *root_addr;
};

static nir_def *
load_struct_var(nir_builder *b, nir_variable *var, uint32_t field)
{
   nir_deref_instr *deref =
      nir_build_deref_struct(b, nir_build_deref_var(b, var), field);
   return nir_load_deref(b, deref);
}

static struct process_cmd_in
load_process_cmd_in(nir_builder *b)
{
   struct glsl_struct_field push_fields[] = {
      { .type = glsl_uint64_t_type(), .name = "in_addr", .offset = 0x00 },
      { .type = glsl_uint64_t_type(), .name = "out_addr", .offset = 0x08 },
      { .type = glsl_uint64_t_type(), .name = "qmd_pool_addr", .offset = 0x10 },
      { .type = glsl_uint64_t_type(), .name = "count_addr", .offset = 0x18 },
      { .type = glsl_uint_type(), .name = "max_seq_count", .offset = 0x20 },
      { .type = glsl_uint_type(), .name = "ies_stride", .offset = 0x24 },
      { .type = glsl_uint64_t_type(), .name = "ies_addr", .offset = 0x28 },
      { .type = glsl_uint64_t_type(), .name = "root_addr", .offset = 0x30 },
   };
   const struct glsl_type *push_iface_type =
      glsl_interface_type(push_fields, ARRAY_SIZE(push_fields),
                          GLSL_INTERFACE_PACKING_STD140,
                          false /* row_major */, "push");
   nir_variable *push = nir_variable_create(b->shader, nir_var_mem_push_const,
                                            push_iface_type, "push");

   return (struct process_cmd_in) {
      .in_addr = load_struct_var(b, push, 0),
      .out_addr = load_struct_var(b, push, 1),
      .qmd_pool_addr = load_struct_var(b, push, 2),
      .count_addr = load_struct_var(b, push, 3),
      .max_seq_count = load_struct_var(b, push, 4),
      .ies_stride = load_struct_var(b, push, 5),
      .ies_addr = load_struct_var(b, push, 6),
      .root_addr = load_struct_var(b, push, 7),
   };
}

static nir_def *
build_exec_set_addr(nir_builder *b, struct process_cmd_in *in, nir_def *idx)
{
   nir_def *offset = nir_imul_2x32_64(b, in->ies_stride, idx);
   return nir_iadd(b, in->ies_addr, offset);
}

static nir_def *
load_global_dw(nir_builder *b, nir_def *addr, uint32_t offset_dw)
{
   return nir_load_global(b, nir_iadd_imm(b, addr, offset_dw * 4), 4, 1, 32);
}

static void
store_global_dw(nir_builder *b, nir_def *addr, uint32_t offset_dw,
                nir_def *data)
{
   assert(data->bit_size == 32 && data->num_components == 1);
   nir_store_global(b, nir_iadd_imm(b, addr, offset_dw * 4), 4, data, 0x1);
}

static void
nir_iadd_to_var(nir_builder *b, nir_variable *x_var, nir_def *y)
{
   nir_def *x = nir_load_var(b, x_var);
   x = nir_iadd(b, x, y);
   nir_store_var(b, x_var, x, 0x1);
}

static void
nir_iadd_to_var_imm(nir_builder *b, nir_variable *x_var, uint64_t y)
{
   nir_def *x = nir_load_var(b, x_var);
   x = nir_iadd_imm(b, x, y);
   nir_store_var(b, x_var, x, 0x1);
}

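/* Helper for emitting a pushbuf from NIR.  addr is the current GPU write
 * cursor, dw_count tracks how many dwords have been emitted at runtime, and
 * max_dw_count is the worst-case (shader-build-time) dword count, which is
 * what nvk_nir_build_pad_NOP() pads up to so every sequence has a fixed
 * stride.
 */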
struct nvk_nir_push {
   nir_variable *addr;
   nir_variable *dw_count;
   unsigned max_dw_count;
};

static void
nvk_nir_push_start(nir_builder *b, struct nvk_nir_push *p, nir_def *addr)
{
   p->addr = nir_local_variable_create(b->impl, glsl_uint64_t_type(),
                                       "nvk_nir_push::addr");
   nir_store_var(b, p->addr, addr, 0x1);
   p->dw_count = nir_local_variable_create(b->impl, glsl_uint_type(),
                                           "nvk_nir_push::dw_count");
   nir_store_var(b, p->dw_count, nir_imm_int(b, 0), 0x1);
   p->max_dw_count = 0;
}

static inline void
nvk_nir_push_dw(nir_builder *b, struct nvk_nir_push *p, nir_def *dw)
{
   store_global_dw(b, nir_load_var(b, p->addr), 0, dw);
   nir_iadd_to_var_imm(b, p->addr, 4);
   nir_iadd_to_var_imm(b, p->dw_count, 1);
   p->max_dw_count++;
}

static inline void
nvk_nir_push_copy_dws(nir_builder *b, struct nvk_nir_push *p,
                      nir_def *src_dw_addr_in, nir_def *dw_count,
                      uint32_t max_dw_count)
{
   nir_variable *i = nir_local_variable_create(b->impl, glsl_uint_type(), "i");
   nir_store_var(b, i, nir_imm_int(b, 0), 0x1);

   nir_variable *src_dw_addr =
      nir_local_variable_create(b->impl, glsl_uint64_t_type(), "src_dw_addr");
   nir_store_var(b, src_dw_addr, src_dw_addr_in, 0x1);

   nir_push_loop(b);
   {
      nir_push_if(b, nir_uge(b, nir_load_var(b, i), dw_count));
      {
         nir_jump(b, nir_jump_break);
      }
      nir_pop_if(b, NULL);

      nir_def *dw = load_global_dw(b, nir_load_var(b, src_dw_addr), 0);
      store_global_dw(b, nir_load_var(b, p->addr), 0, dw);

      nir_iadd_to_var_imm(b, i, 1);
      nir_iadd_to_var_imm(b, p->addr, 4);
      nir_iadd_to_var_imm(b, src_dw_addr, 4);
   }
   nir_pop_loop(b, NULL);

   nir_iadd_to_var(b, p->dw_count, dw_count);
   p->max_dw_count += max_dw_count;
}

static inline void
nvk_nir_build_p_1inc(nir_builder *b, struct nvk_nir_push *p,
                     int subc, int mthd, unsigned dw_count)
{
   uint32_t hdr = NVC0_FIFO_PKHDR_1I(subc, mthd, dw_count);
   nvk_nir_push_dw(b, p, nir_imm_int(b, hdr));
}
#define nvk_nir_P_1INC(b, p, class, mthd, size_dw) \
   nvk_nir_build_p_1inc((b), (p), SUBC_##class, class##_##mthd, (size_dw))

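/* Pads the command stream with NO_OPERATION until dw_count reaches
 * max_dw_count so that every generated sequence occupies exactly
 * cmd_seq_stride_B bytes, regardless of which tokens were actually emitted.
 */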
static void
nvk_nir_build_pad_NOP(nir_builder *b, struct nvk_nir_push *p, uint32_t nop)
{
   nir_push_loop(b);
   {
      nir_push_if(b, nir_uge_imm(b, nir_load_var(b, p->dw_count),
                                 p->max_dw_count));
      {
         nir_jump(b, nir_jump_break);
      }
      nir_pop_if(b, NULL);

      store_global_dw(b, nir_load_var(b, p->addr), 0, nir_imm_int(b, nop));
      nir_iadd_to_var_imm(b, p->addr, 4);
      nir_iadd_to_var_imm(b, p->dw_count, 1);
   }
   nir_pop_loop(b, NULL);
}
#define nvk_nir_pad_NOP(b, p, class) \
   nvk_nir_build_pad_NOP((b), (p), \
      NVC0_FIFO_PKHDR_IL(SUBC_##class, class##_NO_OPERATION, 0))

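/* Each compute dispatch gets a QMD plus a root descriptor table allocated
 * from the QMD pool at the end of the preprocess buffer.  The first dword of
 * the pool is the allocator itself, in units of QMD_ALIGN.
 */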
#define QMD_ALIGN 0x100
#define QMD_ALLOC_SIZE QMD_ALIGN
#define QMD_ROOT_SIZE (sizeof(struct nvk_ies_cs_qmd) + \
                       sizeof(struct nvk_root_descriptor_table))

static_assert(sizeof(struct nvk_ies_cs_qmd) % QMD_ALIGN == 0,
              "QMD size is not properly aligned");
static_assert(sizeof(struct nvk_root_descriptor_table) % QMD_ALIGN == 0,
              "Root descriptor table size is not aligned");
static_assert(NVK_DGC_ALIGN >= QMD_ALIGN,
              "QMD alignment requirement is a lower bound of DGC alignment");

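/* Copies dw_count dwords from src_addr to dst_addr, substituting any dword
 * for which repl_dw[i] is non-NULL with that replacement value.
 */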
static void
copy_repl_global_dw(nir_builder *b, nir_def *dst_addr, nir_def *src_addr,
                    nir_def **repl_dw, uint32_t dw_count)
{
   for (uint32_t i = 0; i < dw_count; i++) {
      nir_def *dw;
      if (repl_dw[i] == NULL)
         dw = load_global_dw(b, src_addr, i);
      else
         dw = repl_dw[i];
      store_global_dw(b, dst_addr, i, dw);
   }
}

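/* Emits the per-sequence commands for a compute indirect commands layout.
 * For each DISPATCH token, this allocates a QMD and root descriptor table
 * from the QMD pool, patches in the dispatch size and push constants, and
 * emits the commands to launch it.
 */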
static void
build_process_cs_cmd_seq(nir_builder *b, struct nvk_nir_push *p,
                         nir_def *in_addr, nir_def *seq_idx,
                         struct process_cmd_in *in,
                         struct nvk_physical_device *pdev,
                         const VkIndirectCommandsLayoutCreateInfoEXT *info,
                         uint32_t *qmd_size_per_seq_B_out)
{
   /* If we don't have an indirect execution set, the QMD of the currently
    * bound shader is passed in ies_addr instead.
    */
   nir_def *shader_qmd_addr = in->ies_addr;

   nir_def *root_repl[sizeof(struct nvk_root_descriptor_table) / 4] = {};

#define root_dw(member) ( \
   assert(nvk_root_descriptor_offset(member) % 4 == 0), \
   nvk_root_descriptor_offset(member) / 4)

   root_repl[root_dw(cs.base_group[0])] = nir_imm_int(b, 0);
   root_repl[root_dw(cs.base_group[1])] = nir_imm_int(b, 0);
   root_repl[root_dw(cs.base_group[2])] = nir_imm_int(b, 0);

   *qmd_size_per_seq_B_out = 0;
   for (uint32_t t = 0; t < info->tokenCount; t++) {
      const VkIndirectCommandsLayoutTokenEXT *token = &info->pTokens[t];

      nir_def *token_addr = nir_iadd_imm(b, in_addr, token->offset);
      switch (token->type) {
      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT: {
         assert(token->data.pExecutionSet->shaderStages ==
                VK_SHADER_STAGE_COMPUTE_BIT);
         assert(t == 0);

         nir_def *idx = load_global_dw(b, token_addr, 0);
         shader_qmd_addr = build_exec_set_addr(b, in, idx);
         break;
      }

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_EXT: {
         const VkPushConstantRange *pc_range =
            &token->data.pPushConstant->updateRange;

         assert(pc_range->offset % 4 == 0);
         assert(pc_range->size % 4 == 0);

         const uint32_t start_dw = root_dw(push) + (pc_range->offset / 4);
         for (uint32_t i = 0; i < pc_range->size / 4; i++)
            root_repl[start_dw + i] = load_global_dw(b, token_addr, i);
         break;
      }

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_SEQUENCE_INDEX_EXT: {
         const VkPushConstantRange *pc_range =
            &token->data.pPushConstant->updateRange;

         assert(pc_range->offset % 4 == 0);
         assert(pc_range->size == 4);

         const uint32_t dw = root_dw(push) + (pc_range->offset / 4);
         root_repl[dw] = seq_idx;
         break;
      }

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_EXT: {
         nir_def *disp_size_x = load_global_dw(b, token_addr, 0);
         nir_def *disp_size_y = load_global_dw(b, token_addr, 1);
         nir_def *disp_size_z = load_global_dw(b, token_addr, 2);

         *qmd_size_per_seq_B_out += QMD_ROOT_SIZE;

         nir_push_if(b, nir_ior(b, nir_ior(b, nir_ine_imm(b, disp_size_x, 0),
                                              nir_ine_imm(b, disp_size_y, 0)),
                                   nir_ine_imm(b, disp_size_z, 0)));
         {
            /* The first dword of the QMD pool is an allocator in units of
             * QMD_ALIGN (256) bytes.
             */
            nir_def *qmd_idx =
               nir_global_atomic(b, 32, in->qmd_pool_addr,
                                 nir_imm_int(b, QMD_ROOT_SIZE / QMD_ALIGN),
                                 .atomic_op = nir_atomic_op_iadd);
            nir_def *qmd_offset =
               nir_imul_imm(b, nir_u2u64(b, qmd_idx), QMD_ALIGN);
            nir_def *qmd_addr = nir_iadd(b, in->qmd_pool_addr, qmd_offset);
            nir_def *root_addr =
               nir_iadd_imm(b, qmd_addr, sizeof(struct nvk_ies_cs_qmd));

            /* Upload and patch the root descriptor table */
            root_repl[root_dw(cs.group_count[0])] = disp_size_x;
            root_repl[root_dw(cs.group_count[1])] = disp_size_y;
            root_repl[root_dw(cs.group_count[2])] = disp_size_z;
            copy_repl_global_dw(b, root_addr, in->root_addr,
                                root_repl, ARRAY_SIZE(root_repl));

            /* Upload and patch the QMD */
            const struct nak_qmd_dispatch_size_layout qmd_layout =
               nak_get_qmd_dispatch_size_layout(&pdev->info);
            assert(qmd_layout.x_start % 32 == 0);
            assert(qmd_layout.x_end == qmd_layout.x_start + 32);
            assert(qmd_layout.y_start == qmd_layout.x_start + 32);

            nir_def *qmd_repl[sizeof(struct nvk_ies_cs_qmd) / 4] = {};
            qmd_repl[qmd_layout.x_start / 32] = disp_size_x;

            if (qmd_layout.z_start == qmd_layout.y_start + 32) {
               qmd_repl[qmd_layout.y_start / 32] = disp_size_y;
               qmd_repl[qmd_layout.z_start / 32] = disp_size_z;
            } else {
               assert(qmd_layout.y_end == qmd_layout.y_start + 16);
               assert(qmd_layout.z_start == qmd_layout.x_start + 48);
               assert(qmd_layout.z_end == qmd_layout.z_start + 16);
               qmd_repl[qmd_layout.y_start / 32] =
                  nir_pack_32_2x16_split(b, nir_u2u16(b, disp_size_y),
                                         nir_u2u16(b, disp_size_z));
            }

            struct nak_qmd_cbuf_desc_layout cb0_layout =
               nak_get_qmd_cbuf_desc_layout(&pdev->info, 0);
            assert(cb0_layout.addr_lo_start % 32 == 0);
            assert(cb0_layout.addr_hi_start == cb0_layout.addr_lo_start + 32);
            const uint32_t cb0_addr_lo_dw = cb0_layout.addr_lo_start / 32;
            const uint32_t cb0_addr_hi_dw = cb0_layout.addr_hi_start / 32;
            qmd_repl[cb0_addr_lo_dw] = nir_unpack_64_2x32_split_x(b, root_addr);
            qmd_repl[cb0_addr_hi_dw] =
               nir_ior(b, load_global_dw(b, shader_qmd_addr, cb0_addr_hi_dw),
                       nir_unpack_64_2x32_split_y(b, root_addr));

            copy_repl_global_dw(b, qmd_addr, shader_qmd_addr,
                                qmd_repl, ARRAY_SIZE(qmd_repl));

            /* Now emit commands */
            nir_def *invoc = nir_imul_2x32_64(b, disp_size_x, disp_size_y);
            invoc = nir_imul(b, invoc, nir_u2u64(b, disp_size_z));
            nvk_nir_P_1INC(b, p, NV9097,
                           CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS), 2);
            nvk_nir_push_dw(b, p, nir_unpack_64_2x32_split_y(b, invoc));
            nvk_nir_push_dw(b, p, nir_unpack_64_2x32_split_x(b, invoc));

            nvk_nir_P_1INC(b, p, NVA0C0, SEND_PCAS_A, 1);
            nvk_nir_push_dw(b, p, nir_u2u32(b, nir_ushr_imm(b, qmd_addr, 8)));

            if (pdev->info.cls_compute >= AMPERE_COMPUTE_A) {
               uint32_t signal;
               V_NVC6C0_SEND_SIGNALING_PCAS2_B(signal,
                  PCAS_ACTION_INVALIDATE_COPY_SCHEDULE);
               nvk_nir_P_1INC(b, p, NVC6C0, SEND_SIGNALING_PCAS2_B, 1);
               nvk_nir_push_dw(b, p, nir_imm_int(b, signal));
            } else {
               uint32_t signal;
               V_NVA0C0_SEND_SIGNALING_PCAS_B(signal, {
                  .invalidate = INVALIDATE_TRUE,
                  .schedule = SCHEDULE_TRUE
               });
               nvk_nir_P_1INC(b, p, NVA0C0, SEND_SIGNALING_PCAS_B, 1);
               nvk_nir_push_dw(b, p, nir_imm_int(b, signal));
            }
         }
         nir_pop_if(b, NULL);
         break;
      }

      default:
         unreachable("Unsupported indirect token type");
      }
   }
}

/*
 * Graphics
 */

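/* Handles an EXECUTION_SET token for graphics.  Copies the pre-baked push
 * data for the selected pipeline or shader objects out of the indirect
 * execution set into the generated command stream.
 */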
static void
build_gfx_set_exec(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
                   struct process_cmd_in *in,
                   struct nvk_physical_device *pdev,
                   const VkIndirectCommandsExecutionSetTokenEXT *token)
{
   switch (token->type) {
   case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT: {
      nir_def *idx = load_global_dw(b, token_addr, 0);
      nir_def *push_addr = build_exec_set_addr(b, in, idx);
      nir_def *dw_count = load_global_dw(b, push_addr, 0);
      const uint16_t max_dw_count =
         nvk_ies_gfx_pipeline_max_dw_count(pdev, token->shaderStages);
      nvk_nir_push_copy_dws(b, p, nir_iadd_imm(b, push_addr, 4),
                            dw_count, max_dw_count);
      break;
   }

   case VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT: {
      int32_t i = 0;
      gl_shader_stage type_stage[6] = {};
      nir_def *type_shader_idx[6] = {};
      gl_shader_stage last_vtgm = MESA_SHADER_VERTEX;
      u_foreach_bit(s, token->shaderStages) {
         gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);

         if (stage != MESA_SHADER_FRAGMENT)
            last_vtgm = stage;

         uint32_t type = mesa_to_nv9097_shader_type(stage);
         type_stage[type] = stage;
         type_shader_idx[type] = load_global_dw(b, token_addr, i++);
      }

      for (uint32_t type = 0; type < 6; type++) {
         nir_def *shader_idx = type_shader_idx[type];
         if (shader_idx == NULL)
            continue;

         bool is_last_vtgm = type_stage[type] == last_vtgm;

         nir_def *push_addr = build_exec_set_addr(b, in, shader_idx);
         nir_def *hdr = load_global_dw(b, push_addr, 0);
         nir_def *dw_count =
            nir_extract_u16(b, hdr, nir_imm_int(b, is_last_vtgm));
         const uint16_t max_dw_count =
            nvk_ies_gfx_shader_max_dw_count(pdev, token->shaderStages,
                                            is_last_vtgm);
         nvk_nir_push_copy_dws(b, p, nir_iadd_imm(b, push_addr, 4),
                               dw_count, max_dw_count);
      }
      break;
   }

   default:
      unreachable("Unknown indirect execution set type");
   }
}

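/* PUSH_CONSTANT and SEQUENCE_INDEX tokens for graphics write the new data
 * into the push-constant area of the root descriptor table through
 * LOAD_CONSTANT_BUFFER_OFFSET.
 */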
static void
build_push_gfx_const(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
                     const VkIndirectCommandsPushConstantTokenEXT *token)
{
   const VkPushConstantRange *pc_range = &token->updateRange;

   // TODO: Compute
   assert(!(pc_range->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT));

   assert(pc_range->offset % 4 == 0);
   assert(pc_range->size % 4 == 0);
   const uint32_t dw_count = pc_range->size / 4;

   nvk_nir_P_1INC(b, p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET, 1 + dw_count);
   nvk_nir_push_dw(b, p, nir_imm_int(b,
      nvk_root_descriptor_offset(push) + pc_range->offset));
   for (uint32_t i = 0; i < dw_count; i++)
      nvk_nir_push_dw(b, p, load_global_dw(b, token_addr, i));
}

static void
build_push_gfx_seq_idx(nir_builder *b, struct nvk_nir_push *p,
                       nir_def *token_addr, nir_def *seq_idx,
                       const VkIndirectCommandsPushConstantTokenEXT *token)
{
   const VkPushConstantRange *pc_range = &token->updateRange;

   // TODO: Compute
   assert(!(pc_range->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT));

   assert(pc_range->offset % 4 == 0);
   assert(pc_range->size == 4);
   nvk_nir_P_1INC(b, p, NV9097, LOAD_CONSTANT_BUFFER_OFFSET, 2);
   nvk_nir_push_dw(b, p, nir_imm_int(b,
      nvk_root_descriptor_offset(push) + pc_range->offset));
   nvk_nir_push_dw(b, p, seq_idx);
}

static void
build_set_ib(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
             UNUSED const VkIndirectCommandsIndexBufferTokenEXT *token)
{
   nir_def *addr_lo = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi = load_global_dw(b, token_addr, 1);
   nir_def *size_B = load_global_dw(b, token_addr, 2);
   nir_def *idx_fmt = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_IB), 4);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, size_B);
   nvk_nir_push_dw(b, p, idx_fmt);
}

static nir_def *
nvk_nir_vb_stride(nir_builder *b, nir_def *vb_idx, nir_def *stride)
{
   return nir_pack_32_2x16_split(b, nir_u2u16(b, stride),
                                 nir_u2u16(b, vb_idx));
}

static void
build_set_vb(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr,
             const VkIndirectCommandsVertexBufferTokenEXT *token)
{
   nir_def *vb_idx = nir_imm_int(b, token->vertexBindingUnit);
   nir_def *addr_lo = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi = load_global_dw(b, token_addr, 1);
   nir_def *size_B = load_global_dw(b, token_addr, 2);
   nir_def *stride_B = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_BIND_VB), 4);
   nvk_nir_push_dw(b, p, vb_idx);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, size_B);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_SET_VB_STRIDE), 1);
   nvk_nir_push_dw(b, p, nvk_nir_vb_stride(b, vb_idx, stride_B));
}

static void
build_draw(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr)
{
   nir_def *vertex_count = load_global_dw(b, token_addr, 0);
   nir_def *instance_count = load_global_dw(b, token_addr, 1);
   nir_def *first_vertex = load_global_dw(b, token_addr, 2);
   nir_def *first_instance = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW), 5);
   nvk_nir_push_dw(b, p, nir_imm_int(b, 0)); /* draw_index */
   nvk_nir_push_dw(b, p, vertex_count);
   nvk_nir_push_dw(b, p, instance_count);
   nvk_nir_push_dw(b, p, first_vertex);
   nvk_nir_push_dw(b, p, first_instance);
}

static void
build_draw_indexed(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr)
{
   nir_def *index_count = load_global_dw(b, token_addr, 0);
   nir_def *instance_count = load_global_dw(b, token_addr, 1);
   nir_def *first_index = load_global_dw(b, token_addr, 2);
   nir_def *vertex_offset = load_global_dw(b, token_addr, 3);
   nir_def *first_instance = load_global_dw(b, token_addr, 4);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDEXED), 6);
   nvk_nir_push_dw(b, p, nir_imm_int(b, 0)); /* draw_index */
   nvk_nir_push_dw(b, p, index_count);
   nvk_nir_push_dw(b, p, instance_count);
   nvk_nir_push_dw(b, p, first_index);
   nvk_nir_push_dw(b, p, vertex_offset);
   nvk_nir_push_dw(b, p, first_instance);
}

static void
build_draw_count(nir_builder *b, struct nvk_nir_push *p, nir_def *token_addr)
{
   nir_def *addr_lo = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi = load_global_dw(b, token_addr, 1);
   nir_def *stride = load_global_dw(b, token_addr, 2);
   nir_def *count = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097, CALL_MME_MACRO(NVK_MME_DRAW_INDIRECT), 4);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, count);
   nvk_nir_push_dw(b, p, stride);
}

static void
build_draw_indexed_count(nir_builder *b, struct nvk_nir_push *p,
                         nir_def *token_addr)
{
   nir_def *addr_lo = load_global_dw(b, token_addr, 0);
   nir_def *addr_hi = load_global_dw(b, token_addr, 1);
   nir_def *stride = load_global_dw(b, token_addr, 2);
   nir_def *count = load_global_dw(b, token_addr, 3);

   nvk_nir_P_1INC(b, p, NV9097,
                  CALL_MME_MACRO(NVK_MME_DRAW_INDEXED_INDIRECT), 4);
   nvk_nir_push_dw(b, p, addr_hi);
   nvk_nir_push_dw(b, p, addr_lo);
   nvk_nir_push_dw(b, p, count);
   nvk_nir_push_dw(b, p, stride);
}

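/* Emits the per-sequence commands for a graphics indirect commands layout by
 * translating each token into the corresponding NVK MME macro calls.
 */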
static void
build_process_gfx_cmd_seq(nir_builder *b, struct nvk_nir_push *p,
                          nir_def *in_addr, nir_def *seq_idx,
                          struct process_cmd_in *in,
                          struct nvk_physical_device *pdev,
                          const VkIndirectCommandsLayoutCreateInfoEXT *info)
{
   for (uint32_t t = 0; t < info->tokenCount; t++) {
      const VkIndirectCommandsLayoutTokenEXT *token = &info->pTokens[t];

      nir_def *token_addr = nir_iadd_imm(b, in_addr, token->offset);
      switch (token->type) {
      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT:
         assert(t == 0);
         build_gfx_set_exec(b, p, token_addr, in, pdev,
                            token->data.pExecutionSet);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_EXT:
         build_push_gfx_const(b, p, token_addr, token->data.pPushConstant);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_SEQUENCE_INDEX_EXT:
         build_push_gfx_seq_idx(b, p, token_addr, seq_idx,
                                token->data.pPushConstant);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_EXT:
         build_set_ib(b, p, token_addr, token->data.pIndexBuffer);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_EXT:
         build_set_vb(b, p, token_addr, token->data.pVertexBuffer);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_EXT:
         build_draw_indexed(b, p, token_addr);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_EXT:
         build_draw(b, p, token_addr);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_COUNT_EXT:
         build_draw_indexed_count(b, p, token_addr);
         break;

      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_COUNT_EXT:
         build_draw_count(b, p, token_addr);
         break;

      default:
         unreachable("Unsupported indirect token type");
      }
   }
}

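/* Builds the (optional) init shader.  For compute layouts that allocate
 * QMDs, this just initializes the QMD pool allocator; graphics layouts need
 * no initialization.
 */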
static VkResult
build_init_shader(struct nvk_device *dev,
                  const VkIndirectCommandsLayoutCreateInfoEXT *info,
                  uint32_t qmd_size_per_seq_B,
                  const VkAllocationCallbacks *pAllocator,
                  struct nvk_shader **shader_out)
{
   /* There's nothing to initialize for graphics */
   if (info->shaderStages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
      assert(!(info->shaderStages & ~NVK_SHADER_STAGE_GRAPHICS_BITS));
      *shader_out = NULL;
      return VK_SUCCESS;
   }

   if (qmd_size_per_seq_B == 0) {
      *shader_out = NULL;
      return VK_SUCCESS;
   }

   nir_builder build =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
                                     "nvk-init-indirect-commands");
   build.shader->info.workgroup_size[0] = 32;
   nir_builder *b = &build;

   struct process_cmd_in in = load_process_cmd_in(b);

   if (qmd_size_per_seq_B > 0) {
      /* Initialize the QMD allocator to 1 * QMD_ALIGN so that the QMDs we
       * allocate don't stomp the allocator.
       */
      assert(info->shaderStages == VK_SHADER_STAGE_COMPUTE_BIT);
      store_global_dw(b, in.qmd_pool_addr, 0, nir_imm_int(b, 1));
   }

   return nvk_compile_nir_shader(dev, build.shader, pAllocator, shader_out);
}

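/* Builds the processing shader, which runs one invocation per sequence.
 * Each invocation reads its input sequence, writes a fixed-stride block of
 * commands into the preprocess buffer, and pads the remainder with NOPs.
 */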
static VkResult
build_process_shader(struct nvk_device *dev,
                     const VkIndirectCommandsLayoutCreateInfoEXT *info,
                     const VkAllocationCallbacks *pAllocator,
                     struct nvk_shader **shader_out,
                     uint32_t *cmd_seq_stride_B_out,
                     uint32_t *qmd_size_per_seq_B_out)
{
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   nir_builder build =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
                                     "nvk-process-indirect-commands");
   build.shader->info.workgroup_size[0] = 32;
   nir_builder *b = &build;

   struct process_cmd_in in = load_process_cmd_in(b);

   nir_def *seq_idx = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);

   /* We always execute a 32-wide shader and nothing guarantees that
    * max_seq_count is a multiple of 32, so we need to bail if our index is
    * above the maximum.  If we're below the maximum but at or above the
    * actual count, we still need to emit a bunch of NOPs.
    */
   nir_push_if(b, nir_uge(b, seq_idx, in.max_seq_count));
   {
      nir_jump(b, nir_jump_halt);
   }
   nir_pop_if(b, NULL);

   nir_def *ind_count;
   nir_push_if(b, nir_ine_imm(b, in.count_addr, 0));
   {
      ind_count = load_global_dw(b, in.count_addr, 0);
      ind_count = nir_umin(b, ind_count, in.max_seq_count);
   }
   nir_pop_if(b, NULL);
   nir_def *count = nir_if_phi(b, ind_count, in.max_seq_count);

   nir_def *in_seq_addr = nir_iadd(b, in.in_addr,
      nir_imul_imm(b, nir_u2u64(b, seq_idx), info->indirectStride));
   /* We'll replace this later once we know what it is */
   nir_def *out_stride = nir_imm_int(b, 0xc0ffee0);
   nir_def *out_seq_addr = nir_iadd(b, in.out_addr,
      nir_imul_2x32_64(b, seq_idx, out_stride));

   struct nvk_nir_push push = {};
   nvk_nir_push_start(b, &push, out_seq_addr);

   nir_push_if(b, nir_ult(b, seq_idx, count));
   {
      if (info->shaderStages & VK_SHADER_STAGE_COMPUTE_BIT) {
         assert(info->shaderStages == VK_SHADER_STAGE_COMPUTE_BIT);
         build_process_cs_cmd_seq(b, &push, in_seq_addr, seq_idx,
                                  &in, pdev, info, qmd_size_per_seq_B_out);
      } else if (info->shaderStages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
         assert(!(info->shaderStages & ~NVK_SHADER_STAGE_GRAPHICS_BITS));
         *qmd_size_per_seq_B_out = 0;
         build_process_gfx_cmd_seq(b, &push, in_seq_addr, seq_idx,
                                   &in, pdev, info);
      } else {
         unreachable("Unknown shader stage");
      }
   }
   nir_pop_if(b, NULL);

   /* Always pad the command buffer. In the case where seq_idx >= count, the
    * entire sequence will be NO_OPERATION.
    */
   if (info->shaderStages & VK_SHADER_STAGE_COMPUTE_BIT) {
      nvk_nir_pad_NOP(b, &push, NVA0C0);
   } else if (info->shaderStages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
      nvk_nir_pad_NOP(b, &push, NV9097);
   } else {
      unreachable("Unknown shader stage");
   }

   /* Replace the out stride with the actual size of a command stream */
   nir_load_const_instr *out_stride_const =
      nir_instr_as_load_const(out_stride->parent_instr);
   out_stride_const->value[0].u32 = push.max_dw_count * 4;

   /* We also output this stride to go in the layout struct */
   *cmd_seq_stride_B_out = push.max_dw_count * 4;

   return nvk_compile_nir_shader(dev, build.shader, pAllocator, shader_out);
}

static void
nvk_indirect_commands_layout_destroy(struct nvk_device *dev,
                                     struct nvk_indirect_commands_layout *layout,
                                     const VkAllocationCallbacks *alloc)
{
   if (layout->init != NULL)
      vk_shader_destroy(&dev->vk, &layout->init->vk, alloc);
   if (layout->process != NULL)
      vk_shader_destroy(&dev->vk, &layout->process->vk, alloc);
   vk_object_free(&dev->vk, alloc, layout);
}

VKAPI_ATTR VkResult VKAPI_CALL
nvk_CreateIndirectCommandsLayoutEXT(
   VkDevice _device,
   const VkIndirectCommandsLayoutCreateInfoEXT *pCreateInfo,
   const VkAllocationCallbacks *pAllocator,
   VkIndirectCommandsLayoutEXT *pIndirectCommandsLayout)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);
   VkResult result;

   struct nvk_indirect_commands_layout *layout =
      vk_object_zalloc(&dev->vk, pAllocator, sizeof(*layout),
                       VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_EXT);
   if (layout == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   layout->stages = pCreateInfo->shaderStages;

   /* From the Vulkan 1.3.XXX spec:
    *
    *    VUID-VkIndirectCommandsLayoutCreateInfoEXT-pTokens-11093
    *
    *    "The number of tokens in the pTokens array with type equal to
    *    VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT must be less than
    *    or equal to 1"
    *
    * and
    *
    *    VUID-VkIndirectCommandsLayoutCreateInfoEXT-pTokens-11139
    *
    *    "If the pTokens array contains a
    *    VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT token, then this
    *    token must be the first token in the array"
    */
   if (pCreateInfo->tokenCount > 0 &&
       pCreateInfo->pTokens[0].type ==
       VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT) {
      const VkIndirectCommandsExecutionSetTokenEXT *token =
         pCreateInfo->pTokens[0].data.pExecutionSet;

      /* Pipelines should never mismatch here. */
      if (token->type == VK_INDIRECT_EXECUTION_SET_INFO_TYPE_PIPELINES_EXT)
         assert(token->shaderStages == pCreateInfo->shaderStages);

      layout->set_stages = token->shaderStages;
   }

   result = build_process_shader(dev, pCreateInfo, pAllocator,
                                 &layout->process, &layout->cmd_seq_stride_B,
                                 &layout->qmd_size_per_seq_B);
   if (result != VK_SUCCESS) {
      nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
      return result;
   }

   if (layout->cmd_seq_stride_B > (NV_PUSH_MAX_COUNT * 4)) {
      nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
      return vk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
                       "Too many tokens in IndirectCommandsLayout");
   }

   result = build_init_shader(dev, pCreateInfo, layout->qmd_size_per_seq_B,
                              pAllocator, &layout->init);
   if (result != VK_SUCCESS) {
      nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
      return result;
   }

   *pIndirectCommandsLayout = nvk_indirect_commands_layout_to_handle(layout);

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
nvk_DestroyIndirectCommandsLayoutEXT(
   VkDevice _device,
   VkIndirectCommandsLayoutEXT indirectCommandsLayout,
   const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  indirectCommandsLayout);

   if (layout == NULL)
      return;

   nvk_indirect_commands_layout_destroy(dev, layout, pAllocator);
}

VKAPI_ATTR void VKAPI_CALL
nvk_GetGeneratedCommandsMemoryRequirementsEXT(
   VkDevice _device,
   const VkGeneratedCommandsMemoryRequirementsInfoEXT *pInfo,
   VkMemoryRequirements2 *pMemoryRequirements)
{
   VK_FROM_HANDLE(nvk_device, dev, _device);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  pInfo->indirectCommandsLayout);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   uint64_t size = layout->cmd_seq_stride_B * (uint64_t)pInfo->maxSequenceCount;
   if (layout->qmd_size_per_seq_B > 0) {
      size = align64(size, QMD_ALIGN);
      size += QMD_ALLOC_SIZE;
      size += layout->qmd_size_per_seq_B * pInfo->maxSequenceCount;
   }

   pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
      .size = size,
      .alignment = QMD_ALIGN,
      .memoryTypeBits = BITFIELD_MASK(pdev->mem_type_count),
   };
}

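/* Dispatches the layout's init shader (if any) and process shader to
 * generate the command stream for info into the preprocess buffer.  state is
 * the command buffer state the generated commands will be executed against.
 */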
static void
nvk_cmd_process_cmds(struct nvk_cmd_buffer *cmd,
                     const VkGeneratedCommandsInfoEXT *info,
                     const struct nvk_cmd_state *state)
{
   VK_FROM_HANDLE(nvk_indirect_execution_set, ies, info->indirectExecutionSet);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  info->indirectCommandsLayout);

   struct process_cmd_push push = {
      .in_addr = info->indirectAddress,
      .out_addr = info->preprocessAddress,
      .count_addr = info->sequenceCountAddress,
      .max_seq_count = info->maxSequenceCount,
   };

   uint64_t qmd_addr = 0;
   if (layout->stages & VK_SHADER_STAGE_COMPUTE_BIT) {
      uint32_t global_size[3] = { 0, 0, 0 };
      VkResult result = nvk_cmd_flush_cs_qmd(cmd, state, global_size,
                                             &qmd_addr, &push.root_addr);
      if (unlikely(result != VK_SUCCESS)) {
         vk_command_buffer_set_error(&cmd->vk, result);
         return;
      }
   }

   if (layout->set_stages == 0) {
      push.ies_addr = qmd_addr;
   } else {
      assert(layout->set_stages == layout->stages);
      push.ies_addr = ies->mem->va->addr;
      push.ies_stride = ies->stride_B;
   }

   if (layout->qmd_size_per_seq_B > 0) {
      assert(info->preprocessAddress % QMD_ALIGN == 0);
      uint64_t qmd_offset =
         layout->cmd_seq_stride_B * (uint64_t)info->maxSequenceCount;
      qmd_offset = align64(qmd_offset, QMD_ALIGN);
      push.qmd_pool_addr = info->preprocessAddress + qmd_offset;
   }

   if (layout->init != NULL) {
      nvk_cmd_dispatch_shader(cmd, layout->init, &push, sizeof(push), 1, 1, 1);

      struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
      P_IMMD(p, NVA0C0, WAIT_FOR_IDLE, 0);
   }

   nvk_cmd_dispatch_shader(cmd, layout->process, &push, sizeof(push),
                           DIV_ROUND_UP(info->maxSequenceCount, 32), 1, 1);
}

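/* Flushes the command-buffer state consumed when processing generated
 * commands; currently this is just the push descriptors for the stages in
 * use.
 */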
static void
nvk_cmd_flush_process_state(struct nvk_cmd_buffer *cmd,
                            const VkGeneratedCommandsInfoEXT *info)
{
   struct nvk_descriptor_state *desc =
      nvk_get_descriptor_state_for_stages(cmd, info->shaderStages);
   nvk_cmd_buffer_flush_push_descriptors(cmd, desc);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdPreprocessGeneratedCommandsEXT(VkCommandBuffer commandBuffer,
                                      const VkGeneratedCommandsInfoEXT *info,
                                      VkCommandBuffer stateCommandBuffer)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_cmd_buffer, state_cmd, stateCommandBuffer);

   nvk_cmd_flush_process_state(state_cmd, info);
   nvk_cmd_process_cmds(cmd, info, &state_cmd->state);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdExecuteGeneratedCommandsEXT(VkCommandBuffer commandBuffer,
                                   VkBool32 isPreprocessed,
                                   const VkGeneratedCommandsInfoEXT *info)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_indirect_commands_layout, layout,
                  info->indirectCommandsLayout);
   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
   const struct nvk_physical_device *pdev = nvk_device_physical(dev);

   if (!isPreprocessed) {
      nvk_cmd_flush_process_state(cmd, info);
      nvk_cmd_process_cmds(cmd, info, &cmd->state);

      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
      P_IMMD(p, NVA0C0, INVALIDATE_SHADER_CACHES, {
         .data = DATA_TRUE,
         .constant = CONSTANT_TRUE,
         .flush_data = FLUSH_DATA_TRUE,
      });
      if (pdev->info.cls_eng3d >= MAXWELL_COMPUTE_B)
         P_IMMD(p, NVB1C0, INVALIDATE_SKED_CACHES, 0);
      __push_immd(p, SUBC_NV9097, NV906F_SET_REFERENCE, 0);
   }

   if (layout->stages & VK_SHADER_STAGE_COMPUTE_BIT) {
      assert(info->shaderStages == VK_SHADER_STAGE_COMPUTE_BIT);
      nvk_cmd_buffer_flush_push_descriptors(cmd, &cmd->state.cs.descriptors);
   } else if (layout->stages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
      assert(!(layout->stages & ~NVK_SHADER_STAGE_GRAPHICS_BITS));

      nvk_cmd_buffer_flush_push_descriptors(cmd, &cmd->state.gfx.descriptors);
      nvk_cmd_flush_gfx_dynamic_state(cmd);

      if (layout->set_stages == 0) {
         /* In this case, we're using the CPU-bound shaders */
         nvk_cmd_flush_gfx_shaders(cmd);
         nvk_cmd_flush_gfx_cbufs(cmd);
      } else {
         /* From the Vulkan 1.3.XXX spec:
          *
          *    "If indirectCommandsLayout was created with a token sequence
          *    that contained the
          *    VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT token and
          *    indirectExecutionSet was created using
          *    VK_INDIRECT_EXECUTION_SET_INFO_TYPE_SHADER_OBJECTS_EXT, every
          *    executed VK_INDIRECT_COMMANDS_TOKEN_TYPE_EXECUTION_SET_EXT
          *    token must bind all the shader stages set in the
          *    VkIndirectCommandsExecutionSetTokenEXT::shaderStages used to
          *    create indirectCommandsLayout"
          *
          * So we unbind anything not explicitly bound by the layout and trust
          * the layout to bind the rest.
          */
         assert(layout->set_stages == layout->stages);

         uint8_t set_types = 0;
         u_foreach_bit(s, layout->set_stages) {
            gl_shader_stage stage = vk_to_mesa_shader_stage(1 << s);
            uint32_t type = mesa_to_nv9097_shader_type(stage);
            set_types |= BITFIELD_BIT(type);
         }

         uint8_t unset_types = BITFIELD_MASK(6) & ~set_types;

         struct nv_push *p = nvk_cmd_buffer_push(cmd, 12);
         u_foreach_bit(type, unset_types) {
            P_IMMD(p, NV9097, SET_PIPELINE_SHADER(type), {
               .enable = ENABLE_FALSE,
               .type = type,
            });
         }
      }
   }

   ASSERTED const uint64_t size =
      layout->cmd_seq_stride_B * (uint64_t)info->maxSequenceCount;
   assert(size <= info->preprocessSize);

   uint64_t addr = info->preprocessAddress;
   uint64_t seq_count = info->maxSequenceCount;

   /* Break it into pieces that are a multiple of cmd_seq_stride_B so that,
    * if the kernel inserts a sync point between two of our pushes, it
    * doesn't break a single command.
    */
   const uint32_t max_seq_per_push =
      (NV_PUSH_MAX_COUNT * 4) / layout->cmd_seq_stride_B;

   while (seq_count > 0) {
      uint32_t push_seq = MIN2(seq_count, max_seq_per_push);
      uint32_t push_size_B = push_seq * layout->cmd_seq_stride_B;
      nvk_cmd_buffer_push_indirect(cmd, addr, push_size_B);
      addr += push_size_B;
      seq_count -= push_seq;
   }

   if (layout->set_stages != 0) {
      if (layout->stages & NVK_SHADER_STAGE_GRAPHICS_BITS) {
         cmd->state.gfx.shaders_dirty |= NVK_SHADER_STAGE_GRAPHICS_BITS;
      }
   }
}