/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_buffer.h"
#include "nvk_cmd_buffer.h"
#include "nvk_device.h"
#include "nvk_entrypoints.h"
#include "nvk_mme.h"
#include "nvk_physical_device.h"
#include "nvk_shader.h"

#include "cl906f.h"
#include "cla0b5.h"
#include "cla1c0.h"
#include "clc0c0.h"
#include "clc5c0.h"
#include "nv_push_cl90c0.h"
#include "nv_push_cl9097.h"
#include "nv_push_cla0c0.h"
#include "nv_push_clb0c0.h"
#include "nv_push_clb1c0.h"
#include "nv_push_clc3c0.h"
#include "nv_push_clc597.h"
#include "nv_push_clc6c0.h"

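/* Emits the one-time compute state for a queue: binds the compute class
 * object and, on pre-Volta compute classes, programs the shader heap base
 * address used to resolve shader program offsets.
 */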
VkResult
nvk_push_dispatch_state_init(struct nvk_queue *queue, struct nv_push *p)
{
   struct nvk_device *dev = nvk_queue_device(queue);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   P_MTHD(p, NV90C0, SET_OBJECT);
   P_NV90C0_SET_OBJECT(p, {
      .class_id = pdev->info.cls_compute,
      .engine_id = 0,
   });

   if (pdev->info.cls_compute == MAXWELL_COMPUTE_A)
      P_IMMD(p, NVB0C0, SET_SELECT_MAXWELL_TEXTURE_HEADERS, V_TRUE);

   if (pdev->info.cls_compute < VOLTA_COMPUTE_A) {
      uint64_t shader_base_addr =
         nvk_heap_contiguous_base_address(&dev->shader_heap);

      P_MTHD(p, NVA0C0, SET_PROGRAM_REGION_A);
      P_NVA0C0_SET_PROGRAM_REGION_A(p, shader_base_addr >> 32);
      P_NVA0C0_SET_PROGRAM_REGION_B(p, shader_base_addr);
   }

   return VK_SUCCESS;
}

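/* Returns the compute class of the physical device backing this command
 * buffer.
 */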
static inline uint16_t
nvk_cmd_buffer_compute_cls(struct nvk_cmd_buffer *cmd)
{
   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);
   return pdev->info.cls_compute;
}

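/* Called when recording begins. For primary command buffers, invalidates
 * the sampler and texture header caches (and, on Maxwell B and later, the
 * SKED caches).
 */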
void
nvk_cmd_buffer_begin_compute(struct nvk_cmd_buffer *cmd,
                             const VkCommandBufferBeginInfo *pBeginInfo)
{
   if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
      struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
      if (nvk_cmd_buffer_compute_cls(cmd) >= MAXWELL_COMPUTE_B) {
         P_IMMD(p, NVB1C0, INVALIDATE_SKED_CACHES, 0);
      }
      P_IMMD(p, NVA0C0, INVALIDATE_SAMPLER_CACHE_NO_WFI, {
         .lines = LINES_ALL,
      });
      P_IMMD(p, NVA0C0, INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI, {
         .lines = LINES_ALL,
      });
   }
}

void
nvk_cmd_invalidate_compute_state(struct nvk_cmd_buffer *cmd)
{
   memset(&cmd->state.cs, 0, sizeof(cmd->state.cs));
}

void
nvk_cmd_bind_compute_shader(struct nvk_cmd_buffer *cmd,
                            struct nvk_shader *shader)
{
   cmd->state.cs.shader = shader;
}

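/* Total number of invocations in a single workgroup of the currently bound
 * compute shader.
 */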
static uint32_t
nvk_compute_local_size(struct nvk_cmd_buffer *cmd)
{
   const struct nvk_shader *shader = cmd->state.cs.shader;

   return shader->info.cs.local_size[0] *
          shader->info.cs.local_size[1] *
          shader->info.cs.local_size[2];
}

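/* Flushes any pending push descriptors and writes the base workgroup and
 * group count into the root descriptor table ahead of a dispatch.
 */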
static void
nvk_flush_compute_state(struct nvk_cmd_buffer *cmd,
                        uint32_t base_workgroup[3],
                        uint32_t global_size[3])
{
   struct nvk_descriptor_state *desc = &cmd->state.cs.descriptors;

   nvk_cmd_buffer_flush_push_descriptors(cmd, desc);

   nvk_descriptor_state_set_root_array(cmd, desc, cs.base_group,
                                       0, 3, base_workgroup);
   nvk_descriptor_state_set_root_array(cmd, desc, cs.group_count,
                                       0, 3, global_size);
}

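/* Uploads the root descriptor table and builds the QMD (Queue Meta Data)
 * describing the dispatch: shader header address, shared memory size,
 * global size, and the constant buffers used by the shader. Returns the
 * GPU addresses of the uploaded QMD and root descriptor table.
 */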
static VkResult
nvk_cmd_upload_qmd(struct nvk_cmd_buffer *cmd,
                   const struct nvk_shader *shader,
                   const struct nvk_descriptor_state *desc,
                   const struct nvk_root_descriptor_table *root,
                   uint32_t global_size[3],
                   uint64_t *qmd_addr_out,
                   uint64_t *root_desc_addr_out)
{
   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);
   const uint32_t min_cbuf_alignment = nvk_min_cbuf_alignment(&pdev->info);
   VkResult result;

   /* Pre-Pascal, constant buffer sizes must be 0x100-aligned. Since we
    * simply allocate a buffer and upload data to it, make sure its size is
    * 0x100-aligned.
    */
   STATIC_ASSERT((sizeof(*root) & 0xff) == 0);
   assert(sizeof(*root) % min_cbuf_alignment == 0);

   void *root_desc_map;
   uint64_t root_desc_addr;
   result = nvk_cmd_buffer_upload_alloc(cmd, sizeof(*root), min_cbuf_alignment,
                                        &root_desc_addr, &root_desc_map);
   if (unlikely(result != VK_SUCCESS))
      return result;

   memcpy(root_desc_map, root, sizeof(*root));

   struct nak_qmd_info qmd_info = {
      .addr = shader->hdr_addr,
      .smem_size = shader->info.cs.smem_size,
      .smem_max = NVK_MAX_SHARED_SIZE,
      .global_size = {
         global_size[0],
         global_size[1],
         global_size[2],
      },
   };

   assert(shader->cbuf_map.cbuf_count <= ARRAY_SIZE(qmd_info.cbufs));
   for (uint32_t c = 0; c < shader->cbuf_map.cbuf_count; c++) {
      const struct nvk_cbuf *cbuf = &shader->cbuf_map.cbufs[c];

      struct nvk_buffer_address ba;
      if (cbuf->type == NVK_CBUF_TYPE_ROOT_DESC) {
         ba = (struct nvk_buffer_address) {
            .base_addr = root_desc_addr,
            .size = sizeof(*root),
         };
      } else {
         ASSERTED bool direct_descriptor =
            nvk_cmd_buffer_get_cbuf_addr(cmd, desc, shader, cbuf, &ba);
         assert(direct_descriptor);
      }

      if (ba.size > 0) {
         assert(ba.base_addr % min_cbuf_alignment == 0);
         ba.size = align(ba.size, min_cbuf_alignment);
         ba.size = MIN2(ba.size, NVK_MAX_CBUF_SIZE);

         qmd_info.cbufs[qmd_info.num_cbufs++] = (struct nak_qmd_cbuf) {
            .index = c,
            .addr = ba.base_addr,
            .size = ba.size,
         };
      }
   }

   uint32_t qmd[64];
   nak_fill_qmd(&pdev->info, &shader->info, &qmd_info, qmd, sizeof(qmd));

   uint64_t qmd_addr;
   result = nvk_cmd_buffer_upload_data(cmd, qmd, sizeof(qmd), 0x100, &qmd_addr);
   if (unlikely(result != VK_SUCCESS))
      return result;

   *qmd_addr_out = qmd_addr;
   if (root_desc_addr_out != NULL)
      *root_desc_addr_out = root_desc_addr;

   return VK_SUCCESS;
}

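/* Builds and uploads a QMD for the currently bound compute shader using the
 * command buffer's compute descriptor state.
 */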
VkResult
nvk_cmd_flush_cs_qmd(struct nvk_cmd_buffer *cmd,
                     uint32_t global_size[3],
                     uint64_t *qmd_addr_out,
                     uint64_t *root_desc_addr_out)
{
   struct nvk_descriptor_state *desc = &cmd->state.cs.descriptors;

   return nvk_cmd_upload_qmd(cmd, cmd->state.cs.shader,
                             desc, (void *)desc->root, global_size,
                             qmd_addr_out, root_desc_addr_out);
}

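/* MME helper: adds a 64-bit invocation count to the running compute shader
 * invocation counter kept in a pair of MME scratch registers.
 */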
static void
nvk_build_mme_add_cs_invocations(struct mme_builder *b,
                                 struct mme_value64 count)
{
   struct mme_value accum_hi = nvk_mme_load_scratch(b, CS_INVOCATIONS_HI);
   struct mme_value accum_lo = nvk_mme_load_scratch(b, CS_INVOCATIONS_LO);
   struct mme_value64 accum = mme_value64(accum_lo, accum_hi);

   mme_add64_to(b, accum, accum, count);

   STATIC_ASSERT(NVK_MME_SCRATCH_CS_INVOCATIONS_HI + 1 ==
                 NVK_MME_SCRATCH_CS_INVOCATIONS_LO);

   mme_mthd(b, NVK_SET_MME_SCRATCH(CS_INVOCATIONS_HI));
   mme_emit(b, accum.hi);
   mme_emit(b, accum.lo);

   mme_free_reg64(b, accum);
}

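/* MME macro: loads a 64-bit count from the macro's parameters and
 * accumulates it into the CS invocation counter.
 */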
void
nvk_mme_add_cs_invocations(struct mme_builder *b)
{
   struct mme_value64 count = mme_load_addr64(b);

   nvk_build_mme_add_cs_invocations(b, count);
}

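/* Records a direct dispatch: uploads the QMD, accumulates the compute
 * invocation count via an MME macro, and launches the grid with SEND_PCAS.
 */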
VKAPI_ATTR void VKAPI_CALL
nvk_CmdDispatchBase(VkCommandBuffer commandBuffer,
                    uint32_t baseGroupX,
                    uint32_t baseGroupY,
                    uint32_t baseGroupZ,
                    uint32_t groupCountX,
                    uint32_t groupCountY,
                    uint32_t groupCountZ)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);

   uint32_t base_workgroup[3] = { baseGroupX, baseGroupY, baseGroupZ };
   uint32_t global_size[3] = { groupCountX, groupCountY, groupCountZ };
   nvk_flush_compute_state(cmd, base_workgroup, global_size);

   uint64_t qmd_addr = 0;
   VkResult result = nvk_cmd_flush_cs_qmd(cmd, global_size, &qmd_addr, NULL);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   const uint32_t local_size = nvk_compute_local_size(cmd);
   const uint64_t cs_invocations =
      (uint64_t)local_size * (uint64_t)groupCountX *
      (uint64_t)groupCountY * (uint64_t)groupCountZ;

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);

   P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS));
   P_INLINE_DATA(p, cs_invocations >> 32);
   P_INLINE_DATA(p, cs_invocations);

   P_MTHD(p, NVA0C0, SEND_PCAS_A);
   P_NVA0C0_SEND_PCAS_A(p, qmd_addr >> 8);

   if (nvk_cmd_buffer_compute_cls(cmd) <= TURING_COMPUTE_A) {
      P_IMMD(p, NVA0C0, SEND_SIGNALING_PCAS_B, {
         .invalidate = INVALIDATE_TRUE,
         .schedule = SCHEDULE_TRUE
      });
   } else {
      P_IMMD(p, NVC6C0, SEND_SIGNALING_PCAS2_B,
             PCAS_ACTION_INVALIDATE_COPY_SCHEDULE);
   }
}

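/* Dispatches an internal (meta) shader with inline push data, bypassing the
 * bound compute descriptor state and conditional rendering.
 */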
void
nvk_cmd_dispatch_shader(struct nvk_cmd_buffer *cmd,
                        struct nvk_shader *shader,
                        const void *push_data, size_t push_size,
                        uint32_t groupCountX,
                        uint32_t groupCountY,
                        uint32_t groupCountZ)
{
   struct nvk_root_descriptor_table root = {
      .cs.group_count = {
         groupCountX,
         groupCountY,
         groupCountZ,
      },
   };
   assert(push_size <= sizeof(root.push));
   memcpy(root.push, push_data, push_size);

   uint64_t qmd_addr;
   VkResult result = nvk_cmd_upload_qmd(cmd, shader, NULL, &root,
                                        root.cs.group_count,
                                        &qmd_addr, NULL);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);

   /* Internal shaders don't want conditional rendering */
   P_IMMD(p, NVA0C0, SET_RENDER_ENABLE_OVERRIDE, MODE_ALWAYS_RENDER);

   P_MTHD(p, NVA0C0, SEND_PCAS_A);
   P_NVA0C0_SEND_PCAS_A(p, qmd_addr >> 8);

   if (nvk_cmd_buffer_compute_cls(cmd) <= TURING_COMPUTE_A) {
      P_IMMD(p, NVA0C0, SEND_SIGNALING_PCAS_B, {
         .invalidate = INVALIDATE_TRUE,
         .schedule = SCHEDULE_TRUE
      });
   } else {
      P_IMMD(p, NVC6C0, SEND_SIGNALING_PCAS2_B,
             PCAS_ACTION_INVALIDATE_COPY_SCHEDULE);
   }

   P_IMMD(p, NVA0C0, SET_RENDER_ENABLE_OVERRIDE, MODE_USE_RENDER_ENABLE);
}

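/* MME helper: stores a single 32-bit value to a GPU virtual address via a
 * report semaphore release.
 */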
static void
mme_store_global(struct mme_builder *b,
                 struct mme_value64 addr,
                 struct mme_value v)
{
   mme_mthd(b, NV9097_SET_REPORT_SEMAPHORE_A);
   mme_emit_addr64(b, addr);
   mme_emit(b, v);
   mme_emit(b, mme_imm(0x10000000));
}

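/* MME helper: stores x, y, and z as three consecutive dwords starting at
 * addr + offset. Consumes (frees) the address register pair.
 */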
static void
mme_store_global_vec3_free_addr(struct mme_builder *b,
                                struct mme_value64 addr,
                                uint32_t offset,
                                struct mme_value x,
                                struct mme_value y,
                                struct mme_value z)
{
   if (offset > 0)
      mme_add64_to(b, addr, addr, mme_imm64(offset));

   mme_store_global(b, addr, x);
   mme_add64_to(b, addr, addr, mme_imm64(4));
   mme_store_global(b, addr, y);
   mme_add64_to(b, addr, addr, mme_imm64(4));
   mme_store_global(b, addr, z);
   mme_free_reg64(b, addr);
}

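/* MME helper: writes the group count into the cs.group_count field of an
 * already-uploaded root descriptor table.
 */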
static void
mme_store_root_desc_group_count(struct mme_builder *b,
                                struct mme_value64 root_desc_addr,
                                struct mme_value group_count_x,
                                struct mme_value group_count_y,
                                struct mme_value group_count_z)
{
   uint32_t root_desc_size_offset =
      offsetof(struct nvk_root_descriptor_table, cs.group_count);
   mme_store_global_vec3_free_addr(b, root_desc_addr,
                                   root_desc_size_offset,
                                   group_count_x,
                                   group_count_y,
                                   group_count_z);
}

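/* MME helper: patches the dispatch size into an already-uploaded QMD. QMD
 * layouts with three 32-bit dimension fields get three dword stores; layouts
 * where Y and Z are packed 16-bit fields get an X store followed by a merged
 * Y|Z store.
 */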
static void
mme_store_qmd_dispatch_size(struct mme_builder *b,
                            struct mme_value64 qmd_addr,
                            struct mme_value group_count_x,
                            struct mme_value group_count_y,
                            struct mme_value group_count_z)
{
   struct nak_qmd_dispatch_size_layout qmd_size_layout =
      nak_get_qmd_dispatch_size_layout(b->devinfo);
   assert(qmd_size_layout.y_start == qmd_size_layout.x_start + 32);

   if (qmd_size_layout.z_start == qmd_size_layout.y_start + 32) {
      mme_store_global_vec3_free_addr(b, qmd_addr,
                                      qmd_size_layout.x_start / 8,
                                      group_count_x,
                                      group_count_y,
                                      group_count_z);
   } else {
      mme_add64_to(b, qmd_addr, qmd_addr,
                   mme_imm64(qmd_size_layout.x_start / 8));
      mme_store_global(b, qmd_addr, group_count_x);

      assert(qmd_size_layout.z_start == qmd_size_layout.y_start + 16);
      struct mme_value group_count_yz =
         mme_merge(b, group_count_y, group_count_z, 16, 16, 0);
      mme_add64_to(b, qmd_addr, qmd_addr, mme_imm64(4));
      mme_store_global(b, qmd_addr, group_count_yz);
      mme_free_reg(b, group_count_yz);

      mme_free_reg64(b, qmd_addr);
   }
}

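/* MME macro backing vkCmdDispatchIndirect: fetches the group counts (via the
 * MME data FIFO on Turing and later, or from parameters pushed indirectly
 * from the dispatch buffer on older hardware), patches the root descriptor
 * table and the QMD, and accumulates the invocation count.
 */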
void
nvk_mme_dispatch_indirect(struct mme_builder *b)
{
   if (b->devinfo->cls_eng3d >= TURING_A) {
      /* Load everything before we switch to an indirect read */
      struct mme_value64 dispatch_addr = mme_load_addr64(b);
      struct mme_value64 root_desc_addr = mme_load_addr64(b);
      struct mme_value64 qmd_addr = mme_load_addr64(b);
      struct mme_value local_size = mme_load(b);

      mme_tu104_read_fifoed(b, dispatch_addr, mme_imm(3));
      mme_free_reg64(b, dispatch_addr);
      struct mme_value group_count_x = mme_load(b);
      struct mme_value group_count_y = mme_load(b);
      struct mme_value group_count_z = mme_load(b);

      mme_store_root_desc_group_count(b, root_desc_addr,
                                      group_count_x,
                                      group_count_y,
                                      group_count_z);

      mme_store_qmd_dispatch_size(b, qmd_addr,
                                  group_count_x,
                                  group_count_y,
                                  group_count_z);

      struct mme_value64 cs1 = mme_umul_32x32_64(b, group_count_y,
                                                 group_count_z);
      struct mme_value64 cs2 = mme_umul_32x32_64(b, group_count_x,
                                                 local_size);
      struct mme_value64 count = mme_mul64(b, cs1, cs2);
      mme_free_reg64(b, cs1);
      mme_free_reg64(b, cs2);

      nvk_build_mme_add_cs_invocations(b, count);
   } else {
      struct mme_value group_count_x = mme_load(b);
      struct mme_value group_count_y = mme_load(b);
      struct mme_value group_count_z = mme_load(b);

      struct mme_value64 root_desc_addr = mme_load_addr64(b);
      mme_store_root_desc_group_count(b, root_desc_addr,
                                      group_count_x,
                                      group_count_y,
                                      group_count_z);

      struct mme_value64 qmd_addr = mme_load_addr64(b);
      mme_store_qmd_dispatch_size(b, qmd_addr,
                                  group_count_x,
                                  group_count_y,
                                  group_count_z);

      /* Y and Z are 16b, so this can't overflow */
      struct mme_value cs1 =
         mme_mul_32x32_32_free_srcs(b, group_count_y, group_count_z);
      struct mme_value64 cs2 =
         mme_umul_32x32_64_free_srcs(b, group_count_x, cs1);
      struct mme_value local_size = mme_load(b);
      struct mme_value64 count =
         mme_umul_32x64_64_free_srcs(b, local_size, cs2);

      nvk_build_mme_add_cs_invocations(b, count);
   }
}

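/* Records an indirect dispatch. On Turing and later, the MME reads the
 * VkDispatchIndirectCommand through the MME data FIFO; on older hardware,
 * the command streamer is stalled and the indirect data is pushed inline so
 * the macro can consume it as parameters.
 */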
VKAPI_ATTR void VKAPI_CALL
nvk_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
                        VkBuffer _buffer,
                        VkDeviceSize offset)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);

   uint64_t dispatch_addr = nvk_buffer_address(buffer, offset);

   /* We set these through the MME */
   uint32_t base_workgroup[3] = { 0, 0, 0 };
   uint32_t global_size[3] = { 0, 0, 0 };
   nvk_flush_compute_state(cmd, base_workgroup, global_size);

   uint64_t qmd_addr = 0, root_desc_addr = 0;
   VkResult result = nvk_cmd_flush_cs_qmd(cmd, global_size, &qmd_addr,
                                          &root_desc_addr);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   struct nv_push *p;
   if (nvk_cmd_buffer_compute_cls(cmd) >= TURING_A) {
      p = nvk_cmd_buffer_push(cmd, 14);
      P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);
      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DISPATCH_INDIRECT));
      P_INLINE_DATA(p, dispatch_addr >> 32);
      P_INLINE_DATA(p, dispatch_addr);
      P_INLINE_DATA(p, root_desc_addr >> 32);
      P_INLINE_DATA(p, root_desc_addr);
      P_INLINE_DATA(p, qmd_addr >> 32);
      P_INLINE_DATA(p, qmd_addr);
      P_INLINE_DATA(p, nvk_compute_local_size(cmd));
   } else {
      p = nvk_cmd_buffer_push(cmd, 5);
      /* Stall the command streamer */
      __push_immd(p, SUBC_NV9097, NV906F_SET_REFERENCE, 0);

      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DISPATCH_INDIRECT));
      nv_push_update_count(p, sizeof(VkDispatchIndirectCommand) / 4);
      nvk_cmd_buffer_push_indirect(cmd, dispatch_addr,
                                   sizeof(VkDispatchIndirectCommand));
      p = nvk_cmd_buffer_push(cmd, 9);
      P_INLINE_DATA(p, root_desc_addr >> 32);
      P_INLINE_DATA(p, root_desc_addr);
      P_INLINE_DATA(p, qmd_addr >> 32);
      P_INLINE_DATA(p, qmd_addr);
      P_INLINE_DATA(p, nvk_compute_local_size(cmd));
   }

   P_MTHD(p, NVA0C0, SEND_PCAS_A);
   P_NVA0C0_SEND_PCAS_A(p, qmd_addr >> 8);
   if (nvk_cmd_buffer_compute_cls(cmd) <= TURING_COMPUTE_A) {
      P_IMMD(p, NVA0C0, SEND_SIGNALING_PCAS_B, {
         .invalidate = INVALIDATE_TRUE,
         .schedule = SCHEDULE_TRUE
      });
   } else {
      P_IMMD(p, NVC6C0, SEND_SIGNALING_PCAS2_B,
             PCAS_ACTION_INVALIDATE_COPY_SCHEDULE);
   }
}