/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_buffer.h"
#include "nvk_cmd_buffer.h"
#include "nvk_device.h"
#include "nvk_entrypoints.h"
#include "nvk_mme.h"
#include "nvk_physical_device.h"
#include "nvk_shader.h"

#include "cl906f.h"
#include "cla0b5.h"
#include "cla1c0.h"
#include "clc0c0.h"
#include "clc5c0.h"
#include "nv_push_cl90c0.h"
#include "nv_push_cl9097.h"
#include "nv_push_cla0c0.h"
#include "nv_push_clb0c0.h"
#include "nv_push_clb1c0.h"
#include "nv_push_clc3c0.h"
#include "nv_push_clc597.h"
#include "nv_push_clc6c0.h"

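/* Initial compute state for a queue: bind the compute class and, on pre-Volta
 * GPUs, point the program region at the shader heap so shader headers can be
 * referenced by offset from that base.
 */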
VkResult
nvk_push_dispatch_state_init(struct nvk_queue *queue, struct nv_push *p)
{
   struct nvk_device *dev = nvk_queue_device(queue);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   P_MTHD(p, NV90C0, SET_OBJECT);
   P_NV90C0_SET_OBJECT(p, {
      .class_id = pdev->info.cls_compute,
      .engine_id = 0,
   });

   if (pdev->info.cls_compute >= MAXWELL_COMPUTE_A)
      P_IMMD(p, NVB0C0, SET_SELECT_MAXWELL_TEXTURE_HEADERS, V_TRUE);

   if (pdev->info.cls_compute < VOLTA_COMPUTE_A) {
      uint64_t shader_base_addr =
         nvk_heap_contiguous_base_address(&dev->shader_heap);

      P_MTHD(p, NVA0C0, SET_PROGRAM_REGION_A);
      P_NVA0C0_SET_PROGRAM_REGION_A(p, shader_base_addr >> 32);
      P_NVA0C0_SET_PROGRAM_REGION_B(p, shader_base_addr);
   }

   return VK_SUCCESS;
}

static inline uint16_t
nvk_cmd_buffer_compute_cls(struct nvk_cmd_buffer *cmd)
{
   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);
   return pdev->info.cls_compute;
}

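/* Called at vkBeginCommandBuffer() time: for primary command buffers,
 * invalidate the compute caches (SKED on Maxwell B and later, plus the
 * sampler and texture header caches) so stale data from earlier submissions
 * isn't used.
 */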
void
nvk_cmd_buffer_begin_compute(struct nvk_cmd_buffer *cmd,
                             const VkCommandBufferBeginInfo *pBeginInfo)
{
   if (cmd->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
      struct nv_push *p = nvk_cmd_buffer_push(cmd, 6);
      if (nvk_cmd_buffer_compute_cls(cmd) >= MAXWELL_COMPUTE_B) {
         P_IMMD(p, NVB1C0, INVALIDATE_SKED_CACHES, 0);
      }
      P_IMMD(p, NVA0C0, INVALIDATE_SAMPLER_CACHE_NO_WFI, {
         .lines = LINES_ALL,
      });
      P_IMMD(p, NVA0C0, INVALIDATE_TEXTURE_HEADER_CACHE_NO_WFI, {
         .lines = LINES_ALL,
      });
   }
}

void
nvk_cmd_invalidate_compute_state(struct nvk_cmd_buffer *cmd)
{
   memset(&cmd->state.cs, 0, sizeof(cmd->state.cs));
}

void
nvk_cmd_bind_compute_shader(struct nvk_cmd_buffer *cmd,
                            struct nvk_shader *shader)
{
   cmd->state.cs.shader = shader;
}

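/* Number of invocations in a single workgroup of the currently bound compute
 * shader (the product of its local size in x, y, and z).
 */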
static uint32_t
nvk_compute_local_size(struct nvk_cmd_buffer *cmd)
{
   const struct nvk_shader *shader = cmd->state.cs.shader;

   return shader->info.cs.local_size[0] *
          shader->info.cs.local_size[1] *
          shader->info.cs.local_size[2];
}

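/* Flush any pending push descriptor writes and record the dispatch base and
 * group count in the compute root descriptor table.
 */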
static void
nvk_flush_compute_state(struct nvk_cmd_buffer *cmd,
                        uint32_t base_workgroup[3],
                        uint32_t global_size[3])
{
   struct nvk_descriptor_state *desc = &cmd->state.cs.descriptors;

   nvk_cmd_buffer_flush_push_descriptors(cmd, desc);

   nvk_descriptor_state_set_root_array(cmd, desc, cs.base_group,
                                       0, 3, base_workgroup);
   nvk_descriptor_state_set_root_array(cmd, desc, cs.group_count,
                                       0, 3, global_size);
}

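/* Upload a copy of the root descriptor table and, if a shader is given, build
 * and upload the QMD (the hardware's compute launch descriptor) for this
 * dispatch.  Returns the GPU addresses of both allocations.
 */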
static VkResult
nvk_cmd_upload_qmd(struct nvk_cmd_buffer *cmd,
                   const struct nvk_shader *shader,
                   const struct nvk_descriptor_state *desc,
                   const struct nvk_root_descriptor_table *root,
                   uint32_t global_size[3],
                   uint64_t *qmd_addr_out,
                   uint64_t *root_desc_addr_out)
{
   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);
   const uint32_t min_cbuf_alignment = nvk_min_cbuf_alignment(&pdev->info);
   VkResult result;

   /* Pre-Pascal, constant buffer sizes need to be 0x100-aligned.  Since we
    * simply allocate a buffer and upload data to it, make sure its size is
    * 0x100-aligned.
    */
   STATIC_ASSERT((sizeof(*root) & 0xff) == 0);
   assert(sizeof(*root) % min_cbuf_alignment == 0);

   void *root_desc_map;
   uint64_t root_desc_addr;
   result = nvk_cmd_buffer_upload_alloc(cmd, sizeof(*root), min_cbuf_alignment,
                                        &root_desc_addr, &root_desc_map);
   if (unlikely(result != VK_SUCCESS))
      return result;

   memcpy(root_desc_map, root, sizeof(*root));

   uint64_t qmd_addr = 0;
   if (shader != NULL) {
      struct nak_qmd_info qmd_info = {
         .addr = shader->hdr_addr,
         .smem_size = shader->info.cs.smem_size,
         .smem_max = NVK_MAX_SHARED_SIZE,
         .global_size = {
            global_size[0],
            global_size[1],
            global_size[2],
         },
      };

      assert(shader->cbuf_map.cbuf_count <= ARRAY_SIZE(qmd_info.cbufs));
      for (uint32_t c = 0; c < shader->cbuf_map.cbuf_count; c++) {
         const struct nvk_cbuf *cbuf = &shader->cbuf_map.cbufs[c];

         struct nvk_buffer_address ba;
         if (cbuf->type == NVK_CBUF_TYPE_ROOT_DESC) {
            ba = (struct nvk_buffer_address) {
               .base_addr = root_desc_addr,
               .size = sizeof(*root),
            };
         } else {
            ASSERTED bool direct_descriptor =
               nvk_cmd_buffer_get_cbuf_addr(cmd, desc, shader, cbuf, &ba);
            assert(direct_descriptor);
         }

         if (ba.size > 0) {
            assert(ba.base_addr % min_cbuf_alignment == 0);
            ba.size = align(ba.size, min_cbuf_alignment);
            ba.size = MIN2(ba.size, NVK_MAX_CBUF_SIZE);

            qmd_info.cbufs[qmd_info.num_cbufs++] = (struct nak_qmd_cbuf) {
               .index = c,
               .addr = ba.base_addr,
               .size = ba.size,
            };
         }
      }

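      /* The QMD is 256 bytes (64 dwords); NAK fills it out for the current
       * hardware generation and it has to be uploaded with 0x100 alignment
       * since SEND_PCAS_A takes the address shifted right by 8.
       */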
      uint32_t qmd[64];
      nak_fill_qmd(&pdev->info, &shader->info, &qmd_info, qmd, sizeof(qmd));

      result = nvk_cmd_buffer_upload_data(cmd, qmd, sizeof(qmd), 0x100, &qmd_addr);
      if (unlikely(result != VK_SUCCESS))
         return result;
   }

   *qmd_addr_out = qmd_addr;
   if (root_desc_addr_out != NULL)
      *root_desc_addr_out = root_desc_addr;

   return VK_SUCCESS;
}

VkResult
nvk_cmd_flush_cs_qmd(struct nvk_cmd_buffer *cmd,
                     const struct nvk_cmd_state *state,
                     uint32_t global_size[3],
                     uint64_t *qmd_addr_out,
                     uint64_t *root_desc_addr_out)
{
   const struct nvk_descriptor_state *desc = &state->cs.descriptors;

   return nvk_cmd_upload_qmd(cmd, state->cs.shader,
                             desc, (void *)desc->root, global_size,
                             qmd_addr_out, root_desc_addr_out);
}

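/* Add a 64-bit invocation count to the CS_INVOCATIONS accumulator kept in MME
 * scratch registers.  This feeds the compute-shader-invocations pipeline
 * statistics query.
 */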
static void
nvk_build_mme_add_cs_invocations(struct mme_builder *b,
                                 struct mme_value64 count)
{
   struct mme_value accum_hi = nvk_mme_load_scratch(b, CS_INVOCATIONS_HI);
   struct mme_value accum_lo = nvk_mme_load_scratch(b, CS_INVOCATIONS_LO);
   struct mme_value64 accum = mme_value64(accum_lo, accum_hi);

   mme_add64_to(b, accum, accum, count);

   STATIC_ASSERT(NVK_MME_SCRATCH_CS_INVOCATIONS_HI + 1 ==
                 NVK_MME_SCRATCH_CS_INVOCATIONS_LO);

   mme_mthd(b, NVK_SET_MME_SCRATCH(CS_INVOCATIONS_HI));
   mme_emit(b, accum.hi);
   mme_emit(b, accum.lo);

   mme_free_reg64(b, accum);
}

void
nvk_mme_add_cs_invocations(struct mme_builder *b)
{
   struct mme_value64 count = mme_load_addr64(b);

   nvk_build_mme_add_cs_invocations(b, count);
}

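/* Direct dispatch: flush compute state, upload the QMD, accumulate the
 * invocation count for statistics queries, then launch the grid with
 * SEND_PCAS.
 */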
VKAPI_ATTR void VKAPI_CALL
nvk_CmdDispatchBase(VkCommandBuffer commandBuffer,
                    uint32_t baseGroupX,
                    uint32_t baseGroupY,
                    uint32_t baseGroupZ,
                    uint32_t groupCountX,
                    uint32_t groupCountY,
                    uint32_t groupCountZ)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);

   uint32_t base_workgroup[3] = { baseGroupX, baseGroupY, baseGroupZ };
   uint32_t global_size[3] = { groupCountX, groupCountY, groupCountZ };
   nvk_flush_compute_state(cmd, base_workgroup, global_size);

   uint64_t qmd_addr = 0;
   VkResult result = nvk_cmd_flush_cs_qmd(cmd, &cmd->state, global_size,
                                          &qmd_addr, NULL);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   const uint32_t local_size = nvk_compute_local_size(cmd);
   const uint64_t cs_invocations =
      (uint64_t)local_size * (uint64_t)groupCountX *
      (uint64_t)groupCountY * (uint64_t)groupCountZ;

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 7);

   P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_ADD_CS_INVOCATIONS));
   P_INLINE_DATA(p, cs_invocations >> 32);
   P_INLINE_DATA(p, cs_invocations);

   P_MTHD(p, NVA0C0, SEND_PCAS_A);
   P_NVA0C0_SEND_PCAS_A(p, qmd_addr >> 8);

   if (nvk_cmd_buffer_compute_cls(cmd) <= TURING_COMPUTE_A) {
      P_IMMD(p, NVA0C0, SEND_SIGNALING_PCAS_B, {
            .invalidate = INVALIDATE_TRUE,
            .schedule = SCHEDULE_TRUE
      });
   } else {
      P_IMMD(p, NVC6C0, SEND_SIGNALING_PCAS2_B,
             PCAS_ACTION_INVALIDATE_COPY_SCHEDULE);
   }
}

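/* Dispatch an internal (meta) shader.  The root descriptor table is built on
 * the stack from the given push data and group counts rather than from the
 * command buffer's descriptor state.
 */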
void
nvk_cmd_dispatch_shader(struct nvk_cmd_buffer *cmd,
                        struct nvk_shader *shader,
                        const void *push_data, size_t push_size,
                        uint32_t groupCountX,
                        uint32_t groupCountY,
                        uint32_t groupCountZ)
{
   struct nvk_root_descriptor_table root = {
      .cs.group_count = {
         groupCountX,
         groupCountY,
         groupCountZ,
      },
   };
   assert(push_size <= sizeof(root.push));
   memcpy(root.push, push_data, push_size);

   uint64_t qmd_addr;
   VkResult result = nvk_cmd_upload_qmd(cmd, shader, NULL, &root,
                                        root.cs.group_count,
                                        &qmd_addr, NULL);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 8);

   /* Internal shaders don't want conditional rendering */
   P_IMMD(p, NVA0C0, SET_RENDER_ENABLE_OVERRIDE, MODE_ALWAYS_RENDER);

   P_MTHD(p, NVA0C0, SEND_PCAS_A);
   P_NVA0C0_SEND_PCAS_A(p, qmd_addr >> 8);

   if (nvk_cmd_buffer_compute_cls(cmd) <= TURING_COMPUTE_A) {
      P_IMMD(p, NVA0C0, SEND_SIGNALING_PCAS_B, {
            .invalidate = INVALIDATE_TRUE,
            .schedule = SCHEDULE_TRUE
      });
   } else {
      P_IMMD(p, NVC6C0, SEND_SIGNALING_PCAS2_B,
             PCAS_ACTION_INVALIDATE_COPY_SCHEDULE);
   }

   P_IMMD(p, NVA0C0, SET_RENDER_ENABLE_OVERRIDE, MODE_USE_RENDER_ENABLE);
}

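/* Store a 32-bit value to an arbitrary GPU virtual address from an MME macro
 * by emitting a report semaphore release with a one-dword payload.
 */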
static void
mme_store_global(struct mme_builder *b,
                 struct mme_value64 addr,
                 struct mme_value v)
{
   mme_mthd(b, NV9097_SET_REPORT_SEMAPHORE_A);
   mme_emit_addr64(b, addr);
   mme_emit(b, v);
   mme_emit(b, mme_imm(0x10000000));
}

static void
mme_store_global_vec3_free_addr(struct mme_builder *b,
                                struct mme_value64 addr,
                                uint32_t offset,
                                struct mme_value x,
                                struct mme_value y,
                                struct mme_value z)
{
   if (offset > 0)
      mme_add64_to(b, addr, addr, mme_imm64(offset));

   mme_store_global(b, addr, x);
   mme_add64_to(b, addr, addr, mme_imm64(4));
   mme_store_global(b, addr, y);
   mme_add64_to(b, addr, addr, mme_imm64(4));
   mme_store_global(b, addr, z);
   mme_free_reg64(b, addr);
}

static void
mme_store_root_desc_group_count(struct mme_builder *b,
                                struct mme_value64 root_desc_addr,
                                struct mme_value group_count_x,
                                struct mme_value group_count_y,
                                struct mme_value group_count_z)
{
   uint32_t root_desc_size_offset =
      offsetof(struct nvk_root_descriptor_table, cs.group_count);
   mme_store_global_vec3_free_addr(b, root_desc_addr,
                                   root_desc_size_offset,
                                   group_count_x,
                                   group_count_y,
                                   group_count_z);
}

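/* Patch the dispatch size directly into the uploaded QMD.  Depending on the
 * QMD layout, Y and Z are either full 32-bit fields following X or a pair of
 * 16-bit fields packed into a single dword.
 */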
static void
mme_store_qmd_dispatch_size(struct mme_builder *b,
                            struct mme_value64 qmd_addr,
                            struct mme_value group_count_x,
                            struct mme_value group_count_y,
                            struct mme_value group_count_z)
{
   struct nak_qmd_dispatch_size_layout qmd_size_layout =
      nak_get_qmd_dispatch_size_layout(b->devinfo);
   assert(qmd_size_layout.y_start == qmd_size_layout.x_start + 32);

   if (qmd_size_layout.z_start == qmd_size_layout.y_start + 32) {
      mme_store_global_vec3_free_addr(b, qmd_addr,
                                      qmd_size_layout.x_start / 8,
                                      group_count_x,
                                      group_count_y,
                                      group_count_z);
   } else {
      mme_add64_to(b, qmd_addr, qmd_addr,
                   mme_imm64(qmd_size_layout.x_start / 8));
      mme_store_global(b, qmd_addr, group_count_x);

      assert(qmd_size_layout.z_start == qmd_size_layout.y_start + 16);
      struct mme_value group_count_yz =
         mme_merge(b, group_count_y, group_count_z, 16, 16, 0);
      mme_add64_to(b, qmd_addr, qmd_addr, mme_imm64(4));
      mme_store_global(b, qmd_addr, group_count_yz);
      mme_free_reg(b, group_count_yz);

      mme_free_reg64(b, qmd_addr);
   }
}

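/* MME macro behind vkCmdDispatchIndirect.  It patches the group counts into
 * the root descriptor table and the QMD, then accumulates the invocation
 * count.  On Turing and later the macro reads the VkDispatchIndirectCommand
 * from memory itself via the MME data FIFO; on older hardware the three
 * dwords are spliced into the pushbuf by nvk_CmdDispatchIndirect and arrive
 * as ordinary macro parameters.
 */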
void
nvk_mme_dispatch_indirect(struct mme_builder *b)
{
   if (b->devinfo->cls_eng3d >= TURING_A) {
      /* Load everything before we switch to an indirect read */
      struct mme_value64 dispatch_addr = mme_load_addr64(b);
      struct mme_value64 root_desc_addr = mme_load_addr64(b);
      struct mme_value64 qmd_addr = mme_load_addr64(b);
      struct mme_value local_size = mme_load(b);

      mme_tu104_read_fifoed(b, dispatch_addr, mme_imm(3));
      mme_free_reg64(b, dispatch_addr);
      struct mme_value group_count_x = mme_load(b);
      struct mme_value group_count_y = mme_load(b);
      struct mme_value group_count_z = mme_load(b);

      mme_store_root_desc_group_count(b, root_desc_addr,
                                      group_count_x,
                                      group_count_y,
                                      group_count_z);

      mme_store_qmd_dispatch_size(b, qmd_addr,
                                  group_count_x,
                                  group_count_y,
                                  group_count_z);

      struct mme_value64 cs1 = mme_umul_32x32_64(b, group_count_y,
                                                    group_count_z);
      struct mme_value64 cs2 = mme_umul_32x32_64(b, group_count_x,
                                                    local_size);
      struct mme_value64 count = mme_mul64(b, cs1, cs2);
      mme_free_reg64(b, cs1);
      mme_free_reg64(b, cs2);

      nvk_build_mme_add_cs_invocations(b, count);
   } else {
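      /* Pre-Turing, the MME has no data FIFO, so the group counts were pushed
       * inline ahead of the addresses and simply show up as the first three
       * parameters here.
       */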
      struct mme_value group_count_x = mme_load(b);
      struct mme_value group_count_y = mme_load(b);
      struct mme_value group_count_z = mme_load(b);

      struct mme_value64 root_desc_addr = mme_load_addr64(b);
      mme_store_root_desc_group_count(b, root_desc_addr,
                                      group_count_x,
                                      group_count_y,
                                      group_count_z);

      struct mme_value64 qmd_addr = mme_load_addr64(b);
      mme_store_qmd_dispatch_size(b, qmd_addr,
                                  group_count_x,
                                  group_count_y,
                                  group_count_z);

      /* Y and Z are 16b, so this can't overflow */
      struct mme_value cs1 =
         mme_mul_32x32_32_free_srcs(b, group_count_y, group_count_z);
      struct mme_value64 cs2 =
         mme_umul_32x32_64_free_srcs(b, group_count_x, cs1);
      struct mme_value local_size = mme_load(b);
      struct mme_value64 count =
         mme_umul_32x64_64_free_srcs(b, local_size, cs2);

      nvk_build_mme_add_cs_invocations(b, count);
   }
}

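/* Indirect dispatch: the group counts live in a GPU buffer, so the QMD is
 * uploaded with zero dispatch sizes and the NVK_MME_DISPATCH_INDIRECT macro
 * patches them in on the GPU timeline before the grid is launched.
 */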
VKAPI_ATTR void VKAPI_CALL
nvk_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
                        VkBuffer _buffer,
                        VkDeviceSize offset)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_buffer, buffer, _buffer);

   uint64_t dispatch_addr = nvk_buffer_address(buffer, offset);

   /* We set these through the MME */
   uint32_t base_workgroup[3] = { 0, 0, 0 };
   uint32_t global_size[3] = { 0, 0, 0 };
   nvk_flush_compute_state(cmd, base_workgroup, global_size);

   uint64_t qmd_addr = 0, root_desc_addr = 0;
   VkResult result = nvk_cmd_flush_cs_qmd(cmd, &cmd->state, global_size,
                                          &qmd_addr, &root_desc_addr);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   struct nv_push *p;
   if (nvk_cmd_buffer_compute_cls(cmd) >= TURING_COMPUTE_A) {
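      /* Turing+: hand the MME the indirect buffer address and let it read the
       * group counts itself through the MME data FIFO.
       */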
      p = nvk_cmd_buffer_push(cmd, 14);
      P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);
      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DISPATCH_INDIRECT));
      P_INLINE_DATA(p, dispatch_addr >> 32);
      P_INLINE_DATA(p, dispatch_addr);
      P_INLINE_DATA(p, root_desc_addr >> 32);
      P_INLINE_DATA(p, root_desc_addr);
      P_INLINE_DATA(p, qmd_addr >> 32);
      P_INLINE_DATA(p, qmd_addr);
      P_INLINE_DATA(p, nvk_compute_local_size(cmd));
   } else {
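      /* Pre-Turing, the MME can't read memory, so splice the three dwords of
       * the VkDispatchIndirectCommand from the indirect buffer straight into
       * the pushbuf as the macro's first parameters.
       */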
      p = nvk_cmd_buffer_push(cmd, 5);
      /* Stall the command streamer */
      __push_immd(p, SUBC_NV9097, NV906F_SET_REFERENCE, 0);

      P_1INC(p, NV9097, CALL_MME_MACRO(NVK_MME_DISPATCH_INDIRECT));
      nv_push_update_count(p, sizeof(VkDispatchIndirectCommand) / 4);
      nvk_cmd_buffer_push_indirect(cmd, dispatch_addr, sizeof(VkDispatchIndirectCommand));
      p = nvk_cmd_buffer_push(cmd, 9);
      P_INLINE_DATA(p, root_desc_addr >> 32);
      P_INLINE_DATA(p, root_desc_addr);
      P_INLINE_DATA(p, qmd_addr >> 32);
      P_INLINE_DATA(p, qmd_addr);
      P_INLINE_DATA(p, nvk_compute_local_size(cmd));
   }

   P_MTHD(p, NVA0C0, SEND_PCAS_A);
   P_NVA0C0_SEND_PCAS_A(p, qmd_addr >> 8);
   if (nvk_cmd_buffer_compute_cls(cmd) <= TURING_COMPUTE_A) {
      P_IMMD(p, NVA0C0, SEND_SIGNALING_PCAS_B, {
            .invalidate = INVALIDATE_TRUE,
            .schedule = SCHEDULE_TRUE
      });
   } else {
      P_IMMD(p, NVC6C0, SEND_SIGNALING_PCAS2_B,
             PCAS_ACTION_INVALIDATE_COPY_SCHEDULE);
   }
}